├── imgs
│   ├── MFB-github.png
│   └── MFH-github.png
├── .gitignore
├── mfb_baseline
│   ├── config.py
│   ├── train_mfb_baseline.py
│   ├── utils.py
│   ├── vqa_data_layer.py
│   └── vqa_data_layer_kld.py
├── mfh_baseline
│   ├── config.py
│   ├── utils.py
│   ├── train_mfh_baseline.py
│   ├── vqa_data_layer.py
│   └── vqa_data_layer_kld.py
├── mfb_coatt_glove
│   ├── config.py
│   ├── utils.py
│   ├── vqa_data_layer.py
│   ├── vqa_data_layer_kld.py
│   └── train_mfb_coatt_glove.py
├── mfh_coatt_glove
│   ├── config.py
│   ├── utils.py
│   ├── vqa_data_layer.py
│   └── vqa_data_layer_kld.py
├── README.md
└── eval
    └── ensemble.py
/imgs/MFB-github.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yuzcccc/vqa-mfb/HEAD/imgs/MFB-github.png
--------------------------------------------------------------------------------
/imgs/MFH-github.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yuzcccc/vqa-mfb/HEAD/imgs/MFH-github.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | mfb_baseline/*.pyc
2 | mfb_coatt_glove/*.pyc
3 | eval/*.pyc
4 |
5 | # Byte-compiled / optimized / DLL files
6 | __pycache__/
7 | *.py[cod]
8 | *$py.class
9 |
10 | # C extensions
11 | *.so
12 |
13 | # Distribution / packaging
14 | .Python
15 | env/
16 | build/
17 | develop-eggs/
18 | dist/
19 | downloads/
20 | eggs/
21 | .eggs/
22 | lib/
23 | lib64/
24 | parts/
25 | sdist/
26 | var/
27 | *.egg-info/
28 | .installed.cfg
29 | *.egg
30 |
31 | # PyInstaller
32 | # Usually these files are written by a python script from a template
33 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
34 | *.manifest
35 | *.spec
36 |
37 | # Installer logs
38 | pip-log.txt
39 | pip-delete-this-directory.txt
40 |
41 | # Unit test / coverage reports
42 | htmlcov/
43 | .tox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *,cover
50 | .hypothesis/
51 |
52 | # Translations
53 | *.mo
54 | *.pot
55 |
56 | # Django stuff:
57 | *.log
58 | local_settings.py
59 |
60 | # Flask stuff:
61 | instance/
62 | .webassets-cache
63 |
64 | # Scrapy stuff:
65 | .scrapy
66 |
67 | # Sphinx documentation
68 | docs/_build/
69 |
70 | # PyBuilder
71 | target/
72 |
73 | # IPython Notebook
74 | .ipynb_checkpoints
75 |
76 | # pyenv
77 | .python-version
78 |
79 | # celery beat schedule file
80 | celerybeat-schedule
81 |
82 | # dotenv
83 | .env
84 |
85 | # virtualenv
86 | venv/
87 | ENV/
88 |
89 | # Spyder project settings
90 | .spyderproject
91 |
92 | # Rope project settings
93 | .ropeproject
94 |
--------------------------------------------------------------------------------
/mfb_baseline/config.py:
--------------------------------------------------------------------------------
1 | #training parameters
2 | TRAIN_GPU_ID = 0
3 | TEST_GPU_ID = 0
4 | BATCH_SIZE = 200
5 | VAL_BATCH_SIZE = 200
6 | PRINT_INTERVAL = 100
7 | VALIDATE_INTERVAL = 5000
8 | MAX_ITERATIONS = 100000
9 | RESTORE_ITER = 0 # iteration to restore. *.solverstate file is needed!
10 | # what data to use for training
11 | TRAIN_DATA_SPLITS = 'train'
12 | # what data to use for the vocabulary
13 | QUESTION_VOCAB_SPACE = 'train'
14 | ANSWER_VOCAB_SPACE = 'train' # test/test-dev/genome should not appear here
15 |
16 | #network parameters
17 | NUM_OUTPUT_UNITS = 3000 # This is the answer vocabulary size
18 | MFB_FACTOR_NUM = 5
19 | MFB_OUT_DIM = 1000
20 | LSTM_UNIT_NUM = 1024
21 | JOINT_EMB_SIZE = MFB_FACTOR_NUM*MFB_OUT_DIM
22 | MAX_WORDS_IN_QUESTION = 15
23 | LSTM_DROPOUT_RATIO = 0.3
24 | MFB_DROPOUT_RATIO = 0.1
25 |
26 | # vqa tools - get from https://github.com/VT-vision-lab/VQA
27 | VQA_TOOLS_PATH = '/home/yuz/data/VQA/PythonHelperTools'
28 | VQA_EVAL_TOOLS_PATH = '/home/yuz/data/VQA/PythonEvaluationTools'
29 |
30 | # location of the data
31 | VQA_PREFIX = '/home/yuz/data/VQA'
32 |
33 | feat = 'pool5'
34 | DATA_PATHS = {
35 | 'train': {
36 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_train2014_questions.json',
37 | 'ans_file': VQA_PREFIX + '/Annotations/mscoco_train2014_annotations.json',
38 | 'features_prefix': VQA_PREFIX + '/Features/ms_coco/resnet_%s_bgrms_large/train2014/COCO_train2014_'%feat
39 | },
40 | 'val': {
41 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_val2014_questions.json',
42 | 'ans_file': VQA_PREFIX + '/Annotations/mscoco_val2014_annotations.json',
43 | 'features_prefix': VQA_PREFIX + '/Features/ms_coco/resnet_%s_bgrms_large/val2014/COCO_val2014_'%feat
44 | },
45 | 'test-dev': {
46 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test-dev2015_questions.json',
47 | 'features_prefix': VQA_PREFIX + '/Features/ms_coco/resnet_%s_bgrms_large/test2015/COCO_test2015_'%feat
48 | },
49 | 'test': {
50 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test2015_questions.json',
51 | 'features_prefix': VQA_PREFIX + '/Features/ms_coco/resnet_%s_bgrms_large/test2015/COCO_test2015_'%feat
52 | },
53 | 'genome': {
54 | 'genome_file': VQA_PREFIX + '/Questions/OpenEnded_genome_train_questions.json',
55 | 'features_prefix': VQA_PREFIX + '/Features/genome/feat_resnet-152/resnet_%s_bgrms_large/'%feat
56 | }
57 | }
58 |
--------------------------------------------------------------------------------
/mfh_baseline/config.py:
--------------------------------------------------------------------------------
1 | #training parameters
2 | TRAIN_GPU_ID = 0
3 | TEST_GPU_ID = 0
4 | BATCH_SIZE = 200
5 | VAL_BATCH_SIZE = 200
6 | PRINT_INTERVAL = 100
7 | VALIDATE_INTERVAL = 5000
8 | MAX_ITERATIONS = 100000
9 | RESTORE_ITER = 0 # iteration to restore. *.solverstate file is needed!
10 | # what data to use for training
11 | TRAIN_DATA_SPLITS = 'train'
12 | # what data to use for the vocabulary
13 | QUESTION_VOCAB_SPACE = 'train'
14 | ANSWER_VOCAB_SPACE = 'train' # test/test-dev/genome should not appear here
15 |
16 | #network parameters
17 | NUM_OUTPUT_UNITS = 3000 # This is the answer vocabulary size
18 | MFB_FACTOR_NUM = 5
19 | MFB_OUT_DIM = 1000
20 | LSTM_UNIT_NUM = 1024
21 | JOINT_EMB_SIZE = MFB_FACTOR_NUM*MFB_OUT_DIM
22 | MAX_WORDS_IN_QUESTION = 15
23 | LSTM_DROPOUT_RATIO = 0.3
24 | MFB_DROPOUT_RATIO = 0.1
25 |
26 | # vqa tools - get from https://github.com/VT-vision-lab/VQA
27 | VQA_TOOLS_PATH = '/home/yuz/data/VQA/PythonHelperTools'
28 | VQA_EVAL_TOOLS_PATH = '/home/yuz/data/VQA/PythonEvaluationTools'
29 |
30 | # location of the data
31 | VQA_PREFIX = '/home/yuz/data/VQA'
32 |
33 | feat = 'pool5'
34 | DATA_PATHS = {
35 | 'train': {
36 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_train2014_questions.json',
37 | 'ans_file': VQA_PREFIX + '/Annotations/mscoco_train2014_annotations.json',
38 | 'features_prefix': VQA_PREFIX + '/Features/ms_coco/resnet_%s_bgrms_large/train2014/COCO_train2014_'%feat
39 | },
40 | 'val': {
41 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_val2014_questions.json',
42 | 'ans_file': VQA_PREFIX + '/Annotations/mscoco_val2014_annotations.json',
43 | 'features_prefix': VQA_PREFIX + '/Features/ms_coco/resnet_%s_bgrms_large/val2014/COCO_val2014_'%feat
44 | },
45 | 'test-dev': {
46 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test-dev2015_questions.json',
47 | 'features_prefix': VQA_PREFIX + '/Features/ms_coco/resnet_%s_bgrms_large/test2015/COCO_test2015_'%feat
48 | },
49 | 'test': {
50 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test2015_questions.json',
51 | 'features_prefix': VQA_PREFIX + '/Features/ms_coco/resnet_%s_bgrms_large/test2015/COCO_test2015_'%feat
52 | },
53 | 'genome': {
54 | 'genome_file': VQA_PREFIX + '/Questions/OpenEnded_genome_train_questions.json',
55 | 'features_prefix': VQA_PREFIX + '/Features/genome/feat_resnet-152/resnet_%s_bgrms_large/'%feat
56 | }
57 | }
58 |
--------------------------------------------------------------------------------
/mfb_coatt_glove/config.py:
--------------------------------------------------------------------------------
1 | #training parameters
2 | TRAIN_GPU_ID = 0
3 | TEST_GPU_ID = 0
4 | BATCH_SIZE = 64
5 | VAL_BATCH_SIZE = 32
6 | PRINT_INTERVAL = 100
7 | VALIDATE_INTERVAL = 5000
8 | MAX_ITERATIONS = 100000
9 | RESTORE_ITER = 0 # iteration to restore. *.solverstate file is needed!
10 | # what data to use for training
11 | TRAIN_DATA_SPLITS = 'train+val'
12 | # what data to use for the vocabulary
13 | QUESTION_VOCAB_SPACE = 'train+val'
14 | ANSWER_VOCAB_SPACE = 'train+val' # test/test-dev/genome should not appear here
15 |
16 | #network parameters
17 | NUM_OUTPUT_UNITS = 3000 # This is the answer vocabulary size
18 | MFB_FACTOR_NUM = 5
19 | MFB_OUT_DIM = 1000
20 | LSTM_UNIT_NUM = 1024
21 | JOINT_EMB_SIZE = MFB_FACTOR_NUM*MFB_OUT_DIM
22 | NUM_IMG_GLIMPSE = 2
23 | NUM_QUESTION_GLIMPSE = 2
24 | IMG_FEAT_WIDTH = 14
25 | IMG_FEAT_SIZE = IMG_FEAT_WIDTH * IMG_FEAT_WIDTH
26 | MAX_WORDS_IN_QUESTION = 15
27 | LSTM_DROPOUT_RATIO = 0.3
28 | MFB_DROPOUT_RATIO = 0.1
29 |
30 | # vqa tools - get from https://github.com/VT-vision-lab/VQA
31 | VQA_TOOLS_PATH = '/home/yuz/data/VQA/PythonHelperTools'
32 | VQA_EVAL_TOOLS_PATH = '/home/yuz/data/VQA/PythonEvaluationTools'
33 |
34 | # location of the data
35 | VQA_PREFIX = '/home/yuz/data/VQA'
36 |
37 | feat = 'res5c'
38 | DATA_PATHS = {
39 | 'train': {
40 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_train2014_questions.json',
41 | 'ans_file': VQA_PREFIX + '/Annotations/mscoco_train2014_annotations.json',
42 | 'features_prefix': VQA_PREFIX + '/Features/ms_coco/resnet_%s_bgrms_large/train2014/COCO_train2014_'%feat
43 | },
44 | 'val': {
45 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_val2014_questions.json',
46 | 'ans_file': VQA_PREFIX + '/Annotations/mscoco_val2014_annotations.json',
47 | 'features_prefix': VQA_PREFIX + '/Features/ms_coco/resnet_%s_bgrms_large/val2014/COCO_val2014_'%feat
48 | },
49 | 'test-dev': {
50 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test-dev2015_questions.json',
51 | 'features_prefix': VQA_PREFIX + '/Features/ms_coco/resnet_%s_bgrms_large/test2015/COCO_test2015_'%feat
52 | },
53 | 'test': {
54 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test2015_questions.json',
55 | 'features_prefix': VQA_PREFIX + '/Features/ms_coco/resnet_%s_bgrms_large/test2015/COCO_test2015_'%feat
56 | },
57 | 'genome': {
58 | 'genome_file': VQA_PREFIX + '/Questions/OpenEnded_genome_train_questions.json',
59 | 'features_prefix': VQA_PREFIX + '/Features/genome/feat_resnet-152/resnet_%s_bgrms_large/'%feat
60 | }
61 | }
62 |
--------------------------------------------------------------------------------
/mfh_coatt_glove/config.py:
--------------------------------------------------------------------------------
1 | #training parameters
2 | TRAIN_GPU_ID = 0
3 | TEST_GPU_ID = 0
4 | BATCH_SIZE = 64
5 | VAL_BATCH_SIZE = 32
6 | PRINT_INTERVAL = 100
7 | VALIDATE_INTERVAL = 10000
8 | MAX_ITERATIONS = 100000
9 | RESTORE_ITER = 0 # iteration to restore. *.solverstate file is needed!
10 | # what data to use for training
11 | TRAIN_DATA_SPLITS = 'train+val'
12 | # what data to use for the vocabulary
13 | QUESTION_VOCAB_SPACE = 'train+val'
14 | ANSWER_VOCAB_SPACE = 'train+val' # test/test-dev/genome should not appear here
15 |
16 | #network parameters
17 | NUM_OUTPUT_UNITS = 3000 # This is the answer vocabulary size
18 | MFB_FACTOR_NUM = 5
19 | MFB_OUT_DIM = 1000
20 | LSTM_UNIT_NUM = 1024
21 | JOINT_EMB_SIZE = MFB_FACTOR_NUM*MFB_OUT_DIM
22 | NUM_IMG_GLIMPSE = 2
23 | NUM_QUESTION_GLIMPSE = 2
24 | IMG_FEAT_WIDTH = 14
25 | IMG_FEAT_SIZE = IMG_FEAT_WIDTH * IMG_FEAT_WIDTH
26 | MAX_WORDS_IN_QUESTION = 15
27 | LSTM_DROPOUT_RATIO = 0.3
28 | MFB_DROPOUT_RATIO = 0.1
29 |
30 | # vqa tools - get from https://github.com/VT-vision-lab/VQA
31 | VQA_TOOLS_PATH = '/home/yuz/data/VQA/PythonHelperTools'
32 | VQA_EVAL_TOOLS_PATH = '/home/yuz/data/VQA/PythonEvaluationTools'
33 |
34 | # location of the data
35 | VQA_PREFIX = '/home/yuz/data/VQA'
36 |
37 | feat = 'res5c'
38 | DATA_PATHS = {
39 | 'train': {
40 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_train2014_questions.json',
41 | 'ans_file': VQA_PREFIX + '/Annotations/mscoco_train2014_annotations.json',
42 | 'features_prefix': VQA_PREFIX + '/Features/ms_coco/resnet_%s_bgrms_large/train2014/COCO_train2014_'%feat
43 | },
44 | 'val': {
45 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_val2014_questions.json',
46 | 'ans_file': VQA_PREFIX + '/Annotations/mscoco_val2014_annotations.json',
47 | 'features_prefix': VQA_PREFIX + '/Features/ms_coco/resnet_%s_bgrms_large/val2014/COCO_val2014_'%feat
48 | },
49 | 'test-dev': {
50 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test-dev2015_questions.json',
51 | 'features_prefix': VQA_PREFIX + '/Features/ms_coco/resnet_%s_bgrms_large/test2015/COCO_test2015_'%feat
52 | },
53 | 'test': {
54 | 'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test2015_questions.json',
55 | 'features_prefix': VQA_PREFIX + '/Features/ms_coco/resnet_%s_bgrms_large/test2015/COCO_test2015_'%feat
56 | },
57 | 'genome': {
58 | 'genome_file': VQA_PREFIX + '/Questions/OpenEnded_genome_train_questions.json',
59 | 'features_prefix': VQA_PREFIX + '/Features/genome/feat_resnet-152/resnet_%s_bgrms_large/'%feat
60 | }
61 | }
62 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # MFB and MFH for VQA
2 |
3 | **This project is deprecated! The PyTorch implementation of MFB(MFH)+CoAtt with pre-trained models, along with several other state-of-the-art VQA models, is maintained in our [OpenVQA](https://github.com/MILVLG/openvqa) project, which is much more convenient to use!**
4 |
5 | This project is the implementation of the papers *[Multi-modal Factorized Bilinear Pooling with Co-Attention Learning for Visual Question Answering (MFB)](https://arxiv.org/abs/1708.01471)* and *[Beyond Bilinear: Generalized Multi-modal Factorized High-order Pooling for Visual Question Answering (MFH)](https://arxiv.org/abs/1708.03619)*. Compared with existing state-of-the-art approaches such as MCB and MLB, our MFB models achieve superior performance on the large-scale VQA-1.0 and VQA-2.0 datasets. Moreover, MFH, the high-order extension of MFB, is also provided and delivers even better VQA performance. The MFB(MFH)+CoAtt network architecture for VQA is illustrated in Figure 1.
6 |
7 | 
8 | Figure 1: The MFB+CoAtt Network architecture for VQA.
9 |
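For readers skimming the architecture, here is a minimal NumPy sketch of the core MFB fusion step (two linear projections, an element-wise product, sum pooling over the k factors, signed square root, and L2 normalization), mirroring the layer sequence built in `train_mfb_baseline.py`. The function and weight names are illustrative only, and the dropout layers are omitted:

```python
import numpy as np

def mfb_fuse(q_feat, img_feat, W_q, W_i, k=5, o=1000):
    # project both modalities into the joint space of size k*o (JOINT_EMB_SIZE)
    joint = q_feat.dot(W_q) * img_feat.dot(W_i)    # element-wise product
    pooled = joint.reshape(-1, o, k).sum(axis=2)   # sum-pool over the k factors
    z = np.sign(pooled) * np.sqrt(np.abs(pooled))  # signed square root (power norm)
    return z / (np.linalg.norm(z, axis=1, keepdims=True) + 1e-8)  # L2 normalize

# toy usage with the paper's k=5, o=1000
rng = np.random.RandomState(0)
q, v = rng.randn(2, 1024), rng.randn(2, 2048)
z = mfb_fuse(q, v, rng.randn(1024, 5000), rng.randn(2048, 5000))  # -> (2, 1000)
```
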
10 | ## Update Dec. 2nd, 2017
11 | The 3rd-party pytorch implementation for MFB(MFH) is released [here](https://github.com/asdf0982/vqa-mfb.pytorch). Great thanks, Liam!
12 |
13 | ## Update Sep. 5th, 2017
14 | Using the Bottom-up and Top-Down (BUTD) image features (from the model with adaptive K ranging from 10 to 100) [here](https://github.com/yuzcccc/bottom-up-attention), our single MFH+CoAtt+GloVe model achieved the overall accuracy **68.76%** on the test-dev set of the VQA-2.0 dataset. With an ensemble of 8 models, we achieved the new state-of-the-art performance on the VQA-2.0 [leaderboard](https://evalai.cloudcv.org/web/challenges/challenge-page/1/leaderboard) with the overall accuracy **70.92%**.
15 |
16 | ## Update Aug. 1st, 2017
17 | Our solution for the VQA Challenge 2017 is updated!
18 |
19 | We proposed a **high-order** extension for MFB, i.e., the Multi-modal Factorized High-order Pooling (MFH). See the flowchart in Figure 2 and the implementations in the `mfh_baseline` and `mfh_coatt_glove` folders. With an ensemble of 9 MFH+CoAtt+GloVe(+VG) models, **we won the 2nd place (tied with another team) in the VQA Challenge 2017**. The detailed information can be found in our paper (the second paper in the Citation section at the bottom of this page).
20 |
21 | 
22 | Figure 2: The high-order MFH model which consists of p MFB blocks (without sharing parameters).
23 |
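To make the cascade concrete, here is a minimal NumPy sketch of a p=2 MFH model (an order-2 and an order-3 MFB block with separate weights), following the layer graph in `train_mfh_baseline.py`. Dropout is omitted and all names are illustrative:

```python
import numpy as np

def mfb_block(q_feat, img_feat, W_q, W_i, z_prev=None, k=5, o=1000):
    # one MFB block; z_prev carries the previous block's joint term (the MFH chaining)
    joint = q_feat.dot(W_q) * img_feat.dot(W_i)
    if z_prev is not None:
        joint = joint * z_prev                    # high-order coupling between blocks
    pooled = joint.reshape(-1, o, k).sum(axis=2)  # sum-pool over the k factors
    z = np.sign(pooled) * np.sqrt(np.abs(pooled))
    return z / (np.linalg.norm(z, axis=1, keepdims=True) + 1e-8), joint

rng = np.random.RandomState(0)
q, v = rng.randn(2, 1024), rng.randn(2, 2048)
z2, joint2 = mfb_block(q, v, rng.randn(1024, 5000), rng.randn(2048, 5000))
z3, _ = mfb_block(q, v, rng.randn(1024, 5000), rng.randn(2048, 5000), z_prev=joint2)
feat = np.concatenate([z2, z3], axis=1)  # concatenated, as in the mfb_o23_l2 layer
```
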
24 | ## Prerequisites
25 |
26 | Our code is implemented based on the high-quality [vqa-mcb](https://github.com/akirafukui/vqa-mcb) project. The data preprocessing and other prerequisites are the same as theirs. Before running our scripts to train or test the MFB models, see the `Prerequisites` and `Data Preprocessing` sections in the README of the vqa-mcb project first.
27 |
28 | - The Caffe version required for our MFB is slightly different from the one for MCB. We add some layers, e.g., sum pooling, permute, and KLD loss layers, to the `feature/20160617_cb_softattention` branch of the Caffe for MCB. Please check out our Caffe version [here](https://github.com/yuzcccc/caffe) and compile it (a rough sketch of what the KLD loss computes is shown below). **Note that cuDNN is currently not compatible with sum pooling; you should switch it off to run the code correctly.**
29 |
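For reference, a rough NumPy sketch of what the added KLD loss computes, under the assumption that `vqa_data_layer_kld.py` supplies soft answer distributions as labels; this is an illustration of the math, not the Caffe implementation:

```python
import numpy as np

def softmax_kld_loss(logits, target_dist, eps=1e-12):
    # KL(target || softmax(logits)), averaged over the batch
    shifted = logits - logits.max(axis=1, keepdims=True)  # numerical stability
    log_p = shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))
    kl = (target_dist * (np.log(target_dist + eps) - log_p)).sum(axis=1)
    return float(kl.mean())
```
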
30 | ## Pretrained Models
31 |
32 | We release the pretrained **single models** "MFB(or MFH)+CoAtt+GloVe+VG" from the papers. To the best of our knowledge, our MFH+CoAtt+GloVe+VG model reports the best single-model result (test-dev) on both the VQA-1.0 and VQA-2.0 datasets (train + val + Visual Genome). The corresponding results are shown in the table below. The results JSON files (results.zip for VQA-1.0) are also included in the model folders and can be uploaded to the evaluation servers directly. **Note that the models were trained with an old version of the GloVe embeddings in spaCy. If you use the latest one, the embeddings may be inconsistent, leading to inferior performance. I suggest training the model from scratch by yourself.**
33 |
34 | | Datasets\Models | MCB | MFB | MFH | MFH (BUTD img features) |
35 | |:-----------------:|:-----------------:|:-----------------:|:-----------------:|:-----------------:|
36 | | VQA-1.0 | 65.38% |66.87% [BaiduYun](http://pan.baidu.com/s/1o8LURge) | 67.72% [BaiduYun](http://pan.baidu.com/s/1c2neUv2) or [Dropbox](https://www.dropbox.com/s/qh1swgsq0na1bua/VQA1.0-mfh-coatt-glove-vg.zip?dl=0) | **69.82%** |
37 | | VQA-2.0   | 62.33%<sup>1</sup> |65.09% [BaiduYun](http://pan.baidu.com/s/1pLjtkSV) | 66.12% [BaiduYun](http://pan.baidu.com/s/1pLLUvIN) or [Dropbox](https://www.dropbox.com/s/zld15405a69how6/VQA2.0-mfh-coatt-glove-vg.zip?dl=0) | **68.76%**<sup>2</sup> |
38 |
39 | <sup>1</sup> The MCB result on VQA-2.0 is provided by the VQA Challenge organizer and does not introduce the GloVe embedding.
40 |
41 | <sup>2</sup> overall: 68.76, yes/no: 84.27, num: 49.56, other: 59.89
42 |
43 | ## Training from Scratch
44 |
45 | We provide the scripts for training two MFB models from scratch, i.e., the `mfb_baseline` and `mfb_coatt_glove` folders. Simply run the Python scripts `train_*.py` to train the models from scratch.
46 |
47 | - Most of the hyper-parameters and configurations, with comments, are defined in the `config.py` file.
48 | - The solver configurations are defined in the `get_solver` function in the `train_*.py` scripts.
49 | - A pretrained GloVe word embedding model (via the spaCy library) is required to train the mfb_coatt_glove model. The installation instructions for spaCy and the GloVe model can be found [here](https://github.com/akirafukui/vqa-mcb/tree/master/train).
50 |
51 | ## Evaluation
52 |
53 | To generate an answers JSON file in the format expected by the VQA evaluation code and the VQA test server, you can use `eval/ensemble.py`. This script can also ensemble multiple models; a conceptual sketch follows below. Running `python ensemble.py` will print out a help message telling you what arguments to use.
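
Conceptually, ensembling here averages the per-question answer score vectors produced by several models before taking the argmax; a toy sketch under that assumption (the score matrices and the answer dictionary are placeholders, not `ensemble.py`'s actual interface):

```python
import numpy as np

def ensemble_answers(score_mats, adict):
    # score_mats: list of (n_questions, n_answers) arrays, one per model
    avg = np.mean(np.stack(score_mats, axis=0), axis=0)   # average model scores
    inv_adict = {idx: ans for ans, idx in adict.items()}  # index -> answer string
    return [inv_adict[int(i)] for i in avg.argmax(axis=1)]
```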
54 |
55 | ## License
56 |
57 | This code is distributed under the MIT license. The released models are only allowed for non-commercial use.
58 |
59 | ## Citation
60 |
61 | If the codes are helpful for your research, please cite
62 |
63 | ```
64 | @article{yu2017mfb,
65 | title={Multi-modal Factorized Bilinear Pooling with Co-Attention Learning for Visual Question Answering},
66 | author={Yu, Zhou and Yu, Jun and Fan, Jianping and Tao, Dacheng},
67 | journal={IEEE International Conference on Computer Vision (ICCV)},
68 | pages={1839--1848},
69 | year={2017}
70 | }
71 |
72 | @article{yu2018beyond,
73 | title={Beyond Bilinear: Generalized Multimodal Factorized High-Order Pooling for Visual Question Answering},
74 | author={Yu, Zhou and Yu, Jun and Xiang, Chenchao and Fan, Jianping and Tao, Dacheng},
75 | journal={IEEE Transactions on Neural Networks and Learning Systems},
76 | volume={29},
77 | number={12},
78 | pages={5947--5959},
79 | year={2018}
80 | }
81 | ```
82 |
83 | ## Contact
84 |
85 | Zhou Yu [yuz(AT)hdu.edu.cn]
86 |
--------------------------------------------------------------------------------
/mfb_baseline/train_mfb_baseline.py:
--------------------------------------------------------------------------------
1 | import matplotlib
2 | matplotlib.use('Agg')
3 | import os
4 | import sys
5 | import numpy as np
6 | import json
7 | import matplotlib.pyplot as plt
8 |
9 | import caffe
10 | from caffe import layers as L
11 | from caffe import params as P
12 | from caffe.proto import caffe_pb2
13 |
14 | from vqa_data_layer_kld import VQADataProvider
15 | from utils import exec_validation, drawgraph
16 | import config
17 | import time
18 |
19 | def get_solver(folder):
20 | s = caffe_pb2.SolverParameter()
21 | s.train_net = './%s/proto_train.prototxt'%folder
22 | s.snapshot = int(config.VALIDATE_INTERVAL)
23 | s.snapshot_prefix = './%s/'%folder
24 | s.max_iter = int(config.MAX_ITERATIONS)
25 | s.display = int(config.VALIDATE_INTERVAL)
26 | s.type = 'Adam'
27 | s.stepsize = int(config.MAX_ITERATIONS*0.4)
28 | s.gamma = 0.5
29 | s.lr_policy = "step"
30 | s.base_lr = 0.0007
31 | s.momentum = 0.9
32 | s.momentum2 = 0.999
33 | s.weight_decay = 0.000
34 | s.clip_gradients = 10
35 | return s
36 |
37 | def get_auxiliary_json():
38 | aux = {}
39 | aux["batch_size"] = int(config.VAL_BATCH_SIZE)
40 | aux["data_shape"] = [2048]
41 | aux["img_feature_prefix"] = config.DATA_PATHS['test']['features_prefix']
42 | aux["glove"] = False
43 | return aux
44 |
45 |
46 | def mfb_baseline(mode, batchsize, T, question_vocab_size, folder):
47 | n = caffe.NetSpec()
48 | mode_str = json.dumps({'mode':mode, 'batchsize':batchsize,'folder':folder})
49 | if mode == 'val':
50 | n.data, n.cont, n.img_feature, n.label = L.Python( \
51 | module='vqa_data_layer', layer='VQADataProviderLayer', \
52 | param_str=mode_str, ntop=4 )
53 | else:
54 | n.data, n.cont, n.img_feature, n.label = L.Python(\
55 | module='vqa_data_layer_kld', layer='VQADataProviderLayer', \
56 | param_str=mode_str, ntop=4 )
57 | n.embed = L.Embed(n.data, input_dim=question_vocab_size, num_output=300, \
58 | weight_filler=dict(type='xavier'))
59 | n.embed_tanh = L.TanH(n.embed)
60 |
61 | # LSTM
62 | n.lstm1 = L.LSTM(\
63 | n.embed_tanh, n.cont,\
64 | recurrent_param=dict(\
65 | num_output=config.LSTM_UNIT_NUM,\
66 | weight_filler=dict(type='xavier')))
67 | tops1 = L.Slice(n.lstm1, ntop=config.MAX_WORDS_IN_QUESTION, slice_param={'axis':0})
68 | for i in xrange(config.MAX_WORDS_IN_QUESTION-1):
69 | n.__setattr__('slice_first'+str(i), tops1[int(i)])
70 | n.__setattr__('silence_data_first'+str(i), L.Silence(tops1[int(i)],ntop=0))
71 | n.lstm1_out = tops1[config.MAX_WORDS_IN_QUESTION-1]
72 | n.lstm1_reshaped = L.Reshape(n.lstm1_out,\
73 | reshape_param=dict(\
74 | shape=dict(dim=[-1,1024])))
75 | n.q_feat = L.Dropout(n.lstm1_reshaped,dropout_param={'dropout_ratio':config.LSTM_DROPOUT_RATIO})
76 | '''
77 | Coarse Image-Question MFB fusion
78 | '''
79 |
80 | n.mfb_q_proj = L.InnerProduct(n.q_feat, num_output=config.JOINT_EMB_SIZE,
81 | weight_filler=dict(type='xavier'))
82 | n.mfb_i_proj = L.InnerProduct(n.img_feature, num_output=config.JOINT_EMB_SIZE,
83 | weight_filler=dict(type='xavier'))
84 | n.mfb_iq_eltwise = L.Eltwise(n.mfb_q_proj, n.mfb_i_proj, eltwise_param=dict(operation=0))  # operation=0 is PROD: element-wise product
85 | n.mfb_iq_drop = L.Dropout(n.mfb_iq_eltwise, dropout_param={'dropout_ratio':config.MFB_DROPOUT_RATIO})
86 | n.mfb_iq_resh = L.Reshape(n.mfb_iq_drop, reshape_param=dict(shape=dict(dim=[-1,1,config.MFB_OUT_DIM,config.MFB_FACTOR_NUM])))
87 | n.mfb_iq_sumpool = L.Pooling(n.mfb_iq_resh, pool=P.Pooling.SUM, \
88 | pooling_param=dict(kernel_w=config.MFB_FACTOR_NUM, kernel_h=1))
89 | n.mfb_out = L.Reshape(n.mfb_iq_sumpool,\
90 | reshape_param=dict(shape=dict(dim=[-1,config.MFB_OUT_DIM])))
91 | n.mfb_sign_sqrt = L.SignedSqrt(n.mfb_out)
92 | n.mfb_l2 = L.L2Normalize(n.mfb_sign_sqrt)
93 |
94 | n.prediction = L.InnerProduct(n.mfb_l2, num_output=config.NUM_OUTPUT_UNITS,
95 | weight_filler=dict(type='xavier'))
96 | if mode == 'val':
97 | n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
98 | else:
99 | n.loss = L.SoftmaxKLDLoss(n.prediction, n.label)
100 | return n.to_proto()
101 |
102 | def make_answer_vocab(adic, vocab_size):
103 | """
104 | Returns a dictionary that maps words to indices.
105 | """
106 | adict = {'':0}
107 | nadict = {'':1000000}
108 | vid = 1
109 | for qid in adic.keys():
110 | answer_obj = adic[qid]
111 | answer_list = [ans['answer'] for ans in answer_obj]
112 |
113 | for q_ans in answer_list:
114 | # create dict
115 | if adict.has_key(q_ans):
116 | nadict[q_ans] += 1
117 | else:
118 | nadict[q_ans] = 1
119 | adict[q_ans] = vid
120 | vid +=1
121 |
122 | # debug
123 | nalist = []
124 | for k,v in sorted(nadict.items(), key=lambda x:x[1]):
125 | nalist.append((k,v))
126 |
127 | # keep only the vocab_size most frequent answers, remove the rest
128 | n_del_ans = 0
129 | n_valid_ans = 0
130 | adict_nid = {}
131 | for i, w in enumerate(nalist[:-vocab_size]):
132 | del adict[w[0]]
133 | n_del_ans += w[1]
134 | for i, w in enumerate(nalist[-vocab_size:]):
135 | n_valid_ans += w[1]
136 | adict_nid[w[0]] = i
137 |
138 | return adict_nid
139 |
140 | def make_question_vocab(qdic):
141 | """
142 | Returns a dictionary that maps words to indices.
143 | """
144 | vdict = {'':0}
145 | vid = 1
146 | for qid in qdic.keys():
147 | # sequence to list
148 | q_str = qdic[qid]['qstr']
149 | q_list = VQADataProvider.seq_to_list(q_str)
150 |
151 | # create dict
152 | for w in q_list:
153 | if not vdict.has_key(w):
154 | vdict[w] = vid
155 | vid +=1
156 |
157 | return vdict
158 |
159 | def make_vocab_files():
160 | """
161 | Produce the question and answer vocabulary files.
162 | """
163 | print 'making question vocab...', config.QUESTION_VOCAB_SPACE
164 | qdic, _ = VQADataProvider.load_data(config.QUESTION_VOCAB_SPACE)
165 | question_vocab = make_question_vocab(qdic)
166 | print 'making answer vocab...', config.ANSWER_VOCAB_SPACE
167 | _, adic = VQADataProvider.load_data(config.ANSWER_VOCAB_SPACE)
168 | answer_vocab = make_answer_vocab(adic, config.NUM_OUTPUT_UNITS)
169 | return question_vocab, answer_vocab
170 |
171 | def main():
172 | folder = 'mfb_baseline_%s'%(config.TRAIN_DATA_SPLITS)
173 | if not os.path.exists('./%s'%folder):
174 | os.makedirs('./%s'%folder)
175 |
176 | question_vocab, answer_vocab = {}, {}
177 | if os.path.exists('./%s/vdict.json'%folder) and os.path.exists('./%s/adict.json'%folder):
178 | print 'restoring vocab'
179 | with open('./%s/vdict.json'%folder,'r') as f:
180 | question_vocab = json.load(f)
181 | with open('./%s/adict.json'%folder,'r') as f:
182 | answer_vocab = json.load(f)
183 | else:
184 | question_vocab, answer_vocab = make_vocab_files()
185 | with open('./%s/vdict.json'%folder,'w') as f:
186 | json.dump(question_vocab, f)
187 | with open('./%s/adict.json'%folder,'w') as f:
188 | json.dump(answer_vocab, f)
189 |
190 | print 'question vocab size:', len(question_vocab)
191 | print 'answer vocab size:', len(answer_vocab)
192 |
193 | with open('./%s/proto_train.prototxt'%folder, 'w') as f:
194 | f.write(str(mfb_baseline(config.TRAIN_DATA_SPLITS, config.BATCH_SIZE, \
195 | config.MAX_WORDS_IN_QUESTION, len(question_vocab), folder)))
196 |
197 | with open('./%s/proto_test.prototxt'%folder, 'w') as f:
198 | f.write(str(mfb_baseline('val', config.VAL_BATCH_SIZE, \
199 | config.MAX_WORDS_IN_QUESTION, len(question_vocab), folder)))
200 |
201 | with open('./%s/solver.prototxt'%folder, 'w') as f:
202 | f.write(str(get_solver(folder)))
203 | with open('./%s/auxiliary.json'%folder, 'w') as f:
204 | json.dump(get_auxiliary_json(),f, indent=2)
205 |
206 | caffe.set_device(config.TRAIN_GPU_ID)
207 | caffe.set_mode_gpu()
208 | solver = caffe.get_solver('./%s/solver.prototxt'%folder)
209 |
210 | train_loss = np.zeros(config.MAX_ITERATIONS+1)
211 | results = []
212 |
213 | if config.RESTORE_ITER:
214 | restore_iter = config.RESTORE_ITER
215 | solver.restore('./%s/_iter_%d.solverstate'%(folder,restore_iter))
216 | else:
217 | restore_iter = 0
218 |
219 | start = time.clock()
220 | for it in range(restore_iter, config.MAX_ITERATIONS+1):
221 | solver.step(1)
222 |
223 | # store the train loss
224 | train_loss[it] = solver.net.blobs['loss'].data
225 |
226 | if it % config.PRINT_INTERVAL == 0 and it != 0:
227 | elapsed = (time.clock() - start)
228 | print 'Iteration:', it
229 | c_mean_loss = train_loss[it-config.PRINT_INTERVAL:it].mean()
230 | print 'Train loss:', c_mean_loss, ' Elapsed seconds:', elapsed
231 | start = time.clock()
232 | if it % config.VALIDATE_INTERVAL == 0 and it != restore_iter:
233 | model_name = './%s/tmp.caffemodel'%(folder)
234 | solver.net.save(model_name)
235 | print 'Validating...'
236 | '''
237 | # for the test-dev/test set. the json file will be generated under the folder
238 | exec_validation(config.TEST_GPU_ID, 'test-dev', model_name, it=it, folder=folder)
239 | caffe.set_device(config.TRAIN_GPU_ID)
240 | '''
241 | # for the val set. the accuracy will be computed and plotted
242 | test_loss, acc_overall, acc_per_ques, acc_per_ans = exec_validation(config.TEST_GPU_ID, 'val', model_name, it=it, folder=folder)
243 | caffe.set_device(config.TRAIN_GPU_ID)
244 | print 'Test loss:', test_loss
245 | print 'Accuracy:', acc_overall
246 | print 'Test per ans', acc_per_ans
247 | results.append([it, c_mean_loss, test_loss, acc_overall, acc_per_ques, acc_per_ans])
248 | best_result_idx = np.array([x[3] for x in results]).argmax()
249 | print 'Best accuracy of', results[best_result_idx][3], 'was at iteration', results[best_result_idx][0]
250 | drawgraph(results,folder,config.MFB_FACTOR_NUM,config.MFB_OUT_DIM,prefix='mfb_baseline')
251 |
252 | if __name__ == '__main__':
253 | main()
254 |
--------------------------------------------------------------------------------
/mfb_baseline/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | import os
4 | import sys
5 | import json
6 | import re
7 | import shutil
8 | from PIL import Image
9 | from PIL import ImageFont, ImageDraw
10 |
11 | import caffe
12 | from caffe import layers as L
13 | from caffe import params as P
14 |
15 | from vqa_data_layer import VQADataProvider, VQADataProviderLayer
16 |
17 | import config
18 | sys.path.append(config.VQA_TOOLS_PATH)
19 | sys.path.append(config.VQA_EVAL_TOOLS_PATH)
20 |
21 | from vqaTools.vqa import VQA
22 | from vqaEvaluation.vqaEval import VQAEval
23 |
24 | def visualize_failures(stat_list,mode):
25 |
26 | def save_qtype(qtype_list, save_filename, mode):
27 |
28 | if mode == 'val':
29 | savepath = os.path.join('./eval', save_filename)
30 | # TODO
31 | img_pre = '/home/dhpseth/vqa/02_tools/VQA/Images/val2014'
32 | elif mode == 'test-dev':
33 | savepath = os.path.join('./test-dev', save_filename)
34 | # TODO
35 | img_pre = '/home/dhpseth/vqa/02_tools/VQA/Images/test2015'
36 | elif mode == 'test':
37 | savepath = os.path.join('./test', save_filename)
38 | # TODO
39 | img_pre = '/home/dhpseth/vqa/02_tools/VQA/Images/test2015'
40 | else:
41 | raise Exception('Unsupported mode')
42 | if os.path.exists(savepath): shutil.rmtree(savepath)
43 | if not os.path.exists(savepath): os.makedirs(savepath)
44 |
45 | for qt in qtype_list:
46 | count = 0
47 | for t_question in stat_list:
48 | #print count, t_question
49 | if count < 40/len(qtype_list):
50 | t_question_list = t_question['q_list']
51 | saveflag = False
52 | #print 'debug****************************'
53 | #print qt
54 | #print t_question_list
55 | #print t_question_list[0] == qt[0]
56 | #print t_question_list[1] == qt[1]
57 | if t_question_list[0] == qt[0] and t_question_list[1] == qt[1]:
58 | saveflag = True
59 | else:
60 | saveflag = False
61 |
62 | if saveflag == True:
63 | t_iid = t_question['iid']
64 | if mode == 'val':
65 | t_img = Image.open(os.path.join(img_pre, \
66 | 'COCO_val2014_' + str(t_iid).zfill(12) + '.jpg'))
67 | elif mode == 'test-dev' or mode == 'test':
68 | t_img = Image.open(os.path.join(img_pre, \
69 | 'COCO_test2015_' + str(t_iid).zfill(12) + '.jpg'))
70 |
71 | # for caption
72 | #print t_iid
73 | #annIds = caps.getAnnIds(t_iid)
74 | #anns = caps.loadAnns(annIds)
75 | #cap_list = [ann['caption'] for ann in anns]
76 | ans_list = t_question['ans_list']
77 | draw = ImageDraw.Draw(t_img)
78 | for i in range(len(ans_list)):
79 | try:
80 | draw.text((10,10*i), str(ans_list[i]))
81 | except:
82 | pass
83 |
84 | ans = t_question['answer']
85 | pred = t_question['pred']
86 | if ans == -1:
87 | pre = ''
88 | elif ans == pred:
89 | pre = 'correct '
90 | else:
91 | pre = 'failure '
92 | #print ' aaa ', ans, pred
93 | ans = re.sub( '/', ' ', str(ans))
94 | pred = re.sub( '/', ' ', str(pred))
95 | img_title = pre + str(' '.join(t_question_list)) + '. a_' + \
96 | str(ans) + ' p_' + str(pred) + '.png'
97 | count += 1
98 | print os.path.join(savepath,img_title)
99 | t_img.save(os.path.join(savepath,img_title))
100 |
101 | print 'saving whatis'
102 | qt_color_list = [['what','color']]
103 | save_qtype(qt_color_list, 'colors', mode)
104 |
105 | print 'saving whatis'
106 | qt_whatis_list = [['what','is'],['what','kind'],['what','are']]
107 | save_qtype(qt_whatis_list, 'whatis', mode)
108 |
109 | print 'saving is'
110 | qt_is_list = [['is','the'], ['is','this'],['is','there']]
111 | save_qtype(qt_is_list, 'is', mode)
112 |
113 | print 'saving how many'
114 | qt_howmany_list =[['how','many']]
115 | save_qtype(qt_howmany_list, 'howmany', mode)
116 |
117 | def exec_validation(device_id, mode, model_name, folder, it='', visualize=False):
118 |
119 | caffe.set_device(device_id)
120 | caffe.set_mode_gpu()
121 | net = caffe.Net('./%s/proto_test.prototxt'%folder,model_name,caffe.TEST)
122 |
123 | dp = VQADataProvider(mode=mode,batchsize=config.VAL_BATCH_SIZE,folder=folder)
124 | total_questions = len(dp.getQuesIds())
125 | epoch = 0
126 |
127 | pred_list = []
128 | testloss_list = []
129 | stat_list = []
130 |
131 | while epoch == 0:
132 | t_word, t_cont, t_img_feature, t_answer, t_qid_list, t_iid_list, epoch = dp.get_batch_vec()
133 | net.blobs['data'].data[...] = np.transpose(t_word,(1,0))
134 | net.blobs['cont'].data[...] = np.transpose(t_cont,(1,0))
135 | net.blobs['img_feature'].data[...] = t_img_feature
136 | net.blobs['label'].data[...] = t_answer
137 | net.forward()
138 | t_pred_list = net.blobs['prediction'].data.argmax(axis=1)
139 | t_pred_str = [dp.vec_to_answer(pred_symbol) for pred_symbol in t_pred_list]
140 | testloss_list.append(net.blobs['loss'].data)
141 | for qid, iid, ans, pred in zip(t_qid_list, t_iid_list, t_answer.tolist(), t_pred_str):
142 | #pred_list.append({u'answer':pred, u'question_id': int(dp.getStrippedQuesId(qid))})
143 | pred_list.append((pred,int(dp.getStrippedQuesId(qid))))
144 | if visualize:
145 | q_list = dp.seq_to_list(dp.getQuesStr(qid))
146 | if mode == 'test-dev' or mode == 'test':
147 | ans_str = ''
148 | ans_list = ['']*10
149 | else:
150 | ans_str = dp.vec_to_answer(ans)
151 | ans_list = [ dp.getAnsObj(qid)[i]['answer'] for i in xrange(10)]
152 | stat_list.append({\
153 | 'qid' : qid,
154 | 'q_list' : q_list,
155 | 'iid' : iid,
156 | 'answer': ans_str,
157 | 'ans_list': ans_list,
158 | 'pred' : pred })
159 | percent = 100 * float(len(pred_list)) / total_questions
160 | sys.stdout.write('\r' + ('%.2f' % percent) + '%')
161 | sys.stdout.flush()
162 |
163 |
164 | print 'Deduping arr of len', len(pred_list)
165 | deduped = []
166 | seen = set()
167 | for ans, qid in pred_list:
168 | if qid not in seen:
169 | seen.add(qid)
170 | deduped.append((ans, qid))
171 | print 'New len', len(deduped)
172 | final_list=[]
173 | for ans,qid in deduped:
174 | final_list.append({u'answer': ans, u'question_id': qid})
175 |
176 | mean_testloss = np.array(testloss_list).mean()
177 |
178 | if mode == 'val':
179 | valFile = './%s/val2015_resfile'%folder
180 | with open(valFile, 'w') as f:
181 | json.dump(final_list, f)
182 | if visualize:
183 | visualize_failures(stat_list,mode)
184 | annFile = config.DATA_PATHS['val']['ans_file']
185 | quesFile = config.DATA_PATHS['val']['ques_file']
186 | vqa = VQA(annFile, quesFile)
187 | vqaRes = vqa.loadRes(valFile, quesFile)
188 | vqaEval = VQAEval(vqa, vqaRes, n=2)
189 | vqaEval.evaluate()
190 | acc_overall = vqaEval.accuracy['overall']
191 | acc_perQuestionType = vqaEval.accuracy['perQuestionType']
192 | acc_perAnswerType = vqaEval.accuracy['perAnswerType']
193 | return mean_testloss, acc_overall, acc_perQuestionType, acc_perAnswerType
194 | elif mode == 'test-dev':
195 | filename = './%s/vqa_OpenEnded_mscoco_test-dev2015_%s-'%(folder,folder)+str(it).zfill(8)+'_results'
196 | with open(filename+'.json', 'w') as f:
197 | json.dump(final_list, f)
198 | if visualize:
199 | visualize_failures(stat_list,mode)
200 | elif mode == 'test':
201 | filename = './%s/vqa_OpenEnded_mscoco_test2015_%s-'%(folder,folder)+str(it).zfill(8)+'_results'
202 | with open(filename+'.json', 'w') as f:
203 | json.dump(final_list, f)
204 | if visualize:
205 | visualize_failures(stat_list,mode)
206 | def drawgraph(results, folder,k,d,prefix='std',save_question_type_graphs=False):
207 | # 0:it
208 | # 1:trainloss
209 | # 2:testloss
210 | # 3:oa_acc
211 | # 4:qt_acc
212 | # 5:at_acc
213 |
214 | # training curve
215 | it = np.array([l[0] for l in results])
216 | loss = np.array([l[1] for l in results])
217 | valloss = np.array([l[2] for l in results])
218 | valacc = np.array([l[3] for l in results])
219 |
220 | fig = plt.figure()
221 | ax1 = fig.add_subplot(111)
222 | ax2 = ax1.twinx()
223 |
224 | ax1.plot(it,loss, color='blue', label='train loss')
225 | ax1.plot(it,valloss, '--', color='blue', label='test loss')
226 | ax2.plot(it,valacc, color='red', label='acc on val')
227 | plt.legend(loc='lower left')
228 |
229 | ax1.set_xlabel('Iterations')
230 | ax1.set_ylabel('Loss Value')
231 | ax2.set_ylabel('Accuracy on Val [%]')
232 |
233 | plt.savefig('./%s/result_it_%d_acc_%2.2f_k_%d_d_%d_%s.png'%(folder,it[-1],valacc[-1],k,d,prefix))
234 | plt.clf()
235 | plt.close("all")
236 |
237 | # question type
238 | it = np.array([l[0] for l in results])
239 | oa_acc = np.array([l[3] for l in results])
240 | qt_dic_list = [l[4] for l in results]
241 |
242 | def draw_qt_acc(target_key_list, figname):
243 | fig = plt.figure()
244 | for k in target_key_list:
245 | print k,type(k)
246 | t_val = np.array([ qt_dic[k] for qt_dic in qt_dic_list])
247 | plt.plot(it,t_val,label=str(k))
248 | plt.legend(fontsize='small')
249 | plt.ylim(0,100.)
250 | #plt.legend(prop={'size':6})
251 |
252 | plt.xlabel('Iterations')
253 | plt.ylabel('Accuracy on Val [%]')
254 |
255 | plt.savefig(figname,dpi=200)
256 | plt.clf()
257 | plt.close("all")
258 |
259 | if save_question_type_graphs:
260 | s_keys = sorted(qt_dic_list[0].keys())
261 | draw_qt_acc(s_keys[ 0:13]+[s_keys[31],], './ind_qt_are.png')
262 | draw_qt_acc(s_keys[13:17]+s_keys[49:], './ind_qt_how_where_who_why.png')
263 | draw_qt_acc(s_keys[17:31]+[s_keys[32],], './ind_qt_is.png')
264 | draw_qt_acc(s_keys[33:49], './ind_qt_what.png')
265 | draw_qt_acc(['what color is the','what color are the','what color is',\
266 | 'what color','what is the color of the'],'./qt_color.png')
267 | draw_qt_acc(['how many','how','how many people are',\
268 | 'how many people are in'],'./qt_number.png')
269 | draw_qt_acc(['who is','why','why is the','where is the','where are the',\
270 | 'which'],'./qt_who_why_where_which.png')
271 | draw_qt_acc(['what is the man','is the man','are they','is he',\
272 | 'is the woman','is this person','what is the woman','is the person',\
273 | 'what is the person'],'./qt_human.png')
274 |
275 |
276 |
--------------------------------------------------------------------------------
/mfh_baseline/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | import os
4 | import sys
5 | import json
6 | import re
7 | import shutil
8 | from PIL import Image
9 | from PIL import ImageFont, ImageDraw
10 |
11 | import caffe
12 | from caffe import layers as L
13 | from caffe import params as P
14 |
15 | from vqa_data_layer import VQADataProvider, VQADataProviderLayer
16 |
17 | import config
18 | sys.path.append(config.VQA_TOOLS_PATH)
19 | sys.path.append(config.VQA_EVAL_TOOLS_PATH)
20 |
21 | from vqaTools.vqa import VQA
22 | from vqaEvaluation.vqaEval import VQAEval
23 |
24 | def visualize_failures(stat_list,mode):
25 |
26 | def save_qtype(qtype_list, save_filename, mode):
27 |
28 | if mode == 'val':
29 | savepath = os.path.join('./eval', save_filename)
30 | # TODO
31 | img_pre = '/home/dhpseth/vqa/02_tools/VQA/Images/val2014'
32 | elif mode == 'test-dev':
33 | savepath = os.path.join('./test-dev', save_filename)
34 | # TODO
35 | img_pre = '/home/dhpseth/vqa/02_tools/VQA/Images/test2015'
36 | elif mode == 'test':
37 | savepath = os.path.join('./test', save_filename)
38 | # TODO
39 | img_pre = '/home/dhpseth/vqa/02_tools/VQA/Images/test2015'
40 | else:
41 | raise Exception('Unsupported mode')
42 | if os.path.exists(savepath): shutil.rmtree(savepath)
43 | if not os.path.exists(savepath): os.makedirs(savepath)
44 |
45 | for qt in qtype_list:
46 | count = 0
47 | for t_question in stat_list:
48 | #print count, t_question
49 | if count < 40/len(qtype_list):
50 | t_question_list = t_question['q_list']
51 | saveflag = False
52 | #print 'debug****************************'
53 | #print qt
54 | #print t_question_list
55 | #print t_question_list[0] == qt[0]
56 | #print t_question_list[1] == qt[1]
57 | if t_question_list[0] == qt[0] and t_question_list[1] == qt[1]:
58 | saveflag = True
59 | else:
60 | saveflag = False
61 |
62 | if saveflag == True:
63 | t_iid = t_question['iid']
64 | if mode == 'val':
65 | t_img = Image.open(os.path.join(img_pre, \
66 | 'COCO_val2014_' + str(t_iid).zfill(12) + '.jpg'))
67 | elif mode == 'test-dev' or mode == 'test':
68 | t_img = Image.open(os.path.join(img_pre, \
69 | 'COCO_test2015_' + str(t_iid).zfill(12) + '.jpg'))
70 |
71 | # for caption
72 | #print t_iid
73 | #annIds = caps.getAnnIds(t_iid)
74 | #anns = caps.loadAnns(annIds)
75 | #cap_list = [ann['caption'] for ann in anns]
76 | ans_list = t_question['ans_list']
77 | draw = ImageDraw.Draw(t_img)
78 | for i in range(len(ans_list)):
79 | try:
80 | draw.text((10,10*i), str(ans_list[i]))
81 | except:
82 | pass
83 |
84 | ans = t_question['answer']
85 | pred = t_question['pred']
86 | if ans == -1:
87 | pre = ''
88 | elif ans == pred:
89 | pre = 'correct '
90 | else:
91 | pre = 'failure '
92 | #print ' aaa ', ans, pred
93 | ans = re.sub( '/', ' ', str(ans))
94 | pred = re.sub( '/', ' ', str(pred))
95 | img_title = pre + str(' '.join(t_question_list)) + '. a_' + \
96 | str(ans) + ' p_' + str(pred) + '.png'
97 | count += 1
98 | print os.path.join(savepath,img_title)
99 | t_img.save(os.path.join(savepath,img_title))
100 |
101 | print 'saving whatis'
102 | qt_color_list = [['what','color']]
103 | save_qtype(qt_color_list, 'colors', mode)
104 |
105 | print 'saving whatis'
106 | qt_whatis_list = [['what','is'],['what','kind'],['what','are']]
107 | save_qtype(qt_whatis_list, 'whatis', mode)
108 |
109 | print 'saving is'
110 | qt_is_list = [['is','the'], ['is','this'],['is','there']]
111 | save_qtype(qt_is_list, 'is', mode)
112 |
113 | print 'saving how many'
114 | qt_howmany_list =[['how','many']]
115 | save_qtype(qt_howmany_list, 'howmany', mode)
116 |
117 | def exec_validation(device_id, mode, model_name, folder, it='', visualize=False):
118 |
119 | caffe.set_device(device_id)
120 | caffe.set_mode_gpu()
121 | net = caffe.Net('./%s/proto_test.prototxt'%folder,model_name,caffe.TEST)
122 |
123 | dp = VQADataProvider(mode=mode,batchsize=config.VAL_BATCH_SIZE,folder=folder)
124 | total_questions = len(dp.getQuesIds())
125 | epoch = 0
126 |
127 | pred_list = []
128 | testloss_list = []
129 | stat_list = []
130 |
131 | while epoch == 0:
132 | t_word, t_cont, t_img_feature, t_answer, t_qid_list, t_iid_list, epoch = dp.get_batch_vec()
133 | net.blobs['data'].data[...] = np.transpose(t_word,(1,0))
134 | net.blobs['cont'].data[...] = np.transpose(t_cont,(1,0))
135 | net.blobs['img_feature'].data[...] = t_img_feature
136 | net.blobs['label'].data[...] = t_answer
137 | net.forward()
138 | t_pred_list = net.blobs['prediction'].data.argmax(axis=1)
139 | t_pred_str = [dp.vec_to_answer(pred_symbol) for pred_symbol in t_pred_list]
140 | testloss_list.append(net.blobs['loss'].data)
141 | for qid, iid, ans, pred in zip(t_qid_list, t_iid_list, t_answer.tolist(), t_pred_str):
142 | #pred_list.append({u'answer':pred, u'question_id': int(dp.getStrippedQuesId(qid))})
143 | pred_list.append((pred,int(dp.getStrippedQuesId(qid))))
144 | if visualize:
145 | q_list = dp.seq_to_list(dp.getQuesStr(qid))
146 | if mode == 'test-dev' or mode == 'test':
147 | ans_str = ''
148 | ans_list = ['']*10
149 | else:
150 | ans_str = dp.vec_to_answer(ans)
151 | ans_list = [ dp.getAnsObj(qid)[i]['answer'] for i in xrange(10)]
152 | stat_list.append({\
153 | 'qid' : qid,
154 | 'q_list' : q_list,
155 | 'iid' : iid,
156 | 'answer': ans_str,
157 | 'ans_list': ans_list,
158 | 'pred' : pred })
159 | percent = 100 * float(len(pred_list)) / total_questions
160 | sys.stdout.write('\r' + ('%.2f' % percent) + '%')
161 | sys.stdout.flush()
162 |
163 |
164 | print 'Deduping arr of len', len(pred_list)
165 | deduped = []
166 | seen = set()
167 | for ans, qid in pred_list:
168 | if qid not in seen:
169 | seen.add(qid)
170 | deduped.append((ans, qid))
171 | print 'New len', len(deduped)
172 | final_list=[]
173 | for ans,qid in deduped:
174 | final_list.append({u'answer': ans, u'question_id': qid})
175 |
176 | mean_testloss = np.array(testloss_list).mean()
177 |
178 | if mode == 'val':
179 | valFile = './%s/val2015_resfile'%folder
180 | with open(valFile, 'w') as f:
181 | json.dump(final_list, f)
182 | if visualize:
183 | visualize_failures(stat_list,mode)
184 | annFile = config.DATA_PATHS['val']['ans_file']
185 | quesFile = config.DATA_PATHS['val']['ques_file']
186 | vqa = VQA(annFile, quesFile)
187 | vqaRes = vqa.loadRes(valFile, quesFile)
188 | vqaEval = VQAEval(vqa, vqaRes, n=2)
189 | vqaEval.evaluate()
190 | acc_overall = vqaEval.accuracy['overall']
191 | acc_perQuestionType = vqaEval.accuracy['perQuestionType']
192 | acc_perAnswerType = vqaEval.accuracy['perAnswerType']
193 | return mean_testloss, acc_overall, acc_perQuestionType, acc_perAnswerType
194 | elif mode == 'test-dev':
195 | filename = './%s/vqa_OpenEnded_mscoco_test-dev2015_%s-'%(folder,folder)+str(it).zfill(8)+'_results'
196 | with open(filename+'.json', 'w') as f:
197 | json.dump(final_list, f)
198 | if visualize:
199 | visualize_failures(stat_list,mode)
200 | elif mode == 'test':
201 | filename = './%s/vqa_OpenEnded_mscoco_test2015_%s-'%(folder,folder)+str(it).zfill(8)+'_results'
202 | with open(filename+'.json', 'w') as f:
203 | json.dump(final_list, f)
204 | if visualize:
205 | visualize_failures(stat_list,mode)
206 | def drawgraph(results, folder,k,d,prefix='std',save_question_type_graphs=False):
207 | # 0:it
208 | # 1:trainloss
209 | # 2:testloss
210 | # 3:oa_acc
211 | # 4:qt_acc
212 | # 5:at_acc
213 |
214 | # training curve
215 | it = np.array([l[0] for l in results])
216 | loss = np.array([l[1] for l in results])
217 | valloss = np.array([l[2] for l in results])
218 | valacc = np.array([l[3] for l in results])
219 |
220 | fig = plt.figure()
221 | ax1 = fig.add_subplot(111)
222 | ax2 = ax1.twinx()
223 |
224 | ax1.plot(it,loss, color='blue', label='train loss')
225 | ax1.plot(it,valloss, '--', color='blue', label='test loss')
226 | ax2.plot(it,valacc, color='red', label='acc on val')
227 | plt.legend(loc='lower left')
228 |
229 | ax1.set_xlabel('Iterations')
230 | ax1.set_ylabel('Loss Value')
231 | ax2.set_ylabel('Accuracy on Val [%]')
232 |
233 | plt.savefig('./%s/result_it_%d_acc_%2.2f_k_%d_d_%d_%s.png'%(folder,it[-1],valacc[-1],k,d,prefix))
234 | plt.clf()
235 | plt.close("all")
236 |
237 | # question type
238 | it = np.array([l[0] for l in results])
239 | oa_acc = np.array([l[3] for l in results])
240 | qt_dic_list = [l[4] for l in results]
241 |
242 | def draw_qt_acc(target_key_list, figname):
243 | fig = plt.figure()
244 | for k in target_key_list:
245 | print k,type(k)
246 | t_val = np.array([ qt_dic[k] for qt_dic in qt_dic_list])
247 | plt.plot(it,t_val,label=str(k))
248 | plt.legend(fontsize='small')
249 | plt.ylim(0,100.)
250 | #plt.legend(prop={'size':6})
251 |
252 | plt.xlabel('Iterations')
253 | plt.ylabel('Accuracy on Val [%]')
254 |
255 | plt.savefig(figname,dpi=200)
256 | plt.clf()
257 | plt.close("all")
258 |
259 | if save_question_type_graphs:
260 | s_keys = sorted(qt_dic_list[0].keys())
261 | draw_qt_acc(s_keys[ 0:13]+[s_keys[31],], './ind_qt_are.png')
262 | draw_qt_acc(s_keys[13:17]+s_keys[49:], './ind_qt_how_where_who_why.png')
263 | draw_qt_acc(s_keys[17:31]+[s_keys[32],], './ind_qt_is.png')
264 | draw_qt_acc(s_keys[33:49], './ind_qt_what.png')
265 | draw_qt_acc(['what color is the','what color are the','what color is',\
266 | 'what color','what is the color of the'],'./qt_color.png')
267 | draw_qt_acc(['how many','how','how many people are',\
268 | 'how many people are in'],'./qt_number.png')
269 | draw_qt_acc(['who is','why','why is the','where is the','where are the',\
270 | 'which'],'./qt_who_why_where_which.png')
271 | draw_qt_acc(['what is the man','is the man','are they','is he',\
272 | 'is the woman','is this person','what is the woman','is the person',\
273 | 'what is the person'],'./qt_human.png')
274 |
275 |
276 |
--------------------------------------------------------------------------------
/mfh_baseline/train_mfh_baseline.py:
--------------------------------------------------------------------------------
1 | import matplotlib
2 | matplotlib.use('Agg')
3 | import os
4 | import sys
5 | import numpy as np
6 | import json
7 | import matplotlib.pyplot as plt
8 |
9 | import caffe
10 | from caffe import layers as L
11 | from caffe import params as P
12 | from caffe.proto import caffe_pb2
13 |
14 | from vqa_data_layer_kld import VQADataProvider
15 | from utils import exec_validation, drawgraph
16 | import config
17 | import time
18 |
19 | def get_solver(folder):
20 | s = caffe_pb2.SolverParameter()
21 | s.train_net = './%s/proto_train.prototxt'%folder
22 | s.snapshot = 10000
23 | s.snapshot_prefix = './%s/'%folder
24 | s.max_iter = int(config.MAX_ITERATIONS)
25 | s.display = int(config.VALIDATE_INTERVAL)
26 | s.type = 'Adam'
27 | s.stepsize = int(config.MAX_ITERATIONS*0.2)
28 | s.gamma = 0.5
29 | s.lr_policy = "step"
30 | s.base_lr = 0.0007
31 | s.momentum = 0.9
32 | s.momentum2 = 0.999
33 | s.weight_decay = 0.000
34 | s.clip_gradients = 10
35 | return s
36 |
37 | def get_auxiliary_json():
38 | aux = {}
39 | aux["batch_size"] = int(config.VAL_BATCH_SIZE)
40 | aux["data_shape"] = [2048]
41 | aux["img_feature_prefix"] = config.DATA_PATHS['test']['features_prefix']
42 | aux["glove"] = False
43 | return aux
44 |
45 |
46 | def mfh_baseline(mode, batchsize, T, question_vocab_size, folder):
47 | n = caffe.NetSpec()
48 | mode_str = json.dumps({'mode':mode, 'batchsize':batchsize,'folder':folder})
49 | if mode == 'val':
50 | n.data, n.cont, n.img_feature, n.label = L.Python( \
51 | module='vqa_data_layer', layer='VQADataProviderLayer', \
52 | param_str=mode_str, ntop=4 )
53 | else:
54 | n.data, n.cont, n.img_feature, n.label = L.Python(\
55 | module='vqa_data_layer_kld', layer='VQADataProviderLayer', \
56 | param_str=mode_str, ntop=4 )
57 | n.embed = L.Embed(n.data, input_dim=question_vocab_size, num_output=300, \
58 | weight_filler=dict(type='xavier'))
59 | n.embed_tanh = L.TanH(n.embed)
60 |
61 | # LSTM
62 | n.lstm1 = L.LSTM(\
63 | n.embed_tanh, n.cont,\
64 | recurrent_param=dict(\
65 | num_output=config.LSTM_UNIT_NUM,\
66 | weight_filler=dict(type='xavier')))
67 | tops1 = L.Slice(n.lstm1, ntop=config.MAX_WORDS_IN_QUESTION, slice_param={'axis':0})
68 | for i in xrange(config.MAX_WORDS_IN_QUESTION-1):
69 | n.__setattr__('slice_first'+str(i), tops1[int(i)])
70 | n.__setattr__('silence_data_first'+str(i), L.Silence(tops1[int(i)],ntop=0))
71 | n.lstm1_out = tops1[config.MAX_WORDS_IN_QUESTION-1]
72 | n.lstm1_reshaped = L.Reshape(n.lstm1_out,\
73 | reshape_param=dict(\
74 | shape=dict(dim=[-1,1024])))
75 | n.q_feat = L.Dropout(n.lstm1_reshaped,dropout_param={'dropout_ratio':config.LSTM_DROPOUT_RATIO})
76 |
77 | '''
78 | Coarse Image-Question MFH fusion
79 | '''
80 |
81 | n.mfb_q_o2_proj = L.InnerProduct(n.q_feat, num_output=config.JOINT_EMB_SIZE,
82 | weight_filler=dict(type='xavier'))
83 | n.mfb_i_o2_proj = L.InnerProduct(n.img_feature, num_output=config.JOINT_EMB_SIZE,
84 | weight_filler=dict(type='xavier'))
85 | n.mfb_iq_o2_eltwise = L.Eltwise(n.mfb_q_o2_proj, n.mfb_i_o2_proj, eltwise_param=dict(operation=0))  # operation=0 is PROD: element-wise product
86 | n.mfb_iq_o2_drop = L.Dropout(n.mfb_iq_o2_eltwise, dropout_param={'dropout_ratio':config.MFB_DROPOUT_RATIO})
87 | n.mfb_iq_o2_resh = L.Reshape(n.mfb_iq_o2_drop, reshape_param=dict(shape=dict(dim=[-1,1,config.MFB_OUT_DIM,config.MFB_FACTOR_NUM])))
88 | n.mfb_iq_o2_sumpool = L.Pooling(n.mfb_iq_o2_resh, pool=P.Pooling.SUM, \
89 | pooling_param=dict(kernel_w=config.MFB_FACTOR_NUM, kernel_h=1))
90 | n.mfb_o2_out = L.Reshape(n.mfb_iq_o2_sumpool,\
91 | reshape_param=dict(shape=dict(dim=[-1,config.MFB_OUT_DIM])))
92 | n.mfb_o2_sign_sqrt = L.SignedSqrt(n.mfb_o2_out)
93 | n.mfb_o2_l2 = L.L2Normalize(n.mfb_o2_sign_sqrt)
94 |
95 | n.mfb_q_o3_proj = L.InnerProduct(n.q_feat, num_output=config.JOINT_EMB_SIZE,
96 | weight_filler=dict(type='xavier'))
97 | n.mfb_i_o3_proj = L.InnerProduct(n.img_feature, num_output=config.JOINT_EMB_SIZE,
98 | weight_filler=dict(type='xavier'))
99 | n.mfb_iq_o3_eltwise = L.Eltwise(n.mfb_q_o3_proj, n.mfb_i_o3_proj, n.mfb_iq_o2_drop, eltwise_param=dict(operation=0))  # PROD; chains in the order-2 joint term
100 | n.mfb_iq_o3_drop = L.Dropout(n.mfb_iq_o3_eltwise, dropout_param={'dropout_ratio':config.MFB_DROPOUT_RATIO})
101 | n.mfb_iq_o3_resh = L.Reshape(n.mfb_iq_o3_drop, reshape_param=dict(shape=dict(dim=[-1,1,config.MFB_OUT_DIM,config.MFB_FACTOR_NUM])))
102 | n.mfb_iq_o3_sumpool = L.Pooling(n.mfb_iq_o3_resh, pool=P.Pooling.SUM, \
103 | pooling_param=dict(kernel_w=config.MFB_FACTOR_NUM, kernel_h=1))
104 | n.mfb_o3_out = L.Reshape(n.mfb_iq_o3_sumpool,\
105 | reshape_param=dict(shape=dict(dim=[-1,config.MFB_OUT_DIM])))
106 | n.mfb_o3_sign_sqrt = L.SignedSqrt(n.mfb_o3_out)
107 | n.mfb_o3_l2 = L.L2Normalize(n.mfb_o3_sign_sqrt)
108 |
109 | n.mfb_o23_l2 = L.Concat(n.mfb_o2_l2,n.mfb_o3_l2)
110 |
111 | n.prediction = L.InnerProduct(n.mfb_o23_l2, num_output=config.NUM_OUTPUT_UNITS,
112 | weight_filler=dict(type='xavier'))
113 |
114 | if mode == 'val':
115 | n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
116 | else:
117 | n.loss = L.SoftmaxKLDLoss(n.prediction, n.label)
118 | return n.to_proto()
119 |
120 | def make_answer_vocab(adic, vocab_size):
121 | """
122 | Returns a dictionary that maps words to indices.
123 | """
124 | adict = {'':0}
125 | nadict = {'':1000000}
126 | vid = 1
127 | for qid in adic.keys():
128 | answer_obj = adic[qid]
129 | answer_list = [ans['answer'] for ans in answer_obj]
130 |
131 | for q_ans in answer_list:
132 | # create dict
133 | if adict.has_key(q_ans):
134 | nadict[q_ans] += 1
135 | else:
136 | nadict[q_ans] = 1
137 | adict[q_ans] = vid
138 | vid +=1
139 |
140 | # debug
141 | nalist = []
142 | for k,v in sorted(nadict.items(), key=lambda x:x[1]):
143 | nalist.append((k,v))
144 |
145 | # keep only the vocab_size most frequent answers, remove the rest
146 | n_del_ans = 0
147 | n_valid_ans = 0
148 | adict_nid = {}
149 | for i, w in enumerate(nalist[:-vocab_size]):
150 | del adict[w[0]]
151 | n_del_ans += w[1]
152 | for i, w in enumerate(nalist[-vocab_size:]):
153 | n_valid_ans += w[1]
154 | adict_nid[w[0]] = i
155 |
156 | return adict_nid
157 |
158 | def make_question_vocab(qdic):
159 | """
160 | Returns a dictionary that maps words to indices.
161 | """
162 | vdict = {'':0}
163 | vid = 1
164 | for qid in qdic.keys():
165 | # sequence to list
166 | q_str = qdic[qid]['qstr']
167 | q_list = VQADataProvider.seq_to_list(q_str)
168 |
169 | # create dict
170 | for w in q_list:
171 | if not vdict.has_key(w):
172 | vdict[w] = vid
173 | vid +=1
174 |
175 | return vdict
176 |
177 | def make_vocab_files():
178 | """
179 | Produce the question and answer vocabulary files.
180 | """
181 | print 'making question vocab...', config.QUESTION_VOCAB_SPACE
182 | qdic, _ = VQADataProvider.load_data(config.QUESTION_VOCAB_SPACE)
183 | question_vocab = make_question_vocab(qdic)
184 | print 'making answer vocab...', config.ANSWER_VOCAB_SPACE
185 | _, adic = VQADataProvider.load_data(config.ANSWER_VOCAB_SPACE)
186 | answer_vocab = make_answer_vocab(adic, config.NUM_OUTPUT_UNITS)
187 | return question_vocab, answer_vocab
188 |
189 | def main():
190 | folder = 'mfh_baseline_%s'%(config.TRAIN_DATA_SPLITS)
191 | if not os.path.exists('./%s'%folder):
192 | os.makedirs('./%s'%folder)
193 |
194 | question_vocab, answer_vocab = {}, {}
195 | if os.path.exists('./%s/vdict.json'%folder) and os.path.exists('./%s/adict.json'%folder):
196 | print 'restoring vocab'
197 | with open('./%s/vdict.json'%folder,'r') as f:
198 | question_vocab = json.load(f)
199 | with open('./%s/adict.json'%folder,'r') as f:
200 | answer_vocab = json.load(f)
201 | else:
202 | question_vocab, answer_vocab = make_vocab_files()
203 | with open('./%s/vdict.json'%folder,'w') as f:
204 | json.dump(question_vocab, f)
205 | with open('./%s/adict.json'%folder,'w') as f:
206 | json.dump(answer_vocab, f)
207 |
208 | print 'question vocab size:', len(question_vocab)
209 | print 'answer vocab size:', len(answer_vocab)
210 |
211 | with open('./%s/proto_train.prototxt'%folder, 'w') as f:
212 | f.write(str(mfh_baseline(config.TRAIN_DATA_SPLITS, config.BATCH_SIZE, \
213 | config.MAX_WORDS_IN_QUESTION, len(question_vocab), folder)))
214 |
215 | with open('./%s/proto_test.prototxt'%folder, 'w') as f:
216 | f.write(str(mfh_baseline('val', config.VAL_BATCH_SIZE, \
217 | config.MAX_WORDS_IN_QUESTION, len(question_vocab), folder)))
218 |
219 | with open('./%s/solver.prototxt'%folder, 'w') as f:
220 | f.write(str(get_solver(folder)))
221 | with open('./%s/auxiliary.json'%folder, 'w') as f:
222 | json.dump(get_auxiliary_json(),f, indent=2)
223 |
224 | caffe.set_device(config.TRAIN_GPU_ID)
225 | caffe.set_mode_gpu()
226 | solver = caffe.get_solver('./%s/solver.prototxt'%folder)
227 |
228 | train_loss = np.zeros(config.MAX_ITERATIONS+1)
229 | results = []
230 |
231 | if config.RESTORE_ITER:
232 | restore_iter = config.RESTORE_ITER
233 | solver.restore('./%s/_iter_%d.solverstate'%(folder,restore_iter))
234 | else:
235 | restore_iter = 0
236 |
237 | start = time.clock()
238 | for it in range(restore_iter, config.MAX_ITERATIONS+1):
239 | solver.step(1)
240 |
241 | # store the train loss
242 | train_loss[it] = solver.net.blobs['loss'].data
243 |
244 | if it % config.PRINT_INTERVAL == 0 and it != 0:
245 | elapsed = (time.clock() - start)
246 | print 'Iteration:', it
247 | c_mean_loss = train_loss[it-config.PRINT_INTERVAL:it].mean()
248 | print 'Train loss:', c_mean_loss, ' Elapsed seconds:', elapsed
249 | start = time.clock()
250 | if it % config.VALIDATE_INTERVAL == 0 and it != restore_iter:
251 | model_name = './%s/tmp.caffemodel'%(folder)
252 | solver.net.save(model_name)
253 | print 'Validating...'
254 | '''
255 |         # for the test-dev/test set, the result json file will be generated under the folder
256 | exec_validation(config.TEST_GPU_ID, 'test-dev', model_name, it=it, folder=folder)
257 | caffe.set_device(config.TRAIN_GPU_ID)
258 | '''
259 |         # for the val set, the accuracy will be computed and plotted
260 | test_loss, acc_overall, acc_per_ques, acc_per_ans = exec_validation(config.TEST_GPU_ID, 'val', model_name, it=it, folder=folder)
261 | caffe.set_device(config.TRAIN_GPU_ID)
262 | print 'Test loss:', test_loss
263 | print 'Accuracy:', acc_overall
264 |         print 'Accuracy per answer type:', acc_per_ans
265 | results.append([it, c_mean_loss, test_loss, acc_overall, acc_per_ques, acc_per_ans])
266 | best_result_idx = np.array([x[3] for x in results]).argmax()
267 | print 'Best accuracy of', results[best_result_idx][3], 'was at iteration', results[best_result_idx][0]
268 | drawgraph(results,folder,config.MFB_FACTOR_NUM,config.MFB_OUT_DIM,prefix='mfh_baseline')
269 |
270 | if __name__ == '__main__':
271 | main()
272 |
--------------------------------------------------------------------------------
/eval/ensemble.py:
--------------------------------------------------------------------------------
1 | """
2 | Generates predictions on test-dev or test using an ensemble of nets. The
3 | ensemble is produced using the average of the pre-softmax output from each net.
4 |
5 | Place each model in its own folder. The folder must contain:
6 |
7 | - The .caffemodel file
8 | - proto_test.prototxt
9 | - adict.json
10 | - vdict.json
11 | - auxiliary.json
12 |
13 | auxiliary.json should contain the following keys:
14 |
15 | - batch_size (value should be integer)
16 | - data_shape (value should be array of integer)
17 | - img_feature_prefix (value should be string)
18 | - spatial_coord (value should be boolean)
19 | - glove (value should be boolean)
20 |
21 | If the folder also contains a cached prediction file (preds_val.pkl, preds_test_dev.pkl,
   | or preds_test.pkl, matching the question file), evaluation is skipped for that network.
22 |
23 | """
24 |
25 | import caffe
26 | import numpy as np
27 | import cPickle
28 | import argparse, os, glob
29 | import sys
30 | import json
31 | from collections import defaultdict
32 | import vqa_data_layer
33 | from vqa_data_layer import LoadVQADataProvider
34 |
35 | def verify_all(folder_paths):
36 | """
37 | Calls verify_one on each folder path. Also checks to make sure all the
38 | answer vocabularies are the same.
39 | """
40 | adict_paths = []
41 | for folder_path in folder_paths:
42 | paths = verify_one(folder_path)
43 | adict_paths.append(paths[2])
44 | adicts = []
45 | for path in adict_paths:
46 | with open(path, 'r') as f:
47 | adict = json.load(f)
48 | adicts.append(adict)
49 | if len(adicts) > 1:
50 | for a2 in adicts[1:]:
51 | if set(adicts[0].keys()) != set(a2.keys()):
52 | print set(adicts[0].keys()) - set(a2.keys())
53 | print set(a2.keys()) - set(adicts[0].keys())
54 | raise Exception('Answer vocab mismatch')
55 | return adicts
56 |
57 | def verify_one(folder_path):
58 | """
59 | Makes sure all the required files exist in the folder. If so, returns the
60 | paths to all the files.
61 | """
62 | model_path = glob.glob(folder_path + '/tmp*.caffemodel')
63 | print model_path
64 | assert len(model_path) == 1, 'one .caffemodel per folder, please'
65 | model_path = model_path[0]
66 | proto_path = folder_path + '/proto_test.prototxt'
67 | adict_path = folder_path + '/adict.json'
68 | vdict_path = folder_path + '/vdict.json'
69 | aux_path = folder_path + '/auxiliary.json'
70 | assert os.path.exists(proto_path), 'proto_test.prototxt missing'
71 | assert os.path.exists(adict_path), 'adict.json missing'
72 | assert os.path.exists(vdict_path), 'vdict.json missing'
73 | assert os.path.exists(aux_path), 'auxiliary.json missing'
74 | with open(aux_path, 'r') as f:
75 | aux = json.load(f)
76 | batch_size = int(aux['batch_size'])
77 | data_shape = tuple(map(int, aux['data_shape']))
78 | img_feature_prefix = aux['img_feature_prefix']
79 | spatial_coord = aux['spatial_coord'] if 'spatial_coord' in aux else False
80 | glove = aux['glove'] if 'glove' in aux else False
81 | model_weight = float(aux['model_weight']) if 'model_weight' in aux else 1.0
82 | #print 'weight: ', model_weight
83 | return model_path, proto_path, adict_path, vdict_path, batch_size, data_shape, img_feature_prefix, spatial_coord, glove, model_weight
84 |
85 | def get_pkl_fname(ques_file):
86 | if '_val2014_' in ques_file:
87 | return '/preds_val.pkl'
88 | elif '_test-dev2015_' in ques_file:
89 | return '/preds_test_dev.pkl'
90 | elif '_test2015_' in ques_file:
91 | return '/preds_test.pkl'
92 | else:
93 | raise NotImplementedError
94 |
95 | def eval_one(folder_path, gpuid, ques_file):
96 | """
97 | Evaluates a single model (in folder_path) on the questions in ques_file.
98 | Returns an array of (QID, answer vector) tuples.
99 | """
100 |
101 | model_path, proto_path, adict_path, vdict_path, batch_size, data_shape, \
102 | img_feature_prefix, spatial_coord, glove, model_weight = verify_one(folder_path)
103 |
104 | dp = LoadVQADataProvider(ques_file, img_feature_prefix, vdict_path, \
105 | adict_path, mode='test', batchsize=batch_size, data_shape=data_shape)
106 | total_questions = len(dp.getQuesIds())
107 | print total_questions, 'total questions'
108 |
109 | if os.path.exists(folder_path + get_pkl_fname(ques_file)):
110 | print 'Found existing prediction file, trying to load...'
111 |         with open(folder_path + get_pkl_fname(ques_file), 'rb') as f:
112 | preds = cPickle.load(f)
113 | if len(preds) >= total_questions:
114 | print 'Loaded.'
115 | return preds
116 | else:
117 | print 'Number of saved answers does not match number of questions, continuing...'
118 |
119 | caffe.set_device(gpuid)
120 | caffe.set_mode_gpu()
121 |
122 | vqa_data_layer.CURRENT_DATA_SHAPE = data_shape # This is a huge hack
123 | vqa_data_layer.SPATIAL_COORD = spatial_coord
124 | vqa_data_layer.GLOVE = glove
125 |
126 | net = caffe.Net(proto_path, model_path, caffe.TEST)
127 |
128 | print 'Model loaded:', model_path
129 | print 'Image feature prefix:', img_feature_prefix
130 | sys.stdout.flush()
131 |
132 |
133 | pred_layers = []
134 |
135 | epoch = 0
136 | while epoch == 0:
137 | t_word, t_cont, t_img_feature, t_answer, t_glove_matrix, t_qid_list, _, epoch = dp.get_batch_vec()
138 | net.blobs['data'].data[...] = np.transpose(t_word,(1,0))
139 | net.blobs['cont'].data[...] = np.transpose(t_cont,(1,0))
140 | net.blobs['img_feature'].data[...] = t_img_feature
141 | net.blobs['label'].data[...] = t_answer # dummy
142 | if glove:
143 | net.blobs['glove'].data[...] = np.transpose(t_glove_matrix, (1,0,2))
144 | net.forward()
145 | ans_matrix = net.blobs['prediction'].data
146 |
147 | for i in range(len(t_qid_list)):
148 | qid = t_qid_list[i]
149 | pred_layers.append((qid, np.copy(model_weight * ans_matrix[i]))) # model_weight * answer_matrix
150 |
151 | percent = 100 * float(len(pred_layers)) / total_questions
152 | sys.stdout.write('\r' + ('%.2f' % percent) + '%')
153 | sys.stdout.flush()
154 |
155 | #print 'Saving predictions...'
156 | #with open(folder_path + get_pkl_fname(ques_file), 'w') as f:
157 | # cPickle.dump(pred_layers, f, protocol=-1)
158 | #print 'Saved.'
159 | return pred_layers
160 |
161 | def make_rev_adict(adict):
162 | """
163 | An adict maps text answers to neuron indices. A reverse adict maps neuron
164 | indices to text answers.
165 | """
166 | rev_adict = {}
167 | for k,v in adict.items():
168 | rev_adict[v] = k
169 | return rev_adict
170 |
171 | def softmax(arr):
172 |     e = np.exp(arr - np.max(arr))  # subtract the max for numerical stability
173 | dist = e / np.sum(e)
174 | return dist
175 |
176 | def get_qid_valid_answer_dict(ques_file, adict):
177 | """
178 | Returns a dictionary mapping question IDs to valid neuron indices.
179 | """
180 | print 'Multiple choice mode: making valid answer dictionary...'
181 | valid_answer_dict = {}
182 | with open(ques_file, 'r') as f:
183 | qdata = json.load(f)
184 | for q in qdata['questions']:
185 | valid_answer_dict[q['question_id']] = q['multiple_choices']
186 | for qid in valid_answer_dict:
187 | answers = valid_answer_dict[qid]
188 | valid_indices = []
189 | for answer in answers:
190 | if answer in adict:
191 | valid_indices.append(adict[answer])
192 | if len(valid_indices) == 0:
193 | print "we won't be able to answer qid", qid
194 | valid_answer_dict[qid] = valid_indices
195 | return valid_answer_dict
196 |
197 | def dedupe(arr):
198 | print 'Deduping arr of len', len(arr)
199 | deduped = []
200 | seen = set()
201 | for qid, pred in arr:
202 | if qid not in seen:
203 | seen.add(qid)
204 | deduped.append((qid, pred))
205 | print 'New len', len(deduped)
206 | return deduped
207 |
208 | def reorder_one(predictions, this_adict, canonical_adict):
209 | index_map = {}
210 | for idx, word in make_rev_adict(this_adict).iteritems():
211 | index_map[int(idx)] = int(canonical_adict[word])
212 | index_array = np.zeros(len(index_map), dtype=int)
213 | for src_idx, dest_idx in index_map.iteritems():
214 |         index_array[dest_idx] = src_idx  # gather indices: canonical position <- this model's position
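    # Worked example (hypothetical vocabularies): with this_adict = {'yes': 0, 'no': 1,
    # 'maybe': 2} and canonical_adict = {'no': 0, 'maybe': 1, 'yes': 2}, index_map is
    # {0: 2, 1: 0, 2: 1} and index_array becomes [1, 2, 0], so output[index_array]
    # maps [s_yes, s_no, s_maybe] to the canonical [s_no, s_maybe, s_yes].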
215 | reordered = []
216 | for qid, output in predictions:
217 | reordered.append((qid, np.copy(output[index_array])))
218 | return reordered
219 |
220 | def reorder_predictions(predictions, adicts):
221 | """
222 | Reorders prediction matrices so that the unit order matches that of the
223 | first answer dictionary.
224 | """
225 | if len(adicts) == 1:
226 | return predictions
227 | need_to_reorder = False
228 | for a2 in adicts[1:]:
229 | if adicts[0] != a2:
230 | need_to_reorder = True
231 | print 'Reordering...' if need_to_reorder else 'No need to reorder!'
232 | if not need_to_reorder:
233 | return predictions
234 |     reordered = [predictions[0]]  # the first model already uses the canonical order
235 | for i in range(1, len(adicts)):
236 | if adicts[0] != adicts[i]:
237 | reordered.append(reorder_one(predictions[i], adicts[i], adicts[0]))
238 | else:
239 | reordered.append(predictions[i])
240 | return reordered
241 |
242 | def average_outputs(arr_of_arr, rev_adict, qid_valid_answer_dict):
243 | """
244 | Given a list of lists, where each list contains (QID, answer vector) tuples,
245 | returns a single dictionary which maps a question ID to the text answer.
246 | """
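    # e.g. each net contributes (qid, logit vector) pairs; vectors sharing a qid are
    # stacked and averaged, softmaxed, optionally masked to the multiple-choice options,
    # and finally argmax'd back to an answer string through rev_adict.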
247 | print 'Averaging outputs...'
248 | merged = defaultdict(list)
249 | for arr in arr_of_arr:
250 | for qid, ans_vec in arr:
251 | merged[qid].append(ans_vec)
252 |
253 | merged = {qid: softmax(np.vstack(ans_vecs).mean(axis=0)) for qid, ans_vecs in merged.iteritems()}
254 | mask_len = len(merged.values()[0])
255 |
256 | # Multiple choice filtering
257 | if qid_valid_answer_dict is not None:
258 | for qid in merged:
259 | valid_indices = qid_valid_answer_dict[qid]
260 | mask = np.zeros(mask_len)
261 | for idx in valid_indices:
262 | mask[idx] = 1
263 | merged[qid] *= mask
264 |
265 | merged = {qid: rev_adict[ans_vec.argmax()] for qid, ans_vec in merged.iteritems()}
266 |
267 | return merged
268 |
269 | def save_json(qid_ans_dict, fname):
270 | tmp = []
271 | for qid, ans in qid_ans_dict.iteritems():
272 | tmp.append({u'answer': ans, u'question_id': qid})
273 | with open(fname, 'w') as f:
274 | json.dump(tmp, f)
275 | print 'Saved to', fname
276 |
277 | def main():
278 | parser = argparse.ArgumentParser()
279 | parser.add_argument('--ques_file', required=True)
280 | parser.add_argument('--gpu', type=int, required=True)
281 | parser.add_argument('--out_file', required=True)
282 | parser.add_argument('folders', nargs='*',
283 | help='space-separated list of folders containing models')
284 | args = parser.parse_args()
285 | assert len(args.folders) > 0, 'please specify at least one folder'
286 | print 'Folders', args.folders
287 |
288 | adicts = verify_all(args.folders)
289 | print '-----------------------------------------------'
290 | qid_valid_answer_dict = None
291 | if 'MultipleChoice' in args.ques_file:
292 | qid_valid_answer_dict = get_qid_valid_answer_dict(args.ques_file, adicts[0])
293 |
294 | arr_of_arr = [eval_one(folder_path, args.gpu, args.ques_file) for folder_path in args.folders]
295 | arr_of_arr = [dedupe(x) for x in arr_of_arr]
296 | #np.save('%s.predict_arr.npz'%args.out_file,x = arr_of_arr)
297 | reordered = reorder_predictions(arr_of_arr, adicts)
298 | qid_ans_dict = average_outputs(reordered, make_rev_adict(adicts[0]), qid_valid_answer_dict)
299 | save_json(qid_ans_dict, args.out_file)
300 |
301 | if __name__ == '__main__':
302 | main()
303 |
--------------------------------------------------------------------------------
/mfb_coatt_glove/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | import os
4 | import sys
5 | import json
6 | import re
7 | import shutil
8 | from PIL import Image
9 | from PIL import ImageFont, ImageDraw
10 |
11 | import caffe
12 | from caffe import layers as L
13 | from caffe import params as P
14 |
15 | from vqa_data_layer import VQADataProvider, VQADataProviderLayer
16 |
17 | import config
18 | sys.path.append(config.VQA_TOOLS_PATH)
19 | sys.path.append(config.VQA_EVAL_TOOLS_PATH)
20 |
21 | from vqaTools.vqa import VQA
22 | from vqaEvaluation.vqaEval import VQAEval
23 |
24 | def visualize_failures(stat_list,mode):
25 |
26 | def save_qtype(qtype_list, save_filename, mode):
27 |
28 | if mode == 'val':
29 | savepath = os.path.join('./eval', save_filename)
30 | # TODO
31 | img_pre = '/home/dhpseth/vqa/02_tools/VQA/Images/val2014'
32 | elif mode == 'test-dev':
33 | savepath = os.path.join('./test-dev', save_filename)
34 | # TODO
35 | img_pre = '/home/dhpseth/vqa/02_tools/VQA/Images/test2015'
36 | elif mode == 'test':
37 | savepath = os.path.join('./test', save_filename)
38 | # TODO
39 | img_pre = '/home/dhpseth/vqa/02_tools/VQA/Images/test2015'
40 | else:
41 | raise Exception('Unsupported mode')
42 | if os.path.exists(savepath): shutil.rmtree(savepath)
43 | if not os.path.exists(savepath): os.makedirs(savepath)
44 |
45 | for qt in qtype_list:
46 | count = 0
47 | for t_question in stat_list:
48 | #print count, t_question
49 | if count < 40/len(qtype_list):
50 | t_question_list = t_question['q_list']
51 | saveflag = False
52 | #print 'debug****************************'
53 | #print qt
54 | #print t_question_list
55 | #print t_question_list[0] == qt[0]
56 | #print t_question_list[1] == qt[1]
57 | if t_question_list[0] == qt[0] and t_question_list[1] == qt[1]:
58 | saveflag = True
59 | else:
60 | saveflag = False
61 |
62 | if saveflag == True:
63 | t_iid = t_question['iid']
64 | if mode == 'val':
65 | t_img = Image.open(os.path.join(img_pre, \
66 | 'COCO_val2014_' + str(t_iid).zfill(12) + '.jpg'))
67 |                     elif mode == 'test-dev' or mode == 'test':
68 | t_img = Image.open(os.path.join(img_pre, \
69 | 'COCO_test2015_' + str(t_iid).zfill(12) + '.jpg'))
70 |
71 | # for caption
72 | #print t_iid
73 | #annIds = caps.getAnnIds(t_iid)
74 | #anns = caps.loadAnns(annIds)
75 | #cap_list = [ann['caption'] for ann in anns]
76 | ans_list = t_question['ans_list']
77 | draw = ImageDraw.Draw(t_img)
78 | for i in range(len(ans_list)):
79 | try:
80 | draw.text((10,10*i), str(ans_list[i]))
81 | except:
82 | pass
83 |
84 | ans = t_question['answer']
85 | pred = t_question['pred']
86 | if ans == -1:
87 | pre = ''
88 | elif ans == pred:
89 | pre = 'correct '
90 | else:
91 | pre = 'failure '
92 | #print ' aaa ', ans, pred
93 | ans = re.sub( '/', ' ', str(ans))
94 | pred = re.sub( '/', ' ', str(pred))
95 | img_title = pre + str(' '.join(t_question_list)) + '. a_' + \
96 | str(ans) + ' p_' + str(pred) + '.png'
97 | count += 1
98 | print os.path.join(savepath,img_title)
99 | t_img.save(os.path.join(savepath,img_title))
100 |
101 |     print 'saving colors'
102 | qt_color_list = [['what','color']]
103 | save_qtype(qt_color_list, 'colors', mode)
104 |
105 | print 'saving whatis'
106 | qt_whatis_list = [['what','is'],['what','kind'],['what','are']]
107 | save_qtype(qt_whatis_list, 'whatis', mode)
108 |
109 | print 'saving is'
110 | qt_is_list = [['is','the'], ['is','this'],['is','there']]
111 | save_qtype(qt_is_list, 'is', mode)
112 |
113 | print 'saving how many'
114 | qt_howmany_list =[['how','many']]
115 | save_qtype(qt_howmany_list, 'howmany', mode)
116 |
117 | def exec_validation(device_id, mode, model_name, folder, it='', visualize=False):
118 |
119 | caffe.set_device(device_id)
120 | caffe.set_mode_gpu()
121 | net = caffe.Net('./%s/proto_test.prototxt'%folder,model_name,caffe.TEST)
122 |
123 | dp = VQADataProvider(mode=mode,batchsize=config.VAL_BATCH_SIZE,folder=folder)
124 | total_questions = len(dp.getQuesIds())
125 | epoch = 0
126 |
127 | pred_list = []
128 | testloss_list = []
129 | stat_list = []
130 |
131 | while epoch == 0:
132 | t_word, t_cont, t_img_feature, t_answer, t_glove_matrix, t_qid_list, t_iid_list, epoch = dp.get_batch_vec()
133 | net.blobs['data'].data[...] = np.transpose(t_word,(1,0))
134 | net.blobs['cont'].data[...] = np.transpose(t_cont,(1,0))
135 | net.blobs['img_feature'].data[...] = t_img_feature
136 | net.blobs['label'].data[...] = t_answer
137 | net.blobs['glove'].data[...] = np.transpose(t_glove_matrix, (1,0,2))
138 | net.forward()
139 | t_pred_list = net.blobs['prediction'].data.argmax(axis=1)
140 | t_pred_str = [dp.vec_to_answer(pred_symbol) for pred_symbol in t_pred_list]
141 | testloss_list.append(net.blobs['loss'].data)
142 | for qid, iid, ans, pred in zip(t_qid_list, t_iid_list, t_answer.tolist(), t_pred_str):
143 | #pred_list.append({u'answer':pred, u'question_id': int(dp.getStrippedQuesId(qid))})
144 | pred_list.append((pred,int(dp.getStrippedQuesId(qid))))
145 | if visualize:
146 | q_list = dp.seq_to_list(dp.getQuesStr(qid))
147 |                 if mode == 'test-dev' or mode == 'test':
148 | ans_str = ''
149 | ans_list = ['']*10
150 | else:
151 | ans_str = dp.vec_to_answer(ans)
152 | ans_list = [ dp.getAnsObj(qid)[i]['answer'] for i in xrange(10)]
153 | stat_list.append({\
154 | 'qid' : qid,
155 | 'q_list' : q_list,
156 | 'iid' : iid,
157 | 'answer': ans_str,
158 | 'ans_list': ans_list,
159 | 'pred' : pred })
160 | percent = 100 * float(len(pred_list)) / total_questions
161 | sys.stdout.write('\r' + ('%.2f' % percent) + '%')
162 | sys.stdout.flush()
163 |
164 |
165 | print 'Deduping arr of len', len(pred_list)
166 | deduped = []
167 | seen = set()
168 | for ans, qid in pred_list:
169 | if qid not in seen:
170 | seen.add(qid)
171 | deduped.append((ans, qid))
172 | print 'New len', len(deduped)
173 | final_list=[]
174 | for ans,qid in deduped:
175 | final_list.append({u'answer': ans, u'question_id': qid})
176 |
177 | mean_testloss = np.array(testloss_list).mean()
178 |
179 | if mode == 'val':
180 | valFile = './%s/val2015_resfile'%folder
181 | with open(valFile, 'w') as f:
182 | json.dump(final_list, f)
183 | if visualize:
184 | visualize_failures(stat_list,mode)
185 | annFile = config.DATA_PATHS['val']['ans_file']
186 | quesFile = config.DATA_PATHS['val']['ques_file']
187 | vqa = VQA(annFile, quesFile)
188 | vqaRes = vqa.loadRes(valFile, quesFile)
189 | vqaEval = VQAEval(vqa, vqaRes, n=2)
190 | vqaEval.evaluate()
191 | acc_overall = vqaEval.accuracy['overall']
192 | acc_perQuestionType = vqaEval.accuracy['perQuestionType']
193 | acc_perAnswerType = vqaEval.accuracy['perAnswerType']
194 | return mean_testloss, acc_overall, acc_perQuestionType, acc_perAnswerType
195 | elif mode == 'test-dev':
196 | filename = './%s/vqa_OpenEnded_mscoco_test-dev2015_%s-'%(folder,folder)+str(it).zfill(8)+'_results'
197 | with open(filename+'.json', 'w') as f:
198 | json.dump(final_list, f)
199 | if visualize:
200 | visualize_failures(stat_list,mode)
201 | elif mode == 'test':
202 | filename = './%s/vqa_OpenEnded_mscoco_test2015_%s-'%(folder,folder)+str(it).zfill(8)+'_results'
203 | with open(filename+'.json', 'w') as f:
204 | json.dump(final_list, f)
205 | if visualize:
206 | visualize_failures(stat_list,mode)
207 | def drawgraph(results, folder,k,d,prefix='std',save_question_type_graphs=False):
208 | # 0:it
209 | # 1:trainloss
210 | # 2:testloss
211 | # 3:oa_acc
212 | # 4:qt_acc
213 | # 5:at_acc
214 |
215 | # training curve
216 | it = np.array([l[0] for l in results])
217 | loss = np.array([l[1] for l in results])
218 | valloss = np.array([l[2] for l in results])
219 | valacc = np.array([l[3] for l in results])
220 |
221 | fig = plt.figure()
222 | ax1 = fig.add_subplot(111)
223 | ax2 = ax1.twinx()
224 |
225 | ax1.plot(it,loss, color='blue', label='train loss')
226 | ax1.plot(it,valloss, '--', color='blue', label='test loss')
227 | ax2.plot(it,valacc, color='red', label='acc on val')
228 | plt.legend(loc='lower left')
229 |
230 | ax1.set_xlabel('Iterations')
231 | ax1.set_ylabel('Loss Value')
232 | ax2.set_ylabel('Accuracy on Val [%]')
233 |
234 | plt.savefig('./%s/result_it_%d_acc_%2.2f_k_%d_d_%d_%s.png'%(folder,it[-1],valacc[-1],k,d,prefix))
235 | plt.clf()
236 | plt.close("all")
237 |
238 | # question type
239 | it = np.array([l[0] for l in results])
240 | oa_acc = np.array([l[3] for l in results])
241 | qt_dic_list = [l[4] for l in results]
242 |
243 | def draw_qt_acc(target_key_list, figname):
244 | fig = plt.figure()
245 | for k in target_key_list:
246 | print k,type(k)
247 | t_val = np.array([ qt_dic[k] for qt_dic in qt_dic_list])
248 | plt.plot(it,t_val,label=str(k))
249 | plt.legend(fontsize='small')
250 | plt.ylim(0,100.)
251 | #plt.legend(prop={'size':6})
252 |
253 | plt.xlabel('Iterations')
254 | plt.ylabel('Accuracy on Val [%]')
255 |
256 | plt.savefig(figname,dpi=200)
257 | plt.clf()
258 | plt.close("all")
259 |
260 | if save_question_type_graphs:
261 | s_keys = sorted(qt_dic_list[0].keys())
262 | draw_qt_acc(s_keys[ 0:13]+[s_keys[31],], './ind_qt_are.png')
263 | draw_qt_acc(s_keys[13:17]+s_keys[49:], './ind_qt_how_where_who_why.png')
264 | draw_qt_acc(s_keys[17:31]+[s_keys[32],], './ind_qt_is.png')
265 | draw_qt_acc(s_keys[33:49], './ind_qt_what.png')
266 | draw_qt_acc(['what color is the','what color are the','what color is',\
267 | 'what color','what is the color of the'],'./qt_color.png')
268 | draw_qt_acc(['how many','how','how many people are',\
269 | 'how many people are in'],'./qt_number.png')
270 | draw_qt_acc(['who is','why','why is the','where is the','where are the',\
271 | 'which'],'./qt_who_why_where_which.png')
272 | draw_qt_acc(['what is the man','is the man','are they','is he',\
273 | 'is the woman','is this person','what is the woman','is the person',\
274 | 'what is the person'],'./qt_human.png')
275 |
276 |
277 |
--------------------------------------------------------------------------------
/mfh_coatt_glove/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | import os
4 | import sys
5 | import json
6 | import re
7 | import shutil
8 | from PIL import Image
9 | from PIL import ImageFont, ImageDraw
10 |
11 | import caffe
12 | from caffe import layers as L
13 | from caffe import params as P
14 |
15 | from vqa_data_layer import VQADataProvider, VQADataProviderLayer
16 |
17 | import config
18 | sys.path.append(config.VQA_TOOLS_PATH)
19 | sys.path.append(config.VQA_EVAL_TOOLS_PATH)
20 |
21 | from vqaTools.vqa import VQA
22 | from vqaEvaluation.vqaEval import VQAEval
23 |
24 | def visualize_failures(stat_list,mode):
25 |
26 | def save_qtype(qtype_list, save_filename, mode):
27 |
28 | if mode == 'val':
29 | savepath = os.path.join('./eval', save_filename)
30 | # TODO
31 | img_pre = '/home/dhpseth/vqa/02_tools/VQA/Images/val2014'
32 | elif mode == 'test-dev':
33 | savepath = os.path.join('./test-dev', save_filename)
34 | # TODO
35 | img_pre = '/home/dhpseth/vqa/02_tools/VQA/Images/test2015'
36 | elif mode == 'test':
37 | savepath = os.path.join('./test', save_filename)
38 | # TODO
39 | img_pre = '/home/dhpseth/vqa/02_tools/VQA/Images/test2015'
40 | else:
41 | raise Exception('Unsupported mode')
42 | if os.path.exists(savepath): shutil.rmtree(savepath)
43 | if not os.path.exists(savepath): os.makedirs(savepath)
44 |
45 | for qt in qtype_list:
46 | count = 0
47 | for t_question in stat_list:
48 | #print count, t_question
49 | if count < 40/len(qtype_list):
50 | t_question_list = t_question['q_list']
51 | saveflag = False
52 | #print 'debug****************************'
53 | #print qt
54 | #print t_question_list
55 | #print t_question_list[0] == qt[0]
56 | #print t_question_list[1] == qt[1]
57 | if t_question_list[0] == qt[0] and t_question_list[1] == qt[1]:
58 | saveflag = True
59 | else:
60 | saveflag = False
61 |
62 | if saveflag == True:
63 | t_iid = t_question['iid']
64 | if mode == 'val':
65 | t_img = Image.open(os.path.join(img_pre, \
66 | 'COCO_val2014_' + str(t_iid).zfill(12) + '.jpg'))
67 |                     elif mode == 'test-dev' or mode == 'test':
68 | t_img = Image.open(os.path.join(img_pre, \
69 | 'COCO_test2015_' + str(t_iid).zfill(12) + '.jpg'))
70 |
71 | # for caption
72 | #print t_iid
73 | #annIds = caps.getAnnIds(t_iid)
74 | #anns = caps.loadAnns(annIds)
75 | #cap_list = [ann['caption'] for ann in anns]
76 | ans_list = t_question['ans_list']
77 | draw = ImageDraw.Draw(t_img)
78 | for i in range(len(ans_list)):
79 | try:
80 | draw.text((10,10*i), str(ans_list[i]))
81 | except:
82 | pass
83 |
84 | ans = t_question['answer']
85 | pred = t_question['pred']
86 | if ans == -1:
87 | pre = ''
88 | elif ans == pred:
89 | pre = 'correct '
90 | else:
91 | pre = 'failure '
92 | #print ' aaa ', ans, pred
93 | ans = re.sub( '/', ' ', str(ans))
94 | pred = re.sub( '/', ' ', str(pred))
95 | img_title = pre + str(' '.join(t_question_list)) + '. a_' + \
96 | str(ans) + ' p_' + str(pred) + '.png'
97 | count += 1
98 | print os.path.join(savepath,img_title)
99 | t_img.save(os.path.join(savepath,img_title))
100 |
101 |     print 'saving colors'
102 | qt_color_list = [['what','color']]
103 | save_qtype(qt_color_list, 'colors', mode)
104 |
105 | print 'saving whatis'
106 | qt_whatis_list = [['what','is'],['what','kind'],['what','are']]
107 | save_qtype(qt_whatis_list, 'whatis', mode)
108 |
109 | print 'saving is'
110 | qt_is_list = [['is','the'], ['is','this'],['is','there']]
111 | save_qtype(qt_is_list, 'is', mode)
112 |
113 | print 'saving how many'
114 | qt_howmany_list =[['how','many']]
115 | save_qtype(qt_howmany_list, 'howmany', mode)
116 |
117 | def exec_validation(device_id, mode, model_name, folder, it='', visualize=False):
118 |
119 | caffe.set_device(device_id)
120 | caffe.set_mode_gpu()
121 | net = caffe.Net('./%s/proto_test.prototxt'%folder,model_name,caffe.TEST)
122 |
123 | dp = VQADataProvider(mode=mode,batchsize=config.VAL_BATCH_SIZE,folder=folder)
124 | total_questions = len(dp.getQuesIds())
125 | epoch = 0
126 |
127 | pred_list = []
128 | testloss_list = []
129 | stat_list = []
130 |
131 | while epoch == 0:
132 | t_word, t_cont, t_img_feature, t_answer, t_glove_matrix, t_qid_list, t_iid_list, epoch = dp.get_batch_vec()
133 | net.blobs['data'].data[...] = np.transpose(t_word,(1,0))
134 | net.blobs['cont'].data[...] = np.transpose(t_cont,(1,0))
135 | net.blobs['img_feature'].data[...] = t_img_feature
136 | net.blobs['label'].data[...] = t_answer
137 | net.blobs['glove'].data[...] = np.transpose(t_glove_matrix, (1,0,2))
138 | net.forward()
139 | t_pred_list = net.blobs['prediction'].data.argmax(axis=1)
140 | t_pred_str = [dp.vec_to_answer(pred_symbol) for pred_symbol in t_pred_list]
141 | testloss_list.append(net.blobs['loss'].data)
142 | for qid, iid, ans, pred in zip(t_qid_list, t_iid_list, t_answer.tolist(), t_pred_str):
143 | #pred_list.append({u'answer':pred, u'question_id': int(dp.getStrippedQuesId(qid))})
144 | pred_list.append((pred,int(dp.getStrippedQuesId(qid))))
145 | if visualize:
146 | q_list = dp.seq_to_list(dp.getQuesStr(qid))
147 |                 if mode == 'test-dev' or mode == 'test':
148 | ans_str = ''
149 | ans_list = ['']*10
150 | else:
151 | ans_str = dp.vec_to_answer(ans)
152 | ans_list = [ dp.getAnsObj(qid)[i]['answer'] for i in xrange(10)]
153 | stat_list.append({\
154 | 'qid' : qid,
155 | 'q_list' : q_list,
156 | 'iid' : iid,
157 | 'answer': ans_str,
158 | 'ans_list': ans_list,
159 | 'pred' : pred })
160 | percent = 100 * float(len(pred_list)) / total_questions
161 | sys.stdout.write('\r' + ('%.2f' % percent) + '%')
162 | sys.stdout.flush()
163 |
164 |
165 | print 'Deduping arr of len', len(pred_list)
166 | deduped = []
167 | seen = set()
168 | for ans, qid in pred_list:
169 | if qid not in seen:
170 | seen.add(qid)
171 | deduped.append((ans, qid))
172 | print 'New len', len(deduped)
173 | final_list=[]
174 | for ans,qid in deduped:
175 | final_list.append({u'answer': ans, u'question_id': qid})
176 |
177 | mean_testloss = np.array(testloss_list).mean()
178 |
179 | if mode == 'val':
180 | valFile = './%s/val2015_resfile'%folder
181 | with open(valFile, 'w') as f:
182 | json.dump(final_list, f)
183 | if visualize:
184 | visualize_failures(stat_list,mode)
185 | annFile = config.DATA_PATHS['val']['ans_file']
186 | quesFile = config.DATA_PATHS['val']['ques_file']
187 | vqa = VQA(annFile, quesFile)
188 | vqaRes = vqa.loadRes(valFile, quesFile)
189 | vqaEval = VQAEval(vqa, vqaRes, n=2)
190 | vqaEval.evaluate()
191 | acc_overall = vqaEval.accuracy['overall']
192 | acc_perQuestionType = vqaEval.accuracy['perQuestionType']
193 | acc_perAnswerType = vqaEval.accuracy['perAnswerType']
194 | return mean_testloss, acc_overall, acc_perQuestionType, acc_perAnswerType
195 | elif mode == 'test-dev':
196 | filename = './%s/vqa_OpenEnded_mscoco_test-dev2015_%s-'%(folder,folder)+str(it).zfill(8)+'_results'
197 | with open(filename+'.json', 'w') as f:
198 | json.dump(final_list, f)
199 | if visualize:
200 | visualize_failures(stat_list,mode)
201 | elif mode == 'test':
202 | filename = './%s/vqa_OpenEnded_mscoco_test2015_%s-'%(folder,folder)+str(it).zfill(8)+'_results'
203 | with open(filename+'.json', 'w') as f:
204 | json.dump(final_list, f)
205 | if visualize:
206 | visualize_failures(stat_list,mode)
207 | def drawgraph(results, folder,k,d,prefix='std',save_question_type_graphs=False):
208 | # 0:it
209 | # 1:trainloss
210 | # 2:testloss
211 | # 3:oa_acc
212 | # 4:qt_acc
213 | # 5:at_acc
214 |
215 | # training curve
216 | it = np.array([l[0] for l in results])
217 | loss = np.array([l[1] for l in results])
218 | valloss = np.array([l[2] for l in results])
219 | valacc = np.array([l[3] for l in results])
220 |
221 | fig = plt.figure()
222 | ax1 = fig.add_subplot(111)
223 | ax2 = ax1.twinx()
224 |
225 | ax1.plot(it,loss, color='blue', label='train loss')
226 | ax1.plot(it,valloss, '--', color='blue', label='test loss')
227 | ax2.plot(it,valacc, color='red', label='acc on val')
228 | plt.legend(loc='lower left')
229 |
230 | ax1.set_xlabel('Iterations')
231 | ax1.set_ylabel('Loss Value')
232 | ax2.set_ylabel('Accuracy on Val [%]')
233 |
234 | plt.savefig('./%s/result_it_%d_acc_%2.2f_k_%d_d_%d_%s.png'%(folder,it[-1],valacc[-1],k,d,prefix))
235 | plt.clf()
236 | plt.close("all")
237 |
238 | # question type
239 | it = np.array([l[0] for l in results])
240 | oa_acc = np.array([l[3] for l in results])
241 | qt_dic_list = [l[4] for l in results]
242 |
243 | def draw_qt_acc(target_key_list, figname):
244 | fig = plt.figure()
245 | for k in target_key_list:
246 | print k,type(k)
247 | t_val = np.array([ qt_dic[k] for qt_dic in qt_dic_list])
248 | plt.plot(it,t_val,label=str(k))
249 | plt.legend(fontsize='small')
250 | plt.ylim(0,100.)
251 | #plt.legend(prop={'size':6})
252 |
253 | plt.xlabel('Iterations')
254 | plt.ylabel('Accuracy on Val [%]')
255 |
256 | plt.savefig(figname,dpi=200)
257 | plt.clf()
258 | plt.close("all")
259 |
260 | if save_question_type_graphs:
261 | s_keys = sorted(qt_dic_list[0].keys())
262 | draw_qt_acc(s_keys[ 0:13]+[s_keys[31],], './ind_qt_are.png')
263 | draw_qt_acc(s_keys[13:17]+s_keys[49:], './ind_qt_how_where_who_why.png')
264 | draw_qt_acc(s_keys[17:31]+[s_keys[32],], './ind_qt_is.png')
265 | draw_qt_acc(s_keys[33:49], './ind_qt_what.png')
266 | draw_qt_acc(['what color is the','what color are the','what color is',\
267 | 'what color','what is the color of the'],'./qt_color.png')
268 | draw_qt_acc(['how many','how','how many people are',\
269 | 'how many people are in'],'./qt_number.png')
270 | draw_qt_acc(['who is','why','why is the','where is the','where are the',\
271 | 'which'],'./qt_who_why_where_which.png')
272 | draw_qt_acc(['what is the man','is the man','are they','is he',\
273 | 'is the woman','is this person','what is the woman','is the person',\
274 | 'what is the person'],'./qt_human.png')
275 |
276 |
277 |
--------------------------------------------------------------------------------
/mfb_baseline/vqa_data_layer.py:
--------------------------------------------------------------------------------
1 | import caffe
2 | import numpy as np
3 | import re, json, random
4 | import config
5 |
6 | QID_KEY_SEPARATOR = '/'
7 | GLOVE_EMBEDDING_SIZE = 300
8 |
9 | class VQADataProvider:
10 |
11 | def __init__(self, folder='result', batchsize=64, max_length=15, mode='train'):
12 | self.batchsize = batchsize
13 | self.d_vocabulary = None
14 | self.batch_index = None
15 | self.batch_len = None
16 | self.rev_adict = None
17 | self.max_length = max_length
18 | self.mode = mode
19 | self.qdic, self.adic = VQADataProvider.load_data(mode)
20 |
21 | with open('./%s/vdict.json'%folder,'r') as f:
22 | self.vdict = json.load(f)
23 | with open('./%s/adict.json'%folder,'r') as f:
24 | self.adict = json.load(f)
25 |
26 | self.n_ans_vocabulary = len(self.adict)
27 |
28 | @staticmethod
29 | def load_vqa_json(data_split):
30 | """
31 | Parses the question and answer json files for the given data split.
32 | Returns the question dictionary and the answer dictionary.
33 | """
34 | qdic, adic = {}, {}
35 |
36 | with open(config.DATA_PATHS[data_split]['ques_file'], 'r') as f:
37 | qdata = json.load(f)['questions']
38 | for q in qdata:
39 | qdic[data_split + QID_KEY_SEPARATOR + str(q['question_id'])] = \
40 | {'qstr': q['question'], 'iid': q['image_id']}
41 |
42 | if 'test' not in data_split:
43 | with open(config.DATA_PATHS[data_split]['ans_file'], 'r') as f:
44 | adata = json.load(f)['annotations']
45 | for a in adata:
46 | adic[data_split + QID_KEY_SEPARATOR + str(a['question_id'])] = \
47 | a['answers']
48 |
49 | print 'parsed', len(qdic), 'questions for', data_split
50 | return qdic, adic
51 |
52 | @staticmethod
53 | def load_genome_json():
54 | """
55 | Parses the genome json file. Returns the question dictionary and the
56 | answer dictionary.
57 | """
58 | qdic, adic = {}, {}
59 |
60 | with open(config.DATA_PATHS['genome']['genome_file'], 'r') as f:
61 | qdata = json.load(f)
62 | for q in qdata:
63 | key = 'genome' + QID_KEY_SEPARATOR + str(q['id'])
64 | qdic[key] = {'qstr': q['question'], 'iid': q['image']}
65 | adic[key] = [{'answer': q['answer']}]
66 |
67 | print 'parsed', len(qdic), 'questions for genome'
68 | return qdic, adic
69 |
70 | @staticmethod
71 | def load_data(data_split_str):
72 | all_qdic, all_adic = {}, {}
73 | for data_split in data_split_str.split('+'):
74 | assert data_split in config.DATA_PATHS.keys(), 'unknown data split'
75 | if data_split == 'genome':
76 | qdic, adic = VQADataProvider.load_genome_json()
77 | all_qdic.update(qdic)
78 | all_adic.update(adic)
79 | else:
80 | qdic, adic = VQADataProvider.load_vqa_json(data_split)
81 | all_qdic.update(qdic)
82 | all_adic.update(adic)
83 | return all_qdic, all_adic
84 |
85 | def getQuesIds(self):
86 | return self.qdic.keys()
87 |
88 | def getStrippedQuesId(self, qid):
89 | return qid.split(QID_KEY_SEPARATOR)[1]
90 |
91 | def getImgId(self,qid):
92 | return self.qdic[qid]['iid']
93 |
94 | def getQuesStr(self,qid):
95 | return self.qdic[qid]['qstr']
96 |
97 | def getAnsObj(self,qid):
98 | if self.mode == 'test-dev' or self.mode == 'test':
99 | return -1
100 | return self.adic[qid]
101 |
102 | @staticmethod
103 | def seq_to_list(s):
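        # e.g. seq_to_list("What's the man - doing?") -> ['whats', 'the', 'man', 'doing']:
        # punctuation is stripped, '-' and '/' become spaces, empty tokens are dropped.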
104 | t_str = s.lower()
105 | for i in [r'\?',r'\!',r'\'',r'\"',r'\$',r'\:',r'\@',r'\(',r'\)',r'\,',r'\.',r'\;']:
106 | t_str = re.sub( i, '', t_str)
107 | for i in [r'\-',r'\/']:
108 | t_str = re.sub( i, ' ', t_str)
109 |         q_list = t_str.split(' ')  # '?' and upper case were already handled above
110 | q_list = filter(lambda x: len(x) > 0, q_list)
111 | return q_list
112 |
113 | def extract_answer(self,answer_obj):
114 | """ Return the most popular answer in string."""
115 | if self.mode == 'test-dev' or self.mode == 'test':
116 | return -1
117 | answer_list = [ answer_obj[i]['answer'] for i in xrange(10)]
118 | dic = {}
119 | for ans in answer_list:
120 | if dic.has_key(ans):
121 | dic[ans] +=1
122 | else:
123 | dic[ans] = 1
124 | max_key = max((v,k) for (k,v) in dic.items())[1]
125 | return max_key
126 |
127 | def extract_answer_prob(self,answer_obj):
128 | """ Return the most popular answer in string."""
129 | if self.mode == 'test-dev' or self.mode == 'test':
130 | return -1
131 |
132 | answer_list = [ ans['answer'] for ans in answer_obj]
133 | prob_answer_list = []
134 | for ans in answer_list:
135 | if self.adict.has_key(ans):
136 | prob_answer_list.append(ans)
137 |
138 | if len(prob_answer_list) == 0:
139 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test':
140 |                 return 'hoge'  # dummy answer: none of the annotator answers are in the vocabulary
141 | else:
142 | raise Exception("This should not happen.")
143 | else:
144 | return random.choice(prob_answer_list)
145 |
146 | def qlist_to_vec(self, max_length, q_list):
147 | """
148 | Converts a list of words into a format suitable for the embedding layer.
149 |
150 | Arguments:
151 | max_length -- the maximum length of a question sequence
152 | q_list -- a list of words which are the tokens in the question
153 |
154 | Returns:
155 |     qvec -- a max_length vector of vocabulary indices, one per word, left-padded with zeros
156 |     cvec -- a max_length sequence-continuation indicator (0 at padding and at the first word, 1 afterwards)
157 | """
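        # e.g. with max_length = 5 and q_list = ['what', 'is', 'this']:
        #   qvec = [0, 0, vdict['what'], vdict['is'], vdict['this']]   (left-padded)
        #   cvec = [0, 0, 0, 1, 1]   (0 marks padding and the sequence start)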
158 | qvec = np.zeros(max_length)
159 | cvec = np.zeros(max_length)
160 | for i in xrange(max_length):
161 | if i < max_length - len(q_list):
162 | cvec[i] = 0
163 | else:
164 | w = q_list[i-(max_length-len(q_list))]
165 | # is the word in the vocabulary?
166 |                 if w not in self.vdict:
167 | w = ''
168 | qvec[i] = self.vdict[w]
169 | cvec[i] = 0 if i == max_length - len(q_list) else 1
170 |
171 | return qvec, cvec
172 |
173 | def answer_to_vec(self, ans_str):
174 | """ Return answer id if the answer is included in vocabulary otherwise '' """
175 | if self.mode =='test-dev' or self.mode == 'test':
176 | return -1
177 |
178 | if self.adict.has_key(ans_str):
179 | ans = self.adict[ans_str]
180 | else:
181 | ans = self.adict['']
182 | return ans
183 |
184 | def vec_to_answer(self, ans_symbol):
185 | """ Return answer id if the answer is included in vocabulary otherwise '' """
186 | if self.rev_adict is None:
187 | rev_adict = {}
188 | for k,v in self.adict.items():
189 | rev_adict[v] = k
190 | self.rev_adict = rev_adict
191 |
192 | return self.rev_adict[ans_symbol]
193 |
194 | def create_batch(self,qid_list):
195 |
196 |         qvec = np.zeros((self.batchsize, self.max_length))
197 |         cvec = np.zeros((self.batchsize, self.max_length))
198 |         ivec = np.zeros((self.batchsize, 2048))
199 |         avec = np.zeros(self.batchsize)
200 |
201 | for i,qid in enumerate(qid_list):
202 |
203 | # load raw question information
204 | q_str = self.getQuesStr(qid)
205 | q_ans = self.getAnsObj(qid)
206 | q_iid = self.getImgId(qid)
207 |
208 | # convert question to vec
209 | q_list = VQADataProvider.seq_to_list(q_str)
210 | t_qvec, t_cvec = self.qlist_to_vec(self.max_length, q_list)
211 |
212 | try:
213 | qid_split = qid.split(QID_KEY_SEPARATOR)
214 | data_split = qid_split[0]
215 | if data_split == 'genome':
216 | t_ivec = np.load(config.DATA_PATHS['genome']['features_prefix'] + str(q_iid) + '.jpg.npz')['x']
217 | else:
218 | t_ivec = np.load(config.DATA_PATHS[data_split]['features_prefix'] + str(q_iid).zfill(12) + '.jpg.npz')['x']
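                # L2-normalize the 2048-d image feature vector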
219 | t_ivec = ( t_ivec / np.sqrt((t_ivec**2).sum()) )
220 |             except Exception:
221 |                 t_ivec = 0.
222 |                 print 'feature file not found for image id:', q_iid, self.mode
223 |
224 | # convert answer to vec
225 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test':
226 | q_ans_str = self.extract_answer(q_ans)
227 | else:
228 | q_ans_str = self.extract_answer_prob(q_ans)
229 | t_avec = self.answer_to_vec(q_ans_str)
230 |
231 | qvec[i,...] = t_qvec
232 | cvec[i,...] = t_cvec
233 | ivec[i,...] = t_ivec
234 | avec[i,...] = t_avec
235 |
236 | return qvec, cvec, ivec, avec
237 |
238 |
239 | def get_batch_vec(self):
240 | if self.batch_len is None:
241 | self.n_skipped = 0
242 | qid_list = self.getQuesIds()
243 | random.shuffle(qid_list)
244 | self.qid_list = qid_list
245 | self.batch_len = len(qid_list)
246 | self.batch_index = 0
247 | self.epoch_counter = 0
248 |
249 | def has_at_least_one_valid_answer(t_qid):
250 | answer_obj = self.getAnsObj(t_qid)
251 | answer_list = [ans['answer'] for ans in answer_obj]
252 | for ans in answer_list:
253 | if self.adict.has_key(ans):
254 | return True
255 |
256 | counter = 0
257 | t_qid_list = []
258 | t_iid_list = []
259 | while counter < self.batchsize:
260 | t_qid = self.qid_list[self.batch_index]
261 | t_iid = self.getImgId(t_qid)
262 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test':
263 | t_qid_list.append(t_qid)
264 | t_iid_list.append(t_iid)
265 | counter += 1
266 | elif has_at_least_one_valid_answer(t_qid):
267 | t_qid_list.append(t_qid)
268 | t_iid_list.append(t_iid)
269 | counter += 1
270 | else:
271 | self.n_skipped += 1
272 |
273 | if self.batch_index < self.batch_len-1:
274 | self.batch_index += 1
275 | else:
276 | self.epoch_counter += 1
277 | qid_list = self.getQuesIds()
278 | random.shuffle(qid_list)
279 | self.qid_list = qid_list
280 | self.batch_index = 0
281 | print("%d questions were skipped in a single epoch" % self.n_skipped)
282 | self.n_skipped = 0
283 |
284 | t_batch = self.create_batch(t_qid_list)
285 | return t_batch + (t_qid_list, t_iid_list, self.epoch_counter)
286 |
287 |
288 | class VQADataProviderLayer(caffe.Layer):
289 | """
290 | Provide input data for VQA.
291 | """
292 |
293 | def setup(self, bottom, top):
294 | self.batchsize = json.loads(self.param_str)['batchsize']
295 | self.top_names = ['data','cont','feature','label']
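        # top shapes: data/cont are time-major (T=15, N) for the LSTM, feature is (N, 2048), label is (N,)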
296 | top[0].reshape(15,self.batchsize)
297 | top[1].reshape(15,self.batchsize)
298 | top[2].reshape(self.batchsize,2048)
299 | top[3].reshape(self.batchsize)
300 |
301 | self.mode = json.loads(self.param_str)['mode']
302 | self.folder = json.loads(self.param_str)['folder']
303 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test':
304 | pass
305 | else:
306 | self.dp = VQADataProvider(batchsize=self.batchsize, mode=self.mode, folder=self.folder)
307 |
308 | def reshape(self, bottom, top):
309 | pass
310 |
311 | def forward(self, bottom, top):
312 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test':
313 | pass
314 | else:
315 | word, cont, feature, answer, _, _, _ = self.dp.get_batch_vec()
316 | top[0].data[...] = np.transpose(word,(1,0)) # N x T -> T x N
317 | top[1].data[...] = np.transpose(cont,(1,0))
318 | top[2].data[...] = feature
319 | top[3].data[...] = answer
320 |
321 | def backward(self, top, propagate_down, bottom):
322 | pass
323 |
324 |
--------------------------------------------------------------------------------
/mfh_baseline/vqa_data_layer.py:
--------------------------------------------------------------------------------
1 | import caffe
2 | import numpy as np
3 | import re, json, random
4 | import config
5 |
6 | QID_KEY_SEPARATOR = '/'
7 | GLOVE_EMBEDDING_SIZE = 300
8 |
9 | class VQADataProvider:
10 |
11 | def __init__(self, folder='result', batchsize=64, max_length=15, mode='train'):
12 | self.batchsize = batchsize
13 | self.d_vocabulary = None
14 | self.batch_index = None
15 | self.batch_len = None
16 | self.rev_adict = None
17 | self.max_length = max_length
18 | self.mode = mode
19 | self.qdic, self.adic = VQADataProvider.load_data(mode)
20 |
21 | with open('./%s/vdict.json'%folder,'r') as f:
22 | self.vdict = json.load(f)
23 | with open('./%s/adict.json'%folder,'r') as f:
24 | self.adict = json.load(f)
25 |
26 | self.n_ans_vocabulary = len(self.adict)
27 |
28 | @staticmethod
29 | def load_vqa_json(data_split):
30 | """
31 | Parses the question and answer json files for the given data split.
32 | Returns the question dictionary and the answer dictionary.
33 | """
34 | qdic, adic = {}, {}
35 |
36 | with open(config.DATA_PATHS[data_split]['ques_file'], 'r') as f:
37 | qdata = json.load(f)['questions']
38 | for q in qdata:
39 | qdic[data_split + QID_KEY_SEPARATOR + str(q['question_id'])] = \
40 | {'qstr': q['question'], 'iid': q['image_id']}
41 |
42 | if 'test' not in data_split:
43 | with open(config.DATA_PATHS[data_split]['ans_file'], 'r') as f:
44 | adata = json.load(f)['annotations']
45 | for a in adata:
46 | adic[data_split + QID_KEY_SEPARATOR + str(a['question_id'])] = \
47 | a['answers']
48 |
49 | print 'parsed', len(qdic), 'questions for', data_split
50 | return qdic, adic
51 |
52 | @staticmethod
53 | def load_genome_json():
54 | """
55 | Parses the genome json file. Returns the question dictionary and the
56 | answer dictionary.
57 | """
58 | qdic, adic = {}, {}
59 |
60 | with open(config.DATA_PATHS['genome']['genome_file'], 'r') as f:
61 | qdata = json.load(f)
62 | for q in qdata:
63 | key = 'genome' + QID_KEY_SEPARATOR + str(q['id'])
64 | qdic[key] = {'qstr': q['question'], 'iid': q['image']}
65 | adic[key] = [{'answer': q['answer']}]
66 |
67 | print 'parsed', len(qdic), 'questions for genome'
68 | return qdic, adic
69 |
70 | @staticmethod
71 | def load_data(data_split_str):
72 | all_qdic, all_adic = {}, {}
73 | for data_split in data_split_str.split('+'):
74 | assert data_split in config.DATA_PATHS.keys(), 'unknown data split'
75 | if data_split == 'genome':
76 | qdic, adic = VQADataProvider.load_genome_json()
77 | all_qdic.update(qdic)
78 | all_adic.update(adic)
79 | else:
80 | qdic, adic = VQADataProvider.load_vqa_json(data_split)
81 | all_qdic.update(qdic)
82 | all_adic.update(adic)
83 | return all_qdic, all_adic
84 |
85 | def getQuesIds(self):
86 | return self.qdic.keys()
87 |
88 | def getStrippedQuesId(self, qid):
89 | return qid.split(QID_KEY_SEPARATOR)[1]
90 |
91 | def getImgId(self,qid):
92 | return self.qdic[qid]['iid']
93 |
94 | def getQuesStr(self,qid):
95 | return self.qdic[qid]['qstr']
96 |
97 | def getAnsObj(self,qid):
98 | if self.mode == 'test-dev' or self.mode == 'test':
99 | return -1
100 | return self.adic[qid]
101 |
102 | @staticmethod
103 | def seq_to_list(s):
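        # e.g. seq_to_list("What's the man - doing?") -> ['whats', 'the', 'man', 'doing']:
        # punctuation is stripped, '-' and '/' become spaces, empty tokens are dropped.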
104 | t_str = s.lower()
105 | for i in [r'\?',r'\!',r'\'',r'\"',r'\$',r'\:',r'\@',r'\(',r'\)',r'\,',r'\.',r'\;']:
106 | t_str = re.sub( i, '', t_str)
107 | for i in [r'\-',r'\/']:
108 | t_str = re.sub( i, ' ', t_str)
109 |         q_list = t_str.split(' ')  # '?' and upper case were already handled above
110 | q_list = filter(lambda x: len(x) > 0, q_list)
111 | return q_list
112 |
113 | def extract_answer(self,answer_obj):
114 | """ Return the most popular answer in string."""
115 | if self.mode == 'test-dev' or self.mode == 'test':
116 | return -1
117 | answer_list = [ answer_obj[i]['answer'] for i in xrange(10)]
118 | dic = {}
119 | for ans in answer_list:
120 | if dic.has_key(ans):
121 | dic[ans] +=1
122 | else:
123 | dic[ans] = 1
124 | max_key = max((v,k) for (k,v) in dic.items())[1]
125 | return max_key
126 |
127 | def extract_answer_prob(self,answer_obj):
128 | """ Return the most popular answer in string."""
129 | if self.mode == 'test-dev' or self.mode == 'test':
130 | return -1
131 |
132 | answer_list = [ ans['answer'] for ans in answer_obj]
133 | prob_answer_list = []
134 | for ans in answer_list:
135 | if self.adict.has_key(ans):
136 | prob_answer_list.append(ans)
137 |
138 | if len(prob_answer_list) == 0:
139 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test':
140 |                 return 'hoge'  # dummy answer: none of the annotator answers are in the vocabulary
141 | else:
142 | raise Exception("This should not happen.")
143 | else:
144 | return random.choice(prob_answer_list)
145 |
146 | def qlist_to_vec(self, max_length, q_list):
147 | """
148 | Converts a list of words into a format suitable for the embedding layer.
149 |
150 | Arguments:
151 | max_length -- the maximum length of a question sequence
152 | q_list -- a list of words which are the tokens in the question
153 |
154 | Returns:
155 |     qvec -- a max_length vector of vocabulary indices, one per word, left-padded with zeros
156 |     cvec -- a max_length sequence-continuation indicator (0 at padding and at the first word, 1 afterwards)
157 | """
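        # e.g. with max_length = 5 and q_list = ['what', 'is', 'this']:
        #   qvec = [0, 0, vdict['what'], vdict['is'], vdict['this']]   (left-padded)
        #   cvec = [0, 0, 0, 1, 1]   (0 marks padding and the sequence start)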
158 | qvec = np.zeros(max_length)
159 | cvec = np.zeros(max_length)
160 | for i in xrange(max_length):
161 | if i < max_length - len(q_list):
162 | cvec[i] = 0
163 | else:
164 | w = q_list[i-(max_length-len(q_list))]
165 | # is the word in the vocabulary?
166 |                 if w not in self.vdict:
167 | w = ''
168 | qvec[i] = self.vdict[w]
169 | cvec[i] = 0 if i == max_length - len(q_list) else 1
170 |
171 | return qvec, cvec
172 |
173 | def answer_to_vec(self, ans_str):
174 | """ Return answer id if the answer is included in vocabulary otherwise '' """
175 | if self.mode =='test-dev' or self.mode == 'test':
176 | return -1
177 |
178 | if self.adict.has_key(ans_str):
179 | ans = self.adict[ans_str]
180 | else:
181 | ans = self.adict['']
182 | return ans
183 |
184 | def vec_to_answer(self, ans_symbol):
185 | """ Return answer id if the answer is included in vocabulary otherwise '' """
186 | if self.rev_adict is None:
187 | rev_adict = {}
188 | for k,v in self.adict.items():
189 | rev_adict[v] = k
190 | self.rev_adict = rev_adict
191 |
192 | return self.rev_adict[ans_symbol]
193 |
194 | def create_batch(self,qid_list):
195 |
196 |         qvec = np.zeros((self.batchsize, self.max_length))
197 |         cvec = np.zeros((self.batchsize, self.max_length))
198 |         ivec = np.zeros((self.batchsize, 2048))
199 |         avec = np.zeros(self.batchsize)
200 |
201 | for i,qid in enumerate(qid_list):
202 |
203 | # load raw question information
204 | q_str = self.getQuesStr(qid)
205 | q_ans = self.getAnsObj(qid)
206 | q_iid = self.getImgId(qid)
207 |
208 | # convert question to vec
209 | q_list = VQADataProvider.seq_to_list(q_str)
210 | t_qvec, t_cvec = self.qlist_to_vec(self.max_length, q_list)
211 |
212 | try:
213 | qid_split = qid.split(QID_KEY_SEPARATOR)
214 | data_split = qid_split[0]
215 | if data_split == 'genome':
216 | t_ivec = np.load(config.DATA_PATHS['genome']['features_prefix'] + str(q_iid) + '.jpg.npz')['x']
217 | else:
218 | t_ivec = np.load(config.DATA_PATHS[data_split]['features_prefix'] + str(q_iid).zfill(12) + '.jpg.npz')['x']
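                # L2-normalize the 2048-d image feature vector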
219 | t_ivec = ( t_ivec / np.sqrt((t_ivec**2).sum()) )
220 |             except Exception:
221 |                 t_ivec = 0.
222 |                 print 'feature file not found for image id:', q_iid, self.mode
223 |
224 | # convert answer to vec
225 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test':
226 | q_ans_str = self.extract_answer(q_ans)
227 | else:
228 | q_ans_str = self.extract_answer_prob(q_ans)
229 | t_avec = self.answer_to_vec(q_ans_str)
230 |
231 | qvec[i,...] = t_qvec
232 | cvec[i,...] = t_cvec
233 | ivec[i,...] = t_ivec
234 | avec[i,...] = t_avec
235 |
236 | return qvec, cvec, ivec, avec
237 |
238 |
239 | def get_batch_vec(self):
240 | if self.batch_len is None:
241 | self.n_skipped = 0
242 | qid_list = self.getQuesIds()
243 | random.shuffle(qid_list)
244 | self.qid_list = qid_list
245 | self.batch_len = len(qid_list)
246 | self.batch_index = 0
247 | self.epoch_counter = 0
248 |
249 | def has_at_least_one_valid_answer(t_qid):
250 | answer_obj = self.getAnsObj(t_qid)
251 | answer_list = [ans['answer'] for ans in answer_obj]
252 | for ans in answer_list:
253 | if self.adict.has_key(ans):
254 | return True
255 |
256 | counter = 0
257 | t_qid_list = []
258 | t_iid_list = []
259 | while counter < self.batchsize:
260 | t_qid = self.qid_list[self.batch_index]
261 | t_iid = self.getImgId(t_qid)
262 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test':
263 | t_qid_list.append(t_qid)
264 | t_iid_list.append(t_iid)
265 | counter += 1
266 | elif has_at_least_one_valid_answer(t_qid):
267 | t_qid_list.append(t_qid)
268 | t_iid_list.append(t_iid)
269 | counter += 1
270 | else:
271 | self.n_skipped += 1
272 |
273 | if self.batch_index < self.batch_len-1:
274 | self.batch_index += 1
275 | else:
276 | self.epoch_counter += 1
277 | qid_list = self.getQuesIds()
278 | random.shuffle(qid_list)
279 | self.qid_list = qid_list
280 | self.batch_index = 0
281 | print("%d questions were skipped in a single epoch" % self.n_skipped)
282 | self.n_skipped = 0
283 |
284 | t_batch = self.create_batch(t_qid_list)
285 | return t_batch + (t_qid_list, t_iid_list, self.epoch_counter)
286 |
287 |
288 | class VQADataProviderLayer(caffe.Layer):
289 | """
290 | Provide input data for VQA.
291 | """
292 |
293 | def setup(self, bottom, top):
294 | self.batchsize = json.loads(self.param_str)['batchsize']
295 | self.top_names = ['data','cont','feature','label']
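        # top shapes: data/cont are time-major (T=15, N) for the LSTM, feature is (N, 2048), label is (N,)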
296 | top[0].reshape(15,self.batchsize)
297 | top[1].reshape(15,self.batchsize)
298 | top[2].reshape(self.batchsize,2048)
299 | top[3].reshape(self.batchsize)
300 |
301 | self.mode = json.loads(self.param_str)['mode']
302 | self.folder = json.loads(self.param_str)['folder']
303 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test':
304 | pass
305 | else:
306 | self.dp = VQADataProvider(batchsize=self.batchsize, mode=self.mode, folder=self.folder)
307 |
308 | def reshape(self, bottom, top):
309 | pass
310 |
311 | def forward(self, bottom, top):
312 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test':
313 | pass
314 | else:
315 | word, cont, feature, answer, _, _, _ = self.dp.get_batch_vec()
316 | top[0].data[...] = np.transpose(word,(1,0)) # N x T -> T x N
317 | top[1].data[...] = np.transpose(cont,(1,0))
318 | top[2].data[...] = feature
319 | top[3].data[...] = answer
320 |
321 | def backward(self, top, propagate_down, bottom):
322 | pass
323 |
324 |
--------------------------------------------------------------------------------
/mfb_baseline/vqa_data_layer_kld.py:
--------------------------------------------------------------------------------
1 | import caffe
2 | import numpy as np
3 | import re, json, random
4 | import config
5 |
6 | QID_KEY_SEPARATOR = '/'
7 | GLOVE_EMBEDDING_SIZE = 300
8 |
9 | class VQADataProvider:
10 |
11 |     def __init__(self, folder='result', batchsize=64, max_length=15, mode='train'):
12 | self.batchsize = batchsize
13 | self.d_vocabulary = None
14 | self.batch_index = None
15 | self.batch_len = None
16 | self.rev_adict = None
17 | self.max_length = max_length
18 | self.mode = mode
19 | self.qdic, self.adic = VQADataProvider.load_data(mode)
20 |
21 | with open('./%s/vdict.json'%folder,'r') as f:
22 | self.vdict = json.load(f)
23 | with open('./%s/adict.json'%folder,'r') as f:
24 | self.adict = json.load(f)
25 |
26 |
27 | @staticmethod
28 | def load_vqa_json(data_split):
29 | """
30 | Parses the question and answer json files for the given data split.
31 | Returns the question dictionary and the answer dictionary.
32 | """
33 | qdic, adic = {}, {}
34 |
35 | with open(config.DATA_PATHS[data_split]['ques_file'], 'r') as f:
36 | qdata = json.load(f)['questions']
37 | for q in qdata:
38 | qdic[data_split + QID_KEY_SEPARATOR + str(q['question_id'])] = \
39 | {'qstr': q['question'], 'iid': q['image_id']}
40 |
41 | if 'test' not in data_split:
42 | with open(config.DATA_PATHS[data_split]['ans_file'], 'r') as f:
43 | adata = json.load(f)['annotations']
44 | for a in adata:
45 | adic[data_split + QID_KEY_SEPARATOR + str(a['question_id'])] = \
46 | a['answers']
47 |
48 | print 'parsed', len(qdic), 'questions for', data_split
49 | return qdic, adic
50 |
51 | @staticmethod
52 | def load_genome_json():
53 | """
54 | Parses the genome json file. Returns the question dictionary and the
55 | answer dictionary.
56 | """
57 | qdic, adic = {}, {}
58 |
59 | with open(config.DATA_PATHS['genome']['genome_file'], 'r') as f:
60 | qdata = json.load(f)
61 | for q in qdata:
62 | key = 'genome' + QID_KEY_SEPARATOR + str(q['id'])
63 | qdic[key] = {'qstr': q['question'], 'iid': q['image']}
64 | adic[key] = [{'answer': q['answer']}]
65 |
66 | print 'parsed', len(qdic), 'questions for genome'
67 | return qdic, adic
68 |
69 | @staticmethod
70 | def load_data(data_split_str):
71 | all_qdic, all_adic = {}, {}
72 | for data_split in data_split_str.split('+'):
73 | assert data_split in config.DATA_PATHS.keys(), 'unknown data split'
74 | if data_split == 'genome':
75 | qdic, adic = VQADataProvider.load_genome_json()
76 | all_qdic.update(qdic)
77 | all_adic.update(adic)
78 | else:
79 | qdic, adic = VQADataProvider.load_vqa_json(data_split)
80 | all_qdic.update(qdic)
81 | all_adic.update(adic)
82 | return all_qdic, all_adic
83 |
84 | def getQuesIds(self):
85 | return self.qdic.keys()
86 |
87 | def getStrippedQuesId(self, qid):
88 | return qid.split(QID_KEY_SEPARATOR)[1]
89 |
90 | def getImgId(self,qid):
91 | return self.qdic[qid]['iid']
92 |
93 | def getQuesStr(self,qid):
94 | return self.qdic[qid]['qstr']
95 |
96 | def getAnsObj(self,qid):
97 | if self.mode == 'test-dev' or self.mode == 'test':
98 | return -1
99 | return self.adic[qid]
100 |
101 | @staticmethod
102 | def seq_to_list(s):
103 | t_str = s.lower()
104 | for i in [r'\?',r'\!',r'\'',r'\"',r'\$',r'\:',r'\@',r'\(',r'\)',r'\,',r'\.',r'\;']:
105 | t_str = re.sub( i, '', t_str)
106 | for i in [r'\-',r'\/']:
107 | t_str = re.sub( i, ' ', t_str)
108 |         q_list = t_str.split(' ')
109 | q_list = filter(lambda x: len(x) > 0, q_list)
110 | return q_list
111 |
112 | def extract_answer(self,answer_obj):
113 | """ Return the most popular answer in string."""
114 | if self.mode == 'test-dev' or self.mode == 'test':
115 | return -1
116 | answer_list = [ answer_obj[i]['answer'] for i in xrange(10)]
117 | dic = {}
118 | for ans in answer_list:
119 | if dic.has_key(ans):
120 | dic[ans] +=1
121 | else:
122 | dic[ans] = 1
123 | max_key = max((v,k) for (k,v) in dic.items())[1]
124 | return max_key
125 |
126 |     def extract_answer_prob(self,answer_obj):
127 |         """ Return a random answer among those present in the answer vocabulary."""
128 |         if self.mode == 'test-dev' or self.mode == 'test':
129 |             return -1
130 | 
131 |         answer_list = [ ans['answer'] for ans in answer_obj]
132 |         prob_answer_list = []
133 |         for ans in answer_list:
134 |             if self.adict.has_key(ans):
135 |                 prob_answer_list.append(ans)
136 |         if len(prob_answer_list) == 0:
137 |             if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test':
138 |                 return 'hoge'
139 |             else:
140 |                 raise Exception("This should not happen.")
141 |         else:
142 |             return random.choice(prob_answer_list)
143 | 
144 |     def extract_answer_list(self,answer_obj):
145 |         answer_list = [ ans['answer'] for ans in answer_obj]
146 |         prob_answer_vec = np.zeros(config.NUM_OUTPUT_UNITS)
147 |         for ans in answer_list:
148 |             if self.adict.has_key(ans):
149 |                 index = self.adict[ans]
150 |                 prob_answer_vec[index] += 1
151 |         return prob_answer_vec / np.sum(prob_answer_vec)
152 |
153 | def qlist_to_vec(self, max_length, q_list):
154 | """
155 | Converts a list of words into a format suitable for the embedding layer.
156 |
157 | Arguments:
158 | max_length -- the maximum length of a question sequence
159 | q_list -- a list of words which are the tokens in the question
160 |
161 | Returns:
162 | qvec -- A max_length length vector containing one-hot indices for each word
163 | cvec -- A max_length length sequence continuation indicator vector
164 | """
165 | qvec = np.zeros(max_length)
166 | cvec = np.zeros(max_length)
167 | for i in xrange(max_length):
168 | if i < max_length - len(q_list):
169 | cvec[i] = 0
170 | else:
171 | w = q_list[i-(max_length-len(q_list))]
172 | # is the word in the vocabulary?
173 | if self.vdict.has_key(w) is False:
174 | w = ''
175 | qvec[i] = self.vdict[w]
176 | cvec[i] = 0 if i == max_length - len(q_list) else 1
177 |
178 | return qvec, cvec
179 |
180 | def answer_to_vec(self, ans_str):
181 | """ Return answer id if the answer is included in vocabulary otherwise '' """
182 | if self.mode =='test-dev' or self.mode == 'test':
183 | return -1
184 |
185 | if self.adict.has_key(ans_str):
186 | ans = self.adict[ans_str]
187 | else:
188 | ans = self.adict['']
189 | return ans
190 |
191 | def vec_to_answer(self, ans_symbol):
192 | """ Return answer id if the answer is included in vocabulary otherwise '' """
193 | if self.rev_adict is None:
194 | rev_adict = {}
195 | for k,v in self.adict.items():
196 | rev_adict[v] = k
197 | self.rev_adict = rev_adict
198 |
199 | return self.rev_adict[ans_symbol]
200 |
201 | def create_batch(self,qid_list):
202 |
203 | qvec = (np.zeros(self.batchsize*self.max_length)).reshape(self.batchsize,self.max_length)
204 | cvec = (np.zeros(self.batchsize*self.max_length)).reshape(self.batchsize,self.max_length)
205 | ivec = (np.zeros(self.batchsize*2048)).reshape(self.batchsize,2048)
206 | avec = (np.zeros(self.batchsize*config.NUM_OUTPUT_UNITS)).reshape(self.batchsize,config.NUM_OUTPUT_UNITS)
207 |
208 | for i,qid in enumerate(qid_list):
209 |
210 | # load raw question information
211 | q_str = self.getQuesStr(qid)
212 | q_ans = self.getAnsObj(qid)
213 | q_iid = self.getImgId(qid)
214 |
215 | # convert question to vec
216 | q_list = VQADataProvider.seq_to_list(q_str)
217 | t_qvec, t_cvec = self.qlist_to_vec(self.max_length, q_list)
218 |
219 | try:
220 | qid_split = qid.split(QID_KEY_SEPARATOR)
221 | data_split = qid_split[0]
222 | if data_split == 'genome':
223 | t_ivec = np.load(config.DATA_PATHS['genome']['features_prefix'] + str(q_iid) + '.jpg.npz')['x']
224 | else:
225 | t_ivec = np.load(config.DATA_PATHS[data_split]['features_prefix'] + str(q_iid).zfill(12) + '.jpg.npz')['x']
226 | t_ivec = ( t_ivec / np.sqrt((t_ivec**2).sum()) )
227 |             except Exception:
228 |                 t_ivec = 0.
229 |                 print 'data not found for iid : ', q_iid, self.mode
230 |
231 | # convert answer to vec
232 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test':
233 | q_ans_str = self.extract_answer(q_ans)
234 | t_avec = self.answer_to_vec(q_ans_str)
235 | else:
236 | t_avec = self.extract_answer_list(q_ans)
237 |
238 | qvec[i,...] = t_qvec
239 | cvec[i,...] = t_cvec
240 | ivec[i,...] = t_ivec
241 | avec[i,...] = t_avec
242 |
243 | return qvec, cvec, ivec, avec
244 |
245 |
246 | def get_batch_vec(self):
247 | if self.batch_len is None:
248 | self.n_skipped = 0
249 | qid_list = self.getQuesIds()
250 | random.shuffle(qid_list)
251 | self.qid_list = qid_list
252 | self.batch_len = len(qid_list)
253 | self.batch_index = 0
254 | self.epoch_counter = 0
255 |
256 |         def has_at_least_one_valid_answer(t_qid):
257 |             answer_obj = self.getAnsObj(t_qid)
258 |             answer_list = [ans['answer'] for ans in answer_obj]
259 |             for ans in answer_list:
260 |                 if self.adict.has_key(ans):
261 |                     return True
262 |             return False
263 | counter = 0
264 | t_qid_list = []
265 | t_iid_list = []
266 | while counter < self.batchsize:
267 | t_qid = self.qid_list[self.batch_index]
268 | t_iid = self.getImgId(t_qid)
269 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test':
270 | t_qid_list.append(t_qid)
271 | t_iid_list.append(t_iid)
272 | counter += 1
273 | elif has_at_least_one_valid_answer(t_qid):
274 | t_qid_list.append(t_qid)
275 | t_iid_list.append(t_iid)
276 | counter += 1
277 | else:
278 | self.n_skipped += 1
279 |
280 | if self.batch_index < self.batch_len-1:
281 | self.batch_index += 1
282 | else:
283 | self.epoch_counter += 1
284 | qid_list = self.getQuesIds()
285 | random.shuffle(qid_list)
286 | self.qid_list = qid_list
287 | self.batch_index = 0
288 | print("%d questions were skipped in a single epoch" % self.n_skipped)
289 | self.n_skipped = 0
290 |
291 | t_batch = self.create_batch(t_qid_list)
292 | return t_batch + (t_qid_list, t_iid_list, self.epoch_counter)
293 |
294 |
295 | class VQADataProviderLayer(caffe.Layer):
296 | """
297 | Provide input data for VQA.
298 | """
299 |
300 | def setup(self, bottom, top):
301 | self.batchsize = json.loads(self.param_str)['batchsize']
302 | self.top_names = ['data','cont','feature','label']
303 | top[0].reshape(15,self.batchsize)
304 | top[1].reshape(15,self.batchsize)
305 | top[2].reshape(self.batchsize,2048)
306 | top[3].reshape(self.batchsize,config.NUM_OUTPUT_UNITS)
307 |
308 | self.mode = json.loads(self.param_str)['mode']
309 | self.folder = json.loads(self.param_str)['folder']
310 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test':
311 | pass
312 | else:
313 | self.dp = VQADataProvider(batchsize=self.batchsize, mode=self.mode, folder=self.folder)
314 |
315 | def reshape(self, bottom, top):
316 | pass
317 |
318 | def forward(self, bottom, top):
319 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test':
320 | pass
321 | else:
322 | word, cont, feature, answer, _, _, _ = self.dp.get_batch_vec()
323 | top[0].data[...] = np.transpose(word,(1,0)) # N x T -> T x N
324 | top[1].data[...] = np.transpose(cont,(1,0))
325 | top[2].data[...] = feature
326 | top[3].data[...] = answer
327 |
328 | def backward(self, top, propagate_down, bottom):
329 | pass
330 |
331 |
--------------------------------------------------------------------------------
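The `_kld` variant above differs from `vqa_data_layer.py` mainly in `extract_answer_list`: instead of a single answer id, each training question is labeled with the normalized count distribution of its annotator answers, which the net consumes with a KL-divergence loss. A toy illustration of that construction; the four-entry vocabulary here is hypothetical (the real one is `./result/adict.json`):

```python
# Toy sketch of extract_answer_list: annotator answers are counted against
# the answer vocabulary and normalized into a soft label distribution.
import numpy as np

adict = {'': 0, 'yes': 1, 'no': 2, '2': 3}        # hypothetical vocabulary
answers = ['yes'] * 7 + ['no'] * 2 + ['maybe']    # 'maybe' is out-of-vocabulary

vec = np.zeros(len(adict))
for ans in answers:
    if ans in adict:
        vec[adict[ans]] += 1
vec /= vec.sum()
print vec   # approx. [ 0.  0.778  0.222  0. ] -- out-of-vocabulary votes are dropped
```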
/mfh_baseline/vqa_data_layer_kld.py:
--------------------------------------------------------------------------------
1 | import caffe
2 | import numpy as np
3 | import re, json, random
4 | import config
5 |
6 | QID_KEY_SEPARATOR = '/'
7 | GLOVE_EMBEDDING_SIZE = 300
8 |
9 | class VQADataProvider:
10 |
11 | def __init__(self, folder='./result', batchsize=64, max_length=15, mode='train'):
12 | self.batchsize = batchsize
13 | self.d_vocabulary = None
14 | self.batch_index = None
15 | self.batch_len = None
16 | self.rev_adict = None
17 | self.max_length = max_length
18 | self.mode = mode
19 | self.qdic, self.adic = VQADataProvider.load_data(mode)
20 |
21 | with open('./%s/vdict.json'%folder,'r') as f:
22 | self.vdict = json.load(f)
23 | with open('./%s/adict.json'%folder,'r') as f:
24 | self.adict = json.load(f)
25 |
26 |
27 | @staticmethod
28 | def load_vqa_json(data_split):
29 | """
30 | Parses the question and answer json files for the given data split.
31 | Returns the question dictionary and the answer dictionary.
32 | """
33 | qdic, adic = {}, {}
34 |
35 | with open(config.DATA_PATHS[data_split]['ques_file'], 'r') as f:
36 | qdata = json.load(f)['questions']
37 | for q in qdata:
38 | qdic[data_split + QID_KEY_SEPARATOR + str(q['question_id'])] = \
39 | {'qstr': q['question'], 'iid': q['image_id']}
40 |
41 | if 'test' not in data_split:
42 | with open(config.DATA_PATHS[data_split]['ans_file'], 'r') as f:
43 | adata = json.load(f)['annotations']
44 | for a in adata:
45 | adic[data_split + QID_KEY_SEPARATOR + str(a['question_id'])] = \
46 | a['answers']
47 |
48 | print 'parsed', len(qdic), 'questions for', data_split
49 | return qdic, adic
50 |
51 | @staticmethod
52 | def load_genome_json():
53 | """
54 | Parses the genome json file. Returns the question dictionary and the
55 | answer dictionary.
56 | """
57 | qdic, adic = {}, {}
58 |
59 | with open(config.DATA_PATHS['genome']['genome_file'], 'r') as f:
60 | qdata = json.load(f)
61 | for q in qdata:
62 | key = 'genome' + QID_KEY_SEPARATOR + str(q['id'])
63 | qdic[key] = {'qstr': q['question'], 'iid': q['image']}
64 | adic[key] = [{'answer': q['answer']}]
65 |
66 | print 'parsed', len(qdic), 'questions for genome'
67 | return qdic, adic
68 |
69 | @staticmethod
70 | def load_data(data_split_str):
71 | all_qdic, all_adic = {}, {}
72 | for data_split in data_split_str.split('+'):
73 | assert data_split in config.DATA_PATHS.keys(), 'unknown data split'
74 | if data_split == 'genome':
75 | qdic, adic = VQADataProvider.load_genome_json()
76 | all_qdic.update(qdic)
77 | all_adic.update(adic)
78 | else:
79 | qdic, adic = VQADataProvider.load_vqa_json(data_split)
80 | all_qdic.update(qdic)
81 | all_adic.update(adic)
82 | return all_qdic, all_adic
83 |
84 | def getQuesIds(self):
85 | return self.qdic.keys()
86 |
87 | def getStrippedQuesId(self, qid):
88 | return qid.split(QID_KEY_SEPARATOR)[1]
89 |
90 | def getImgId(self,qid):
91 | return self.qdic[qid]['iid']
92 |
93 | def getQuesStr(self,qid):
94 | return self.qdic[qid]['qstr']
95 |
96 | def getAnsObj(self,qid):
97 | if self.mode == 'test-dev' or self.mode == 'test':
98 | return -1
99 | return self.adic[qid]
100 |
101 | @staticmethod
102 | def seq_to_list(s):
103 | t_str = s.lower()
104 | for i in [r'\?',r'\!',r'\'',r'\"',r'\$',r'\:',r'\@',r'\(',r'\)',r'\,',r'\.',r'\;']:
105 | t_str = re.sub( i, '', t_str)
106 | for i in [r'\-',r'\/']:
107 | t_str = re.sub( i, ' ', t_str)
108 |         q_list = t_str.split(' ')
109 | q_list = filter(lambda x: len(x) > 0, q_list)
110 | return q_list
111 |
112 | def extract_answer(self,answer_obj):
113 | """ Return the most popular answer in string."""
114 | if self.mode == 'test-dev' or self.mode == 'test':
115 | return -1
116 | answer_list = [ answer_obj[i]['answer'] for i in xrange(10)]
117 | dic = {}
118 | for ans in answer_list:
119 | if dic.has_key(ans):
120 | dic[ans] +=1
121 | else:
122 | dic[ans] = 1
123 | max_key = max((v,k) for (k,v) in dic.items())[1]
124 | return max_key
125 |
126 |     def extract_answer_prob(self,answer_obj):
127 |         """ Return a random answer among those present in the answer vocabulary."""
128 |         if self.mode == 'test-dev' or self.mode == 'test':
129 |             return -1
130 | 
131 |         answer_list = [ ans['answer'] for ans in answer_obj]
132 |         prob_answer_list = []
133 |         for ans in answer_list:
134 |             if self.adict.has_key(ans):
135 |                 prob_answer_list.append(ans)
136 |         if len(prob_answer_list) == 0:
137 |             if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test':
138 |                 return 'hoge'
139 |             else:
140 |                 raise Exception("This should not happen.")
141 |         else:
142 |             return random.choice(prob_answer_list)
143 | 
144 |     def extract_answer_list(self,answer_obj):
145 |         answer_list = [ ans['answer'] for ans in answer_obj]
146 |         prob_answer_vec = np.zeros(config.NUM_OUTPUT_UNITS)
147 |         for ans in answer_list:
148 |             if self.adict.has_key(ans):
149 |                 index = self.adict[ans]
150 |                 prob_answer_vec[index] += 1
151 |         return prob_answer_vec / np.sum(prob_answer_vec)
152 |
153 | def qlist_to_vec(self, max_length, q_list):
154 | """
155 | Converts a list of words into a format suitable for the embedding layer.
156 |
157 | Arguments:
158 | max_length -- the maximum length of a question sequence
159 | q_list -- a list of words which are the tokens in the question
160 |
161 | Returns:
162 | qvec -- A max_length length vector containing one-hot indices for each word
163 | cvec -- A max_length length sequence continuation indicator vector
164 | """
165 | qvec = np.zeros(max_length)
166 | cvec = np.zeros(max_length)
167 | for i in xrange(max_length):
168 | if i < max_length - len(q_list):
169 | cvec[i] = 0
170 | else:
171 | w = q_list[i-(max_length-len(q_list))]
172 | # is the word in the vocabulary?
173 | if self.vdict.has_key(w) is False:
174 | w = ''
175 | qvec[i] = self.vdict[w]
176 | cvec[i] = 0 if i == max_length - len(q_list) else 1
177 |
178 | return qvec, cvec
179 |
180 | def answer_to_vec(self, ans_str):
181 | """ Return answer id if the answer is included in vocabulary otherwise '' """
182 | if self.mode =='test-dev' or self.mode == 'test':
183 | return -1
184 |
185 | if self.adict.has_key(ans_str):
186 | ans = self.adict[ans_str]
187 | else:
188 | ans = self.adict['']
189 | return ans
190 |
191 | def vec_to_answer(self, ans_symbol):
192 | """ Return answer id if the answer is included in vocabulary otherwise '' """
193 | if self.rev_adict is None:
194 | rev_adict = {}
195 | for k,v in self.adict.items():
196 | rev_adict[v] = k
197 | self.rev_adict = rev_adict
198 |
199 | return self.rev_adict[ans_symbol]
200 |
201 | def create_batch(self,qid_list):
202 |
203 | qvec = (np.zeros(self.batchsize*self.max_length)).reshape(self.batchsize,self.max_length)
204 | cvec = (np.zeros(self.batchsize*self.max_length)).reshape(self.batchsize,self.max_length)
205 | ivec = (np.zeros(self.batchsize*2048)).reshape(self.batchsize,2048)
206 | avec = (np.zeros(self.batchsize*config.NUM_OUTPUT_UNITS)).reshape(self.batchsize,config.NUM_OUTPUT_UNITS)
207 |
208 | for i,qid in enumerate(qid_list):
209 |
210 | # load raw question information
211 | q_str = self.getQuesStr(qid)
212 | q_ans = self.getAnsObj(qid)
213 | q_iid = self.getImgId(qid)
214 |
215 | # convert question to vec
216 | q_list = VQADataProvider.seq_to_list(q_str)
217 | t_qvec, t_cvec = self.qlist_to_vec(self.max_length, q_list)
218 |
219 | try:
220 | qid_split = qid.split(QID_KEY_SEPARATOR)
221 | data_split = qid_split[0]
222 | if data_split == 'genome':
223 | t_ivec = np.load(config.DATA_PATHS['genome']['features_prefix'] + str(q_iid) + '.jpg.npz')['x']
224 | else:
225 | t_ivec = np.load(config.DATA_PATHS[data_split]['features_prefix'] + str(q_iid).zfill(12) + '.jpg.npz')['x']
226 | t_ivec = ( t_ivec / np.sqrt((t_ivec**2).sum()) )
227 |             except Exception:
228 |                 t_ivec = 0.
229 |                 print 'data not found for iid : ', q_iid, self.mode
230 |
231 | # convert answer to vec
232 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test':
233 | q_ans_str = self.extract_answer(q_ans)
234 | t_avec = self.answer_to_vec(q_ans_str)
235 | else:
236 | t_avec = self.extract_answer_list(q_ans)
237 |
238 | qvec[i,...] = t_qvec
239 | cvec[i,...] = t_cvec
240 | ivec[i,...] = t_ivec
241 | avec[i,...] = t_avec
242 |
243 | return qvec, cvec, ivec, avec
244 |
245 |
246 | def get_batch_vec(self):
247 | if self.batch_len is None:
248 | self.n_skipped = 0
249 | qid_list = self.getQuesIds()
250 | random.shuffle(qid_list)
251 | self.qid_list = qid_list
252 | self.batch_len = len(qid_list)
253 | self.batch_index = 0
254 | self.epoch_counter = 0
255 |
256 |         def has_at_least_one_valid_answer(t_qid):
257 |             answer_obj = self.getAnsObj(t_qid)
258 |             answer_list = [ans['answer'] for ans in answer_obj]
259 |             for ans in answer_list:
260 |                 if self.adict.has_key(ans):
261 |                     return True
262 |             return False
263 | counter = 0
264 | t_qid_list = []
265 | t_iid_list = []
266 | while counter < self.batchsize:
267 | t_qid = self.qid_list[self.batch_index]
268 | t_iid = self.getImgId(t_qid)
269 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test':
270 | t_qid_list.append(t_qid)
271 | t_iid_list.append(t_iid)
272 | counter += 1
273 | elif has_at_least_one_valid_answer(t_qid):
274 | t_qid_list.append(t_qid)
275 | t_iid_list.append(t_iid)
276 | counter += 1
277 | else:
278 | self.n_skipped += 1
279 |
280 | if self.batch_index < self.batch_len-1:
281 | self.batch_index += 1
282 | else:
283 | self.epoch_counter += 1
284 | qid_list = self.getQuesIds()
285 | random.shuffle(qid_list)
286 | self.qid_list = qid_list
287 | self.batch_index = 0
288 | print("%d questions were skipped in a single epoch" % self.n_skipped)
289 | self.n_skipped = 0
290 |
291 | t_batch = self.create_batch(t_qid_list)
292 | return t_batch + (t_qid_list, t_iid_list, self.epoch_counter)
293 |
294 |
295 | class VQADataProviderLayer(caffe.Layer):
296 | """
297 | Provide input data for VQA.
298 | """
299 |
300 | def setup(self, bottom, top):
301 | self.batchsize = json.loads(self.param_str)['batchsize']
302 | self.top_names = ['data','cont','feature','label']
303 | top[0].reshape(15,self.batchsize)
304 | top[1].reshape(15,self.batchsize)
305 | top[2].reshape(self.batchsize,2048)
306 | top[3].reshape(self.batchsize,config.NUM_OUTPUT_UNITS)
307 |
308 | self.mode = json.loads(self.param_str)['mode']
309 | self.folder = json.loads(self.param_str)['folder']
310 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test':
311 | pass
312 | else:
313 | self.dp = VQADataProvider(batchsize=self.batchsize, mode=self.mode, folder=self.folder)
314 |
315 | def reshape(self, bottom, top):
316 | pass
317 |
318 | def forward(self, bottom, top):
319 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test':
320 | pass
321 | else:
322 | word, cont, feature, answer, _, _, _ = self.dp.get_batch_vec()
323 | top[0].data[...] = np.transpose(word,(1,0)) # N x T -> T x N
324 | top[1].data[...] = np.transpose(cont,(1,0))
325 | top[2].data[...] = feature
326 | top[3].data[...] = answer
327 |
328 | def backward(self, top, propagate_down, bottom):
329 | pass
330 |
331 |
--------------------------------------------------------------------------------
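One detail of `qlist_to_vec` that is easy to miss: questions are padded on the left, so every sequence ends at the final LSTM timestep, and `cvec` is 0 through the padding and at the first real token (sequence start), then 1 for every continuation. A worked sketch with a hypothetical vocabulary:

```python
# Left-padding as performed by qlist_to_vec, shown standalone.
import numpy as np

vdict = {'': 0, 'what': 1, 'color': 2, 'is': 3, 'it': 4}  # hypothetical ids
q_list, max_length = ['what', 'color', 'is', 'it'], 8

qvec, cvec = np.zeros(max_length), np.zeros(max_length)
offset = max_length - len(q_list)
for i in range(offset, max_length):
    qvec[i] = vdict.get(q_list[i - offset], vdict[''])   # unknown words map to ''
    cvec[i] = 0 if i == offset else 1
print qvec   # [ 0.  0.  0.  0.  1.  2.  3.  4.]
print cvec   # [ 0.  0.  0.  0.  0.  1.  1.  1.]
```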
/mfb_coatt_glove/vqa_data_layer.py:
--------------------------------------------------------------------------------
1 | import caffe
2 | import numpy as np
3 | import re, json, random
4 | import config
5 | import spacy
6 |
7 | QID_KEY_SEPARATOR = '/'
8 | GLOVE_EMBEDDING_SIZE = 300
9 |
10 | class VQADataProvider:
11 |
12 | def __init__(self, folder='result', batchsize=64, max_length=15, mode='train'):
13 | self.batchsize = batchsize
14 | self.d_vocabulary = None
15 | self.batch_index = None
16 | self.batch_len = None
17 | self.rev_adict = None
18 | self.max_length = max_length
19 | self.mode = mode
20 | self.qdic, self.adic = VQADataProvider.load_data(mode)
21 |
22 | with open('./%s/vdict.json'%folder,'r') as f:
23 | self.vdict = json.load(f)
24 | with open('./%s/adict.json'%folder,'r') as f:
25 | self.adict = json.load(f)
26 |
27 | self.n_ans_vocabulary = len(self.adict)
28 | self.nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')
29 | self.glove_dict = {} # word -> glove vector
30 |
31 | @staticmethod
32 | def load_vqa_json(data_split):
33 | """
34 | Parses the question and answer json files for the given data split.
35 | Returns the question dictionary and the answer dictionary.
36 | """
37 | qdic, adic = {}, {}
38 |
39 | with open(config.DATA_PATHS[data_split]['ques_file'], 'r') as f:
40 | qdata = json.load(f)['questions']
41 | for q in qdata:
42 | qdic[data_split + QID_KEY_SEPARATOR + str(q['question_id'])] = \
43 | {'qstr': q['question'], 'iid': q['image_id']}
44 |
45 | if 'test' not in data_split:
46 | with open(config.DATA_PATHS[data_split]['ans_file'], 'r') as f:
47 | adata = json.load(f)['annotations']
48 | for a in adata:
49 | adic[data_split + QID_KEY_SEPARATOR + str(a['question_id'])] = \
50 | a['answers']
51 |
52 | print 'parsed', len(qdic), 'questions for', data_split
53 | return qdic, adic
54 |
55 | @staticmethod
56 | def load_genome_json():
57 | """
58 | Parses the genome json file. Returns the question dictionary and the
59 | answer dictionary.
60 | """
61 | qdic, adic = {}, {}
62 |
63 | with open(config.DATA_PATHS['genome']['genome_file'], 'r') as f:
64 | qdata = json.load(f)
65 | for q in qdata:
66 | key = 'genome' + QID_KEY_SEPARATOR + str(q['id'])
67 | qdic[key] = {'qstr': q['question'], 'iid': q['image']}
68 | adic[key] = [{'answer': q['answer']}]
69 |
70 | print 'parsed', len(qdic), 'questions for genome'
71 | return qdic, adic
72 |
73 | @staticmethod
74 | def load_data(data_split_str):
75 | all_qdic, all_adic = {}, {}
76 | for data_split in data_split_str.split('+'):
77 | assert data_split in config.DATA_PATHS.keys(), 'unknown data split'
78 | if data_split == 'genome':
79 | qdic, adic = VQADataProvider.load_genome_json()
80 | all_qdic.update(qdic)
81 | all_adic.update(adic)
82 | else:
83 | qdic, adic = VQADataProvider.load_vqa_json(data_split)
84 | all_qdic.update(qdic)
85 | all_adic.update(adic)
86 | return all_qdic, all_adic
87 |
88 | def getQuesIds(self):
89 | return self.qdic.keys()
90 |
91 | def getStrippedQuesId(self, qid):
92 | return qid.split(QID_KEY_SEPARATOR)[1]
93 |
94 | def getImgId(self,qid):
95 | return self.qdic[qid]['iid']
96 |
97 | def getQuesStr(self,qid):
98 | return self.qdic[qid]['qstr']
99 |
100 | def getAnsObj(self,qid):
101 | if self.mode == 'test-dev' or self.mode == 'test':
102 | return -1
103 | return self.adic[qid]
104 |
105 | @staticmethod
106 | def seq_to_list(s):
107 | t_str = s.lower()
108 | for i in [r'\?',r'\!',r'\'',r'\"',r'\$',r'\:',r'\@',r'\(',r'\)',r'\,',r'\.',r'\;']:
109 | t_str = re.sub( i, '', t_str)
110 | for i in [r'\-',r'\/']:
111 | t_str = re.sub( i, ' ', t_str)
112 |         q_list = t_str.split(' ')
113 | q_list = filter(lambda x: len(x) > 0, q_list)
114 | return q_list
115 |
116 | def extract_answer(self,answer_obj):
117 | """ Return the most popular answer in string."""
118 | if self.mode == 'test-dev' or self.mode == 'test':
119 | return -1
120 | answer_list = [ answer_obj[i]['answer'] for i in xrange(10)]
121 | dic = {}
122 | for ans in answer_list:
123 | if dic.has_key(ans):
124 | dic[ans] +=1
125 | else:
126 | dic[ans] = 1
127 | max_key = max((v,k) for (k,v) in dic.items())[1]
128 | return max_key
129 |
130 | def extract_answer_prob(self,answer_obj):
131 | """ Return the most popular answer in string."""
132 | if self.mode == 'test-dev' or self.mode == 'test':
133 | return -1
134 |
135 | answer_list = [ ans['answer'] for ans in answer_obj]
136 | prob_answer_list = []
137 | for ans in answer_list:
138 | if self.adict.has_key(ans):
139 | prob_answer_list.append(ans)
140 |
141 | if len(prob_answer_list) == 0:
142 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test':
143 | return 'hoge'
144 | else:
145 | raise Exception("This should not happen.")
146 | else:
147 | return random.choice(prob_answer_list)
148 |
149 | def qlist_to_vec(self, max_length, q_list):
150 | """
151 | Converts a list of words into a format suitable for the embedding layer.
152 |
153 | Arguments:
154 | max_length -- the maximum length of a question sequence
155 | q_list -- a list of words which are the tokens in the question
156 |
157 | Returns:
158 | qvec -- A max_length length vector containing one-hot indices for each word
159 | cvec -- A max_length length sequence continuation indicator vector
160 | glove_matrix -- A max_length x GLOVE_EMBEDDING_SIZE matrix containing the glove embedding for
161 | each word
162 | """
163 | qvec = np.zeros(max_length)
164 | cvec = np.zeros(max_length)
165 | glove_matrix = np.zeros(max_length * GLOVE_EMBEDDING_SIZE).reshape(max_length, GLOVE_EMBEDDING_SIZE)
166 | for i in xrange(max_length):
167 | if i < max_length - len(q_list):
168 | cvec[i] = 0
169 | else:
170 | w = q_list[i-(max_length-len(q_list))]
171 | if w not in self.glove_dict:
172 | self.glove_dict[w] = self.nlp(u'%s' % w).vector
173 | glove_matrix[i] = self.glove_dict[w]
174 | # is the word in the vocabulary?
175 | if self.vdict.has_key(w) is False:
176 | w = ''
177 | qvec[i] = self.vdict[w]
178 | cvec[i] = 0 if i == max_length - len(q_list) else 1
179 |
180 | return qvec, cvec, glove_matrix
181 |
182 | def answer_to_vec(self, ans_str):
183 | """ Return answer id if the answer is included in vocabulary otherwise '' """
184 | if self.mode =='test-dev' or self.mode == 'test':
185 | return -1
186 |
187 | if self.adict.has_key(ans_str):
188 | ans = self.adict[ans_str]
189 | else:
190 | ans = self.adict['']
191 | return ans
192 |
193 | def vec_to_answer(self, ans_symbol):
194 | """ Return answer id if the answer is included in vocabulary otherwise '' """
195 | if self.rev_adict is None:
196 | rev_adict = {}
197 | for k,v in self.adict.items():
198 | rev_adict[v] = k
199 | self.rev_adict = rev_adict
200 |
201 | return self.rev_adict[ans_symbol]
202 |
203 | def create_batch(self,qid_list):
204 |
205 | qvec = (np.zeros(self.batchsize*self.max_length)).reshape(self.batchsize,self.max_length)
206 | cvec = (np.zeros(self.batchsize*self.max_length)).reshape(self.batchsize,self.max_length)
207 | ivec = (np.zeros(self.batchsize*2048*14*14)).reshape(self.batchsize,2048,14,14)
208 | avec = (np.zeros(self.batchsize)).reshape(self.batchsize)
209 | glove_matrix = np.zeros(self.batchsize * self.max_length * GLOVE_EMBEDDING_SIZE).reshape(\
210 | self.batchsize, self.max_length, GLOVE_EMBEDDING_SIZE)
211 |
212 | for i,qid in enumerate(qid_list):
213 |
214 | # load raw question information
215 | q_str = self.getQuesStr(qid)
216 | q_ans = self.getAnsObj(qid)
217 | q_iid = self.getImgId(qid)
218 |
219 | # convert question to vec
220 | q_list = VQADataProvider.seq_to_list(q_str)
221 | t_qvec, t_cvec, t_glove_matrix = self.qlist_to_vec(self.max_length, q_list)
222 |
223 | try:
224 | qid_split = qid.split(QID_KEY_SEPARATOR)
225 | data_split = qid_split[0]
226 | if data_split == 'genome':
227 | t_ivec = np.load(config.DATA_PATHS['genome']['features_prefix'] + str(q_iid) + '.jpg.npz')['x']
228 | else:
229 | t_ivec = np.load(config.DATA_PATHS[data_split]['features_prefix'] + str(q_iid).zfill(12) + '.jpg.npz')['x']
230 | t_ivec = ( t_ivec / np.sqrt((t_ivec**2).sum()) )
231 |             except Exception:
232 |                 t_ivec = 0.
233 |                 print 'data not found for iid : ', q_iid, self.mode
234 |
235 | # convert answer to vec
236 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test':
237 | q_ans_str = self.extract_answer(q_ans)
238 | else:
239 | q_ans_str = self.extract_answer_prob(q_ans)
240 | t_avec = self.answer_to_vec(q_ans_str)
241 | qvec[i,...] = t_qvec
242 | cvec[i,...] = t_cvec
243 | ivec[i,...] = t_ivec
244 | avec[i,...] = t_avec
245 | glove_matrix[i,...] = t_glove_matrix
246 |
247 | return qvec, cvec, ivec, avec, glove_matrix
248 |
249 |
250 | def get_batch_vec(self):
251 | if self.batch_len is None:
252 | self.n_skipped = 0
253 | qid_list = self.getQuesIds()
254 | random.shuffle(qid_list)
255 | self.qid_list = qid_list
256 | self.batch_len = len(qid_list)
257 | self.batch_index = 0
258 | self.epoch_counter = 0
259 |
260 |         def has_at_least_one_valid_answer(t_qid):
261 |             answer_obj = self.getAnsObj(t_qid)
262 |             answer_list = [ans['answer'] for ans in answer_obj]
263 |             for ans in answer_list:
264 |                 if self.adict.has_key(ans):
265 |                     return True
266 |             return False
267 | counter = 0
268 | t_qid_list = []
269 | t_iid_list = []
270 | while counter < self.batchsize:
271 | t_qid = self.qid_list[self.batch_index]
272 | t_iid = self.getImgId(t_qid)
273 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test':
274 | t_qid_list.append(t_qid)
275 | t_iid_list.append(t_iid)
276 | counter += 1
277 | elif has_at_least_one_valid_answer(t_qid):
278 | t_qid_list.append(t_qid)
279 | t_iid_list.append(t_iid)
280 | counter += 1
281 | else:
282 | self.n_skipped += 1
283 |
284 | if self.batch_index < self.batch_len-1:
285 | self.batch_index += 1
286 | else:
287 | self.epoch_counter += 1
288 | qid_list = self.getQuesIds()
289 | random.shuffle(qid_list)
290 | self.qid_list = qid_list
291 | self.batch_index = 0
292 | print("%d questions were skipped in a single epoch" % self.n_skipped)
293 | self.n_skipped = 0
294 |
295 | t_batch = self.create_batch(t_qid_list)
296 | return t_batch + (t_qid_list, t_iid_list, self.epoch_counter)
297 |
298 |
299 | class VQADataProviderLayer(caffe.Layer):
300 | """
301 | Provide input data for VQA.
302 | """
303 |
304 | def setup(self, bottom, top):
305 | self.batchsize = json.loads(self.param_str)['batchsize']
306 | self.top_names = ['data','cont','feature','label','glove']
307 | top[0].reshape(15,self.batchsize)
308 | top[1].reshape(15,self.batchsize)
309 | top[2].reshape(self.batchsize,2048,14,14)
310 | top[3].reshape(self.batchsize)
311 | top[4].reshape(15,self.batchsize,GLOVE_EMBEDDING_SIZE)
312 |
313 | self.mode = json.loads(self.param_str)['mode']
314 | self.folder = json.loads(self.param_str)['folder']
315 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test':
316 | pass
317 | else:
318 | self.dp = VQADataProvider(batchsize=self.batchsize, mode=self.mode, folder=self.folder)
319 |
320 | def reshape(self, bottom, top):
321 | pass
322 |
323 | def forward(self, bottom, top):
324 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test':
325 | pass
326 | else:
327 | word, cont, feature, answer, glove_matrix, _, _, _ = self.dp.get_batch_vec()
328 | top[0].data[...] = np.transpose(word,(1,0)) # N x T -> T x N
329 | top[1].data[...] = np.transpose(cont,(1,0))
330 | top[2].data[...] = feature
331 | top[3].data[...] = answer
332 | top[4].data[...] = np.transpose(glove_matrix, (1,0,2)) # N x T x 300 -> T x N x 300
333 |
334 | def backward(self, top, propagate_down, bottom):
335 | pass
336 |
337 |
--------------------------------------------------------------------------------
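The co-attention variants additionally attach a 300-d GloVe vector to every token. The lookup is memoized in `glove_dict`, so spaCy is consulted only once per distinct word. A sketch of that lookup in isolation; note that `spacy.load('en', vectors='en_glove_cc_300_1m_vectors')` is the spaCy 1.x API this repo targets, so the snippet assumes that version and vector pack are installed:

```python
# Memoized GloVe lookup as used in qlist_to_vec (spaCy 1.x API).
import spacy

nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')
glove_dict = {}   # word -> 300-d vector, filled lazily

def glove(word):
    if word not in glove_dict:
        glove_dict[word] = nlp(u'%s' % word).vector
    return glove_dict[word]

print glove('color').shape   # (300,) == GLOVE_EMBEDDING_SIZE
```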
/mfh_coatt_glove/vqa_data_layer.py:
--------------------------------------------------------------------------------
1 | import caffe
2 | import numpy as np
3 | import re, json, random
4 | import config
5 | import spacy
6 |
7 | QID_KEY_SEPARATOR = '/'
8 | GLOVE_EMBEDDING_SIZE = 300
9 |
10 | class VQADataProvider:
11 |
12 | def __init__(self, folder='result', batchsize=64, max_length=15, mode='train'):
13 | self.batchsize = batchsize
14 | self.d_vocabulary = None
15 | self.batch_index = None
16 | self.batch_len = None
17 | self.rev_adict = None
18 | self.max_length = max_length
19 | self.mode = mode
20 | self.qdic, self.adic = VQADataProvider.load_data(mode)
21 |
22 | with open('./%s/vdict.json'%folder,'r') as f:
23 | self.vdict = json.load(f)
24 | with open('./%s/adict.json'%folder,'r') as f:
25 | self.adict = json.load(f)
26 |
27 | self.n_ans_vocabulary = len(self.adict)
28 | self.nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')
29 | self.glove_dict = {} # word -> glove vector
30 |
31 | @staticmethod
32 | def load_vqa_json(data_split):
33 | """
34 | Parses the question and answer json files for the given data split.
35 | Returns the question dictionary and the answer dictionary.
36 | """
37 | qdic, adic = {}, {}
38 |
39 | with open(config.DATA_PATHS[data_split]['ques_file'], 'r') as f:
40 | qdata = json.load(f)['questions']
41 | for q in qdata:
42 | qdic[data_split + QID_KEY_SEPARATOR + str(q['question_id'])] = \
43 | {'qstr': q['question'], 'iid': q['image_id']}
44 |
45 | if 'test' not in data_split:
46 | with open(config.DATA_PATHS[data_split]['ans_file'], 'r') as f:
47 | adata = json.load(f)['annotations']
48 | for a in adata:
49 | adic[data_split + QID_KEY_SEPARATOR + str(a['question_id'])] = \
50 | a['answers']
51 |
52 | print 'parsed', len(qdic), 'questions for', data_split
53 | return qdic, adic
54 |
55 | @staticmethod
56 | def load_genome_json():
57 | """
58 | Parses the genome json file. Returns the question dictionary and the
59 | answer dictionary.
60 | """
61 | qdic, adic = {}, {}
62 |
63 | with open(config.DATA_PATHS['genome']['genome_file'], 'r') as f:
64 | qdata = json.load(f)
65 | for q in qdata:
66 | key = 'genome' + QID_KEY_SEPARATOR + str(q['id'])
67 | qdic[key] = {'qstr': q['question'], 'iid': q['image']}
68 | adic[key] = [{'answer': q['answer']}]
69 |
70 | print 'parsed', len(qdic), 'questions for genome'
71 | return qdic, adic
72 |
73 | @staticmethod
74 | def load_data(data_split_str):
75 | all_qdic, all_adic = {}, {}
76 | for data_split in data_split_str.split('+'):
77 | assert data_split in config.DATA_PATHS.keys(), 'unknown data split'
78 | if data_split == 'genome':
79 | qdic, adic = VQADataProvider.load_genome_json()
80 | all_qdic.update(qdic)
81 | all_adic.update(adic)
82 | else:
83 | qdic, adic = VQADataProvider.load_vqa_json(data_split)
84 | all_qdic.update(qdic)
85 | all_adic.update(adic)
86 | return all_qdic, all_adic
87 |
88 | def getQuesIds(self):
89 | return self.qdic.keys()
90 |
91 | def getStrippedQuesId(self, qid):
92 | return qid.split(QID_KEY_SEPARATOR)[1]
93 |
94 | def getImgId(self,qid):
95 | return self.qdic[qid]['iid']
96 |
97 | def getQuesStr(self,qid):
98 | return self.qdic[qid]['qstr']
99 |
100 | def getAnsObj(self,qid):
101 | if self.mode == 'test-dev' or self.mode == 'test':
102 | return -1
103 | return self.adic[qid]
104 |
105 | @staticmethod
106 | def seq_to_list(s):
107 | t_str = s.lower()
108 | for i in [r'\?',r'\!',r'\'',r'\"',r'\$',r'\:',r'\@',r'\(',r'\)',r'\,',r'\.',r'\;']:
109 | t_str = re.sub( i, '', t_str)
110 | for i in [r'\-',r'\/']:
111 | t_str = re.sub( i, ' ', t_str)
112 |         q_list = t_str.split(' ')
113 | q_list = filter(lambda x: len(x) > 0, q_list)
114 | return q_list
115 |
116 | def extract_answer(self,answer_obj):
117 | """ Return the most popular answer in string."""
118 | if self.mode == 'test-dev' or self.mode == 'test':
119 | return -1
120 | answer_list = [ answer_obj[i]['answer'] for i in xrange(10)]
121 | dic = {}
122 | for ans in answer_list:
123 | if dic.has_key(ans):
124 | dic[ans] +=1
125 | else:
126 | dic[ans] = 1
127 | max_key = max((v,k) for (k,v) in dic.items())[1]
128 | return max_key
129 |
130 | def extract_answer_prob(self,answer_obj):
131 | """ Return the most popular answer in string."""
132 | if self.mode == 'test-dev' or self.mode == 'test':
133 | return -1
134 |
135 | answer_list = [ ans['answer'] for ans in answer_obj]
136 | prob_answer_list = []
137 | for ans in answer_list:
138 | if self.adict.has_key(ans):
139 | prob_answer_list.append(ans)
140 |
141 | if len(prob_answer_list) == 0:
142 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test':
143 | return 'hoge'
144 | else:
145 | raise Exception("This should not happen.")
146 | else:
147 | return random.choice(prob_answer_list)
148 |
149 | def qlist_to_vec(self, max_length, q_list):
150 | """
151 | Converts a list of words into a format suitable for the embedding layer.
152 |
153 | Arguments:
154 | max_length -- the maximum length of a question sequence
155 | q_list -- a list of words which are the tokens in the question
156 |
157 | Returns:
158 | qvec -- A max_length length vector containing one-hot indices for each word
159 | cvec -- A max_length length sequence continuation indicator vector
160 | glove_matrix -- A max_length x GLOVE_EMBEDDING_SIZE matrix containing the glove embedding for
161 | each word
162 | """
163 | qvec = np.zeros(max_length)
164 | cvec = np.zeros(max_length)
165 | glove_matrix = np.zeros(max_length * GLOVE_EMBEDDING_SIZE).reshape(max_length, GLOVE_EMBEDDING_SIZE)
166 | for i in xrange(max_length):
167 | if i < max_length - len(q_list):
168 | cvec[i] = 0
169 | else:
170 | w = q_list[i-(max_length-len(q_list))]
171 | if w not in self.glove_dict:
172 | self.glove_dict[w] = self.nlp(u'%s' % w).vector
173 | glove_matrix[i] = self.glove_dict[w]
174 | # is the word in the vocabulary?
175 | if self.vdict.has_key(w) is False:
176 | w = ''
177 | qvec[i] = self.vdict[w]
178 | cvec[i] = 0 if i == max_length - len(q_list) else 1
179 |
180 | return qvec, cvec, glove_matrix
181 |
182 | def answer_to_vec(self, ans_str):
183 | """ Return answer id if the answer is included in vocabulary otherwise '' """
184 | if self.mode =='test-dev' or self.mode == 'test':
185 | return -1
186 |
187 | if self.adict.has_key(ans_str):
188 | ans = self.adict[ans_str]
189 | else:
190 | ans = self.adict['']
191 | return ans
192 |
193 | def vec_to_answer(self, ans_symbol):
194 | """ Return answer id if the answer is included in vocabulary otherwise '' """
195 | if self.rev_adict is None:
196 | rev_adict = {}
197 | for k,v in self.adict.items():
198 | rev_adict[v] = k
199 | self.rev_adict = rev_adict
200 |
201 | return self.rev_adict[ans_symbol]
202 |
203 | def create_batch(self,qid_list):
204 |
205 | qvec = (np.zeros(self.batchsize*self.max_length)).reshape(self.batchsize,self.max_length)
206 | cvec = (np.zeros(self.batchsize*self.max_length)).reshape(self.batchsize,self.max_length)
207 | ivec = (np.zeros(self.batchsize*2048*14*14)).reshape(self.batchsize,2048,14,14)
208 | avec = (np.zeros(self.batchsize)).reshape(self.batchsize)
209 | glove_matrix = np.zeros(self.batchsize * self.max_length * GLOVE_EMBEDDING_SIZE).reshape(\
210 | self.batchsize, self.max_length, GLOVE_EMBEDDING_SIZE)
211 |
212 | for i,qid in enumerate(qid_list):
213 |
214 | # load raw question information
215 | q_str = self.getQuesStr(qid)
216 | q_ans = self.getAnsObj(qid)
217 | q_iid = self.getImgId(qid)
218 |
219 | # convert question to vec
220 | q_list = VQADataProvider.seq_to_list(q_str)
221 | t_qvec, t_cvec, t_glove_matrix = self.qlist_to_vec(self.max_length, q_list)
222 |
223 | try:
224 | qid_split = qid.split(QID_KEY_SEPARATOR)
225 | data_split = qid_split[0]
226 | if data_split == 'genome':
227 | t_ivec = np.load(config.DATA_PATHS['genome']['features_prefix'] + str(q_iid) + '.jpg.npz')['x']
228 | else:
229 | t_ivec = np.load(config.DATA_PATHS[data_split]['features_prefix'] + str(q_iid).zfill(12) + '.jpg.npz')['x']
230 | t_ivec = ( t_ivec / np.sqrt((t_ivec**2).sum()) )
231 |             except Exception:
232 |                 t_ivec = 0.
233 |                 print 'data not found for iid : ', q_iid, self.mode
234 |
235 | # convert answer to vec
236 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test':
237 | q_ans_str = self.extract_answer(q_ans)
238 | else:
239 | q_ans_str = self.extract_answer_prob(q_ans)
240 | t_avec = self.answer_to_vec(q_ans_str)
241 | qvec[i,...] = t_qvec
242 | cvec[i,...] = t_cvec
243 | ivec[i,...] = t_ivec
244 | avec[i,...] = t_avec
245 | glove_matrix[i,...] = t_glove_matrix
246 |
247 | return qvec, cvec, ivec, avec, glove_matrix
248 |
249 |
250 | def get_batch_vec(self):
251 | if self.batch_len is None:
252 | self.n_skipped = 0
253 | qid_list = self.getQuesIds()
254 | random.shuffle(qid_list)
255 | self.qid_list = qid_list
256 | self.batch_len = len(qid_list)
257 | self.batch_index = 0
258 | self.epoch_counter = 0
259 |
260 |         def has_at_least_one_valid_answer(t_qid):
261 |             answer_obj = self.getAnsObj(t_qid)
262 |             answer_list = [ans['answer'] for ans in answer_obj]
263 |             for ans in answer_list:
264 |                 if self.adict.has_key(ans):
265 |                     return True
266 |             return False
267 | counter = 0
268 | t_qid_list = []
269 | t_iid_list = []
270 | while counter < self.batchsize:
271 | t_qid = self.qid_list[self.batch_index]
272 | t_iid = self.getImgId(t_qid)
273 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test':
274 | t_qid_list.append(t_qid)
275 | t_iid_list.append(t_iid)
276 | counter += 1
277 | elif has_at_least_one_valid_answer(t_qid):
278 | t_qid_list.append(t_qid)
279 | t_iid_list.append(t_iid)
280 | counter += 1
281 | else:
282 | self.n_skipped += 1
283 |
284 | if self.batch_index < self.batch_len-1:
285 | self.batch_index += 1
286 | else:
287 | self.epoch_counter += 1
288 | qid_list = self.getQuesIds()
289 | random.shuffle(qid_list)
290 | self.qid_list = qid_list
291 | self.batch_index = 0
292 | print("%d questions were skipped in a single epoch" % self.n_skipped)
293 | self.n_skipped = 0
294 |
295 | t_batch = self.create_batch(t_qid_list)
296 | return t_batch + (t_qid_list, t_iid_list, self.epoch_counter)
297 |
298 |
299 | class VQADataProviderLayer(caffe.Layer):
300 | """
301 | Provide input data for VQA.
302 | """
303 |
304 | def setup(self, bottom, top):
305 | self.batchsize = json.loads(self.param_str)['batchsize']
306 | self.top_names = ['data','cont','feature','label','glove']
307 | top[0].reshape(15,self.batchsize)
308 | top[1].reshape(15,self.batchsize)
309 | top[2].reshape(self.batchsize,2048,14,14)
310 | top[3].reshape(self.batchsize)
311 | top[4].reshape(15,self.batchsize,GLOVE_EMBEDDING_SIZE)
312 |
313 | self.mode = json.loads(self.param_str)['mode']
314 | self.folder = json.loads(self.param_str)['folder']
315 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test':
316 | pass
317 | else:
318 | self.dp = VQADataProvider(batchsize=self.batchsize, mode=self.mode, folder=self.folder)
319 |
320 | def reshape(self, bottom, top):
321 | pass
322 |
323 | def forward(self, bottom, top):
324 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test':
325 | pass
326 | else:
327 | word, cont, feature, answer, glove_matrix, _, _, _ = self.dp.get_batch_vec()
328 | top[0].data[...] = np.transpose(word,(1,0)) # N x T -> T x N
329 | top[1].data[...] = np.transpose(cont,(1,0))
330 | top[2].data[...] = feature
331 | top[3].data[...] = answer
332 | top[4].data[...] = np.transpose(glove_matrix, (1,0,2)) # N x T x 300 -> T x N x 300
333 |
334 | def backward(self, top, propagate_down, bottom):
335 | pass
336 |
337 |
--------------------------------------------------------------------------------
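As the comments in `forward` indicate, Caffe's recurrent layers expect time-major blobs, so the provider's batch-major arrays are transposed before being copied into the tops. A tiny numpy sketch of the two transposes used above, with illustrative sizes:

```python
# Batch-major -> time-major, as in VQADataProviderLayer.forward.
import numpy as np

N, T, G = 4, 15, 300                     # batch, timesteps, GLOVE_EMBEDDING_SIZE
word = np.zeros((N, T))
glove_matrix = np.zeros((N, T, G))

print np.transpose(word, (1, 0)).shape             # (15, 4):      N x T -> T x N
print np.transpose(glove_matrix, (1, 0, 2)).shape  # (15, 4, 300): N x T x 300 -> T x N x 300
```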
/mfb_coatt_glove/vqa_data_layer_kld.py:
--------------------------------------------------------------------------------
1 | import caffe
2 | import numpy as np
3 | import re, json, random
4 | import config
5 | import spacy
6 |
7 | QID_KEY_SEPARATOR = '/'
8 | GLOVE_EMBEDDING_SIZE = 300
9 |
10 | class VQADataProvider:
11 |
12 | def __init__(self, folder='result', batchsize=64, max_length=15, mode='train'):
13 | self.batchsize = batchsize
14 | self.d_vocabulary = None
15 | self.batch_index = None
16 | self.batch_len = None
17 | self.rev_adict = None
18 | self.max_length = max_length
19 | self.mode = mode
20 | self.qdic, self.adic = VQADataProvider.load_data(mode)
21 |
22 | with open('./%s/vdict.json'%folder,'r') as f:
23 | self.vdict = json.load(f)
24 | with open('./%s/adict.json'%folder,'r') as f:
25 | self.adict = json.load(f)
26 |
27 | self.n_ans_vocabulary = len(self.adict)
28 | self.nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')
29 | self.glove_dict = {} # word -> glove vector
30 |
31 | @staticmethod
32 | def load_vqa_json(data_split):
33 | """
34 | Parses the question and answer json files for the given data split.
35 | Returns the question dictionary and the answer dictionary.
36 | """
37 | qdic, adic = {}, {}
38 |
39 | with open(config.DATA_PATHS[data_split]['ques_file'], 'r') as f:
40 | qdata = json.load(f)['questions']
41 | for q in qdata:
42 | qdic[data_split + QID_KEY_SEPARATOR + str(q['question_id'])] = \
43 | {'qstr': q['question'], 'iid': q['image_id']}
44 |
45 | if 'test' not in data_split:
46 | with open(config.DATA_PATHS[data_split]['ans_file'], 'r') as f:
47 | adata = json.load(f)['annotations']
48 | for a in adata:
49 | adic[data_split + QID_KEY_SEPARATOR + str(a['question_id'])] = \
50 | a['answers']
51 |
52 | print 'parsed', len(qdic), 'questions for', data_split
53 | return qdic, adic
54 |
55 | @staticmethod
56 | def load_genome_json():
57 | """
58 | Parses the genome json file. Returns the question dictionary and the
59 | answer dictionary.
60 | """
61 | qdic, adic = {}, {}
62 |
63 | with open(config.DATA_PATHS['genome']['genome_file'], 'r') as f:
64 | qdata = json.load(f)
65 | for q in qdata:
66 | key = 'genome' + QID_KEY_SEPARATOR + str(q['id'])
67 | qdic[key] = {'qstr': q['question'], 'iid': q['image']}
68 | adic[key] = [{'answer': q['answer']}]
69 |
70 | print 'parsed', len(qdic), 'questions for genome'
71 | return qdic, adic
72 |
73 | @staticmethod
74 | def load_data(data_split_str):
75 | all_qdic, all_adic = {}, {}
76 | for data_split in data_split_str.split('+'):
77 | assert data_split in config.DATA_PATHS.keys(), 'unknown data split'
78 | if data_split == 'genome':
79 | qdic, adic = VQADataProvider.load_genome_json()
80 | all_qdic.update(qdic)
81 | all_adic.update(adic)
82 | else:
83 | qdic, adic = VQADataProvider.load_vqa_json(data_split)
84 | all_qdic.update(qdic)
85 | all_adic.update(adic)
86 | return all_qdic, all_adic
87 |
88 | def getQuesIds(self):
89 | return self.qdic.keys()
90 |
91 | def getStrippedQuesId(self, qid):
92 | return qid.split(QID_KEY_SEPARATOR)[1]
93 |
94 | def getImgId(self,qid):
95 | return self.qdic[qid]['iid']
96 |
97 | def getQuesStr(self,qid):
98 | return self.qdic[qid]['qstr']
99 |
100 | def getAnsObj(self,qid):
101 | if self.mode == 'test-dev' or self.mode == 'test':
102 | return -1
103 | return self.adic[qid]
104 |
105 | @staticmethod
106 | def seq_to_list(s):
107 | t_str = s.lower()
108 | for i in [r'\?',r'\!',r'\'',r'\"',r'\$',r'\:',r'\@',r'\(',r'\)',r'\,',r'\.',r'\;']:
109 | t_str = re.sub( i, '', t_str)
110 | for i in [r'\-',r'\/']:
111 | t_str = re.sub( i, ' ', t_str)
112 |         q_list = t_str.split(' ')
113 | q_list = filter(lambda x: len(x) > 0, q_list)
114 | return q_list
115 |
116 | def extract_answer(self,answer_obj):
117 | """ Return the most popular answer in string."""
118 | if self.mode == 'test-dev' or self.mode == 'test':
119 | return -1
120 | answer_list = [ answer_obj[i]['answer'] for i in xrange(10)]
121 | dic = {}
122 | for ans in answer_list:
123 | if dic.has_key(ans):
124 | dic[ans] +=1
125 | else:
126 | dic[ans] = 1
127 | max_key = max((v,k) for (k,v) in dic.items())[1]
128 | return max_key
129 |
130 | def extract_answer_prob(self,answer_obj):
131 | """ Return the most popular answer in string."""
132 | if self.mode == 'test-dev' or self.mode == 'test':
133 | return -1
134 |
135 | answer_list = [ ans['answer'] for ans in answer_obj]
136 | prob_answer_list = []
137 | for ans in answer_list:
138 | if self.adict.has_key(ans):
139 | prob_answer_list.append(ans)
140 |
141 | if len(prob_answer_list) == 0:
142 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test':
143 | return 'hoge'
144 | else:
145 | raise Exception("This should not happen.")
146 | else:
147 | return random.choice(prob_answer_list)
148 |
149 | def extract_answer_list(self,answer_obj):
150 | answer_list = [ ans['answer'] for ans in answer_obj]
151 | prob_answer_vec = np.zeros(config.NUM_OUTPUT_UNITS)
152 | for ans in answer_list:
153 | if self.adict.has_key(ans):
154 | index = self.adict[ans]
155 | prob_answer_vec[index] += 1
156 | prob_answer_vec = prob_answer_vec / np.sum(prob_answer_vec)
157 | return prob_answer_vec
158 |
159 | def qlist_to_vec(self, max_length, q_list):
160 | """
161 | Converts a list of words into a format suitable for the embedding layer.
162 |
163 | Arguments:
164 | max_length -- the maximum length of a question sequence
165 | q_list -- a list of words which are the tokens in the question
166 |
167 | Returns:
168 | qvec -- A max_length length vector containing one-hot indices for each word
169 | cvec -- A max_length length sequence continuation indicator vector
170 | glove_matrix -- A max_length x GLOVE_EMBEDDING_SIZE matrix containing the glove embedding for
171 | each word
172 | """
173 | qvec = np.zeros(max_length)
174 | cvec = np.zeros(max_length)
175 | glove_matrix = np.zeros(max_length * GLOVE_EMBEDDING_SIZE).reshape(max_length, GLOVE_EMBEDDING_SIZE)
176 | for i in xrange(max_length):
177 | if i < max_length - len(q_list):
178 | cvec[i] = 0
179 | else:
180 | w = q_list[i-(max_length-len(q_list))]
181 | if w not in self.glove_dict:
182 | self.glove_dict[w] = self.nlp(u'%s' % w).vector
183 | glove_matrix[i] = self.glove_dict[w]
184 | # is the word in the vocabulary?
185 | if self.vdict.has_key(w) is False:
186 | w = ''
187 | qvec[i] = self.vdict[w]
188 | cvec[i] = 0 if i == max_length - len(q_list) else 1
189 |
190 | return qvec, cvec, glove_matrix
191 |
192 | def answer_to_vec(self, ans_str):
193 | """ Return answer id if the answer is included in vocabulary otherwise '' """
194 | if self.mode =='test-dev' or self.mode == 'test':
195 | return -1
196 |
197 | if self.adict.has_key(ans_str):
198 | ans = self.adict[ans_str]
199 | else:
200 | ans = self.adict['']
201 | return ans
202 |
203 | def vec_to_answer(self, ans_symbol):
204 | """ Return answer id if the answer is included in vocabulary otherwise '' """
205 | if self.rev_adict is None:
206 | rev_adict = {}
207 | for k,v in self.adict.items():
208 | rev_adict[v] = k
209 | self.rev_adict = rev_adict
210 |
211 | return self.rev_adict[ans_symbol]
212 |
213 | def create_batch(self,qid_list):
214 |
215 | qvec = (np.zeros(self.batchsize*self.max_length)).reshape(self.batchsize,self.max_length)
216 | cvec = (np.zeros(self.batchsize*self.max_length)).reshape(self.batchsize,self.max_length)
217 | ivec = (np.zeros(self.batchsize*2048*14*14)).reshape(self.batchsize,2048,14,14)
218 | avec = (np.zeros(self.batchsize*config.NUM_OUTPUT_UNITS)).reshape(self.batchsize,config.NUM_OUTPUT_UNITS)
219 | glove_matrix = np.zeros(self.batchsize * self.max_length * GLOVE_EMBEDDING_SIZE).reshape(\
220 | self.batchsize, self.max_length, GLOVE_EMBEDDING_SIZE)
221 |
222 | for i,qid in enumerate(qid_list):
223 |
224 | # load raw question information
225 | q_str = self.getQuesStr(qid)
226 | q_ans = self.getAnsObj(qid)
227 | q_iid = self.getImgId(qid)
228 |
229 | # convert question to vec
230 | q_list = VQADataProvider.seq_to_list(q_str)
231 | t_qvec, t_cvec, t_glove_matrix = self.qlist_to_vec(self.max_length, q_list)
232 |
233 | try:
234 | qid_split = qid.split(QID_KEY_SEPARATOR)
235 | data_split = qid_split[0]
236 | if data_split == 'genome':
237 | t_ivec = np.load(config.DATA_PATHS['genome']['features_prefix'] + str(q_iid) + '.jpg.npz')['x']
238 | else:
239 | t_ivec = np.load(config.DATA_PATHS[data_split]['features_prefix'] + str(q_iid).zfill(12) + '.jpg.npz')['x']
240 | t_ivec = ( t_ivec / np.sqrt((t_ivec**2).sum()) )
241 |             except Exception:
242 |                 t_ivec = 0.
243 |                 print 'data not found for iid : ', q_iid, self.mode
244 |
245 | # convert answer to vec
246 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test':
247 | q_ans_str = self.extract_answer(q_ans)
248 | t_avec = self.answer_to_vec(q_ans_str)
249 | else:
250 | t_avec = self.extract_answer_list(q_ans)
251 |
252 | qvec[i,...] = t_qvec
253 | cvec[i,...] = t_cvec
254 | ivec[i,...] = t_ivec
255 | avec[i,...] = t_avec
256 | glove_matrix[i,...] = t_glove_matrix
257 |
258 | return qvec, cvec, ivec, avec, glove_matrix
259 |
260 |
261 | def get_batch_vec(self):
262 | if self.batch_len is None:
263 | self.n_skipped = 0
264 | qid_list = self.getQuesIds()
265 | random.shuffle(qid_list)
266 | self.qid_list = qid_list
267 | self.batch_len = len(qid_list)
268 | self.batch_index = 0
269 | self.epoch_counter = 0
270 |
271 |         def has_at_least_one_valid_answer(t_qid):
272 |             answer_obj = self.getAnsObj(t_qid)
273 |             answer_list = [ans['answer'] for ans in answer_obj]
274 |             for ans in answer_list:
275 |                 if self.adict.has_key(ans):
276 |                     return True
277 |             return False
278 | counter = 0
279 | t_qid_list = []
280 | t_iid_list = []
281 | while counter < self.batchsize:
282 | t_qid = self.qid_list[self.batch_index]
283 | t_iid = self.getImgId(t_qid)
284 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test':
285 | t_qid_list.append(t_qid)
286 | t_iid_list.append(t_iid)
287 | counter += 1
288 | elif has_at_least_one_valid_answer(t_qid):
289 | t_qid_list.append(t_qid)
290 | t_iid_list.append(t_iid)
291 | counter += 1
292 | else:
293 | self.n_skipped += 1
294 |
295 | if self.batch_index < self.batch_len-1:
296 | self.batch_index += 1
297 | else:
298 | self.epoch_counter += 1
299 | qid_list = self.getQuesIds()
300 | random.shuffle(qid_list)
301 | self.qid_list = qid_list
302 | self.batch_index = 0
303 | print("%d questions were skipped in a single epoch" % self.n_skipped)
304 | self.n_skipped = 0
305 |
306 | t_batch = self.create_batch(t_qid_list)
307 | return t_batch + (t_qid_list, t_iid_list, self.epoch_counter)
308 |
309 |
310 | class VQADataProviderLayer(caffe.Layer):
311 | """
312 | Provide input data for VQA.
313 | """
314 |
315 | def setup(self, bottom, top):
316 | self.batchsize = json.loads(self.param_str)['batchsize']
317 | self.top_names = ['data','cont','feature','label','glove']
318 | top[0].reshape(15,self.batchsize)
319 | top[1].reshape(15,self.batchsize)
320 | top[2].reshape(self.batchsize,2048,14,14)
321 | top[3].reshape(self.batchsize,config.NUM_OUTPUT_UNITS)
322 | top[4].reshape(15,self.batchsize,GLOVE_EMBEDDING_SIZE)
323 |
324 | self.mode = json.loads(self.param_str)['mode']
325 | self.folder = json.loads(self.param_str)['folder']
326 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test':
327 | pass
328 | else:
329 | self.dp = VQADataProvider(batchsize=self.batchsize, mode=self.mode, folder=self.folder)
330 |
331 | def reshape(self, bottom, top):
332 | pass
333 |
334 | def forward(self, bottom, top):
335 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test':
336 | pass
337 | else:
338 |             word, cont, feature, answer, glove_matrix, _, _, _ = self.dp.get_batch_vec()
339 | top[0].data[...] = np.transpose(word,(1,0)) # N x T -> T x N
340 | top[1].data[...] = np.transpose(cont,(1,0))
341 | top[2].data[...] = feature
342 | top[3].data[...] = answer
343 | top[4].data[...] = np.transpose(glove_matrix, (1,0,2)) # N x T x 300 -> T x N x 300
344 |
345 | def backward(self, top, propagate_down, bottom):
346 | pass
347 |
348 |
--------------------------------------------------------------------------------
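
The provider above is wired into Caffe through the Python-layer mechanism: `param_str` carries a small JSON config, and `get_batch_vec` hands back the five blobs plus bookkeeping. A minimal smoke test of the provider on its own might look like the sketch below; it assumes you run it from inside one of the model folders, that config.DATA_PATHS points at real data, and that a `result/` folder with vdict.json/adict.json from a prior run exists (all of which are assumptions, not guarantees):

    # Sketch only -- data paths and an existing vocab folder are assumed.
    from vqa_data_layer import VQADataProvider

    dp = VQADataProvider(folder='result', batchsize=4, mode='train')
    qvec, cvec, ivec, avec, glove, qids, iids, epoch = dp.get_batch_vec()
    print qvec.shape   # (4, 15): word indices, N x T
    print ivec.shape   # (4, 2048, 14, 14): image features
    print glove.shape  # (4, 15, 300): GloVe embeddings, N x T x 300

--------------------------------------------------------------------------------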
/mfh_coatt_glove/vqa_data_layer_kld.py:
--------------------------------------------------------------------------------
1 | import caffe
2 | import numpy as np
3 | import re, json, random
4 | import config
5 | import spacy
6 |
7 | QID_KEY_SEPARATOR = '/'
8 | GLOVE_EMBEDDING_SIZE = 300
9 |
10 | class VQADataProvider:
11 |
12 | def __init__(self, folder='result', batchsize=64, max_length=15, mode='train'):
13 | self.batchsize = batchsize
14 | self.d_vocabulary = None
15 | self.batch_index = None
16 | self.batch_len = None
17 | self.rev_adict = None
18 | self.max_length = max_length
19 | self.mode = mode
20 | self.qdic, self.adic = VQADataProvider.load_data(mode)
21 |
22 | with open('./%s/vdict.json'%folder,'r') as f:
23 | self.vdict = json.load(f)
24 | with open('./%s/adict.json'%folder,'r') as f:
25 | self.adict = json.load(f)
26 |
27 | self.n_ans_vocabulary = len(self.adict)
28 |         self.nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')  # spaCy 1.x-style load with GloVe vectors
29 | self.glove_dict = {} # word -> glove vector
30 |
31 | @staticmethod
32 | def load_vqa_json(data_split):
33 | """
34 | Parses the question and answer json files for the given data split.
35 | Returns the question dictionary and the answer dictionary.
36 | """
37 | qdic, adic = {}, {}
38 |
39 | with open(config.DATA_PATHS[data_split]['ques_file'], 'r') as f:
40 | qdata = json.load(f)['questions']
41 | for q in qdata:
42 | qdic[data_split + QID_KEY_SEPARATOR + str(q['question_id'])] = \
43 | {'qstr': q['question'], 'iid': q['image_id']}
44 |
45 | if 'test' not in data_split:
46 | with open(config.DATA_PATHS[data_split]['ans_file'], 'r') as f:
47 | adata = json.load(f)['annotations']
48 | for a in adata:
49 | adic[data_split + QID_KEY_SEPARATOR + str(a['question_id'])] = \
50 | a['answers']
51 |
52 | print 'parsed', len(qdic), 'questions for', data_split
53 | return qdic, adic
54 |
55 | @staticmethod
56 | def load_genome_json():
57 | """
58 | Parses the genome json file. Returns the question dictionary and the
59 | answer dictionary.
60 | """
61 | qdic, adic = {}, {}
62 |
63 | with open(config.DATA_PATHS['genome']['genome_file'], 'r') as f:
64 | qdata = json.load(f)
65 | for q in qdata:
66 | key = 'genome' + QID_KEY_SEPARATOR + str(q['id'])
67 | qdic[key] = {'qstr': q['question'], 'iid': q['image']}
68 | adic[key] = [{'answer': q['answer']}]
69 |
70 | print 'parsed', len(qdic), 'questions for genome'
71 | return qdic, adic
72 |
73 | @staticmethod
74 | def load_data(data_split_str):
75 | all_qdic, all_adic = {}, {}
76 | for data_split in data_split_str.split('+'):
77 | assert data_split in config.DATA_PATHS.keys(), 'unknown data split'
78 | if data_split == 'genome':
79 | qdic, adic = VQADataProvider.load_genome_json()
80 | all_qdic.update(qdic)
81 | all_adic.update(adic)
82 | else:
83 | qdic, adic = VQADataProvider.load_vqa_json(data_split)
84 | all_qdic.update(qdic)
85 | all_adic.update(adic)
86 | return all_qdic, all_adic
87 |
88 | def getQuesIds(self):
89 | return self.qdic.keys()
90 |
91 | def getStrippedQuesId(self, qid):
92 | return qid.split(QID_KEY_SEPARATOR)[1]
93 |
94 | def getImgId(self,qid):
95 | return self.qdic[qid]['iid']
96 |
97 | def getQuesStr(self,qid):
98 | return self.qdic[qid]['qstr']
99 |
100 | def getAnsObj(self,qid):
101 | if self.mode == 'test-dev' or self.mode == 'test':
102 | return -1
103 | return self.adic[qid]
104 |
105 | @staticmethod
106 | def seq_to_list(s):
107 | t_str = s.lower()
108 | for i in [r'\?',r'\!',r'\'',r'\"',r'\$',r'\:',r'\@',r'\(',r'\)',r'\,',r'\.',r'\;']:
109 | t_str = re.sub( i, '', t_str)
110 | for i in [r'\-',r'\/']:
111 | t_str = re.sub( i, ' ', t_str)
112 |         q_list = t_str.split(' ')
113 | q_list = filter(lambda x: len(x) > 0, q_list)
114 | return q_list
115 |
116 | def extract_answer(self,answer_obj):
117 | """ Return the most popular answer in string."""
118 | if self.mode == 'test-dev' or self.mode == 'test':
119 | return -1
120 |         answer_list = [ answer_obj[i]['answer'] for i in xrange(10)]  # every VQA question has 10 annotated answers
121 | dic = {}
122 | for ans in answer_list:
123 | if dic.has_key(ans):
124 | dic[ans] +=1
125 | else:
126 | dic[ans] = 1
127 | max_key = max((v,k) for (k,v) in dic.items())[1]
128 | return max_key
129 |
130 | def extract_answer_prob(self,answer_obj):
131 |         """ Return a random answer among those present in the answer vocabulary."""
132 | if self.mode == 'test-dev' or self.mode == 'test':
133 | return -1
134 |
135 | answer_list = [ ans['answer'] for ans in answer_obj]
136 | prob_answer_list = []
137 | for ans in answer_list:
138 | if self.adict.has_key(ans):
139 | prob_answer_list.append(ans)
140 |
141 | if len(prob_answer_list) == 0:
142 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test':
143 | return 'hoge'
144 | else:
145 | raise Exception("This should not happen.")
146 | else:
147 | return random.choice(prob_answer_list)
148 |
149 | def extract_answer_list(self,answer_obj):
150 | answer_list = [ ans['answer'] for ans in answer_obj]
151 | prob_answer_vec = np.zeros(config.NUM_OUTPUT_UNITS)
152 | for ans in answer_list:
153 | if self.adict.has_key(ans):
154 | index = self.adict[ans]
155 | prob_answer_vec[index] += 1
156 |         prob_answer_vec = prob_answer_vec / np.sum(prob_answer_vec)  # safe: training batches only contain questions with an in-vocabulary answer
157 | return prob_answer_vec
158 |
159 | def qlist_to_vec(self, max_length, q_list):
160 | """
161 | Converts a list of words into a format suitable for the embedding layer.
162 |
163 | Arguments:
164 | max_length -- the maximum length of a question sequence
165 | q_list -- a list of words which are the tokens in the question
166 |
167 | Returns:
168 |         qvec -- a length-max_length vector of vocabulary indices, one per word (fed to the Embed layer)
169 |         cvec -- a length-max_length continuation indicator vector (0 for padding and the first word, 1 afterwards)
170 |         glove_matrix -- a max_length x GLOVE_EMBEDDING_SIZE matrix containing the GloVe embedding of
171 |                        each word
172 | """
173 | qvec = np.zeros(max_length)
174 | cvec = np.zeros(max_length)
175 | glove_matrix = np.zeros(max_length * GLOVE_EMBEDDING_SIZE).reshape(max_length, GLOVE_EMBEDDING_SIZE)
176 | for i in xrange(max_length):
177 | if i < max_length - len(q_list):
178 | cvec[i] = 0
179 | else:
180 | w = q_list[i-(max_length-len(q_list))]
181 | if w not in self.glove_dict:
182 | self.glove_dict[w] = self.nlp(u'%s' % w).vector
183 | glove_matrix[i] = self.glove_dict[w]
184 | # is the word in the vocabulary?
185 |             if not self.vdict.has_key(w):  # out-of-vocabulary words map to ''
186 | w = ''
187 | qvec[i] = self.vdict[w]
188 | cvec[i] = 0 if i == max_length - len(q_list) else 1
189 |
190 | return qvec, cvec, glove_matrix
191 |
192 | def answer_to_vec(self, ans_str):
193 |         """ Return the answer id if the answer is in the vocabulary, otherwise the id of ''. """
194 | if self.mode =='test-dev' or self.mode == 'test':
195 | return -1
196 |
197 | if self.adict.has_key(ans_str):
198 | ans = self.adict[ans_str]
199 | else:
200 | ans = self.adict['']
201 | return ans
202 |
203 | def vec_to_answer(self, ans_symbol):
204 |         """ Return the answer string corresponding to the given answer id. """
205 | if self.rev_adict is None:
206 | rev_adict = {}
207 | for k,v in self.adict.items():
208 | rev_adict[v] = k
209 | self.rev_adict = rev_adict
210 |
211 | return self.rev_adict[ans_symbol]
212 |
213 | def create_batch(self,qid_list):
214 |
215 |         qvec = np.zeros((self.batchsize, self.max_length))
216 |         cvec = np.zeros((self.batchsize, self.max_length))
217 |         ivec = np.zeros((self.batchsize, 2048, 14, 14))
218 |         avec = np.zeros((self.batchsize, config.NUM_OUTPUT_UNITS))
219 |         glove_matrix = np.zeros((self.batchsize, self.max_length, GLOVE_EMBEDDING_SIZE))
220 |
221 |
222 | for i,qid in enumerate(qid_list):
223 |
224 | # load raw question information
225 | q_str = self.getQuesStr(qid)
226 | q_ans = self.getAnsObj(qid)
227 | q_iid = self.getImgId(qid)
228 |
229 | # convert question to vec
230 | q_list = VQADataProvider.seq_to_list(q_str)
231 | t_qvec, t_cvec, t_glove_matrix = self.qlist_to_vec(self.max_length, q_list)
232 |
233 | try:
234 | qid_split = qid.split(QID_KEY_SEPARATOR)
235 | data_split = qid_split[0]
236 | if data_split == 'genome':
237 | t_ivec = np.load(config.DATA_PATHS['genome']['features_prefix'] + str(q_iid) + '.jpg.npz')['x']
238 | else:
239 | t_ivec = np.load(config.DATA_PATHS[data_split]['features_prefix'] + str(q_iid).zfill(12) + '.jpg.npz')['x']
240 |                 t_ivec = t_ivec / np.sqrt((t_ivec**2).sum())  # L2-normalize the image feature
241 |             except Exception:
242 |                 t_ivec = 0.
243 |                 print 'image feature not found for image id:', q_iid, 'mode:', self.mode
244 |
245 | # convert answer to vec
246 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test':
247 | q_ans_str = self.extract_answer(q_ans)
248 | t_avec = self.answer_to_vec(q_ans_str)
249 | else:
250 | t_avec = self.extract_answer_list(q_ans)
251 |
252 | qvec[i,...] = t_qvec
253 | cvec[i,...] = t_cvec
254 | ivec[i,...] = t_ivec
255 | avec[i,...] = t_avec
256 | glove_matrix[i,...] = t_glove_matrix
257 |
258 | return qvec, cvec, ivec, avec, glove_matrix
259 |
260 |
261 | def get_batch_vec(self):
262 | if self.batch_len is None:
263 | self.n_skipped = 0
264 | qid_list = self.getQuesIds()
265 | random.shuffle(qid_list)
266 | self.qid_list = qid_list
267 | self.batch_len = len(qid_list)
268 | self.batch_index = 0
269 | self.epoch_counter = 0
270 |
271 | def has_at_least_one_valid_answer(t_qid):
272 | answer_obj = self.getAnsObj(t_qid)
273 | answer_list = [ans['answer'] for ans in answer_obj]
274 | for ans in answer_list:
275 | if self.adict.has_key(ans):
276 | return True
277 |             return False
278 | counter = 0
279 | t_qid_list = []
280 | t_iid_list = []
281 | while counter < self.batchsize:
282 | t_qid = self.qid_list[self.batch_index]
283 | t_iid = self.getImgId(t_qid)
284 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test':
285 | t_qid_list.append(t_qid)
286 | t_iid_list.append(t_iid)
287 | counter += 1
288 | elif has_at_least_one_valid_answer(t_qid):
289 | t_qid_list.append(t_qid)
290 | t_iid_list.append(t_iid)
291 | counter += 1
292 | else:
293 | self.n_skipped += 1
294 |
295 | if self.batch_index < self.batch_len-1:
296 | self.batch_index += 1
297 | else:
298 | self.epoch_counter += 1
299 | qid_list = self.getQuesIds()
300 | random.shuffle(qid_list)
301 | self.qid_list = qid_list
302 | self.batch_index = 0
303 | print("%d questions were skipped in a single epoch" % self.n_skipped)
304 | self.n_skipped = 0
305 |
306 | t_batch = self.create_batch(t_qid_list)
307 | return t_batch + (t_qid_list, t_iid_list, self.epoch_counter)
308 |
309 |
310 | class VQADataProviderLayer(caffe.Layer):
311 | """
312 | Provide input data for VQA.
313 | """
314 |
315 | def setup(self, bottom, top):
316 | self.batchsize = json.loads(self.param_str)['batchsize']
317 | self.top_names = ['data','cont','feature','label','glove']
318 |         top[0].reshape(15,self.batchsize)                       # word indices, T x N (T = max question length, 15)
319 |         top[1].reshape(15,self.batchsize)                       # continuation indicators, T x N
320 |         top[2].reshape(self.batchsize,2048,14,14)               # image features, N x 2048 x 14 x 14
321 |         top[3].reshape(self.batchsize,config.NUM_OUTPUT_UNITS)  # soft answer distributions for the KLD loss
322 |         top[4].reshape(15,self.batchsize,GLOVE_EMBEDDING_SIZE)  # GloVe embeddings, T x N x 300
323 |
324 | self.mode = json.loads(self.param_str)['mode']
325 | self.folder = json.loads(self.param_str)['folder']
326 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test':
327 | pass
328 | else:
329 | self.dp = VQADataProvider(batchsize=self.batchsize, mode=self.mode, folder=self.folder)
330 |
331 | def reshape(self, bottom, top):
332 | pass
333 |
334 | def forward(self, bottom, top):
335 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test':
336 | pass
337 | else:
338 | word, cont, feature, answer, glove_matrix, _, _, epoch_counter = self.dp.get_batch_vec()
339 | top[0].data[...] = np.transpose(word,(1,0)) # N x T -> T x N
340 | top[1].data[...] = np.transpose(cont,(1,0))
341 | top[2].data[...] = feature
342 | top[3].data[...] = answer
343 | top[4].data[...] = np.transpose(glove_matrix, (1,0,2)) # N x T x 300 -> T x N x 300
344 |
345 | def backward(self, top, propagate_down, bottom):
346 | pass
347 |
348 |
--------------------------------------------------------------------------------
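
The only substantive difference from the plain data layer is extract_answer_list, which turns a question's ten human answers into a normalized histogram over the answer vocabulary; the SoftmaxKLDLoss used at training time then matches the predicted distribution against it. A toy recreation of that vectorization, with a made-up vocabulary and answer set purely for illustration:

    import numpy as np

    # Hypothetical stand-ins for self.adict and one question's ten answers.
    adict = {'yes': 0, 'no': 1, '2': 2}
    answers = ['yes'] * 6 + ['no'] * 3 + ['azure']  # 'azure' is out of vocabulary

    vec = np.zeros(len(adict))
    for ans in answers:
        if ans in adict:
            vec[adict[ans]] += 1
    vec = vec / vec.sum()  # normalize over in-vocabulary answers only
    print vec              # [ 0.66666667  0.33333333  0.        ]

--------------------------------------------------------------------------------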
/mfb_coatt_glove/train_mfb_coatt_glove.py:
--------------------------------------------------------------------------------
1 | import matplotlib
2 | matplotlib.use('Agg')
3 | import os
4 | import sys
5 | import numpy as np
6 | import json
7 | import matplotlib.pyplot as plt
8 |
9 | import caffe
10 | from caffe import layers as L
11 | from caffe import params as P
12 | from caffe.proto import caffe_pb2
13 |
14 | from vqa_data_layer_kld import VQADataProvider
15 | from utils import exec_validation, drawgraph
16 | import config
17 | import time
18 |
19 | def get_solver(folder):
20 | s = caffe_pb2.SolverParameter()
21 | s.train_net = './%s/proto_train.prototxt'%folder
22 | s.snapshot = int(config.VALIDATE_INTERVAL)
23 | s.snapshot_prefix = './%s/'%folder
24 | s.max_iter = int(config.MAX_ITERATIONS)
25 | s.display = int(config.VALIDATE_INTERVAL)
26 | s.type = 'Adam'
27 | s.stepsize = int(config.MAX_ITERATIONS*0.4)
28 | s.gamma = 0.5
29 | s.lr_policy = "step"
30 | s.base_lr = 0.0007
31 | s.momentum = 0.9
32 | s.momentum2 = 0.999
33 | s.weight_decay = 0.000
34 | s.clip_gradients = 10
35 | return s
36 |
37 | def get_auxiliary_json():
38 | aux = {}
39 | aux["batch_size"] = int(config.VAL_BATCH_SIZE)
40 | aux["data_shape"] = [2048,14,14]
41 | aux["img_feature_prefix"] = config.DATA_PATHS['test']['features_prefix']
42 | aux["glove"] = True
43 | return aux
44 |
45 |
46 | def mfb_coatt(mode, batchsize, T, question_vocab_size, folder):
47 | n = caffe.NetSpec()
48 | mode_str = json.dumps({'mode':mode, 'batchsize':batchsize,'folder':folder})
49 | if mode == 'val':
50 | n.data, n.cont, n.img_feature, n.label, n.glove = L.Python( \
51 | module='vqa_data_layer', layer='VQADataProviderLayer', \
52 | param_str=mode_str, ntop=5 )
53 | else:
54 | n.data, n.cont, n.img_feature, n.label, n.glove = L.Python(\
55 | module='vqa_data_layer_kld', layer='VQADataProviderLayer', \
56 | param_str=mode_str, ntop=5 )
57 | n.embed = L.Embed(n.data, input_dim=question_vocab_size, num_output=300, \
58 | weight_filler=dict(type='xavier'))
59 | n.embed_tanh = L.TanH(n.embed)
60 | concat_word_embed = [n.embed_tanh, n.glove]
61 | n.concat_embed = L.Concat(*concat_word_embed, concat_param={'axis': 2}) # T x N x 600
62 |
63 | # LSTM
64 | n.lstm1 = L.LSTM(\
65 | n.concat_embed, n.cont,\
66 | recurrent_param=dict(\
67 | num_output=config.LSTM_UNIT_NUM,\
68 | weight_filler=dict(type='xavier')))
69 | n.lstm1_droped = L.Dropout(n.lstm1,dropout_param={'dropout_ratio':config.LSTM_DROPOUT_RATIO})
70 | n.lstm1_resh = L.Permute(n.lstm1_droped, permute_param=dict(order=[1,2,0]))
71 | n.lstm1_resh2 = L.Reshape(n.lstm1_resh, \
72 | reshape_param=dict(shape=dict(dim=[0,0,0,1])))
73 |
74 | '''
75 | Question Attention
76 | '''
77 | n.qatt_conv1 = L.Convolution(n.lstm1_resh2, kernel_size=1, stride=1, num_output=512, pad=0,
78 | weight_filler=dict(type='xavier'))
79 | n.qatt_relu = L.ReLU(n.qatt_conv1)
80 | n.qatt_conv2 = L.Convolution(n.qatt_relu, kernel_size=1, stride=1, num_output=config.NUM_QUESTION_GLIMPSE, pad=0,
81 | weight_filler=dict(type='xavier'))
82 | n.qatt_reshape = L.Reshape(n.qatt_conv2, reshape_param=dict(shape=dict(dim=[-1,config.NUM_QUESTION_GLIMPSE,config.MAX_WORDS_IN_QUESTION,1]))) # N*NUM_QUESTION_GLIMPSE*15
83 | n.qatt_softmax = L.Softmax(n.qatt_reshape, axis=2)
84 |
85 | qatt_maps = L.Slice(n.qatt_softmax,ntop=config.NUM_QUESTION_GLIMPSE,slice_param={'axis':1})
86 | dummy_lstm = L.DummyData(shape=dict(dim=[batchsize, 1]), data_filler=dict(type='constant', value=1), ntop=1)
87 | qatt_feature_list = []
88 | for i in xrange(config.NUM_QUESTION_GLIMPSE):
89 | if config.NUM_QUESTION_GLIMPSE == 1:
90 | n.__setattr__('qatt_feat%d'%i, L.SoftAttention(n.lstm1_resh2, qatt_maps, dummy_lstm))
91 | else:
92 | n.__setattr__('qatt_feat%d'%i, L.SoftAttention(n.lstm1_resh2, qatt_maps[i], dummy_lstm))
93 | qatt_feature_list.append(n.__getattr__('qatt_feat%d'%i))
94 | n.qatt_feat_concat = L.Concat(*qatt_feature_list)
95 | '''
96 | Image Attention with MFB
97 | '''
98 | n.q_feat_resh = L.Reshape(n.qatt_feat_concat,reshape_param=dict(shape=dict(dim=[0,-1,1,1])))
99 | n.i_feat_resh = L.Reshape(n.img_feature,reshape_param=dict(shape=dict(dim=[0,-1,config.IMG_FEAT_WIDTH,config.IMG_FEAT_WIDTH])))
100 |
101 | n.iatt_q_proj = L.InnerProduct(n.q_feat_resh, num_output = config.JOINT_EMB_SIZE,
102 | weight_filler=dict(type='xavier'))
103 | n.iatt_q_resh = L.Reshape(n.iatt_q_proj, reshape_param=dict(shape=dict(dim=[-1,config.JOINT_EMB_SIZE,1,1])))
104 | n.iatt_q_tile1 = L.Tile(n.iatt_q_resh, axis=2, tiles=config.IMG_FEAT_WIDTH)
105 | n.iatt_q_tile2 = L.Tile(n.iatt_q_tile1, axis=3, tiles=config.IMG_FEAT_WIDTH)
106 |
107 |
108 | n.iatt_i_conv = L.Convolution(n.i_feat_resh, kernel_size=1, stride=1, num_output=config.JOINT_EMB_SIZE, pad=0,
109 | weight_filler=dict(type='xavier'))
110 | n.iatt_i_resh1 = L.Reshape(n.iatt_i_conv, reshape_param=dict(shape=dict(dim=[-1,config.JOINT_EMB_SIZE,
111 | config.IMG_FEAT_WIDTH,config.IMG_FEAT_WIDTH])))
112 |     n.iatt_iq_eltwise = L.Eltwise(n.iatt_q_tile2, n.iatt_i_resh1, eltwise_param=dict(operation=0))  # 0 = PROD (element-wise product)
113 | n.iatt_iq_droped = L.Dropout(n.iatt_iq_eltwise, dropout_param={'dropout_ratio':config.MFB_DROPOUT_RATIO})
114 | n.iatt_iq_resh2 = L.Reshape(n.iatt_iq_droped, reshape_param=dict(shape=dict(dim=[-1,config.JOINT_EMB_SIZE,196,1])))
115 | n.iatt_iq_permute1 = L.Permute(n.iatt_iq_resh2, permute_param=dict(order=[0,2,1,3]))
116 |     n.iatt_iq_resh3 = L.Reshape(n.iatt_iq_permute1, reshape_param=dict(shape=dict(dim=[-1,config.IMG_FEAT_SIZE,
117 |                                config.MFB_OUT_DIM,config.MFB_FACTOR_NUM])))
118 |     n.iatt_iq_sumpool = L.Pooling(n.iatt_iq_resh3, pool=P.Pooling.SUM, \
119 | pooling_param=dict(kernel_w=config.MFB_FACTOR_NUM, kernel_h=1))
120 | n.iatt_iq_permute2 = L.Permute(n.iatt_iq_sumpool, permute_param=dict(order=[0,2,1,3]))
121 |
122 | n.iatt_iq_sqrt = L.SignedSqrt(n.iatt_iq_permute2)
123 | n.iatt_iq_l2 = L.L2Normalize(n.iatt_iq_sqrt)
124 |
125 |
126 |     ## two 1x1 conv layers: MFB_OUT_DIM (1000) -> 512 -> NUM_IMG_GLIMPSE (2)
127 | n.iatt_conv1 = L.Convolution(n.iatt_iq_l2, kernel_size=1, stride=1, num_output=512, pad=0,
128 | weight_filler=dict(type='xavier'))
129 | n.iatt_relu = L.ReLU(n.iatt_conv1)
130 | n.iatt_conv2 = L.Convolution(n.iatt_relu, kernel_size=1, stride=1, num_output=config.NUM_IMG_GLIMPSE, pad=0,
131 | weight_filler=dict(type='xavier'))
132 | n.iatt_resh = L.Reshape(n.iatt_conv2, reshape_param=dict(shape=dict(dim=[-1,config.NUM_IMG_GLIMPSE,config.IMG_FEAT_SIZE])))
133 | n.iatt_softmax = L.Softmax(n.iatt_resh, axis=2)
134 | n.iatt_softmax_resh = L.Reshape(n.iatt_softmax,reshape_param=dict(shape=dict(dim=[-1,config.NUM_IMG_GLIMPSE,config.IMG_FEAT_WIDTH,config.IMG_FEAT_WIDTH])))
135 | iatt_maps = L.Slice(n.iatt_softmax_resh, ntop=config.NUM_IMG_GLIMPSE,slice_param={'axis':1})
136 | dummy = L.DummyData(shape=dict(dim=[batchsize, 1]), data_filler=dict(type='constant', value=1), ntop=1)
137 | iatt_feature_list = []
138 | for i in xrange(config.NUM_IMG_GLIMPSE):
139 | if config.NUM_IMG_GLIMPSE == 1:
140 | n.__setattr__('iatt_feat%d'%i, L.SoftAttention(n.i_feat_resh, iatt_maps, dummy))
141 | else:
142 | n.__setattr__('iatt_feat%d'%i, L.SoftAttention(n.i_feat_resh, iatt_maps[i], dummy))
143 | n.__setattr__('iatt_feat%d_resh'%i, L.Reshape(n.__getattr__('iatt_feat%d'%i), \
144 | reshape_param=dict(shape=dict(dim=[0,-1]))))
145 | iatt_feature_list.append(n.__getattr__('iatt_feat%d_resh'%i))
146 | n.iatt_feat_concat = L.Concat(*iatt_feature_list)
147 | n.iatt_feat_concat_resh = L.Reshape(n.iatt_feat_concat, reshape_param=dict(shape=dict(dim=[0,-1,1,1])))
148 |
149 | '''
150 | Fine-grained Image-Question MFB fusion
151 | '''
152 |
153 | n.mfb_q_proj = L.InnerProduct(n.q_feat_resh, num_output=config.JOINT_EMB_SIZE,
154 | weight_filler=dict(type='xavier'))
155 | n.mfb_i_proj = L.InnerProduct(n.iatt_feat_concat_resh, num_output=config.JOINT_EMB_SIZE,
156 | weight_filler=dict(type='xavier'))
157 |     n.mfb_iq_eltwise = L.Eltwise(n.mfb_q_proj, n.mfb_i_proj, eltwise_param=dict(operation=0))  # 0 = PROD (element-wise product)
158 | n.mfb_iq_drop = L.Dropout(n.mfb_iq_eltwise, dropout_param={'dropout_ratio':config.MFB_DROPOUT_RATIO})
159 |     n.mfb_iq_resh = L.Reshape(n.mfb_iq_drop, reshape_param=dict(shape=dict(dim=[-1,1,config.MFB_OUT_DIM,config.MFB_FACTOR_NUM])))  # MFB_OUT_DIM groups of MFB_FACTOR_NUM values for sum pooling
160 | n.mfb_iq_sumpool = L.Pooling(n.mfb_iq_resh, pool=P.Pooling.SUM, \
161 | pooling_param=dict(kernel_w=config.MFB_FACTOR_NUM, kernel_h=1))
162 | n.mfb_out = L.Reshape(n.mfb_iq_sumpool,\
163 | reshape_param=dict(shape=dict(dim=[-1,config.MFB_OUT_DIM])))
164 | n.mfb_sign_sqrt = L.SignedSqrt(n.mfb_out)
165 | n.mfb_l2 = L.L2Normalize(n.mfb_sign_sqrt)
166 |
167 | n.prediction = L.InnerProduct(n.mfb_l2, num_output=config.NUM_OUTPUT_UNITS,
168 | weight_filler=dict(type='xavier'))
169 | if mode == 'val':
170 | n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
171 | else:
172 | n.loss = L.SoftmaxKLDLoss(n.prediction, n.label)
173 | return n.to_proto()
174 |
175 | def make_answer_vocab(adic, vocab_size):
176 | """
177 | Returns a dictionary that maps words to indices.
178 | """
179 | adict = {'':0}
180 | nadict = {'':1000000}
181 | vid = 1
182 | for qid in adic.keys():
183 | answer_obj = adic[qid]
184 | answer_list = [ans['answer'] for ans in answer_obj]
185 |
186 | for q_ans in answer_list:
187 | # create dict
188 | if adict.has_key(q_ans):
189 | nadict[q_ans] += 1
190 | else:
191 | nadict[q_ans] = 1
192 | adict[q_ans] = vid
193 | vid +=1
194 |
195 |     # sort answers by ascending frequency
196 | nalist = []
197 | for k,v in sorted(nadict.items(), key=lambda x:x[1]):
198 | nalist.append((k,v))
199 |
200 |     # keep only the vocab_size most frequent answers
201 | n_del_ans = 0
202 | n_valid_ans = 0
203 | adict_nid = {}
204 | for i, w in enumerate(nalist[:-vocab_size]):
205 | del adict[w[0]]
206 | n_del_ans += w[1]
207 | for i, w in enumerate(nalist[-vocab_size:]):
208 | n_valid_ans += w[1]
209 | adict_nid[w[0]] = i
210 |
211 | return adict_nid
212 |
213 | def make_question_vocab(qdic):
214 | """
215 | Returns a dictionary that maps words to indices.
216 | """
217 | vdict = {'':0}
218 | vid = 1
219 | for qid in qdic.keys():
220 | # sequence to list
221 | q_str = qdic[qid]['qstr']
222 | q_list = VQADataProvider.seq_to_list(q_str)
223 |
224 | # create dict
225 | for w in q_list:
226 | if not vdict.has_key(w):
227 | vdict[w] = vid
228 | vid +=1
229 |
230 | return vdict
231 |
232 | def make_vocab_files():
233 | """
234 | Produce the question and answer vocabulary files.
235 | """
236 | print 'making question vocab...', config.QUESTION_VOCAB_SPACE
237 | qdic, _ = VQADataProvider.load_data(config.QUESTION_VOCAB_SPACE)
238 | question_vocab = make_question_vocab(qdic)
239 | print 'making answer vocab...', config.ANSWER_VOCAB_SPACE
240 | _, adic = VQADataProvider.load_data(config.ANSWER_VOCAB_SPACE)
241 | answer_vocab = make_answer_vocab(adic, config.NUM_OUTPUT_UNITS)
242 | return question_vocab, answer_vocab
243 |
244 | def main():
245 | folder = 'mfb_coatt_glove_q%dv%d_%s'%(config.NUM_QUESTION_GLIMPSE, config.NUM_IMG_GLIMPSE,config.TRAIN_DATA_SPLITS)
246 | if not os.path.exists('./%s'%folder):
247 | os.makedirs('./%s'%folder)
248 |
249 | question_vocab, answer_vocab = {}, {}
250 | if os.path.exists('./%s/vdict.json'%folder) and os.path.exists('./%s/adict.json'%folder):
251 | print 'restoring vocab'
252 | with open('./%s/vdict.json'%folder,'r') as f:
253 | question_vocab = json.load(f)
254 | with open('./%s/adict.json'%folder,'r') as f:
255 | answer_vocab = json.load(f)
256 | else:
257 | question_vocab, answer_vocab = make_vocab_files()
258 | with open('./%s/vdict.json'%folder,'w') as f:
259 | json.dump(question_vocab, f)
260 | with open('./%s/adict.json'%folder,'w') as f:
261 | json.dump(answer_vocab, f)
262 |
263 | print 'question vocab size:', len(question_vocab)
264 | print 'answer vocab size:', len(answer_vocab)
265 |
266 | with open('./%s/proto_train.prototxt'%folder, 'w') as f:
267 | f.write(str(mfb_coatt(config.TRAIN_DATA_SPLITS, config.BATCH_SIZE, \
268 | config.MAX_WORDS_IN_QUESTION, len(question_vocab), folder)))
269 |
270 | with open('./%s/proto_test.prototxt'%folder, 'w') as f:
271 | f.write(str(mfb_coatt('val', config.VAL_BATCH_SIZE, \
272 | config.MAX_WORDS_IN_QUESTION, len(question_vocab), folder)))
273 |
274 | with open('./%s/solver.prototxt'%folder, 'w') as f:
275 | f.write(str(get_solver(folder)))
276 | with open('./%s/auxiliary.json'%folder, 'w') as f:
277 | json.dump(get_auxiliary_json(),f, indent=2)
278 |
279 | caffe.set_device(config.TRAIN_GPU_ID)
280 | caffe.set_mode_gpu()
281 | solver = caffe.get_solver('./%s/solver.prototxt'%folder)
282 |
283 | train_loss = np.zeros(config.MAX_ITERATIONS+1)
284 | results = []
285 |
286 | if config.RESTORE_ITER:
287 | restore_iter = config.RESTORE_ITER
288 | solver.restore('./%s/_iter_%d.solverstate'%(folder,restore_iter))
289 | else:
290 | restore_iter = 0
291 |
292 | start = time.clock()
293 | for it in range(restore_iter, config.MAX_ITERATIONS+1):
294 | solver.step(1)
295 |
296 | # store the train loss
297 | train_loss[it] = solver.net.blobs['loss'].data
298 |
299 | if it % config.PRINT_INTERVAL == 0 and it != 0:
300 | elapsed = (time.clock() - start)
301 | print 'Iteration:', it
302 | c_mean_loss = train_loss[it-config.PRINT_INTERVAL:it].mean()
303 | print 'Train loss:', c_mean_loss, ' Elapsed seconds:', elapsed
304 | start = time.clock()
305 | if it % config.VALIDATE_INTERVAL == 0 and it != restore_iter:
306 | model_name = './%s/tmp.caffemodel'%(folder)
307 | solver.net.save(model_name)
308 | print 'Validating...'
309 |
310 |         # for the test-dev/test set: the result json file will be generated under the folder
311 | exec_validation(config.TEST_GPU_ID, 'test-dev', model_name, it=it, folder=folder)
312 | caffe.set_device(config.TRAIN_GPU_ID)
313 | '''
314 |         # for the val set: the accuracy will be computed and plotted
315 | test_loss, acc_overall, acc_per_ques, acc_per_ans = exec_validation(config.TEST_GPU_ID, 'val', model_name, it=it, folder=folder)
316 | caffe.set_device(config.TRAIN_GPU_ID)
317 | print 'Test loss:', test_loss
318 | print 'Accuracy:', acc_overall
319 | print 'Test per ans', acc_per_ans
320 | results.append([it, c_mean_loss, test_loss, acc_overall, acc_per_ques, acc_per_ans])
321 | best_result_idx = np.array([x[3] for x in results]).argmax()
322 | print 'Best accuracy of', results[best_result_idx][3], 'was at iteration', results[best_result_idx][0]
323 | drawgraph(results,folder,config.MFB_FACTOR_NUM,config.MFB_OUT_DIM,prefix='mfb_coatt_glove')
324 | '''
325 | if __name__ == '__main__':
326 | main()
327 |
--------------------------------------------------------------------------------
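
For reference, the MFB fusion that this prototxt builds out of Eltwise, Pooling, SignedSqrt and L2Normalize layers amounts to: project both modalities to k*o dimensions, multiply element-wise, sum-pool every k adjacent values, then apply a signed square root and L2 normalization. A numpy sketch with the config's MFB_FACTOR_NUM k=5 and MFB_OUT_DIM o=1000, using random vectors to stand in for the learned InnerProduct projections:

    import numpy as np

    K, O = 5, 1000               # MFB_FACTOR_NUM, MFB_OUT_DIM
    q = np.random.randn(K * O)   # projected question feature (InnerProduct output)
    v = np.random.randn(K * O)   # projected image feature (InnerProduct output)

    z = q * v                            # Eltwise, operation=0 (PROD)
    z = z.reshape(O, K).sum(axis=1)      # SUM pooling with kernel_w = K
    z = np.sign(z) * np.sqrt(np.abs(z))  # SignedSqrt (power normalization)
    z = z / np.linalg.norm(z)            # L2Normalize
    print z.shape                        # (1000,)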