├── imgs
│   ├── MFB-github.png
│   └── MFH-github.png
├── .gitignore
├── mfb_baseline
│   ├── config.py
│   ├── train_mfb_baseline.py
│   ├── utils.py
│   ├── vqa_data_layer.py
│   └── vqa_data_layer_kld.py
├── mfh_baseline
│   ├── config.py
│   ├── utils.py
│   ├── train_mfh_baseline.py
│   ├── vqa_data_layer.py
│   └── vqa_data_layer_kld.py
├── mfb_coatt_glove
│   ├── config.py
│   ├── utils.py
│   ├── vqa_data_layer.py
│   ├── vqa_data_layer_kld.py
│   └── train_mfb_coatt_glove.py
├── mfh_coatt_glove
│   ├── config.py
│   ├── utils.py
│   ├── vqa_data_layer.py
│   └── vqa_data_layer_kld.py
├── README.md
└── eval
    └── ensemble.py

/imgs/MFB-github.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yuzcccc/vqa-mfb/HEAD/imgs/MFB-github.png
--------------------------------------------------------------------------------
/imgs/MFH-github.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yuzcccc/vqa-mfb/HEAD/imgs/MFH-github.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | mfb_baseline/*.pyc
2 | mfb_coatt_glove/*.pyc
3 | eval/*.pyc
4 | 
5 | # Byte-compiled / optimized / DLL files
6 | __pycache__/
7 | *.py[cod]
8 | *$py.class
9 | 
10 | # C extensions
11 | *.so
12 | 
13 | # Distribution / packaging
14 | .Python
15 | env/
16 | build/
17 | develop-eggs/
18 | dist/
19 | downloads/
20 | eggs/
21 | .eggs/
22 | lib/
23 | lib64/
24 | parts/
25 | sdist/
26 | var/
27 | *.egg-info/
28 | .installed.cfg
29 | *.egg
30 | 
31 | # PyInstaller
32 | # Usually these files are written by a python script from a template
33 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
34 | *.manifest
35 | *.spec
36 | 
37 | # Installer logs
38 | pip-log.txt
39 | pip-delete-this-directory.txt
40 | 
41 | # Unit test / coverage reports
42 | htmlcov/
43 | .tox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *,cover
50 | .hypothesis/
51 | 
52 | # Translations
53 | *.mo
54 | *.pot
55 | 
56 | # Django stuff:
57 | *.log
58 | local_settings.py
59 | 
60 | # Flask stuff:
61 | instance/
62 | .webassets-cache
63 | 
64 | # Scrapy stuff:
65 | .scrapy
66 | 
67 | # Sphinx documentation
68 | docs/_build/
69 | 
70 | # PyBuilder
71 | target/
72 | 
73 | # IPython Notebook
74 | .ipynb_checkpoints
75 | 
76 | # pyenv
77 | .python-version
78 | 
79 | # celery beat schedule file
80 | celerybeat-schedule
81 | 
82 | # dotenv
83 | .env
84 | 
85 | # virtualenv
86 | venv/
87 | ENV/
88 | 
89 | # Spyder project settings
90 | .spyderproject
91 | 
92 | # Rope project settings
93 | .ropeproject
94 | 
--------------------------------------------------------------------------------
/mfb_baseline/config.py:
--------------------------------------------------------------------------------
1 | #training parameters
2 | TRAIN_GPU_ID = 0
3 | TEST_GPU_ID = 0
4 | BATCH_SIZE = 200
5 | VAL_BATCH_SIZE = 200
6 | PRINT_INTERVAL = 100
7 | VALIDATE_INTERVAL = 5000
8 | MAX_ITERATIONS = 100000
9 | RESTORE_ITER = 0 # iteration to restore. *.solverstate file is needed!
10 | # what data to use for training
11 | TRAIN_DATA_SPLITS = 'train'
12 | # what data to use for the vocabulary
13 | QUESTION_VOCAB_SPACE = 'train'
14 | ANSWER_VOCAB_SPACE = 'train' # test/test-dev/genome should not appear here
15 | 
16 | #network parameters
17 | NUM_OUTPUT_UNITS = 3000 # This is the answer vocabulary size
18 | MFB_FACTOR_NUM = 5
19 | MFB_OUT_DIM = 1000
20 | LSTM_UNIT_NUM = 1024
21 | JOINT_EMB_SIZE = MFB_FACTOR_NUM*MFB_OUT_DIM
22 | MAX_WORDS_IN_QUESTION = 15
23 | LSTM_DROPOUT_RATIO = 0.3
24 | MFB_DROPOUT_RATIO = 0.1
25 | 
26 | # vqa tools - get from https://github.com/VT-vision-lab/VQA
27 | VQA_TOOLS_PATH = '/home/yuz/data/VQA/PythonHelperTools'
28 | VQA_EVAL_TOOLS_PATH = '/home/yuz/data/VQA/PythonEvaluationTools'
29 | 
30 | # location of the data
31 | VQA_PREFIX = '/home/yuz/data/VQA'
32 | 
33 | feat = 'pool5'
34 | DATA_PATHS = {
35 |     'train': {
36 |         'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_train2014_questions.json',
37 |         'ans_file': VQA_PREFIX + '/Annotations/mscoco_train2014_annotations.json',
38 |         'features_prefix': VQA_PREFIX + '/Features/ms_coco/resnet_%s_bgrms_large/train2014/COCO_train2014_'%feat
39 |     },
40 |     'val': {
41 |         'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_val2014_questions.json',
42 |         'ans_file': VQA_PREFIX + '/Annotations/mscoco_val2014_annotations.json',
43 |         'features_prefix': VQA_PREFIX + '/Features/ms_coco/resnet_%s_bgrms_large/val2014/COCO_val2014_'%feat
44 |     },
45 |     'test-dev': {
46 |         'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test-dev2015_questions.json',
47 |         'features_prefix': VQA_PREFIX + '/Features/ms_coco/resnet_%s_bgrms_large/test2015/COCO_test2015_'%feat
48 |     },
49 |     'test': {
50 |         'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test2015_questions.json',
51 |         'features_prefix': VQA_PREFIX + '/Features/ms_coco/resnet_%s_bgrms_large/test2015/COCO_test2015_'%feat
52 |     },
53 |     'genome': {
54 |         'genome_file': VQA_PREFIX + '/Questions/OpenEnded_genome_train_questions.json',
55 |         'features_prefix': VQA_PREFIX + '/Features/genome/feat_resnet-152/resnet_%s_bgrms_large/'%feat
56 |     }
57 | }
58 | 
--------------------------------------------------------------------------------
/mfh_baseline/config.py:
--------------------------------------------------------------------------------
1 | #training parameters
2 | TRAIN_GPU_ID = 0
3 | TEST_GPU_ID = 0
4 | BATCH_SIZE = 200
5 | VAL_BATCH_SIZE = 200
6 | PRINT_INTERVAL = 100
7 | VALIDATE_INTERVAL = 5000
8 | MAX_ITERATIONS = 100000
9 | RESTORE_ITER = 0 # iteration to restore. *.solverstate file is needed!
10 | # what data to use for training
11 | TRAIN_DATA_SPLITS = 'train'
12 | # what data to use for the vocabulary
13 | QUESTION_VOCAB_SPACE = 'train'
14 | ANSWER_VOCAB_SPACE = 'train' # test/test-dev/genome should not appear here
15 | 
16 | #network parameters
17 | NUM_OUTPUT_UNITS = 3000 # This is the answer vocabulary size
18 | MFB_FACTOR_NUM = 5
19 | MFB_OUT_DIM = 1000
20 | LSTM_UNIT_NUM = 1024
21 | JOINT_EMB_SIZE = MFB_FACTOR_NUM*MFB_OUT_DIM
22 | MAX_WORDS_IN_QUESTION = 15
23 | LSTM_DROPOUT_RATIO = 0.3
24 | MFB_DROPOUT_RATIO = 0.1
25 | 
26 | # vqa tools - get from https://github.com/VT-vision-lab/VQA
27 | VQA_TOOLS_PATH = '/home/yuz/data/VQA/PythonHelperTools'
28 | VQA_EVAL_TOOLS_PATH = '/home/yuz/data/VQA/PythonEvaluationTools'
29 | 
30 | # location of the data
31 | VQA_PREFIX = '/home/yuz/data/VQA'
32 | 
33 | feat = 'pool5'
34 | DATA_PATHS = {
35 |     'train': {
36 |         'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_train2014_questions.json',
37 |         'ans_file': VQA_PREFIX + '/Annotations/mscoco_train2014_annotations.json',
38 |         'features_prefix': VQA_PREFIX + '/Features/ms_coco/resnet_%s_bgrms_large/train2014/COCO_train2014_'%feat
39 |     },
40 |     'val': {
41 |         'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_val2014_questions.json',
42 |         'ans_file': VQA_PREFIX + '/Annotations/mscoco_val2014_annotations.json',
43 |         'features_prefix': VQA_PREFIX + '/Features/ms_coco/resnet_%s_bgrms_large/val2014/COCO_val2014_'%feat
44 |     },
45 |     'test-dev': {
46 |         'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test-dev2015_questions.json',
47 |         'features_prefix': VQA_PREFIX + '/Features/ms_coco/resnet_%s_bgrms_large/test2015/COCO_test2015_'%feat
48 |     },
49 |     'test': {
50 |         'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test2015_questions.json',
51 |         'features_prefix': VQA_PREFIX + '/Features/ms_coco/resnet_%s_bgrms_large/test2015/COCO_test2015_'%feat
52 |     },
53 |     'genome': {
54 |         'genome_file': VQA_PREFIX + '/Questions/OpenEnded_genome_train_questions.json',
55 |         'features_prefix': VQA_PREFIX + '/Features/genome/feat_resnet-152/resnet_%s_bgrms_large/'%feat
56 |     }
57 | }
58 | 
--------------------------------------------------------------------------------
/mfb_coatt_glove/config.py:
--------------------------------------------------------------------------------
1 | #training parameters
2 | TRAIN_GPU_ID = 0
3 | TEST_GPU_ID = 0
4 | BATCH_SIZE = 64
5 | VAL_BATCH_SIZE = 32
6 | PRINT_INTERVAL = 100
7 | VALIDATE_INTERVAL = 5000
8 | MAX_ITERATIONS = 100000
9 | RESTORE_ITER = 0 # iteration to restore. *.solverstate file is needed!
10 | # what data to use for training
11 | TRAIN_DATA_SPLITS = 'train+val'
12 | # what data to use for the vocabulary
13 | QUESTION_VOCAB_SPACE = 'train+val'
14 | ANSWER_VOCAB_SPACE = 'train+val' # test/test-dev/genome should not appear here
15 | 
16 | #network parameters
17 | NUM_OUTPUT_UNITS = 3000 # This is the answer vocabulary size
18 | MFB_FACTOR_NUM = 5
19 | MFB_OUT_DIM = 1000
20 | LSTM_UNIT_NUM = 1024
21 | JOINT_EMB_SIZE = MFB_FACTOR_NUM*MFB_OUT_DIM
22 | NUM_IMG_GLIMPSE = 2
23 | NUM_QUESTION_GLIMPSE = 2
24 | IMG_FEAT_WIDTH = 14
25 | IMG_FEAT_SIZE = IMG_FEAT_WIDTH * IMG_FEAT_WIDTH
26 | MAX_WORDS_IN_QUESTION = 15
27 | LSTM_DROPOUT_RATIO = 0.3
28 | MFB_DROPOUT_RATIO = 0.1
29 | 
30 | # vqa tools - get from https://github.com/VT-vision-lab/VQA
31 | VQA_TOOLS_PATH = '/home/yuz/data/VQA/PythonHelperTools'
32 | VQA_EVAL_TOOLS_PATH = '/home/yuz/data/VQA/PythonEvaluationTools'
33 | 
34 | # location of the data
35 | VQA_PREFIX = '/home/yuz/data/VQA'
36 | 
37 | feat = 'res5c'
38 | DATA_PATHS = {
39 |     'train': {
40 |         'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_train2014_questions.json',
41 |         'ans_file': VQA_PREFIX + '/Annotations/mscoco_train2014_annotations.json',
42 |         'features_prefix': VQA_PREFIX + '/Features/ms_coco/resnet_%s_bgrms_large/train2014/COCO_train2014_'%feat
43 |     },
44 |     'val': {
45 |         'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_val2014_questions.json',
46 |         'ans_file': VQA_PREFIX + '/Annotations/mscoco_val2014_annotations.json',
47 |         'features_prefix': VQA_PREFIX + '/Features/ms_coco/resnet_%s_bgrms_large/val2014/COCO_val2014_'%feat
48 |     },
49 |     'test-dev': {
50 |         'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test-dev2015_questions.json',
51 |         'features_prefix': VQA_PREFIX + '/Features/ms_coco/resnet_%s_bgrms_large/test2015/COCO_test2015_'%feat
52 |     },
53 |     'test': {
54 |         'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test2015_questions.json',
55 |         'features_prefix': VQA_PREFIX + '/Features/ms_coco/resnet_%s_bgrms_large/test2015/COCO_test2015_'%feat
56 |     },
57 |     'genome': {
58 |         'genome_file': VQA_PREFIX + '/Questions/OpenEnded_genome_train_questions.json',
59 |         'features_prefix': VQA_PREFIX + '/Features/genome/feat_resnet-152/resnet_%s_bgrms_large/'%feat
60 |     }
61 | }
62 | 
--------------------------------------------------------------------------------
/mfh_coatt_glove/config.py:
--------------------------------------------------------------------------------
1 | #training parameters
2 | TRAIN_GPU_ID = 0
3 | TEST_GPU_ID = 0
4 | BATCH_SIZE = 64
5 | VAL_BATCH_SIZE = 32
6 | PRINT_INTERVAL = 100
7 | VALIDATE_INTERVAL = 10000
8 | MAX_ITERATIONS = 100000
9 | RESTORE_ITER = 0 # iteration to restore. *.solverstate file is needed!
10 | # what data to use for training
11 | TRAIN_DATA_SPLITS = 'train+val'
12 | # what data to use for the vocabulary
13 | QUESTION_VOCAB_SPACE = 'train+val'
14 | ANSWER_VOCAB_SPACE = 'train+val' # test/test-dev/genome should not appear here
15 | 
16 | #network parameters
17 | NUM_OUTPUT_UNITS = 3000 # This is the answer vocabulary size
18 | MFB_FACTOR_NUM = 5
19 | MFB_OUT_DIM = 1000
20 | LSTM_UNIT_NUM = 1024
21 | JOINT_EMB_SIZE = MFB_FACTOR_NUM*MFB_OUT_DIM
22 | NUM_IMG_GLIMPSE = 2
23 | NUM_QUESTION_GLIMPSE = 2
24 | IMG_FEAT_WIDTH = 14
25 | IMG_FEAT_SIZE = IMG_FEAT_WIDTH * IMG_FEAT_WIDTH
26 | MAX_WORDS_IN_QUESTION = 15
27 | LSTM_DROPOUT_RATIO = 0.3
28 | MFB_DROPOUT_RATIO = 0.1
29 | 
30 | # vqa tools - get from https://github.com/VT-vision-lab/VQA
31 | VQA_TOOLS_PATH = '/home/yuz/data/VQA/PythonHelperTools'
32 | VQA_EVAL_TOOLS_PATH = '/home/yuz/data/VQA/PythonEvaluationTools'
33 | 
34 | # location of the data
35 | VQA_PREFIX = '/home/yuz/data/VQA'
36 | 
37 | feat = 'res5c'
38 | DATA_PATHS = {
39 |     'train': {
40 |         'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_train2014_questions.json',
41 |         'ans_file': VQA_PREFIX + '/Annotations/mscoco_train2014_annotations.json',
42 |         'features_prefix': VQA_PREFIX + '/Features/ms_coco/resnet_%s_bgrms_large/train2014/COCO_train2014_'%feat
43 |     },
44 |     'val': {
45 |         'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_val2014_questions.json',
46 |         'ans_file': VQA_PREFIX + '/Annotations/mscoco_val2014_annotations.json',
47 |         'features_prefix': VQA_PREFIX + '/Features/ms_coco/resnet_%s_bgrms_large/val2014/COCO_val2014_'%feat
48 |     },
49 |     'test-dev': {
50 |         'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test-dev2015_questions.json',
51 |         'features_prefix': VQA_PREFIX + '/Features/ms_coco/resnet_%s_bgrms_large/test2015/COCO_test2015_'%feat
52 |     },
53 |     'test': {
54 |         'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test2015_questions.json',
55 |         'features_prefix': VQA_PREFIX + '/Features/ms_coco/resnet_%s_bgrms_large/test2015/COCO_test2015_'%feat
56 |     },
57 |     'genome': {
58 |         'genome_file': VQA_PREFIX + '/Questions/OpenEnded_genome_train_questions.json',
59 |         'features_prefix': VQA_PREFIX + '/Features/genome/feat_resnet-152/resnet_%s_bgrms_large/'%feat
60 |     }
61 | }
62 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # MFB and MFH for VQA
2 | 
3 | **This project is deprecated! The Pytorch implementation of MFB(MFH)+CoAtt with pre-trained models, along with several state-of-the-art VQA models, is maintained in our [OpenVQA](https://github.com/MILVLG/openvqa) project, which is much more convenient to use!**
4 | 
5 | This project is the implementation of the papers *[Multi-modal Factorized Bilinear Pooling with Co-Attention Learning for Visual Question Answering (MFB)](https://arxiv.org/abs/1708.01471)* and *[Beyond Bilinear: Generalized Multi-modal Factorized High-order Pooling for Visual Question Answering (MFH)](https://arxiv.org/abs/1708.03619)*. Compared with existing state-of-the-art approaches such as MCB and MLB, our MFB models achieved superior performance on the large-scale VQA-1.0 and VQA-2.0 datasets. Moreover, MFH, the high-order extension of MFB, is also provided and reports better VQA performance. The MFB(MFH)+CoAtt network architecture for VQA is illustrated in Figure 1.
6 | 
7 | ![Figure 1: The MFB+CoAtt Network architecture for VQA.](https://github.com/yuzcccc/mfb/raw/master/imgs/MFB-github.png)
8 | 
Figure 1: The MFB+CoAtt Network architecture for VQA.
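To make the fusion in Figure 1 concrete, the following is a minimal NumPy sketch of the MFB step that `mfb_baseline/train_mfb_baseline.py` assembles from Caffe layers (InnerProduct, element-wise Eltwise product, sum Pooling, SignedSqrt, L2Normalize). The names `q`, `v`, `W_q`, and `W_i` are illustrative, and the dropout applied between the element-wise product and the sum pooling is omitted:

```python
import numpy as np

def mfb_fuse(q, v, W_q, W_i, k=5, o=1000):
    """Fuse a question vector q and an image vector v with MFB.

    W_q and W_i project q and v into the joint space of size k*o
    (JOINT_EMB_SIZE = MFB_FACTOR_NUM * MFB_OUT_DIM in config.py).
    """
    joint = W_q.dot(q) * W_i.dot(v)          # element-wise product, shape (k*o,)
    z = joint.reshape(o, k).sum(axis=1)      # sum-pool over the k factors -> (o,)
    z = np.sign(z) * np.sqrt(np.abs(z))      # signed square root (power normalization)
    return z / (np.linalg.norm(z) + 1e-12)   # L2 normalization
```

With the default `MFB_FACTOR_NUM = 5` and `MFB_OUT_DIM = 1000` from the `config.py` files, the joint space has `JOINT_EMB_SIZE = 5000` dimensions and the fused feature fed to the answer classifier has 1000.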
9 | 
10 | ## Update Dec. 2nd, 2017
11 | The 3rd-party pytorch implementation of MFB(MFH) is released [here](https://github.com/asdf0982/vqa-mfb.pytorch). Great thanks, Liam!
12 | 
13 | ## Update Sep. 5th, 2017
14 | Using the Bottom-up and Top-Down (BUTD) image features (the model with adaptive K ranging from 10 to 100) [here](https://github.com/yuzcccc/bottom-up-attention), our single MFH+CoAtt+GloVe model achieved the overall accuracy **68.76%** on the test-dev set of the VQA-2.0 dataset. With an ensemble of 8 models, we achieved the new state-of-the-art performance on the VQA-2.0 dataset's [leaderboard](https://evalai.cloudcv.org/web/challenges/challenge-page/1/leaderboard) with the overall accuracy **70.92%**.
15 | 
16 | ## Update Aug. 1st, 2017
17 | Our solution for the VQA Challenge 2017 is updated!
18 | 
19 | We proposed a **high-order** extension of MFB, i.e., the Multi-modal Factorized High-order Pooling (MFH). See the flowchart in Figure 2 and the implementations in the `mfh_baseline` and `mfh_coatt_glove` folders. With an ensemble of 9 MFH+CoAtt+GloVe(+VG) models, **we won the 2nd place (tied with another team) in the VQA Challenge 2017**. The detailed information can be found in our paper (the second paper in the CITATION section at the bottom of this page).
20 | 
21 | ![](https://github.com/yuzcccc/mfb/raw/master/imgs/MFH-github.png)
22 | 
Figure 2: The high-order MFH model which consists of p MFB blocks (without sharing parameters).
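Each MFH block behaves like an MFB block, except that its element-wise product is additionally multiplied by the previous block's (pre-pooling) product, and the per-block outputs are concatenated. Below is a rough NumPy sketch of the p = 2 cascade built in `mfh_baseline/train_mfh_baseline.py`; the weight matrices are illustrative and dropout is again omitted:

```python
import numpy as np

def _sumpool_sqrt_l2(joint, k=5, o=1000):
    z = joint.reshape(o, k).sum(axis=1)      # sum-pool over the k factors
    z = np.sign(z) * np.sqrt(np.abs(z))      # signed square root
    return z / (np.linalg.norm(z) + 1e-12)   # L2 normalization

def mfh_fuse(q, v, weights, k=5, o=1000):
    """weights: list of (W_q, W_i) pairs, one pair per MFB block."""
    prev = 1.0                               # neutral factor for the first block
    outputs = []
    for W_q, W_i in weights:
        joint = W_q.dot(q) * W_i.dot(v) * prev   # 'exp' stage of this block
        outputs.append(_sumpool_sqrt_l2(joint, k, o))
        prev = joint                         # feeds the next block's product
    return np.concatenate(outputs)           # p * MFB_OUT_DIM dimensions
```

This mirrors the Caffe graph below, where the second block's Eltwise layer (`mfb_iq_o3_eltwise`) also consumes `mfb_iq_o2_drop`, and the classifier takes the concatenation `mfb_o23_l2`.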
23 | 
24 | ## Prerequisites
25 | 
26 | Our code is implemented based on the high-quality [vqa-mcb](https://github.com/akirafukui/vqa-mcb) project. The data preprocessing and other prerequisites are the same as theirs. Before running our scripts to train or test the MFB model, see the `Prerequisites` and `Data Preprocessing` sections in the README of the vqa-mcb project first.
27 | 
28 | - The Caffe version required for our MFB is slightly different from that for MCB. We add some layers, e.g., sum pooling, permute and KLD loss layers, to the `feature/20160617_cb_softattention` branch of Caffe for MCB. Please check out our Caffe version [here](https://github.com/yuzcccc/caffe) and compile it. **Note that CuDNN is currently not compatible with sum pooling; you should switch it off to run the code correctly**.
29 | 
30 | ## Pretrained Models
31 | 
32 | We release the pretrained **single model** "MFB(or MFH)+CoAtt+GloVe+VG" used in the papers. To the best of our knowledge, our MFH+CoAtt+GloVe+VG model reports the best single-model result (test-dev) on both the VQA-1.0 and VQA-2.0 datasets (train + val + visual genome). The corresponding results are shown in the table below. The results JSON files (results.zip for VQA-1.0) are also included in the model folders, and can be uploaded to the evaluation servers directly. **Note that the models are trained with an old version of the GloVe embeddings in spacy. If you use the latest one, the embeddings may be inconsistent, leading to inferior performance. I suggest training the model from scratch by yourself.**
33 | 
34 | | Datasets\Models | MCB | MFB | MFH | MFH (BUTD img features) |
35 | |:-----------------:|:-----------------:|:-----------------:|:-----------------:|:-----------------:|
36 | | VQA-1.0 | 65.38% | 66.87% [BaiduYun](http://pan.baidu.com/s/1o8LURge) | 67.72% [BaiduYun](http://pan.baidu.com/s/1c2neUv2) or [Dropbox](https://www.dropbox.com/s/qh1swgsq0na1bua/VQA1.0-mfh-coatt-glove-vg.zip?dl=0) | **69.82%** |
37 | | VQA-2.0 | 62.33%<sup>1</sup> | 65.09% [BaiduYun](http://pan.baidu.com/s/1pLjtkSV) | 66.12% [BaiduYun](http://pan.baidu.com/s/1pLLUvIN) or [Dropbox](https://www.dropbox.com/s/zld15405a69how6/VQA2.0-mfh-coatt-glove-vg.zip?dl=0) | **68.76%**<sup>2</sup> |
38 | 
39 | <sup>1</sup> the MCB result on VQA-2.0 is provided by the VQA Challenge organizer and does not introduce the GloVe embedding.
40 | 
41 | <sup>2</sup> overall: 68.76, yes/no: 84.27, num: 49.56, other: 59.89
42 | 
43 | ## Training from Scratch
44 | 
45 | We provide the scripts for training two MFB models from scratch, in the `mfb_baseline` and `mfb_coatt_glove` folders. Simply run the python scripts `train_*.py` to train the models from scratch.
46 | 
47 | - Most of the hyper-parameters and configurations, with comments, are defined in the `config.py` file.
48 | - The solver configurations are defined in the `get_solver` function in the `train_*.py` scripts.
49 | - A pretrained GloVe word embedding model (via the spacy library) is required to train the `mfb_coatt_glove` model. The installation instructions for spacy and the GloVe model can be found [here](https://github.com/akirafukui/vqa-mcb/tree/master/train).
50 | 
51 | ## Evaluation
52 | 
53 | To generate an answers JSON file in the format expected by the VQA evaluation code and VQA test server, you can use `eval/ensemble.py`. This code can also ensemble multiple models by averaging their pre-softmax outputs (a minimal sketch of this averaging rule is given after the Licence section below). Running `python ensemble.py` will print out a help message telling you what arguments to use.
54 | 
55 | ## Licence
56 | 
57 | This code is distributed under the MIT LICENSE. The released models are only allowed for non-commercial use.
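Following up on the Evaluation section: the docstring of `eval/ensemble.py` states that the ensemble averages the pre-softmax output of each net, with an optional per-model `model_weight` (defaulting to 1.0) read from `auxiliary.json`. The sketch below only illustrates that rule; the array names, and the normalization by the sum of the weights, are assumptions rather than the script's actual code:

```python
import numpy as np

def ensemble_answers(scores_per_model, weights=None):
    """scores_per_model: list of (num_questions, num_answers) pre-softmax arrays."""
    if weights is None:
        weights = [1.0] * len(scores_per_model)   # matches the model_weight default
    avg = sum(w * s for w, s in zip(weights, scores_per_model)) / float(sum(weights))
    return avg.argmax(axis=1)                     # predicted answer index per question
```

For the averaged scores to be meaningful, the answer vocabularies must agree across models, which is exactly what `verify_all` in `eval/ensemble.py` checks before evaluation.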
58 | 
59 | ## Citation
60 | 
61 | If the code is helpful for your research, please cite
62 | 
63 | ```
64 | @article{yu2017mfb,
65 |   title={Multi-modal Factorized Bilinear Pooling with Co-Attention Learning for Visual Question Answering},
66 |   author={Yu, Zhou and Yu, Jun and Fan, Jianping and Tao, Dacheng},
67 |   journal={IEEE International Conference on Computer Vision (ICCV)},
68 |   pages={1839--1848},
69 |   year={2017}
70 | }
71 | 
72 | @article{yu2018beyond,
73 |   title={Beyond Bilinear: Generalized Multimodal Factorized High-Order Pooling for Visual Question Answering},
74 |   author={Yu, Zhou and Yu, Jun and Xiang, Chenchao and Fan, Jianping and Tao, Dacheng},
75 |   journal={IEEE Transactions on Neural Networks and Learning Systems},
76 |   volume={29},
77 |   number={12},
78 |   pages={5947--5959},
79 |   year={2018}
80 | }
81 | ```
82 | 
83 | ## Contact
84 | 
85 | Zhou Yu  [yuz(AT)hdu.edu.cn]
86 | 
--------------------------------------------------------------------------------
/mfb_baseline/train_mfb_baseline.py:
--------------------------------------------------------------------------------
1 | import matplotlib
2 | matplotlib.use('Agg')
3 | import os
4 | import sys
5 | import numpy as np
6 | import json
7 | import matplotlib.pyplot as plt
8 | 
9 | import caffe
10 | from caffe import layers as L
11 | from caffe import params as P
12 | from caffe.proto import caffe_pb2
13 | 
14 | from vqa_data_layer_kld import VQADataProvider
15 | from utils import exec_validation, drawgraph
16 | import config
17 | import time
18 | 
19 | def get_solver(folder):
20 |     s = caffe_pb2.SolverParameter()
21 |     s.train_net = './%s/proto_train.prototxt'%folder
22 |     s.snapshot = int(config.VALIDATE_INTERVAL)
23 |     s.snapshot_prefix = './%s/'%folder
24 |     s.max_iter = int(config.MAX_ITERATIONS)
25 |     s.display = int(config.VALIDATE_INTERVAL)
26 |     s.type = 'Adam'
27 |     s.stepsize = int(config.MAX_ITERATIONS*0.4)
28 |     s.gamma = 0.5
29 |     s.lr_policy = "step"
30 |     s.base_lr = 0.0007
31 |     s.momentum = 0.9
32 |     s.momentum2 = 0.999
33 |     s.weight_decay = 0.000
34 |     s.clip_gradients = 10
35 |     return s
36 | 
37 | def get_auxiliary_json():
38 |     aux = {}
39 |     aux["batch_size"] = int(config.VAL_BATCH_SIZE)
40 |     aux["data_shape"] = [2048]
41 |     aux["img_feature_prefix"] = config.DATA_PATHS['test']['features_prefix']
42 |     aux["glove"] = False
43 |     return aux
44 | 
45 | 
46 | def mfb_baseline(mode, batchsize, T, question_vocab_size, folder):
47 |     n = caffe.NetSpec()
48 |     mode_str = json.dumps({'mode':mode, 'batchsize':batchsize,'folder':folder})
49 |     if mode == 'val':
50 |         n.data, n.cont, n.img_feature, n.label = L.Python( \
51 |             module='vqa_data_layer', layer='VQADataProviderLayer', \
52 |             param_str=mode_str, ntop=4 )
53 |     else:
54 |         n.data, n.cont, n.img_feature, n.label = L.Python(\
55 |             module='vqa_data_layer_kld', layer='VQADataProviderLayer', \
56 |             param_str=mode_str, ntop=4 )
57 |     n.embed = L.Embed(n.data, input_dim=question_vocab_size, num_output=300, \
58 |         weight_filler=dict(type='xavier'))
59 |     n.embed_tanh = L.TanH(n.embed)
60 | 
61 |     # LSTM
62 |     n.lstm1 = L.LSTM(\
63 |         n.embed_tanh, n.cont,\
64 |         recurrent_param=dict(\
65 |             num_output=config.LSTM_UNIT_NUM,\
66 |             weight_filler=dict(type='xavier')))
67 |     tops1 = L.Slice(n.lstm1, ntop=config.MAX_WORDS_IN_QUESTION, slice_param={'axis':0})
68 |     for i in xrange(config.MAX_WORDS_IN_QUESTION-1):
69 |         n.__setattr__('slice_first'+str(i), tops1[int(i)])
70 |         n.__setattr__('silence_data_first'+str(i), L.Silence(tops1[int(i)],ntop=0))
71 |     n.lstm1_out = tops1[config.MAX_WORDS_IN_QUESTION-1]
72 | 
n.lstm1_reshaped = L.Reshape(n.lstm1_out,\ 73 | reshape_param=dict(\ 74 | shape=dict(dim=[-1,1024]))) 75 | n.q_feat = L.Dropout(n.lstm1_reshaped,dropout_param={'dropout_ratio':config.LSTM_DROPOUT_RATIO}) 76 | ''' 77 | Coarse Image-Question MFB fusion 78 | ''' 79 | 80 | n.mfb_q_proj = L.InnerProduct(n.q_feat, num_output=config.JOINT_EMB_SIZE, 81 | weight_filler=dict(type='xavier')) 82 | n.mfb_i_proj = L.InnerProduct(n.img_feature, num_output=config.JOINT_EMB_SIZE, 83 | weight_filler=dict(type='xavier')) 84 | n.mfb_iq_eltwise = L.Eltwise(n.mfb_q_proj, n.mfb_i_proj, eltwise_param=dict(operation=0)) 85 | n.mfb_iq_drop = L.Dropout(n.mfb_iq_eltwise, dropout_param={'dropout_ratio':config.MFB_DROPOUT_RATIO}) 86 | n.mfb_iq_resh = L.Reshape(n.mfb_iq_drop, reshape_param=dict(shape=dict(dim=[-1,1,config.MFB_OUT_DIM,config.MFB_FACTOR_NUM]))) 87 | n.mfb_iq_sumpool = L.Pooling(n.mfb_iq_resh, pool=P.Pooling.SUM, \ 88 | pooling_param=dict(kernel_w=config.MFB_FACTOR_NUM, kernel_h=1)) 89 | n.mfb_out = L.Reshape(n.mfb_iq_sumpool,\ 90 | reshape_param=dict(shape=dict(dim=[-1,config.MFB_OUT_DIM]))) 91 | n.mfb_sign_sqrt = L.SignedSqrt(n.mfb_out) 92 | n.mfb_l2 = L.L2Normalize(n.mfb_sign_sqrt) 93 | 94 | n.prediction = L.InnerProduct(n.mfb_l2, num_output=config.NUM_OUTPUT_UNITS, 95 | weight_filler=dict(type='xavier')) 96 | if mode == 'val': 97 | n.loss = L.SoftmaxWithLoss(n.prediction, n.label) 98 | else: 99 | n.loss = L.SoftmaxKLDLoss(n.prediction, n.label) 100 | return n.to_proto() 101 | 102 | def make_answer_vocab(adic, vocab_size): 103 | """ 104 | Returns a dictionary that maps words to indices. 105 | """ 106 | adict = {'':0} 107 | nadict = {'':1000000} 108 | vid = 1 109 | for qid in adic.keys(): 110 | answer_obj = adic[qid] 111 | answer_list = [ans['answer'] for ans in answer_obj] 112 | 113 | for q_ans in answer_list: 114 | # create dict 115 | if adict.has_key(q_ans): 116 | nadict[q_ans] += 1 117 | else: 118 | nadict[q_ans] = 1 119 | adict[q_ans] = vid 120 | vid +=1 121 | 122 | # debug 123 | nalist = [] 124 | for k,v in sorted(nadict.items(), key=lambda x:x[1]): 125 | nalist.append((k,v)) 126 | 127 | # remove words that appear less than once 128 | n_del_ans = 0 129 | n_valid_ans = 0 130 | adict_nid = {} 131 | for i, w in enumerate(nalist[:-vocab_size]): 132 | del adict[w[0]] 133 | n_del_ans += w[1] 134 | for i, w in enumerate(nalist[-vocab_size:]): 135 | n_valid_ans += w[1] 136 | adict_nid[w[0]] = i 137 | 138 | return adict_nid 139 | 140 | def make_question_vocab(qdic): 141 | """ 142 | Returns a dictionary that maps words to indices. 143 | """ 144 | vdict = {'':0} 145 | vid = 1 146 | for qid in qdic.keys(): 147 | # sequence to list 148 | q_str = qdic[qid]['qstr'] 149 | q_list = VQADataProvider.seq_to_list(q_str) 150 | 151 | # create dict 152 | for w in q_list: 153 | if not vdict.has_key(w): 154 | vdict[w] = vid 155 | vid +=1 156 | 157 | return vdict 158 | 159 | def make_vocab_files(): 160 | """ 161 | Produce the question and answer vocabulary files. 
162 | """ 163 | print 'making question vocab...', config.QUESTION_VOCAB_SPACE 164 | qdic, _ = VQADataProvider.load_data(config.QUESTION_VOCAB_SPACE) 165 | question_vocab = make_question_vocab(qdic) 166 | print 'making answer vocab...', config.ANSWER_VOCAB_SPACE 167 | _, adic = VQADataProvider.load_data(config.ANSWER_VOCAB_SPACE) 168 | answer_vocab = make_answer_vocab(adic, config.NUM_OUTPUT_UNITS) 169 | return question_vocab, answer_vocab 170 | 171 | def main(): 172 | folder = 'mfb_baseline_%s'%(config.TRAIN_DATA_SPLITS) 173 | if not os.path.exists('./%s'%folder): 174 | os.makedirs('./%s'%folder) 175 | 176 | question_vocab, answer_vocab = {}, {} 177 | if os.path.exists('./%s/vdict.json'%folder) and os.path.exists('./%s/adict.json'%folder): 178 | print 'restoring vocab' 179 | with open('./%s/vdict.json'%folder,'r') as f: 180 | question_vocab = json.load(f) 181 | with open('./%s/adict.json'%folder,'r') as f: 182 | answer_vocab = json.load(f) 183 | else: 184 | question_vocab, answer_vocab = make_vocab_files() 185 | with open('./%s/vdict.json'%folder,'w') as f: 186 | json.dump(question_vocab, f) 187 | with open('./%s/adict.json'%folder,'w') as f: 188 | json.dump(answer_vocab, f) 189 | 190 | print 'question vocab size:', len(question_vocab) 191 | print 'answer vocab size:', len(answer_vocab) 192 | 193 | with open('./%s/proto_train.prototxt'%folder, 'w') as f: 194 | f.write(str(mfb_baseline(config.TRAIN_DATA_SPLITS, config.BATCH_SIZE, \ 195 | config.MAX_WORDS_IN_QUESTION, len(question_vocab), folder))) 196 | 197 | with open('./%s/proto_test.prototxt'%folder, 'w') as f: 198 | f.write(str(mfb_baseline('val', config.VAL_BATCH_SIZE, \ 199 | config.MAX_WORDS_IN_QUESTION, len(question_vocab), folder))) 200 | 201 | with open('./%s/solver.prototxt'%folder, 'w') as f: 202 | f.write(str(get_solver(folder))) 203 | with open('./%s/auxiliary.json'%folder, 'w') as f: 204 | json.dump(get_auxiliary_json(),f, indent=2) 205 | 206 | caffe.set_device(config.TRAIN_GPU_ID) 207 | caffe.set_mode_gpu() 208 | solver = caffe.get_solver('./%s/solver.prototxt'%folder) 209 | 210 | train_loss = np.zeros(config.MAX_ITERATIONS+1) 211 | results = [] 212 | 213 | if config.RESTORE_ITER: 214 | restore_iter = config.RESTORE_ITER 215 | solver.restore('./%s/_iter_%d.solverstate'%(folder,restore_iter)) 216 | else: 217 | restore_iter = 0 218 | 219 | start = time.clock() 220 | for it in range(restore_iter, config.MAX_ITERATIONS+1): 221 | solver.step(1) 222 | 223 | # store the train loss 224 | train_loss[it] = solver.net.blobs['loss'].data 225 | 226 | if it % config.PRINT_INTERVAL == 0 and it != 0: 227 | elapsed = (time.clock() - start) 228 | print 'Iteration:', it 229 | c_mean_loss = train_loss[it-config.PRINT_INTERVAL:it].mean() 230 | print 'Train loss:', c_mean_loss, ' Elapsed seconds:', elapsed 231 | start = time.clock() 232 | if it % config.VALIDATE_INTERVAL == 0 and it != restore_iter: 233 | model_name = './%s/tmp.caffemodel'%(folder) 234 | solver.net.save(model_name) 235 | print 'Validating...' 236 | ''' 237 | # for test-dev /test set. the json file will be generated under the file 238 | exec_validation(config.TEST_GPU_ID, 'test-dev', model_name, it=it, folder=folder) 239 | caffe.set_device(config.TRAIN_GPU_ID) 240 | ''' 241 | #for val set. 
the accuracy will be computed and ploted 242 | test_loss, acc_overall, acc_per_ques, acc_per_ans = exec_validation(config.TEST_GPU_ID, 'val', model_name, it=it, folder=folder) 243 | caffe.set_device(config.TRAIN_GPU_ID) 244 | print 'Test loss:', test_loss 245 | print 'Accuracy:', acc_overall 246 | print 'Test per ans', acc_per_ans 247 | results.append([it, c_mean_loss, test_loss, acc_overall, acc_per_ques, acc_per_ans]) 248 | best_result_idx = np.array([x[3] for x in results]).argmax() 249 | print 'Best accuracy of', results[best_result_idx][3], 'was at iteration', results[best_result_idx][0] 250 | drawgraph(results,folder,config.MFB_FACTOR_NUM,config.MFB_OUT_DIM,prefix='mfb_baseline') 251 | 252 | if __name__ == '__main__': 253 | main() 254 | -------------------------------------------------------------------------------- /mfb_baseline/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import os 4 | import sys 5 | import json 6 | import re 7 | import shutil 8 | from PIL import Image 9 | from PIL import ImageFont, ImageDraw 10 | 11 | import caffe 12 | from caffe import layers as L 13 | from caffe import params as P 14 | 15 | from vqa_data_layer import VQADataProvider, VQADataProviderLayer 16 | 17 | import config 18 | sys.path.append(config.VQA_TOOLS_PATH) 19 | sys.path.append(config.VQA_EVAL_TOOLS_PATH) 20 | 21 | from vqaTools.vqa import VQA 22 | from vqaEvaluation.vqaEval import VQAEval 23 | 24 | def visualize_failures(stat_list,mode): 25 | 26 | def save_qtype(qtype_list, save_filename, mode): 27 | 28 | if mode == 'val': 29 | savepath = os.path.join('./eval', save_filename) 30 | # TODO 31 | img_pre = '/home/dhpseth/vqa/02_tools/VQA/Images/val2014' 32 | elif mode == 'test-dev': 33 | savepath = os.path.join('./test-dev', save_filename) 34 | # TODO 35 | img_pre = '/home/dhpseth/vqa/02_tools/VQA/Images/test2015' 36 | elif mode == 'test': 37 | savepath = os.path.join('./test', save_filename) 38 | # TODO 39 | img_pre = '/home/dhpseth/vqa/02_tools/VQA/Images/test2015' 40 | else: 41 | raise Exception('Unsupported mode') 42 | if os.path.exists(savepath): shutil.rmtree(savepath) 43 | if not os.path.exists(savepath): os.makedirs(savepath) 44 | 45 | for qt in qtype_list: 46 | count = 0 47 | for t_question in stat_list: 48 | #print count, t_question 49 | if count < 40/len(qtype_list): 50 | t_question_list = t_question['q_list'] 51 | saveflag = False 52 | #print 'debug****************************' 53 | #print qt 54 | #print t_question_list 55 | #print t_question_list[0] == qt[0] 56 | #print t_question_list[1] == qt[1] 57 | if t_question_list[0] == qt[0] and t_question_list[1] == qt[1]: 58 | saveflag = True 59 | else: 60 | saveflag = False 61 | 62 | if saveflag == True: 63 | t_iid = t_question['iid'] 64 | if mode == 'val': 65 | t_img = Image.open(os.path.join(img_pre, \ 66 | 'COCO_val2014_' + str(t_iid).zfill(12) + '.jpg')) 67 | elif mode == 'test-dev' or 'test': 68 | t_img = Image.open(os.path.join(img_pre, \ 69 | 'COCO_test2015_' + str(t_iid).zfill(12) + '.jpg')) 70 | 71 | # for caption 72 | #print t_iid 73 | #annIds = caps.getAnnIds(t_iid) 74 | #anns = caps.loadAnns(annIds) 75 | #cap_list = [ann['caption'] for ann in anns] 76 | ans_list = t_question['ans_list'] 77 | draw = ImageDraw.Draw(t_img) 78 | for i in range(len(ans_list)): 79 | try: 80 | draw.text((10,10*i), str(ans_list[i])) 81 | except: 82 | pass 83 | 84 | ans = t_question['answer'] 85 | pred = t_question['pred'] 86 | if ans == -1: 87 | pre 
= '' 88 | elif ans == pred: 89 | pre = 'correct ' 90 | else: 91 | pre = 'failure ' 92 | #print ' aaa ', ans, pred 93 | ans = re.sub( '/', ' ', str(ans)) 94 | pred = re.sub( '/', ' ', str(pred)) 95 | img_title = pre + str(' '.join(t_question_list)) + '. a_' + \ 96 | str(ans) + ' p_' + str(pred) + '.png' 97 | count += 1 98 | print os.path.join(savepath,img_title) 99 | t_img.save(os.path.join(savepath,img_title)) 100 | 101 | print 'saving whatis' 102 | qt_color_list = [['what','color']] 103 | save_qtype(qt_color_list, 'colors', mode) 104 | 105 | print 'saving whatis' 106 | qt_whatis_list = [['what','is'],['what','kind'],['what','are']] 107 | save_qtype(qt_whatis_list, 'whatis', mode) 108 | 109 | print 'saving is' 110 | qt_is_list = [['is','the'], ['is','this'],['is','there']] 111 | save_qtype(qt_is_list, 'is', mode) 112 | 113 | print 'saving how many' 114 | qt_howmany_list =[['how','many']] 115 | save_qtype(qt_howmany_list, 'howmany', mode) 116 | 117 | def exec_validation(device_id, mode, model_name, folder, it='', visualize=False): 118 | 119 | caffe.set_device(device_id) 120 | caffe.set_mode_gpu() 121 | net = caffe.Net('./%s/proto_test.prototxt'%folder,model_name,caffe.TEST) 122 | 123 | dp = VQADataProvider(mode=mode,batchsize=config.VAL_BATCH_SIZE,folder=folder) 124 | total_questions = len(dp.getQuesIds()) 125 | epoch = 0 126 | 127 | pred_list = [] 128 | testloss_list = [] 129 | stat_list = [] 130 | 131 | while epoch == 0: 132 | t_word, t_cont, t_img_feature, t_answer, t_qid_list, t_iid_list, epoch = dp.get_batch_vec() 133 | net.blobs['data'].data[...] = np.transpose(t_word,(1,0)) 134 | net.blobs['cont'].data[...] = np.transpose(t_cont,(1,0)) 135 | net.blobs['img_feature'].data[...] = t_img_feature 136 | net.blobs['label'].data[...] = t_answer 137 | net.forward() 138 | t_pred_list = net.blobs['prediction'].data.argmax(axis=1) 139 | t_pred_str = [dp.vec_to_answer(pred_symbol) for pred_symbol in t_pred_list] 140 | testloss_list.append(net.blobs['loss'].data) 141 | for qid, iid, ans, pred in zip(t_qid_list, t_iid_list, t_answer.tolist(), t_pred_str): 142 | #pred_list.append({u'answer':pred, u'question_id': int(dp.getStrippedQuesId(qid))}) 143 | pred_list.append((pred,int(dp.getStrippedQuesId(qid)))) 144 | if visualize: 145 | q_list = dp.seq_to_list(dp.getQuesStr(qid)) 146 | if mode == 'test-dev' or 'test': 147 | ans_str = '' 148 | ans_list = ['']*10 149 | else: 150 | ans_str = dp.vec_to_answer(ans) 151 | ans_list = [ dp.getAnsObj(qid)[i]['answer'] for i in xrange(10)] 152 | stat_list.append({\ 153 | 'qid' : qid, 154 | 'q_list' : q_list, 155 | 'iid' : iid, 156 | 'answer': ans_str, 157 | 'ans_list': ans_list, 158 | 'pred' : pred }) 159 | percent = 100 * float(len(pred_list)) / total_questions 160 | sys.stdout.write('\r' + ('%.2f' % percent) + '%') 161 | sys.stdout.flush() 162 | 163 | 164 | print 'Deduping arr of len', len(pred_list) 165 | deduped = [] 166 | seen = set() 167 | for ans, qid in pred_list: 168 | if qid not in seen: 169 | seen.add(qid) 170 | deduped.append((ans, qid)) 171 | print 'New len', len(deduped) 172 | final_list=[] 173 | for ans,qid in deduped: 174 | final_list.append({u'answer': ans, u'question_id': qid}) 175 | 176 | mean_testloss = np.array(testloss_list).mean() 177 | 178 | if mode == 'val': 179 | valFile = './%s/val2015_resfile'%folder 180 | with open(valFile, 'w') as f: 181 | json.dump(final_list, f) 182 | if visualize: 183 | visualize_failures(stat_list,mode) 184 | annFile = config.DATA_PATHS['val']['ans_file'] 185 | quesFile = config.DATA_PATHS['val']['ques_file'] 186 | 
vqa = VQA(annFile, quesFile) 187 | vqaRes = vqa.loadRes(valFile, quesFile) 188 | vqaEval = VQAEval(vqa, vqaRes, n=2) 189 | vqaEval.evaluate() 190 | acc_overall = vqaEval.accuracy['overall'] 191 | acc_perQuestionType = vqaEval.accuracy['perQuestionType'] 192 | acc_perAnswerType = vqaEval.accuracy['perAnswerType'] 193 | return mean_testloss, acc_overall, acc_perQuestionType, acc_perAnswerType 194 | elif mode == 'test-dev': 195 | filename = './%s/vqa_OpenEnded_mscoco_test-dev2015_%s-'%(folder,folder)+str(it).zfill(8)+'_results' 196 | with open(filename+'.json', 'w') as f: 197 | json.dump(final_list, f) 198 | if visualize: 199 | visualize_failures(stat_list,mode) 200 | elif mode == 'test': 201 | filename = './%s/vqa_OpenEnded_mscoco_test2015_%s-'%(folder,folder)+str(it).zfill(8)+'_results' 202 | with open(filename+'.json', 'w') as f: 203 | json.dump(final_list, f) 204 | if visualize: 205 | visualize_failures(stat_list,mode) 206 | def drawgraph(results, folder,k,d,prefix='std',save_question_type_graphs=False): 207 | # 0:it 208 | # 1:trainloss 209 | # 2:testloss 210 | # 3:oa_acc 211 | # 4:qt_acc 212 | # 5:at_acc 213 | 214 | # training curve 215 | it = np.array([l[0] for l in results]) 216 | loss = np.array([l[1] for l in results]) 217 | valloss = np.array([l[2] for l in results]) 218 | valacc = np.array([l[3] for l in results]) 219 | 220 | fig = plt.figure() 221 | ax1 = fig.add_subplot(111) 222 | ax2 = ax1.twinx() 223 | 224 | ax1.plot(it,loss, color='blue', label='train loss') 225 | ax1.plot(it,valloss, '--', color='blue', label='test loss') 226 | ax2.plot(it,valacc, color='red', label='acc on val') 227 | plt.legend(loc='lower left') 228 | 229 | ax1.set_xlabel('Iterations') 230 | ax1.set_ylabel('Loss Value') 231 | ax2.set_ylabel('Accuracy on Val [%]') 232 | 233 | plt.savefig('./%s/result_it_%d_acc_%2.2f_k_%d_d_%d_%s.png'%(folder,it[-1],valacc[-1],k,d,prefix)) 234 | plt.clf() 235 | plt.close("all") 236 | 237 | # question type 238 | it = np.array([l[0] for l in results]) 239 | oa_acc = np.array([l[3] for l in results]) 240 | qt_dic_list = [l[4] for l in results] 241 | 242 | def draw_qt_acc(target_key_list, figname): 243 | fig = plt.figure() 244 | for k in target_key_list: 245 | print k,type(k) 246 | t_val = np.array([ qt_dic[k] for qt_dic in qt_dic_list]) 247 | plt.plot(it,t_val,label=str(k)) 248 | plt.legend(fontsize='small') 249 | plt.ylim(0,100.) 
250 | #plt.legend(prop={'size':6}) 251 | 252 | plt.xlabel('Iterations') 253 | plt.ylabel('Accuracy on Val [%]') 254 | 255 | plt.savefig(figname,dpi=200) 256 | plt.clf() 257 | plt.close("all") 258 | 259 | if save_question_type_graphs: 260 | s_keys = sorted(qt_dic_list[0].keys()) 261 | draw_qt_acc(s_keys[ 0:13]+[s_keys[31],], './ind_qt_are.png') 262 | draw_qt_acc(s_keys[13:17]+s_keys[49:], './ind_qt_how_where_who_why.png') 263 | draw_qt_acc(s_keys[17:31]+[s_keys[32],], './ind_qt_is.png') 264 | draw_qt_acc(s_keys[33:49], './ind_qt_what.png') 265 | draw_qt_acc(['what color is the','what color are the','what color is',\ 266 | 'what color','what is the color of the'],'./qt_color.png') 267 | draw_qt_acc(['how many','how','how many people are',\ 268 | 'how many people are in'],'./qt_number.png') 269 | draw_qt_acc(['who is','why','why is the','where is the','where are the',\ 270 | 'which'],'./qt_who_why_where_which.png') 271 | draw_qt_acc(['what is the man','is the man','are they','is he',\ 272 | 'is the woman','is this person','what is the woman','is the person',\ 273 | 'what is the person'],'./qt_human.png') 274 | 275 | 276 | -------------------------------------------------------------------------------- /mfh_baseline/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import os 4 | import sys 5 | import json 6 | import re 7 | import shutil 8 | from PIL import Image 9 | from PIL import ImageFont, ImageDraw 10 | 11 | import caffe 12 | from caffe import layers as L 13 | from caffe import params as P 14 | 15 | from vqa_data_layer import VQADataProvider, VQADataProviderLayer 16 | 17 | import config 18 | sys.path.append(config.VQA_TOOLS_PATH) 19 | sys.path.append(config.VQA_EVAL_TOOLS_PATH) 20 | 21 | from vqaTools.vqa import VQA 22 | from vqaEvaluation.vqaEval import VQAEval 23 | 24 | def visualize_failures(stat_list,mode): 25 | 26 | def save_qtype(qtype_list, save_filename, mode): 27 | 28 | if mode == 'val': 29 | savepath = os.path.join('./eval', save_filename) 30 | # TODO 31 | img_pre = '/home/dhpseth/vqa/02_tools/VQA/Images/val2014' 32 | elif mode == 'test-dev': 33 | savepath = os.path.join('./test-dev', save_filename) 34 | # TODO 35 | img_pre = '/home/dhpseth/vqa/02_tools/VQA/Images/test2015' 36 | elif mode == 'test': 37 | savepath = os.path.join('./test', save_filename) 38 | # TODO 39 | img_pre = '/home/dhpseth/vqa/02_tools/VQA/Images/test2015' 40 | else: 41 | raise Exception('Unsupported mode') 42 | if os.path.exists(savepath): shutil.rmtree(savepath) 43 | if not os.path.exists(savepath): os.makedirs(savepath) 44 | 45 | for qt in qtype_list: 46 | count = 0 47 | for t_question in stat_list: 48 | #print count, t_question 49 | if count < 40/len(qtype_list): 50 | t_question_list = t_question['q_list'] 51 | saveflag = False 52 | #print 'debug****************************' 53 | #print qt 54 | #print t_question_list 55 | #print t_question_list[0] == qt[0] 56 | #print t_question_list[1] == qt[1] 57 | if t_question_list[0] == qt[0] and t_question_list[1] == qt[1]: 58 | saveflag = True 59 | else: 60 | saveflag = False 61 | 62 | if saveflag == True: 63 | t_iid = t_question['iid'] 64 | if mode == 'val': 65 | t_img = Image.open(os.path.join(img_pre, \ 66 | 'COCO_val2014_' + str(t_iid).zfill(12) + '.jpg')) 67 | elif mode == 'test-dev' or 'test': 68 | t_img = Image.open(os.path.join(img_pre, \ 69 | 'COCO_test2015_' + str(t_iid).zfill(12) + '.jpg')) 70 | 71 | # for caption 72 | #print t_iid 73 | #annIds = 
caps.getAnnIds(t_iid) 74 | #anns = caps.loadAnns(annIds) 75 | #cap_list = [ann['caption'] for ann in anns] 76 | ans_list = t_question['ans_list'] 77 | draw = ImageDraw.Draw(t_img) 78 | for i in range(len(ans_list)): 79 | try: 80 | draw.text((10,10*i), str(ans_list[i])) 81 | except: 82 | pass 83 | 84 | ans = t_question['answer'] 85 | pred = t_question['pred'] 86 | if ans == -1: 87 | pre = '' 88 | elif ans == pred: 89 | pre = 'correct ' 90 | else: 91 | pre = 'failure ' 92 | #print ' aaa ', ans, pred 93 | ans = re.sub( '/', ' ', str(ans)) 94 | pred = re.sub( '/', ' ', str(pred)) 95 | img_title = pre + str(' '.join(t_question_list)) + '. a_' + \ 96 | str(ans) + ' p_' + str(pred) + '.png' 97 | count += 1 98 | print os.path.join(savepath,img_title) 99 | t_img.save(os.path.join(savepath,img_title)) 100 | 101 | print 'saving whatis' 102 | qt_color_list = [['what','color']] 103 | save_qtype(qt_color_list, 'colors', mode) 104 | 105 | print 'saving whatis' 106 | qt_whatis_list = [['what','is'],['what','kind'],['what','are']] 107 | save_qtype(qt_whatis_list, 'whatis', mode) 108 | 109 | print 'saving is' 110 | qt_is_list = [['is','the'], ['is','this'],['is','there']] 111 | save_qtype(qt_is_list, 'is', mode) 112 | 113 | print 'saving how many' 114 | qt_howmany_list =[['how','many']] 115 | save_qtype(qt_howmany_list, 'howmany', mode) 116 | 117 | def exec_validation(device_id, mode, model_name, folder, it='', visualize=False): 118 | 119 | caffe.set_device(device_id) 120 | caffe.set_mode_gpu() 121 | net = caffe.Net('./%s/proto_test.prototxt'%folder,model_name,caffe.TEST) 122 | 123 | dp = VQADataProvider(mode=mode,batchsize=config.VAL_BATCH_SIZE,folder=folder) 124 | total_questions = len(dp.getQuesIds()) 125 | epoch = 0 126 | 127 | pred_list = [] 128 | testloss_list = [] 129 | stat_list = [] 130 | 131 | while epoch == 0: 132 | t_word, t_cont, t_img_feature, t_answer, t_qid_list, t_iid_list, epoch = dp.get_batch_vec() 133 | net.blobs['data'].data[...] = np.transpose(t_word,(1,0)) 134 | net.blobs['cont'].data[...] = np.transpose(t_cont,(1,0)) 135 | net.blobs['img_feature'].data[...] = t_img_feature 136 | net.blobs['label'].data[...] 
= t_answer 137 | net.forward() 138 | t_pred_list = net.blobs['prediction'].data.argmax(axis=1) 139 | t_pred_str = [dp.vec_to_answer(pred_symbol) for pred_symbol in t_pred_list] 140 | testloss_list.append(net.blobs['loss'].data) 141 | for qid, iid, ans, pred in zip(t_qid_list, t_iid_list, t_answer.tolist(), t_pred_str): 142 | #pred_list.append({u'answer':pred, u'question_id': int(dp.getStrippedQuesId(qid))}) 143 | pred_list.append((pred,int(dp.getStrippedQuesId(qid)))) 144 | if visualize: 145 | q_list = dp.seq_to_list(dp.getQuesStr(qid)) 146 | if mode == 'test-dev' or 'test': 147 | ans_str = '' 148 | ans_list = ['']*10 149 | else: 150 | ans_str = dp.vec_to_answer(ans) 151 | ans_list = [ dp.getAnsObj(qid)[i]['answer'] for i in xrange(10)] 152 | stat_list.append({\ 153 | 'qid' : qid, 154 | 'q_list' : q_list, 155 | 'iid' : iid, 156 | 'answer': ans_str, 157 | 'ans_list': ans_list, 158 | 'pred' : pred }) 159 | percent = 100 * float(len(pred_list)) / total_questions 160 | sys.stdout.write('\r' + ('%.2f' % percent) + '%') 161 | sys.stdout.flush() 162 | 163 | 164 | print 'Deduping arr of len', len(pred_list) 165 | deduped = [] 166 | seen = set() 167 | for ans, qid in pred_list: 168 | if qid not in seen: 169 | seen.add(qid) 170 | deduped.append((ans, qid)) 171 | print 'New len', len(deduped) 172 | final_list=[] 173 | for ans,qid in deduped: 174 | final_list.append({u'answer': ans, u'question_id': qid}) 175 | 176 | mean_testloss = np.array(testloss_list).mean() 177 | 178 | if mode == 'val': 179 | valFile = './%s/val2015_resfile'%folder 180 | with open(valFile, 'w') as f: 181 | json.dump(final_list, f) 182 | if visualize: 183 | visualize_failures(stat_list,mode) 184 | annFile = config.DATA_PATHS['val']['ans_file'] 185 | quesFile = config.DATA_PATHS['val']['ques_file'] 186 | vqa = VQA(annFile, quesFile) 187 | vqaRes = vqa.loadRes(valFile, quesFile) 188 | vqaEval = VQAEval(vqa, vqaRes, n=2) 189 | vqaEval.evaluate() 190 | acc_overall = vqaEval.accuracy['overall'] 191 | acc_perQuestionType = vqaEval.accuracy['perQuestionType'] 192 | acc_perAnswerType = vqaEval.accuracy['perAnswerType'] 193 | return mean_testloss, acc_overall, acc_perQuestionType, acc_perAnswerType 194 | elif mode == 'test-dev': 195 | filename = './%s/vqa_OpenEnded_mscoco_test-dev2015_%s-'%(folder,folder)+str(it).zfill(8)+'_results' 196 | with open(filename+'.json', 'w') as f: 197 | json.dump(final_list, f) 198 | if visualize: 199 | visualize_failures(stat_list,mode) 200 | elif mode == 'test': 201 | filename = './%s/vqa_OpenEnded_mscoco_test2015_%s-'%(folder,folder)+str(it).zfill(8)+'_results' 202 | with open(filename+'.json', 'w') as f: 203 | json.dump(final_list, f) 204 | if visualize: 205 | visualize_failures(stat_list,mode) 206 | def drawgraph(results, folder,k,d,prefix='std',save_question_type_graphs=False): 207 | # 0:it 208 | # 1:trainloss 209 | # 2:testloss 210 | # 3:oa_acc 211 | # 4:qt_acc 212 | # 5:at_acc 213 | 214 | # training curve 215 | it = np.array([l[0] for l in results]) 216 | loss = np.array([l[1] for l in results]) 217 | valloss = np.array([l[2] for l in results]) 218 | valacc = np.array([l[3] for l in results]) 219 | 220 | fig = plt.figure() 221 | ax1 = fig.add_subplot(111) 222 | ax2 = ax1.twinx() 223 | 224 | ax1.plot(it,loss, color='blue', label='train loss') 225 | ax1.plot(it,valloss, '--', color='blue', label='test loss') 226 | ax2.plot(it,valacc, color='red', label='acc on val') 227 | plt.legend(loc='lower left') 228 | 229 | ax1.set_xlabel('Iterations') 230 | ax1.set_ylabel('Loss Value') 231 | 
ax2.set_ylabel('Accuracy on Val [%]') 232 | 233 | plt.savefig('./%s/result_it_%d_acc_%2.2f_k_%d_d_%d_%s.png'%(folder,it[-1],valacc[-1],k,d,prefix)) 234 | plt.clf() 235 | plt.close("all") 236 | 237 | # question type 238 | it = np.array([l[0] for l in results]) 239 | oa_acc = np.array([l[3] for l in results]) 240 | qt_dic_list = [l[4] for l in results] 241 | 242 | def draw_qt_acc(target_key_list, figname): 243 | fig = plt.figure() 244 | for k in target_key_list: 245 | print k,type(k) 246 | t_val = np.array([ qt_dic[k] for qt_dic in qt_dic_list]) 247 | plt.plot(it,t_val,label=str(k)) 248 | plt.legend(fontsize='small') 249 | plt.ylim(0,100.) 250 | #plt.legend(prop={'size':6}) 251 | 252 | plt.xlabel('Iterations') 253 | plt.ylabel('Accuracy on Val [%]') 254 | 255 | plt.savefig(figname,dpi=200) 256 | plt.clf() 257 | plt.close("all") 258 | 259 | if save_question_type_graphs: 260 | s_keys = sorted(qt_dic_list[0].keys()) 261 | draw_qt_acc(s_keys[ 0:13]+[s_keys[31],], './ind_qt_are.png') 262 | draw_qt_acc(s_keys[13:17]+s_keys[49:], './ind_qt_how_where_who_why.png') 263 | draw_qt_acc(s_keys[17:31]+[s_keys[32],], './ind_qt_is.png') 264 | draw_qt_acc(s_keys[33:49], './ind_qt_what.png') 265 | draw_qt_acc(['what color is the','what color are the','what color is',\ 266 | 'what color','what is the color of the'],'./qt_color.png') 267 | draw_qt_acc(['how many','how','how many people are',\ 268 | 'how many people are in'],'./qt_number.png') 269 | draw_qt_acc(['who is','why','why is the','where is the','where are the',\ 270 | 'which'],'./qt_who_why_where_which.png') 271 | draw_qt_acc(['what is the man','is the man','are they','is he',\ 272 | 'is the woman','is this person','what is the woman','is the person',\ 273 | 'what is the person'],'./qt_human.png') 274 | 275 | 276 | -------------------------------------------------------------------------------- /mfh_baseline/train_mfh_baseline.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use('Agg') 3 | import os 4 | import sys 5 | import numpy as np 6 | import json 7 | import matplotlib.pyplot as plt 8 | 9 | import caffe 10 | from caffe import layers as L 11 | from caffe import params as P 12 | from caffe.proto import caffe_pb2 13 | 14 | from vqa_data_layer_kld import VQADataProvider 15 | from utils import exec_validation, drawgraph 16 | import config 17 | import time 18 | 19 | def get_solver(folder): 20 | s = caffe_pb2.SolverParameter() 21 | s.train_net = './%s/proto_train.prototxt'%folder 22 | s.snapshot = 10000 23 | s.snapshot_prefix = './%s/'%folder 24 | s.max_iter = int(config.MAX_ITERATIONS) 25 | s.display = int(config.VALIDATE_INTERVAL) 26 | s.type = 'Adam' 27 | s.stepsize = int(config.MAX_ITERATIONS*0.2) 28 | s.gamma = 0.5 29 | s.lr_policy = "step" 30 | s.base_lr = 0.0007 31 | s.momentum = 0.9 32 | s.momentum2 = 0.999 33 | s.weight_decay = 0.000 34 | s.clip_gradients = 10 35 | return s 36 | 37 | def get_auxiliary_json(): 38 | aux = {} 39 | aux["batch_size"] = int(config.VAL_BATCH_SIZE) 40 | aux["data_shape"] = [2048] 41 | aux["img_feature_prefix"] = config.DATA_PATHS['test']['features_prefix'] 42 | aux["glove"] = False 43 | return aux 44 | 45 | 46 | def mfh_baseline(mode, batchsize, T, question_vocab_size, folder): 47 | n = caffe.NetSpec() 48 | mode_str = json.dumps({'mode':mode, 'batchsize':batchsize,'folder':folder}) 49 | if mode == 'val': 50 | n.data, n.cont, n.img_feature, n.label = L.Python( \ 51 | module='vqa_data_layer', layer='VQADataProviderLayer', \ 52 | param_str=mode_str, 
ntop=4 )
53 |     else:
54 |         n.data, n.cont, n.img_feature, n.label = L.Python(\
55 |             module='vqa_data_layer_kld', layer='VQADataProviderLayer', \
56 |             param_str=mode_str, ntop=4 )
57 |     n.embed = L.Embed(n.data, input_dim=question_vocab_size, num_output=300, \
58 |         weight_filler=dict(type='xavier'))
59 |     n.embed_tanh = L.TanH(n.embed)
60 | 
61 |     # LSTM
62 |     n.lstm1 = L.LSTM(\
63 |         n.embed_tanh, n.cont,\
64 |         recurrent_param=dict(\
65 |             num_output=config.LSTM_UNIT_NUM,\
66 |             weight_filler=dict(type='xavier')))
67 |     tops1 = L.Slice(n.lstm1, ntop=config.MAX_WORDS_IN_QUESTION, slice_param={'axis':0})
68 |     for i in xrange(config.MAX_WORDS_IN_QUESTION-1):
69 |         n.__setattr__('slice_first'+str(i), tops1[int(i)])
70 |         n.__setattr__('silence_data_first'+str(i), L.Silence(tops1[int(i)],ntop=0))
71 |     n.lstm1_out = tops1[config.MAX_WORDS_IN_QUESTION-1]
72 |     n.lstm1_reshaped = L.Reshape(n.lstm1_out,\
73 |         reshape_param=dict(\
74 |             shape=dict(dim=[-1,1024])))
75 |     n.q_feat = L.Dropout(n.lstm1_reshaped,dropout_param={'dropout_ratio':config.LSTM_DROPOUT_RATIO})
76 | 
77 |     '''
78 |     Coarse Image-Question MFH fusion
79 |     '''
80 | 
81 |     n.mfb_q_o2_proj = L.InnerProduct(n.q_feat, num_output=config.JOINT_EMB_SIZE,
82 |         weight_filler=dict(type='xavier'))
83 |     n.mfb_i_o2_proj = L.InnerProduct(n.img_feature, num_output=config.JOINT_EMB_SIZE,
84 |         weight_filler=dict(type='xavier'))
85 |     n.mfb_iq_o2_eltwise = L.Eltwise(n.mfb_q_o2_proj, n.mfb_i_o2_proj, eltwise_param=dict(operation=0))
86 |     n.mfb_iq_o2_drop = L.Dropout(n.mfb_iq_o2_eltwise, dropout_param={'dropout_ratio':config.MFB_DROPOUT_RATIO})
87 |     n.mfb_iq_o2_resh = L.Reshape(n.mfb_iq_o2_drop, reshape_param=dict(shape=dict(dim=[-1,1,config.MFB_OUT_DIM,config.MFB_FACTOR_NUM])))
88 |     n.mfb_iq_o2_sumpool = L.Pooling(n.mfb_iq_o2_resh, pool=P.Pooling.SUM, \
89 |         pooling_param=dict(kernel_w=config.MFB_FACTOR_NUM, kernel_h=1))
90 |     n.mfb_o2_out = L.Reshape(n.mfb_iq_o2_sumpool,\
91 |         reshape_param=dict(shape=dict(dim=[-1,config.MFB_OUT_DIM])))
92 |     n.mfb_o2_sign_sqrt = L.SignedSqrt(n.mfb_o2_out)
93 |     n.mfb_o2_l2 = L.L2Normalize(n.mfb_o2_sign_sqrt)
94 | 
95 |     n.mfb_q_o3_proj = L.InnerProduct(n.q_feat, num_output=config.JOINT_EMB_SIZE,
96 |         weight_filler=dict(type='xavier'))
97 |     n.mfb_i_o3_proj = L.InnerProduct(n.img_feature, num_output=config.JOINT_EMB_SIZE,
98 |         weight_filler=dict(type='xavier'))
99 |     n.mfb_iq_o3_eltwise = L.Eltwise(n.mfb_q_o3_proj, n.mfb_i_o3_proj,n.mfb_iq_o2_drop, eltwise_param=dict(operation=0))
100 |     n.mfb_iq_o3_drop = L.Dropout(n.mfb_iq_o3_eltwise, dropout_param={'dropout_ratio':config.MFB_DROPOUT_RATIO})
101 |     n.mfb_iq_o3_resh = L.Reshape(n.mfb_iq_o3_drop, reshape_param=dict(shape=dict(dim=[-1,1,config.MFB_OUT_DIM,config.MFB_FACTOR_NUM])))
102 |     n.mfb_iq_o3_sumpool = L.Pooling(n.mfb_iq_o3_resh, pool=P.Pooling.SUM, \
103 |         pooling_param=dict(kernel_w=config.MFB_FACTOR_NUM, kernel_h=1))
104 |     n.mfb_o3_out = L.Reshape(n.mfb_iq_o3_sumpool,\
105 |         reshape_param=dict(shape=dict(dim=[-1,config.MFB_OUT_DIM])))
106 |     n.mfb_o3_sign_sqrt = L.SignedSqrt(n.mfb_o3_out)
107 |     n.mfb_o3_l2 = L.L2Normalize(n.mfb_o3_sign_sqrt)
108 | 
109 |     n.mfb_o23_l2 = L.Concat(n.mfb_o2_l2,n.mfb_o3_l2)
110 | 
111 |     n.prediction = L.InnerProduct(n.mfb_o23_l2, num_output=config.NUM_OUTPUT_UNITS,
112 |         weight_filler=dict(type='xavier'))
113 | 
114 |     if mode == 'val':
115 |         n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
116 |     else:
117 |         n.loss = L.SoftmaxKLDLoss(n.prediction, n.label)
118 |     return n.to_proto()
119 | 
120 | def make_answer_vocab(adic, vocab_size):
121 |     """
122 |     Returns a dictionary
that maps words to indices. 123 | """ 124 | adict = {'':0} 125 | nadict = {'':1000000} 126 | vid = 1 127 | for qid in adic.keys(): 128 | answer_obj = adic[qid] 129 | answer_list = [ans['answer'] for ans in answer_obj] 130 | 131 | for q_ans in answer_list: 132 | # create dict 133 | if adict.has_key(q_ans): 134 | nadict[q_ans] += 1 135 | else: 136 | nadict[q_ans] = 1 137 | adict[q_ans] = vid 138 | vid +=1 139 | 140 | # debug 141 | nalist = [] 142 | for k,v in sorted(nadict.items(), key=lambda x:x[1]): 143 | nalist.append((k,v)) 144 | 145 | # remove words that appear less than once 146 | n_del_ans = 0 147 | n_valid_ans = 0 148 | adict_nid = {} 149 | for i, w in enumerate(nalist[:-vocab_size]): 150 | del adict[w[0]] 151 | n_del_ans += w[1] 152 | for i, w in enumerate(nalist[-vocab_size:]): 153 | n_valid_ans += w[1] 154 | adict_nid[w[0]] = i 155 | 156 | return adict_nid 157 | 158 | def make_question_vocab(qdic): 159 | """ 160 | Returns a dictionary that maps words to indices. 161 | """ 162 | vdict = {'':0} 163 | vid = 1 164 | for qid in qdic.keys(): 165 | # sequence to list 166 | q_str = qdic[qid]['qstr'] 167 | q_list = VQADataProvider.seq_to_list(q_str) 168 | 169 | # create dict 170 | for w in q_list: 171 | if not vdict.has_key(w): 172 | vdict[w] = vid 173 | vid +=1 174 | 175 | return vdict 176 | 177 | def make_vocab_files(): 178 | """ 179 | Produce the question and answer vocabulary files. 180 | """ 181 | print 'making question vocab...', config.QUESTION_VOCAB_SPACE 182 | qdic, _ = VQADataProvider.load_data(config.QUESTION_VOCAB_SPACE) 183 | question_vocab = make_question_vocab(qdic) 184 | print 'making answer vocab...', config.ANSWER_VOCAB_SPACE 185 | _, adic = VQADataProvider.load_data(config.ANSWER_VOCAB_SPACE) 186 | answer_vocab = make_answer_vocab(adic, config.NUM_OUTPUT_UNITS) 187 | return question_vocab, answer_vocab 188 | 189 | def main(): 190 | folder = 'mfh_baseline_%s'%(config.TRAIN_DATA_SPLITS) 191 | if not os.path.exists('./%s'%folder): 192 | os.makedirs('./%s'%folder) 193 | 194 | question_vocab, answer_vocab = {}, {} 195 | if os.path.exists('./%s/vdict.json'%folder) and os.path.exists('./%s/adict.json'%folder): 196 | print 'restoring vocab' 197 | with open('./%s/vdict.json'%folder,'r') as f: 198 | question_vocab = json.load(f) 199 | with open('./%s/adict.json'%folder,'r') as f: 200 | answer_vocab = json.load(f) 201 | else: 202 | question_vocab, answer_vocab = make_vocab_files() 203 | with open('./%s/vdict.json'%folder,'w') as f: 204 | json.dump(question_vocab, f) 205 | with open('./%s/adict.json'%folder,'w') as f: 206 | json.dump(answer_vocab, f) 207 | 208 | print 'question vocab size:', len(question_vocab) 209 | print 'answer vocab size:', len(answer_vocab) 210 | 211 | with open('./%s/proto_train.prototxt'%folder, 'w') as f: 212 | f.write(str(mfh_baseline(config.TRAIN_DATA_SPLITS, config.BATCH_SIZE, \ 213 | config.MAX_WORDS_IN_QUESTION, len(question_vocab), folder))) 214 | 215 | with open('./%s/proto_test.prototxt'%folder, 'w') as f: 216 | f.write(str(mfh_baseline('val', config.VAL_BATCH_SIZE, \ 217 | config.MAX_WORDS_IN_QUESTION, len(question_vocab), folder))) 218 | 219 | with open('./%s/solver.prototxt'%folder, 'w') as f: 220 | f.write(str(get_solver(folder))) 221 | with open('./%s/auxiliary.json'%folder, 'w') as f: 222 | json.dump(get_auxiliary_json(),f, indent=2) 223 | 224 | caffe.set_device(config.TRAIN_GPU_ID) 225 | caffe.set_mode_gpu() 226 | solver = caffe.get_solver('./%s/solver.prototxt'%folder) 227 | 228 | train_loss = np.zeros(config.MAX_ITERATIONS+1) 229 | results = 
[] 230 | 231 | if config.RESTORE_ITER: 232 | restore_iter = config.RESTORE_ITER 233 | solver.restore('./%s/_iter_%d.solverstate'%(folder,restore_iter)) 234 | else: 235 | restore_iter = 0 236 | 237 | start = time.clock() 238 | for it in range(restore_iter, config.MAX_ITERATIONS+1): 239 | solver.step(1) 240 | 241 | # store the train loss 242 | train_loss[it] = solver.net.blobs['loss'].data 243 | 244 | if it % config.PRINT_INTERVAL == 0 and it != 0: 245 | elapsed = (time.clock() - start) 246 | print 'Iteration:', it 247 | c_mean_loss = train_loss[it-config.PRINT_INTERVAL:it].mean() 248 | print 'Train loss:', c_mean_loss, ' Elapsed seconds:', elapsed 249 | start = time.clock() 250 | if it % config.VALIDATE_INTERVAL == 0 and it != restore_iter: 251 | model_name = './%s/tmp.caffemodel'%(folder) 252 | solver.net.save(model_name) 253 | print 'Validating...' 254 | ''' 255 | # for test-dev /test set. the json file will be generated under the file 256 | exec_validation(config.TEST_GPU_ID, 'test-dev', model_name, it=it, folder=folder) 257 | caffe.set_device(config.TRAIN_GPU_ID) 258 | ''' 259 | #for val set. the accuracy will be computed and ploted 260 | test_loss, acc_overall, acc_per_ques, acc_per_ans = exec_validation(config.TEST_GPU_ID, 'val', model_name, it=it, folder=folder) 261 | caffe.set_device(config.TRAIN_GPU_ID) 262 | print 'Test loss:', test_loss 263 | print 'Accuracy:', acc_overall 264 | print 'Test per ans', acc_per_ans 265 | results.append([it, c_mean_loss, test_loss, acc_overall, acc_per_ques, acc_per_ans]) 266 | best_result_idx = np.array([x[3] for x in results]).argmax() 267 | print 'Best accuracy of', results[best_result_idx][3], 'was at iteration', results[best_result_idx][0] 268 | drawgraph(results,folder,config.MFB_FACTOR_NUM,config.MFB_OUT_DIM,prefix='mfh_baseline') 269 | 270 | if __name__ == '__main__': 271 | main() 272 | -------------------------------------------------------------------------------- /eval/ensemble.py: -------------------------------------------------------------------------------- 1 | """ 2 | Generates predictions on test-dev or test using an ensemble of nets. The 3 | ensemble is produced using the average of the pre-softmax output from each net. 4 | 5 | Place each model in its own folder. The folder must contain: 6 | 7 | - The .caffemodel file 8 | - proto_test.prototxt 9 | - adict.json 10 | - vdict.json 11 | - auxiliary.json 12 | 13 | auxiliary.json should contain the following keys: 14 | 15 | - batch_size (value should be integer) 16 | - data_shape (value should be array of integer) 17 | - img_feature_prefix (value should be string) 18 | - spatial_coord (value should be boolean) 19 | - glove (value should be boolean) 20 | 21 | If the folder also contains "preds.pkl", evaluation is skipped for that network. 22 | 23 | """ 24 | 25 | import caffe 26 | import numpy as np 27 | import cPickle 28 | import argparse, os, glob 29 | import sys 30 | import json 31 | from collections import defaultdict 32 | import vqa_data_layer 33 | from vqa_data_layer import LoadVQADataProvider 34 | 35 | def verify_all(folder_paths): 36 | """ 37 | Calls verify_one on each folder path. Also checks to make sure all the 38 | answer vocabularies are the same. 
39 | """ 40 | adict_paths = [] 41 | for folder_path in folder_paths: 42 | paths = verify_one(folder_path) 43 | adict_paths.append(paths[2]) 44 | adicts = [] 45 | for path in adict_paths: 46 | with open(path, 'r') as f: 47 | adict = json.load(f) 48 | adicts.append(adict) 49 | if len(adicts) > 1: 50 | for a2 in adicts[1:]: 51 | if set(adicts[0].keys()) != set(a2.keys()): 52 | print set(adicts[0].keys()) - set(a2.keys()) 53 | print set(a2.keys()) - set(adicts[0].keys()) 54 | raise Exception('Answer vocab mismatch') 55 | return adicts 56 | 57 | def verify_one(folder_path): 58 | """ 59 | Makes sure all the required files exist in the folder. If so, returns the 60 | paths to all the files. 61 | """ 62 | model_path = glob.glob(folder_path + '/tmp*.caffemodel') 63 | print model_path 64 | assert len(model_path) == 1, 'one .caffemodel per folder, please' 65 | model_path = model_path[0] 66 | proto_path = folder_path + '/proto_test.prototxt' 67 | adict_path = folder_path + '/adict.json' 68 | vdict_path = folder_path + '/vdict.json' 69 | aux_path = folder_path + '/auxiliary.json' 70 | assert os.path.exists(proto_path), 'proto_test.prototxt missing' 71 | assert os.path.exists(adict_path), 'adict.json missing' 72 | assert os.path.exists(vdict_path), 'vdict.json missing' 73 | assert os.path.exists(aux_path), 'auxiliary.json missing' 74 | with open(aux_path, 'r') as f: 75 | aux = json.load(f) 76 | batch_size = int(aux['batch_size']) 77 | data_shape = tuple(map(int, aux['data_shape'])) 78 | img_feature_prefix = aux['img_feature_prefix'] 79 | spatial_coord = aux['spatial_coord'] if 'spatial_coord' in aux else False 80 | glove = aux['glove'] if 'glove' in aux else False 81 | model_weight = float(aux['model_weight']) if 'model_weight' in aux else 1.0 82 | #print 'weight: ', model_weight 83 | return model_path, proto_path, adict_path, vdict_path, batch_size, data_shape, img_feature_prefix, spatial_coord, glove, model_weight 84 | 85 | def get_pkl_fname(ques_file): 86 | if '_val2014_' in ques_file: 87 | return '/preds_val.pkl' 88 | elif '_test-dev2015_' in ques_file: 89 | return '/preds_test_dev.pkl' 90 | elif '_test2015_' in ques_file: 91 | return '/preds_test.pkl' 92 | else: 93 | raise NotImplementedError 94 | 95 | def eval_one(folder_path, gpuid, ques_file): 96 | """ 97 | Evaluates a single model (in folder_path) on the questions in ques_file. 98 | Returns an array of (QID, answer vector) tuples. 99 | """ 100 | 101 | model_path, proto_path, adict_path, vdict_path, batch_size, data_shape, \ 102 | img_feature_prefix, spatial_coord, glove, model_weight = verify_one(folder_path) 103 | 104 | dp = LoadVQADataProvider(ques_file, img_feature_prefix, vdict_path, \ 105 | adict_path, mode='test', batchsize=batch_size, data_shape=data_shape) 106 | total_questions = len(dp.getQuesIds()) 107 | print total_questions, 'total questions' 108 | 109 | if os.path.exists(folder_path + get_pkl_fname(ques_file)): 110 | print 'Found existing prediction file, trying to load...' 111 | with open(folder_path + get_pkl_fname(ques_file), 'r') as f: 112 | preds = cPickle.load(f) 113 | if len(preds) >= total_questions: 114 | print 'Loaded.' 115 | return preds 116 | else: 117 | print 'Number of saved answers does not match number of questions, continuing...' 
118 | 119 | caffe.set_device(gpuid) 120 | caffe.set_mode_gpu() 121 | 122 | vqa_data_layer.CURRENT_DATA_SHAPE = data_shape # This is a huge hack 123 | vqa_data_layer.SPATIAL_COORD = spatial_coord 124 | vqa_data_layer.GLOVE = glove 125 | 126 | net = caffe.Net(proto_path, model_path, caffe.TEST) 127 | 128 | print 'Model loaded:', model_path 129 | print 'Image feature prefix:', img_feature_prefix 130 | sys.stdout.flush() 131 | 132 | 133 | pred_layers = [] 134 | 135 | epoch = 0 136 | while epoch == 0: 137 | t_word, t_cont, t_img_feature, t_answer, t_glove_matrix, t_qid_list, _, epoch = dp.get_batch_vec() 138 | net.blobs['data'].data[...] = np.transpose(t_word,(1,0)) 139 | net.blobs['cont'].data[...] = np.transpose(t_cont,(1,0)) 140 | net.blobs['img_feature'].data[...] = t_img_feature 141 | net.blobs['label'].data[...] = t_answer # dummy 142 | if glove: 143 | net.blobs['glove'].data[...] = np.transpose(t_glove_matrix, (1,0,2)) 144 | net.forward() 145 | ans_matrix = net.blobs['prediction'].data 146 | 147 | for i in range(len(t_qid_list)): 148 | qid = t_qid_list[i] 149 | pred_layers.append((qid, np.copy(model_weight * ans_matrix[i]))) # model_weight * answer_matrix 150 | 151 | percent = 100 * float(len(pred_layers)) / total_questions 152 | sys.stdout.write('\r' + ('%.2f' % percent) + '%') 153 | sys.stdout.flush() 154 | 155 | #print 'Saving predictions...' 156 | #with open(folder_path + get_pkl_fname(ques_file), 'w') as f: 157 | # cPickle.dump(pred_layers, f, protocol=-1) 158 | #print 'Saved.' 159 | return pred_layers 160 | 161 | def make_rev_adict(adict): 162 | """ 163 | An adict maps text answers to neuron indices. A reverse adict maps neuron 164 | indices to text answers. 165 | """ 166 | rev_adict = {} 167 | for k,v in adict.items(): 168 | rev_adict[v] = k 169 | return rev_adict 170 | 171 | def softmax(arr): 172 | e = np.exp(arr) 173 | dist = e / np.sum(e) 174 | return dist 175 | 176 | def get_qid_valid_answer_dict(ques_file, adict): 177 | """ 178 | Returns a dictionary mapping question IDs to valid neuron indices. 179 | """ 180 | print 'Multiple choice mode: making valid answer dictionary...' 
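The softmax above exponentiates the averaged activations directly, which can overflow in np.exp when the magnitudes are large. A numerically stable variant (a sketch, not wired into this file) subtracts the maximum first:

import numpy as np

def softmax_stable(arr):
    # exp(a - max(a)) is proportional to exp(a), so the distribution is
    # unchanged, but the largest exponent is 0 and np.exp cannot overflow.
    e = np.exp(arr - np.max(arr))
    return e / np.sum(e)
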
181 |     valid_answer_dict = {}
182 |     with open(ques_file, 'r') as f:
183 |         qdata = json.load(f)
184 |     for q in qdata['questions']:
185 |         valid_answer_dict[q['question_id']] = q['multiple_choices']
186 |     for qid in valid_answer_dict:
187 |         answers = valid_answer_dict[qid]
188 |         valid_indices = []
189 |         for answer in answers:
190 |             if answer in adict:
191 |                 valid_indices.append(adict[answer])
192 |         if len(valid_indices) == 0:
193 |             print "we won't be able to answer qid", qid
194 |         valid_answer_dict[qid] = valid_indices
195 |     return valid_answer_dict
196 | 
197 | def dedupe(arr):
198 |     print 'Deduping arr of len', len(arr)
199 |     deduped = []
200 |     seen = set()
201 |     for qid, pred in arr:
202 |         if qid not in seen:
203 |             seen.add(qid)
204 |             deduped.append((qid, pred))
205 |     print 'New len', len(deduped)
206 |     return deduped
207 | 
208 | def reorder_one(predictions, this_adict, canonical_adict):
209 |     index_map = {}
210 |     for idx, word in make_rev_adict(this_adict).iteritems():
211 |         index_map[int(idx)] = int(canonical_adict[word])
212 |     index_array = np.zeros(len(index_map), dtype=int)
213 |     for src_idx, dest_idx in index_map.iteritems():
214 |         index_array[dest_idx] = src_idx  # inverse permutation: canonical slot -> this-model slot
215 |     reordered = []
216 |     for qid, output in predictions:
217 |         reordered.append((qid, np.copy(output[index_array])))  # gather puts each score in its canonical slot
218 |     return reordered
219 | 
220 | def reorder_predictions(predictions, adicts):
221 |     """
222 |     Reorders prediction matrices so that the unit order matches that of the
223 |     first answer dictionary.
224 |     """
225 |     if len(adicts) == 1:
226 |         return predictions
227 |     need_to_reorder = False
228 |     for a2 in adicts[1:]:
229 |         if adicts[0] != a2:
230 |             need_to_reorder = True
231 |     print 'Reordering...' if need_to_reorder else 'No need to reorder!'
232 |     if not need_to_reorder:
233 |         return predictions
234 |     reordered = [predictions[0]]  # the first model already uses the canonical order
235 |     for i in range(1, len(adicts)):
236 |         if adicts[0] != adicts[i]:
237 |             reordered.append(reorder_one(predictions[i], adicts[i], adicts[0]))
238 |         else:
239 |             reordered.append(predictions[i])
240 |     return reordered
241 | 
242 | def average_outputs(arr_of_arr, rev_adict, qid_valid_answer_dict):
243 |     """
244 |     Given a list of lists, where each list contains (QID, answer vector) tuples,
245 |     returns a single dictionary which maps a question ID to the text answer.
246 |     """
247 |     print 'Averaging outputs...'
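A standalone toy check of the inverse-permutation gather in reorder_one above (the three-word vocabularies and scores are hypothetical):

import numpy as np

this_adict      = {'a': 0, 'b': 1, 'c': 2}   # this model's answer order
canonical_adict = {'b': 0, 'c': 1, 'a': 2}   # the first model's answer order
output = np.array([0.5, 0.3, 0.2])           # scores for a, b, c in this-model order

index_array = np.zeros(3, dtype=int)
for word, src_idx in this_adict.items():
    index_array[canonical_adict[word]] = src_idx   # canonical slot -> this-model slot

reordered = output[index_array]                    # gather into canonical order
assert reordered.tolist() == [0.3, 0.2, 0.5]       # scores for b, c, a
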
248 | merged = defaultdict(list) 249 | for arr in arr_of_arr: 250 | for qid, ans_vec in arr: 251 | merged[qid].append(ans_vec) 252 | 253 | merged = {qid: softmax(np.vstack(ans_vecs).mean(axis=0)) for qid, ans_vecs in merged.iteritems()} 254 | mask_len = len(merged.values()[0]) 255 | 256 | # Multiple choice filtering 257 | if qid_valid_answer_dict is not None: 258 | for qid in merged: 259 | valid_indices = qid_valid_answer_dict[qid] 260 | mask = np.zeros(mask_len) 261 | for idx in valid_indices: 262 | mask[idx] = 1 263 | merged[qid] *= mask 264 | 265 | merged = {qid: rev_adict[ans_vec.argmax()] for qid, ans_vec in merged.iteritems()} 266 | 267 | return merged 268 | 269 | def save_json(qid_ans_dict, fname): 270 | tmp = [] 271 | for qid, ans in qid_ans_dict.iteritems(): 272 | tmp.append({u'answer': ans, u'question_id': qid}) 273 | with open(fname, 'w') as f: 274 | json.dump(tmp, f) 275 | print 'Saved to', fname 276 | 277 | def main(): 278 | parser = argparse.ArgumentParser() 279 | parser.add_argument('--ques_file', required=True) 280 | parser.add_argument('--gpu', type=int, required=True) 281 | parser.add_argument('--out_file', required=True) 282 | parser.add_argument('folders', nargs='*', 283 | help='space-separated list of folders containing models') 284 | args = parser.parse_args() 285 | assert len(args.folders) > 0, 'please specify at least one folder' 286 | print 'Folders', args.folders 287 | 288 | adicts = verify_all(args.folders) 289 | print '-----------------------------------------------' 290 | qid_valid_answer_dict = None 291 | if 'MultipleChoice' in args.ques_file: 292 | qid_valid_answer_dict = get_qid_valid_answer_dict(args.ques_file, adicts[0]) 293 | 294 | arr_of_arr = [eval_one(folder_path, args.gpu, args.ques_file) for folder_path in args.folders] 295 | arr_of_arr = [dedupe(x) for x in arr_of_arr] 296 | #np.save('%s.predict_arr.npz'%args.out_file,x = arr_of_arr) 297 | reordered = reorder_predictions(arr_of_arr, adicts) 298 | qid_ans_dict = average_outputs(reordered, make_rev_adict(adicts[0]), qid_valid_answer_dict) 299 | save_json(qid_ans_dict, args.out_file) 300 | 301 | if __name__ == '__main__': 302 | main() 303 | -------------------------------------------------------------------------------- /mfb_coatt_glove/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import os 4 | import sys 5 | import json 6 | import re 7 | import shutil 8 | from PIL import Image 9 | from PIL import ImageFont, ImageDraw 10 | 11 | import caffe 12 | from caffe import layers as L 13 | from caffe import params as P 14 | 15 | from vqa_data_layer import VQADataProvider, VQADataProviderLayer 16 | 17 | import config 18 | sys.path.append(config.VQA_TOOLS_PATH) 19 | sys.path.append(config.VQA_EVAL_TOOLS_PATH) 20 | 21 | from vqaTools.vqa import VQA 22 | from vqaEvaluation.vqaEval import VQAEval 23 | 24 | def visualize_failures(stat_list,mode): 25 | 26 | def save_qtype(qtype_list, save_filename, mode): 27 | 28 | if mode == 'val': 29 | savepath = os.path.join('./eval', save_filename) 30 | # TODO 31 | img_pre = '/home/dhpseth/vqa/02_tools/VQA/Images/val2014' 32 | elif mode == 'test-dev': 33 | savepath = os.path.join('./test-dev', save_filename) 34 | # TODO 35 | img_pre = '/home/dhpseth/vqa/02_tools/VQA/Images/test2015' 36 | elif mode == 'test': 37 | savepath = os.path.join('./test', save_filename) 38 | # TODO 39 | img_pre = '/home/dhpseth/vqa/02_tools/VQA/Images/test2015' 40 | else: 41 | raise 
Exception('Unsupported mode') 42 | if os.path.exists(savepath): shutil.rmtree(savepath) 43 | if not os.path.exists(savepath): os.makedirs(savepath) 44 | 45 | for qt in qtype_list: 46 | count = 0 47 | for t_question in stat_list: 48 | #print count, t_question 49 | if count < 40/len(qtype_list): 50 | t_question_list = t_question['q_list'] 51 | saveflag = False 52 | #print 'debug****************************' 53 | #print qt 54 | #print t_question_list 55 | #print t_question_list[0] == qt[0] 56 | #print t_question_list[1] == qt[1] 57 | if t_question_list[0] == qt[0] and t_question_list[1] == qt[1]: 58 | saveflag = True 59 | else: 60 | saveflag = False 61 | 62 | if saveflag == True: 63 | t_iid = t_question['iid'] 64 | if mode == 'val': 65 | t_img = Image.open(os.path.join(img_pre, \ 66 | 'COCO_val2014_' + str(t_iid).zfill(12) + '.jpg')) 67 | elif mode == 'test-dev' or 'test': 68 | t_img = Image.open(os.path.join(img_pre, \ 69 | 'COCO_test2015_' + str(t_iid).zfill(12) + '.jpg')) 70 | 71 | # for caption 72 | #print t_iid 73 | #annIds = caps.getAnnIds(t_iid) 74 | #anns = caps.loadAnns(annIds) 75 | #cap_list = [ann['caption'] for ann in anns] 76 | ans_list = t_question['ans_list'] 77 | draw = ImageDraw.Draw(t_img) 78 | for i in range(len(ans_list)): 79 | try: 80 | draw.text((10,10*i), str(ans_list[i])) 81 | except: 82 | pass 83 | 84 | ans = t_question['answer'] 85 | pred = t_question['pred'] 86 | if ans == -1: 87 | pre = '' 88 | elif ans == pred: 89 | pre = 'correct ' 90 | else: 91 | pre = 'failure ' 92 | #print ' aaa ', ans, pred 93 | ans = re.sub( '/', ' ', str(ans)) 94 | pred = re.sub( '/', ' ', str(pred)) 95 | img_title = pre + str(' '.join(t_question_list)) + '. a_' + \ 96 | str(ans) + ' p_' + str(pred) + '.png' 97 | count += 1 98 | print os.path.join(savepath,img_title) 99 | t_img.save(os.path.join(savepath,img_title)) 100 | 101 | print 'saving whatis' 102 | qt_color_list = [['what','color']] 103 | save_qtype(qt_color_list, 'colors', mode) 104 | 105 | print 'saving whatis' 106 | qt_whatis_list = [['what','is'],['what','kind'],['what','are']] 107 | save_qtype(qt_whatis_list, 'whatis', mode) 108 | 109 | print 'saving is' 110 | qt_is_list = [['is','the'], ['is','this'],['is','there']] 111 | save_qtype(qt_is_list, 'is', mode) 112 | 113 | print 'saving how many' 114 | qt_howmany_list =[['how','many']] 115 | save_qtype(qt_howmany_list, 'howmany', mode) 116 | 117 | def exec_validation(device_id, mode, model_name, folder, it='', visualize=False): 118 | 119 | caffe.set_device(device_id) 120 | caffe.set_mode_gpu() 121 | net = caffe.Net('./%s/proto_test.prototxt'%folder,model_name,caffe.TEST) 122 | 123 | dp = VQADataProvider(mode=mode,batchsize=config.VAL_BATCH_SIZE,folder=folder) 124 | total_questions = len(dp.getQuesIds()) 125 | epoch = 0 126 | 127 | pred_list = [] 128 | testloss_list = [] 129 | stat_list = [] 130 | 131 | while epoch == 0: 132 | t_word, t_cont, t_img_feature, t_answer, t_glove_matrix, t_qid_list, t_iid_list, epoch = dp.get_batch_vec() 133 | net.blobs['data'].data[...] = np.transpose(t_word,(1,0)) 134 | net.blobs['cont'].data[...] = np.transpose(t_cont,(1,0)) 135 | net.blobs['img_feature'].data[...] = t_img_feature 136 | net.blobs['label'].data[...] = t_answer 137 | net.blobs['glove'].data[...] 
= np.transpose(t_glove_matrix, (1,0,2)) 138 | net.forward() 139 | t_pred_list = net.blobs['prediction'].data.argmax(axis=1) 140 | t_pred_str = [dp.vec_to_answer(pred_symbol) for pred_symbol in t_pred_list] 141 | testloss_list.append(net.blobs['loss'].data) 142 | for qid, iid, ans, pred in zip(t_qid_list, t_iid_list, t_answer.tolist(), t_pred_str): 143 | #pred_list.append({u'answer':pred, u'question_id': int(dp.getStrippedQuesId(qid))}) 144 | pred_list.append((pred,int(dp.getStrippedQuesId(qid)))) 145 | if visualize: 146 | q_list = dp.seq_to_list(dp.getQuesStr(qid)) 147 | if mode == 'test-dev' or 'test': 148 | ans_str = '' 149 | ans_list = ['']*10 150 | else: 151 | ans_str = dp.vec_to_answer(ans) 152 | ans_list = [ dp.getAnsObj(qid)[i]['answer'] for i in xrange(10)] 153 | stat_list.append({\ 154 | 'qid' : qid, 155 | 'q_list' : q_list, 156 | 'iid' : iid, 157 | 'answer': ans_str, 158 | 'ans_list': ans_list, 159 | 'pred' : pred }) 160 | percent = 100 * float(len(pred_list)) / total_questions 161 | sys.stdout.write('\r' + ('%.2f' % percent) + '%') 162 | sys.stdout.flush() 163 | 164 | 165 | print 'Deduping arr of len', len(pred_list) 166 | deduped = [] 167 | seen = set() 168 | for ans, qid in pred_list: 169 | if qid not in seen: 170 | seen.add(qid) 171 | deduped.append((ans, qid)) 172 | print 'New len', len(deduped) 173 | final_list=[] 174 | for ans,qid in deduped: 175 | final_list.append({u'answer': ans, u'question_id': qid}) 176 | 177 | mean_testloss = np.array(testloss_list).mean() 178 | 179 | if mode == 'val': 180 | valFile = './%s/val2015_resfile'%folder 181 | with open(valFile, 'w') as f: 182 | json.dump(final_list, f) 183 | if visualize: 184 | visualize_failures(stat_list,mode) 185 | annFile = config.DATA_PATHS['val']['ans_file'] 186 | quesFile = config.DATA_PATHS['val']['ques_file'] 187 | vqa = VQA(annFile, quesFile) 188 | vqaRes = vqa.loadRes(valFile, quesFile) 189 | vqaEval = VQAEval(vqa, vqaRes, n=2) 190 | vqaEval.evaluate() 191 | acc_overall = vqaEval.accuracy['overall'] 192 | acc_perQuestionType = vqaEval.accuracy['perQuestionType'] 193 | acc_perAnswerType = vqaEval.accuracy['perAnswerType'] 194 | return mean_testloss, acc_overall, acc_perQuestionType, acc_perAnswerType 195 | elif mode == 'test-dev': 196 | filename = './%s/vqa_OpenEnded_mscoco_test-dev2015_%s-'%(folder,folder)+str(it).zfill(8)+'_results' 197 | with open(filename+'.json', 'w') as f: 198 | json.dump(final_list, f) 199 | if visualize: 200 | visualize_failures(stat_list,mode) 201 | elif mode == 'test': 202 | filename = './%s/vqa_OpenEnded_mscoco_test2015_%s-'%(folder,folder)+str(it).zfill(8)+'_results' 203 | with open(filename+'.json', 'w') as f: 204 | json.dump(final_list, f) 205 | if visualize: 206 | visualize_failures(stat_list,mode) 207 | def drawgraph(results, folder,k,d,prefix='std',save_question_type_graphs=False): 208 | # 0:it 209 | # 1:trainloss 210 | # 2:testloss 211 | # 3:oa_acc 212 | # 4:qt_acc 213 | # 5:at_acc 214 | 215 | # training curve 216 | it = np.array([l[0] for l in results]) 217 | loss = np.array([l[1] for l in results]) 218 | valloss = np.array([l[2] for l in results]) 219 | valacc = np.array([l[3] for l in results]) 220 | 221 | fig = plt.figure() 222 | ax1 = fig.add_subplot(111) 223 | ax2 = ax1.twinx() 224 | 225 | ax1.plot(it,loss, color='blue', label='train loss') 226 | ax1.plot(it,valloss, '--', color='blue', label='test loss') 227 | ax2.plot(it,valacc, color='red', label='acc on val') 228 | plt.legend(loc='lower left') 229 | 230 | ax1.set_xlabel('Iterations') 231 | ax1.set_ylabel('Loss Value') 
232 | ax2.set_ylabel('Accuracy on Val [%]') 233 | 234 | plt.savefig('./%s/result_it_%d_acc_%2.2f_k_%d_d_%d_%s.png'%(folder,it[-1],valacc[-1],k,d,prefix)) 235 | plt.clf() 236 | plt.close("all") 237 | 238 | # question type 239 | it = np.array([l[0] for l in results]) 240 | oa_acc = np.array([l[3] for l in results]) 241 | qt_dic_list = [l[4] for l in results] 242 | 243 | def draw_qt_acc(target_key_list, figname): 244 | fig = plt.figure() 245 | for k in target_key_list: 246 | print k,type(k) 247 | t_val = np.array([ qt_dic[k] for qt_dic in qt_dic_list]) 248 | plt.plot(it,t_val,label=str(k)) 249 | plt.legend(fontsize='small') 250 | plt.ylim(0,100.) 251 | #plt.legend(prop={'size':6}) 252 | 253 | plt.xlabel('Iterations') 254 | plt.ylabel('Accuracy on Val [%]') 255 | 256 | plt.savefig(figname,dpi=200) 257 | plt.clf() 258 | plt.close("all") 259 | 260 | if save_question_type_graphs: 261 | s_keys = sorted(qt_dic_list[0].keys()) 262 | draw_qt_acc(s_keys[ 0:13]+[s_keys[31],], './ind_qt_are.png') 263 | draw_qt_acc(s_keys[13:17]+s_keys[49:], './ind_qt_how_where_who_why.png') 264 | draw_qt_acc(s_keys[17:31]+[s_keys[32],], './ind_qt_is.png') 265 | draw_qt_acc(s_keys[33:49], './ind_qt_what.png') 266 | draw_qt_acc(['what color is the','what color are the','what color is',\ 267 | 'what color','what is the color of the'],'./qt_color.png') 268 | draw_qt_acc(['how many','how','how many people are',\ 269 | 'how many people are in'],'./qt_number.png') 270 | draw_qt_acc(['who is','why','why is the','where is the','where are the',\ 271 | 'which'],'./qt_who_why_where_which.png') 272 | draw_qt_acc(['what is the man','is the man','are they','is he',\ 273 | 'is the woman','is this person','what is the woman','is the person',\ 274 | 'what is the person'],'./qt_human.png') 275 | 276 | 277 | -------------------------------------------------------------------------------- /mfh_coatt_glove/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import os 4 | import sys 5 | import json 6 | import re 7 | import shutil 8 | from PIL import Image 9 | from PIL import ImageFont, ImageDraw 10 | 11 | import caffe 12 | from caffe import layers as L 13 | from caffe import params as P 14 | 15 | from vqa_data_layer import VQADataProvider, VQADataProviderLayer 16 | 17 | import config 18 | sys.path.append(config.VQA_TOOLS_PATH) 19 | sys.path.append(config.VQA_EVAL_TOOLS_PATH) 20 | 21 | from vqaTools.vqa import VQA 22 | from vqaEvaluation.vqaEval import VQAEval 23 | 24 | def visualize_failures(stat_list,mode): 25 | 26 | def save_qtype(qtype_list, save_filename, mode): 27 | 28 | if mode == 'val': 29 | savepath = os.path.join('./eval', save_filename) 30 | # TODO 31 | img_pre = '/home/dhpseth/vqa/02_tools/VQA/Images/val2014' 32 | elif mode == 'test-dev': 33 | savepath = os.path.join('./test-dev', save_filename) 34 | # TODO 35 | img_pre = '/home/dhpseth/vqa/02_tools/VQA/Images/test2015' 36 | elif mode == 'test': 37 | savepath = os.path.join('./test', save_filename) 38 | # TODO 39 | img_pre = '/home/dhpseth/vqa/02_tools/VQA/Images/test2015' 40 | else: 41 | raise Exception('Unsupported mode') 42 | if os.path.exists(savepath): shutil.rmtree(savepath) 43 | if not os.path.exists(savepath): os.makedirs(savepath) 44 | 45 | for qt in qtype_list: 46 | count = 0 47 | for t_question in stat_list: 48 | #print count, t_question 49 | if count < 40/len(qtype_list): 50 | t_question_list = t_question['q_list'] 51 | saveflag = False 52 | #print 
'debug****************************' 53 | #print qt 54 | #print t_question_list 55 | #print t_question_list[0] == qt[0] 56 | #print t_question_list[1] == qt[1] 57 | if t_question_list[0] == qt[0] and t_question_list[1] == qt[1]: 58 | saveflag = True 59 | else: 60 | saveflag = False 61 | 62 | if saveflag == True: 63 | t_iid = t_question['iid'] 64 | if mode == 'val': 65 | t_img = Image.open(os.path.join(img_pre, \ 66 | 'COCO_val2014_' + str(t_iid).zfill(12) + '.jpg')) 67 | elif mode == 'test-dev' or 'test': 68 | t_img = Image.open(os.path.join(img_pre, \ 69 | 'COCO_test2015_' + str(t_iid).zfill(12) + '.jpg')) 70 | 71 | # for caption 72 | #print t_iid 73 | #annIds = caps.getAnnIds(t_iid) 74 | #anns = caps.loadAnns(annIds) 75 | #cap_list = [ann['caption'] for ann in anns] 76 | ans_list = t_question['ans_list'] 77 | draw = ImageDraw.Draw(t_img) 78 | for i in range(len(ans_list)): 79 | try: 80 | draw.text((10,10*i), str(ans_list[i])) 81 | except: 82 | pass 83 | 84 | ans = t_question['answer'] 85 | pred = t_question['pred'] 86 | if ans == -1: 87 | pre = '' 88 | elif ans == pred: 89 | pre = 'correct ' 90 | else: 91 | pre = 'failure ' 92 | #print ' aaa ', ans, pred 93 | ans = re.sub( '/', ' ', str(ans)) 94 | pred = re.sub( '/', ' ', str(pred)) 95 | img_title = pre + str(' '.join(t_question_list)) + '. a_' + \ 96 | str(ans) + ' p_' + str(pred) + '.png' 97 | count += 1 98 | print os.path.join(savepath,img_title) 99 | t_img.save(os.path.join(savepath,img_title)) 100 | 101 | print 'saving whatis' 102 | qt_color_list = [['what','color']] 103 | save_qtype(qt_color_list, 'colors', mode) 104 | 105 | print 'saving whatis' 106 | qt_whatis_list = [['what','is'],['what','kind'],['what','are']] 107 | save_qtype(qt_whatis_list, 'whatis', mode) 108 | 109 | print 'saving is' 110 | qt_is_list = [['is','the'], ['is','this'],['is','there']] 111 | save_qtype(qt_is_list, 'is', mode) 112 | 113 | print 'saving how many' 114 | qt_howmany_list =[['how','many']] 115 | save_qtype(qt_howmany_list, 'howmany', mode) 116 | 117 | def exec_validation(device_id, mode, model_name, folder, it='', visualize=False): 118 | 119 | caffe.set_device(device_id) 120 | caffe.set_mode_gpu() 121 | net = caffe.Net('./%s/proto_test.prototxt'%folder,model_name,caffe.TEST) 122 | 123 | dp = VQADataProvider(mode=mode,batchsize=config.VAL_BATCH_SIZE,folder=folder) 124 | total_questions = len(dp.getQuesIds()) 125 | epoch = 0 126 | 127 | pred_list = [] 128 | testloss_list = [] 129 | stat_list = [] 130 | 131 | while epoch == 0: 132 | t_word, t_cont, t_img_feature, t_answer, t_glove_matrix, t_qid_list, t_iid_list, epoch = dp.get_batch_vec() 133 | net.blobs['data'].data[...] = np.transpose(t_word,(1,0)) 134 | net.blobs['cont'].data[...] = np.transpose(t_cont,(1,0)) 135 | net.blobs['img_feature'].data[...] = t_img_feature 136 | net.blobs['label'].data[...] = t_answer 137 | net.blobs['glove'].data[...] 
= np.transpose(t_glove_matrix, (1,0,2)) 138 | net.forward() 139 | t_pred_list = net.blobs['prediction'].data.argmax(axis=1) 140 | t_pred_str = [dp.vec_to_answer(pred_symbol) for pred_symbol in t_pred_list] 141 | testloss_list.append(net.blobs['loss'].data) 142 | for qid, iid, ans, pred in zip(t_qid_list, t_iid_list, t_answer.tolist(), t_pred_str): 143 | #pred_list.append({u'answer':pred, u'question_id': int(dp.getStrippedQuesId(qid))}) 144 | pred_list.append((pred,int(dp.getStrippedQuesId(qid)))) 145 | if visualize: 146 | q_list = dp.seq_to_list(dp.getQuesStr(qid)) 147 | if mode == 'test-dev' or 'test': 148 | ans_str = '' 149 | ans_list = ['']*10 150 | else: 151 | ans_str = dp.vec_to_answer(ans) 152 | ans_list = [ dp.getAnsObj(qid)[i]['answer'] for i in xrange(10)] 153 | stat_list.append({\ 154 | 'qid' : qid, 155 | 'q_list' : q_list, 156 | 'iid' : iid, 157 | 'answer': ans_str, 158 | 'ans_list': ans_list, 159 | 'pred' : pred }) 160 | percent = 100 * float(len(pred_list)) / total_questions 161 | sys.stdout.write('\r' + ('%.2f' % percent) + '%') 162 | sys.stdout.flush() 163 | 164 | 165 | print 'Deduping arr of len', len(pred_list) 166 | deduped = [] 167 | seen = set() 168 | for ans, qid in pred_list: 169 | if qid not in seen: 170 | seen.add(qid) 171 | deduped.append((ans, qid)) 172 | print 'New len', len(deduped) 173 | final_list=[] 174 | for ans,qid in deduped: 175 | final_list.append({u'answer': ans, u'question_id': qid}) 176 | 177 | mean_testloss = np.array(testloss_list).mean() 178 | 179 | if mode == 'val': 180 | valFile = './%s/val2015_resfile'%folder 181 | with open(valFile, 'w') as f: 182 | json.dump(final_list, f) 183 | if visualize: 184 | visualize_failures(stat_list,mode) 185 | annFile = config.DATA_PATHS['val']['ans_file'] 186 | quesFile = config.DATA_PATHS['val']['ques_file'] 187 | vqa = VQA(annFile, quesFile) 188 | vqaRes = vqa.loadRes(valFile, quesFile) 189 | vqaEval = VQAEval(vqa, vqaRes, n=2) 190 | vqaEval.evaluate() 191 | acc_overall = vqaEval.accuracy['overall'] 192 | acc_perQuestionType = vqaEval.accuracy['perQuestionType'] 193 | acc_perAnswerType = vqaEval.accuracy['perAnswerType'] 194 | return mean_testloss, acc_overall, acc_perQuestionType, acc_perAnswerType 195 | elif mode == 'test-dev': 196 | filename = './%s/vqa_OpenEnded_mscoco_test-dev2015_%s-'%(folder,folder)+str(it).zfill(8)+'_results' 197 | with open(filename+'.json', 'w') as f: 198 | json.dump(final_list, f) 199 | if visualize: 200 | visualize_failures(stat_list,mode) 201 | elif mode == 'test': 202 | filename = './%s/vqa_OpenEnded_mscoco_test2015_%s-'%(folder,folder)+str(it).zfill(8)+'_results' 203 | with open(filename+'.json', 'w') as f: 204 | json.dump(final_list, f) 205 | if visualize: 206 | visualize_failures(stat_list,mode) 207 | def drawgraph(results, folder,k,d,prefix='std',save_question_type_graphs=False): 208 | # 0:it 209 | # 1:trainloss 210 | # 2:testloss 211 | # 3:oa_acc 212 | # 4:qt_acc 213 | # 5:at_acc 214 | 215 | # training curve 216 | it = np.array([l[0] for l in results]) 217 | loss = np.array([l[1] for l in results]) 218 | valloss = np.array([l[2] for l in results]) 219 | valacc = np.array([l[3] for l in results]) 220 | 221 | fig = plt.figure() 222 | ax1 = fig.add_subplot(111) 223 | ax2 = ax1.twinx() 224 | 225 | ax1.plot(it,loss, color='blue', label='train loss') 226 | ax1.plot(it,valloss, '--', color='blue', label='test loss') 227 | ax2.plot(it,valacc, color='red', label='acc on val') 228 | plt.legend(loc='lower left') 229 | 230 | ax1.set_xlabel('Iterations') 231 | ax1.set_ylabel('Loss Value') 
232 | ax2.set_ylabel('Accuracy on Val [%]') 233 | 234 | plt.savefig('./%s/result_it_%d_acc_%2.2f_k_%d_d_%d_%s.png'%(folder,it[-1],valacc[-1],k,d,prefix)) 235 | plt.clf() 236 | plt.close("all") 237 | 238 | # question type 239 | it = np.array([l[0] for l in results]) 240 | oa_acc = np.array([l[3] for l in results]) 241 | qt_dic_list = [l[4] for l in results] 242 | 243 | def draw_qt_acc(target_key_list, figname): 244 | fig = plt.figure() 245 | for k in target_key_list: 246 | print k,type(k) 247 | t_val = np.array([ qt_dic[k] for qt_dic in qt_dic_list]) 248 | plt.plot(it,t_val,label=str(k)) 249 | plt.legend(fontsize='small') 250 | plt.ylim(0,100.) 251 | #plt.legend(prop={'size':6}) 252 | 253 | plt.xlabel('Iterations') 254 | plt.ylabel('Accuracy on Val [%]') 255 | 256 | plt.savefig(figname,dpi=200) 257 | plt.clf() 258 | plt.close("all") 259 | 260 | if save_question_type_graphs: 261 | s_keys = sorted(qt_dic_list[0].keys()) 262 | draw_qt_acc(s_keys[ 0:13]+[s_keys[31],], './ind_qt_are.png') 263 | draw_qt_acc(s_keys[13:17]+s_keys[49:], './ind_qt_how_where_who_why.png') 264 | draw_qt_acc(s_keys[17:31]+[s_keys[32],], './ind_qt_is.png') 265 | draw_qt_acc(s_keys[33:49], './ind_qt_what.png') 266 | draw_qt_acc(['what color is the','what color are the','what color is',\ 267 | 'what color','what is the color of the'],'./qt_color.png') 268 | draw_qt_acc(['how many','how','how many people are',\ 269 | 'how many people are in'],'./qt_number.png') 270 | draw_qt_acc(['who is','why','why is the','where is the','where are the',\ 271 | 'which'],'./qt_who_why_where_which.png') 272 | draw_qt_acc(['what is the man','is the man','are they','is he',\ 273 | 'is the woman','is this person','what is the woman','is the person',\ 274 | 'what is the person'],'./qt_human.png') 275 | 276 | 277 | -------------------------------------------------------------------------------- /mfb_baseline/vqa_data_layer.py: -------------------------------------------------------------------------------- 1 | import caffe 2 | import numpy as np 3 | import re, json, random 4 | import config 5 | 6 | QID_KEY_SEPARATOR = '/' 7 | GLOVE_EMBEDDING_SIZE = 300 8 | 9 | class VQADataProvider: 10 | 11 | def __init__(self, folder='result', batchsize=64, max_length=15, mode='train'): 12 | self.batchsize = batchsize 13 | self.d_vocabulary = None 14 | self.batch_index = None 15 | self.batch_len = None 16 | self.rev_adict = None 17 | self.max_length = max_length 18 | self.mode = mode 19 | self.qdic, self.adic = VQADataProvider.load_data(mode) 20 | 21 | with open('./%s/vdict.json'%folder,'r') as f: 22 | self.vdict = json.load(f) 23 | with open('./%s/adict.json'%folder,'r') as f: 24 | self.adict = json.load(f) 25 | 26 | self.n_ans_vocabulary = len(self.adict) 27 | 28 | @staticmethod 29 | def load_vqa_json(data_split): 30 | """ 31 | Parses the question and answer json files for the given data split. 32 | Returns the question dictionary and the answer dictionary. 
33 | """ 34 | qdic, adic = {}, {} 35 | 36 | with open(config.DATA_PATHS[data_split]['ques_file'], 'r') as f: 37 | qdata = json.load(f)['questions'] 38 | for q in qdata: 39 | qdic[data_split + QID_KEY_SEPARATOR + str(q['question_id'])] = \ 40 | {'qstr': q['question'], 'iid': q['image_id']} 41 | 42 | if 'test' not in data_split: 43 | with open(config.DATA_PATHS[data_split]['ans_file'], 'r') as f: 44 | adata = json.load(f)['annotations'] 45 | for a in adata: 46 | adic[data_split + QID_KEY_SEPARATOR + str(a['question_id'])] = \ 47 | a['answers'] 48 | 49 | print 'parsed', len(qdic), 'questions for', data_split 50 | return qdic, adic 51 | 52 | @staticmethod 53 | def load_genome_json(): 54 | """ 55 | Parses the genome json file. Returns the question dictionary and the 56 | answer dictionary. 57 | """ 58 | qdic, adic = {}, {} 59 | 60 | with open(config.DATA_PATHS['genome']['genome_file'], 'r') as f: 61 | qdata = json.load(f) 62 | for q in qdata: 63 | key = 'genome' + QID_KEY_SEPARATOR + str(q['id']) 64 | qdic[key] = {'qstr': q['question'], 'iid': q['image']} 65 | adic[key] = [{'answer': q['answer']}] 66 | 67 | print 'parsed', len(qdic), 'questions for genome' 68 | return qdic, adic 69 | 70 | @staticmethod 71 | def load_data(data_split_str): 72 | all_qdic, all_adic = {}, {} 73 | for data_split in data_split_str.split('+'): 74 | assert data_split in config.DATA_PATHS.keys(), 'unknown data split' 75 | if data_split == 'genome': 76 | qdic, adic = VQADataProvider.load_genome_json() 77 | all_qdic.update(qdic) 78 | all_adic.update(adic) 79 | else: 80 | qdic, adic = VQADataProvider.load_vqa_json(data_split) 81 | all_qdic.update(qdic) 82 | all_adic.update(adic) 83 | return all_qdic, all_adic 84 | 85 | def getQuesIds(self): 86 | return self.qdic.keys() 87 | 88 | def getStrippedQuesId(self, qid): 89 | return qid.split(QID_KEY_SEPARATOR)[1] 90 | 91 | def getImgId(self,qid): 92 | return self.qdic[qid]['iid'] 93 | 94 | def getQuesStr(self,qid): 95 | return self.qdic[qid]['qstr'] 96 | 97 | def getAnsObj(self,qid): 98 | if self.mode == 'test-dev' or self.mode == 'test': 99 | return -1 100 | return self.adic[qid] 101 | 102 | @staticmethod 103 | def seq_to_list(s): 104 | t_str = s.lower() 105 | for i in [r'\?',r'\!',r'\'',r'\"',r'\$',r'\:',r'\@',r'\(',r'\)',r'\,',r'\.',r'\;']: 106 | t_str = re.sub( i, '', t_str) 107 | for i in [r'\-',r'\/']: 108 | t_str = re.sub( i, ' ', t_str) 109 | q_list = re.sub(r'\?','',t_str.lower()).split(' ') 110 | q_list = filter(lambda x: len(x) > 0, q_list) 111 | return q_list 112 | 113 | def extract_answer(self,answer_obj): 114 | """ Return the most popular answer in string.""" 115 | if self.mode == 'test-dev' or self.mode == 'test': 116 | return -1 117 | answer_list = [ answer_obj[i]['answer'] for i in xrange(10)] 118 | dic = {} 119 | for ans in answer_list: 120 | if dic.has_key(ans): 121 | dic[ans] +=1 122 | else: 123 | dic[ans] = 1 124 | max_key = max((v,k) for (k,v) in dic.items())[1] 125 | return max_key 126 | 127 | def extract_answer_prob(self,answer_obj): 128 | """ Return the most popular answer in string.""" 129 | if self.mode == 'test-dev' or self.mode == 'test': 130 | return -1 131 | 132 | answer_list = [ ans['answer'] for ans in answer_obj] 133 | prob_answer_list = [] 134 | for ans in answer_list: 135 | if self.adict.has_key(ans): 136 | prob_answer_list.append(ans) 137 | 138 | if len(prob_answer_list) == 0: 139 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 140 | return 'hoge' 141 | else: 142 | raise Exception("This should not happen.") 143 | else: 
144 | return random.choice(prob_answer_list) 145 | 146 | def qlist_to_vec(self, max_length, q_list): 147 | """ 148 | Converts a list of words into a format suitable for the embedding layer. 149 | 150 | Arguments: 151 | max_length -- the maximum length of a question sequence 152 | q_list -- a list of words which are the tokens in the question 153 | 154 | Returns: 155 | qvec -- A max_length length vector containing one-hot indices for each word 156 | cvec -- A max_length length sequence continuation indicator vector 157 | """ 158 | qvec = np.zeros(max_length) 159 | cvec = np.zeros(max_length) 160 | for i in xrange(max_length): 161 | if i < max_length - len(q_list): 162 | cvec[i] = 0 163 | else: 164 | w = q_list[i-(max_length-len(q_list))] 165 | # is the word in the vocabulary? 166 | if self.vdict.has_key(w) is False: 167 | w = '' 168 | qvec[i] = self.vdict[w] 169 | cvec[i] = 0 if i == max_length - len(q_list) else 1 170 | 171 | return qvec, cvec 172 | 173 | def answer_to_vec(self, ans_str): 174 | """ Return answer id if the answer is included in vocabulary otherwise '' """ 175 | if self.mode =='test-dev' or self.mode == 'test': 176 | return -1 177 | 178 | if self.adict.has_key(ans_str): 179 | ans = self.adict[ans_str] 180 | else: 181 | ans = self.adict[''] 182 | return ans 183 | 184 | def vec_to_answer(self, ans_symbol): 185 | """ Return answer id if the answer is included in vocabulary otherwise '' """ 186 | if self.rev_adict is None: 187 | rev_adict = {} 188 | for k,v in self.adict.items(): 189 | rev_adict[v] = k 190 | self.rev_adict = rev_adict 191 | 192 | return self.rev_adict[ans_symbol] 193 | 194 | def create_batch(self,qid_list): 195 | 196 | qvec = (np.zeros(self.batchsize*self.max_length)).reshape(self.batchsize,self.max_length) 197 | cvec = (np.zeros(self.batchsize*self.max_length)).reshape(self.batchsize,self.max_length) 198 | ivec = (np.zeros(self.batchsize*2048)).reshape(self.batchsize,2048) 199 | avec = (np.zeros(self.batchsize)).reshape(self.batchsize) 200 | 201 | for i,qid in enumerate(qid_list): 202 | 203 | # load raw question information 204 | q_str = self.getQuesStr(qid) 205 | q_ans = self.getAnsObj(qid) 206 | q_iid = self.getImgId(qid) 207 | 208 | # convert question to vec 209 | q_list = VQADataProvider.seq_to_list(q_str) 210 | t_qvec, t_cvec = self.qlist_to_vec(self.max_length, q_list) 211 | 212 | try: 213 | qid_split = qid.split(QID_KEY_SEPARATOR) 214 | data_split = qid_split[0] 215 | if data_split == 'genome': 216 | t_ivec = np.load(config.DATA_PATHS['genome']['features_prefix'] + str(q_iid) + '.jpg.npz')['x'] 217 | else: 218 | t_ivec = np.load(config.DATA_PATHS[data_split]['features_prefix'] + str(q_iid).zfill(12) + '.jpg.npz')['x'] 219 | t_ivec = ( t_ivec / np.sqrt((t_ivec**2).sum()) ) 220 | except: 221 | t_ivec = 0. 222 | print 'data not found for qid : ', q_iid, self.mode 223 | 224 | # convert answer to vec 225 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 226 | q_ans_str = self.extract_answer(q_ans) 227 | else: 228 | q_ans_str = self.extract_answer_prob(q_ans) 229 | t_avec = self.answer_to_vec(q_ans_str) 230 | 231 | qvec[i,...] = t_qvec 232 | cvec[i,...] = t_cvec 233 | ivec[i,...] = t_ivec 234 | avec[i,...] 
= t_avec 235 | 236 | return qvec, cvec, ivec, avec 237 | 238 | 239 | def get_batch_vec(self): 240 | if self.batch_len is None: 241 | self.n_skipped = 0 242 | qid_list = self.getQuesIds() 243 | random.shuffle(qid_list) 244 | self.qid_list = qid_list 245 | self.batch_len = len(qid_list) 246 | self.batch_index = 0 247 | self.epoch_counter = 0 248 | 249 | def has_at_least_one_valid_answer(t_qid): 250 | answer_obj = self.getAnsObj(t_qid) 251 | answer_list = [ans['answer'] for ans in answer_obj] 252 | for ans in answer_list: 253 | if self.adict.has_key(ans): 254 | return True 255 | 256 | counter = 0 257 | t_qid_list = [] 258 | t_iid_list = [] 259 | while counter < self.batchsize: 260 | t_qid = self.qid_list[self.batch_index] 261 | t_iid = self.getImgId(t_qid) 262 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 263 | t_qid_list.append(t_qid) 264 | t_iid_list.append(t_iid) 265 | counter += 1 266 | elif has_at_least_one_valid_answer(t_qid): 267 | t_qid_list.append(t_qid) 268 | t_iid_list.append(t_iid) 269 | counter += 1 270 | else: 271 | self.n_skipped += 1 272 | 273 | if self.batch_index < self.batch_len-1: 274 | self.batch_index += 1 275 | else: 276 | self.epoch_counter += 1 277 | qid_list = self.getQuesIds() 278 | random.shuffle(qid_list) 279 | self.qid_list = qid_list 280 | self.batch_index = 0 281 | print("%d questions were skipped in a single epoch" % self.n_skipped) 282 | self.n_skipped = 0 283 | 284 | t_batch = self.create_batch(t_qid_list) 285 | return t_batch + (t_qid_list, t_iid_list, self.epoch_counter) 286 | 287 | 288 | class VQADataProviderLayer(caffe.Layer): 289 | """ 290 | Provide input data for VQA. 291 | """ 292 | 293 | def setup(self, bottom, top): 294 | self.batchsize = json.loads(self.param_str)['batchsize'] 295 | self.top_names = ['data','cont','feature','label'] 296 | top[0].reshape(15,self.batchsize) 297 | top[1].reshape(15,self.batchsize) 298 | top[2].reshape(self.batchsize,2048) 299 | top[3].reshape(self.batchsize) 300 | 301 | self.mode = json.loads(self.param_str)['mode'] 302 | self.folder = json.loads(self.param_str)['folder'] 303 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 304 | pass 305 | else: 306 | self.dp = VQADataProvider(batchsize=self.batchsize, mode=self.mode, folder=self.folder) 307 | 308 | def reshape(self, bottom, top): 309 | pass 310 | 311 | def forward(self, bottom, top): 312 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 313 | pass 314 | else: 315 | word, cont, feature, answer, _, _, _ = self.dp.get_batch_vec() 316 | top[0].data[...] = np.transpose(word,(1,0)) # N x T -> T x N 317 | top[1].data[...] = np.transpose(cont,(1,0)) 318 | top[2].data[...] = feature 319 | top[3].data[...] 
= answer 320 | 321 | def backward(self, top, propagate_down, bottom): 322 | pass 323 | 324 | -------------------------------------------------------------------------------- /mfh_baseline/vqa_data_layer.py: -------------------------------------------------------------------------------- 1 | import caffe 2 | import numpy as np 3 | import re, json, random 4 | import config 5 | 6 | QID_KEY_SEPARATOR = '/' 7 | GLOVE_EMBEDDING_SIZE = 300 8 | 9 | class VQADataProvider: 10 | 11 | def __init__(self, folder='result', batchsize=64, max_length=15, mode='train'): 12 | self.batchsize = batchsize 13 | self.d_vocabulary = None 14 | self.batch_index = None 15 | self.batch_len = None 16 | self.rev_adict = None 17 | self.max_length = max_length 18 | self.mode = mode 19 | self.qdic, self.adic = VQADataProvider.load_data(mode) 20 | 21 | with open('./%s/vdict.json'%folder,'r') as f: 22 | self.vdict = json.load(f) 23 | with open('./%s/adict.json'%folder,'r') as f: 24 | self.adict = json.load(f) 25 | 26 | self.n_ans_vocabulary = len(self.adict) 27 | 28 | @staticmethod 29 | def load_vqa_json(data_split): 30 | """ 31 | Parses the question and answer json files for the given data split. 32 | Returns the question dictionary and the answer dictionary. 33 | """ 34 | qdic, adic = {}, {} 35 | 36 | with open(config.DATA_PATHS[data_split]['ques_file'], 'r') as f: 37 | qdata = json.load(f)['questions'] 38 | for q in qdata: 39 | qdic[data_split + QID_KEY_SEPARATOR + str(q['question_id'])] = \ 40 | {'qstr': q['question'], 'iid': q['image_id']} 41 | 42 | if 'test' not in data_split: 43 | with open(config.DATA_PATHS[data_split]['ans_file'], 'r') as f: 44 | adata = json.load(f)['annotations'] 45 | for a in adata: 46 | adic[data_split + QID_KEY_SEPARATOR + str(a['question_id'])] = \ 47 | a['answers'] 48 | 49 | print 'parsed', len(qdic), 'questions for', data_split 50 | return qdic, adic 51 | 52 | @staticmethod 53 | def load_genome_json(): 54 | """ 55 | Parses the genome json file. Returns the question dictionary and the 56 | answer dictionary. 
57 | """ 58 | qdic, adic = {}, {} 59 | 60 | with open(config.DATA_PATHS['genome']['genome_file'], 'r') as f: 61 | qdata = json.load(f) 62 | for q in qdata: 63 | key = 'genome' + QID_KEY_SEPARATOR + str(q['id']) 64 | qdic[key] = {'qstr': q['question'], 'iid': q['image']} 65 | adic[key] = [{'answer': q['answer']}] 66 | 67 | print 'parsed', len(qdic), 'questions for genome' 68 | return qdic, adic 69 | 70 | @staticmethod 71 | def load_data(data_split_str): 72 | all_qdic, all_adic = {}, {} 73 | for data_split in data_split_str.split('+'): 74 | assert data_split in config.DATA_PATHS.keys(), 'unknown data split' 75 | if data_split == 'genome': 76 | qdic, adic = VQADataProvider.load_genome_json() 77 | all_qdic.update(qdic) 78 | all_adic.update(adic) 79 | else: 80 | qdic, adic = VQADataProvider.load_vqa_json(data_split) 81 | all_qdic.update(qdic) 82 | all_adic.update(adic) 83 | return all_qdic, all_adic 84 | 85 | def getQuesIds(self): 86 | return self.qdic.keys() 87 | 88 | def getStrippedQuesId(self, qid): 89 | return qid.split(QID_KEY_SEPARATOR)[1] 90 | 91 | def getImgId(self,qid): 92 | return self.qdic[qid]['iid'] 93 | 94 | def getQuesStr(self,qid): 95 | return self.qdic[qid]['qstr'] 96 | 97 | def getAnsObj(self,qid): 98 | if self.mode == 'test-dev' or self.mode == 'test': 99 | return -1 100 | return self.adic[qid] 101 | 102 | @staticmethod 103 | def seq_to_list(s): 104 | t_str = s.lower() 105 | for i in [r'\?',r'\!',r'\'',r'\"',r'\$',r'\:',r'\@',r'\(',r'\)',r'\,',r'\.',r'\;']: 106 | t_str = re.sub( i, '', t_str) 107 | for i in [r'\-',r'\/']: 108 | t_str = re.sub( i, ' ', t_str) 109 | q_list = re.sub(r'\?','',t_str.lower()).split(' ') 110 | q_list = filter(lambda x: len(x) > 0, q_list) 111 | return q_list 112 | 113 | def extract_answer(self,answer_obj): 114 | """ Return the most popular answer in string.""" 115 | if self.mode == 'test-dev' or self.mode == 'test': 116 | return -1 117 | answer_list = [ answer_obj[i]['answer'] for i in xrange(10)] 118 | dic = {} 119 | for ans in answer_list: 120 | if dic.has_key(ans): 121 | dic[ans] +=1 122 | else: 123 | dic[ans] = 1 124 | max_key = max((v,k) for (k,v) in dic.items())[1] 125 | return max_key 126 | 127 | def extract_answer_prob(self,answer_obj): 128 | """ Return the most popular answer in string.""" 129 | if self.mode == 'test-dev' or self.mode == 'test': 130 | return -1 131 | 132 | answer_list = [ ans['answer'] for ans in answer_obj] 133 | prob_answer_list = [] 134 | for ans in answer_list: 135 | if self.adict.has_key(ans): 136 | prob_answer_list.append(ans) 137 | 138 | if len(prob_answer_list) == 0: 139 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 140 | return 'hoge' 141 | else: 142 | raise Exception("This should not happen.") 143 | else: 144 | return random.choice(prob_answer_list) 145 | 146 | def qlist_to_vec(self, max_length, q_list): 147 | """ 148 | Converts a list of words into a format suitable for the embedding layer. 149 | 150 | Arguments: 151 | max_length -- the maximum length of a question sequence 152 | q_list -- a list of words which are the tokens in the question 153 | 154 | Returns: 155 | qvec -- A max_length length vector containing one-hot indices for each word 156 | cvec -- A max_length length sequence continuation indicator vector 157 | """ 158 | qvec = np.zeros(max_length) 159 | cvec = np.zeros(max_length) 160 | for i in xrange(max_length): 161 | if i < max_length - len(q_list): 162 | cvec[i] = 0 163 | else: 164 | w = q_list[i-(max_length-len(q_list))] 165 | # is the word in the vocabulary? 
166 | if self.vdict.has_key(w) is False: 167 | w = '' 168 | qvec[i] = self.vdict[w] 169 | cvec[i] = 0 if i == max_length - len(q_list) else 1 170 | 171 | return qvec, cvec 172 | 173 | def answer_to_vec(self, ans_str): 174 | """ Return answer id if the answer is included in vocabulary otherwise '' """ 175 | if self.mode =='test-dev' or self.mode == 'test': 176 | return -1 177 | 178 | if self.adict.has_key(ans_str): 179 | ans = self.adict[ans_str] 180 | else: 181 | ans = self.adict[''] 182 | return ans 183 | 184 | def vec_to_answer(self, ans_symbol): 185 | """ Return answer id if the answer is included in vocabulary otherwise '' """ 186 | if self.rev_adict is None: 187 | rev_adict = {} 188 | for k,v in self.adict.items(): 189 | rev_adict[v] = k 190 | self.rev_adict = rev_adict 191 | 192 | return self.rev_adict[ans_symbol] 193 | 194 | def create_batch(self,qid_list): 195 | 196 | qvec = (np.zeros(self.batchsize*self.max_length)).reshape(self.batchsize,self.max_length) 197 | cvec = (np.zeros(self.batchsize*self.max_length)).reshape(self.batchsize,self.max_length) 198 | ivec = (np.zeros(self.batchsize*2048)).reshape(self.batchsize,2048) 199 | avec = (np.zeros(self.batchsize)).reshape(self.batchsize) 200 | 201 | for i,qid in enumerate(qid_list): 202 | 203 | # load raw question information 204 | q_str = self.getQuesStr(qid) 205 | q_ans = self.getAnsObj(qid) 206 | q_iid = self.getImgId(qid) 207 | 208 | # convert question to vec 209 | q_list = VQADataProvider.seq_to_list(q_str) 210 | t_qvec, t_cvec = self.qlist_to_vec(self.max_length, q_list) 211 | 212 | try: 213 | qid_split = qid.split(QID_KEY_SEPARATOR) 214 | data_split = qid_split[0] 215 | if data_split == 'genome': 216 | t_ivec = np.load(config.DATA_PATHS['genome']['features_prefix'] + str(q_iid) + '.jpg.npz')['x'] 217 | else: 218 | t_ivec = np.load(config.DATA_PATHS[data_split]['features_prefix'] + str(q_iid).zfill(12) + '.jpg.npz')['x'] 219 | t_ivec = ( t_ivec / np.sqrt((t_ivec**2).sum()) ) 220 | except: 221 | t_ivec = 0. 222 | print 'data not found for qid : ', q_iid, self.mode 223 | 224 | # convert answer to vec 225 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 226 | q_ans_str = self.extract_answer(q_ans) 227 | else: 228 | q_ans_str = self.extract_answer_prob(q_ans) 229 | t_avec = self.answer_to_vec(q_ans_str) 230 | 231 | qvec[i,...] = t_qvec 232 | cvec[i,...] = t_cvec 233 | ivec[i,...] = t_ivec 234 | avec[i,...] 
= t_avec 235 | 236 | return qvec, cvec, ivec, avec 237 | 238 | 239 | def get_batch_vec(self): 240 | if self.batch_len is None: 241 | self.n_skipped = 0 242 | qid_list = self.getQuesIds() 243 | random.shuffle(qid_list) 244 | self.qid_list = qid_list 245 | self.batch_len = len(qid_list) 246 | self.batch_index = 0 247 | self.epoch_counter = 0 248 | 249 | def has_at_least_one_valid_answer(t_qid): 250 | answer_obj = self.getAnsObj(t_qid) 251 | answer_list = [ans['answer'] for ans in answer_obj] 252 | for ans in answer_list: 253 | if self.adict.has_key(ans): 254 | return True 255 | 256 | counter = 0 257 | t_qid_list = [] 258 | t_iid_list = [] 259 | while counter < self.batchsize: 260 | t_qid = self.qid_list[self.batch_index] 261 | t_iid = self.getImgId(t_qid) 262 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 263 | t_qid_list.append(t_qid) 264 | t_iid_list.append(t_iid) 265 | counter += 1 266 | elif has_at_least_one_valid_answer(t_qid): 267 | t_qid_list.append(t_qid) 268 | t_iid_list.append(t_iid) 269 | counter += 1 270 | else: 271 | self.n_skipped += 1 272 | 273 | if self.batch_index < self.batch_len-1: 274 | self.batch_index += 1 275 | else: 276 | self.epoch_counter += 1 277 | qid_list = self.getQuesIds() 278 | random.shuffle(qid_list) 279 | self.qid_list = qid_list 280 | self.batch_index = 0 281 | print("%d questions were skipped in a single epoch" % self.n_skipped) 282 | self.n_skipped = 0 283 | 284 | t_batch = self.create_batch(t_qid_list) 285 | return t_batch + (t_qid_list, t_iid_list, self.epoch_counter) 286 | 287 | 288 | class VQADataProviderLayer(caffe.Layer): 289 | """ 290 | Provide input data for VQA. 291 | """ 292 | 293 | def setup(self, bottom, top): 294 | self.batchsize = json.loads(self.param_str)['batchsize'] 295 | self.top_names = ['data','cont','feature','label'] 296 | top[0].reshape(15,self.batchsize) 297 | top[1].reshape(15,self.batchsize) 298 | top[2].reshape(self.batchsize,2048) 299 | top[3].reshape(self.batchsize) 300 | 301 | self.mode = json.loads(self.param_str)['mode'] 302 | self.folder = json.loads(self.param_str)['folder'] 303 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 304 | pass 305 | else: 306 | self.dp = VQADataProvider(batchsize=self.batchsize, mode=self.mode, folder=self.folder) 307 | 308 | def reshape(self, bottom, top): 309 | pass 310 | 311 | def forward(self, bottom, top): 312 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 313 | pass 314 | else: 315 | word, cont, feature, answer, _, _, _ = self.dp.get_batch_vec() 316 | top[0].data[...] = np.transpose(word,(1,0)) # N x T -> T x N 317 | top[1].data[...] = np.transpose(cont,(1,0)) 318 | top[2].data[...] = feature 319 | top[3].data[...] 
= answer 320 | 321 | def backward(self, top, propagate_down, bottom): 322 | pass 323 | 324 | -------------------------------------------------------------------------------- /mfb_baseline/vqa_data_layer_kld.py: -------------------------------------------------------------------------------- 1 | import caffe 2 | import numpy as np 3 | import re, json, random 4 | import config 5 | 6 | QID_KEY_SEPARATOR = '/' 7 | GLOVE_EMBEDDING_SIZE = 300 8 | 9 | class VQADataProvider: 10 | 11 | def __init__(self, folder='./result', batchsize=64, max_length=15, mode='train'): 12 | self.batchsize = batchsize 13 | self.d_vocabulary = None 14 | self.batch_index = None 15 | self.batch_len = None 16 | self.rev_adict = None 17 | self.max_length = max_length 18 | self.mode = mode 19 | self.qdic, self.adic = VQADataProvider.load_data(mode) 20 | 21 | with open('./%s/vdict.json'%folder,'r') as f: 22 | self.vdict = json.load(f) 23 | with open('./%s/adict.json'%folder,'r') as f: 24 | self.adict = json.load(f) 25 | 26 | 27 | @staticmethod 28 | def load_vqa_json(data_split): 29 | """ 30 | Parses the question and answer json files for the given data split. 31 | Returns the question dictionary and the answer dictionary. 32 | """ 33 | qdic, adic = {}, {} 34 | 35 | with open(config.DATA_PATHS[data_split]['ques_file'], 'r') as f: 36 | qdata = json.load(f)['questions'] 37 | for q in qdata: 38 | qdic[data_split + QID_KEY_SEPARATOR + str(q['question_id'])] = \ 39 | {'qstr': q['question'], 'iid': q['image_id']} 40 | 41 | if 'test' not in data_split: 42 | with open(config.DATA_PATHS[data_split]['ans_file'], 'r') as f: 43 | adata = json.load(f)['annotations'] 44 | for a in adata: 45 | adic[data_split + QID_KEY_SEPARATOR + str(a['question_id'])] = \ 46 | a['answers'] 47 | 48 | print 'parsed', len(qdic), 'questions for', data_split 49 | return qdic, adic 50 | 51 | @staticmethod 52 | def load_genome_json(): 53 | """ 54 | Parses the genome json file. Returns the question dictionary and the 55 | answer dictionary. 
56 | """ 57 | qdic, adic = {}, {} 58 | 59 | with open(config.DATA_PATHS['genome']['genome_file'], 'r') as f: 60 | qdata = json.load(f) 61 | for q in qdata: 62 | key = 'genome' + QID_KEY_SEPARATOR + str(q['id']) 63 | qdic[key] = {'qstr': q['question'], 'iid': q['image']} 64 | adic[key] = [{'answer': q['answer']}] 65 | 66 | print 'parsed', len(qdic), 'questions for genome' 67 | return qdic, adic 68 | 69 | @staticmethod 70 | def load_data(data_split_str): 71 | all_qdic, all_adic = {}, {} 72 | for data_split in data_split_str.split('+'): 73 | assert data_split in config.DATA_PATHS.keys(), 'unknown data split' 74 | if data_split == 'genome': 75 | qdic, adic = VQADataProvider.load_genome_json() 76 | all_qdic.update(qdic) 77 | all_adic.update(adic) 78 | else: 79 | qdic, adic = VQADataProvider.load_vqa_json(data_split) 80 | all_qdic.update(qdic) 81 | all_adic.update(adic) 82 | return all_qdic, all_adic 83 | 84 | def getQuesIds(self): 85 | return self.qdic.keys() 86 | 87 | def getStrippedQuesId(self, qid): 88 | return qid.split(QID_KEY_SEPARATOR)[1] 89 | 90 | def getImgId(self,qid): 91 | return self.qdic[qid]['iid'] 92 | 93 | def getQuesStr(self,qid): 94 | return self.qdic[qid]['qstr'] 95 | 96 | def getAnsObj(self,qid): 97 | if self.mode == 'test-dev' or self.mode == 'test': 98 | return -1 99 | return self.adic[qid] 100 | 101 | @staticmethod 102 | def seq_to_list(s): 103 | t_str = s.lower() 104 | for i in [r'\?',r'\!',r'\'',r'\"',r'\$',r'\:',r'\@',r'\(',r'\)',r'\,',r'\.',r'\;']: 105 | t_str = re.sub( i, '', t_str) 106 | for i in [r'\-',r'\/']: 107 | t_str = re.sub( i, ' ', t_str) 108 | q_list = re.sub(r'\?','',t_str.lower()).split(' ') 109 | q_list = filter(lambda x: len(x) > 0, q_list) 110 | return q_list 111 | 112 | def extract_answer(self,answer_obj): 113 | """ Return the most popular answer in string.""" 114 | if self.mode == 'test-dev' or self.mode == 'test': 115 | return -1 116 | answer_list = [ answer_obj[i]['answer'] for i in xrange(10)] 117 | dic = {} 118 | for ans in answer_list: 119 | if dic.has_key(ans): 120 | dic[ans] +=1 121 | else: 122 | dic[ans] = 1 123 | max_key = max((v,k) for (k,v) in dic.items())[1] 124 | return max_key 125 | 126 | def extract_answer_prob(self,answer_obj): 127 | """ Return the most popular answer in string.""" 128 | if self.mode == 'test-dev' or self.mode == 'test': 129 | return -1 130 | 131 | answer_list = [ ans['answer'] for ans in answer_obj] 132 | prob_answer_list = [] 133 | for ans in answer_list: 134 | if self.adict.has_key(ans): 135 | prob_answer_list.append(ans) 136 | def extract_answer_list(self,answer_obj): 137 | answer_list = [ ans['answer'] for ans in answer_obj] 138 | prob_answer_vec = np.zeros(config.NUM_OUTPUT_UNITS) 139 | for ans in answer_list: 140 | if self.adict.has_key(ans): 141 | index = self.adict[ans] 142 | prob_answer_vec[index] += 1 143 | return prob_answer_vec / np.sum(prob_answer_vec) 144 | 145 | if len(prob_answer_list) == 0: 146 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 147 | return 'hoge' 148 | else: 149 | raise Exception("This should not happen.") 150 | else: 151 | return random.choice(prob_answer_list) 152 | 153 | def qlist_to_vec(self, max_length, q_list): 154 | """ 155 | Converts a list of words into a format suitable for the embedding layer. 
153 | def qlist_to_vec(self, max_length, q_list):
154 | """
155 | Converts a list of words into a format suitable for the embedding layer.
156 | 
157 | Arguments:
158 | max_length -- the maximum length of a question sequence
159 | q_list -- a list of words which are the tokens in the question
160 | 
161 | Returns:
162 | qvec -- a max_length vector of vocabulary indices, one per word, left-padded with zeros
163 | cvec -- a max_length sequence-continuation indicator vector
164 | """
165 | qvec = np.zeros(max_length)
166 | cvec = np.zeros(max_length)
167 | for i in xrange(max_length):
168 | if i < max_length - len(q_list):
169 | cvec[i] = 0
170 | else:
171 | w = q_list[i-(max_length-len(q_list))]
172 | # is the word in the vocabulary?
173 | if self.vdict.has_key(w) is False:
174 | w = ''
175 | qvec[i] = self.vdict[w]
176 | cvec[i] = 0 if i == max_length - len(q_list) else 1
177 | 
178 | return qvec, cvec
179 | 
180 | def answer_to_vec(self, ans_str):
181 | """ Return the answer id of ans_str, falling back to the id of '' when the answer is out of vocabulary."""
182 | if self.mode =='test-dev' or self.mode == 'test':
183 | return -1
184 | 
185 | if self.adict.has_key(ans_str):
186 | ans = self.adict[ans_str]
187 | else:
188 | ans = self.adict['']
189 | return ans
190 | 
191 | def vec_to_answer(self, ans_symbol):
192 | """ Return the answer string for the given answer id."""
193 | if self.rev_adict is None:
194 | rev_adict = {}
195 | for k,v in self.adict.items():
196 | rev_adict[v] = k
197 | self.rev_adict = rev_adict
198 | 
199 | return self.rev_adict[ans_symbol]
200 | 
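# A worked sketch of qlist_to_vec's left-padding (toy question, illustrative vocabulary):
#     max_length = 5, q_list = ['is', 'it', 'raining']
#     positions 0-1 are padding          -> qvec[0:2] = 0, cvec[0:2] = 0
#     position 2 is the first real token -> qvec[2] = vdict['is'], cvec[2] = 0
#     positions 3-4 continue the sequence -> cvec[3] = cvec[4] = 1
#     qvec = [0, 0, vdict['is'], vdict['it'], vdict['raining']], cvec = [0, 0, 0, 1, 1]
# The 0-then-1 pattern in cvec is the sequence-continuation input Caffe's recurrent layers expect;
# out-of-vocabulary words fall back to the empty-string entry of vdict.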
201 | def create_batch(self,qid_list):
202 | 
203 | qvec = (np.zeros(self.batchsize*self.max_length)).reshape(self.batchsize,self.max_length)
204 | cvec = (np.zeros(self.batchsize*self.max_length)).reshape(self.batchsize,self.max_length)
205 | ivec = (np.zeros(self.batchsize*2048)).reshape(self.batchsize,2048)
206 | avec = (np.zeros(self.batchsize*config.NUM_OUTPUT_UNITS)).reshape(self.batchsize,config.NUM_OUTPUT_UNITS)
207 | 
208 | for i,qid in enumerate(qid_list):
209 | 
210 | # load raw question information
211 | q_str = self.getQuesStr(qid)
212 | q_ans = self.getAnsObj(qid)
213 | q_iid = self.getImgId(qid)
214 | 
215 | # convert question to vec
216 | q_list = VQADataProvider.seq_to_list(q_str)
217 | t_qvec, t_cvec = self.qlist_to_vec(self.max_length, q_list)
218 | 
219 | try:
220 | qid_split = qid.split(QID_KEY_SEPARATOR)
221 | data_split = qid_split[0]
222 | if data_split == 'genome':
223 | t_ivec = np.load(config.DATA_PATHS['genome']['features_prefix'] + str(q_iid) + '.jpg.npz')['x']
224 | else:
225 | t_ivec = np.load(config.DATA_PATHS[data_split]['features_prefix'] + str(q_iid).zfill(12) + '.jpg.npz')['x']
226 | t_ivec = ( t_ivec / np.sqrt((t_ivec**2).sum()) ) # L2-normalize the image feature
227 | except:
228 | t_ivec = 0.
229 | print 'data not found for image id : ', q_iid, self.mode
230 | 
231 | # convert answer to vec: hard label for evaluation, soft distribution for KLD training
232 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test':
233 | q_ans_str = self.extract_answer(q_ans)
234 | t_avec = self.answer_to_vec(q_ans_str)
235 | else:
236 | t_avec = self.extract_answer_list(q_ans)
237 | 
238 | qvec[i,...] = t_qvec
239 | cvec[i,...] = t_cvec
240 | ivec[i,...] = t_ivec
241 | avec[i,...] = t_avec
242 | 
243 | return qvec, cvec, ivec, avec
244 | 
245 | 
246 | def get_batch_vec(self):
247 | if self.batch_len is None: # first call: build and shuffle the question-id list
248 | self.n_skipped = 0
249 | qid_list = self.getQuesIds()
250 | random.shuffle(qid_list)
251 | self.qid_list = qid_list
252 | self.batch_len = len(qid_list)
253 | self.batch_index = 0
254 | self.epoch_counter = 0
255 | 
256 | def has_at_least_one_valid_answer(t_qid): # at least one annotator answer is in the vocabulary
257 | answer_obj = self.getAnsObj(t_qid)
258 | answer_list = [ans['answer'] for ans in answer_obj]
259 | for ans in answer_list:
260 | if self.adict.has_key(ans):
261 | return True
262 | 
263 | counter = 0
264 | t_qid_list = []
265 | t_iid_list = []
266 | while counter < self.batchsize:
267 | t_qid = self.qid_list[self.batch_index]
268 | t_iid = self.getImgId(t_qid)
269 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test':
270 | t_qid_list.append(t_qid)
271 | t_iid_list.append(t_iid)
272 | counter += 1
273 | elif has_at_least_one_valid_answer(t_qid):
274 | t_qid_list.append(t_qid)
275 | t_iid_list.append(t_iid)
276 | counter += 1
277 | else:
278 | self.n_skipped += 1
279 | 
280 | if self.batch_index < self.batch_len-1:
281 | self.batch_index += 1
282 | else: # end of epoch: reshuffle and start over
283 | self.epoch_counter += 1
284 | qid_list = self.getQuesIds()
285 | random.shuffle(qid_list)
286 | self.qid_list = qid_list
287 | self.batch_index = 0
288 | print("%d questions were skipped in a single epoch" % self.n_skipped)
289 | self.n_skipped = 0
290 | 
291 | t_batch = self.create_batch(t_qid_list)
292 | return t_batch + (t_qid_list, t_iid_list, self.epoch_counter)
293 | 
294 | 
295 | class VQADataProviderLayer(caffe.Layer):
296 | """
297 | Provide input data for VQA.
298 | """
299 | 
300 | def setup(self, bottom, top):
301 | self.batchsize = json.loads(self.param_str)['batchsize']
302 | self.top_names = ['data','cont','feature','label']
303 | top[0].reshape(15,self.batchsize)
304 | top[1].reshape(15,self.batchsize)
305 | top[2].reshape(self.batchsize,2048)
306 | top[3].reshape(self.batchsize,config.NUM_OUTPUT_UNITS)
307 | 
308 | self.mode = json.loads(self.param_str)['mode']
309 | self.folder = json.loads(self.param_str)['folder']
310 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test':
311 | pass
312 | else:
313 | self.dp = VQADataProvider(batchsize=self.batchsize, mode=self.mode, folder=self.folder)
314 | 
315 | def reshape(self, bottom, top):
316 | pass
317 | 
318 | def forward(self, bottom, top):
319 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test':
320 | pass
321 | else:
322 | word, cont, feature, answer, _, _, _ = self.dp.get_batch_vec()
323 | top[0].data[...] = np.transpose(word,(1,0)) # N x T -> T x N
324 | top[1].data[...] = np.transpose(cont,(1,0))
325 | top[2].data[...] = feature
326 | top[3].data[...] = answer
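# How this layer is wired: setup() parses a JSON param_str, so the training prototxt declares
# VQADataProviderLayer as a Caffe Python layer and passes something like
#     {"batchsize": 200, "mode": "train", "folder": "result"}
# (values illustrative; the repo's own prototxt may differ). The provider can also be exercised
# directly; a minimal sketch, assuming vdict.json/adict.json exist under ./result and the
# question/feature paths in config.py are valid:
#
#     dp = VQADataProvider(folder='result', batchsize=2, mode='train')
#     qvec, cvec, ivec, avec, qids, iids, epoch = dp.get_batch_vec()
#     # qvec, cvec: (2, 15) word indices / continuation flags, transposed to T x N in forward()
#     # ivec: (2, 2048) L2-normalized pool5 features
#     # avec: (2, config.NUM_OUTPUT_UNITS) soft answer distributions for the KLD loss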
327 | 
328 | def backward(self, top, propagate_down, bottom):
329 | pass
330 | 
331 | 
--------------------------------------------------------------------------------
/mfh_baseline/vqa_data_layer_kld.py:
--------------------------------------------------------------------------------
1 | import caffe
2 | import numpy as np
3 | import re, json, random
4 | import config
5 | 
6 | QID_KEY_SEPARATOR = '/'
7 | GLOVE_EMBEDDING_SIZE = 300
8 | 
9 | class VQADataProvider:
10 | 
11 | def __init__(self, folder='./result', batchsize=64, max_length=15, mode='train'):
12 | self.batchsize = batchsize
13 | self.d_vocabulary = None
14 | self.batch_index = None
15 | self.batch_len = None
16 | self.rev_adict = None
17 | self.max_length = max_length
18 | self.mode = mode
19 | self.qdic, self.adic = VQADataProvider.load_data(mode)
20 | 
21 | with open('./%s/vdict.json'%folder,'r') as f:
22 | self.vdict = json.load(f)
23 | with open('./%s/adict.json'%folder,'r') as f:
24 | self.adict = json.load(f)
25 | 
26 | 
27 | @staticmethod
28 | def load_vqa_json(data_split):
29 | """
30 | Parses the question and answer json files for the given data split.
31 | Returns the question dictionary and the answer dictionary.
32 | """
33 | qdic, adic = {}, {}
34 | 
35 | with open(config.DATA_PATHS[data_split]['ques_file'], 'r') as f:
36 | qdata = json.load(f)['questions']
37 | for q in qdata:
38 | qdic[data_split + QID_KEY_SEPARATOR + str(q['question_id'])] = \
39 | {'qstr': q['question'], 'iid': q['image_id']}
40 | 
41 | if 'test' not in data_split:
42 | with open(config.DATA_PATHS[data_split]['ans_file'], 'r') as f:
43 | adata = json.load(f)['annotations']
44 | for a in adata:
45 | adic[data_split + QID_KEY_SEPARATOR + str(a['question_id'])] = \
46 | a['answers']
47 | 
48 | print 'parsed', len(qdic), 'questions for', data_split
49 | return qdic, adic
50 | 
51 | @staticmethod
52 | def load_genome_json():
53 | """
54 | Parses the genome json file. Returns the question dictionary and the
55 | answer dictionary.
56 | """ 57 | qdic, adic = {}, {} 58 | 59 | with open(config.DATA_PATHS['genome']['genome_file'], 'r') as f: 60 | qdata = json.load(f) 61 | for q in qdata: 62 | key = 'genome' + QID_KEY_SEPARATOR + str(q['id']) 63 | qdic[key] = {'qstr': q['question'], 'iid': q['image']} 64 | adic[key] = [{'answer': q['answer']}] 65 | 66 | print 'parsed', len(qdic), 'questions for genome' 67 | return qdic, adic 68 | 69 | @staticmethod 70 | def load_data(data_split_str): 71 | all_qdic, all_adic = {}, {} 72 | for data_split in data_split_str.split('+'): 73 | assert data_split in config.DATA_PATHS.keys(), 'unknown data split' 74 | if data_split == 'genome': 75 | qdic, adic = VQADataProvider.load_genome_json() 76 | all_qdic.update(qdic) 77 | all_adic.update(adic) 78 | else: 79 | qdic, adic = VQADataProvider.load_vqa_json(data_split) 80 | all_qdic.update(qdic) 81 | all_adic.update(adic) 82 | return all_qdic, all_adic 83 | 84 | def getQuesIds(self): 85 | return self.qdic.keys() 86 | 87 | def getStrippedQuesId(self, qid): 88 | return qid.split(QID_KEY_SEPARATOR)[1] 89 | 90 | def getImgId(self,qid): 91 | return self.qdic[qid]['iid'] 92 | 93 | def getQuesStr(self,qid): 94 | return self.qdic[qid]['qstr'] 95 | 96 | def getAnsObj(self,qid): 97 | if self.mode == 'test-dev' or self.mode == 'test': 98 | return -1 99 | return self.adic[qid] 100 | 101 | @staticmethod 102 | def seq_to_list(s): 103 | t_str = s.lower() 104 | for i in [r'\?',r'\!',r'\'',r'\"',r'\$',r'\:',r'\@',r'\(',r'\)',r'\,',r'\.',r'\;']: 105 | t_str = re.sub( i, '', t_str) 106 | for i in [r'\-',r'\/']: 107 | t_str = re.sub( i, ' ', t_str) 108 | q_list = re.sub(r'\?','',t_str.lower()).split(' ') 109 | q_list = filter(lambda x: len(x) > 0, q_list) 110 | return q_list 111 | 112 | def extract_answer(self,answer_obj): 113 | """ Return the most popular answer in string.""" 114 | if self.mode == 'test-dev' or self.mode == 'test': 115 | return -1 116 | answer_list = [ answer_obj[i]['answer'] for i in xrange(10)] 117 | dic = {} 118 | for ans in answer_list: 119 | if dic.has_key(ans): 120 | dic[ans] +=1 121 | else: 122 | dic[ans] = 1 123 | max_key = max((v,k) for (k,v) in dic.items())[1] 124 | return max_key 125 | 126 | def extract_answer_prob(self,answer_obj): 127 | """ Return the most popular answer in string.""" 128 | if self.mode == 'test-dev' or self.mode == 'test': 129 | return -1 130 | 131 | answer_list = [ ans['answer'] for ans in answer_obj] 132 | prob_answer_list = [] 133 | for ans in answer_list: 134 | if self.adict.has_key(ans): 135 | prob_answer_list.append(ans) 136 | def extract_answer_list(self,answer_obj): 137 | answer_list = [ ans['answer'] for ans in answer_obj] 138 | prob_answer_vec = np.zeros(config.NUM_OUTPUT_UNITS) 139 | for ans in answer_list: 140 | if self.adict.has_key(ans): 141 | index = self.adict[ans] 142 | prob_answer_vec[index] += 1 143 | return prob_answer_vec / np.sum(prob_answer_vec) 144 | 145 | if len(prob_answer_list) == 0: 146 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 147 | return 'hoge' 148 | else: 149 | raise Exception("This should not happen.") 150 | else: 151 | return random.choice(prob_answer_list) 152 | 153 | def qlist_to_vec(self, max_length, q_list): 154 | """ 155 | Converts a list of words into a format suitable for the embedding layer. 
156 | 157 | Arguments: 158 | max_length -- the maximum length of a question sequence 159 | q_list -- a list of words which are the tokens in the question 160 | 161 | Returns: 162 | qvec -- A max_length length vector containing one-hot indices for each word 163 | cvec -- A max_length length sequence continuation indicator vector 164 | """ 165 | qvec = np.zeros(max_length) 166 | cvec = np.zeros(max_length) 167 | for i in xrange(max_length): 168 | if i < max_length - len(q_list): 169 | cvec[i] = 0 170 | else: 171 | w = q_list[i-(max_length-len(q_list))] 172 | # is the word in the vocabulary? 173 | if self.vdict.has_key(w) is False: 174 | w = '' 175 | qvec[i] = self.vdict[w] 176 | cvec[i] = 0 if i == max_length - len(q_list) else 1 177 | 178 | return qvec, cvec 179 | 180 | def answer_to_vec(self, ans_str): 181 | """ Return answer id if the answer is included in vocabulary otherwise '' """ 182 | if self.mode =='test-dev' or self.mode == 'test': 183 | return -1 184 | 185 | if self.adict.has_key(ans_str): 186 | ans = self.adict[ans_str] 187 | else: 188 | ans = self.adict[''] 189 | return ans 190 | 191 | def vec_to_answer(self, ans_symbol): 192 | """ Return answer id if the answer is included in vocabulary otherwise '' """ 193 | if self.rev_adict is None: 194 | rev_adict = {} 195 | for k,v in self.adict.items(): 196 | rev_adict[v] = k 197 | self.rev_adict = rev_adict 198 | 199 | return self.rev_adict[ans_symbol] 200 | 201 | def create_batch(self,qid_list): 202 | 203 | qvec = (np.zeros(self.batchsize*self.max_length)).reshape(self.batchsize,self.max_length) 204 | cvec = (np.zeros(self.batchsize*self.max_length)).reshape(self.batchsize,self.max_length) 205 | ivec = (np.zeros(self.batchsize*2048)).reshape(self.batchsize,2048) 206 | avec = (np.zeros(self.batchsize*config.NUM_OUTPUT_UNITS)).reshape(self.batchsize,config.NUM_OUTPUT_UNITS) 207 | 208 | for i,qid in enumerate(qid_list): 209 | 210 | # load raw question information 211 | q_str = self.getQuesStr(qid) 212 | q_ans = self.getAnsObj(qid) 213 | q_iid = self.getImgId(qid) 214 | 215 | # convert question to vec 216 | q_list = VQADataProvider.seq_to_list(q_str) 217 | t_qvec, t_cvec = self.qlist_to_vec(self.max_length, q_list) 218 | 219 | try: 220 | qid_split = qid.split(QID_KEY_SEPARATOR) 221 | data_split = qid_split[0] 222 | if data_split == 'genome': 223 | t_ivec = np.load(config.DATA_PATHS['genome']['features_prefix'] + str(q_iid) + '.jpg.npz')['x'] 224 | else: 225 | t_ivec = np.load(config.DATA_PATHS[data_split]['features_prefix'] + str(q_iid).zfill(12) + '.jpg.npz')['x'] 226 | t_ivec = ( t_ivec / np.sqrt((t_ivec**2).sum()) ) 227 | except: 228 | t_ivec = 0. 229 | print 'data not found for qid : ', q_iid, self.mode 230 | 231 | # convert answer to vec 232 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 233 | q_ans_str = self.extract_answer(q_ans) 234 | t_avec = self.answer_to_vec(q_ans_str) 235 | else: 236 | t_avec = self.extract_answer_list(q_ans) 237 | 238 | qvec[i,...] = t_qvec 239 | cvec[i,...] = t_cvec 240 | ivec[i,...] = t_ivec 241 | avec[i,...] 
= t_avec 242 | 243 | return qvec, cvec, ivec, avec 244 | 245 | 246 | def get_batch_vec(self): 247 | if self.batch_len is None: 248 | self.n_skipped = 0 249 | qid_list = self.getQuesIds() 250 | random.shuffle(qid_list) 251 | self.qid_list = qid_list 252 | self.batch_len = len(qid_list) 253 | self.batch_index = 0 254 | self.epoch_counter = 0 255 | 256 | def has_at_least_one_valid_answer(t_qid): 257 | answer_obj = self.getAnsObj(t_qid) 258 | answer_list = [ans['answer'] for ans in answer_obj] 259 | for ans in answer_list: 260 | if self.adict.has_key(ans): 261 | return True 262 | 263 | counter = 0 264 | t_qid_list = [] 265 | t_iid_list = [] 266 | while counter < self.batchsize: 267 | t_qid = self.qid_list[self.batch_index] 268 | t_iid = self.getImgId(t_qid) 269 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 270 | t_qid_list.append(t_qid) 271 | t_iid_list.append(t_iid) 272 | counter += 1 273 | elif has_at_least_one_valid_answer(t_qid): 274 | t_qid_list.append(t_qid) 275 | t_iid_list.append(t_iid) 276 | counter += 1 277 | else: 278 | self.n_skipped += 1 279 | 280 | if self.batch_index < self.batch_len-1: 281 | self.batch_index += 1 282 | else: 283 | self.epoch_counter += 1 284 | qid_list = self.getQuesIds() 285 | random.shuffle(qid_list) 286 | self.qid_list = qid_list 287 | self.batch_index = 0 288 | print("%d questions were skipped in a single epoch" % self.n_skipped) 289 | self.n_skipped = 0 290 | 291 | t_batch = self.create_batch(t_qid_list) 292 | return t_batch + (t_qid_list, t_iid_list, self.epoch_counter) 293 | 294 | 295 | class VQADataProviderLayer(caffe.Layer): 296 | """ 297 | Provide input data for VQA. 298 | """ 299 | 300 | def setup(self, bottom, top): 301 | self.batchsize = json.loads(self.param_str)['batchsize'] 302 | self.top_names = ['data','cont','feature','label'] 303 | top[0].reshape(15,self.batchsize) 304 | top[1].reshape(15,self.batchsize) 305 | top[2].reshape(self.batchsize,2048) 306 | top[3].reshape(self.batchsize,config.NUM_OUTPUT_UNITS) 307 | 308 | self.mode = json.loads(self.param_str)['mode'] 309 | self.folder = json.loads(self.param_str)['folder'] 310 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 311 | pass 312 | else: 313 | self.dp = VQADataProvider(batchsize=self.batchsize, mode=self.mode, folder=self.folder) 314 | 315 | def reshape(self, bottom, top): 316 | pass 317 | 318 | def forward(self, bottom, top): 319 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 320 | pass 321 | else: 322 | word, cont, feature, answer, _, _, _ = self.dp.get_batch_vec() 323 | top[0].data[...] = np.transpose(word,(1,0)) # N x T -> T x N 324 | top[1].data[...] = np.transpose(cont,(1,0)) 325 | top[2].data[...] = feature 326 | top[3].data[...] 
= answer 327 | 328 | def backward(self, top, propagate_down, bottom): 329 | pass 330 | 331 | -------------------------------------------------------------------------------- /mfb_coatt_glove/vqa_data_layer.py: -------------------------------------------------------------------------------- 1 | import caffe 2 | import numpy as np 3 | import re, json, random 4 | import config 5 | import spacy 6 | 7 | QID_KEY_SEPARATOR = '/' 8 | GLOVE_EMBEDDING_SIZE = 300 9 | 10 | class VQADataProvider: 11 | 12 | def __init__(self, folder='result', batchsize=64, max_length=15, mode='train'): 13 | self.batchsize = batchsize 14 | self.d_vocabulary = None 15 | self.batch_index = None 16 | self.batch_len = None 17 | self.rev_adict = None 18 | self.max_length = max_length 19 | self.mode = mode 20 | self.qdic, self.adic = VQADataProvider.load_data(mode) 21 | 22 | with open('./%s/vdict.json'%folder,'r') as f: 23 | self.vdict = json.load(f) 24 | with open('./%s/adict.json'%folder,'r') as f: 25 | self.adict = json.load(f) 26 | 27 | self.n_ans_vocabulary = len(self.adict) 28 | self.nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors') 29 | self.glove_dict = {} # word -> glove vector 30 | 31 | @staticmethod 32 | def load_vqa_json(data_split): 33 | """ 34 | Parses the question and answer json files for the given data split. 35 | Returns the question dictionary and the answer dictionary. 36 | """ 37 | qdic, adic = {}, {} 38 | 39 | with open(config.DATA_PATHS[data_split]['ques_file'], 'r') as f: 40 | qdata = json.load(f)['questions'] 41 | for q in qdata: 42 | qdic[data_split + QID_KEY_SEPARATOR + str(q['question_id'])] = \ 43 | {'qstr': q['question'], 'iid': q['image_id']} 44 | 45 | if 'test' not in data_split: 46 | with open(config.DATA_PATHS[data_split]['ans_file'], 'r') as f: 47 | adata = json.load(f)['annotations'] 48 | for a in adata: 49 | adic[data_split + QID_KEY_SEPARATOR + str(a['question_id'])] = \ 50 | a['answers'] 51 | 52 | print 'parsed', len(qdic), 'questions for', data_split 53 | return qdic, adic 54 | 55 | @staticmethod 56 | def load_genome_json(): 57 | """ 58 | Parses the genome json file. Returns the question dictionary and the 59 | answer dictionary. 
60 | """ 61 | qdic, adic = {}, {} 62 | 63 | with open(config.DATA_PATHS['genome']['genome_file'], 'r') as f: 64 | qdata = json.load(f) 65 | for q in qdata: 66 | key = 'genome' + QID_KEY_SEPARATOR + str(q['id']) 67 | qdic[key] = {'qstr': q['question'], 'iid': q['image']} 68 | adic[key] = [{'answer': q['answer']}] 69 | 70 | print 'parsed', len(qdic), 'questions for genome' 71 | return qdic, adic 72 | 73 | @staticmethod 74 | def load_data(data_split_str): 75 | all_qdic, all_adic = {}, {} 76 | for data_split in data_split_str.split('+'): 77 | assert data_split in config.DATA_PATHS.keys(), 'unknown data split' 78 | if data_split == 'genome': 79 | qdic, adic = VQADataProvider.load_genome_json() 80 | all_qdic.update(qdic) 81 | all_adic.update(adic) 82 | else: 83 | qdic, adic = VQADataProvider.load_vqa_json(data_split) 84 | all_qdic.update(qdic) 85 | all_adic.update(adic) 86 | return all_qdic, all_adic 87 | 88 | def getQuesIds(self): 89 | return self.qdic.keys() 90 | 91 | def getStrippedQuesId(self, qid): 92 | return qid.split(QID_KEY_SEPARATOR)[1] 93 | 94 | def getImgId(self,qid): 95 | return self.qdic[qid]['iid'] 96 | 97 | def getQuesStr(self,qid): 98 | return self.qdic[qid]['qstr'] 99 | 100 | def getAnsObj(self,qid): 101 | if self.mode == 'test-dev' or self.mode == 'test': 102 | return -1 103 | return self.adic[qid] 104 | 105 | @staticmethod 106 | def seq_to_list(s): 107 | t_str = s.lower() 108 | for i in [r'\?',r'\!',r'\'',r'\"',r'\$',r'\:',r'\@',r'\(',r'\)',r'\,',r'\.',r'\;']: 109 | t_str = re.sub( i, '', t_str) 110 | for i in [r'\-',r'\/']: 111 | t_str = re.sub( i, ' ', t_str) 112 | q_list = re.sub(r'\?','',t_str.lower()).split(' ') 113 | q_list = filter(lambda x: len(x) > 0, q_list) 114 | return q_list 115 | 116 | def extract_answer(self,answer_obj): 117 | """ Return the most popular answer in string.""" 118 | if self.mode == 'test-dev' or self.mode == 'test': 119 | return -1 120 | answer_list = [ answer_obj[i]['answer'] for i in xrange(10)] 121 | dic = {} 122 | for ans in answer_list: 123 | if dic.has_key(ans): 124 | dic[ans] +=1 125 | else: 126 | dic[ans] = 1 127 | max_key = max((v,k) for (k,v) in dic.items())[1] 128 | return max_key 129 | 130 | def extract_answer_prob(self,answer_obj): 131 | """ Return the most popular answer in string.""" 132 | if self.mode == 'test-dev' or self.mode == 'test': 133 | return -1 134 | 135 | answer_list = [ ans['answer'] for ans in answer_obj] 136 | prob_answer_list = [] 137 | for ans in answer_list: 138 | if self.adict.has_key(ans): 139 | prob_answer_list.append(ans) 140 | 141 | if len(prob_answer_list) == 0: 142 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 143 | return 'hoge' 144 | else: 145 | raise Exception("This should not happen.") 146 | else: 147 | return random.choice(prob_answer_list) 148 | 149 | def qlist_to_vec(self, max_length, q_list): 150 | """ 151 | Converts a list of words into a format suitable for the embedding layer. 
152 | 153 | Arguments: 154 | max_length -- the maximum length of a question sequence 155 | q_list -- a list of words which are the tokens in the question 156 | 157 | Returns: 158 | qvec -- A max_length length vector containing one-hot indices for each word 159 | cvec -- A max_length length sequence continuation indicator vector 160 | glove_matrix -- A max_length x GLOVE_EMBEDDING_SIZE matrix containing the glove embedding for 161 | each word 162 | """ 163 | qvec = np.zeros(max_length) 164 | cvec = np.zeros(max_length) 165 | glove_matrix = np.zeros(max_length * GLOVE_EMBEDDING_SIZE).reshape(max_length, GLOVE_EMBEDDING_SIZE) 166 | for i in xrange(max_length): 167 | if i < max_length - len(q_list): 168 | cvec[i] = 0 169 | else: 170 | w = q_list[i-(max_length-len(q_list))] 171 | if w not in self.glove_dict: 172 | self.glove_dict[w] = self.nlp(u'%s' % w).vector 173 | glove_matrix[i] = self.glove_dict[w] 174 | # is the word in the vocabulary? 175 | if self.vdict.has_key(w) is False: 176 | w = '' 177 | qvec[i] = self.vdict[w] 178 | cvec[i] = 0 if i == max_length - len(q_list) else 1 179 | 180 | return qvec, cvec, glove_matrix 181 | 182 | def answer_to_vec(self, ans_str): 183 | """ Return answer id if the answer is included in vocabulary otherwise '' """ 184 | if self.mode =='test-dev' or self.mode == 'test': 185 | return -1 186 | 187 | if self.adict.has_key(ans_str): 188 | ans = self.adict[ans_str] 189 | else: 190 | ans = self.adict[''] 191 | return ans 192 | 193 | def vec_to_answer(self, ans_symbol): 194 | """ Return answer id if the answer is included in vocabulary otherwise '' """ 195 | if self.rev_adict is None: 196 | rev_adict = {} 197 | for k,v in self.adict.items(): 198 | rev_adict[v] = k 199 | self.rev_adict = rev_adict 200 | 201 | return self.rev_adict[ans_symbol] 202 | 203 | def create_batch(self,qid_list): 204 | 205 | qvec = (np.zeros(self.batchsize*self.max_length)).reshape(self.batchsize,self.max_length) 206 | cvec = (np.zeros(self.batchsize*self.max_length)).reshape(self.batchsize,self.max_length) 207 | ivec = (np.zeros(self.batchsize*2048*14*14)).reshape(self.batchsize,2048,14,14) 208 | avec = (np.zeros(self.batchsize)).reshape(self.batchsize) 209 | glove_matrix = np.zeros(self.batchsize * self.max_length * GLOVE_EMBEDDING_SIZE).reshape(\ 210 | self.batchsize, self.max_length, GLOVE_EMBEDDING_SIZE) 211 | 212 | for i,qid in enumerate(qid_list): 213 | 214 | # load raw question information 215 | q_str = self.getQuesStr(qid) 216 | q_ans = self.getAnsObj(qid) 217 | q_iid = self.getImgId(qid) 218 | 219 | # convert question to vec 220 | q_list = VQADataProvider.seq_to_list(q_str) 221 | t_qvec, t_cvec, t_glove_matrix = self.qlist_to_vec(self.max_length, q_list) 222 | 223 | try: 224 | qid_split = qid.split(QID_KEY_SEPARATOR) 225 | data_split = qid_split[0] 226 | if data_split == 'genome': 227 | t_ivec = np.load(config.DATA_PATHS['genome']['features_prefix'] + str(q_iid) + '.jpg.npz')['x'] 228 | else: 229 | t_ivec = np.load(config.DATA_PATHS[data_split]['features_prefix'] + str(q_iid).zfill(12) + '.jpg.npz')['x'] 230 | t_ivec = ( t_ivec / np.sqrt((t_ivec**2).sum()) ) 231 | except: 232 | t_ivec = 0. 233 | print 'data not found for qid : ', q_iid, self.mode 234 | 235 | # convert answer to vec 236 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 237 | q_ans_str = self.extract_answer(q_ans) 238 | else: 239 | q_ans_str = self.extract_answer_prob(q_ans) 240 | t_avec = self.answer_to_vec(q_ans_str) 241 | qvec[i,...] = t_qvec 242 | cvec[i,...] = t_cvec 243 | ivec[i,...] 
= t_ivec 244 | avec[i,...] = t_avec 245 | glove_matrix[i,...] = t_glove_matrix 246 | 247 | return qvec, cvec, ivec, avec, glove_matrix 248 | 249 | 250 | def get_batch_vec(self): 251 | if self.batch_len is None: 252 | self.n_skipped = 0 253 | qid_list = self.getQuesIds() 254 | random.shuffle(qid_list) 255 | self.qid_list = qid_list 256 | self.batch_len = len(qid_list) 257 | self.batch_index = 0 258 | self.epoch_counter = 0 259 | 260 | def has_at_least_one_valid_answer(t_qid): 261 | answer_obj = self.getAnsObj(t_qid) 262 | answer_list = [ans['answer'] for ans in answer_obj] 263 | for ans in answer_list: 264 | if self.adict.has_key(ans): 265 | return True 266 | 267 | counter = 0 268 | t_qid_list = [] 269 | t_iid_list = [] 270 | while counter < self.batchsize: 271 | t_qid = self.qid_list[self.batch_index] 272 | t_iid = self.getImgId(t_qid) 273 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 274 | t_qid_list.append(t_qid) 275 | t_iid_list.append(t_iid) 276 | counter += 1 277 | elif has_at_least_one_valid_answer(t_qid): 278 | t_qid_list.append(t_qid) 279 | t_iid_list.append(t_iid) 280 | counter += 1 281 | else: 282 | self.n_skipped += 1 283 | 284 | if self.batch_index < self.batch_len-1: 285 | self.batch_index += 1 286 | else: 287 | self.epoch_counter += 1 288 | qid_list = self.getQuesIds() 289 | random.shuffle(qid_list) 290 | self.qid_list = qid_list 291 | self.batch_index = 0 292 | print("%d questions were skipped in a single epoch" % self.n_skipped) 293 | self.n_skipped = 0 294 | 295 | t_batch = self.create_batch(t_qid_list) 296 | return t_batch + (t_qid_list, t_iid_list, self.epoch_counter) 297 | 298 | 299 | class VQADataProviderLayer(caffe.Layer): 300 | """ 301 | Provide input data for VQA. 302 | """ 303 | 304 | def setup(self, bottom, top): 305 | self.batchsize = json.loads(self.param_str)['batchsize'] 306 | self.top_names = ['data','cont','feature','label','glove'] 307 | top[0].reshape(15,self.batchsize) 308 | top[1].reshape(15,self.batchsize) 309 | top[2].reshape(self.batchsize,2048,14,14) 310 | top[3].reshape(self.batchsize) 311 | top[4].reshape(15,self.batchsize,GLOVE_EMBEDDING_SIZE) 312 | 313 | self.mode = json.loads(self.param_str)['mode'] 314 | self.folder = json.loads(self.param_str)['folder'] 315 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 316 | pass 317 | else: 318 | self.dp = VQADataProvider(batchsize=self.batchsize, mode=self.mode, folder=self.folder) 319 | 320 | def reshape(self, bottom, top): 321 | pass 322 | 323 | def forward(self, bottom, top): 324 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 325 | pass 326 | else: 327 | word, cont, feature, answer, glove_matrix, _, _, _ = self.dp.get_batch_vec() 328 | top[0].data[...] = np.transpose(word,(1,0)) # N x T -> T x N 329 | top[1].data[...] = np.transpose(cont,(1,0)) 330 | top[2].data[...] = feature 331 | top[3].data[...] = answer 332 | top[4].data[...] 
= np.transpose(glove_matrix, (1,0,2)) # N x T x 300 -> T x N x 300 333 | 334 | def backward(self, top, propagate_down, bottom): 335 | pass 336 | 337 | -------------------------------------------------------------------------------- /mfh_coatt_glove/vqa_data_layer.py: -------------------------------------------------------------------------------- 1 | import caffe 2 | import numpy as np 3 | import re, json, random 4 | import config 5 | import spacy 6 | 7 | QID_KEY_SEPARATOR = '/' 8 | GLOVE_EMBEDDING_SIZE = 300 9 | 10 | class VQADataProvider: 11 | 12 | def __init__(self, folder='result', batchsize=64, max_length=15, mode='train'): 13 | self.batchsize = batchsize 14 | self.d_vocabulary = None 15 | self.batch_index = None 16 | self.batch_len = None 17 | self.rev_adict = None 18 | self.max_length = max_length 19 | self.mode = mode 20 | self.qdic, self.adic = VQADataProvider.load_data(mode) 21 | 22 | with open('./%s/vdict.json'%folder,'r') as f: 23 | self.vdict = json.load(f) 24 | with open('./%s/adict.json'%folder,'r') as f: 25 | self.adict = json.load(f) 26 | 27 | self.n_ans_vocabulary = len(self.adict) 28 | self.nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors') 29 | self.glove_dict = {} # word -> glove vector 30 | 31 | @staticmethod 32 | def load_vqa_json(data_split): 33 | """ 34 | Parses the question and answer json files for the given data split. 35 | Returns the question dictionary and the answer dictionary. 36 | """ 37 | qdic, adic = {}, {} 38 | 39 | with open(config.DATA_PATHS[data_split]['ques_file'], 'r') as f: 40 | qdata = json.load(f)['questions'] 41 | for q in qdata: 42 | qdic[data_split + QID_KEY_SEPARATOR + str(q['question_id'])] = \ 43 | {'qstr': q['question'], 'iid': q['image_id']} 44 | 45 | if 'test' not in data_split: 46 | with open(config.DATA_PATHS[data_split]['ans_file'], 'r') as f: 47 | adata = json.load(f)['annotations'] 48 | for a in adata: 49 | adic[data_split + QID_KEY_SEPARATOR + str(a['question_id'])] = \ 50 | a['answers'] 51 | 52 | print 'parsed', len(qdic), 'questions for', data_split 53 | return qdic, adic 54 | 55 | @staticmethod 56 | def load_genome_json(): 57 | """ 58 | Parses the genome json file. Returns the question dictionary and the 59 | answer dictionary. 
60 | """ 61 | qdic, adic = {}, {} 62 | 63 | with open(config.DATA_PATHS['genome']['genome_file'], 'r') as f: 64 | qdata = json.load(f) 65 | for q in qdata: 66 | key = 'genome' + QID_KEY_SEPARATOR + str(q['id']) 67 | qdic[key] = {'qstr': q['question'], 'iid': q['image']} 68 | adic[key] = [{'answer': q['answer']}] 69 | 70 | print 'parsed', len(qdic), 'questions for genome' 71 | return qdic, adic 72 | 73 | @staticmethod 74 | def load_data(data_split_str): 75 | all_qdic, all_adic = {}, {} 76 | for data_split in data_split_str.split('+'): 77 | assert data_split in config.DATA_PATHS.keys(), 'unknown data split' 78 | if data_split == 'genome': 79 | qdic, adic = VQADataProvider.load_genome_json() 80 | all_qdic.update(qdic) 81 | all_adic.update(adic) 82 | else: 83 | qdic, adic = VQADataProvider.load_vqa_json(data_split) 84 | all_qdic.update(qdic) 85 | all_adic.update(adic) 86 | return all_qdic, all_adic 87 | 88 | def getQuesIds(self): 89 | return self.qdic.keys() 90 | 91 | def getStrippedQuesId(self, qid): 92 | return qid.split(QID_KEY_SEPARATOR)[1] 93 | 94 | def getImgId(self,qid): 95 | return self.qdic[qid]['iid'] 96 | 97 | def getQuesStr(self,qid): 98 | return self.qdic[qid]['qstr'] 99 | 100 | def getAnsObj(self,qid): 101 | if self.mode == 'test-dev' or self.mode == 'test': 102 | return -1 103 | return self.adic[qid] 104 | 105 | @staticmethod 106 | def seq_to_list(s): 107 | t_str = s.lower() 108 | for i in [r'\?',r'\!',r'\'',r'\"',r'\$',r'\:',r'\@',r'\(',r'\)',r'\,',r'\.',r'\;']: 109 | t_str = re.sub( i, '', t_str) 110 | for i in [r'\-',r'\/']: 111 | t_str = re.sub( i, ' ', t_str) 112 | q_list = re.sub(r'\?','',t_str.lower()).split(' ') 113 | q_list = filter(lambda x: len(x) > 0, q_list) 114 | return q_list 115 | 116 | def extract_answer(self,answer_obj): 117 | """ Return the most popular answer in string.""" 118 | if self.mode == 'test-dev' or self.mode == 'test': 119 | return -1 120 | answer_list = [ answer_obj[i]['answer'] for i in xrange(10)] 121 | dic = {} 122 | for ans in answer_list: 123 | if dic.has_key(ans): 124 | dic[ans] +=1 125 | else: 126 | dic[ans] = 1 127 | max_key = max((v,k) for (k,v) in dic.items())[1] 128 | return max_key 129 | 130 | def extract_answer_prob(self,answer_obj): 131 | """ Return the most popular answer in string.""" 132 | if self.mode == 'test-dev' or self.mode == 'test': 133 | return -1 134 | 135 | answer_list = [ ans['answer'] for ans in answer_obj] 136 | prob_answer_list = [] 137 | for ans in answer_list: 138 | if self.adict.has_key(ans): 139 | prob_answer_list.append(ans) 140 | 141 | if len(prob_answer_list) == 0: 142 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 143 | return 'hoge' 144 | else: 145 | raise Exception("This should not happen.") 146 | else: 147 | return random.choice(prob_answer_list) 148 | 149 | def qlist_to_vec(self, max_length, q_list): 150 | """ 151 | Converts a list of words into a format suitable for the embedding layer. 
152 | 153 | Arguments: 154 | max_length -- the maximum length of a question sequence 155 | q_list -- a list of words which are the tokens in the question 156 | 157 | Returns: 158 | qvec -- A max_length length vector containing one-hot indices for each word 159 | cvec -- A max_length length sequence continuation indicator vector 160 | glove_matrix -- A max_length x GLOVE_EMBEDDING_SIZE matrix containing the glove embedding for 161 | each word 162 | """ 163 | qvec = np.zeros(max_length) 164 | cvec = np.zeros(max_length) 165 | glove_matrix = np.zeros(max_length * GLOVE_EMBEDDING_SIZE).reshape(max_length, GLOVE_EMBEDDING_SIZE) 166 | for i in xrange(max_length): 167 | if i < max_length - len(q_list): 168 | cvec[i] = 0 169 | else: 170 | w = q_list[i-(max_length-len(q_list))] 171 | if w not in self.glove_dict: 172 | self.glove_dict[w] = self.nlp(u'%s' % w).vector 173 | glove_matrix[i] = self.glove_dict[w] 174 | # is the word in the vocabulary? 175 | if self.vdict.has_key(w) is False: 176 | w = '' 177 | qvec[i] = self.vdict[w] 178 | cvec[i] = 0 if i == max_length - len(q_list) else 1 179 | 180 | return qvec, cvec, glove_matrix 181 | 182 | def answer_to_vec(self, ans_str): 183 | """ Return answer id if the answer is included in vocabulary otherwise '' """ 184 | if self.mode =='test-dev' or self.mode == 'test': 185 | return -1 186 | 187 | if self.adict.has_key(ans_str): 188 | ans = self.adict[ans_str] 189 | else: 190 | ans = self.adict[''] 191 | return ans 192 | 193 | def vec_to_answer(self, ans_symbol): 194 | """ Return answer id if the answer is included in vocabulary otherwise '' """ 195 | if self.rev_adict is None: 196 | rev_adict = {} 197 | for k,v in self.adict.items(): 198 | rev_adict[v] = k 199 | self.rev_adict = rev_adict 200 | 201 | return self.rev_adict[ans_symbol] 202 | 203 | def create_batch(self,qid_list): 204 | 205 | qvec = (np.zeros(self.batchsize*self.max_length)).reshape(self.batchsize,self.max_length) 206 | cvec = (np.zeros(self.batchsize*self.max_length)).reshape(self.batchsize,self.max_length) 207 | ivec = (np.zeros(self.batchsize*2048*14*14)).reshape(self.batchsize,2048,14,14) 208 | avec = (np.zeros(self.batchsize)).reshape(self.batchsize) 209 | glove_matrix = np.zeros(self.batchsize * self.max_length * GLOVE_EMBEDDING_SIZE).reshape(\ 210 | self.batchsize, self.max_length, GLOVE_EMBEDDING_SIZE) 211 | 212 | for i,qid in enumerate(qid_list): 213 | 214 | # load raw question information 215 | q_str = self.getQuesStr(qid) 216 | q_ans = self.getAnsObj(qid) 217 | q_iid = self.getImgId(qid) 218 | 219 | # convert question to vec 220 | q_list = VQADataProvider.seq_to_list(q_str) 221 | t_qvec, t_cvec, t_glove_matrix = self.qlist_to_vec(self.max_length, q_list) 222 | 223 | try: 224 | qid_split = qid.split(QID_KEY_SEPARATOR) 225 | data_split = qid_split[0] 226 | if data_split == 'genome': 227 | t_ivec = np.load(config.DATA_PATHS['genome']['features_prefix'] + str(q_iid) + '.jpg.npz')['x'] 228 | else: 229 | t_ivec = np.load(config.DATA_PATHS[data_split]['features_prefix'] + str(q_iid).zfill(12) + '.jpg.npz')['x'] 230 | t_ivec = ( t_ivec / np.sqrt((t_ivec**2).sum()) ) 231 | except: 232 | t_ivec = 0. 233 | print 'data not found for qid : ', q_iid, self.mode 234 | 235 | # convert answer to vec 236 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 237 | q_ans_str = self.extract_answer(q_ans) 238 | else: 239 | q_ans_str = self.extract_answer_prob(q_ans) 240 | t_avec = self.answer_to_vec(q_ans_str) 241 | qvec[i,...] = t_qvec 242 | cvec[i,...] = t_cvec 243 | ivec[i,...] 
= t_ivec 244 | avec[i,...] = t_avec 245 | glove_matrix[i,...] = t_glove_matrix 246 | 247 | return qvec, cvec, ivec, avec, glove_matrix 248 | 249 | 250 | def get_batch_vec(self): 251 | if self.batch_len is None: 252 | self.n_skipped = 0 253 | qid_list = self.getQuesIds() 254 | random.shuffle(qid_list) 255 | self.qid_list = qid_list 256 | self.batch_len = len(qid_list) 257 | self.batch_index = 0 258 | self.epoch_counter = 0 259 | 260 | def has_at_least_one_valid_answer(t_qid): 261 | answer_obj = self.getAnsObj(t_qid) 262 | answer_list = [ans['answer'] for ans in answer_obj] 263 | for ans in answer_list: 264 | if self.adict.has_key(ans): 265 | return True 266 | 267 | counter = 0 268 | t_qid_list = [] 269 | t_iid_list = [] 270 | while counter < self.batchsize: 271 | t_qid = self.qid_list[self.batch_index] 272 | t_iid = self.getImgId(t_qid) 273 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 274 | t_qid_list.append(t_qid) 275 | t_iid_list.append(t_iid) 276 | counter += 1 277 | elif has_at_least_one_valid_answer(t_qid): 278 | t_qid_list.append(t_qid) 279 | t_iid_list.append(t_iid) 280 | counter += 1 281 | else: 282 | self.n_skipped += 1 283 | 284 | if self.batch_index < self.batch_len-1: 285 | self.batch_index += 1 286 | else: 287 | self.epoch_counter += 1 288 | qid_list = self.getQuesIds() 289 | random.shuffle(qid_list) 290 | self.qid_list = qid_list 291 | self.batch_index = 0 292 | print("%d questions were skipped in a single epoch" % self.n_skipped) 293 | self.n_skipped = 0 294 | 295 | t_batch = self.create_batch(t_qid_list) 296 | return t_batch + (t_qid_list, t_iid_list, self.epoch_counter) 297 | 298 | 299 | class VQADataProviderLayer(caffe.Layer): 300 | """ 301 | Provide input data for VQA. 302 | """ 303 | 304 | def setup(self, bottom, top): 305 | self.batchsize = json.loads(self.param_str)['batchsize'] 306 | self.top_names = ['data','cont','feature','label','glove'] 307 | top[0].reshape(15,self.batchsize) 308 | top[1].reshape(15,self.batchsize) 309 | top[2].reshape(self.batchsize,2048,14,14) 310 | top[3].reshape(self.batchsize) 311 | top[4].reshape(15,self.batchsize,GLOVE_EMBEDDING_SIZE) 312 | 313 | self.mode = json.loads(self.param_str)['mode'] 314 | self.folder = json.loads(self.param_str)['folder'] 315 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 316 | pass 317 | else: 318 | self.dp = VQADataProvider(batchsize=self.batchsize, mode=self.mode, folder=self.folder) 319 | 320 | def reshape(self, bottom, top): 321 | pass 322 | 323 | def forward(self, bottom, top): 324 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 325 | pass 326 | else: 327 | word, cont, feature, answer, glove_matrix, _, _, _ = self.dp.get_batch_vec() 328 | top[0].data[...] = np.transpose(word,(1,0)) # N x T -> T x N 329 | top[1].data[...] = np.transpose(cont,(1,0)) 330 | top[2].data[...] = feature 331 | top[3].data[...] = answer 332 | top[4].data[...] 
= np.transpose(glove_matrix, (1,0,2)) # N x T x 300 -> T x N x 300 333 | 334 | def backward(self, top, propagate_down, bottom): 335 | pass 336 | 337 | -------------------------------------------------------------------------------- /mfb_coatt_glove/vqa_data_layer_kld.py: -------------------------------------------------------------------------------- 1 | import caffe 2 | import numpy as np 3 | import re, json, random 4 | import config 5 | import spacy 6 | 7 | QID_KEY_SEPARATOR = '/' 8 | GLOVE_EMBEDDING_SIZE = 300 9 | 10 | class VQADataProvider: 11 | 12 | def __init__(self, folder='result', batchsize=64, max_length=15, mode='train'): 13 | self.batchsize = batchsize 14 | self.d_vocabulary = None 15 | self.batch_index = None 16 | self.batch_len = None 17 | self.rev_adict = None 18 | self.max_length = max_length 19 | self.mode = mode 20 | self.qdic, self.adic = VQADataProvider.load_data(mode) 21 | 22 | with open('./%s/vdict.json'%folder,'r') as f: 23 | self.vdict = json.load(f) 24 | with open('./%s/adict.json'%folder,'r') as f: 25 | self.adict = json.load(f) 26 | 27 | self.n_ans_vocabulary = len(self.adict) 28 | self.nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors') 29 | self.glove_dict = {} # word -> glove vector 30 | 31 | @staticmethod 32 | def load_vqa_json(data_split): 33 | """ 34 | Parses the question and answer json files for the given data split. 35 | Returns the question dictionary and the answer dictionary. 36 | """ 37 | qdic, adic = {}, {} 38 | 39 | with open(config.DATA_PATHS[data_split]['ques_file'], 'r') as f: 40 | qdata = json.load(f)['questions'] 41 | for q in qdata: 42 | qdic[data_split + QID_KEY_SEPARATOR + str(q['question_id'])] = \ 43 | {'qstr': q['question'], 'iid': q['image_id']} 44 | 45 | if 'test' not in data_split: 46 | with open(config.DATA_PATHS[data_split]['ans_file'], 'r') as f: 47 | adata = json.load(f)['annotations'] 48 | for a in adata: 49 | adic[data_split + QID_KEY_SEPARATOR + str(a['question_id'])] = \ 50 | a['answers'] 51 | 52 | print 'parsed', len(qdic), 'questions for', data_split 53 | return qdic, adic 54 | 55 | @staticmethod 56 | def load_genome_json(): 57 | """ 58 | Parses the genome json file. Returns the question dictionary and the 59 | answer dictionary. 
60 | """ 61 | qdic, adic = {}, {} 62 | 63 | with open(config.DATA_PATHS['genome']['genome_file'], 'r') as f: 64 | qdata = json.load(f) 65 | for q in qdata: 66 | key = 'genome' + QID_KEY_SEPARATOR + str(q['id']) 67 | qdic[key] = {'qstr': q['question'], 'iid': q['image']} 68 | adic[key] = [{'answer': q['answer']}] 69 | 70 | print 'parsed', len(qdic), 'questions for genome' 71 | return qdic, adic 72 | 73 | @staticmethod 74 | def load_data(data_split_str): 75 | all_qdic, all_adic = {}, {} 76 | for data_split in data_split_str.split('+'): 77 | assert data_split in config.DATA_PATHS.keys(), 'unknown data split' 78 | if data_split == 'genome': 79 | qdic, adic = VQADataProvider.load_genome_json() 80 | all_qdic.update(qdic) 81 | all_adic.update(adic) 82 | else: 83 | qdic, adic = VQADataProvider.load_vqa_json(data_split) 84 | all_qdic.update(qdic) 85 | all_adic.update(adic) 86 | return all_qdic, all_adic 87 | 88 | def getQuesIds(self): 89 | return self.qdic.keys() 90 | 91 | def getStrippedQuesId(self, qid): 92 | return qid.split(QID_KEY_SEPARATOR)[1] 93 | 94 | def getImgId(self,qid): 95 | return self.qdic[qid]['iid'] 96 | 97 | def getQuesStr(self,qid): 98 | return self.qdic[qid]['qstr'] 99 | 100 | def getAnsObj(self,qid): 101 | if self.mode == 'test-dev' or self.mode == 'test': 102 | return -1 103 | return self.adic[qid] 104 | 105 | @staticmethod 106 | def seq_to_list(s): 107 | t_str = s.lower() 108 | for i in [r'\?',r'\!',r'\'',r'\"',r'\$',r'\:',r'\@',r'\(',r'\)',r'\,',r'\.',r'\;']: 109 | t_str = re.sub( i, '', t_str) 110 | for i in [r'\-',r'\/']: 111 | t_str = re.sub( i, ' ', t_str) 112 | q_list = re.sub(r'\?','',t_str.lower()).split(' ') 113 | q_list = filter(lambda x: len(x) > 0, q_list) 114 | return q_list 115 | 116 | def extract_answer(self,answer_obj): 117 | """ Return the most popular answer in string.""" 118 | if self.mode == 'test-dev' or self.mode == 'test': 119 | return -1 120 | answer_list = [ answer_obj[i]['answer'] for i in xrange(10)] 121 | dic = {} 122 | for ans in answer_list: 123 | if dic.has_key(ans): 124 | dic[ans] +=1 125 | else: 126 | dic[ans] = 1 127 | max_key = max((v,k) for (k,v) in dic.items())[1] 128 | return max_key 129 | 130 | def extract_answer_prob(self,answer_obj): 131 | """ Return the most popular answer in string.""" 132 | if self.mode == 'test-dev' or self.mode == 'test': 133 | return -1 134 | 135 | answer_list = [ ans['answer'] for ans in answer_obj] 136 | prob_answer_list = [] 137 | for ans in answer_list: 138 | if self.adict.has_key(ans): 139 | prob_answer_list.append(ans) 140 | 141 | if len(prob_answer_list) == 0: 142 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 143 | return 'hoge' 144 | else: 145 | raise Exception("This should not happen.") 146 | else: 147 | return random.choice(prob_answer_list) 148 | 149 | def extract_answer_list(self,answer_obj): 150 | answer_list = [ ans['answer'] for ans in answer_obj] 151 | prob_answer_vec = np.zeros(config.NUM_OUTPUT_UNITS) 152 | for ans in answer_list: 153 | if self.adict.has_key(ans): 154 | index = self.adict[ans] 155 | prob_answer_vec[index] += 1 156 | prob_answer_vec = prob_answer_vec / np.sum(prob_answer_vec) 157 | return prob_answer_vec 158 | 159 | def qlist_to_vec(self, max_length, q_list): 160 | """ 161 | Converts a list of words into a format suitable for the embedding layer. 
162 | 163 | Arguments: 164 | max_length -- the maximum length of a question sequence 165 | q_list -- a list of words which are the tokens in the question 166 | 167 | Returns: 168 | qvec -- A max_length length vector containing one-hot indices for each word 169 | cvec -- A max_length length sequence continuation indicator vector 170 | glove_matrix -- A max_length x GLOVE_EMBEDDING_SIZE matrix containing the glove embedding for 171 | each word 172 | """ 173 | qvec = np.zeros(max_length) 174 | cvec = np.zeros(max_length) 175 | glove_matrix = np.zeros(max_length * GLOVE_EMBEDDING_SIZE).reshape(max_length, GLOVE_EMBEDDING_SIZE) 176 | for i in xrange(max_length): 177 | if i < max_length - len(q_list): 178 | cvec[i] = 0 179 | else: 180 | w = q_list[i-(max_length-len(q_list))] 181 | if w not in self.glove_dict: 182 | self.glove_dict[w] = self.nlp(u'%s' % w).vector 183 | glove_matrix[i] = self.glove_dict[w] 184 | # is the word in the vocabulary? 185 | if self.vdict.has_key(w) is False: 186 | w = '' 187 | qvec[i] = self.vdict[w] 188 | cvec[i] = 0 if i == max_length - len(q_list) else 1 189 | 190 | return qvec, cvec, glove_matrix 191 | 192 | def answer_to_vec(self, ans_str): 193 | """ Return answer id if the answer is included in vocabulary otherwise '' """ 194 | if self.mode =='test-dev' or self.mode == 'test': 195 | return -1 196 | 197 | if self.adict.has_key(ans_str): 198 | ans = self.adict[ans_str] 199 | else: 200 | ans = self.adict[''] 201 | return ans 202 | 203 | def vec_to_answer(self, ans_symbol): 204 | """ Return answer id if the answer is included in vocabulary otherwise '' """ 205 | if self.rev_adict is None: 206 | rev_adict = {} 207 | for k,v in self.adict.items(): 208 | rev_adict[v] = k 209 | self.rev_adict = rev_adict 210 | 211 | return self.rev_adict[ans_symbol] 212 | 213 | def create_batch(self,qid_list): 214 | 215 | qvec = (np.zeros(self.batchsize*self.max_length)).reshape(self.batchsize,self.max_length) 216 | cvec = (np.zeros(self.batchsize*self.max_length)).reshape(self.batchsize,self.max_length) 217 | ivec = (np.zeros(self.batchsize*2048*14*14)).reshape(self.batchsize,2048,14,14) 218 | avec = (np.zeros(self.batchsize*config.NUM_OUTPUT_UNITS)).reshape(self.batchsize,config.NUM_OUTPUT_UNITS) 219 | glove_matrix = np.zeros(self.batchsize * self.max_length * GLOVE_EMBEDDING_SIZE).reshape(\ 220 | self.batchsize, self.max_length, GLOVE_EMBEDDING_SIZE) 221 | 222 | for i,qid in enumerate(qid_list): 223 | 224 | # load raw question information 225 | q_str = self.getQuesStr(qid) 226 | q_ans = self.getAnsObj(qid) 227 | q_iid = self.getImgId(qid) 228 | 229 | # convert question to vec 230 | q_list = VQADataProvider.seq_to_list(q_str) 231 | t_qvec, t_cvec, t_glove_matrix = self.qlist_to_vec(self.max_length, q_list) 232 | 233 | try: 234 | qid_split = qid.split(QID_KEY_SEPARATOR) 235 | data_split = qid_split[0] 236 | if data_split == 'genome': 237 | t_ivec = np.load(config.DATA_PATHS['genome']['features_prefix'] + str(q_iid) + '.jpg.npz')['x'] 238 | else: 239 | t_ivec = np.load(config.DATA_PATHS[data_split]['features_prefix'] + str(q_iid).zfill(12) + '.jpg.npz')['x'] 240 | t_ivec = ( t_ivec / np.sqrt((t_ivec**2).sum()) ) 241 | except: 242 | t_ivec = 0. 243 | print 'data not found for qid : ', q_iid, self.mode 244 | 245 | # convert answer to vec 246 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 247 | q_ans_str = self.extract_answer(q_ans) 248 | t_avec = self.answer_to_vec(q_ans_str) 249 | else: 250 | t_avec = self.extract_answer_list(q_ans) 251 | 252 | qvec[i,...] 
= t_qvec 253 | cvec[i,...] = t_cvec 254 | ivec[i,...] = t_ivec 255 | avec[i,...] = t_avec 256 | glove_matrix[i,...] = t_glove_matrix 257 | 258 | return qvec, cvec, ivec, avec, glove_matrix 259 | 260 | 261 | def get_batch_vec(self): 262 | if self.batch_len is None: 263 | self.n_skipped = 0 264 | qid_list = self.getQuesIds() 265 | random.shuffle(qid_list) 266 | self.qid_list = qid_list 267 | self.batch_len = len(qid_list) 268 | self.batch_index = 0 269 | self.epoch_counter = 0 270 | 271 | def has_at_least_one_valid_answer(t_qid): 272 | answer_obj = self.getAnsObj(t_qid) 273 | answer_list = [ans['answer'] for ans in answer_obj] 274 | for ans in answer_list: 275 | if self.adict.has_key(ans): 276 | return True 277 | 278 | counter = 0 279 | t_qid_list = [] 280 | t_iid_list = [] 281 | while counter < self.batchsize: 282 | t_qid = self.qid_list[self.batch_index] 283 | t_iid = self.getImgId(t_qid) 284 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 285 | t_qid_list.append(t_qid) 286 | t_iid_list.append(t_iid) 287 | counter += 1 288 | elif has_at_least_one_valid_answer(t_qid): 289 | t_qid_list.append(t_qid) 290 | t_iid_list.append(t_iid) 291 | counter += 1 292 | else: 293 | self.n_skipped += 1 294 | 295 | if self.batch_index < self.batch_len-1: 296 | self.batch_index += 1 297 | else: 298 | self.epoch_counter += 1 299 | qid_list = self.getQuesIds() 300 | random.shuffle(qid_list) 301 | self.qid_list = qid_list 302 | self.batch_index = 0 303 | print("%d questions were skipped in a single epoch" % self.n_skipped) 304 | self.n_skipped = 0 305 | 306 | t_batch = self.create_batch(t_qid_list) 307 | return t_batch + (t_qid_list, t_iid_list, self.epoch_counter) 308 | 309 | 310 | class VQADataProviderLayer(caffe.Layer): 311 | """ 312 | Provide input data for VQA. 313 | """ 314 | 315 | def setup(self, bottom, top): 316 | self.batchsize = json.loads(self.param_str)['batchsize'] 317 | self.top_names = ['data','cont','feature','label','glove'] 318 | top[0].reshape(15,self.batchsize) 319 | top[1].reshape(15,self.batchsize) 320 | top[2].reshape(self.batchsize,2048,14,14) 321 | top[3].reshape(self.batchsize,config.NUM_OUTPUT_UNITS) 322 | top[4].reshape(15,self.batchsize,GLOVE_EMBEDDING_SIZE) 323 | 324 | self.mode = json.loads(self.param_str)['mode'] 325 | self.folder = json.loads(self.param_str)['folder'] 326 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 327 | pass 328 | else: 329 | self.dp = VQADataProvider(batchsize=self.batchsize, mode=self.mode, folder=self.folder) 330 | 331 | def reshape(self, bottom, top): 332 | pass 333 | 334 | def forward(self, bottom, top): 335 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 336 | pass 337 | else: 338 | word, cont, feature, answer, glove_matrix, _, _, epoch_counter = self.dp.get_batch_vec() 339 | top[0].data[...] = np.transpose(word,(1,0)) # N x T -> T x N 340 | top[1].data[...] = np.transpose(cont,(1,0)) 341 | top[2].data[...] = feature 342 | top[3].data[...] = answer 343 | top[4].data[...] 
= np.transpose(glove_matrix, (1,0,2)) # N x T x 300 -> T x N x 300 344 | 345 | def backward(self, top, propagate_down, bottom): 346 | pass 347 | 348 | -------------------------------------------------------------------------------- /mfh_coatt_glove/vqa_data_layer_kld.py: -------------------------------------------------------------------------------- 1 | import caffe 2 | import numpy as np 3 | import re, json, random 4 | import config 5 | import spacy 6 | 7 | QID_KEY_SEPARATOR = '/' 8 | GLOVE_EMBEDDING_SIZE = 300 9 | 10 | class VQADataProvider: 11 | 12 | def __init__(self, folder='result', batchsize=64, max_length=15, mode='train'): 13 | self.batchsize = batchsize 14 | self.d_vocabulary = None 15 | self.batch_index = None 16 | self.batch_len = None 17 | self.rev_adict = None 18 | self.max_length = max_length 19 | self.mode = mode 20 | self.qdic, self.adic = VQADataProvider.load_data(mode) 21 | 22 | with open('./%s/vdict.json'%folder,'r') as f: 23 | self.vdict = json.load(f) 24 | with open('./%s/adict.json'%folder,'r') as f: 25 | self.adict = json.load(f) 26 | 27 | self.n_ans_vocabulary = len(self.adict) 28 | self.nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors') 29 | self.glove_dict = {} # word -> glove vector 30 | 31 | @staticmethod 32 | def load_vqa_json(data_split): 33 | """ 34 | Parses the question and answer json files for the given data split. 35 | Returns the question dictionary and the answer dictionary. 36 | """ 37 | qdic, adic = {}, {} 38 | 39 | with open(config.DATA_PATHS[data_split]['ques_file'], 'r') as f: 40 | qdata = json.load(f)['questions'] 41 | for q in qdata: 42 | qdic[data_split + QID_KEY_SEPARATOR + str(q['question_id'])] = \ 43 | {'qstr': q['question'], 'iid': q['image_id']} 44 | 45 | if 'test' not in data_split: 46 | with open(config.DATA_PATHS[data_split]['ans_file'], 'r') as f: 47 | adata = json.load(f)['annotations'] 48 | for a in adata: 49 | adic[data_split + QID_KEY_SEPARATOR + str(a['question_id'])] = \ 50 | a['answers'] 51 | 52 | print 'parsed', len(qdic), 'questions for', data_split 53 | return qdic, adic 54 | 55 | @staticmethod 56 | def load_genome_json(): 57 | """ 58 | Parses the genome json file. Returns the question dictionary and the 59 | answer dictionary. 
60 | """ 61 | qdic, adic = {}, {} 62 | 63 | with open(config.DATA_PATHS['genome']['genome_file'], 'r') as f: 64 | qdata = json.load(f) 65 | for q in qdata: 66 | key = 'genome' + QID_KEY_SEPARATOR + str(q['id']) 67 | qdic[key] = {'qstr': q['question'], 'iid': q['image']} 68 | adic[key] = [{'answer': q['answer']}] 69 | 70 | print 'parsed', len(qdic), 'questions for genome' 71 | return qdic, adic 72 | 73 | @staticmethod 74 | def load_data(data_split_str): 75 | all_qdic, all_adic = {}, {} 76 | for data_split in data_split_str.split('+'): 77 | assert data_split in config.DATA_PATHS.keys(), 'unknown data split' 78 | if data_split == 'genome': 79 | qdic, adic = VQADataProvider.load_genome_json() 80 | all_qdic.update(qdic) 81 | all_adic.update(adic) 82 | else: 83 | qdic, adic = VQADataProvider.load_vqa_json(data_split) 84 | all_qdic.update(qdic) 85 | all_adic.update(adic) 86 | return all_qdic, all_adic 87 | 88 | def getQuesIds(self): 89 | return self.qdic.keys() 90 | 91 | def getStrippedQuesId(self, qid): 92 | return qid.split(QID_KEY_SEPARATOR)[1] 93 | 94 | def getImgId(self,qid): 95 | return self.qdic[qid]['iid'] 96 | 97 | def getQuesStr(self,qid): 98 | return self.qdic[qid]['qstr'] 99 | 100 | def getAnsObj(self,qid): 101 | if self.mode == 'test-dev' or self.mode == 'test': 102 | return -1 103 | return self.adic[qid] 104 | 105 | @staticmethod 106 | def seq_to_list(s): 107 | t_str = s.lower() 108 | for i in [r'\?',r'\!',r'\'',r'\"',r'\$',r'\:',r'\@',r'\(',r'\)',r'\,',r'\.',r'\;']: 109 | t_str = re.sub( i, '', t_str) 110 | for i in [r'\-',r'\/']: 111 | t_str = re.sub( i, ' ', t_str) 112 | q_list = re.sub(r'\?','',t_str.lower()).split(' ') 113 | q_list = filter(lambda x: len(x) > 0, q_list) 114 | return q_list 115 | 116 | def extract_answer(self,answer_obj): 117 | """ Return the most popular answer in string.""" 118 | if self.mode == 'test-dev' or self.mode == 'test': 119 | return -1 120 | answer_list = [ answer_obj[i]['answer'] for i in xrange(10)] 121 | dic = {} 122 | for ans in answer_list: 123 | if dic.has_key(ans): 124 | dic[ans] +=1 125 | else: 126 | dic[ans] = 1 127 | max_key = max((v,k) for (k,v) in dic.items())[1] 128 | return max_key 129 | 130 | def extract_answer_prob(self,answer_obj): 131 | """ Return the most popular answer in string.""" 132 | if self.mode == 'test-dev' or self.mode == 'test': 133 | return -1 134 | 135 | answer_list = [ ans['answer'] for ans in answer_obj] 136 | prob_answer_list = [] 137 | for ans in answer_list: 138 | if self.adict.has_key(ans): 139 | prob_answer_list.append(ans) 140 | 141 | if len(prob_answer_list) == 0: 142 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 143 | return 'hoge' 144 | else: 145 | raise Exception("This should not happen.") 146 | else: 147 | return random.choice(prob_answer_list) 148 | 149 | def extract_answer_list(self,answer_obj): 150 | answer_list = [ ans['answer'] for ans in answer_obj] 151 | prob_answer_vec = np.zeros(config.NUM_OUTPUT_UNITS) 152 | for ans in answer_list: 153 | if self.adict.has_key(ans): 154 | index = self.adict[ans] 155 | prob_answer_vec[index] += 1 156 | prob_answer_vec = prob_answer_vec / np.sum(prob_answer_vec) 157 | return prob_answer_vec 158 | 159 | def qlist_to_vec(self, max_length, q_list): 160 | """ 161 | Converts a list of words into a format suitable for the embedding layer. 
159 | def qlist_to_vec(self, max_length, q_list): 160 | """ 161 | Converts a list of words into a format suitable for the embedding layer. 162 | 163 | Arguments: 164 | max_length -- the maximum length of a question sequence 165 | q_list -- a list of words which are the tokens in the question 166 | 167 | Returns: 168 | qvec -- A max_length vector of vocabulary indices, one per word (0 for padding) 169 | cvec -- A max_length sequence continuation indicator vector 170 | glove_matrix -- A max_length x GLOVE_EMBEDDING_SIZE matrix containing the glove embedding of 171 | each word 172 | """ 173 | qvec = np.zeros(max_length) 174 | cvec = np.zeros(max_length) 175 | glove_matrix = np.zeros(max_length * GLOVE_EMBEDDING_SIZE).reshape(max_length, GLOVE_EMBEDDING_SIZE) 176 | for i in xrange(max_length): 177 | if i < max_length - len(q_list): # left-pad: a 3-word question in a 15-slot window occupies slots 12-14 178 | cvec[i] = 0 179 | else: 180 | w = q_list[i-(max_length-len(q_list))] 181 | if w not in self.glove_dict: # cache glove vectors; the spacy lookup is slow 182 | self.glove_dict[w] = self.nlp(u'%s' % w).vector 183 | glove_matrix[i] = self.glove_dict[w] 184 | # is the word in the vocabulary? 185 | if self.vdict.has_key(w) is False: 186 | w = '' 187 | qvec[i] = self.vdict[w] 188 | cvec[i] = 0 if i == max_length - len(q_list) else 1 # Caffe LSTM convention: 0 starts a sequence, 1 continues it 189 | 190 | return qvec, cvec, glove_matrix 191 | 192 | def answer_to_vec(self, ans_str): 193 | """ Return the answer id if the answer is in the vocabulary, otherwise the id of ''.""" 194 | if self.mode =='test-dev' or self.mode == 'test': 195 | return -1 196 | 197 | if self.adict.has_key(ans_str): 198 | ans = self.adict[ans_str] 199 | else: 200 | ans = self.adict[''] 201 | return ans 202 | 203 | def vec_to_answer(self, ans_symbol): 204 | """ Return the answer string for the given answer id.""" 205 | if self.rev_adict is None: 206 | rev_adict = {} 207 | for k,v in self.adict.items(): 208 | rev_adict[v] = k 209 | self.rev_adict = rev_adict 210 | 211 | return self.rev_adict[ans_symbol] 212 | 213 | def create_batch(self,qid_list): 214 | 215 | qvec = (np.zeros(self.batchsize*self.max_length)).reshape(self.batchsize,self.max_length) 216 | cvec = (np.zeros(self.batchsize*self.max_length)).reshape(self.batchsize,self.max_length) 217 | ivec = (np.zeros(self.batchsize*2048*14*14)).reshape(self.batchsize,2048,14,14) 218 | avec = (np.zeros(self.batchsize*config.NUM_OUTPUT_UNITS)).reshape(self.batchsize,config.NUM_OUTPUT_UNITS) 219 | glove_matrix = np.zeros(self.batchsize * self.max_length * GLOVE_EMBEDDING_SIZE).reshape(\ 220 | self.batchsize, self.max_length, GLOVE_EMBEDDING_SIZE) 221 | 222 | for i,qid in enumerate(qid_list): 223 | 224 | # load raw question information 225 | q_str = self.getQuesStr(qid) 226 | q_ans = self.getAnsObj(qid) 227 | q_iid = self.getImgId(qid) 228 | 229 | # convert question to vec 230 | q_list = VQADataProvider.seq_to_list(q_str) 231 | t_qvec, t_cvec, t_glove_matrix = self.qlist_to_vec(self.max_length, q_list) 232 | 233 | try: 234 | qid_split = qid.split(QID_KEY_SEPARATOR) 235 | data_split = qid_split[0] 236 | if data_split == 'genome': 237 | t_ivec = np.load(config.DATA_PATHS['genome']['features_prefix'] + str(q_iid) + '.jpg.npz')['x'] 238 | else: 239 | t_ivec = np.load(config.DATA_PATHS[data_split]['features_prefix'] + str(q_iid).zfill(12) + '.jpg.npz')['x'] 240 | t_ivec = ( t_ivec / np.sqrt((t_ivec**2).sum()) ) # L2-normalize the image feature 241 | except: 242 | t_ivec = 0. 243 | print 'image feature not found for iid : ', q_iid, self.mode 244 | 245 | # convert answer to vec 246 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 247 | q_ans_str = self.extract_answer(q_ans) 248 | t_avec = self.answer_to_vec(q_ans_str) 249 | else: 250 | t_avec = self.extract_answer_list(q_ans) 251 | 252 | qvec[i,...] = t_qvec
253 | cvec[i,...] = t_cvec 254 | ivec[i,...] = t_ivec 255 | avec[i,...] = t_avec 256 | glove_matrix[i,...] = t_glove_matrix 257 | 258 | return qvec, cvec, ivec, avec, glove_matrix 259 | 260 | 261 | def get_batch_vec(self): 262 | if self.batch_len is None: 263 | self.n_skipped = 0 264 | qid_list = self.getQuesIds() 265 | random.shuffle(qid_list) 266 | self.qid_list = qid_list 267 | self.batch_len = len(qid_list) 268 | self.batch_index = 0 269 | self.epoch_counter = 0 270 | 271 | def has_at_least_one_valid_answer(t_qid): 272 | answer_obj = self.getAnsObj(t_qid) 273 | answer_list = [ans['answer'] for ans in answer_obj] 274 | for ans in answer_list: 275 | if self.adict.has_key(ans): 276 | return True 277 | return False 278 | counter = 0 279 | t_qid_list = [] 280 | t_iid_list = [] 281 | while counter < self.batchsize: 282 | t_qid = self.qid_list[self.batch_index] 283 | t_iid = self.getImgId(t_qid) 284 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 285 | t_qid_list.append(t_qid) 286 | t_iid_list.append(t_iid) 287 | counter += 1 288 | elif has_at_least_one_valid_answer(t_qid): 289 | t_qid_list.append(t_qid) 290 | t_iid_list.append(t_iid) 291 | counter += 1 292 | else: 293 | self.n_skipped += 1 294 | 295 | if self.batch_index < self.batch_len-1: 296 | self.batch_index += 1 297 | else: 298 | self.epoch_counter += 1 299 | qid_list = self.getQuesIds() 300 | random.shuffle(qid_list) 301 | self.qid_list = qid_list 302 | self.batch_index = 0 303 | print("%d questions were skipped in a single epoch" % self.n_skipped) 304 | self.n_skipped = 0 305 | 306 | t_batch = self.create_batch(t_qid_list) 307 | return t_batch + (t_qid_list, t_iid_list, self.epoch_counter) 308 | 309 | 310 | class VQADataProviderLayer(caffe.Layer): 311 | """ 312 | Provide input data for VQA. 313 | """ 314 | 315 | def setup(self, bottom, top): 316 | self.batchsize = json.loads(self.param_str)['batchsize'] 317 | self.top_names = ['data','cont','feature','label','glove'] 318 | top[0].reshape(15,self.batchsize) # 15 == config.MAX_WORDS_IN_QUESTION, laid out T x N 319 | top[1].reshape(15,self.batchsize) 320 | top[2].reshape(self.batchsize,2048,14,14) 321 | top[3].reshape(self.batchsize,config.NUM_OUTPUT_UNITS) 322 | top[4].reshape(15,self.batchsize,GLOVE_EMBEDDING_SIZE) 323 | 324 | self.mode = json.loads(self.param_str)['mode'] 325 | self.folder = json.loads(self.param_str)['folder'] 326 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 327 | pass # evaluation blobs are filled externally (see utils.exec_validation) 328 | else: 329 | self.dp = VQADataProvider(batchsize=self.batchsize, mode=self.mode, folder=self.folder) 330 | 331 | def reshape(self, bottom, top): 332 | pass 333 | 334 | def forward(self, bottom, top): 335 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 336 | pass 337 | else: 338 | word, cont, feature, answer, glove_matrix, _, _, epoch_counter = self.dp.get_batch_vec() 339 | top[0].data[...] = np.transpose(word,(1,0)) # N x T -> T x N 340 | top[1].data[...] = np.transpose(cont,(1,0)) 341 | top[2].data[...] = feature 342 | top[3].data[...] = answer 343 | top[4].data[...] = np.transpose(glove_matrix, (1,0,2)) # N x T x 300 -> T x N x 300 344 | 345 | def backward(self, top, propagate_down, bottom): 346 | pass 347 | 348 |
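For orientation, the provider can also be exercised on its own, outside Caffe. A minimal sketch, assuming vdict.json and adict.json already exist under ./result and the feature paths in config.py are valid:

from vqa_data_layer_kld import VQADataProvider

dp = VQADataProvider(folder='result', batchsize=2, mode='train')
qvec, cvec, ivec, avec, glove, qids, iids, epoch = dp.get_batch_vec()
print qvec.shape   # (2, 15)   word indices, left-padded
print ivec.shape   # (2, 2048, 14, 14)   ResNet feature maps
print avec.shape   # (2, config.NUM_OUTPUT_UNITS)   soft answer distributions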
-------------------------------------------------------------------------------- /mfb_coatt_glove/train_mfb_coatt_glove.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use('Agg') 3 | import os 4 | import sys 5 | import numpy as np 6 | import json 7 | import matplotlib.pyplot as plt 8 | 9 | import caffe 10 | from caffe import layers as L 11 | from caffe import params as P 12 | from caffe.proto import caffe_pb2 13 | 14 | from vqa_data_layer_kld import VQADataProvider 15 | from utils import exec_validation, drawgraph 16 | import config 17 | import time 18 | 19 | def get_solver(folder): 20 | s = caffe_pb2.SolverParameter() 21 | s.train_net = './%s/proto_train.prototxt'%folder 22 | s.snapshot = int(config.VALIDATE_INTERVAL) 23 | s.snapshot_prefix = './%s/'%folder 24 | s.max_iter = int(config.MAX_ITERATIONS) 25 | s.display = int(config.VALIDATE_INTERVAL) 26 | s.type = 'Adam' 27 | s.stepsize = int(config.MAX_ITERATIONS*0.4) # with gamma = 0.5 below, the learning rate halves every 40% of training 28 | s.gamma = 0.5 29 | s.lr_policy = "step" 30 | s.base_lr = 0.0007 31 | s.momentum = 0.9 32 | s.momentum2 = 0.999 33 | s.weight_decay = 0.000 34 | s.clip_gradients = 10 35 | return s 36 | 37 | def get_auxiliary_json(): 38 | aux = {} 39 | aux["batch_size"] = int(config.VAL_BATCH_SIZE) 40 | aux["data_shape"] = [2048,14,14] 41 | aux["img_feature_prefix"] = config.DATA_PATHS['test']['features_prefix'] 42 | aux["glove"] = True 43 | return aux 44 | 45 | 46 | def mfb_coatt(mode, batchsize, T, question_vocab_size, folder): # T (max question length) is unused; the layers read config.MAX_WORDS_IN_QUESTION directly 47 | n = caffe.NetSpec() 48 | mode_str = json.dumps({'mode':mode, 'batchsize':batchsize,'folder':folder}) 49 | if mode == 'val': # validation uses hard labels, training uses soft KLD targets 50 | n.data, n.cont, n.img_feature, n.label, n.glove = L.Python( \ 51 | module='vqa_data_layer', layer='VQADataProviderLayer', \ 52 | param_str=mode_str, ntop=5 ) 53 | else: 54 | n.data, n.cont, n.img_feature, n.label, n.glove = L.Python(\ 55 | module='vqa_data_layer_kld', layer='VQADataProviderLayer', \ 56 | param_str=mode_str, ntop=5 ) 57 | n.embed = L.Embed(n.data, input_dim=question_vocab_size, num_output=300, \ 58 | weight_filler=dict(type='xavier')) 59 | n.embed_tanh = L.TanH(n.embed) 60 | concat_word_embed = [n.embed_tanh, n.glove] 61 | n.concat_embed = L.Concat(*concat_word_embed, concat_param={'axis': 2}) # T x N x 600 62 | 63 | # LSTM 64 | n.lstm1 = L.LSTM(\ 65 | n.concat_embed, n.cont,\ 66 | recurrent_param=dict(\ 67 | num_output=config.LSTM_UNIT_NUM,\ 68 | weight_filler=dict(type='xavier'))) 69 | n.lstm1_droped = L.Dropout(n.lstm1,dropout_param={'dropout_ratio':config.LSTM_DROPOUT_RATIO}) 70 | n.lstm1_resh = L.Permute(n.lstm1_droped, permute_param=dict(order=[1,2,0])) 71 | n.lstm1_resh2 = L.Reshape(n.lstm1_resh, \ 72 | reshape_param=dict(shape=dict(dim=[0,0,0,1]))) 73 | 74 | ''' 75 | Question Attention 76 | ''' 77 | n.qatt_conv1 = L.Convolution(n.lstm1_resh2, kernel_size=1, stride=1, num_output=512, pad=0, 78 | weight_filler=dict(type='xavier')) 79 | n.qatt_relu = L.ReLU(n.qatt_conv1) 80 | n.qatt_conv2 = L.Convolution(n.qatt_relu, kernel_size=1, stride=1, num_output=config.NUM_QUESTION_GLIMPSE, pad=0, 81 | weight_filler=dict(type='xavier')) 82 | n.qatt_reshape = L.Reshape(n.qatt_conv2, reshape_param=dict(shape=dict(dim=[-1,config.NUM_QUESTION_GLIMPSE,config.MAX_WORDS_IN_QUESTION,1]))) # N x NUM_QUESTION_GLIMPSE x 15 x 1 83 | n.qatt_softmax = L.Softmax(n.qatt_reshape, axis=2) 84 |
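# The softmax above yields NUM_QUESTION_GLIMPSE attention maps over the question's word
# positions. Each glimpse is sliced off below and handed to SoftAttention, which takes an
# attention-weighted sum of the LSTM states; the per-glimpse vectors are then concatenated.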
85 | qatt_maps = L.Slice(n.qatt_softmax,ntop=config.NUM_QUESTION_GLIMPSE,slice_param={'axis':1}) 86 | dummy_lstm = L.DummyData(shape=dict(dim=[batchsize, 1]), data_filler=dict(type='constant', value=1), ntop=1) 87 | qatt_feature_list = [] 88 | for i in xrange(config.NUM_QUESTION_GLIMPSE): 89 | if config.NUM_QUESTION_GLIMPSE == 1: 90 | n.__setattr__('qatt_feat%d'%i, L.SoftAttention(n.lstm1_resh2, qatt_maps, dummy_lstm)) 91 | else: 92 | n.__setattr__('qatt_feat%d'%i, L.SoftAttention(n.lstm1_resh2, qatt_maps[i], dummy_lstm)) 93 | qatt_feature_list.append(n.__getattr__('qatt_feat%d'%i)) 94 | n.qatt_feat_concat = L.Concat(*qatt_feature_list) 95 | ''' 96 | Image Attention with MFB 97 | ''' 98 | n.q_feat_resh = L.Reshape(n.qatt_feat_concat,reshape_param=dict(shape=dict(dim=[0,-1,1,1]))) 99 | n.i_feat_resh = L.Reshape(n.img_feature,reshape_param=dict(shape=dict(dim=[0,-1,config.IMG_FEAT_WIDTH,config.IMG_FEAT_WIDTH]))) 100 | 101 | n.iatt_q_proj = L.InnerProduct(n.q_feat_resh, num_output = config.JOINT_EMB_SIZE, 102 | weight_filler=dict(type='xavier')) 103 | n.iatt_q_resh = L.Reshape(n.iatt_q_proj, reshape_param=dict(shape=dict(dim=[-1,config.JOINT_EMB_SIZE,1,1]))) 104 | n.iatt_q_tile1 = L.Tile(n.iatt_q_resh, axis=2, tiles=config.IMG_FEAT_WIDTH) 105 | n.iatt_q_tile2 = L.Tile(n.iatt_q_tile1, axis=3, tiles=config.IMG_FEAT_WIDTH) # broadcast the question feature over the 14 x 14 grid 106 | 107 | 108 | n.iatt_i_conv = L.Convolution(n.i_feat_resh, kernel_size=1, stride=1, num_output=config.JOINT_EMB_SIZE, pad=0, 109 | weight_filler=dict(type='xavier')) 110 | n.iatt_i_resh1 = L.Reshape(n.iatt_i_conv, reshape_param=dict(shape=dict(dim=[-1,config.JOINT_EMB_SIZE, 111 | config.IMG_FEAT_WIDTH,config.IMG_FEAT_WIDTH]))) 112 | n.iatt_iq_eltwise = L.Eltwise(n.iatt_q_tile2, n.iatt_i_resh1, eltwise_param=dict(operation=0)) # operation=0 is element-wise product (PROD) 113 | n.iatt_iq_droped = L.Dropout(n.iatt_iq_eltwise, dropout_param={'dropout_ratio':config.MFB_DROPOUT_RATIO}) 114 | n.iatt_iq_resh2 = L.Reshape(n.iatt_iq_droped, reshape_param=dict(shape=dict(dim=[-1,config.JOINT_EMB_SIZE,196,1]))) # 196 = 14 x 14 spatial positions 115 | n.iatt_iq_permute1 = L.Permute(n.iatt_iq_resh2, permute_param=dict(order=[0,2,1,3])) 116 | n.iatt_iq_resh3 = L.Reshape(n.iatt_iq_permute1, reshape_param=dict(shape=dict(dim=[-1,config.IMG_FEAT_SIZE, 117 | config.MFB_OUT_DIM,config.MFB_FACTOR_NUM]))) # fresh blob name: assigning iatt_iq_resh2 again would overwrite the reshape above 118 | n.iatt_iq_sumpool = L.Pooling(n.iatt_iq_resh3, pool=P.Pooling.SUM, \ 119 | pooling_param=dict(kernel_w=config.MFB_FACTOR_NUM, kernel_h=1)) 120 | n.iatt_iq_permute2 = L.Permute(n.iatt_iq_sumpool, permute_param=dict(order=[0,2,1,3])) 121 | 122 | n.iatt_iq_sqrt = L.SignedSqrt(n.iatt_iq_permute2) 123 | n.iatt_iq_l2 = L.L2Normalize(n.iatt_iq_sqrt) 124 | 125 | 126 | ## 2 conv layers: MFB_OUT_DIM (1000) -> 512 -> NUM_IMG_GLIMPSE 127 | n.iatt_conv1 = L.Convolution(n.iatt_iq_l2, kernel_size=1, stride=1, num_output=512, pad=0, 128 | weight_filler=dict(type='xavier')) 129 | n.iatt_relu = L.ReLU(n.iatt_conv1) 130 | n.iatt_conv2 = L.Convolution(n.iatt_relu, kernel_size=1, stride=1, num_output=config.NUM_IMG_GLIMPSE, pad=0, 131 | weight_filler=dict(type='xavier')) 132 | n.iatt_resh = L.Reshape(n.iatt_conv2, reshape_param=dict(shape=dict(dim=[-1,config.NUM_IMG_GLIMPSE,config.IMG_FEAT_SIZE]))) 133 | n.iatt_softmax = L.Softmax(n.iatt_resh, axis=2) 134 | n.iatt_softmax_resh = L.Reshape(n.iatt_softmax,reshape_param=dict(shape=dict(dim=[-1,config.NUM_IMG_GLIMPSE,config.IMG_FEAT_WIDTH,config.IMG_FEAT_WIDTH]))) 135 | iatt_maps = L.Slice(n.iatt_softmax_resh, ntop=config.NUM_IMG_GLIMPSE,slice_param={'axis':1}) 136 | dummy = L.DummyData(shape=dict(dim=[batchsize, 1]), data_filler=dict(type='constant', value=1), ntop=1)
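# The same glimpse pattern as the question attention, now over the 14 x 14 image grid:
# each sliced map weights the image features via SoftAttention, the glimpse features
# are flattened, and the NUM_IMG_GLIMPSE results are concatenated.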
137 | iatt_feature_list = [] 138 | for i in xrange(config.NUM_IMG_GLIMPSE): 139 | if config.NUM_IMG_GLIMPSE == 1: 140 | n.__setattr__('iatt_feat%d'%i, L.SoftAttention(n.i_feat_resh, iatt_maps, dummy)) 141 | else: 142 | n.__setattr__('iatt_feat%d'%i, L.SoftAttention(n.i_feat_resh, iatt_maps[i], dummy)) 143 | n.__setattr__('iatt_feat%d_resh'%i, L.Reshape(n.__getattr__('iatt_feat%d'%i), \ 144 | reshape_param=dict(shape=dict(dim=[0,-1])))) 145 | iatt_feature_list.append(n.__getattr__('iatt_feat%d_resh'%i)) 146 | n.iatt_feat_concat = L.Concat(*iatt_feature_list) 147 | n.iatt_feat_concat_resh = L.Reshape(n.iatt_feat_concat, reshape_param=dict(shape=dict(dim=[0,-1,1,1]))) 148 | 149 | ''' 150 | Fine-grained Image-Question MFB fusion 151 | ''' 152 | 153 | n.mfb_q_proj = L.InnerProduct(n.q_feat_resh, num_output=config.JOINT_EMB_SIZE, 154 | weight_filler=dict(type='xavier')) 155 | n.mfb_i_proj = L.InnerProduct(n.iatt_feat_concat_resh, num_output=config.JOINT_EMB_SIZE, 156 | weight_filler=dict(type='xavier')) 157 | n.mfb_iq_eltwise = L.Eltwise(n.mfb_q_proj, n.mfb_i_proj, eltwise_param=dict(operation=0)) # element-wise product 158 | n.mfb_iq_drop = L.Dropout(n.mfb_iq_eltwise, dropout_param={'dropout_ratio':config.MFB_DROPOUT_RATIO}) 159 | n.mfb_iq_resh = L.Reshape(n.mfb_iq_drop, reshape_param=dict(shape=dict(dim=[-1,1,config.MFB_OUT_DIM,config.MFB_FACTOR_NUM]))) 160 | n.mfb_iq_sumpool = L.Pooling(n.mfb_iq_resh, pool=P.Pooling.SUM, \ 161 | pooling_param=dict(kernel_w=config.MFB_FACTOR_NUM, kernel_h=1)) # sum over the MFB_FACTOR_NUM factors 162 | n.mfb_out = L.Reshape(n.mfb_iq_sumpool,\ 163 | reshape_param=dict(shape=dict(dim=[-1,config.MFB_OUT_DIM]))) 164 | n.mfb_sign_sqrt = L.SignedSqrt(n.mfb_out) # power normalization 165 | n.mfb_l2 = L.L2Normalize(n.mfb_sign_sqrt) 166 | 167 | n.prediction = L.InnerProduct(n.mfb_l2, num_output=config.NUM_OUTPUT_UNITS, 168 | weight_filler=dict(type='xavier')) 169 | if mode == 'val': 170 | n.loss = L.SoftmaxWithLoss(n.prediction, n.label) # hard labels for validation 171 | else: 172 | n.loss = L.SoftmaxKLDLoss(n.prediction, n.label) # KL divergence against the soft answer distribution 173 | return n.to_proto() 174 |
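The block from mfb_iq_eltwise to mfb_l2 is the MFB pooling itself: project both modalities to JOINT_EMB_SIZE, multiply element-wise, sum-pool groups of MFB_FACTOR_NUM values down to MFB_OUT_DIM, then apply power and L2 normalization. A numpy sketch of the same computation for a single sample, assuming the config defaults (JOINT_EMB_SIZE = MFB_FACTOR_NUM * MFB_OUT_DIM):

import numpy as np

k, o = 5, 1000                        # MFB_FACTOR_NUM, MFB_OUT_DIM
q_proj = np.random.randn(k * o)       # question feature after its InnerProduct
i_proj = np.random.randn(k * o)       # image feature after its InnerProduct
z = q_proj * i_proj                   # Eltwise with operation=0 (PROD)
z = z.reshape(o, k).sum(axis=1)       # sum-pool each group of k factors -> 1000-d
z = np.sign(z) * np.sqrt(np.abs(z))   # SignedSqrt
z = z / np.linalg.norm(z)             # L2Normalize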
235 | """ 236 | print 'making question vocab...', config.QUESTION_VOCAB_SPACE 237 | qdic, _ = VQADataProvider.load_data(config.QUESTION_VOCAB_SPACE) 238 | question_vocab = make_question_vocab(qdic) 239 | print 'making answer vocab...', config.ANSWER_VOCAB_SPACE 240 | _, adic = VQADataProvider.load_data(config.ANSWER_VOCAB_SPACE) 241 | answer_vocab = make_answer_vocab(adic, config.NUM_OUTPUT_UNITS) 242 | return question_vocab, answer_vocab 243 | 244 | def main(): 245 | folder = 'mfb_coatt_glove_q%dv%d_%s'%(config.NUM_QUESTION_GLIMPSE, config.NUM_IMG_GLIMPSE,config.TRAIN_DATA_SPLITS) 246 | if not os.path.exists('./%s'%folder): 247 | os.makedirs('./%s'%folder) 248 | 249 | question_vocab, answer_vocab = {}, {} 250 | if os.path.exists('./%s/vdict.json'%folder) and os.path.exists('./%s/adict.json'%folder): 251 | print 'restoring vocab' 252 | with open('./%s/vdict.json'%folder,'r') as f: 253 | question_vocab = json.load(f) 254 | with open('./%s/adict.json'%folder,'r') as f: 255 | answer_vocab = json.load(f) 256 | else: 257 | question_vocab, answer_vocab = make_vocab_files() 258 | with open('./%s/vdict.json'%folder,'w') as f: 259 | json.dump(question_vocab, f) 260 | with open('./%s/adict.json'%folder,'w') as f: 261 | json.dump(answer_vocab, f) 262 | 263 | print 'question vocab size:', len(question_vocab) 264 | print 'answer vocab size:', len(answer_vocab) 265 | 266 | with open('./%s/proto_train.prototxt'%folder, 'w') as f: 267 | f.write(str(mfb_coatt(config.TRAIN_DATA_SPLITS, config.BATCH_SIZE, \ 268 | config.MAX_WORDS_IN_QUESTION, len(question_vocab), folder))) 269 | 270 | with open('./%s/proto_test.prototxt'%folder, 'w') as f: 271 | f.write(str(mfb_coatt('val', config.VAL_BATCH_SIZE, \ 272 | config.MAX_WORDS_IN_QUESTION, len(question_vocab), folder))) 273 | 274 | with open('./%s/solver.prototxt'%folder, 'w') as f: 275 | f.write(str(get_solver(folder))) 276 | with open('./%s/auxiliary.json'%folder, 'w') as f: 277 | json.dump(get_auxiliary_json(),f, indent=2) 278 | 279 | caffe.set_device(config.TRAIN_GPU_ID) 280 | caffe.set_mode_gpu() 281 | solver = caffe.get_solver('./%s/solver.prototxt'%folder) 282 | 283 | train_loss = np.zeros(config.MAX_ITERATIONS+1) 284 | results = [] 285 | 286 | if config.RESTORE_ITER: 287 | restore_iter = config.RESTORE_ITER 288 | solver.restore('./%s/_iter_%d.solverstate'%(folder,restore_iter)) 289 | else: 290 | restore_iter = 0 291 | 292 | start = time.clock() 293 | for it in range(restore_iter, config.MAX_ITERATIONS+1): 294 | solver.step(1) 295 | 296 | # store the train loss 297 | train_loss[it] = solver.net.blobs['loss'].data 298 | 299 | if it % config.PRINT_INTERVAL == 0 and it != 0: 300 | elapsed = (time.clock() - start) 301 | print 'Iteration:', it 302 | c_mean_loss = train_loss[it-config.PRINT_INTERVAL:it].mean() 303 | print 'Train loss:', c_mean_loss, ' Elapsed seconds:', elapsed 304 | start = time.clock() 305 | if it % config.VALIDATE_INTERVAL == 0 and it != restore_iter: 306 | model_name = './%s/tmp.caffemodel'%(folder) 307 | solver.net.save(model_name) 308 | print 'Validating...' 309 | 310 | # for test-dev /test set. the json file will be generated under the file 311 | exec_validation(config.TEST_GPU_ID, 'test-dev', model_name, it=it, folder=folder) 312 | caffe.set_device(config.TRAIN_GPU_ID) 313 | ''' 314 | #for val set. 
314 | # for the val set, accuracy is computed and plotted 315 | test_loss, acc_overall, acc_per_ques, acc_per_ans = exec_validation(config.TEST_GPU_ID, 'val', model_name, it=it, folder=folder) 316 | caffe.set_device(config.TRAIN_GPU_ID) 317 | print 'Test loss:', test_loss 318 | print 'Accuracy:', acc_overall 319 | print 'Accuracy per answer:', acc_per_ans 320 | results.append([it, c_mean_loss, test_loss, acc_overall, acc_per_ques, acc_per_ans]) 321 | best_result_idx = np.array([x[3] for x in results]).argmax() 322 | print 'Best accuracy of', results[best_result_idx][3], 'was at iteration', results[best_result_idx][0] 323 | drawgraph(results,folder,config.MFB_FACTOR_NUM,config.MFB_OUT_DIM,prefix='mfb_coatt_glove') 324 | ''' 325 | if __name__ == '__main__': 326 | main() 327 | --------------------------------------------------------------------------------
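Once training has produced a snapshot, the generated proto_test.prototxt can be loaded for scoring. A minimal sketch, assuming a saved tmp.caffemodel in a hypothetical run folder; in val mode the Python data layer leaves the input blobs for the caller to fill, which is what utils.exec_validation does:

import caffe
import json

folder = 'mfb_coatt_glove_q2v2_train'   # hypothetical run-folder name
net = caffe.Net('./%s/proto_test.prototxt' % folder,
                './%s/tmp.caffemodel' % folder, caffe.TEST)
# ... fill net.blobs['data'], ['cont'], ['img_feature'] and ['glove'] here ...
net.forward()
scores = net.blobs['prediction'].data   # (VAL_BATCH_SIZE, NUM_OUTPUT_UNITS)

# decode the top answer for the first sample via the saved answer vocabulary
adict = json.load(open('./%s/adict.json' % folder))
rev_adict = {v: k for k, v in adict.items()}
print rev_adict[int(scores[0].argmax())]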