├── imgs
│   ├── MFB-github.png
│   └── MFH-github.png
├── .gitignore
├── mfb_baseline
│   ├── config.py
│   ├── train_mfb_baseline.py
│   ├── utils.py
│   ├── vqa_data_layer.py
│   └── vqa_data_layer_kld.py
├── mfh_baseline
│   ├── config.py
│   ├── utils.py
│   ├── train_mfh_baseline.py
│   ├── vqa_data_layer.py
│   └── vqa_data_layer_kld.py
├── mfb_coatt_glove
│   ├── config.py
│   ├── utils.py
│   ├── vqa_data_layer.py
│   ├── vqa_data_layer_kld.py
│   └── train_mfb_coatt_glove.py
├── mfh_coatt_glove
│   ├── config.py
│   ├── utils.py
│   ├── vqa_data_layer.py
│   └── vqa_data_layer_kld.py
├── README.md
└── eval
    └── ensemble.py

/imgs/MFB-github.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yuzcccc/vqa-mfb/HEAD/imgs/MFB-github.png
--------------------------------------------------------------------------------
/imgs/MFH-github.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yuzcccc/vqa-mfb/HEAD/imgs/MFH-github.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | mfb_baseline/*.pyc
2 | mfb_coatt_glove/*.pyc
3 | eval/*.pyc
4 | 
5 | # Byte-compiled / optimized / DLL files
6 | __pycache__/
7 | *.py[cod]
8 | *$py.class
9 | 
10 | # C extensions
11 | *.so
12 | 
13 | # Distribution / packaging
14 | .Python
15 | env/
16 | build/
17 | develop-eggs/
18 | dist/
19 | downloads/
20 | eggs/
21 | .eggs/
22 | lib/
23 | lib64/
24 | parts/
25 | sdist/
26 | var/
27 | *.egg-info/
28 | .installed.cfg
29 | *.egg
30 | 
31 | # PyInstaller
32 | # Usually these files are written by a python script from a template
33 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
34 | *.manifest
35 | *.spec
36 | 
37 | # Installer logs
38 | pip-log.txt
39 | pip-delete-this-directory.txt
40 | 
41 | # Unit test / coverage reports
42 | htmlcov/
43 | .tox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *,cover
50 | .hypothesis/
51 | 
52 | # Translations
53 | *.mo
54 | *.pot
55 | 
56 | # Django stuff:
57 | *.log
58 | local_settings.py
59 | 
60 | # Flask stuff:
61 | instance/
62 | .webassets-cache
63 | 
64 | # Scrapy stuff:
65 | .scrapy
66 | 
67 | # Sphinx documentation
68 | docs/_build/
69 | 
70 | # PyBuilder
71 | target/
72 | 
73 | # IPython Notebook
74 | .ipynb_checkpoints
75 | 
76 | # pyenv
77 | .python-version
78 | 
79 | # celery beat schedule file
80 | celerybeat-schedule
81 | 
82 | # dotenv
83 | .env
84 | 
85 | # virtualenv
86 | venv/
87 | ENV/
88 | 
89 | # Spyder project settings
90 | .spyderproject
91 | 
92 | # Rope project settings
93 | .ropeproject
94 | 
--------------------------------------------------------------------------------
/mfb_baseline/config.py:
--------------------------------------------------------------------------------
1 | #training parameters
2 | TRAIN_GPU_ID = 0
3 | TEST_GPU_ID = 0
4 | BATCH_SIZE = 200
5 | VAL_BATCH_SIZE = 200
6 | PRINT_INTERVAL = 100
7 | VALIDATE_INTERVAL = 5000
8 | MAX_ITERATIONS = 100000
9 | RESTORE_ITER = 0 # iteration to restore. *.solverstate file is needed!
10 | # what data to use for training
11 | TRAIN_DATA_SPLITS = 'train'
12 | # what data to use for the vocabulary
13 | QUESTION_VOCAB_SPACE = 'train'
14 | ANSWER_VOCAB_SPACE = 'train' # test/test-dev/genome should not appear here
15 | 
16 | #network parameters
17 | NUM_OUTPUT_UNITS = 3000 # This is the answer vocabulary size
18 | MFB_FACTOR_NUM = 5
19 | MFB_OUT_DIM = 1000
20 | LSTM_UNIT_NUM = 1024
21 | JOINT_EMB_SIZE = MFB_FACTOR_NUM*MFB_OUT_DIM
22 | MAX_WORDS_IN_QUESTION = 15
23 | LSTM_DROPOUT_RATIO = 0.3
24 | MFB_DROPOUT_RATIO = 0.1
25 | 
26 | # vqa tools - get from https://github.com/VT-vision-lab/VQA
27 | VQA_TOOLS_PATH = '/home/yuz/data/VQA/PythonHelperTools'
28 | VQA_EVAL_TOOLS_PATH = '/home/yuz/data/VQA/PythonEvaluationTools'
29 | 
30 | # location of the data
31 | VQA_PREFIX = '/home/yuz/data/VQA'
32 | 
33 | feat = 'pool5'
34 | DATA_PATHS = {
35 |     'train': {
36 |         'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_train2014_questions.json',
37 |         'ans_file': VQA_PREFIX + '/Annotations/mscoco_train2014_annotations.json',
38 |         'features_prefix': VQA_PREFIX + '/Features/ms_coco/resnet_%s_bgrms_large/train2014/COCO_train2014_'%feat
39 |     },
40 |     'val': {
41 |         'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_val2014_questions.json',
42 |         'ans_file': VQA_PREFIX + '/Annotations/mscoco_val2014_annotations.json',
43 |         'features_prefix': VQA_PREFIX + '/Features/ms_coco/resnet_%s_bgrms_large/val2014/COCO_val2014_'%feat
44 |     },
45 |     'test-dev': {
46 |         'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test-dev2015_questions.json',
47 |         'features_prefix': VQA_PREFIX + '/Features/ms_coco/resnet_%s_bgrms_large/test2015/COCO_test2015_'%feat
48 |     },
49 |     'test': {
50 |         'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test2015_questions.json',
51 |         'features_prefix': VQA_PREFIX + '/Features/ms_coco/resnet_%s_bgrms_large/test2015/COCO_test2015_'%feat
52 |     },
53 |     'genome': {
54 |         'genome_file': VQA_PREFIX + '/Questions/OpenEnded_genome_train_questions.json',
55 |         'features_prefix': VQA_PREFIX + '/Features/genome/feat_resnet-152/resnet_%s_bgrms_large/'%feat
56 |     }
57 | }
58 | 
--------------------------------------------------------------------------------
/mfh_baseline/config.py:
--------------------------------------------------------------------------------
1 | #training parameters
2 | TRAIN_GPU_ID = 0
3 | TEST_GPU_ID = 0
4 | BATCH_SIZE = 200
5 | VAL_BATCH_SIZE = 200
6 | PRINT_INTERVAL = 100
7 | VALIDATE_INTERVAL = 5000
8 | MAX_ITERATIONS = 100000
9 | RESTORE_ITER = 0 # iteration to restore. *.solverstate file is needed!
10 | # what data to use for training
11 | TRAIN_DATA_SPLITS = 'train'
12 | # what data to use for the vocabulary
13 | QUESTION_VOCAB_SPACE = 'train'
14 | ANSWER_VOCAB_SPACE = 'train' # test/test-dev/genome should not appear here
15 | 
16 | #network parameters
17 | NUM_OUTPUT_UNITS = 3000 # This is the answer vocabulary size
18 | MFB_FACTOR_NUM = 5
19 | MFB_OUT_DIM = 1000
20 | LSTM_UNIT_NUM = 1024
21 | JOINT_EMB_SIZE = MFB_FACTOR_NUM*MFB_OUT_DIM
22 | MAX_WORDS_IN_QUESTION = 15
23 | LSTM_DROPOUT_RATIO = 0.3
24 | MFB_DROPOUT_RATIO = 0.1
25 | 
26 | # vqa tools - get from https://github.com/VT-vision-lab/VQA
27 | VQA_TOOLS_PATH = '/home/yuz/data/VQA/PythonHelperTools'
28 | VQA_EVAL_TOOLS_PATH = '/home/yuz/data/VQA/PythonEvaluationTools'
29 | 
30 | # location of the data
31 | VQA_PREFIX = '/home/yuz/data/VQA'
32 | 
33 | feat = 'pool5'
34 | DATA_PATHS = {
35 |     'train': {
36 |         'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_train2014_questions.json',
37 |         'ans_file': VQA_PREFIX + '/Annotations/mscoco_train2014_annotations.json',
38 |         'features_prefix': VQA_PREFIX + '/Features/ms_coco/resnet_%s_bgrms_large/train2014/COCO_train2014_'%feat
39 |     },
40 |     'val': {
41 |         'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_val2014_questions.json',
42 |         'ans_file': VQA_PREFIX + '/Annotations/mscoco_val2014_annotations.json',
43 |         'features_prefix': VQA_PREFIX + '/Features/ms_coco/resnet_%s_bgrms_large/val2014/COCO_val2014_'%feat
44 |     },
45 |     'test-dev': {
46 |         'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test-dev2015_questions.json',
47 |         'features_prefix': VQA_PREFIX + '/Features/ms_coco/resnet_%s_bgrms_large/test2015/COCO_test2015_'%feat
48 |     },
49 |     'test': {
50 |         'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test2015_questions.json',
51 |         'features_prefix': VQA_PREFIX + '/Features/ms_coco/resnet_%s_bgrms_large/test2015/COCO_test2015_'%feat
52 |     },
53 |     'genome': {
54 |         'genome_file': VQA_PREFIX + '/Questions/OpenEnded_genome_train_questions.json',
55 |         'features_prefix': VQA_PREFIX + '/Features/genome/feat_resnet-152/resnet_%s_bgrms_large/'%feat
56 |     }
57 | }
58 | 
--------------------------------------------------------------------------------
/mfb_coatt_glove/config.py:
--------------------------------------------------------------------------------
1 | #training parameters
2 | TRAIN_GPU_ID = 0
3 | TEST_GPU_ID = 0
4 | BATCH_SIZE = 64
5 | VAL_BATCH_SIZE = 32
6 | PRINT_INTERVAL = 100
7 | VALIDATE_INTERVAL = 5000
8 | MAX_ITERATIONS = 100000
9 | RESTORE_ITER = 0 # iteration to restore. *.solverstate file is needed!
10 | # what data to use for training
11 | TRAIN_DATA_SPLITS = 'train+val'
12 | # what data to use for the vocabulary
13 | QUESTION_VOCAB_SPACE = 'train+val'
14 | ANSWER_VOCAB_SPACE = 'train+val' # test/test-dev/genome should not appear here
15 | 
16 | #network parameters
17 | NUM_OUTPUT_UNITS = 3000 # This is the answer vocabulary size
18 | MFB_FACTOR_NUM = 5
19 | MFB_OUT_DIM = 1000
20 | LSTM_UNIT_NUM = 1024
21 | JOINT_EMB_SIZE = MFB_FACTOR_NUM*MFB_OUT_DIM
22 | NUM_IMG_GLIMPSE = 2
23 | NUM_QUESTION_GLIMPSE = 2
24 | IMG_FEAT_WIDTH = 14
25 | IMG_FEAT_SIZE = IMG_FEAT_WIDTH * IMG_FEAT_WIDTH
26 | MAX_WORDS_IN_QUESTION = 15
27 | LSTM_DROPOUT_RATIO = 0.3
28 | MFB_DROPOUT_RATIO = 0.1
29 | 
30 | # vqa tools - get from https://github.com/VT-vision-lab/VQA
31 | VQA_TOOLS_PATH = '/home/yuz/data/VQA/PythonHelperTools'
32 | VQA_EVAL_TOOLS_PATH = '/home/yuz/data/VQA/PythonEvaluationTools'
33 | 
34 | # location of the data
35 | VQA_PREFIX = '/home/yuz/data/VQA'
36 | 
37 | feat = 'res5c'
38 | DATA_PATHS = {
39 |     'train': {
40 |         'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_train2014_questions.json',
41 |         'ans_file': VQA_PREFIX + '/Annotations/mscoco_train2014_annotations.json',
42 |         'features_prefix': VQA_PREFIX + '/Features/ms_coco/resnet_%s_bgrms_large/train2014/COCO_train2014_'%feat
43 |     },
44 |     'val': {
45 |         'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_val2014_questions.json',
46 |         'ans_file': VQA_PREFIX + '/Annotations/mscoco_val2014_annotations.json',
47 |         'features_prefix': VQA_PREFIX + '/Features/ms_coco/resnet_%s_bgrms_large/val2014/COCO_val2014_'%feat
48 |     },
49 |     'test-dev': {
50 |         'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test-dev2015_questions.json',
51 |         'features_prefix': VQA_PREFIX + '/Features/ms_coco/resnet_%s_bgrms_large/test2015/COCO_test2015_'%feat
52 |     },
53 |     'test': {
54 |         'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test2015_questions.json',
55 |         'features_prefix': VQA_PREFIX + '/Features/ms_coco/resnet_%s_bgrms_large/test2015/COCO_test2015_'%feat
56 |     },
57 |     'genome': {
58 |         'genome_file': VQA_PREFIX + '/Questions/OpenEnded_genome_train_questions.json',
59 |         'features_prefix': VQA_PREFIX + '/Features/genome/feat_resnet-152/resnet_%s_bgrms_large/'%feat
60 |     }
61 | }
62 | 
--------------------------------------------------------------------------------
/mfh_coatt_glove/config.py:
--------------------------------------------------------------------------------
1 | #training parameters
2 | TRAIN_GPU_ID = 0
3 | TEST_GPU_ID = 0
4 | BATCH_SIZE = 64
5 | VAL_BATCH_SIZE = 32
6 | PRINT_INTERVAL = 100
7 | VALIDATE_INTERVAL = 10000
8 | MAX_ITERATIONS = 100000
9 | RESTORE_ITER = 0 # iteration to restore. *.solverstate file is needed!
10 | # what data to use for training
11 | TRAIN_DATA_SPLITS = 'train+val'
12 | # what data to use for the vocabulary
13 | QUESTION_VOCAB_SPACE = 'train+val'
14 | ANSWER_VOCAB_SPACE = 'train+val' # test/test-dev/genome should not appear here
15 | 
16 | #network parameters
17 | NUM_OUTPUT_UNITS = 3000 # This is the answer vocabulary size
18 | MFB_FACTOR_NUM = 5
19 | MFB_OUT_DIM = 1000
20 | LSTM_UNIT_NUM = 1024
21 | JOINT_EMB_SIZE = MFB_FACTOR_NUM*MFB_OUT_DIM
22 | NUM_IMG_GLIMPSE = 2
23 | NUM_QUESTION_GLIMPSE = 2
24 | IMG_FEAT_WIDTH = 14
25 | IMG_FEAT_SIZE = IMG_FEAT_WIDTH * IMG_FEAT_WIDTH
26 | MAX_WORDS_IN_QUESTION = 15
27 | LSTM_DROPOUT_RATIO = 0.3
28 | MFB_DROPOUT_RATIO = 0.1
29 | 
30 | # vqa tools - get from https://github.com/VT-vision-lab/VQA
31 | VQA_TOOLS_PATH = '/home/yuz/data/VQA/PythonHelperTools'
32 | VQA_EVAL_TOOLS_PATH = '/home/yuz/data/VQA/PythonEvaluationTools'
33 | 
34 | # location of the data
35 | VQA_PREFIX = '/home/yuz/data/VQA'
36 | 
37 | feat = 'res5c'
38 | DATA_PATHS = {
39 |     'train': {
40 |         'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_train2014_questions.json',
41 |         'ans_file': VQA_PREFIX + '/Annotations/mscoco_train2014_annotations.json',
42 |         'features_prefix': VQA_PREFIX + '/Features/ms_coco/resnet_%s_bgrms_large/train2014/COCO_train2014_'%feat
43 |     },
44 |     'val': {
45 |         'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_val2014_questions.json',
46 |         'ans_file': VQA_PREFIX + '/Annotations/mscoco_val2014_annotations.json',
47 |         'features_prefix': VQA_PREFIX + '/Features/ms_coco/resnet_%s_bgrms_large/val2014/COCO_val2014_'%feat
48 |     },
49 |     'test-dev': {
50 |         'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test-dev2015_questions.json',
51 |         'features_prefix': VQA_PREFIX + '/Features/ms_coco/resnet_%s_bgrms_large/test2015/COCO_test2015_'%feat
52 |     },
53 |     'test': {
54 |         'ques_file': VQA_PREFIX + '/Questions/OpenEnded_mscoco_test2015_questions.json',
55 |         'features_prefix': VQA_PREFIX + '/Features/ms_coco/resnet_%s_bgrms_large/test2015/COCO_test2015_'%feat
56 |     },
57 |     'genome': {
58 |         'genome_file': VQA_PREFIX + '/Questions/OpenEnded_genome_train_questions.json',
59 |         'features_prefix': VQA_PREFIX + '/Features/genome/feat_resnet-152/resnet_%s_bgrms_large/'%feat
60 |     }
61 | }
62 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # MFB and MFH for VQA
2 | 
3 | **This project is deprecated! The Pytorch implementation of MFB(MFH)+CoAtt with pre-trained models, along with several state-of-the-art VQA models, is maintained in our [OpenVQA](https://github.com/MILVLG/openvqa) project, which is much more convenient to use!**
4 | 
5 | This project is the implementation of the papers *[Multi-modal Factorized Bilinear Pooling with Co-Attention Learning for Visual Question Answering (MFB)](https://arxiv.org/abs/1708.01471)* and *[Beyond Bilinear: Generalized Multi-modal Factorized High-order Pooling for Visual Question Answering (MFH)](https://arxiv.org/abs/1708.03619)*. Compared with existing state-of-the-art approaches such as MCB and MLB, our MFB models achieved superior performance on the large-scale VQA-1.0 and VQA-2.0 datasets. Moreover, MFH, the high-order extension of MFB, is also provided and reports better VQA performance. The MFB(MFH)+CoAtt network architecture for VQA is illustrated in Figure 1.
6 | 
7 | ![Figure 1: The MFB+CoAtt Network architecture for VQA.](https://github.com/yuzcccc/mfb/raw/master/imgs/MFB-github.png)
8 | 
Figure 1: The MFB+CoAtt Network architecture for VQA.
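To make the fusion in Figure 1 concrete, the following is a minimal NumPy sketch of the MFB step that `mfb_baseline/train_mfb_baseline.py` assembles from Caffe layers (InnerProduct, element-wise Eltwise product, sum Pooling, SignedSqrt, L2Normalize). The names `q`, `v`, `W_q`, and `W_i` are illustrative, and the dropout applied between the element-wise product and the sum pooling is omitted:

```python
import numpy as np

def mfb_fuse(q, v, W_q, W_i, k=5, o=1000):
    """Fuse a question vector q and an image vector v with MFB.

    W_q and W_i project q and v into the joint space of size k*o
    (JOINT_EMB_SIZE = MFB_FACTOR_NUM * MFB_OUT_DIM in config.py).
    """
    joint = W_q.dot(q) * W_i.dot(v)          # element-wise product, shape (k*o,)
    z = joint.reshape(o, k).sum(axis=1)      # sum-pool over the k factors -> (o,)
    z = np.sign(z) * np.sqrt(np.abs(z))      # signed square root (power normalization)
    return z / (np.linalg.norm(z) + 1e-12)   # L2 normalization
```

With the default `MFB_FACTOR_NUM = 5` and `MFB_OUT_DIM = 1000` from the `config.py` files, the joint space has `JOINT_EMB_SIZE = 5000` dimensions and the fused feature fed to the answer classifier has 1000.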
9 | 
10 | ## Update Dec. 2nd, 2017
11 | The 3rd-party pytorch implementation of MFB(MFH) is released [here](https://github.com/asdf0982/vqa-mfb.pytorch). Great thanks, Liam!
12 | 
13 | ## Update Sep. 5th, 2017
14 | Using the Bottom-up and Top-Down (BUTD) image features (the model with adaptive K ranging from 10 to 100) [here](https://github.com/yuzcccc/bottom-up-attention), our single MFH+CoAtt+GloVe model achieved the overall accuracy **68.76%** on the test-dev set of the VQA-2.0 dataset. With an ensemble of 8 models, we achieved the new state-of-the-art performance on the VQA-2.0 dataset's [leaderboard](https://evalai.cloudcv.org/web/challenges/challenge-page/1/leaderboard) with the overall accuracy **70.92%**.
15 | 
16 | ## Update Aug. 1st, 2017
17 | Our solution for the VQA Challenge 2017 is updated!
18 | 
19 | We proposed a **high-order** extension of MFB, i.e., the Multi-modal Factorized High-order Pooling (MFH). See the flowchart in Figure 2 and the implementations in the `mfh_baseline` and `mfh_coatt_glove` folders. With an ensemble of 9 MFH+CoAtt+GloVe(+VG) models, **we won the 2nd place (tied with another team) in the VQA Challenge 2017**. The detailed information can be found in our paper (the second paper in the CITATION section at the bottom of this page).
20 | 
21 | ![](https://github.com/yuzcccc/mfb/raw/master/imgs/MFH-github.png)
22 | 
Figure 2: The high-order MFH model which consists of p MFB blocks (without sharing parameters).
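Each MFH block behaves like an MFB block, except that its element-wise product is additionally multiplied by the previous block's (pre-pooling) product, and the per-block outputs are concatenated. Below is a rough NumPy sketch of the p = 2 cascade built in `mfh_baseline/train_mfh_baseline.py`; the weight matrices are illustrative and dropout is again omitted:

```python
import numpy as np

def _sumpool_sqrt_l2(joint, k=5, o=1000):
    z = joint.reshape(o, k).sum(axis=1)      # sum-pool over the k factors
    z = np.sign(z) * np.sqrt(np.abs(z))      # signed square root
    return z / (np.linalg.norm(z) + 1e-12)   # L2 normalization

def mfh_fuse(q, v, weights, k=5, o=1000):
    """weights: list of (W_q, W_i) pairs, one pair per MFB block."""
    prev = 1.0                               # neutral factor for the first block
    outputs = []
    for W_q, W_i in weights:
        joint = W_q.dot(q) * W_i.dot(v) * prev   # 'exp' stage of this block
        outputs.append(_sumpool_sqrt_l2(joint, k, o))
        prev = joint                         # feeds the next block's product
    return np.concatenate(outputs)           # p * MFB_OUT_DIM dimensions
```

This mirrors the Caffe graph below, where the second block's Eltwise layer (`mfb_iq_o3_eltwise`) also consumes `mfb_iq_o2_drop`, and the classifier takes the concatenation `mfb_o23_l2`.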
23 | 
24 | ## Prerequisites
25 | 
26 | Our code is implemented based on the high-quality [vqa-mcb](https://github.com/akirafukui/vqa-mcb) project. The data preprocessing and other prerequisites are the same as theirs. Before running our scripts to train or test the MFB model, see the `Prerequisites` and `Data Preprocessing` sections in the README of the vqa-mcb project first.
27 | 
28 | - The Caffe version required for our MFB is slightly different from that for MCB. We add some layers, e.g., sum pooling, permute and KLD loss layers, to the `feature/20160617_cb_softattention` branch of Caffe for MCB. Please check out our Caffe version [here](https://github.com/yuzcccc/caffe) and compile it. **Note that CuDNN is currently not compatible with sum pooling; you should switch it off to run the code correctly**.
29 | 
30 | ## Pretrained Models
31 | 
32 | We release the pretrained **single model** "MFB(or MFH)+CoAtt+GloVe+VG" used in the papers. To the best of our knowledge, our MFH+CoAtt+GloVe+VG model reports the best single-model result (test-dev) on both the VQA-1.0 and VQA-2.0 datasets (train + val + visual genome). The corresponding results are shown in the table below. The results JSON files (results.zip for VQA-1.0) are also included in the model folders, and can be uploaded to the evaluation servers directly. **Note that the models are trained with an old version of the GloVe embeddings in spacy. If you use the latest one, the embeddings may be inconsistent, leading to inferior performance. I suggest training the model from scratch by yourself.**
33 | 
34 | | Datasets\Models | MCB | MFB | MFH | MFH (BUTD img features) |
35 | |:-----------------:|:-----------------:|:-----------------:|:-----------------:|:-----------------:|
36 | | VQA-1.0 | 65.38% | 66.87% [BaiduYun](http://pan.baidu.com/s/1o8LURge) | 67.72% [BaiduYun](http://pan.baidu.com/s/1c2neUv2) or [Dropbox](https://www.dropbox.com/s/qh1swgsq0na1bua/VQA1.0-mfh-coatt-glove-vg.zip?dl=0) | **69.82%** |
37 | | VQA-2.0 | 62.33%<sup>1</sup> | 65.09% [BaiduYun](http://pan.baidu.com/s/1pLjtkSV) | 66.12% [BaiduYun](http://pan.baidu.com/s/1pLLUvIN) or [Dropbox](https://www.dropbox.com/s/zld15405a69how6/VQA2.0-mfh-coatt-glove-vg.zip?dl=0) | **68.76%**<sup>2</sup> |
38 | 
39 | <sup>1</sup> the MCB result on VQA-2.0 is provided by the VQA Challenge organizer and does not introduce the GloVe embedding.
40 | 
41 | <sup>2</sup> overall: 68.76, yes/no: 84.27, num: 49.56, other: 59.89
42 | 
43 | ## Training from Scratch
44 | 
45 | We provide the scripts for training two MFB models from scratch, in the `mfb_baseline` and `mfb_coatt_glove` folders. Simply run the python scripts `train_*.py` to train the models from scratch.
46 | 
47 | - Most of the hyper-parameters and configurations, with comments, are defined in the `config.py` file.
48 | - The solver configurations are defined in the `get_solver` function in the `train_*.py` scripts.
49 | - A pretrained GloVe word embedding model (via the spacy library) is required to train the `mfb_coatt_glove` model. The installation instructions for spacy and the GloVe model can be found [here](https://github.com/akirafukui/vqa-mcb/tree/master/train).
50 | 
51 | ## Evaluation
52 | 
53 | To generate an answers JSON file in the format expected by the VQA evaluation code and VQA test server, you can use `eval/ensemble.py`. This code can also ensemble multiple models by averaging their pre-softmax outputs (a minimal sketch of this averaging rule is given after the Licence section below). Running `python ensemble.py` will print out a help message telling you what arguments to use.
54 | 
55 | ## Licence
56 | 
57 | This code is distributed under the MIT LICENSE. The released models are only allowed for non-commercial use.
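Following up on the Evaluation section: the docstring of `eval/ensemble.py` states that the ensemble averages the pre-softmax output of each net, with an optional per-model `model_weight` (defaulting to 1.0) read from `auxiliary.json`. The sketch below only illustrates that rule; the array names, and the normalization by the sum of the weights, are assumptions rather than the script's actual code:

```python
import numpy as np

def ensemble_answers(scores_per_model, weights=None):
    """scores_per_model: list of (num_questions, num_answers) pre-softmax arrays."""
    if weights is None:
        weights = [1.0] * len(scores_per_model)   # matches the model_weight default
    avg = sum(w * s for w, s in zip(weights, scores_per_model)) / float(sum(weights))
    return avg.argmax(axis=1)                     # predicted answer index per question
```

For the averaged scores to be meaningful, the answer vocabularies must agree across models, which is exactly what `verify_all` in `eval/ensemble.py` checks before evaluation.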
58 | 
59 | ## Citation
60 | 
61 | If the code is helpful for your research, please cite
62 | 
63 | ```
64 | @article{yu2017mfb,
65 |   title={Multi-modal Factorized Bilinear Pooling with Co-Attention Learning for Visual Question Answering},
66 |   author={Yu, Zhou and Yu, Jun and Fan, Jianping and Tao, Dacheng},
67 |   journal={IEEE International Conference on Computer Vision (ICCV)},
68 |   pages={1839--1848},
69 |   year={2017}
70 | }
71 | 
72 | @article{yu2018beyond,
73 |   title={Beyond Bilinear: Generalized Multimodal Factorized High-Order Pooling for Visual Question Answering},
74 |   author={Yu, Zhou and Yu, Jun and Xiang, Chenchao and Fan, Jianping and Tao, Dacheng},
75 |   journal={IEEE Transactions on Neural Networks and Learning Systems},
76 |   volume={29},
77 |   number={12},
78 |   pages={5947--5959},
79 |   year={2018}
80 | }
81 | ```
82 | 
83 | ## Contact
84 | 
85 | Zhou Yu  [yuz(AT)hdu.edu.cn]
86 | 
--------------------------------------------------------------------------------
/mfb_baseline/train_mfb_baseline.py:
--------------------------------------------------------------------------------
1 | import matplotlib
2 | matplotlib.use('Agg')
3 | import os
4 | import sys
5 | import numpy as np
6 | import json
7 | import matplotlib.pyplot as plt
8 | 
9 | import caffe
10 | from caffe import layers as L
11 | from caffe import params as P
12 | from caffe.proto import caffe_pb2
13 | 
14 | from vqa_data_layer_kld import VQADataProvider
15 | from utils import exec_validation, drawgraph
16 | import config
17 | import time
18 | 
19 | def get_solver(folder):
20 |     s = caffe_pb2.SolverParameter()
21 |     s.train_net = './%s/proto_train.prototxt'%folder
22 |     s.snapshot = int(config.VALIDATE_INTERVAL)
23 |     s.snapshot_prefix = './%s/'%folder
24 |     s.max_iter = int(config.MAX_ITERATIONS)
25 |     s.display = int(config.VALIDATE_INTERVAL)
26 |     s.type = 'Adam'
27 |     s.stepsize = int(config.MAX_ITERATIONS*0.4)
28 |     s.gamma = 0.5
29 |     s.lr_policy = "step"
30 |     s.base_lr = 0.0007
31 |     s.momentum = 0.9
32 |     s.momentum2 = 0.999
33 |     s.weight_decay = 0.000
34 |     s.clip_gradients = 10
35 |     return s
36 | 
37 | def get_auxiliary_json():
38 |     aux = {}
39 |     aux["batch_size"] = int(config.VAL_BATCH_SIZE)
40 |     aux["data_shape"] = [2048]
41 |     aux["img_feature_prefix"] = config.DATA_PATHS['test']['features_prefix']
42 |     aux["glove"] = False
43 |     return aux
44 | 
45 | 
46 | def mfb_baseline(mode, batchsize, T, question_vocab_size, folder):
47 |     n = caffe.NetSpec()
48 |     mode_str = json.dumps({'mode':mode, 'batchsize':batchsize,'folder':folder})
49 |     if mode == 'val':
50 |         n.data, n.cont, n.img_feature, n.label = L.Python( \
51 |             module='vqa_data_layer', layer='VQADataProviderLayer', \
52 |             param_str=mode_str, ntop=4 )
53 |     else:
54 |         n.data, n.cont, n.img_feature, n.label = L.Python(\
55 |             module='vqa_data_layer_kld', layer='VQADataProviderLayer', \
56 |             param_str=mode_str, ntop=4 )
57 |     n.embed = L.Embed(n.data, input_dim=question_vocab_size, num_output=300, \
58 |         weight_filler=dict(type='xavier'))
59 |     n.embed_tanh = L.TanH(n.embed)
60 | 
61 |     # LSTM
62 |     n.lstm1 = L.LSTM(\
63 |         n.embed_tanh, n.cont,\
64 |         recurrent_param=dict(\
65 |             num_output=config.LSTM_UNIT_NUM,\
66 |             weight_filler=dict(type='xavier')))
67 |     tops1 = L.Slice(n.lstm1, ntop=config.MAX_WORDS_IN_QUESTION, slice_param={'axis':0})
68 |     for i in xrange(config.MAX_WORDS_IN_QUESTION-1):
69 |         n.__setattr__('slice_first'+str(i), tops1[int(i)])
70 |         n.__setattr__('silence_data_first'+str(i), L.Silence(tops1[int(i)],ntop=0))
71 |     n.lstm1_out = tops1[config.MAX_WORDS_IN_QUESTION-1]
72 | 
n.lstm1_reshaped = L.Reshape(n.lstm1_out,\ 73 | reshape_param=dict(\ 74 | shape=dict(dim=[-1,1024]))) 75 | n.q_feat = L.Dropout(n.lstm1_reshaped,dropout_param={'dropout_ratio':config.LSTM_DROPOUT_RATIO}) 76 | ''' 77 | Coarse Image-Question MFB fusion 78 | ''' 79 | 80 | n.mfb_q_proj = L.InnerProduct(n.q_feat, num_output=config.JOINT_EMB_SIZE, 81 | weight_filler=dict(type='xavier')) 82 | n.mfb_i_proj = L.InnerProduct(n.img_feature, num_output=config.JOINT_EMB_SIZE, 83 | weight_filler=dict(type='xavier')) 84 | n.mfb_iq_eltwise = L.Eltwise(n.mfb_q_proj, n.mfb_i_proj, eltwise_param=dict(operation=0)) 85 | n.mfb_iq_drop = L.Dropout(n.mfb_iq_eltwise, dropout_param={'dropout_ratio':config.MFB_DROPOUT_RATIO}) 86 | n.mfb_iq_resh = L.Reshape(n.mfb_iq_drop, reshape_param=dict(shape=dict(dim=[-1,1,config.MFB_OUT_DIM,config.MFB_FACTOR_NUM]))) 87 | n.mfb_iq_sumpool = L.Pooling(n.mfb_iq_resh, pool=P.Pooling.SUM, \ 88 | pooling_param=dict(kernel_w=config.MFB_FACTOR_NUM, kernel_h=1)) 89 | n.mfb_out = L.Reshape(n.mfb_iq_sumpool,\ 90 | reshape_param=dict(shape=dict(dim=[-1,config.MFB_OUT_DIM]))) 91 | n.mfb_sign_sqrt = L.SignedSqrt(n.mfb_out) 92 | n.mfb_l2 = L.L2Normalize(n.mfb_sign_sqrt) 93 | 94 | n.prediction = L.InnerProduct(n.mfb_l2, num_output=config.NUM_OUTPUT_UNITS, 95 | weight_filler=dict(type='xavier')) 96 | if mode == 'val': 97 | n.loss = L.SoftmaxWithLoss(n.prediction, n.label) 98 | else: 99 | n.loss = L.SoftmaxKLDLoss(n.prediction, n.label) 100 | return n.to_proto() 101 | 102 | def make_answer_vocab(adic, vocab_size): 103 | """ 104 | Returns a dictionary that maps words to indices. 105 | """ 106 | adict = {'':0} 107 | nadict = {'':1000000} 108 | vid = 1 109 | for qid in adic.keys(): 110 | answer_obj = adic[qid] 111 | answer_list = [ans['answer'] for ans in answer_obj] 112 | 113 | for q_ans in answer_list: 114 | # create dict 115 | if adict.has_key(q_ans): 116 | nadict[q_ans] += 1 117 | else: 118 | nadict[q_ans] = 1 119 | adict[q_ans] = vid 120 | vid +=1 121 | 122 | # debug 123 | nalist = [] 124 | for k,v in sorted(nadict.items(), key=lambda x:x[1]): 125 | nalist.append((k,v)) 126 | 127 | # remove words that appear less than once 128 | n_del_ans = 0 129 | n_valid_ans = 0 130 | adict_nid = {} 131 | for i, w in enumerate(nalist[:-vocab_size]): 132 | del adict[w[0]] 133 | n_del_ans += w[1] 134 | for i, w in enumerate(nalist[-vocab_size:]): 135 | n_valid_ans += w[1] 136 | adict_nid[w[0]] = i 137 | 138 | return adict_nid 139 | 140 | def make_question_vocab(qdic): 141 | """ 142 | Returns a dictionary that maps words to indices. 143 | """ 144 | vdict = {'':0} 145 | vid = 1 146 | for qid in qdic.keys(): 147 | # sequence to list 148 | q_str = qdic[qid]['qstr'] 149 | q_list = VQADataProvider.seq_to_list(q_str) 150 | 151 | # create dict 152 | for w in q_list: 153 | if not vdict.has_key(w): 154 | vdict[w] = vid 155 | vid +=1 156 | 157 | return vdict 158 | 159 | def make_vocab_files(): 160 | """ 161 | Produce the question and answer vocabulary files. 
162 | """ 163 | print 'making question vocab...', config.QUESTION_VOCAB_SPACE 164 | qdic, _ = VQADataProvider.load_data(config.QUESTION_VOCAB_SPACE) 165 | question_vocab = make_question_vocab(qdic) 166 | print 'making answer vocab...', config.ANSWER_VOCAB_SPACE 167 | _, adic = VQADataProvider.load_data(config.ANSWER_VOCAB_SPACE) 168 | answer_vocab = make_answer_vocab(adic, config.NUM_OUTPUT_UNITS) 169 | return question_vocab, answer_vocab 170 | 171 | def main(): 172 | folder = 'mfb_baseline_%s'%(config.TRAIN_DATA_SPLITS) 173 | if not os.path.exists('./%s'%folder): 174 | os.makedirs('./%s'%folder) 175 | 176 | question_vocab, answer_vocab = {}, {} 177 | if os.path.exists('./%s/vdict.json'%folder) and os.path.exists('./%s/adict.json'%folder): 178 | print 'restoring vocab' 179 | with open('./%s/vdict.json'%folder,'r') as f: 180 | question_vocab = json.load(f) 181 | with open('./%s/adict.json'%folder,'r') as f: 182 | answer_vocab = json.load(f) 183 | else: 184 | question_vocab, answer_vocab = make_vocab_files() 185 | with open('./%s/vdict.json'%folder,'w') as f: 186 | json.dump(question_vocab, f) 187 | with open('./%s/adict.json'%folder,'w') as f: 188 | json.dump(answer_vocab, f) 189 | 190 | print 'question vocab size:', len(question_vocab) 191 | print 'answer vocab size:', len(answer_vocab) 192 | 193 | with open('./%s/proto_train.prototxt'%folder, 'w') as f: 194 | f.write(str(mfb_baseline(config.TRAIN_DATA_SPLITS, config.BATCH_SIZE, \ 195 | config.MAX_WORDS_IN_QUESTION, len(question_vocab), folder))) 196 | 197 | with open('./%s/proto_test.prototxt'%folder, 'w') as f: 198 | f.write(str(mfb_baseline('val', config.VAL_BATCH_SIZE, \ 199 | config.MAX_WORDS_IN_QUESTION, len(question_vocab), folder))) 200 | 201 | with open('./%s/solver.prototxt'%folder, 'w') as f: 202 | f.write(str(get_solver(folder))) 203 | with open('./%s/auxiliary.json'%folder, 'w') as f: 204 | json.dump(get_auxiliary_json(),f, indent=2) 205 | 206 | caffe.set_device(config.TRAIN_GPU_ID) 207 | caffe.set_mode_gpu() 208 | solver = caffe.get_solver('./%s/solver.prototxt'%folder) 209 | 210 | train_loss = np.zeros(config.MAX_ITERATIONS+1) 211 | results = [] 212 | 213 | if config.RESTORE_ITER: 214 | restore_iter = config.RESTORE_ITER 215 | solver.restore('./%s/_iter_%d.solverstate'%(folder,restore_iter)) 216 | else: 217 | restore_iter = 0 218 | 219 | start = time.clock() 220 | for it in range(restore_iter, config.MAX_ITERATIONS+1): 221 | solver.step(1) 222 | 223 | # store the train loss 224 | train_loss[it] = solver.net.blobs['loss'].data 225 | 226 | if it % config.PRINT_INTERVAL == 0 and it != 0: 227 | elapsed = (time.clock() - start) 228 | print 'Iteration:', it 229 | c_mean_loss = train_loss[it-config.PRINT_INTERVAL:it].mean() 230 | print 'Train loss:', c_mean_loss, ' Elapsed seconds:', elapsed 231 | start = time.clock() 232 | if it % config.VALIDATE_INTERVAL == 0 and it != restore_iter: 233 | model_name = './%s/tmp.caffemodel'%(folder) 234 | solver.net.save(model_name) 235 | print 'Validating...' 236 | ''' 237 | # for test-dev /test set. the json file will be generated under the file 238 | exec_validation(config.TEST_GPU_ID, 'test-dev', model_name, it=it, folder=folder) 239 | caffe.set_device(config.TRAIN_GPU_ID) 240 | ''' 241 | #for val set. 
the accuracy will be computed and ploted 242 | test_loss, acc_overall, acc_per_ques, acc_per_ans = exec_validation(config.TEST_GPU_ID, 'val', model_name, it=it, folder=folder) 243 | caffe.set_device(config.TRAIN_GPU_ID) 244 | print 'Test loss:', test_loss 245 | print 'Accuracy:', acc_overall 246 | print 'Test per ans', acc_per_ans 247 | results.append([it, c_mean_loss, test_loss, acc_overall, acc_per_ques, acc_per_ans]) 248 | best_result_idx = np.array([x[3] for x in results]).argmax() 249 | print 'Best accuracy of', results[best_result_idx][3], 'was at iteration', results[best_result_idx][0] 250 | drawgraph(results,folder,config.MFB_FACTOR_NUM,config.MFB_OUT_DIM,prefix='mfb_baseline') 251 | 252 | if __name__ == '__main__': 253 | main() 254 | -------------------------------------------------------------------------------- /mfb_baseline/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import os 4 | import sys 5 | import json 6 | import re 7 | import shutil 8 | from PIL import Image 9 | from PIL import ImageFont, ImageDraw 10 | 11 | import caffe 12 | from caffe import layers as L 13 | from caffe import params as P 14 | 15 | from vqa_data_layer import VQADataProvider, VQADataProviderLayer 16 | 17 | import config 18 | sys.path.append(config.VQA_TOOLS_PATH) 19 | sys.path.append(config.VQA_EVAL_TOOLS_PATH) 20 | 21 | from vqaTools.vqa import VQA 22 | from vqaEvaluation.vqaEval import VQAEval 23 | 24 | def visualize_failures(stat_list,mode): 25 | 26 | def save_qtype(qtype_list, save_filename, mode): 27 | 28 | if mode == 'val': 29 | savepath = os.path.join('./eval', save_filename) 30 | # TODO 31 | img_pre = '/home/dhpseth/vqa/02_tools/VQA/Images/val2014' 32 | elif mode == 'test-dev': 33 | savepath = os.path.join('./test-dev', save_filename) 34 | # TODO 35 | img_pre = '/home/dhpseth/vqa/02_tools/VQA/Images/test2015' 36 | elif mode == 'test': 37 | savepath = os.path.join('./test', save_filename) 38 | # TODO 39 | img_pre = '/home/dhpseth/vqa/02_tools/VQA/Images/test2015' 40 | else: 41 | raise Exception('Unsupported mode') 42 | if os.path.exists(savepath): shutil.rmtree(savepath) 43 | if not os.path.exists(savepath): os.makedirs(savepath) 44 | 45 | for qt in qtype_list: 46 | count = 0 47 | for t_question in stat_list: 48 | #print count, t_question 49 | if count < 40/len(qtype_list): 50 | t_question_list = t_question['q_list'] 51 | saveflag = False 52 | #print 'debug****************************' 53 | #print qt 54 | #print t_question_list 55 | #print t_question_list[0] == qt[0] 56 | #print t_question_list[1] == qt[1] 57 | if t_question_list[0] == qt[0] and t_question_list[1] == qt[1]: 58 | saveflag = True 59 | else: 60 | saveflag = False 61 | 62 | if saveflag == True: 63 | t_iid = t_question['iid'] 64 | if mode == 'val': 65 | t_img = Image.open(os.path.join(img_pre, \ 66 | 'COCO_val2014_' + str(t_iid).zfill(12) + '.jpg')) 67 | elif mode == 'test-dev' or 'test': 68 | t_img = Image.open(os.path.join(img_pre, \ 69 | 'COCO_test2015_' + str(t_iid).zfill(12) + '.jpg')) 70 | 71 | # for caption 72 | #print t_iid 73 | #annIds = caps.getAnnIds(t_iid) 74 | #anns = caps.loadAnns(annIds) 75 | #cap_list = [ann['caption'] for ann in anns] 76 | ans_list = t_question['ans_list'] 77 | draw = ImageDraw.Draw(t_img) 78 | for i in range(len(ans_list)): 79 | try: 80 | draw.text((10,10*i), str(ans_list[i])) 81 | except: 82 | pass 83 | 84 | ans = t_question['answer'] 85 | pred = t_question['pred'] 86 | if ans == -1: 87 | pre 
= '' 88 | elif ans == pred: 89 | pre = 'correct ' 90 | else: 91 | pre = 'failure ' 92 | #print ' aaa ', ans, pred 93 | ans = re.sub( '/', ' ', str(ans)) 94 | pred = re.sub( '/', ' ', str(pred)) 95 | img_title = pre + str(' '.join(t_question_list)) + '. a_' + \ 96 | str(ans) + ' p_' + str(pred) + '.png' 97 | count += 1 98 | print os.path.join(savepath,img_title) 99 | t_img.save(os.path.join(savepath,img_title)) 100 | 101 | print 'saving whatis' 102 | qt_color_list = [['what','color']] 103 | save_qtype(qt_color_list, 'colors', mode) 104 | 105 | print 'saving whatis' 106 | qt_whatis_list = [['what','is'],['what','kind'],['what','are']] 107 | save_qtype(qt_whatis_list, 'whatis', mode) 108 | 109 | print 'saving is' 110 | qt_is_list = [['is','the'], ['is','this'],['is','there']] 111 | save_qtype(qt_is_list, 'is', mode) 112 | 113 | print 'saving how many' 114 | qt_howmany_list =[['how','many']] 115 | save_qtype(qt_howmany_list, 'howmany', mode) 116 | 117 | def exec_validation(device_id, mode, model_name, folder, it='', visualize=False): 118 | 119 | caffe.set_device(device_id) 120 | caffe.set_mode_gpu() 121 | net = caffe.Net('./%s/proto_test.prototxt'%folder,model_name,caffe.TEST) 122 | 123 | dp = VQADataProvider(mode=mode,batchsize=config.VAL_BATCH_SIZE,folder=folder) 124 | total_questions = len(dp.getQuesIds()) 125 | epoch = 0 126 | 127 | pred_list = [] 128 | testloss_list = [] 129 | stat_list = [] 130 | 131 | while epoch == 0: 132 | t_word, t_cont, t_img_feature, t_answer, t_qid_list, t_iid_list, epoch = dp.get_batch_vec() 133 | net.blobs['data'].data[...] = np.transpose(t_word,(1,0)) 134 | net.blobs['cont'].data[...] = np.transpose(t_cont,(1,0)) 135 | net.blobs['img_feature'].data[...] = t_img_feature 136 | net.blobs['label'].data[...] = t_answer 137 | net.forward() 138 | t_pred_list = net.blobs['prediction'].data.argmax(axis=1) 139 | t_pred_str = [dp.vec_to_answer(pred_symbol) for pred_symbol in t_pred_list] 140 | testloss_list.append(net.blobs['loss'].data) 141 | for qid, iid, ans, pred in zip(t_qid_list, t_iid_list, t_answer.tolist(), t_pred_str): 142 | #pred_list.append({u'answer':pred, u'question_id': int(dp.getStrippedQuesId(qid))}) 143 | pred_list.append((pred,int(dp.getStrippedQuesId(qid)))) 144 | if visualize: 145 | q_list = dp.seq_to_list(dp.getQuesStr(qid)) 146 | if mode == 'test-dev' or 'test': 147 | ans_str = '' 148 | ans_list = ['']*10 149 | else: 150 | ans_str = dp.vec_to_answer(ans) 151 | ans_list = [ dp.getAnsObj(qid)[i]['answer'] for i in xrange(10)] 152 | stat_list.append({\ 153 | 'qid' : qid, 154 | 'q_list' : q_list, 155 | 'iid' : iid, 156 | 'answer': ans_str, 157 | 'ans_list': ans_list, 158 | 'pred' : pred }) 159 | percent = 100 * float(len(pred_list)) / total_questions 160 | sys.stdout.write('\r' + ('%.2f' % percent) + '%') 161 | sys.stdout.flush() 162 | 163 | 164 | print 'Deduping arr of len', len(pred_list) 165 | deduped = [] 166 | seen = set() 167 | for ans, qid in pred_list: 168 | if qid not in seen: 169 | seen.add(qid) 170 | deduped.append((ans, qid)) 171 | print 'New len', len(deduped) 172 | final_list=[] 173 | for ans,qid in deduped: 174 | final_list.append({u'answer': ans, u'question_id': qid}) 175 | 176 | mean_testloss = np.array(testloss_list).mean() 177 | 178 | if mode == 'val': 179 | valFile = './%s/val2015_resfile'%folder 180 | with open(valFile, 'w') as f: 181 | json.dump(final_list, f) 182 | if visualize: 183 | visualize_failures(stat_list,mode) 184 | annFile = config.DATA_PATHS['val']['ans_file'] 185 | quesFile = config.DATA_PATHS['val']['ques_file'] 186 | 
vqa = VQA(annFile, quesFile) 187 | vqaRes = vqa.loadRes(valFile, quesFile) 188 | vqaEval = VQAEval(vqa, vqaRes, n=2) 189 | vqaEval.evaluate() 190 | acc_overall = vqaEval.accuracy['overall'] 191 | acc_perQuestionType = vqaEval.accuracy['perQuestionType'] 192 | acc_perAnswerType = vqaEval.accuracy['perAnswerType'] 193 | return mean_testloss, acc_overall, acc_perQuestionType, acc_perAnswerType 194 | elif mode == 'test-dev': 195 | filename = './%s/vqa_OpenEnded_mscoco_test-dev2015_%s-'%(folder,folder)+str(it).zfill(8)+'_results' 196 | with open(filename+'.json', 'w') as f: 197 | json.dump(final_list, f) 198 | if visualize: 199 | visualize_failures(stat_list,mode) 200 | elif mode == 'test': 201 | filename = './%s/vqa_OpenEnded_mscoco_test2015_%s-'%(folder,folder)+str(it).zfill(8)+'_results' 202 | with open(filename+'.json', 'w') as f: 203 | json.dump(final_list, f) 204 | if visualize: 205 | visualize_failures(stat_list,mode) 206 | def drawgraph(results, folder,k,d,prefix='std',save_question_type_graphs=False): 207 | # 0:it 208 | # 1:trainloss 209 | # 2:testloss 210 | # 3:oa_acc 211 | # 4:qt_acc 212 | # 5:at_acc 213 | 214 | # training curve 215 | it = np.array([l[0] for l in results]) 216 | loss = np.array([l[1] for l in results]) 217 | valloss = np.array([l[2] for l in results]) 218 | valacc = np.array([l[3] for l in results]) 219 | 220 | fig = plt.figure() 221 | ax1 = fig.add_subplot(111) 222 | ax2 = ax1.twinx() 223 | 224 | ax1.plot(it,loss, color='blue', label='train loss') 225 | ax1.plot(it,valloss, '--', color='blue', label='test loss') 226 | ax2.plot(it,valacc, color='red', label='acc on val') 227 | plt.legend(loc='lower left') 228 | 229 | ax1.set_xlabel('Iterations') 230 | ax1.set_ylabel('Loss Value') 231 | ax2.set_ylabel('Accuracy on Val [%]') 232 | 233 | plt.savefig('./%s/result_it_%d_acc_%2.2f_k_%d_d_%d_%s.png'%(folder,it[-1],valacc[-1],k,d,prefix)) 234 | plt.clf() 235 | plt.close("all") 236 | 237 | # question type 238 | it = np.array([l[0] for l in results]) 239 | oa_acc = np.array([l[3] for l in results]) 240 | qt_dic_list = [l[4] for l in results] 241 | 242 | def draw_qt_acc(target_key_list, figname): 243 | fig = plt.figure() 244 | for k in target_key_list: 245 | print k,type(k) 246 | t_val = np.array([ qt_dic[k] for qt_dic in qt_dic_list]) 247 | plt.plot(it,t_val,label=str(k)) 248 | plt.legend(fontsize='small') 249 | plt.ylim(0,100.) 
250 | #plt.legend(prop={'size':6}) 251 | 252 | plt.xlabel('Iterations') 253 | plt.ylabel('Accuracy on Val [%]') 254 | 255 | plt.savefig(figname,dpi=200) 256 | plt.clf() 257 | plt.close("all") 258 | 259 | if save_question_type_graphs: 260 | s_keys = sorted(qt_dic_list[0].keys()) 261 | draw_qt_acc(s_keys[ 0:13]+[s_keys[31],], './ind_qt_are.png') 262 | draw_qt_acc(s_keys[13:17]+s_keys[49:], './ind_qt_how_where_who_why.png') 263 | draw_qt_acc(s_keys[17:31]+[s_keys[32],], './ind_qt_is.png') 264 | draw_qt_acc(s_keys[33:49], './ind_qt_what.png') 265 | draw_qt_acc(['what color is the','what color are the','what color is',\ 266 | 'what color','what is the color of the'],'./qt_color.png') 267 | draw_qt_acc(['how many','how','how many people are',\ 268 | 'how many people are in'],'./qt_number.png') 269 | draw_qt_acc(['who is','why','why is the','where is the','where are the',\ 270 | 'which'],'./qt_who_why_where_which.png') 271 | draw_qt_acc(['what is the man','is the man','are they','is he',\ 272 | 'is the woman','is this person','what is the woman','is the person',\ 273 | 'what is the person'],'./qt_human.png') 274 | 275 | 276 | -------------------------------------------------------------------------------- /mfh_baseline/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import os 4 | import sys 5 | import json 6 | import re 7 | import shutil 8 | from PIL import Image 9 | from PIL import ImageFont, ImageDraw 10 | 11 | import caffe 12 | from caffe import layers as L 13 | from caffe import params as P 14 | 15 | from vqa_data_layer import VQADataProvider, VQADataProviderLayer 16 | 17 | import config 18 | sys.path.append(config.VQA_TOOLS_PATH) 19 | sys.path.append(config.VQA_EVAL_TOOLS_PATH) 20 | 21 | from vqaTools.vqa import VQA 22 | from vqaEvaluation.vqaEval import VQAEval 23 | 24 | def visualize_failures(stat_list,mode): 25 | 26 | def save_qtype(qtype_list, save_filename, mode): 27 | 28 | if mode == 'val': 29 | savepath = os.path.join('./eval', save_filename) 30 | # TODO 31 | img_pre = '/home/dhpseth/vqa/02_tools/VQA/Images/val2014' 32 | elif mode == 'test-dev': 33 | savepath = os.path.join('./test-dev', save_filename) 34 | # TODO 35 | img_pre = '/home/dhpseth/vqa/02_tools/VQA/Images/test2015' 36 | elif mode == 'test': 37 | savepath = os.path.join('./test', save_filename) 38 | # TODO 39 | img_pre = '/home/dhpseth/vqa/02_tools/VQA/Images/test2015' 40 | else: 41 | raise Exception('Unsupported mode') 42 | if os.path.exists(savepath): shutil.rmtree(savepath) 43 | if not os.path.exists(savepath): os.makedirs(savepath) 44 | 45 | for qt in qtype_list: 46 | count = 0 47 | for t_question in stat_list: 48 | #print count, t_question 49 | if count < 40/len(qtype_list): 50 | t_question_list = t_question['q_list'] 51 | saveflag = False 52 | #print 'debug****************************' 53 | #print qt 54 | #print t_question_list 55 | #print t_question_list[0] == qt[0] 56 | #print t_question_list[1] == qt[1] 57 | if t_question_list[0] == qt[0] and t_question_list[1] == qt[1]: 58 | saveflag = True 59 | else: 60 | saveflag = False 61 | 62 | if saveflag == True: 63 | t_iid = t_question['iid'] 64 | if mode == 'val': 65 | t_img = Image.open(os.path.join(img_pre, \ 66 | 'COCO_val2014_' + str(t_iid).zfill(12) + '.jpg')) 67 | elif mode == 'test-dev' or 'test': 68 | t_img = Image.open(os.path.join(img_pre, \ 69 | 'COCO_test2015_' + str(t_iid).zfill(12) + '.jpg')) 70 | 71 | # for caption 72 | #print t_iid 73 | #annIds = 
caps.getAnnIds(t_iid) 74 | #anns = caps.loadAnns(annIds) 75 | #cap_list = [ann['caption'] for ann in anns] 76 | ans_list = t_question['ans_list'] 77 | draw = ImageDraw.Draw(t_img) 78 | for i in range(len(ans_list)): 79 | try: 80 | draw.text((10,10*i), str(ans_list[i])) 81 | except: 82 | pass 83 | 84 | ans = t_question['answer'] 85 | pred = t_question['pred'] 86 | if ans == -1: 87 | pre = '' 88 | elif ans == pred: 89 | pre = 'correct ' 90 | else: 91 | pre = 'failure ' 92 | #print ' aaa ', ans, pred 93 | ans = re.sub( '/', ' ', str(ans)) 94 | pred = re.sub( '/', ' ', str(pred)) 95 | img_title = pre + str(' '.join(t_question_list)) + '. a_' + \ 96 | str(ans) + ' p_' + str(pred) + '.png' 97 | count += 1 98 | print os.path.join(savepath,img_title) 99 | t_img.save(os.path.join(savepath,img_title)) 100 | 101 | print 'saving whatis' 102 | qt_color_list = [['what','color']] 103 | save_qtype(qt_color_list, 'colors', mode) 104 | 105 | print 'saving whatis' 106 | qt_whatis_list = [['what','is'],['what','kind'],['what','are']] 107 | save_qtype(qt_whatis_list, 'whatis', mode) 108 | 109 | print 'saving is' 110 | qt_is_list = [['is','the'], ['is','this'],['is','there']] 111 | save_qtype(qt_is_list, 'is', mode) 112 | 113 | print 'saving how many' 114 | qt_howmany_list =[['how','many']] 115 | save_qtype(qt_howmany_list, 'howmany', mode) 116 | 117 | def exec_validation(device_id, mode, model_name, folder, it='', visualize=False): 118 | 119 | caffe.set_device(device_id) 120 | caffe.set_mode_gpu() 121 | net = caffe.Net('./%s/proto_test.prototxt'%folder,model_name,caffe.TEST) 122 | 123 | dp = VQADataProvider(mode=mode,batchsize=config.VAL_BATCH_SIZE,folder=folder) 124 | total_questions = len(dp.getQuesIds()) 125 | epoch = 0 126 | 127 | pred_list = [] 128 | testloss_list = [] 129 | stat_list = [] 130 | 131 | while epoch == 0: 132 | t_word, t_cont, t_img_feature, t_answer, t_qid_list, t_iid_list, epoch = dp.get_batch_vec() 133 | net.blobs['data'].data[...] = np.transpose(t_word,(1,0)) 134 | net.blobs['cont'].data[...] = np.transpose(t_cont,(1,0)) 135 | net.blobs['img_feature'].data[...] = t_img_feature 136 | net.blobs['label'].data[...] 
= t_answer 137 | net.forward() 138 | t_pred_list = net.blobs['prediction'].data.argmax(axis=1) 139 | t_pred_str = [dp.vec_to_answer(pred_symbol) for pred_symbol in t_pred_list] 140 | testloss_list.append(net.blobs['loss'].data) 141 | for qid, iid, ans, pred in zip(t_qid_list, t_iid_list, t_answer.tolist(), t_pred_str): 142 | #pred_list.append({u'answer':pred, u'question_id': int(dp.getStrippedQuesId(qid))}) 143 | pred_list.append((pred,int(dp.getStrippedQuesId(qid)))) 144 | if visualize: 145 | q_list = dp.seq_to_list(dp.getQuesStr(qid)) 146 | if mode == 'test-dev' or 'test': 147 | ans_str = '' 148 | ans_list = ['']*10 149 | else: 150 | ans_str = dp.vec_to_answer(ans) 151 | ans_list = [ dp.getAnsObj(qid)[i]['answer'] for i in xrange(10)] 152 | stat_list.append({\ 153 | 'qid' : qid, 154 | 'q_list' : q_list, 155 | 'iid' : iid, 156 | 'answer': ans_str, 157 | 'ans_list': ans_list, 158 | 'pred' : pred }) 159 | percent = 100 * float(len(pred_list)) / total_questions 160 | sys.stdout.write('\r' + ('%.2f' % percent) + '%') 161 | sys.stdout.flush() 162 | 163 | 164 | print 'Deduping arr of len', len(pred_list) 165 | deduped = [] 166 | seen = set() 167 | for ans, qid in pred_list: 168 | if qid not in seen: 169 | seen.add(qid) 170 | deduped.append((ans, qid)) 171 | print 'New len', len(deduped) 172 | final_list=[] 173 | for ans,qid in deduped: 174 | final_list.append({u'answer': ans, u'question_id': qid}) 175 | 176 | mean_testloss = np.array(testloss_list).mean() 177 | 178 | if mode == 'val': 179 | valFile = './%s/val2015_resfile'%folder 180 | with open(valFile, 'w') as f: 181 | json.dump(final_list, f) 182 | if visualize: 183 | visualize_failures(stat_list,mode) 184 | annFile = config.DATA_PATHS['val']['ans_file'] 185 | quesFile = config.DATA_PATHS['val']['ques_file'] 186 | vqa = VQA(annFile, quesFile) 187 | vqaRes = vqa.loadRes(valFile, quesFile) 188 | vqaEval = VQAEval(vqa, vqaRes, n=2) 189 | vqaEval.evaluate() 190 | acc_overall = vqaEval.accuracy['overall'] 191 | acc_perQuestionType = vqaEval.accuracy['perQuestionType'] 192 | acc_perAnswerType = vqaEval.accuracy['perAnswerType'] 193 | return mean_testloss, acc_overall, acc_perQuestionType, acc_perAnswerType 194 | elif mode == 'test-dev': 195 | filename = './%s/vqa_OpenEnded_mscoco_test-dev2015_%s-'%(folder,folder)+str(it).zfill(8)+'_results' 196 | with open(filename+'.json', 'w') as f: 197 | json.dump(final_list, f) 198 | if visualize: 199 | visualize_failures(stat_list,mode) 200 | elif mode == 'test': 201 | filename = './%s/vqa_OpenEnded_mscoco_test2015_%s-'%(folder,folder)+str(it).zfill(8)+'_results' 202 | with open(filename+'.json', 'w') as f: 203 | json.dump(final_list, f) 204 | if visualize: 205 | visualize_failures(stat_list,mode) 206 | def drawgraph(results, folder,k,d,prefix='std',save_question_type_graphs=False): 207 | # 0:it 208 | # 1:trainloss 209 | # 2:testloss 210 | # 3:oa_acc 211 | # 4:qt_acc 212 | # 5:at_acc 213 | 214 | # training curve 215 | it = np.array([l[0] for l in results]) 216 | loss = np.array([l[1] for l in results]) 217 | valloss = np.array([l[2] for l in results]) 218 | valacc = np.array([l[3] for l in results]) 219 | 220 | fig = plt.figure() 221 | ax1 = fig.add_subplot(111) 222 | ax2 = ax1.twinx() 223 | 224 | ax1.plot(it,loss, color='blue', label='train loss') 225 | ax1.plot(it,valloss, '--', color='blue', label='test loss') 226 | ax2.plot(it,valacc, color='red', label='acc on val') 227 | plt.legend(loc='lower left') 228 | 229 | ax1.set_xlabel('Iterations') 230 | ax1.set_ylabel('Loss Value') 231 | 
ax2.set_ylabel('Accuracy on Val [%]') 232 | 233 | plt.savefig('./%s/result_it_%d_acc_%2.2f_k_%d_d_%d_%s.png'%(folder,it[-1],valacc[-1],k,d,prefix)) 234 | plt.clf() 235 | plt.close("all") 236 | 237 | # question type 238 | it = np.array([l[0] for l in results]) 239 | oa_acc = np.array([l[3] for l in results]) 240 | qt_dic_list = [l[4] for l in results] 241 | 242 | def draw_qt_acc(target_key_list, figname): 243 | fig = plt.figure() 244 | for k in target_key_list: 245 | print k,type(k) 246 | t_val = np.array([ qt_dic[k] for qt_dic in qt_dic_list]) 247 | plt.plot(it,t_val,label=str(k)) 248 | plt.legend(fontsize='small') 249 | plt.ylim(0,100.) 250 | #plt.legend(prop={'size':6}) 251 | 252 | plt.xlabel('Iterations') 253 | plt.ylabel('Accuracy on Val [%]') 254 | 255 | plt.savefig(figname,dpi=200) 256 | plt.clf() 257 | plt.close("all") 258 | 259 | if save_question_type_graphs: 260 | s_keys = sorted(qt_dic_list[0].keys()) 261 | draw_qt_acc(s_keys[ 0:13]+[s_keys[31],], './ind_qt_are.png') 262 | draw_qt_acc(s_keys[13:17]+s_keys[49:], './ind_qt_how_where_who_why.png') 263 | draw_qt_acc(s_keys[17:31]+[s_keys[32],], './ind_qt_is.png') 264 | draw_qt_acc(s_keys[33:49], './ind_qt_what.png') 265 | draw_qt_acc(['what color is the','what color are the','what color is',\ 266 | 'what color','what is the color of the'],'./qt_color.png') 267 | draw_qt_acc(['how many','how','how many people are',\ 268 | 'how many people are in'],'./qt_number.png') 269 | draw_qt_acc(['who is','why','why is the','where is the','where are the',\ 270 | 'which'],'./qt_who_why_where_which.png') 271 | draw_qt_acc(['what is the man','is the man','are they','is he',\ 272 | 'is the woman','is this person','what is the woman','is the person',\ 273 | 'what is the person'],'./qt_human.png') 274 | 275 | 276 | -------------------------------------------------------------------------------- /mfh_baseline/train_mfh_baseline.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use('Agg') 3 | import os 4 | import sys 5 | import numpy as np 6 | import json 7 | import matplotlib.pyplot as plt 8 | 9 | import caffe 10 | from caffe import layers as L 11 | from caffe import params as P 12 | from caffe.proto import caffe_pb2 13 | 14 | from vqa_data_layer_kld import VQADataProvider 15 | from utils import exec_validation, drawgraph 16 | import config 17 | import time 18 | 19 | def get_solver(folder): 20 | s = caffe_pb2.SolverParameter() 21 | s.train_net = './%s/proto_train.prototxt'%folder 22 | s.snapshot = 10000 23 | s.snapshot_prefix = './%s/'%folder 24 | s.max_iter = int(config.MAX_ITERATIONS) 25 | s.display = int(config.VALIDATE_INTERVAL) 26 | s.type = 'Adam' 27 | s.stepsize = int(config.MAX_ITERATIONS*0.2) 28 | s.gamma = 0.5 29 | s.lr_policy = "step" 30 | s.base_lr = 0.0007 31 | s.momentum = 0.9 32 | s.momentum2 = 0.999 33 | s.weight_decay = 0.000 34 | s.clip_gradients = 10 35 | return s 36 | 37 | def get_auxiliary_json(): 38 | aux = {} 39 | aux["batch_size"] = int(config.VAL_BATCH_SIZE) 40 | aux["data_shape"] = [2048] 41 | aux["img_feature_prefix"] = config.DATA_PATHS['test']['features_prefix'] 42 | aux["glove"] = False 43 | return aux 44 | 45 | 46 | def mfh_baseline(mode, batchsize, T, question_vocab_size, folder): 47 | n = caffe.NetSpec() 48 | mode_str = json.dumps({'mode':mode, 'batchsize':batchsize,'folder':folder}) 49 | if mode == 'val': 50 | n.data, n.cont, n.img_feature, n.label = L.Python( \ 51 | module='vqa_data_layer', layer='VQADataProviderLayer', \ 52 | param_str=mode_str, 
ntop=4 )
53 |     else:
54 |         n.data, n.cont, n.img_feature, n.label = L.Python(\
55 |             module='vqa_data_layer_kld', layer='VQADataProviderLayer', \
56 |             param_str=mode_str, ntop=4 )
57 |     n.embed = L.Embed(n.data, input_dim=question_vocab_size, num_output=300, \
58 |         weight_filler=dict(type='xavier'))
59 |     n.embed_tanh = L.TanH(n.embed)
60 | 
61 |     # LSTM
62 |     n.lstm1 = L.LSTM(\
63 |         n.embed_tanh, n.cont,\
64 |         recurrent_param=dict(\
65 |             num_output=config.LSTM_UNIT_NUM,\
66 |             weight_filler=dict(type='xavier')))
67 |     tops1 = L.Slice(n.lstm1, ntop=config.MAX_WORDS_IN_QUESTION, slice_param={'axis':0})
68 |     for i in xrange(config.MAX_WORDS_IN_QUESTION-1):
69 |         n.__setattr__('slice_first'+str(i), tops1[int(i)])
70 |         n.__setattr__('silence_data_first'+str(i), L.Silence(tops1[int(i)],ntop=0))
71 |     n.lstm1_out = tops1[config.MAX_WORDS_IN_QUESTION-1]
72 |     n.lstm1_reshaped = L.Reshape(n.lstm1_out,\
73 |         reshape_param=dict(\
74 |             shape=dict(dim=[-1,1024])))
75 |     n.q_feat = L.Dropout(n.lstm1_reshaped,dropout_param={'dropout_ratio':config.LSTM_DROPOUT_RATIO})
76 | 
77 |     '''
78 |     Coarse Image-Question MFH fusion
79 |     '''
80 | 
81 |     n.mfb_q_o2_proj = L.InnerProduct(n.q_feat, num_output=config.JOINT_EMB_SIZE,
82 |         weight_filler=dict(type='xavier'))
83 |     n.mfb_i_o2_proj = L.InnerProduct(n.img_feature, num_output=config.JOINT_EMB_SIZE,
84 |         weight_filler=dict(type='xavier'))
85 |     n.mfb_iq_o2_eltwise = L.Eltwise(n.mfb_q_o2_proj, n.mfb_i_o2_proj, eltwise_param=dict(operation=0))
86 |     n.mfb_iq_o2_drop = L.Dropout(n.mfb_iq_o2_eltwise, dropout_param={'dropout_ratio':config.MFB_DROPOUT_RATIO})
87 |     n.mfb_iq_o2_resh = L.Reshape(n.mfb_iq_o2_drop, reshape_param=dict(shape=dict(dim=[-1,1,config.MFB_OUT_DIM,config.MFB_FACTOR_NUM])))
88 |     n.mfb_iq_o2_sumpool = L.Pooling(n.mfb_iq_o2_resh, pool=P.Pooling.SUM, \
89 |         pooling_param=dict(kernel_w=config.MFB_FACTOR_NUM, kernel_h=1))
90 |     n.mfb_o2_out = L.Reshape(n.mfb_iq_o2_sumpool,\
91 |         reshape_param=dict(shape=dict(dim=[-1,config.MFB_OUT_DIM])))
92 |     n.mfb_o2_sign_sqrt = L.SignedSqrt(n.mfb_o2_out)
93 |     n.mfb_o2_l2 = L.L2Normalize(n.mfb_o2_sign_sqrt)
94 | 
95 |     n.mfb_q_o3_proj = L.InnerProduct(n.q_feat, num_output=config.JOINT_EMB_SIZE,
96 |         weight_filler=dict(type='xavier'))
97 |     n.mfb_i_o3_proj = L.InnerProduct(n.img_feature, num_output=config.JOINT_EMB_SIZE,
98 |         weight_filler=dict(type='xavier'))
99 |     n.mfb_iq_o3_eltwise = L.Eltwise(n.mfb_q_o3_proj, n.mfb_i_o3_proj,n.mfb_iq_o2_drop, eltwise_param=dict(operation=0))
100 |     n.mfb_iq_o3_drop = L.Dropout(n.mfb_iq_o3_eltwise, dropout_param={'dropout_ratio':config.MFB_DROPOUT_RATIO})
101 |     n.mfb_iq_o3_resh = L.Reshape(n.mfb_iq_o3_drop, reshape_param=dict(shape=dict(dim=[-1,1,config.MFB_OUT_DIM,config.MFB_FACTOR_NUM])))
102 |     n.mfb_iq_o3_sumpool = L.Pooling(n.mfb_iq_o3_resh, pool=P.Pooling.SUM, \
103 |         pooling_param=dict(kernel_w=config.MFB_FACTOR_NUM, kernel_h=1))
104 |     n.mfb_o3_out = L.Reshape(n.mfb_iq_o3_sumpool,\
105 |         reshape_param=dict(shape=dict(dim=[-1,config.MFB_OUT_DIM])))
106 |     n.mfb_o3_sign_sqrt = L.SignedSqrt(n.mfb_o3_out)
107 |     n.mfb_o3_l2 = L.L2Normalize(n.mfb_o3_sign_sqrt)
108 | 
109 |     n.mfb_o23_l2 = L.Concat(n.mfb_o2_l2,n.mfb_o3_l2)
110 | 
111 |     n.prediction = L.InnerProduct(n.mfb_o23_l2, num_output=config.NUM_OUTPUT_UNITS,
112 |         weight_filler=dict(type='xavier'))
113 | 
114 |     if mode == 'val':
115 |         n.loss = L.SoftmaxWithLoss(n.prediction, n.label)
116 |     else:
117 |         n.loss = L.SoftmaxKLDLoss(n.prediction, n.label)
118 |     return n.to_proto()
119 | 
120 | def make_answer_vocab(adic, vocab_size):
121 |     """
122 |     Returns a dictionary
that maps words to indices. 123 | """ 124 | adict = {'':0} 125 | nadict = {'':1000000} 126 | vid = 1 127 | for qid in adic.keys(): 128 | answer_obj = adic[qid] 129 | answer_list = [ans['answer'] for ans in answer_obj] 130 | 131 | for q_ans in answer_list: 132 | # create dict 133 | if adict.has_key(q_ans): 134 | nadict[q_ans] += 1 135 | else: 136 | nadict[q_ans] = 1 137 | adict[q_ans] = vid 138 | vid +=1 139 | 140 | # debug 141 | nalist = [] 142 | for k,v in sorted(nadict.items(), key=lambda x:x[1]): 143 | nalist.append((k,v)) 144 | 145 | # remove words that appear less than once 146 | n_del_ans = 0 147 | n_valid_ans = 0 148 | adict_nid = {} 149 | for i, w in enumerate(nalist[:-vocab_size]): 150 | del adict[w[0]] 151 | n_del_ans += w[1] 152 | for i, w in enumerate(nalist[-vocab_size:]): 153 | n_valid_ans += w[1] 154 | adict_nid[w[0]] = i 155 | 156 | return adict_nid 157 | 158 | def make_question_vocab(qdic): 159 | """ 160 | Returns a dictionary that maps words to indices. 161 | """ 162 | vdict = {'':0} 163 | vid = 1 164 | for qid in qdic.keys(): 165 | # sequence to list 166 | q_str = qdic[qid]['qstr'] 167 | q_list = VQADataProvider.seq_to_list(q_str) 168 | 169 | # create dict 170 | for w in q_list: 171 | if not vdict.has_key(w): 172 | vdict[w] = vid 173 | vid +=1 174 | 175 | return vdict 176 | 177 | def make_vocab_files(): 178 | """ 179 | Produce the question and answer vocabulary files. 180 | """ 181 | print 'making question vocab...', config.QUESTION_VOCAB_SPACE 182 | qdic, _ = VQADataProvider.load_data(config.QUESTION_VOCAB_SPACE) 183 | question_vocab = make_question_vocab(qdic) 184 | print 'making answer vocab...', config.ANSWER_VOCAB_SPACE 185 | _, adic = VQADataProvider.load_data(config.ANSWER_VOCAB_SPACE) 186 | answer_vocab = make_answer_vocab(adic, config.NUM_OUTPUT_UNITS) 187 | return question_vocab, answer_vocab 188 | 189 | def main(): 190 | folder = 'mfh_baseline_%s'%(config.TRAIN_DATA_SPLITS) 191 | if not os.path.exists('./%s'%folder): 192 | os.makedirs('./%s'%folder) 193 | 194 | question_vocab, answer_vocab = {}, {} 195 | if os.path.exists('./%s/vdict.json'%folder) and os.path.exists('./%s/adict.json'%folder): 196 | print 'restoring vocab' 197 | with open('./%s/vdict.json'%folder,'r') as f: 198 | question_vocab = json.load(f) 199 | with open('./%s/adict.json'%folder,'r') as f: 200 | answer_vocab = json.load(f) 201 | else: 202 | question_vocab, answer_vocab = make_vocab_files() 203 | with open('./%s/vdict.json'%folder,'w') as f: 204 | json.dump(question_vocab, f) 205 | with open('./%s/adict.json'%folder,'w') as f: 206 | json.dump(answer_vocab, f) 207 | 208 | print 'question vocab size:', len(question_vocab) 209 | print 'answer vocab size:', len(answer_vocab) 210 | 211 | with open('./%s/proto_train.prototxt'%folder, 'w') as f: 212 | f.write(str(mfh_baseline(config.TRAIN_DATA_SPLITS, config.BATCH_SIZE, \ 213 | config.MAX_WORDS_IN_QUESTION, len(question_vocab), folder))) 214 | 215 | with open('./%s/proto_test.prototxt'%folder, 'w') as f: 216 | f.write(str(mfh_baseline('val', config.VAL_BATCH_SIZE, \ 217 | config.MAX_WORDS_IN_QUESTION, len(question_vocab), folder))) 218 | 219 | with open('./%s/solver.prototxt'%folder, 'w') as f: 220 | f.write(str(get_solver(folder))) 221 | with open('./%s/auxiliary.json'%folder, 'w') as f: 222 | json.dump(get_auxiliary_json(),f, indent=2) 223 | 224 | caffe.set_device(config.TRAIN_GPU_ID) 225 | caffe.set_mode_gpu() 226 | solver = caffe.get_solver('./%s/solver.prototxt'%folder) 227 | 228 | train_loss = np.zeros(config.MAX_ITERATIONS+1) 229 | results = 
[] 230 | 231 | if config.RESTORE_ITER: 232 | restore_iter = config.RESTORE_ITER 233 | solver.restore('./%s/_iter_%d.solverstate'%(folder,restore_iter)) 234 | else: 235 | restore_iter = 0 236 | 237 | start = time.clock() 238 | for it in range(restore_iter, config.MAX_ITERATIONS+1): 239 | solver.step(1) 240 | 241 | # store the train loss 242 | train_loss[it] = solver.net.blobs['loss'].data 243 | 244 | if it % config.PRINT_INTERVAL == 0 and it != 0: 245 | elapsed = (time.clock() - start) 246 | print 'Iteration:', it 247 | c_mean_loss = train_loss[it-config.PRINT_INTERVAL:it].mean() 248 | print 'Train loss:', c_mean_loss, ' Elapsed seconds:', elapsed 249 | start = time.clock() 250 | if it % config.VALIDATE_INTERVAL == 0 and it != restore_iter: 251 | model_name = './%s/tmp.caffemodel'%(folder) 252 | solver.net.save(model_name) 253 | print 'Validating...' 254 | ''' 255 | # for test-dev /test set. the json file will be generated under the file 256 | exec_validation(config.TEST_GPU_ID, 'test-dev', model_name, it=it, folder=folder) 257 | caffe.set_device(config.TRAIN_GPU_ID) 258 | ''' 259 | #for val set. the accuracy will be computed and ploted 260 | test_loss, acc_overall, acc_per_ques, acc_per_ans = exec_validation(config.TEST_GPU_ID, 'val', model_name, it=it, folder=folder) 261 | caffe.set_device(config.TRAIN_GPU_ID) 262 | print 'Test loss:', test_loss 263 | print 'Accuracy:', acc_overall 264 | print 'Test per ans', acc_per_ans 265 | results.append([it, c_mean_loss, test_loss, acc_overall, acc_per_ques, acc_per_ans]) 266 | best_result_idx = np.array([x[3] for x in results]).argmax() 267 | print 'Best accuracy of', results[best_result_idx][3], 'was at iteration', results[best_result_idx][0] 268 | drawgraph(results,folder,config.MFB_FACTOR_NUM,config.MFB_OUT_DIM,prefix='mfh_baseline') 269 | 270 | if __name__ == '__main__': 271 | main() 272 | -------------------------------------------------------------------------------- /eval/ensemble.py: -------------------------------------------------------------------------------- 1 | """ 2 | Generates predictions on test-dev or test using an ensemble of nets. The 3 | ensemble is produced using the average of the pre-softmax output from each net. 4 | 5 | Place each model in its own folder. The folder must contain: 6 | 7 | - The .caffemodel file 8 | - proto_test.prototxt 9 | - adict.json 10 | - vdict.json 11 | - auxiliary.json 12 | 13 | auxiliary.json should contain the following keys: 14 | 15 | - batch_size (value should be integer) 16 | - data_shape (value should be array of integer) 17 | - img_feature_prefix (value should be string) 18 | - spatial_coord (value should be boolean) 19 | - glove (value should be boolean) 20 | 21 | If the folder also contains "preds.pkl", evaluation is skipped for that network. 22 | 23 | """ 24 | 25 | import caffe 26 | import numpy as np 27 | import cPickle 28 | import argparse, os, glob 29 | import sys 30 | import json 31 | from collections import defaultdict 32 | import vqa_data_layer 33 | from vqa_data_layer import LoadVQADataProvider 34 | 35 | def verify_all(folder_paths): 36 | """ 37 | Calls verify_one on each folder path. Also checks to make sure all the 38 | answer vocabularies are the same. 
39 | """ 40 | adict_paths = [] 41 | for folder_path in folder_paths: 42 | paths = verify_one(folder_path) 43 | adict_paths.append(paths[2]) 44 | adicts = [] 45 | for path in adict_paths: 46 | with open(path, 'r') as f: 47 | adict = json.load(f) 48 | adicts.append(adict) 49 | if len(adicts) > 1: 50 | for a2 in adicts[1:]: 51 | if set(adicts[0].keys()) != set(a2.keys()): 52 | print set(adicts[0].keys()) - set(a2.keys()) 53 | print set(a2.keys()) - set(adicts[0].keys()) 54 | raise Exception('Answer vocab mismatch') 55 | return adicts 56 | 57 | def verify_one(folder_path): 58 | """ 59 | Makes sure all the required files exist in the folder. If so, returns the 60 | paths to all the files. 61 | """ 62 | model_path = glob.glob(folder_path + '/tmp*.caffemodel') 63 | print model_path 64 | assert len(model_path) == 1, 'one .caffemodel per folder, please' 65 | model_path = model_path[0] 66 | proto_path = folder_path + '/proto_test.prototxt' 67 | adict_path = folder_path + '/adict.json' 68 | vdict_path = folder_path + '/vdict.json' 69 | aux_path = folder_path + '/auxiliary.json' 70 | assert os.path.exists(proto_path), 'proto_test.prototxt missing' 71 | assert os.path.exists(adict_path), 'adict.json missing' 72 | assert os.path.exists(vdict_path), 'vdict.json missing' 73 | assert os.path.exists(aux_path), 'auxiliary.json missing' 74 | with open(aux_path, 'r') as f: 75 | aux = json.load(f) 76 | batch_size = int(aux['batch_size']) 77 | data_shape = tuple(map(int, aux['data_shape'])) 78 | img_feature_prefix = aux['img_feature_prefix'] 79 | spatial_coord = aux['spatial_coord'] if 'spatial_coord' in aux else False 80 | glove = aux['glove'] if 'glove' in aux else False 81 | model_weight = float(aux['model_weight']) if 'model_weight' in aux else 1.0 82 | #print 'weight: ', model_weight 83 | return model_path, proto_path, adict_path, vdict_path, batch_size, data_shape, img_feature_prefix, spatial_coord, glove, model_weight 84 | 85 | def get_pkl_fname(ques_file): 86 | if '_val2014_' in ques_file: 87 | return '/preds_val.pkl' 88 | elif '_test-dev2015_' in ques_file: 89 | return '/preds_test_dev.pkl' 90 | elif '_test2015_' in ques_file: 91 | return '/preds_test.pkl' 92 | else: 93 | raise NotImplementedError 94 | 95 | def eval_one(folder_path, gpuid, ques_file): 96 | """ 97 | Evaluates a single model (in folder_path) on the questions in ques_file. 98 | Returns an array of (QID, answer vector) tuples. 99 | """ 100 | 101 | model_path, proto_path, adict_path, vdict_path, batch_size, data_shape, \ 102 | img_feature_prefix, spatial_coord, glove, model_weight = verify_one(folder_path) 103 | 104 | dp = LoadVQADataProvider(ques_file, img_feature_prefix, vdict_path, \ 105 | adict_path, mode='test', batchsize=batch_size, data_shape=data_shape) 106 | total_questions = len(dp.getQuesIds()) 107 | print total_questions, 'total questions' 108 | 109 | if os.path.exists(folder_path + get_pkl_fname(ques_file)): 110 | print 'Found existing prediction file, trying to load...' 111 | with open(folder_path + get_pkl_fname(ques_file), 'r') as f: 112 | preds = cPickle.load(f) 113 | if len(preds) >= total_questions: 114 | print 'Loaded.' 115 | return preds 116 | else: 117 | print 'Number of saved answers does not match number of questions, continuing...' 
118 | 119 | caffe.set_device(gpuid) 120 | caffe.set_mode_gpu() 121 | 122 | vqa_data_layer.CURRENT_DATA_SHAPE = data_shape # This is a huge hack 123 | vqa_data_layer.SPATIAL_COORD = spatial_coord 124 | vqa_data_layer.GLOVE = glove 125 | 126 | net = caffe.Net(proto_path, model_path, caffe.TEST) 127 | 128 | print 'Model loaded:', model_path 129 | print 'Image feature prefix:', img_feature_prefix 130 | sys.stdout.flush() 131 | 132 | 133 | pred_layers = [] 134 | 135 | epoch = 0 136 | while epoch == 0: 137 | t_word, t_cont, t_img_feature, t_answer, t_glove_matrix, t_qid_list, _, epoch = dp.get_batch_vec() 138 | net.blobs['data'].data[...] = np.transpose(t_word,(1,0)) 139 | net.blobs['cont'].data[...] = np.transpose(t_cont,(1,0)) 140 | net.blobs['img_feature'].data[...] = t_img_feature 141 | net.blobs['label'].data[...] = t_answer # dummy 142 | if glove: 143 | net.blobs['glove'].data[...] = np.transpose(t_glove_matrix, (1,0,2)) 144 | net.forward() 145 | ans_matrix = net.blobs['prediction'].data 146 | 147 | for i in range(len(t_qid_list)): 148 | qid = t_qid_list[i] 149 | pred_layers.append((qid, np.copy(model_weight * ans_matrix[i]))) # model_weight * answer_matrix 150 | 151 | percent = 100 * float(len(pred_layers)) / total_questions 152 | sys.stdout.write('\r' + ('%.2f' % percent) + '%') 153 | sys.stdout.flush() 154 | 155 | #print 'Saving predictions...' 156 | #with open(folder_path + get_pkl_fname(ques_file), 'w') as f: 157 | # cPickle.dump(pred_layers, f, protocol=-1) 158 | #print 'Saved.' 159 | return pred_layers 160 | 161 | def make_rev_adict(adict): 162 | """ 163 | An adict maps text answers to neuron indices. A reverse adict maps neuron 164 | indices to text answers. 165 | """ 166 | rev_adict = {} 167 | for k,v in adict.items(): 168 | rev_adict[v] = k 169 | return rev_adict 170 | 171 | def softmax(arr): 172 | e = np.exp(arr) 173 | dist = e / np.sum(e) 174 | return dist 175 | 176 | def get_qid_valid_answer_dict(ques_file, adict): 177 | """ 178 | Returns a dictionary mapping question IDs to valid neuron indices. 179 | """ 180 | print 'Multiple choice mode: making valid answer dictionary...' 
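The softmax above exponentiates the averaged activations directly, which can overflow in np.exp when the magnitudes are large. A numerically stable variant (a sketch, not wired into this file) subtracts the maximum first:

import numpy as np

def softmax_stable(arr):
    # exp(a - max(a)) is proportional to exp(a), so the distribution is
    # unchanged, but the largest exponent is 0 and np.exp cannot overflow.
    e = np.exp(arr - np.max(arr))
    return e / np.sum(e)
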
181 |     valid_answer_dict = {}
182 |     with open(ques_file, 'r') as f:
183 |         qdata = json.load(f)
184 |     for q in qdata['questions']:
185 |         valid_answer_dict[q['question_id']] = q['multiple_choices']
186 |     for qid in valid_answer_dict:
187 |         answers = valid_answer_dict[qid]
188 |         valid_indices = []
189 |         for answer in answers:
190 |             if answer in adict:
191 |                 valid_indices.append(adict[answer])
192 |         if len(valid_indices) == 0:
193 |             print "we won't be able to answer qid", qid
194 |         valid_answer_dict[qid] = valid_indices
195 |     return valid_answer_dict
196 | 
197 | def dedupe(arr):
198 |     print 'Deduping arr of len', len(arr)
199 |     deduped = []
200 |     seen = set()
201 |     for qid, pred in arr:
202 |         if qid not in seen:
203 |             seen.add(qid)
204 |             deduped.append((qid, pred))
205 |     print 'New len', len(deduped)
206 |     return deduped
207 | 
208 | def reorder_one(predictions, this_adict, canonical_adict):
209 |     index_map = {}
210 |     for idx, word in make_rev_adict(this_adict).iteritems():
211 |         index_map[int(idx)] = int(canonical_adict[word])
212 |     index_array = np.zeros(len(index_map), dtype=int)
213 |     for src_idx, dest_idx in index_map.iteritems():
214 |         index_array[dest_idx] = src_idx  # inverse permutation: canonical slot -> this-model slot
215 |     reordered = []
216 |     for qid, output in predictions:
217 |         reordered.append((qid, np.copy(output[index_array])))  # gather puts each score in its canonical slot
218 |     return reordered
219 | 
220 | def reorder_predictions(predictions, adicts):
221 |     """
222 |     Reorders prediction matrices so that the unit order matches that of the
223 |     first answer dictionary.
224 |     """
225 |     if len(adicts) == 1:
226 |         return predictions
227 |     need_to_reorder = False
228 |     for a2 in adicts[1:]:
229 |         if adicts[0] != a2:
230 |             need_to_reorder = True
231 |     print 'Reordering...' if need_to_reorder else 'No need to reorder!'
232 |     if not need_to_reorder:
233 |         return predictions
234 |     reordered = [predictions[0]]  # the first model already uses the canonical order
235 |     for i in range(1, len(adicts)):
236 |         if adicts[0] != adicts[i]:
237 |             reordered.append(reorder_one(predictions[i], adicts[i], adicts[0]))
238 |         else:
239 |             reordered.append(predictions[i])
240 |     return reordered
241 | 
242 | def average_outputs(arr_of_arr, rev_adict, qid_valid_answer_dict):
243 |     """
244 |     Given a list of lists, where each list contains (QID, answer vector) tuples,
245 |     returns a single dictionary which maps a question ID to the text answer.
246 |     """
247 |     print 'Averaging outputs...'
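A standalone toy check of the inverse-permutation gather in reorder_one above (the three-word vocabularies and scores are hypothetical):

import numpy as np

this_adict      = {'a': 0, 'b': 1, 'c': 2}   # this model's answer order
canonical_adict = {'b': 0, 'c': 1, 'a': 2}   # the first model's answer order
output = np.array([0.5, 0.3, 0.2])           # scores for a, b, c in this-model order

index_array = np.zeros(3, dtype=int)
for word, src_idx in this_adict.items():
    index_array[canonical_adict[word]] = src_idx   # canonical slot -> this-model slot

reordered = output[index_array]                    # gather into canonical order
assert reordered.tolist() == [0.3, 0.2, 0.5]       # scores for b, c, a
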
248 | merged = defaultdict(list) 249 | for arr in arr_of_arr: 250 | for qid, ans_vec in arr: 251 | merged[qid].append(ans_vec) 252 | 253 | merged = {qid: softmax(np.vstack(ans_vecs).mean(axis=0)) for qid, ans_vecs in merged.iteritems()} 254 | mask_len = len(merged.values()[0]) 255 | 256 | # Multiple choice filtering 257 | if qid_valid_answer_dict is not None: 258 | for qid in merged: 259 | valid_indices = qid_valid_answer_dict[qid] 260 | mask = np.zeros(mask_len) 261 | for idx in valid_indices: 262 | mask[idx] = 1 263 | merged[qid] *= mask 264 | 265 | merged = {qid: rev_adict[ans_vec.argmax()] for qid, ans_vec in merged.iteritems()} 266 | 267 | return merged 268 | 269 | def save_json(qid_ans_dict, fname): 270 | tmp = [] 271 | for qid, ans in qid_ans_dict.iteritems(): 272 | tmp.append({u'answer': ans, u'question_id': qid}) 273 | with open(fname, 'w') as f: 274 | json.dump(tmp, f) 275 | print 'Saved to', fname 276 | 277 | def main(): 278 | parser = argparse.ArgumentParser() 279 | parser.add_argument('--ques_file', required=True) 280 | parser.add_argument('--gpu', type=int, required=True) 281 | parser.add_argument('--out_file', required=True) 282 | parser.add_argument('folders', nargs='*', 283 | help='space-separated list of folders containing models') 284 | args = parser.parse_args() 285 | assert len(args.folders) > 0, 'please specify at least one folder' 286 | print 'Folders', args.folders 287 | 288 | adicts = verify_all(args.folders) 289 | print '-----------------------------------------------' 290 | qid_valid_answer_dict = None 291 | if 'MultipleChoice' in args.ques_file: 292 | qid_valid_answer_dict = get_qid_valid_answer_dict(args.ques_file, adicts[0]) 293 | 294 | arr_of_arr = [eval_one(folder_path, args.gpu, args.ques_file) for folder_path in args.folders] 295 | arr_of_arr = [dedupe(x) for x in arr_of_arr] 296 | #np.save('%s.predict_arr.npz'%args.out_file,x = arr_of_arr) 297 | reordered = reorder_predictions(arr_of_arr, adicts) 298 | qid_ans_dict = average_outputs(reordered, make_rev_adict(adicts[0]), qid_valid_answer_dict) 299 | save_json(qid_ans_dict, args.out_file) 300 | 301 | if __name__ == '__main__': 302 | main() 303 | -------------------------------------------------------------------------------- /mfb_coatt_glove/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import os 4 | import sys 5 | import json 6 | import re 7 | import shutil 8 | from PIL import Image 9 | from PIL import ImageFont, ImageDraw 10 | 11 | import caffe 12 | from caffe import layers as L 13 | from caffe import params as P 14 | 15 | from vqa_data_layer import VQADataProvider, VQADataProviderLayer 16 | 17 | import config 18 | sys.path.append(config.VQA_TOOLS_PATH) 19 | sys.path.append(config.VQA_EVAL_TOOLS_PATH) 20 | 21 | from vqaTools.vqa import VQA 22 | from vqaEvaluation.vqaEval import VQAEval 23 | 24 | def visualize_failures(stat_list,mode): 25 | 26 | def save_qtype(qtype_list, save_filename, mode): 27 | 28 | if mode == 'val': 29 | savepath = os.path.join('./eval', save_filename) 30 | # TODO 31 | img_pre = '/home/dhpseth/vqa/02_tools/VQA/Images/val2014' 32 | elif mode == 'test-dev': 33 | savepath = os.path.join('./test-dev', save_filename) 34 | # TODO 35 | img_pre = '/home/dhpseth/vqa/02_tools/VQA/Images/test2015' 36 | elif mode == 'test': 37 | savepath = os.path.join('./test', save_filename) 38 | # TODO 39 | img_pre = '/home/dhpseth/vqa/02_tools/VQA/Images/test2015' 40 | else: 41 | raise 
Exception('Unsupported mode') 42 | if os.path.exists(savepath): shutil.rmtree(savepath) 43 | if not os.path.exists(savepath): os.makedirs(savepath) 44 | 45 | for qt in qtype_list: 46 | count = 0 47 | for t_question in stat_list: 48 | #print count, t_question 49 | if count < 40/len(qtype_list): 50 | t_question_list = t_question['q_list'] 51 | saveflag = False 52 | #print 'debug****************************' 53 | #print qt 54 | #print t_question_list 55 | #print t_question_list[0] == qt[0] 56 | #print t_question_list[1] == qt[1] 57 | if t_question_list[0] == qt[0] and t_question_list[1] == qt[1]: 58 | saveflag = True 59 | else: 60 | saveflag = False 61 | 62 | if saveflag == True: 63 | t_iid = t_question['iid'] 64 | if mode == 'val': 65 | t_img = Image.open(os.path.join(img_pre, \ 66 | 'COCO_val2014_' + str(t_iid).zfill(12) + '.jpg')) 67 | elif mode == 'test-dev' or 'test': 68 | t_img = Image.open(os.path.join(img_pre, \ 69 | 'COCO_test2015_' + str(t_iid).zfill(12) + '.jpg')) 70 | 71 | # for caption 72 | #print t_iid 73 | #annIds = caps.getAnnIds(t_iid) 74 | #anns = caps.loadAnns(annIds) 75 | #cap_list = [ann['caption'] for ann in anns] 76 | ans_list = t_question['ans_list'] 77 | draw = ImageDraw.Draw(t_img) 78 | for i in range(len(ans_list)): 79 | try: 80 | draw.text((10,10*i), str(ans_list[i])) 81 | except: 82 | pass 83 | 84 | ans = t_question['answer'] 85 | pred = t_question['pred'] 86 | if ans == -1: 87 | pre = '' 88 | elif ans == pred: 89 | pre = 'correct ' 90 | else: 91 | pre = 'failure ' 92 | #print ' aaa ', ans, pred 93 | ans = re.sub( '/', ' ', str(ans)) 94 | pred = re.sub( '/', ' ', str(pred)) 95 | img_title = pre + str(' '.join(t_question_list)) + '. a_' + \ 96 | str(ans) + ' p_' + str(pred) + '.png' 97 | count += 1 98 | print os.path.join(savepath,img_title) 99 | t_img.save(os.path.join(savepath,img_title)) 100 | 101 | print 'saving whatis' 102 | qt_color_list = [['what','color']] 103 | save_qtype(qt_color_list, 'colors', mode) 104 | 105 | print 'saving whatis' 106 | qt_whatis_list = [['what','is'],['what','kind'],['what','are']] 107 | save_qtype(qt_whatis_list, 'whatis', mode) 108 | 109 | print 'saving is' 110 | qt_is_list = [['is','the'], ['is','this'],['is','there']] 111 | save_qtype(qt_is_list, 'is', mode) 112 | 113 | print 'saving how many' 114 | qt_howmany_list =[['how','many']] 115 | save_qtype(qt_howmany_list, 'howmany', mode) 116 | 117 | def exec_validation(device_id, mode, model_name, folder, it='', visualize=False): 118 | 119 | caffe.set_device(device_id) 120 | caffe.set_mode_gpu() 121 | net = caffe.Net('./%s/proto_test.prototxt'%folder,model_name,caffe.TEST) 122 | 123 | dp = VQADataProvider(mode=mode,batchsize=config.VAL_BATCH_SIZE,folder=folder) 124 | total_questions = len(dp.getQuesIds()) 125 | epoch = 0 126 | 127 | pred_list = [] 128 | testloss_list = [] 129 | stat_list = [] 130 | 131 | while epoch == 0: 132 | t_word, t_cont, t_img_feature, t_answer, t_glove_matrix, t_qid_list, t_iid_list, epoch = dp.get_batch_vec() 133 | net.blobs['data'].data[...] = np.transpose(t_word,(1,0)) 134 | net.blobs['cont'].data[...] = np.transpose(t_cont,(1,0)) 135 | net.blobs['img_feature'].data[...] = t_img_feature 136 | net.blobs['label'].data[...] = t_answer 137 | net.blobs['glove'].data[...] 
= np.transpose(t_glove_matrix, (1,0,2)) 138 | net.forward() 139 | t_pred_list = net.blobs['prediction'].data.argmax(axis=1) 140 | t_pred_str = [dp.vec_to_answer(pred_symbol) for pred_symbol in t_pred_list] 141 | testloss_list.append(net.blobs['loss'].data) 142 | for qid, iid, ans, pred in zip(t_qid_list, t_iid_list, t_answer.tolist(), t_pred_str): 143 | #pred_list.append({u'answer':pred, u'question_id': int(dp.getStrippedQuesId(qid))}) 144 | pred_list.append((pred,int(dp.getStrippedQuesId(qid)))) 145 | if visualize: 146 | q_list = dp.seq_to_list(dp.getQuesStr(qid)) 147 | if mode == 'test-dev' or 'test': 148 | ans_str = '' 149 | ans_list = ['']*10 150 | else: 151 | ans_str = dp.vec_to_answer(ans) 152 | ans_list = [ dp.getAnsObj(qid)[i]['answer'] for i in xrange(10)] 153 | stat_list.append({\ 154 | 'qid' : qid, 155 | 'q_list' : q_list, 156 | 'iid' : iid, 157 | 'answer': ans_str, 158 | 'ans_list': ans_list, 159 | 'pred' : pred }) 160 | percent = 100 * float(len(pred_list)) / total_questions 161 | sys.stdout.write('\r' + ('%.2f' % percent) + '%') 162 | sys.stdout.flush() 163 | 164 | 165 | print 'Deduping arr of len', len(pred_list) 166 | deduped = [] 167 | seen = set() 168 | for ans, qid in pred_list: 169 | if qid not in seen: 170 | seen.add(qid) 171 | deduped.append((ans, qid)) 172 | print 'New len', len(deduped) 173 | final_list=[] 174 | for ans,qid in deduped: 175 | final_list.append({u'answer': ans, u'question_id': qid}) 176 | 177 | mean_testloss = np.array(testloss_list).mean() 178 | 179 | if mode == 'val': 180 | valFile = './%s/val2015_resfile'%folder 181 | with open(valFile, 'w') as f: 182 | json.dump(final_list, f) 183 | if visualize: 184 | visualize_failures(stat_list,mode) 185 | annFile = config.DATA_PATHS['val']['ans_file'] 186 | quesFile = config.DATA_PATHS['val']['ques_file'] 187 | vqa = VQA(annFile, quesFile) 188 | vqaRes = vqa.loadRes(valFile, quesFile) 189 | vqaEval = VQAEval(vqa, vqaRes, n=2) 190 | vqaEval.evaluate() 191 | acc_overall = vqaEval.accuracy['overall'] 192 | acc_perQuestionType = vqaEval.accuracy['perQuestionType'] 193 | acc_perAnswerType = vqaEval.accuracy['perAnswerType'] 194 | return mean_testloss, acc_overall, acc_perQuestionType, acc_perAnswerType 195 | elif mode == 'test-dev': 196 | filename = './%s/vqa_OpenEnded_mscoco_test-dev2015_%s-'%(folder,folder)+str(it).zfill(8)+'_results' 197 | with open(filename+'.json', 'w') as f: 198 | json.dump(final_list, f) 199 | if visualize: 200 | visualize_failures(stat_list,mode) 201 | elif mode == 'test': 202 | filename = './%s/vqa_OpenEnded_mscoco_test2015_%s-'%(folder,folder)+str(it).zfill(8)+'_results' 203 | with open(filename+'.json', 'w') as f: 204 | json.dump(final_list, f) 205 | if visualize: 206 | visualize_failures(stat_list,mode) 207 | def drawgraph(results, folder,k,d,prefix='std',save_question_type_graphs=False): 208 | # 0:it 209 | # 1:trainloss 210 | # 2:testloss 211 | # 3:oa_acc 212 | # 4:qt_acc 213 | # 5:at_acc 214 | 215 | # training curve 216 | it = np.array([l[0] for l in results]) 217 | loss = np.array([l[1] for l in results]) 218 | valloss = np.array([l[2] for l in results]) 219 | valacc = np.array([l[3] for l in results]) 220 | 221 | fig = plt.figure() 222 | ax1 = fig.add_subplot(111) 223 | ax2 = ax1.twinx() 224 | 225 | ax1.plot(it,loss, color='blue', label='train loss') 226 | ax1.plot(it,valloss, '--', color='blue', label='test loss') 227 | ax2.plot(it,valacc, color='red', label='acc on val') 228 | plt.legend(loc='lower left') 229 | 230 | ax1.set_xlabel('Iterations') 231 | ax1.set_ylabel('Loss Value') 
232 | ax2.set_ylabel('Accuracy on Val [%]') 233 | 234 | plt.savefig('./%s/result_it_%d_acc_%2.2f_k_%d_d_%d_%s.png'%(folder,it[-1],valacc[-1],k,d,prefix)) 235 | plt.clf() 236 | plt.close("all") 237 | 238 | # question type 239 | it = np.array([l[0] for l in results]) 240 | oa_acc = np.array([l[3] for l in results]) 241 | qt_dic_list = [l[4] for l in results] 242 | 243 | def draw_qt_acc(target_key_list, figname): 244 | fig = plt.figure() 245 | for k in target_key_list: 246 | print k,type(k) 247 | t_val = np.array([ qt_dic[k] for qt_dic in qt_dic_list]) 248 | plt.plot(it,t_val,label=str(k)) 249 | plt.legend(fontsize='small') 250 | plt.ylim(0,100.) 251 | #plt.legend(prop={'size':6}) 252 | 253 | plt.xlabel('Iterations') 254 | plt.ylabel('Accuracy on Val [%]') 255 | 256 | plt.savefig(figname,dpi=200) 257 | plt.clf() 258 | plt.close("all") 259 | 260 | if save_question_type_graphs: 261 | s_keys = sorted(qt_dic_list[0].keys()) 262 | draw_qt_acc(s_keys[ 0:13]+[s_keys[31],], './ind_qt_are.png') 263 | draw_qt_acc(s_keys[13:17]+s_keys[49:], './ind_qt_how_where_who_why.png') 264 | draw_qt_acc(s_keys[17:31]+[s_keys[32],], './ind_qt_is.png') 265 | draw_qt_acc(s_keys[33:49], './ind_qt_what.png') 266 | draw_qt_acc(['what color is the','what color are the','what color is',\ 267 | 'what color','what is the color of the'],'./qt_color.png') 268 | draw_qt_acc(['how many','how','how many people are',\ 269 | 'how many people are in'],'./qt_number.png') 270 | draw_qt_acc(['who is','why','why is the','where is the','where are the',\ 271 | 'which'],'./qt_who_why_where_which.png') 272 | draw_qt_acc(['what is the man','is the man','are they','is he',\ 273 | 'is the woman','is this person','what is the woman','is the person',\ 274 | 'what is the person'],'./qt_human.png') 275 | 276 | 277 | -------------------------------------------------------------------------------- /mfh_coatt_glove/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import os 4 | import sys 5 | import json 6 | import re 7 | import shutil 8 | from PIL import Image 9 | from PIL import ImageFont, ImageDraw 10 | 11 | import caffe 12 | from caffe import layers as L 13 | from caffe import params as P 14 | 15 | from vqa_data_layer import VQADataProvider, VQADataProviderLayer 16 | 17 | import config 18 | sys.path.append(config.VQA_TOOLS_PATH) 19 | sys.path.append(config.VQA_EVAL_TOOLS_PATH) 20 | 21 | from vqaTools.vqa import VQA 22 | from vqaEvaluation.vqaEval import VQAEval 23 | 24 | def visualize_failures(stat_list,mode): 25 | 26 | def save_qtype(qtype_list, save_filename, mode): 27 | 28 | if mode == 'val': 29 | savepath = os.path.join('./eval', save_filename) 30 | # TODO 31 | img_pre = '/home/dhpseth/vqa/02_tools/VQA/Images/val2014' 32 | elif mode == 'test-dev': 33 | savepath = os.path.join('./test-dev', save_filename) 34 | # TODO 35 | img_pre = '/home/dhpseth/vqa/02_tools/VQA/Images/test2015' 36 | elif mode == 'test': 37 | savepath = os.path.join('./test', save_filename) 38 | # TODO 39 | img_pre = '/home/dhpseth/vqa/02_tools/VQA/Images/test2015' 40 | else: 41 | raise Exception('Unsupported mode') 42 | if os.path.exists(savepath): shutil.rmtree(savepath) 43 | if not os.path.exists(savepath): os.makedirs(savepath) 44 | 45 | for qt in qtype_list: 46 | count = 0 47 | for t_question in stat_list: 48 | #print count, t_question 49 | if count < 40/len(qtype_list): 50 | t_question_list = t_question['q_list'] 51 | saveflag = False 52 | #print 
'debug****************************' 53 | #print qt 54 | #print t_question_list 55 | #print t_question_list[0] == qt[0] 56 | #print t_question_list[1] == qt[1] 57 | if t_question_list[0] == qt[0] and t_question_list[1] == qt[1]: 58 | saveflag = True 59 | else: 60 | saveflag = False 61 | 62 | if saveflag == True: 63 | t_iid = t_question['iid'] 64 | if mode == 'val': 65 | t_img = Image.open(os.path.join(img_pre, \ 66 | 'COCO_val2014_' + str(t_iid).zfill(12) + '.jpg')) 67 | elif mode == 'test-dev' or 'test': 68 | t_img = Image.open(os.path.join(img_pre, \ 69 | 'COCO_test2015_' + str(t_iid).zfill(12) + '.jpg')) 70 | 71 | # for caption 72 | #print t_iid 73 | #annIds = caps.getAnnIds(t_iid) 74 | #anns = caps.loadAnns(annIds) 75 | #cap_list = [ann['caption'] for ann in anns] 76 | ans_list = t_question['ans_list'] 77 | draw = ImageDraw.Draw(t_img) 78 | for i in range(len(ans_list)): 79 | try: 80 | draw.text((10,10*i), str(ans_list[i])) 81 | except: 82 | pass 83 | 84 | ans = t_question['answer'] 85 | pred = t_question['pred'] 86 | if ans == -1: 87 | pre = '' 88 | elif ans == pred: 89 | pre = 'correct ' 90 | else: 91 | pre = 'failure ' 92 | #print ' aaa ', ans, pred 93 | ans = re.sub( '/', ' ', str(ans)) 94 | pred = re.sub( '/', ' ', str(pred)) 95 | img_title = pre + str(' '.join(t_question_list)) + '. a_' + \ 96 | str(ans) + ' p_' + str(pred) + '.png' 97 | count += 1 98 | print os.path.join(savepath,img_title) 99 | t_img.save(os.path.join(savepath,img_title)) 100 | 101 | print 'saving whatis' 102 | qt_color_list = [['what','color']] 103 | save_qtype(qt_color_list, 'colors', mode) 104 | 105 | print 'saving whatis' 106 | qt_whatis_list = [['what','is'],['what','kind'],['what','are']] 107 | save_qtype(qt_whatis_list, 'whatis', mode) 108 | 109 | print 'saving is' 110 | qt_is_list = [['is','the'], ['is','this'],['is','there']] 111 | save_qtype(qt_is_list, 'is', mode) 112 | 113 | print 'saving how many' 114 | qt_howmany_list =[['how','many']] 115 | save_qtype(qt_howmany_list, 'howmany', mode) 116 | 117 | def exec_validation(device_id, mode, model_name, folder, it='', visualize=False): 118 | 119 | caffe.set_device(device_id) 120 | caffe.set_mode_gpu() 121 | net = caffe.Net('./%s/proto_test.prototxt'%folder,model_name,caffe.TEST) 122 | 123 | dp = VQADataProvider(mode=mode,batchsize=config.VAL_BATCH_SIZE,folder=folder) 124 | total_questions = len(dp.getQuesIds()) 125 | epoch = 0 126 | 127 | pred_list = [] 128 | testloss_list = [] 129 | stat_list = [] 130 | 131 | while epoch == 0: 132 | t_word, t_cont, t_img_feature, t_answer, t_glove_matrix, t_qid_list, t_iid_list, epoch = dp.get_batch_vec() 133 | net.blobs['data'].data[...] = np.transpose(t_word,(1,0)) 134 | net.blobs['cont'].data[...] = np.transpose(t_cont,(1,0)) 135 | net.blobs['img_feature'].data[...] = t_img_feature 136 | net.blobs['label'].data[...] = t_answer 137 | net.blobs['glove'].data[...] 
= np.transpose(t_glove_matrix, (1,0,2)) 138 | net.forward() 139 | t_pred_list = net.blobs['prediction'].data.argmax(axis=1) 140 | t_pred_str = [dp.vec_to_answer(pred_symbol) for pred_symbol in t_pred_list] 141 | testloss_list.append(net.blobs['loss'].data) 142 | for qid, iid, ans, pred in zip(t_qid_list, t_iid_list, t_answer.tolist(), t_pred_str): 143 | #pred_list.append({u'answer':pred, u'question_id': int(dp.getStrippedQuesId(qid))}) 144 | pred_list.append((pred,int(dp.getStrippedQuesId(qid)))) 145 | if visualize: 146 | q_list = dp.seq_to_list(dp.getQuesStr(qid)) 147 | if mode == 'test-dev' or 'test': 148 | ans_str = '' 149 | ans_list = ['']*10 150 | else: 151 | ans_str = dp.vec_to_answer(ans) 152 | ans_list = [ dp.getAnsObj(qid)[i]['answer'] for i in xrange(10)] 153 | stat_list.append({\ 154 | 'qid' : qid, 155 | 'q_list' : q_list, 156 | 'iid' : iid, 157 | 'answer': ans_str, 158 | 'ans_list': ans_list, 159 | 'pred' : pred }) 160 | percent = 100 * float(len(pred_list)) / total_questions 161 | sys.stdout.write('\r' + ('%.2f' % percent) + '%') 162 | sys.stdout.flush() 163 | 164 | 165 | print 'Deduping arr of len', len(pred_list) 166 | deduped = [] 167 | seen = set() 168 | for ans, qid in pred_list: 169 | if qid not in seen: 170 | seen.add(qid) 171 | deduped.append((ans, qid)) 172 | print 'New len', len(deduped) 173 | final_list=[] 174 | for ans,qid in deduped: 175 | final_list.append({u'answer': ans, u'question_id': qid}) 176 | 177 | mean_testloss = np.array(testloss_list).mean() 178 | 179 | if mode == 'val': 180 | valFile = './%s/val2015_resfile'%folder 181 | with open(valFile, 'w') as f: 182 | json.dump(final_list, f) 183 | if visualize: 184 | visualize_failures(stat_list,mode) 185 | annFile = config.DATA_PATHS['val']['ans_file'] 186 | quesFile = config.DATA_PATHS['val']['ques_file'] 187 | vqa = VQA(annFile, quesFile) 188 | vqaRes = vqa.loadRes(valFile, quesFile) 189 | vqaEval = VQAEval(vqa, vqaRes, n=2) 190 | vqaEval.evaluate() 191 | acc_overall = vqaEval.accuracy['overall'] 192 | acc_perQuestionType = vqaEval.accuracy['perQuestionType'] 193 | acc_perAnswerType = vqaEval.accuracy['perAnswerType'] 194 | return mean_testloss, acc_overall, acc_perQuestionType, acc_perAnswerType 195 | elif mode == 'test-dev': 196 | filename = './%s/vqa_OpenEnded_mscoco_test-dev2015_%s-'%(folder,folder)+str(it).zfill(8)+'_results' 197 | with open(filename+'.json', 'w') as f: 198 | json.dump(final_list, f) 199 | if visualize: 200 | visualize_failures(stat_list,mode) 201 | elif mode == 'test': 202 | filename = './%s/vqa_OpenEnded_mscoco_test2015_%s-'%(folder,folder)+str(it).zfill(8)+'_results' 203 | with open(filename+'.json', 'w') as f: 204 | json.dump(final_list, f) 205 | if visualize: 206 | visualize_failures(stat_list,mode) 207 | def drawgraph(results, folder,k,d,prefix='std',save_question_type_graphs=False): 208 | # 0:it 209 | # 1:trainloss 210 | # 2:testloss 211 | # 3:oa_acc 212 | # 4:qt_acc 213 | # 5:at_acc 214 | 215 | # training curve 216 | it = np.array([l[0] for l in results]) 217 | loss = np.array([l[1] for l in results]) 218 | valloss = np.array([l[2] for l in results]) 219 | valacc = np.array([l[3] for l in results]) 220 | 221 | fig = plt.figure() 222 | ax1 = fig.add_subplot(111) 223 | ax2 = ax1.twinx() 224 | 225 | ax1.plot(it,loss, color='blue', label='train loss') 226 | ax1.plot(it,valloss, '--', color='blue', label='test loss') 227 | ax2.plot(it,valacc, color='red', label='acc on val') 228 | plt.legend(loc='lower left') 229 | 230 | ax1.set_xlabel('Iterations') 231 | ax1.set_ylabel('Loss Value') 
232 | ax2.set_ylabel('Accuracy on Val [%]') 233 | 234 | plt.savefig('./%s/result_it_%d_acc_%2.2f_k_%d_d_%d_%s.png'%(folder,it[-1],valacc[-1],k,d,prefix)) 235 | plt.clf() 236 | plt.close("all") 237 | 238 | # question type 239 | it = np.array([l[0] for l in results]) 240 | oa_acc = np.array([l[3] for l in results]) 241 | qt_dic_list = [l[4] for l in results] 242 | 243 | def draw_qt_acc(target_key_list, figname): 244 | fig = plt.figure() 245 | for k in target_key_list: 246 | print k,type(k) 247 | t_val = np.array([ qt_dic[k] for qt_dic in qt_dic_list]) 248 | plt.plot(it,t_val,label=str(k)) 249 | plt.legend(fontsize='small') 250 | plt.ylim(0,100.) 251 | #plt.legend(prop={'size':6}) 252 | 253 | plt.xlabel('Iterations') 254 | plt.ylabel('Accuracy on Val [%]') 255 | 256 | plt.savefig(figname,dpi=200) 257 | plt.clf() 258 | plt.close("all") 259 | 260 | if save_question_type_graphs: 261 | s_keys = sorted(qt_dic_list[0].keys()) 262 | draw_qt_acc(s_keys[ 0:13]+[s_keys[31],], './ind_qt_are.png') 263 | draw_qt_acc(s_keys[13:17]+s_keys[49:], './ind_qt_how_where_who_why.png') 264 | draw_qt_acc(s_keys[17:31]+[s_keys[32],], './ind_qt_is.png') 265 | draw_qt_acc(s_keys[33:49], './ind_qt_what.png') 266 | draw_qt_acc(['what color is the','what color are the','what color is',\ 267 | 'what color','what is the color of the'],'./qt_color.png') 268 | draw_qt_acc(['how many','how','how many people are',\ 269 | 'how many people are in'],'./qt_number.png') 270 | draw_qt_acc(['who is','why','why is the','where is the','where are the',\ 271 | 'which'],'./qt_who_why_where_which.png') 272 | draw_qt_acc(['what is the man','is the man','are they','is he',\ 273 | 'is the woman','is this person','what is the woman','is the person',\ 274 | 'what is the person'],'./qt_human.png') 275 | 276 | 277 | -------------------------------------------------------------------------------- /mfb_baseline/vqa_data_layer.py: -------------------------------------------------------------------------------- 1 | import caffe 2 | import numpy as np 3 | import re, json, random 4 | import config 5 | 6 | QID_KEY_SEPARATOR = '/' 7 | GLOVE_EMBEDDING_SIZE = 300 8 | 9 | class VQADataProvider: 10 | 11 | def __init__(self, folder='result', batchsize=64, max_length=15, mode='train'): 12 | self.batchsize = batchsize 13 | self.d_vocabulary = None 14 | self.batch_index = None 15 | self.batch_len = None 16 | self.rev_adict = None 17 | self.max_length = max_length 18 | self.mode = mode 19 | self.qdic, self.adic = VQADataProvider.load_data(mode) 20 | 21 | with open('./%s/vdict.json'%folder,'r') as f: 22 | self.vdict = json.load(f) 23 | with open('./%s/adict.json'%folder,'r') as f: 24 | self.adict = json.load(f) 25 | 26 | self.n_ans_vocabulary = len(self.adict) 27 | 28 | @staticmethod 29 | def load_vqa_json(data_split): 30 | """ 31 | Parses the question and answer json files for the given data split. 32 | Returns the question dictionary and the answer dictionary. 
33 | """ 34 | qdic, adic = {}, {} 35 | 36 | with open(config.DATA_PATHS[data_split]['ques_file'], 'r') as f: 37 | qdata = json.load(f)['questions'] 38 | for q in qdata: 39 | qdic[data_split + QID_KEY_SEPARATOR + str(q['question_id'])] = \ 40 | {'qstr': q['question'], 'iid': q['image_id']} 41 | 42 | if 'test' not in data_split: 43 | with open(config.DATA_PATHS[data_split]['ans_file'], 'r') as f: 44 | adata = json.load(f)['annotations'] 45 | for a in adata: 46 | adic[data_split + QID_KEY_SEPARATOR + str(a['question_id'])] = \ 47 | a['answers'] 48 | 49 | print 'parsed', len(qdic), 'questions for', data_split 50 | return qdic, adic 51 | 52 | @staticmethod 53 | def load_genome_json(): 54 | """ 55 | Parses the genome json file. Returns the question dictionary and the 56 | answer dictionary. 57 | """ 58 | qdic, adic = {}, {} 59 | 60 | with open(config.DATA_PATHS['genome']['genome_file'], 'r') as f: 61 | qdata = json.load(f) 62 | for q in qdata: 63 | key = 'genome' + QID_KEY_SEPARATOR + str(q['id']) 64 | qdic[key] = {'qstr': q['question'], 'iid': q['image']} 65 | adic[key] = [{'answer': q['answer']}] 66 | 67 | print 'parsed', len(qdic), 'questions for genome' 68 | return qdic, adic 69 | 70 | @staticmethod 71 | def load_data(data_split_str): 72 | all_qdic, all_adic = {}, {} 73 | for data_split in data_split_str.split('+'): 74 | assert data_split in config.DATA_PATHS.keys(), 'unknown data split' 75 | if data_split == 'genome': 76 | qdic, adic = VQADataProvider.load_genome_json() 77 | all_qdic.update(qdic) 78 | all_adic.update(adic) 79 | else: 80 | qdic, adic = VQADataProvider.load_vqa_json(data_split) 81 | all_qdic.update(qdic) 82 | all_adic.update(adic) 83 | return all_qdic, all_adic 84 | 85 | def getQuesIds(self): 86 | return self.qdic.keys() 87 | 88 | def getStrippedQuesId(self, qid): 89 | return qid.split(QID_KEY_SEPARATOR)[1] 90 | 91 | def getImgId(self,qid): 92 | return self.qdic[qid]['iid'] 93 | 94 | def getQuesStr(self,qid): 95 | return self.qdic[qid]['qstr'] 96 | 97 | def getAnsObj(self,qid): 98 | if self.mode == 'test-dev' or self.mode == 'test': 99 | return -1 100 | return self.adic[qid] 101 | 102 | @staticmethod 103 | def seq_to_list(s): 104 | t_str = s.lower() 105 | for i in [r'\?',r'\!',r'\'',r'\"',r'\$',r'\:',r'\@',r'\(',r'\)',r'\,',r'\.',r'\;']: 106 | t_str = re.sub( i, '', t_str) 107 | for i in [r'\-',r'\/']: 108 | t_str = re.sub( i, ' ', t_str) 109 | q_list = re.sub(r'\?','',t_str.lower()).split(' ') 110 | q_list = filter(lambda x: len(x) > 0, q_list) 111 | return q_list 112 | 113 | def extract_answer(self,answer_obj): 114 | """ Return the most popular answer in string.""" 115 | if self.mode == 'test-dev' or self.mode == 'test': 116 | return -1 117 | answer_list = [ answer_obj[i]['answer'] for i in xrange(10)] 118 | dic = {} 119 | for ans in answer_list: 120 | if dic.has_key(ans): 121 | dic[ans] +=1 122 | else: 123 | dic[ans] = 1 124 | max_key = max((v,k) for (k,v) in dic.items())[1] 125 | return max_key 126 | 127 | def extract_answer_prob(self,answer_obj): 128 | """ Return the most popular answer in string.""" 129 | if self.mode == 'test-dev' or self.mode == 'test': 130 | return -1 131 | 132 | answer_list = [ ans['answer'] for ans in answer_obj] 133 | prob_answer_list = [] 134 | for ans in answer_list: 135 | if self.adict.has_key(ans): 136 | prob_answer_list.append(ans) 137 | 138 | if len(prob_answer_list) == 0: 139 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 140 | return 'hoge' 141 | else: 142 | raise Exception("This should not happen.") 143 | else: 
144 | return random.choice(prob_answer_list) 145 | 146 | def qlist_to_vec(self, max_length, q_list): 147 | """ 148 | Converts a list of words into a format suitable for the embedding layer. 149 | 150 | Arguments: 151 | max_length -- the maximum length of a question sequence 152 | q_list -- a list of words which are the tokens in the question 153 | 154 | Returns: 155 | qvec -- A max_length length vector containing one-hot indices for each word 156 | cvec -- A max_length length sequence continuation indicator vector 157 | """ 158 | qvec = np.zeros(max_length) 159 | cvec = np.zeros(max_length) 160 | for i in xrange(max_length): 161 | if i < max_length - len(q_list): 162 | cvec[i] = 0 163 | else: 164 | w = q_list[i-(max_length-len(q_list))] 165 | # is the word in the vocabulary? 166 | if self.vdict.has_key(w) is False: 167 | w = '' 168 | qvec[i] = self.vdict[w] 169 | cvec[i] = 0 if i == max_length - len(q_list) else 1 170 | 171 | return qvec, cvec 172 | 173 | def answer_to_vec(self, ans_str): 174 | """ Return answer id if the answer is included in vocabulary otherwise '' """ 175 | if self.mode =='test-dev' or self.mode == 'test': 176 | return -1 177 | 178 | if self.adict.has_key(ans_str): 179 | ans = self.adict[ans_str] 180 | else: 181 | ans = self.adict[''] 182 | return ans 183 | 184 | def vec_to_answer(self, ans_symbol): 185 | """ Return answer id if the answer is included in vocabulary otherwise '' """ 186 | if self.rev_adict is None: 187 | rev_adict = {} 188 | for k,v in self.adict.items(): 189 | rev_adict[v] = k 190 | self.rev_adict = rev_adict 191 | 192 | return self.rev_adict[ans_symbol] 193 | 194 | def create_batch(self,qid_list): 195 | 196 | qvec = (np.zeros(self.batchsize*self.max_length)).reshape(self.batchsize,self.max_length) 197 | cvec = (np.zeros(self.batchsize*self.max_length)).reshape(self.batchsize,self.max_length) 198 | ivec = (np.zeros(self.batchsize*2048)).reshape(self.batchsize,2048) 199 | avec = (np.zeros(self.batchsize)).reshape(self.batchsize) 200 | 201 | for i,qid in enumerate(qid_list): 202 | 203 | # load raw question information 204 | q_str = self.getQuesStr(qid) 205 | q_ans = self.getAnsObj(qid) 206 | q_iid = self.getImgId(qid) 207 | 208 | # convert question to vec 209 | q_list = VQADataProvider.seq_to_list(q_str) 210 | t_qvec, t_cvec = self.qlist_to_vec(self.max_length, q_list) 211 | 212 | try: 213 | qid_split = qid.split(QID_KEY_SEPARATOR) 214 | data_split = qid_split[0] 215 | if data_split == 'genome': 216 | t_ivec = np.load(config.DATA_PATHS['genome']['features_prefix'] + str(q_iid) + '.jpg.npz')['x'] 217 | else: 218 | t_ivec = np.load(config.DATA_PATHS[data_split]['features_prefix'] + str(q_iid).zfill(12) + '.jpg.npz')['x'] 219 | t_ivec = ( t_ivec / np.sqrt((t_ivec**2).sum()) ) 220 | except: 221 | t_ivec = 0. 222 | print 'data not found for qid : ', q_iid, self.mode 223 | 224 | # convert answer to vec 225 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 226 | q_ans_str = self.extract_answer(q_ans) 227 | else: 228 | q_ans_str = self.extract_answer_prob(q_ans) 229 | t_avec = self.answer_to_vec(q_ans_str) 230 | 231 | qvec[i,...] = t_qvec 232 | cvec[i,...] = t_cvec 233 | ivec[i,...] = t_ivec 234 | avec[i,...] 
= t_avec 235 | 236 | return qvec, cvec, ivec, avec 237 | 238 | 239 | def get_batch_vec(self): 240 | if self.batch_len is None: 241 | self.n_skipped = 0 242 | qid_list = self.getQuesIds() 243 | random.shuffle(qid_list) 244 | self.qid_list = qid_list 245 | self.batch_len = len(qid_list) 246 | self.batch_index = 0 247 | self.epoch_counter = 0 248 | 249 | def has_at_least_one_valid_answer(t_qid): 250 | answer_obj = self.getAnsObj(t_qid) 251 | answer_list = [ans['answer'] for ans in answer_obj] 252 | for ans in answer_list: 253 | if self.adict.has_key(ans): 254 | return True 255 | 256 | counter = 0 257 | t_qid_list = [] 258 | t_iid_list = [] 259 | while counter < self.batchsize: 260 | t_qid = self.qid_list[self.batch_index] 261 | t_iid = self.getImgId(t_qid) 262 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 263 | t_qid_list.append(t_qid) 264 | t_iid_list.append(t_iid) 265 | counter += 1 266 | elif has_at_least_one_valid_answer(t_qid): 267 | t_qid_list.append(t_qid) 268 | t_iid_list.append(t_iid) 269 | counter += 1 270 | else: 271 | self.n_skipped += 1 272 | 273 | if self.batch_index < self.batch_len-1: 274 | self.batch_index += 1 275 | else: 276 | self.epoch_counter += 1 277 | qid_list = self.getQuesIds() 278 | random.shuffle(qid_list) 279 | self.qid_list = qid_list 280 | self.batch_index = 0 281 | print("%d questions were skipped in a single epoch" % self.n_skipped) 282 | self.n_skipped = 0 283 | 284 | t_batch = self.create_batch(t_qid_list) 285 | return t_batch + (t_qid_list, t_iid_list, self.epoch_counter) 286 | 287 | 288 | class VQADataProviderLayer(caffe.Layer): 289 | """ 290 | Provide input data for VQA. 291 | """ 292 | 293 | def setup(self, bottom, top): 294 | self.batchsize = json.loads(self.param_str)['batchsize'] 295 | self.top_names = ['data','cont','feature','label'] 296 | top[0].reshape(15,self.batchsize) 297 | top[1].reshape(15,self.batchsize) 298 | top[2].reshape(self.batchsize,2048) 299 | top[3].reshape(self.batchsize) 300 | 301 | self.mode = json.loads(self.param_str)['mode'] 302 | self.folder = json.loads(self.param_str)['folder'] 303 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 304 | pass 305 | else: 306 | self.dp = VQADataProvider(batchsize=self.batchsize, mode=self.mode, folder=self.folder) 307 | 308 | def reshape(self, bottom, top): 309 | pass 310 | 311 | def forward(self, bottom, top): 312 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 313 | pass 314 | else: 315 | word, cont, feature, answer, _, _, _ = self.dp.get_batch_vec() 316 | top[0].data[...] = np.transpose(word,(1,0)) # N x T -> T x N 317 | top[1].data[...] = np.transpose(cont,(1,0)) 318 | top[2].data[...] = feature 319 | top[3].data[...] 
= answer 320 | 321 | def backward(self, top, propagate_down, bottom): 322 | pass 323 | 324 | -------------------------------------------------------------------------------- /mfh_baseline/vqa_data_layer.py: -------------------------------------------------------------------------------- 1 | import caffe 2 | import numpy as np 3 | import re, json, random 4 | import config 5 | 6 | QID_KEY_SEPARATOR = '/' 7 | GLOVE_EMBEDDING_SIZE = 300 8 | 9 | class VQADataProvider: 10 | 11 | def __init__(self, folder='result', batchsize=64, max_length=15, mode='train'): 12 | self.batchsize = batchsize 13 | self.d_vocabulary = None 14 | self.batch_index = None 15 | self.batch_len = None 16 | self.rev_adict = None 17 | self.max_length = max_length 18 | self.mode = mode 19 | self.qdic, self.adic = VQADataProvider.load_data(mode) 20 | 21 | with open('./%s/vdict.json'%folder,'r') as f: 22 | self.vdict = json.load(f) 23 | with open('./%s/adict.json'%folder,'r') as f: 24 | self.adict = json.load(f) 25 | 26 | self.n_ans_vocabulary = len(self.adict) 27 | 28 | @staticmethod 29 | def load_vqa_json(data_split): 30 | """ 31 | Parses the question and answer json files for the given data split. 32 | Returns the question dictionary and the answer dictionary. 33 | """ 34 | qdic, adic = {}, {} 35 | 36 | with open(config.DATA_PATHS[data_split]['ques_file'], 'r') as f: 37 | qdata = json.load(f)['questions'] 38 | for q in qdata: 39 | qdic[data_split + QID_KEY_SEPARATOR + str(q['question_id'])] = \ 40 | {'qstr': q['question'], 'iid': q['image_id']} 41 | 42 | if 'test' not in data_split: 43 | with open(config.DATA_PATHS[data_split]['ans_file'], 'r') as f: 44 | adata = json.load(f)['annotations'] 45 | for a in adata: 46 | adic[data_split + QID_KEY_SEPARATOR + str(a['question_id'])] = \ 47 | a['answers'] 48 | 49 | print 'parsed', len(qdic), 'questions for', data_split 50 | return qdic, adic 51 | 52 | @staticmethod 53 | def load_genome_json(): 54 | """ 55 | Parses the genome json file. Returns the question dictionary and the 56 | answer dictionary. 
57 | """ 58 | qdic, adic = {}, {} 59 | 60 | with open(config.DATA_PATHS['genome']['genome_file'], 'r') as f: 61 | qdata = json.load(f) 62 | for q in qdata: 63 | key = 'genome' + QID_KEY_SEPARATOR + str(q['id']) 64 | qdic[key] = {'qstr': q['question'], 'iid': q['image']} 65 | adic[key] = [{'answer': q['answer']}] 66 | 67 | print 'parsed', len(qdic), 'questions for genome' 68 | return qdic, adic 69 | 70 | @staticmethod 71 | def load_data(data_split_str): 72 | all_qdic, all_adic = {}, {} 73 | for data_split in data_split_str.split('+'): 74 | assert data_split in config.DATA_PATHS.keys(), 'unknown data split' 75 | if data_split == 'genome': 76 | qdic, adic = VQADataProvider.load_genome_json() 77 | all_qdic.update(qdic) 78 | all_adic.update(adic) 79 | else: 80 | qdic, adic = VQADataProvider.load_vqa_json(data_split) 81 | all_qdic.update(qdic) 82 | all_adic.update(adic) 83 | return all_qdic, all_adic 84 | 85 | def getQuesIds(self): 86 | return self.qdic.keys() 87 | 88 | def getStrippedQuesId(self, qid): 89 | return qid.split(QID_KEY_SEPARATOR)[1] 90 | 91 | def getImgId(self,qid): 92 | return self.qdic[qid]['iid'] 93 | 94 | def getQuesStr(self,qid): 95 | return self.qdic[qid]['qstr'] 96 | 97 | def getAnsObj(self,qid): 98 | if self.mode == 'test-dev' or self.mode == 'test': 99 | return -1 100 | return self.adic[qid] 101 | 102 | @staticmethod 103 | def seq_to_list(s): 104 | t_str = s.lower() 105 | for i in [r'\?',r'\!',r'\'',r'\"',r'\$',r'\:',r'\@',r'\(',r'\)',r'\,',r'\.',r'\;']: 106 | t_str = re.sub( i, '', t_str) 107 | for i in [r'\-',r'\/']: 108 | t_str = re.sub( i, ' ', t_str) 109 | q_list = re.sub(r'\?','',t_str.lower()).split(' ') 110 | q_list = filter(lambda x: len(x) > 0, q_list) 111 | return q_list 112 | 113 | def extract_answer(self,answer_obj): 114 | """ Return the most popular answer in string.""" 115 | if self.mode == 'test-dev' or self.mode == 'test': 116 | return -1 117 | answer_list = [ answer_obj[i]['answer'] for i in xrange(10)] 118 | dic = {} 119 | for ans in answer_list: 120 | if dic.has_key(ans): 121 | dic[ans] +=1 122 | else: 123 | dic[ans] = 1 124 | max_key = max((v,k) for (k,v) in dic.items())[1] 125 | return max_key 126 | 127 | def extract_answer_prob(self,answer_obj): 128 | """ Return the most popular answer in string.""" 129 | if self.mode == 'test-dev' or self.mode == 'test': 130 | return -1 131 | 132 | answer_list = [ ans['answer'] for ans in answer_obj] 133 | prob_answer_list = [] 134 | for ans in answer_list: 135 | if self.adict.has_key(ans): 136 | prob_answer_list.append(ans) 137 | 138 | if len(prob_answer_list) == 0: 139 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 140 | return 'hoge' 141 | else: 142 | raise Exception("This should not happen.") 143 | else: 144 | return random.choice(prob_answer_list) 145 | 146 | def qlist_to_vec(self, max_length, q_list): 147 | """ 148 | Converts a list of words into a format suitable for the embedding layer. 149 | 150 | Arguments: 151 | max_length -- the maximum length of a question sequence 152 | q_list -- a list of words which are the tokens in the question 153 | 154 | Returns: 155 | qvec -- A max_length length vector containing one-hot indices for each word 156 | cvec -- A max_length length sequence continuation indicator vector 157 | """ 158 | qvec = np.zeros(max_length) 159 | cvec = np.zeros(max_length) 160 | for i in xrange(max_length): 161 | if i < max_length - len(q_list): 162 | cvec[i] = 0 163 | else: 164 | w = q_list[i-(max_length-len(q_list))] 165 | # is the word in the vocabulary? 
166 | if self.vdict.has_key(w) is False: 167 | w = '' 168 | qvec[i] = self.vdict[w] 169 | cvec[i] = 0 if i == max_length - len(q_list) else 1 170 | 171 | return qvec, cvec 172 | 173 | def answer_to_vec(self, ans_str): 174 | """ Return answer id if the answer is included in vocabulary otherwise '' """ 175 | if self.mode =='test-dev' or self.mode == 'test': 176 | return -1 177 | 178 | if self.adict.has_key(ans_str): 179 | ans = self.adict[ans_str] 180 | else: 181 | ans = self.adict[''] 182 | return ans 183 | 184 | def vec_to_answer(self, ans_symbol): 185 | """ Return answer id if the answer is included in vocabulary otherwise '' """ 186 | if self.rev_adict is None: 187 | rev_adict = {} 188 | for k,v in self.adict.items(): 189 | rev_adict[v] = k 190 | self.rev_adict = rev_adict 191 | 192 | return self.rev_adict[ans_symbol] 193 | 194 | def create_batch(self,qid_list): 195 | 196 | qvec = (np.zeros(self.batchsize*self.max_length)).reshape(self.batchsize,self.max_length) 197 | cvec = (np.zeros(self.batchsize*self.max_length)).reshape(self.batchsize,self.max_length) 198 | ivec = (np.zeros(self.batchsize*2048)).reshape(self.batchsize,2048) 199 | avec = (np.zeros(self.batchsize)).reshape(self.batchsize) 200 | 201 | for i,qid in enumerate(qid_list): 202 | 203 | # load raw question information 204 | q_str = self.getQuesStr(qid) 205 | q_ans = self.getAnsObj(qid) 206 | q_iid = self.getImgId(qid) 207 | 208 | # convert question to vec 209 | q_list = VQADataProvider.seq_to_list(q_str) 210 | t_qvec, t_cvec = self.qlist_to_vec(self.max_length, q_list) 211 | 212 | try: 213 | qid_split = qid.split(QID_KEY_SEPARATOR) 214 | data_split = qid_split[0] 215 | if data_split == 'genome': 216 | t_ivec = np.load(config.DATA_PATHS['genome']['features_prefix'] + str(q_iid) + '.jpg.npz')['x'] 217 | else: 218 | t_ivec = np.load(config.DATA_PATHS[data_split]['features_prefix'] + str(q_iid).zfill(12) + '.jpg.npz')['x'] 219 | t_ivec = ( t_ivec / np.sqrt((t_ivec**2).sum()) ) 220 | except: 221 | t_ivec = 0. 222 | print 'data not found for qid : ', q_iid, self.mode 223 | 224 | # convert answer to vec 225 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 226 | q_ans_str = self.extract_answer(q_ans) 227 | else: 228 | q_ans_str = self.extract_answer_prob(q_ans) 229 | t_avec = self.answer_to_vec(q_ans_str) 230 | 231 | qvec[i,...] = t_qvec 232 | cvec[i,...] = t_cvec 233 | ivec[i,...] = t_ivec 234 | avec[i,...] 
= t_avec 235 | 236 | return qvec, cvec, ivec, avec 237 | 238 | 239 | def get_batch_vec(self): 240 | if self.batch_len is None: 241 | self.n_skipped = 0 242 | qid_list = self.getQuesIds() 243 | random.shuffle(qid_list) 244 | self.qid_list = qid_list 245 | self.batch_len = len(qid_list) 246 | self.batch_index = 0 247 | self.epoch_counter = 0 248 | 249 | def has_at_least_one_valid_answer(t_qid): 250 | answer_obj = self.getAnsObj(t_qid) 251 | answer_list = [ans['answer'] for ans in answer_obj] 252 | for ans in answer_list: 253 | if self.adict.has_key(ans): 254 | return True 255 | 256 | counter = 0 257 | t_qid_list = [] 258 | t_iid_list = [] 259 | while counter < self.batchsize: 260 | t_qid = self.qid_list[self.batch_index] 261 | t_iid = self.getImgId(t_qid) 262 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 263 | t_qid_list.append(t_qid) 264 | t_iid_list.append(t_iid) 265 | counter += 1 266 | elif has_at_least_one_valid_answer(t_qid): 267 | t_qid_list.append(t_qid) 268 | t_iid_list.append(t_iid) 269 | counter += 1 270 | else: 271 | self.n_skipped += 1 272 | 273 | if self.batch_index < self.batch_len-1: 274 | self.batch_index += 1 275 | else: 276 | self.epoch_counter += 1 277 | qid_list = self.getQuesIds() 278 | random.shuffle(qid_list) 279 | self.qid_list = qid_list 280 | self.batch_index = 0 281 | print("%d questions were skipped in a single epoch" % self.n_skipped) 282 | self.n_skipped = 0 283 | 284 | t_batch = self.create_batch(t_qid_list) 285 | return t_batch + (t_qid_list, t_iid_list, self.epoch_counter) 286 | 287 | 288 | class VQADataProviderLayer(caffe.Layer): 289 | """ 290 | Provide input data for VQA. 291 | """ 292 | 293 | def setup(self, bottom, top): 294 | self.batchsize = json.loads(self.param_str)['batchsize'] 295 | self.top_names = ['data','cont','feature','label'] 296 | top[0].reshape(15,self.batchsize) 297 | top[1].reshape(15,self.batchsize) 298 | top[2].reshape(self.batchsize,2048) 299 | top[3].reshape(self.batchsize) 300 | 301 | self.mode = json.loads(self.param_str)['mode'] 302 | self.folder = json.loads(self.param_str)['folder'] 303 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 304 | pass 305 | else: 306 | self.dp = VQADataProvider(batchsize=self.batchsize, mode=self.mode, folder=self.folder) 307 | 308 | def reshape(self, bottom, top): 309 | pass 310 | 311 | def forward(self, bottom, top): 312 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 313 | pass 314 | else: 315 | word, cont, feature, answer, _, _, _ = self.dp.get_batch_vec() 316 | top[0].data[...] = np.transpose(word,(1,0)) # N x T -> T x N 317 | top[1].data[...] = np.transpose(cont,(1,0)) 318 | top[2].data[...] = feature 319 | top[3].data[...] 
= answer 320 | 321 | def backward(self, top, propagate_down, bottom): 322 | pass 323 | 324 | -------------------------------------------------------------------------------- /mfb_baseline/vqa_data_layer_kld.py: -------------------------------------------------------------------------------- 1 | import caffe 2 | import numpy as np 3 | import re, json, random 4 | import config 5 | 6 | QID_KEY_SEPARATOR = '/' 7 | GLOVE_EMBEDDING_SIZE = 300 8 | 9 | class VQADataProvider: 10 | 11 | def __init__(self, folder='./result', batchsize=64, max_length=15, mode='train'): 12 | self.batchsize = batchsize 13 | self.d_vocabulary = None 14 | self.batch_index = None 15 | self.batch_len = None 16 | self.rev_adict = None 17 | self.max_length = max_length 18 | self.mode = mode 19 | self.qdic, self.adic = VQADataProvider.load_data(mode) 20 | 21 | with open('./%s/vdict.json'%folder,'r') as f: 22 | self.vdict = json.load(f) 23 | with open('./%s/adict.json'%folder,'r') as f: 24 | self.adict = json.load(f) 25 | 26 | 27 | @staticmethod 28 | def load_vqa_json(data_split): 29 | """ 30 | Parses the question and answer json files for the given data split. 31 | Returns the question dictionary and the answer dictionary. 32 | """ 33 | qdic, adic = {}, {} 34 | 35 | with open(config.DATA_PATHS[data_split]['ques_file'], 'r') as f: 36 | qdata = json.load(f)['questions'] 37 | for q in qdata: 38 | qdic[data_split + QID_KEY_SEPARATOR + str(q['question_id'])] = \ 39 | {'qstr': q['question'], 'iid': q['image_id']} 40 | 41 | if 'test' not in data_split: 42 | with open(config.DATA_PATHS[data_split]['ans_file'], 'r') as f: 43 | adata = json.load(f)['annotations'] 44 | for a in adata: 45 | adic[data_split + QID_KEY_SEPARATOR + str(a['question_id'])] = \ 46 | a['answers'] 47 | 48 | print 'parsed', len(qdic), 'questions for', data_split 49 | return qdic, adic 50 | 51 | @staticmethod 52 | def load_genome_json(): 53 | """ 54 | Parses the genome json file. Returns the question dictionary and the 55 | answer dictionary. 
56 | """ 57 | qdic, adic = {}, {} 58 | 59 | with open(config.DATA_PATHS['genome']['genome_file'], 'r') as f: 60 | qdata = json.load(f) 61 | for q in qdata: 62 | key = 'genome' + QID_KEY_SEPARATOR + str(q['id']) 63 | qdic[key] = {'qstr': q['question'], 'iid': q['image']} 64 | adic[key] = [{'answer': q['answer']}] 65 | 66 | print 'parsed', len(qdic), 'questions for genome' 67 | return qdic, adic 68 | 69 | @staticmethod 70 | def load_data(data_split_str): 71 | all_qdic, all_adic = {}, {} 72 | for data_split in data_split_str.split('+'): 73 | assert data_split in config.DATA_PATHS.keys(), 'unknown data split' 74 | if data_split == 'genome': 75 | qdic, adic = VQADataProvider.load_genome_json() 76 | all_qdic.update(qdic) 77 | all_adic.update(adic) 78 | else: 79 | qdic, adic = VQADataProvider.load_vqa_json(data_split) 80 | all_qdic.update(qdic) 81 | all_adic.update(adic) 82 | return all_qdic, all_adic 83 | 84 | def getQuesIds(self): 85 | return self.qdic.keys() 86 | 87 | def getStrippedQuesId(self, qid): 88 | return qid.split(QID_KEY_SEPARATOR)[1] 89 | 90 | def getImgId(self,qid): 91 | return self.qdic[qid]['iid'] 92 | 93 | def getQuesStr(self,qid): 94 | return self.qdic[qid]['qstr'] 95 | 96 | def getAnsObj(self,qid): 97 | if self.mode == 'test-dev' or self.mode == 'test': 98 | return -1 99 | return self.adic[qid] 100 | 101 | @staticmethod 102 | def seq_to_list(s): 103 | t_str = s.lower() 104 | for i in [r'\?',r'\!',r'\'',r'\"',r'\$',r'\:',r'\@',r'\(',r'\)',r'\,',r'\.',r'\;']: 105 | t_str = re.sub( i, '', t_str) 106 | for i in [r'\-',r'\/']: 107 | t_str = re.sub( i, ' ', t_str) 108 | q_list = re.sub(r'\?','',t_str.lower()).split(' ') 109 | q_list = filter(lambda x: len(x) > 0, q_list) 110 | return q_list 111 | 112 | def extract_answer(self,answer_obj): 113 | """ Return the most popular answer in string.""" 114 | if self.mode == 'test-dev' or self.mode == 'test': 115 | return -1 116 | answer_list = [ answer_obj[i]['answer'] for i in xrange(10)] 117 | dic = {} 118 | for ans in answer_list: 119 | if dic.has_key(ans): 120 | dic[ans] +=1 121 | else: 122 | dic[ans] = 1 123 | max_key = max((v,k) for (k,v) in dic.items())[1] 124 | return max_key 125 | 126 | def extract_answer_prob(self,answer_obj): 127 | """ Return the most popular answer in string.""" 128 | if self.mode == 'test-dev' or self.mode == 'test': 129 | return -1 130 | 131 | answer_list = [ ans['answer'] for ans in answer_obj] 132 | prob_answer_list = [] 133 | for ans in answer_list: 134 | if self.adict.has_key(ans): 135 | prob_answer_list.append(ans) 136 | def extract_answer_list(self,answer_obj): 137 | answer_list = [ ans['answer'] for ans in answer_obj] 138 | prob_answer_vec = np.zeros(config.NUM_OUTPUT_UNITS) 139 | for ans in answer_list: 140 | if self.adict.has_key(ans): 141 | index = self.adict[ans] 142 | prob_answer_vec[index] += 1 143 | return prob_answer_vec / np.sum(prob_answer_vec) 144 | 145 | if len(prob_answer_list) == 0: 146 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 147 | return 'hoge' 148 | else: 149 | raise Exception("This should not happen.") 150 | else: 151 | return random.choice(prob_answer_list) 152 | 153 | def qlist_to_vec(self, max_length, q_list): 154 | """ 155 | Converts a list of words into a format suitable for the embedding layer. 
153 | def qlist_to_vec(self, max_length, q_list):
154 | """
155 | Converts a list of words into a format suitable for the embedding layer.
156 | 
157 | Arguments:
158 | max_length -- the maximum length of a question sequence
159 | q_list -- a list of words which are the tokens in the question
160 | 
161 | Returns:
162 | qvec -- a max_length vector of vocabulary indices, one per word, left-padded with zeros
163 | cvec -- a max_length sequence-continuation indicator vector
164 | """
165 | qvec = np.zeros(max_length)
166 | cvec = np.zeros(max_length)
167 | for i in xrange(max_length):
168 | if i < max_length - len(q_list):
169 | cvec[i] = 0
170 | else:
171 | w = q_list[i-(max_length-len(q_list))]
172 | # is the word in the vocabulary?
173 | if self.vdict.has_key(w) is False:
174 | w = ''
175 | qvec[i] = self.vdict[w]
176 | cvec[i] = 0 if i == max_length - len(q_list) else 1
177 | 
178 | return qvec, cvec
179 | 
180 | def answer_to_vec(self, ans_str):
181 | """ Return the answer id of ans_str, falling back to the id of '' when the answer is out of vocabulary."""
182 | if self.mode =='test-dev' or self.mode == 'test':
183 | return -1
184 | 
185 | if self.adict.has_key(ans_str):
186 | ans = self.adict[ans_str]
187 | else:
188 | ans = self.adict['']
189 | return ans
190 | 
191 | def vec_to_answer(self, ans_symbol):
192 | """ Return the answer string for the given answer id."""
193 | if self.rev_adict is None:
194 | rev_adict = {}
195 | for k,v in self.adict.items():
196 | rev_adict[v] = k
197 | self.rev_adict = rev_adict
198 | 
199 | return self.rev_adict[ans_symbol]
200 | 
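# A worked sketch of qlist_to_vec's left-padding (toy question, illustrative vocabulary):
#     max_length = 5, q_list = ['is', 'it', 'raining']
#     positions 0-1 are padding          -> qvec[0:2] = 0, cvec[0:2] = 0
#     position 2 is the first real token -> qvec[2] = vdict['is'], cvec[2] = 0
#     positions 3-4 continue the sequence -> cvec[3] = cvec[4] = 1
#     qvec = [0, 0, vdict['is'], vdict['it'], vdict['raining']], cvec = [0, 0, 0, 1, 1]
# The 0-then-1 pattern in cvec is the sequence-continuation input Caffe's recurrent layers expect;
# out-of-vocabulary words fall back to the empty-string entry of vdict.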
201 | def create_batch(self,qid_list):
202 | 
203 | qvec = (np.zeros(self.batchsize*self.max_length)).reshape(self.batchsize,self.max_length)
204 | cvec = (np.zeros(self.batchsize*self.max_length)).reshape(self.batchsize,self.max_length)
205 | ivec = (np.zeros(self.batchsize*2048)).reshape(self.batchsize,2048)
206 | avec = (np.zeros(self.batchsize*config.NUM_OUTPUT_UNITS)).reshape(self.batchsize,config.NUM_OUTPUT_UNITS)
207 | 
208 | for i,qid in enumerate(qid_list):
209 | 
210 | # load raw question information
211 | q_str = self.getQuesStr(qid)
212 | q_ans = self.getAnsObj(qid)
213 | q_iid = self.getImgId(qid)
214 | 
215 | # convert question to vec
216 | q_list = VQADataProvider.seq_to_list(q_str)
217 | t_qvec, t_cvec = self.qlist_to_vec(self.max_length, q_list)
218 | 
219 | try:
220 | qid_split = qid.split(QID_KEY_SEPARATOR)
221 | data_split = qid_split[0]
222 | if data_split == 'genome':
223 | t_ivec = np.load(config.DATA_PATHS['genome']['features_prefix'] + str(q_iid) + '.jpg.npz')['x']
224 | else:
225 | t_ivec = np.load(config.DATA_PATHS[data_split]['features_prefix'] + str(q_iid).zfill(12) + '.jpg.npz')['x']
226 | t_ivec = ( t_ivec / np.sqrt((t_ivec**2).sum()) ) # L2-normalize the image feature
227 | except:
228 | t_ivec = 0.
229 | print 'data not found for image id : ', q_iid, self.mode
230 | 
231 | # convert answer to vec: hard label for evaluation, soft distribution for KLD training
232 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test':
233 | q_ans_str = self.extract_answer(q_ans)
234 | t_avec = self.answer_to_vec(q_ans_str)
235 | else:
236 | t_avec = self.extract_answer_list(q_ans)
237 | 
238 | qvec[i,...] = t_qvec
239 | cvec[i,...] = t_cvec
240 | ivec[i,...] = t_ivec
241 | avec[i,...] = t_avec
242 | 
243 | return qvec, cvec, ivec, avec
244 | 
245 | 
246 | def get_batch_vec(self):
247 | if self.batch_len is None: # first call: build and shuffle the question-id list
248 | self.n_skipped = 0
249 | qid_list = self.getQuesIds()
250 | random.shuffle(qid_list)
251 | self.qid_list = qid_list
252 | self.batch_len = len(qid_list)
253 | self.batch_index = 0
254 | self.epoch_counter = 0
255 | 
256 | def has_at_least_one_valid_answer(t_qid): # at least one annotator answer is in the vocabulary
257 | answer_obj = self.getAnsObj(t_qid)
258 | answer_list = [ans['answer'] for ans in answer_obj]
259 | for ans in answer_list:
260 | if self.adict.has_key(ans):
261 | return True
262 | 
263 | counter = 0
264 | t_qid_list = []
265 | t_iid_list = []
266 | while counter < self.batchsize:
267 | t_qid = self.qid_list[self.batch_index]
268 | t_iid = self.getImgId(t_qid)
269 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test':
270 | t_qid_list.append(t_qid)
271 | t_iid_list.append(t_iid)
272 | counter += 1
273 | elif has_at_least_one_valid_answer(t_qid):
274 | t_qid_list.append(t_qid)
275 | t_iid_list.append(t_iid)
276 | counter += 1
277 | else:
278 | self.n_skipped += 1
279 | 
280 | if self.batch_index < self.batch_len-1:
281 | self.batch_index += 1
282 | else: # end of epoch: reshuffle and start over
283 | self.epoch_counter += 1
284 | qid_list = self.getQuesIds()
285 | random.shuffle(qid_list)
286 | self.qid_list = qid_list
287 | self.batch_index = 0
288 | print("%d questions were skipped in a single epoch" % self.n_skipped)
289 | self.n_skipped = 0
290 | 
291 | t_batch = self.create_batch(t_qid_list)
292 | return t_batch + (t_qid_list, t_iid_list, self.epoch_counter)
293 | 
294 | 
295 | class VQADataProviderLayer(caffe.Layer):
296 | """
297 | Provide input data for VQA.
298 | """
299 | 
300 | def setup(self, bottom, top):
301 | self.batchsize = json.loads(self.param_str)['batchsize']
302 | self.top_names = ['data','cont','feature','label']
303 | top[0].reshape(15,self.batchsize)
304 | top[1].reshape(15,self.batchsize)
305 | top[2].reshape(self.batchsize,2048)
306 | top[3].reshape(self.batchsize,config.NUM_OUTPUT_UNITS)
307 | 
308 | self.mode = json.loads(self.param_str)['mode']
309 | self.folder = json.loads(self.param_str)['folder']
310 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test':
311 | pass
312 | else:
313 | self.dp = VQADataProvider(batchsize=self.batchsize, mode=self.mode, folder=self.folder)
314 | 
315 | def reshape(self, bottom, top):
316 | pass
317 | 
318 | def forward(self, bottom, top):
319 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test':
320 | pass
321 | else:
322 | word, cont, feature, answer, _, _, _ = self.dp.get_batch_vec()
323 | top[0].data[...] = np.transpose(word,(1,0)) # N x T -> T x N
324 | top[1].data[...] = np.transpose(cont,(1,0))
325 | top[2].data[...] = feature
326 | top[3].data[...] = answer
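# How this layer is wired: setup() parses a JSON param_str, so the training prototxt declares
# VQADataProviderLayer as a Caffe Python layer and passes something like
#     {"batchsize": 200, "mode": "train", "folder": "result"}
# (values illustrative; the repo's own prototxt may differ). The provider can also be exercised
# directly; a minimal sketch, assuming vdict.json/adict.json exist under ./result and the
# question/feature paths in config.py are valid:
#
#     dp = VQADataProvider(folder='result', batchsize=2, mode='train')
#     qvec, cvec, ivec, avec, qids, iids, epoch = dp.get_batch_vec()
#     # qvec, cvec: (2, 15) word indices / continuation flags, transposed to T x N in forward()
#     # ivec: (2, 2048) L2-normalized pool5 features
#     # avec: (2, config.NUM_OUTPUT_UNITS) soft answer distributions for the KLD loss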
327 | 
328 | def backward(self, top, propagate_down, bottom):
329 | pass
330 | 
331 | 
--------------------------------------------------------------------------------
/mfh_baseline/vqa_data_layer_kld.py:
--------------------------------------------------------------------------------
1 | import caffe
2 | import numpy as np
3 | import re, json, random
4 | import config
5 | 
6 | QID_KEY_SEPARATOR = '/'
7 | GLOVE_EMBEDDING_SIZE = 300
8 | 
9 | class VQADataProvider:
10 | 
11 | def __init__(self, folder='./result', batchsize=64, max_length=15, mode='train'):
12 | self.batchsize = batchsize
13 | self.d_vocabulary = None
14 | self.batch_index = None
15 | self.batch_len = None
16 | self.rev_adict = None
17 | self.max_length = max_length
18 | self.mode = mode
19 | self.qdic, self.adic = VQADataProvider.load_data(mode)
20 | 
21 | with open('./%s/vdict.json'%folder,'r') as f:
22 | self.vdict = json.load(f)
23 | with open('./%s/adict.json'%folder,'r') as f:
24 | self.adict = json.load(f)
25 | 
26 | 
27 | @staticmethod
28 | def load_vqa_json(data_split):
29 | """
30 | Parses the question and answer json files for the given data split.
31 | Returns the question dictionary and the answer dictionary.
32 | """
33 | qdic, adic = {}, {}
34 | 
35 | with open(config.DATA_PATHS[data_split]['ques_file'], 'r') as f:
36 | qdata = json.load(f)['questions']
37 | for q in qdata:
38 | qdic[data_split + QID_KEY_SEPARATOR + str(q['question_id'])] = \
39 | {'qstr': q['question'], 'iid': q['image_id']}
40 | 
41 | if 'test' not in data_split:
42 | with open(config.DATA_PATHS[data_split]['ans_file'], 'r') as f:
43 | adata = json.load(f)['annotations']
44 | for a in adata:
45 | adic[data_split + QID_KEY_SEPARATOR + str(a['question_id'])] = \
46 | a['answers']
47 | 
48 | print 'parsed', len(qdic), 'questions for', data_split
49 | return qdic, adic
50 | 
51 | @staticmethod
52 | def load_genome_json():
53 | """
54 | Parses the genome json file. Returns the question dictionary and the
55 | answer dictionary.
56 | """ 57 | qdic, adic = {}, {} 58 | 59 | with open(config.DATA_PATHS['genome']['genome_file'], 'r') as f: 60 | qdata = json.load(f) 61 | for q in qdata: 62 | key = 'genome' + QID_KEY_SEPARATOR + str(q['id']) 63 | qdic[key] = {'qstr': q['question'], 'iid': q['image']} 64 | adic[key] = [{'answer': q['answer']}] 65 | 66 | print 'parsed', len(qdic), 'questions for genome' 67 | return qdic, adic 68 | 69 | @staticmethod 70 | def load_data(data_split_str): 71 | all_qdic, all_adic = {}, {} 72 | for data_split in data_split_str.split('+'): 73 | assert data_split in config.DATA_PATHS.keys(), 'unknown data split' 74 | if data_split == 'genome': 75 | qdic, adic = VQADataProvider.load_genome_json() 76 | all_qdic.update(qdic) 77 | all_adic.update(adic) 78 | else: 79 | qdic, adic = VQADataProvider.load_vqa_json(data_split) 80 | all_qdic.update(qdic) 81 | all_adic.update(adic) 82 | return all_qdic, all_adic 83 | 84 | def getQuesIds(self): 85 | return self.qdic.keys() 86 | 87 | def getStrippedQuesId(self, qid): 88 | return qid.split(QID_KEY_SEPARATOR)[1] 89 | 90 | def getImgId(self,qid): 91 | return self.qdic[qid]['iid'] 92 | 93 | def getQuesStr(self,qid): 94 | return self.qdic[qid]['qstr'] 95 | 96 | def getAnsObj(self,qid): 97 | if self.mode == 'test-dev' or self.mode == 'test': 98 | return -1 99 | return self.adic[qid] 100 | 101 | @staticmethod 102 | def seq_to_list(s): 103 | t_str = s.lower() 104 | for i in [r'\?',r'\!',r'\'',r'\"',r'\$',r'\:',r'\@',r'\(',r'\)',r'\,',r'\.',r'\;']: 105 | t_str = re.sub( i, '', t_str) 106 | for i in [r'\-',r'\/']: 107 | t_str = re.sub( i, ' ', t_str) 108 | q_list = re.sub(r'\?','',t_str.lower()).split(' ') 109 | q_list = filter(lambda x: len(x) > 0, q_list) 110 | return q_list 111 | 112 | def extract_answer(self,answer_obj): 113 | """ Return the most popular answer in string.""" 114 | if self.mode == 'test-dev' or self.mode == 'test': 115 | return -1 116 | answer_list = [ answer_obj[i]['answer'] for i in xrange(10)] 117 | dic = {} 118 | for ans in answer_list: 119 | if dic.has_key(ans): 120 | dic[ans] +=1 121 | else: 122 | dic[ans] = 1 123 | max_key = max((v,k) for (k,v) in dic.items())[1] 124 | return max_key 125 | 126 | def extract_answer_prob(self,answer_obj): 127 | """ Return the most popular answer in string.""" 128 | if self.mode == 'test-dev' or self.mode == 'test': 129 | return -1 130 | 131 | answer_list = [ ans['answer'] for ans in answer_obj] 132 | prob_answer_list = [] 133 | for ans in answer_list: 134 | if self.adict.has_key(ans): 135 | prob_answer_list.append(ans) 136 | def extract_answer_list(self,answer_obj): 137 | answer_list = [ ans['answer'] for ans in answer_obj] 138 | prob_answer_vec = np.zeros(config.NUM_OUTPUT_UNITS) 139 | for ans in answer_list: 140 | if self.adict.has_key(ans): 141 | index = self.adict[ans] 142 | prob_answer_vec[index] += 1 143 | return prob_answer_vec / np.sum(prob_answer_vec) 144 | 145 | if len(prob_answer_list) == 0: 146 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 147 | return 'hoge' 148 | else: 149 | raise Exception("This should not happen.") 150 | else: 151 | return random.choice(prob_answer_list) 152 | 153 | def qlist_to_vec(self, max_length, q_list): 154 | """ 155 | Converts a list of words into a format suitable for the embedding layer. 
156 | 157 | Arguments: 158 | max_length -- the maximum length of a question sequence 159 | q_list -- a list of words which are the tokens in the question 160 | 161 | Returns: 162 | qvec -- A max_length length vector containing one-hot indices for each word 163 | cvec -- A max_length length sequence continuation indicator vector 164 | """ 165 | qvec = np.zeros(max_length) 166 | cvec = np.zeros(max_length) 167 | for i in xrange(max_length): 168 | if i < max_length - len(q_list): 169 | cvec[i] = 0 170 | else: 171 | w = q_list[i-(max_length-len(q_list))] 172 | # is the word in the vocabulary? 173 | if self.vdict.has_key(w) is False: 174 | w = '' 175 | qvec[i] = self.vdict[w] 176 | cvec[i] = 0 if i == max_length - len(q_list) else 1 177 | 178 | return qvec, cvec 179 | 180 | def answer_to_vec(self, ans_str): 181 | """ Return answer id if the answer is included in vocabulary otherwise '' """ 182 | if self.mode =='test-dev' or self.mode == 'test': 183 | return -1 184 | 185 | if self.adict.has_key(ans_str): 186 | ans = self.adict[ans_str] 187 | else: 188 | ans = self.adict[''] 189 | return ans 190 | 191 | def vec_to_answer(self, ans_symbol): 192 | """ Return answer id if the answer is included in vocabulary otherwise '' """ 193 | if self.rev_adict is None: 194 | rev_adict = {} 195 | for k,v in self.adict.items(): 196 | rev_adict[v] = k 197 | self.rev_adict = rev_adict 198 | 199 | return self.rev_adict[ans_symbol] 200 | 201 | def create_batch(self,qid_list): 202 | 203 | qvec = (np.zeros(self.batchsize*self.max_length)).reshape(self.batchsize,self.max_length) 204 | cvec = (np.zeros(self.batchsize*self.max_length)).reshape(self.batchsize,self.max_length) 205 | ivec = (np.zeros(self.batchsize*2048)).reshape(self.batchsize,2048) 206 | avec = (np.zeros(self.batchsize*config.NUM_OUTPUT_UNITS)).reshape(self.batchsize,config.NUM_OUTPUT_UNITS) 207 | 208 | for i,qid in enumerate(qid_list): 209 | 210 | # load raw question information 211 | q_str = self.getQuesStr(qid) 212 | q_ans = self.getAnsObj(qid) 213 | q_iid = self.getImgId(qid) 214 | 215 | # convert question to vec 216 | q_list = VQADataProvider.seq_to_list(q_str) 217 | t_qvec, t_cvec = self.qlist_to_vec(self.max_length, q_list) 218 | 219 | try: 220 | qid_split = qid.split(QID_KEY_SEPARATOR) 221 | data_split = qid_split[0] 222 | if data_split == 'genome': 223 | t_ivec = np.load(config.DATA_PATHS['genome']['features_prefix'] + str(q_iid) + '.jpg.npz')['x'] 224 | else: 225 | t_ivec = np.load(config.DATA_PATHS[data_split]['features_prefix'] + str(q_iid).zfill(12) + '.jpg.npz')['x'] 226 | t_ivec = ( t_ivec / np.sqrt((t_ivec**2).sum()) ) 227 | except: 228 | t_ivec = 0. 229 | print 'data not found for qid : ', q_iid, self.mode 230 | 231 | # convert answer to vec 232 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 233 | q_ans_str = self.extract_answer(q_ans) 234 | t_avec = self.answer_to_vec(q_ans_str) 235 | else: 236 | t_avec = self.extract_answer_list(q_ans) 237 | 238 | qvec[i,...] = t_qvec 239 | cvec[i,...] = t_cvec 240 | ivec[i,...] = t_ivec 241 | avec[i,...] 
= t_avec 242 | 243 | return qvec, cvec, ivec, avec 244 | 245 | 246 | def get_batch_vec(self): 247 | if self.batch_len is None: 248 | self.n_skipped = 0 249 | qid_list = self.getQuesIds() 250 | random.shuffle(qid_list) 251 | self.qid_list = qid_list 252 | self.batch_len = len(qid_list) 253 | self.batch_index = 0 254 | self.epoch_counter = 0 255 | 256 | def has_at_least_one_valid_answer(t_qid): 257 | answer_obj = self.getAnsObj(t_qid) 258 | answer_list = [ans['answer'] for ans in answer_obj] 259 | for ans in answer_list: 260 | if self.adict.has_key(ans): 261 | return True 262 | 263 | counter = 0 264 | t_qid_list = [] 265 | t_iid_list = [] 266 | while counter < self.batchsize: 267 | t_qid = self.qid_list[self.batch_index] 268 | t_iid = self.getImgId(t_qid) 269 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 270 | t_qid_list.append(t_qid) 271 | t_iid_list.append(t_iid) 272 | counter += 1 273 | elif has_at_least_one_valid_answer(t_qid): 274 | t_qid_list.append(t_qid) 275 | t_iid_list.append(t_iid) 276 | counter += 1 277 | else: 278 | self.n_skipped += 1 279 | 280 | if self.batch_index < self.batch_len-1: 281 | self.batch_index += 1 282 | else: 283 | self.epoch_counter += 1 284 | qid_list = self.getQuesIds() 285 | random.shuffle(qid_list) 286 | self.qid_list = qid_list 287 | self.batch_index = 0 288 | print("%d questions were skipped in a single epoch" % self.n_skipped) 289 | self.n_skipped = 0 290 | 291 | t_batch = self.create_batch(t_qid_list) 292 | return t_batch + (t_qid_list, t_iid_list, self.epoch_counter) 293 | 294 | 295 | class VQADataProviderLayer(caffe.Layer): 296 | """ 297 | Provide input data for VQA. 298 | """ 299 | 300 | def setup(self, bottom, top): 301 | self.batchsize = json.loads(self.param_str)['batchsize'] 302 | self.top_names = ['data','cont','feature','label'] 303 | top[0].reshape(15,self.batchsize) 304 | top[1].reshape(15,self.batchsize) 305 | top[2].reshape(self.batchsize,2048) 306 | top[3].reshape(self.batchsize,config.NUM_OUTPUT_UNITS) 307 | 308 | self.mode = json.loads(self.param_str)['mode'] 309 | self.folder = json.loads(self.param_str)['folder'] 310 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 311 | pass 312 | else: 313 | self.dp = VQADataProvider(batchsize=self.batchsize, mode=self.mode, folder=self.folder) 314 | 315 | def reshape(self, bottom, top): 316 | pass 317 | 318 | def forward(self, bottom, top): 319 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 320 | pass 321 | else: 322 | word, cont, feature, answer, _, _, _ = self.dp.get_batch_vec() 323 | top[0].data[...] = np.transpose(word,(1,0)) # N x T -> T x N 324 | top[1].data[...] = np.transpose(cont,(1,0)) 325 | top[2].data[...] = feature 326 | top[3].data[...] 
= answer 327 | 328 | def backward(self, top, propagate_down, bottom): 329 | pass 330 | 331 | -------------------------------------------------------------------------------- /mfb_coatt_glove/vqa_data_layer.py: -------------------------------------------------------------------------------- 1 | import caffe 2 | import numpy as np 3 | import re, json, random 4 | import config 5 | import spacy 6 | 7 | QID_KEY_SEPARATOR = '/' 8 | GLOVE_EMBEDDING_SIZE = 300 9 | 10 | class VQADataProvider: 11 | 12 | def __init__(self, folder='result', batchsize=64, max_length=15, mode='train'): 13 | self.batchsize = batchsize 14 | self.d_vocabulary = None 15 | self.batch_index = None 16 | self.batch_len = None 17 | self.rev_adict = None 18 | self.max_length = max_length 19 | self.mode = mode 20 | self.qdic, self.adic = VQADataProvider.load_data(mode) 21 | 22 | with open('./%s/vdict.json'%folder,'r') as f: 23 | self.vdict = json.load(f) 24 | with open('./%s/adict.json'%folder,'r') as f: 25 | self.adict = json.load(f) 26 | 27 | self.n_ans_vocabulary = len(self.adict) 28 | self.nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors') 29 | self.glove_dict = {} # word -> glove vector 30 | 31 | @staticmethod 32 | def load_vqa_json(data_split): 33 | """ 34 | Parses the question and answer json files for the given data split. 35 | Returns the question dictionary and the answer dictionary. 36 | """ 37 | qdic, adic = {}, {} 38 | 39 | with open(config.DATA_PATHS[data_split]['ques_file'], 'r') as f: 40 | qdata = json.load(f)['questions'] 41 | for q in qdata: 42 | qdic[data_split + QID_KEY_SEPARATOR + str(q['question_id'])] = \ 43 | {'qstr': q['question'], 'iid': q['image_id']} 44 | 45 | if 'test' not in data_split: 46 | with open(config.DATA_PATHS[data_split]['ans_file'], 'r') as f: 47 | adata = json.load(f)['annotations'] 48 | for a in adata: 49 | adic[data_split + QID_KEY_SEPARATOR + str(a['question_id'])] = \ 50 | a['answers'] 51 | 52 | print 'parsed', len(qdic), 'questions for', data_split 53 | return qdic, adic 54 | 55 | @staticmethod 56 | def load_genome_json(): 57 | """ 58 | Parses the genome json file. Returns the question dictionary and the 59 | answer dictionary. 
60 | """ 61 | qdic, adic = {}, {} 62 | 63 | with open(config.DATA_PATHS['genome']['genome_file'], 'r') as f: 64 | qdata = json.load(f) 65 | for q in qdata: 66 | key = 'genome' + QID_KEY_SEPARATOR + str(q['id']) 67 | qdic[key] = {'qstr': q['question'], 'iid': q['image']} 68 | adic[key] = [{'answer': q['answer']}] 69 | 70 | print 'parsed', len(qdic), 'questions for genome' 71 | return qdic, adic 72 | 73 | @staticmethod 74 | def load_data(data_split_str): 75 | all_qdic, all_adic = {}, {} 76 | for data_split in data_split_str.split('+'): 77 | assert data_split in config.DATA_PATHS.keys(), 'unknown data split' 78 | if data_split == 'genome': 79 | qdic, adic = VQADataProvider.load_genome_json() 80 | all_qdic.update(qdic) 81 | all_adic.update(adic) 82 | else: 83 | qdic, adic = VQADataProvider.load_vqa_json(data_split) 84 | all_qdic.update(qdic) 85 | all_adic.update(adic) 86 | return all_qdic, all_adic 87 | 88 | def getQuesIds(self): 89 | return self.qdic.keys() 90 | 91 | def getStrippedQuesId(self, qid): 92 | return qid.split(QID_KEY_SEPARATOR)[1] 93 | 94 | def getImgId(self,qid): 95 | return self.qdic[qid]['iid'] 96 | 97 | def getQuesStr(self,qid): 98 | return self.qdic[qid]['qstr'] 99 | 100 | def getAnsObj(self,qid): 101 | if self.mode == 'test-dev' or self.mode == 'test': 102 | return -1 103 | return self.adic[qid] 104 | 105 | @staticmethod 106 | def seq_to_list(s): 107 | t_str = s.lower() 108 | for i in [r'\?',r'\!',r'\'',r'\"',r'\$',r'\:',r'\@',r'\(',r'\)',r'\,',r'\.',r'\;']: 109 | t_str = re.sub( i, '', t_str) 110 | for i in [r'\-',r'\/']: 111 | t_str = re.sub( i, ' ', t_str) 112 | q_list = re.sub(r'\?','',t_str.lower()).split(' ') 113 | q_list = filter(lambda x: len(x) > 0, q_list) 114 | return q_list 115 | 116 | def extract_answer(self,answer_obj): 117 | """ Return the most popular answer in string.""" 118 | if self.mode == 'test-dev' or self.mode == 'test': 119 | return -1 120 | answer_list = [ answer_obj[i]['answer'] for i in xrange(10)] 121 | dic = {} 122 | for ans in answer_list: 123 | if dic.has_key(ans): 124 | dic[ans] +=1 125 | else: 126 | dic[ans] = 1 127 | max_key = max((v,k) for (k,v) in dic.items())[1] 128 | return max_key 129 | 130 | def extract_answer_prob(self,answer_obj): 131 | """ Return the most popular answer in string.""" 132 | if self.mode == 'test-dev' or self.mode == 'test': 133 | return -1 134 | 135 | answer_list = [ ans['answer'] for ans in answer_obj] 136 | prob_answer_list = [] 137 | for ans in answer_list: 138 | if self.adict.has_key(ans): 139 | prob_answer_list.append(ans) 140 | 141 | if len(prob_answer_list) == 0: 142 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 143 | return 'hoge' 144 | else: 145 | raise Exception("This should not happen.") 146 | else: 147 | return random.choice(prob_answer_list) 148 | 149 | def qlist_to_vec(self, max_length, q_list): 150 | """ 151 | Converts a list of words into a format suitable for the embedding layer. 
152 | 153 | Arguments: 154 | max_length -- the maximum length of a question sequence 155 | q_list -- a list of words which are the tokens in the question 156 | 157 | Returns: 158 | qvec -- A max_length length vector containing one-hot indices for each word 159 | cvec -- A max_length length sequence continuation indicator vector 160 | glove_matrix -- A max_length x GLOVE_EMBEDDING_SIZE matrix containing the glove embedding for 161 | each word 162 | """ 163 | qvec = np.zeros(max_length) 164 | cvec = np.zeros(max_length) 165 | glove_matrix = np.zeros(max_length * GLOVE_EMBEDDING_SIZE).reshape(max_length, GLOVE_EMBEDDING_SIZE) 166 | for i in xrange(max_length): 167 | if i < max_length - len(q_list): 168 | cvec[i] = 0 169 | else: 170 | w = q_list[i-(max_length-len(q_list))] 171 | if w not in self.glove_dict: 172 | self.glove_dict[w] = self.nlp(u'%s' % w).vector 173 | glove_matrix[i] = self.glove_dict[w] 174 | # is the word in the vocabulary? 175 | if self.vdict.has_key(w) is False: 176 | w = '' 177 | qvec[i] = self.vdict[w] 178 | cvec[i] = 0 if i == max_length - len(q_list) else 1 179 | 180 | return qvec, cvec, glove_matrix 181 | 182 | def answer_to_vec(self, ans_str): 183 | """ Return answer id if the answer is included in vocabulary otherwise '' """ 184 | if self.mode =='test-dev' or self.mode == 'test': 185 | return -1 186 | 187 | if self.adict.has_key(ans_str): 188 | ans = self.adict[ans_str] 189 | else: 190 | ans = self.adict[''] 191 | return ans 192 | 193 | def vec_to_answer(self, ans_symbol): 194 | """ Return answer id if the answer is included in vocabulary otherwise '' """ 195 | if self.rev_adict is None: 196 | rev_adict = {} 197 | for k,v in self.adict.items(): 198 | rev_adict[v] = k 199 | self.rev_adict = rev_adict 200 | 201 | return self.rev_adict[ans_symbol] 202 | 203 | def create_batch(self,qid_list): 204 | 205 | qvec = (np.zeros(self.batchsize*self.max_length)).reshape(self.batchsize,self.max_length) 206 | cvec = (np.zeros(self.batchsize*self.max_length)).reshape(self.batchsize,self.max_length) 207 | ivec = (np.zeros(self.batchsize*2048*14*14)).reshape(self.batchsize,2048,14,14) 208 | avec = (np.zeros(self.batchsize)).reshape(self.batchsize) 209 | glove_matrix = np.zeros(self.batchsize * self.max_length * GLOVE_EMBEDDING_SIZE).reshape(\ 210 | self.batchsize, self.max_length, GLOVE_EMBEDDING_SIZE) 211 | 212 | for i,qid in enumerate(qid_list): 213 | 214 | # load raw question information 215 | q_str = self.getQuesStr(qid) 216 | q_ans = self.getAnsObj(qid) 217 | q_iid = self.getImgId(qid) 218 | 219 | # convert question to vec 220 | q_list = VQADataProvider.seq_to_list(q_str) 221 | t_qvec, t_cvec, t_glove_matrix = self.qlist_to_vec(self.max_length, q_list) 222 | 223 | try: 224 | qid_split = qid.split(QID_KEY_SEPARATOR) 225 | data_split = qid_split[0] 226 | if data_split == 'genome': 227 | t_ivec = np.load(config.DATA_PATHS['genome']['features_prefix'] + str(q_iid) + '.jpg.npz')['x'] 228 | else: 229 | t_ivec = np.load(config.DATA_PATHS[data_split]['features_prefix'] + str(q_iid).zfill(12) + '.jpg.npz')['x'] 230 | t_ivec = ( t_ivec / np.sqrt((t_ivec**2).sum()) ) 231 | except: 232 | t_ivec = 0. 233 | print 'data not found for qid : ', q_iid, self.mode 234 | 235 | # convert answer to vec 236 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 237 | q_ans_str = self.extract_answer(q_ans) 238 | else: 239 | q_ans_str = self.extract_answer_prob(q_ans) 240 | t_avec = self.answer_to_vec(q_ans_str) 241 | qvec[i,...] = t_qvec 242 | cvec[i,...] = t_cvec 243 | ivec[i,...] 
= t_ivec 244 | avec[i,...] = t_avec 245 | glove_matrix[i,...] = t_glove_matrix 246 | 247 | return qvec, cvec, ivec, avec, glove_matrix 248 | 249 | 250 | def get_batch_vec(self): 251 | if self.batch_len is None: 252 | self.n_skipped = 0 253 | qid_list = self.getQuesIds() 254 | random.shuffle(qid_list) 255 | self.qid_list = qid_list 256 | self.batch_len = len(qid_list) 257 | self.batch_index = 0 258 | self.epoch_counter = 0 259 | 260 | def has_at_least_one_valid_answer(t_qid): 261 | answer_obj = self.getAnsObj(t_qid) 262 | answer_list = [ans['answer'] for ans in answer_obj] 263 | for ans in answer_list: 264 | if self.adict.has_key(ans): 265 | return True 266 | 267 | counter = 0 268 | t_qid_list = [] 269 | t_iid_list = [] 270 | while counter < self.batchsize: 271 | t_qid = self.qid_list[self.batch_index] 272 | t_iid = self.getImgId(t_qid) 273 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 274 | t_qid_list.append(t_qid) 275 | t_iid_list.append(t_iid) 276 | counter += 1 277 | elif has_at_least_one_valid_answer(t_qid): 278 | t_qid_list.append(t_qid) 279 | t_iid_list.append(t_iid) 280 | counter += 1 281 | else: 282 | self.n_skipped += 1 283 | 284 | if self.batch_index < self.batch_len-1: 285 | self.batch_index += 1 286 | else: 287 | self.epoch_counter += 1 288 | qid_list = self.getQuesIds() 289 | random.shuffle(qid_list) 290 | self.qid_list = qid_list 291 | self.batch_index = 0 292 | print("%d questions were skipped in a single epoch" % self.n_skipped) 293 | self.n_skipped = 0 294 | 295 | t_batch = self.create_batch(t_qid_list) 296 | return t_batch + (t_qid_list, t_iid_list, self.epoch_counter) 297 | 298 | 299 | class VQADataProviderLayer(caffe.Layer): 300 | """ 301 | Provide input data for VQA. 302 | """ 303 | 304 | def setup(self, bottom, top): 305 | self.batchsize = json.loads(self.param_str)['batchsize'] 306 | self.top_names = ['data','cont','feature','label','glove'] 307 | top[0].reshape(15,self.batchsize) 308 | top[1].reshape(15,self.batchsize) 309 | top[2].reshape(self.batchsize,2048,14,14) 310 | top[3].reshape(self.batchsize) 311 | top[4].reshape(15,self.batchsize,GLOVE_EMBEDDING_SIZE) 312 | 313 | self.mode = json.loads(self.param_str)['mode'] 314 | self.folder = json.loads(self.param_str)['folder'] 315 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 316 | pass 317 | else: 318 | self.dp = VQADataProvider(batchsize=self.batchsize, mode=self.mode, folder=self.folder) 319 | 320 | def reshape(self, bottom, top): 321 | pass 322 | 323 | def forward(self, bottom, top): 324 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 325 | pass 326 | else: 327 | word, cont, feature, answer, glove_matrix, _, _, _ = self.dp.get_batch_vec() 328 | top[0].data[...] = np.transpose(word,(1,0)) # N x T -> T x N 329 | top[1].data[...] = np.transpose(cont,(1,0)) 330 | top[2].data[...] = feature 331 | top[3].data[...] = answer 332 | top[4].data[...] 
= np.transpose(glove_matrix, (1,0,2)) # N x T x 300 -> T x N x 300 333 | 334 | def backward(self, top, propagate_down, bottom): 335 | pass 336 | 337 | -------------------------------------------------------------------------------- /mfh_coatt_glove/vqa_data_layer.py: -------------------------------------------------------------------------------- 1 | import caffe 2 | import numpy as np 3 | import re, json, random 4 | import config 5 | import spacy 6 | 7 | QID_KEY_SEPARATOR = '/' 8 | GLOVE_EMBEDDING_SIZE = 300 9 | 10 | class VQADataProvider: 11 | 12 | def __init__(self, folder='result', batchsize=64, max_length=15, mode='train'): 13 | self.batchsize = batchsize 14 | self.d_vocabulary = None 15 | self.batch_index = None 16 | self.batch_len = None 17 | self.rev_adict = None 18 | self.max_length = max_length 19 | self.mode = mode 20 | self.qdic, self.adic = VQADataProvider.load_data(mode) 21 | 22 | with open('./%s/vdict.json'%folder,'r') as f: 23 | self.vdict = json.load(f) 24 | with open('./%s/adict.json'%folder,'r') as f: 25 | self.adict = json.load(f) 26 | 27 | self.n_ans_vocabulary = len(self.adict) 28 | self.nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors') 29 | self.glove_dict = {} # word -> glove vector 30 | 31 | @staticmethod 32 | def load_vqa_json(data_split): 33 | """ 34 | Parses the question and answer json files for the given data split. 35 | Returns the question dictionary and the answer dictionary. 36 | """ 37 | qdic, adic = {}, {} 38 | 39 | with open(config.DATA_PATHS[data_split]['ques_file'], 'r') as f: 40 | qdata = json.load(f)['questions'] 41 | for q in qdata: 42 | qdic[data_split + QID_KEY_SEPARATOR + str(q['question_id'])] = \ 43 | {'qstr': q['question'], 'iid': q['image_id']} 44 | 45 | if 'test' not in data_split: 46 | with open(config.DATA_PATHS[data_split]['ans_file'], 'r') as f: 47 | adata = json.load(f)['annotations'] 48 | for a in adata: 49 | adic[data_split + QID_KEY_SEPARATOR + str(a['question_id'])] = \ 50 | a['answers'] 51 | 52 | print 'parsed', len(qdic), 'questions for', data_split 53 | return qdic, adic 54 | 55 | @staticmethod 56 | def load_genome_json(): 57 | """ 58 | Parses the genome json file. Returns the question dictionary and the 59 | answer dictionary. 
60 | """ 61 | qdic, adic = {}, {} 62 | 63 | with open(config.DATA_PATHS['genome']['genome_file'], 'r') as f: 64 | qdata = json.load(f) 65 | for q in qdata: 66 | key = 'genome' + QID_KEY_SEPARATOR + str(q['id']) 67 | qdic[key] = {'qstr': q['question'], 'iid': q['image']} 68 | adic[key] = [{'answer': q['answer']}] 69 | 70 | print 'parsed', len(qdic), 'questions for genome' 71 | return qdic, adic 72 | 73 | @staticmethod 74 | def load_data(data_split_str): 75 | all_qdic, all_adic = {}, {} 76 | for data_split in data_split_str.split('+'): 77 | assert data_split in config.DATA_PATHS.keys(), 'unknown data split' 78 | if data_split == 'genome': 79 | qdic, adic = VQADataProvider.load_genome_json() 80 | all_qdic.update(qdic) 81 | all_adic.update(adic) 82 | else: 83 | qdic, adic = VQADataProvider.load_vqa_json(data_split) 84 | all_qdic.update(qdic) 85 | all_adic.update(adic) 86 | return all_qdic, all_adic 87 | 88 | def getQuesIds(self): 89 | return self.qdic.keys() 90 | 91 | def getStrippedQuesId(self, qid): 92 | return qid.split(QID_KEY_SEPARATOR)[1] 93 | 94 | def getImgId(self,qid): 95 | return self.qdic[qid]['iid'] 96 | 97 | def getQuesStr(self,qid): 98 | return self.qdic[qid]['qstr'] 99 | 100 | def getAnsObj(self,qid): 101 | if self.mode == 'test-dev' or self.mode == 'test': 102 | return -1 103 | return self.adic[qid] 104 | 105 | @staticmethod 106 | def seq_to_list(s): 107 | t_str = s.lower() 108 | for i in [r'\?',r'\!',r'\'',r'\"',r'\$',r'\:',r'\@',r'\(',r'\)',r'\,',r'\.',r'\;']: 109 | t_str = re.sub( i, '', t_str) 110 | for i in [r'\-',r'\/']: 111 | t_str = re.sub( i, ' ', t_str) 112 | q_list = re.sub(r'\?','',t_str.lower()).split(' ') 113 | q_list = filter(lambda x: len(x) > 0, q_list) 114 | return q_list 115 | 116 | def extract_answer(self,answer_obj): 117 | """ Return the most popular answer in string.""" 118 | if self.mode == 'test-dev' or self.mode == 'test': 119 | return -1 120 | answer_list = [ answer_obj[i]['answer'] for i in xrange(10)] 121 | dic = {} 122 | for ans in answer_list: 123 | if dic.has_key(ans): 124 | dic[ans] +=1 125 | else: 126 | dic[ans] = 1 127 | max_key = max((v,k) for (k,v) in dic.items())[1] 128 | return max_key 129 | 130 | def extract_answer_prob(self,answer_obj): 131 | """ Return the most popular answer in string.""" 132 | if self.mode == 'test-dev' or self.mode == 'test': 133 | return -1 134 | 135 | answer_list = [ ans['answer'] for ans in answer_obj] 136 | prob_answer_list = [] 137 | for ans in answer_list: 138 | if self.adict.has_key(ans): 139 | prob_answer_list.append(ans) 140 | 141 | if len(prob_answer_list) == 0: 142 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 143 | return 'hoge' 144 | else: 145 | raise Exception("This should not happen.") 146 | else: 147 | return random.choice(prob_answer_list) 148 | 149 | def qlist_to_vec(self, max_length, q_list): 150 | """ 151 | Converts a list of words into a format suitable for the embedding layer. 
152 | 153 | Arguments: 154 | max_length -- the maximum length of a question sequence 155 | q_list -- a list of words which are the tokens in the question 156 | 157 | Returns: 158 | qvec -- A max_length length vector containing one-hot indices for each word 159 | cvec -- A max_length length sequence continuation indicator vector 160 | glove_matrix -- A max_length x GLOVE_EMBEDDING_SIZE matrix containing the glove embedding for 161 | each word 162 | """ 163 | qvec = np.zeros(max_length) 164 | cvec = np.zeros(max_length) 165 | glove_matrix = np.zeros(max_length * GLOVE_EMBEDDING_SIZE).reshape(max_length, GLOVE_EMBEDDING_SIZE) 166 | for i in xrange(max_length): 167 | if i < max_length - len(q_list): 168 | cvec[i] = 0 169 | else: 170 | w = q_list[i-(max_length-len(q_list))] 171 | if w not in self.glove_dict: 172 | self.glove_dict[w] = self.nlp(u'%s' % w).vector 173 | glove_matrix[i] = self.glove_dict[w] 174 | # is the word in the vocabulary? 175 | if self.vdict.has_key(w) is False: 176 | w = '' 177 | qvec[i] = self.vdict[w] 178 | cvec[i] = 0 if i == max_length - len(q_list) else 1 179 | 180 | return qvec, cvec, glove_matrix 181 | 182 | def answer_to_vec(self, ans_str): 183 | """ Return answer id if the answer is included in vocabulary otherwise '' """ 184 | if self.mode =='test-dev' or self.mode == 'test': 185 | return -1 186 | 187 | if self.adict.has_key(ans_str): 188 | ans = self.adict[ans_str] 189 | else: 190 | ans = self.adict[''] 191 | return ans 192 | 193 | def vec_to_answer(self, ans_symbol): 194 | """ Return answer id if the answer is included in vocabulary otherwise '' """ 195 | if self.rev_adict is None: 196 | rev_adict = {} 197 | for k,v in self.adict.items(): 198 | rev_adict[v] = k 199 | self.rev_adict = rev_adict 200 | 201 | return self.rev_adict[ans_symbol] 202 | 203 | def create_batch(self,qid_list): 204 | 205 | qvec = (np.zeros(self.batchsize*self.max_length)).reshape(self.batchsize,self.max_length) 206 | cvec = (np.zeros(self.batchsize*self.max_length)).reshape(self.batchsize,self.max_length) 207 | ivec = (np.zeros(self.batchsize*2048*14*14)).reshape(self.batchsize,2048,14,14) 208 | avec = (np.zeros(self.batchsize)).reshape(self.batchsize) 209 | glove_matrix = np.zeros(self.batchsize * self.max_length * GLOVE_EMBEDDING_SIZE).reshape(\ 210 | self.batchsize, self.max_length, GLOVE_EMBEDDING_SIZE) 211 | 212 | for i,qid in enumerate(qid_list): 213 | 214 | # load raw question information 215 | q_str = self.getQuesStr(qid) 216 | q_ans = self.getAnsObj(qid) 217 | q_iid = self.getImgId(qid) 218 | 219 | # convert question to vec 220 | q_list = VQADataProvider.seq_to_list(q_str) 221 | t_qvec, t_cvec, t_glove_matrix = self.qlist_to_vec(self.max_length, q_list) 222 | 223 | try: 224 | qid_split = qid.split(QID_KEY_SEPARATOR) 225 | data_split = qid_split[0] 226 | if data_split == 'genome': 227 | t_ivec = np.load(config.DATA_PATHS['genome']['features_prefix'] + str(q_iid) + '.jpg.npz')['x'] 228 | else: 229 | t_ivec = np.load(config.DATA_PATHS[data_split]['features_prefix'] + str(q_iid).zfill(12) + '.jpg.npz')['x'] 230 | t_ivec = ( t_ivec / np.sqrt((t_ivec**2).sum()) ) 231 | except: 232 | t_ivec = 0. 233 | print 'data not found for qid : ', q_iid, self.mode 234 | 235 | # convert answer to vec 236 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 237 | q_ans_str = self.extract_answer(q_ans) 238 | else: 239 | q_ans_str = self.extract_answer_prob(q_ans) 240 | t_avec = self.answer_to_vec(q_ans_str) 241 | qvec[i,...] = t_qvec 242 | cvec[i,...] = t_cvec 243 | ivec[i,...] 
= t_ivec 244 | avec[i,...] = t_avec 245 | glove_matrix[i,...] = t_glove_matrix 246 | 247 | return qvec, cvec, ivec, avec, glove_matrix 248 | 249 | 250 | def get_batch_vec(self): 251 | if self.batch_len is None: 252 | self.n_skipped = 0 253 | qid_list = self.getQuesIds() 254 | random.shuffle(qid_list) 255 | self.qid_list = qid_list 256 | self.batch_len = len(qid_list) 257 | self.batch_index = 0 258 | self.epoch_counter = 0 259 | 260 | def has_at_least_one_valid_answer(t_qid): 261 | answer_obj = self.getAnsObj(t_qid) 262 | answer_list = [ans['answer'] for ans in answer_obj] 263 | for ans in answer_list: 264 | if self.adict.has_key(ans): 265 | return True 266 | 267 | counter = 0 268 | t_qid_list = [] 269 | t_iid_list = [] 270 | while counter < self.batchsize: 271 | t_qid = self.qid_list[self.batch_index] 272 | t_iid = self.getImgId(t_qid) 273 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 274 | t_qid_list.append(t_qid) 275 | t_iid_list.append(t_iid) 276 | counter += 1 277 | elif has_at_least_one_valid_answer(t_qid): 278 | t_qid_list.append(t_qid) 279 | t_iid_list.append(t_iid) 280 | counter += 1 281 | else: 282 | self.n_skipped += 1 283 | 284 | if self.batch_index < self.batch_len-1: 285 | self.batch_index += 1 286 | else: 287 | self.epoch_counter += 1 288 | qid_list = self.getQuesIds() 289 | random.shuffle(qid_list) 290 | self.qid_list = qid_list 291 | self.batch_index = 0 292 | print("%d questions were skipped in a single epoch" % self.n_skipped) 293 | self.n_skipped = 0 294 | 295 | t_batch = self.create_batch(t_qid_list) 296 | return t_batch + (t_qid_list, t_iid_list, self.epoch_counter) 297 | 298 | 299 | class VQADataProviderLayer(caffe.Layer): 300 | """ 301 | Provide input data for VQA. 302 | """ 303 | 304 | def setup(self, bottom, top): 305 | self.batchsize = json.loads(self.param_str)['batchsize'] 306 | self.top_names = ['data','cont','feature','label','glove'] 307 | top[0].reshape(15,self.batchsize) 308 | top[1].reshape(15,self.batchsize) 309 | top[2].reshape(self.batchsize,2048,14,14) 310 | top[3].reshape(self.batchsize) 311 | top[4].reshape(15,self.batchsize,GLOVE_EMBEDDING_SIZE) 312 | 313 | self.mode = json.loads(self.param_str)['mode'] 314 | self.folder = json.loads(self.param_str)['folder'] 315 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 316 | pass 317 | else: 318 | self.dp = VQADataProvider(batchsize=self.batchsize, mode=self.mode, folder=self.folder) 319 | 320 | def reshape(self, bottom, top): 321 | pass 322 | 323 | def forward(self, bottom, top): 324 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 325 | pass 326 | else: 327 | word, cont, feature, answer, glove_matrix, _, _, _ = self.dp.get_batch_vec() 328 | top[0].data[...] = np.transpose(word,(1,0)) # N x T -> T x N 329 | top[1].data[...] = np.transpose(cont,(1,0)) 330 | top[2].data[...] = feature 331 | top[3].data[...] = answer 332 | top[4].data[...] 
= np.transpose(glove_matrix, (1,0,2)) # N x T x 300 -> T x N x 300 333 | 334 | def backward(self, top, propagate_down, bottom): 335 | pass 336 | 337 | -------------------------------------------------------------------------------- /mfb_coatt_glove/vqa_data_layer_kld.py: -------------------------------------------------------------------------------- 1 | import caffe 2 | import numpy as np 3 | import re, json, random 4 | import config 5 | import spacy 6 | 7 | QID_KEY_SEPARATOR = '/' 8 | GLOVE_EMBEDDING_SIZE = 300 9 | 10 | class VQADataProvider: 11 | 12 | def __init__(self, folder='result', batchsize=64, max_length=15, mode='train'): 13 | self.batchsize = batchsize 14 | self.d_vocabulary = None 15 | self.batch_index = None 16 | self.batch_len = None 17 | self.rev_adict = None 18 | self.max_length = max_length 19 | self.mode = mode 20 | self.qdic, self.adic = VQADataProvider.load_data(mode) 21 | 22 | with open('./%s/vdict.json'%folder,'r') as f: 23 | self.vdict = json.load(f) 24 | with open('./%s/adict.json'%folder,'r') as f: 25 | self.adict = json.load(f) 26 | 27 | self.n_ans_vocabulary = len(self.adict) 28 | self.nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors') 29 | self.glove_dict = {} # word -> glove vector 30 | 31 | @staticmethod 32 | def load_vqa_json(data_split): 33 | """ 34 | Parses the question and answer json files for the given data split. 35 | Returns the question dictionary and the answer dictionary. 36 | """ 37 | qdic, adic = {}, {} 38 | 39 | with open(config.DATA_PATHS[data_split]['ques_file'], 'r') as f: 40 | qdata = json.load(f)['questions'] 41 | for q in qdata: 42 | qdic[data_split + QID_KEY_SEPARATOR + str(q['question_id'])] = \ 43 | {'qstr': q['question'], 'iid': q['image_id']} 44 | 45 | if 'test' not in data_split: 46 | with open(config.DATA_PATHS[data_split]['ans_file'], 'r') as f: 47 | adata = json.load(f)['annotations'] 48 | for a in adata: 49 | adic[data_split + QID_KEY_SEPARATOR + str(a['question_id'])] = \ 50 | a['answers'] 51 | 52 | print 'parsed', len(qdic), 'questions for', data_split 53 | return qdic, adic 54 | 55 | @staticmethod 56 | def load_genome_json(): 57 | """ 58 | Parses the genome json file. Returns the question dictionary and the 59 | answer dictionary. 
60 | """ 61 | qdic, adic = {}, {} 62 | 63 | with open(config.DATA_PATHS['genome']['genome_file'], 'r') as f: 64 | qdata = json.load(f) 65 | for q in qdata: 66 | key = 'genome' + QID_KEY_SEPARATOR + str(q['id']) 67 | qdic[key] = {'qstr': q['question'], 'iid': q['image']} 68 | adic[key] = [{'answer': q['answer']}] 69 | 70 | print 'parsed', len(qdic), 'questions for genome' 71 | return qdic, adic 72 | 73 | @staticmethod 74 | def load_data(data_split_str): 75 | all_qdic, all_adic = {}, {} 76 | for data_split in data_split_str.split('+'): 77 | assert data_split in config.DATA_PATHS.keys(), 'unknown data split' 78 | if data_split == 'genome': 79 | qdic, adic = VQADataProvider.load_genome_json() 80 | all_qdic.update(qdic) 81 | all_adic.update(adic) 82 | else: 83 | qdic, adic = VQADataProvider.load_vqa_json(data_split) 84 | all_qdic.update(qdic) 85 | all_adic.update(adic) 86 | return all_qdic, all_adic 87 | 88 | def getQuesIds(self): 89 | return self.qdic.keys() 90 | 91 | def getStrippedQuesId(self, qid): 92 | return qid.split(QID_KEY_SEPARATOR)[1] 93 | 94 | def getImgId(self,qid): 95 | return self.qdic[qid]['iid'] 96 | 97 | def getQuesStr(self,qid): 98 | return self.qdic[qid]['qstr'] 99 | 100 | def getAnsObj(self,qid): 101 | if self.mode == 'test-dev' or self.mode == 'test': 102 | return -1 103 | return self.adic[qid] 104 | 105 | @staticmethod 106 | def seq_to_list(s): 107 | t_str = s.lower() 108 | for i in [r'\?',r'\!',r'\'',r'\"',r'\$',r'\:',r'\@',r'\(',r'\)',r'\,',r'\.',r'\;']: 109 | t_str = re.sub( i, '', t_str) 110 | for i in [r'\-',r'\/']: 111 | t_str = re.sub( i, ' ', t_str) 112 | q_list = re.sub(r'\?','',t_str.lower()).split(' ') 113 | q_list = filter(lambda x: len(x) > 0, q_list) 114 | return q_list 115 | 116 | def extract_answer(self,answer_obj): 117 | """ Return the most popular answer in string.""" 118 | if self.mode == 'test-dev' or self.mode == 'test': 119 | return -1 120 | answer_list = [ answer_obj[i]['answer'] for i in xrange(10)] 121 | dic = {} 122 | for ans in answer_list: 123 | if dic.has_key(ans): 124 | dic[ans] +=1 125 | else: 126 | dic[ans] = 1 127 | max_key = max((v,k) for (k,v) in dic.items())[1] 128 | return max_key 129 | 130 | def extract_answer_prob(self,answer_obj): 131 | """ Return the most popular answer in string.""" 132 | if self.mode == 'test-dev' or self.mode == 'test': 133 | return -1 134 | 135 | answer_list = [ ans['answer'] for ans in answer_obj] 136 | prob_answer_list = [] 137 | for ans in answer_list: 138 | if self.adict.has_key(ans): 139 | prob_answer_list.append(ans) 140 | 141 | if len(prob_answer_list) == 0: 142 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 143 | return 'hoge' 144 | else: 145 | raise Exception("This should not happen.") 146 | else: 147 | return random.choice(prob_answer_list) 148 | 149 | def extract_answer_list(self,answer_obj): 150 | answer_list = [ ans['answer'] for ans in answer_obj] 151 | prob_answer_vec = np.zeros(config.NUM_OUTPUT_UNITS) 152 | for ans in answer_list: 153 | if self.adict.has_key(ans): 154 | index = self.adict[ans] 155 | prob_answer_vec[index] += 1 156 | prob_answer_vec = prob_answer_vec / np.sum(prob_answer_vec) 157 | return prob_answer_vec 158 | 159 | def qlist_to_vec(self, max_length, q_list): 160 | """ 161 | Converts a list of words into a format suitable for the embedding layer. 
162 | 163 | Arguments: 164 | max_length -- the maximum length of a question sequence 165 | q_list -- a list of words which are the tokens in the question 166 | 167 | Returns: 168 | qvec -- A max_length length vector containing one-hot indices for each word 169 | cvec -- A max_length length sequence continuation indicator vector 170 | glove_matrix -- A max_length x GLOVE_EMBEDDING_SIZE matrix containing the glove embedding for 171 | each word 172 | """ 173 | qvec = np.zeros(max_length) 174 | cvec = np.zeros(max_length) 175 | glove_matrix = np.zeros(max_length * GLOVE_EMBEDDING_SIZE).reshape(max_length, GLOVE_EMBEDDING_SIZE) 176 | for i in xrange(max_length): 177 | if i < max_length - len(q_list): 178 | cvec[i] = 0 179 | else: 180 | w = q_list[i-(max_length-len(q_list))] 181 | if w not in self.glove_dict: 182 | self.glove_dict[w] = self.nlp(u'%s' % w).vector 183 | glove_matrix[i] = self.glove_dict[w] 184 | # is the word in the vocabulary? 185 | if self.vdict.has_key(w) is False: 186 | w = '' 187 | qvec[i] = self.vdict[w] 188 | cvec[i] = 0 if i == max_length - len(q_list) else 1 189 | 190 | return qvec, cvec, glove_matrix 191 | 192 | def answer_to_vec(self, ans_str): 193 | """ Return answer id if the answer is included in vocabulary otherwise '' """ 194 | if self.mode =='test-dev' or self.mode == 'test': 195 | return -1 196 | 197 | if self.adict.has_key(ans_str): 198 | ans = self.adict[ans_str] 199 | else: 200 | ans = self.adict[''] 201 | return ans 202 | 203 | def vec_to_answer(self, ans_symbol): 204 | """ Return answer id if the answer is included in vocabulary otherwise '' """ 205 | if self.rev_adict is None: 206 | rev_adict = {} 207 | for k,v in self.adict.items(): 208 | rev_adict[v] = k 209 | self.rev_adict = rev_adict 210 | 211 | return self.rev_adict[ans_symbol] 212 | 213 | def create_batch(self,qid_list): 214 | 215 | qvec = (np.zeros(self.batchsize*self.max_length)).reshape(self.batchsize,self.max_length) 216 | cvec = (np.zeros(self.batchsize*self.max_length)).reshape(self.batchsize,self.max_length) 217 | ivec = (np.zeros(self.batchsize*2048*14*14)).reshape(self.batchsize,2048,14,14) 218 | avec = (np.zeros(self.batchsize*config.NUM_OUTPUT_UNITS)).reshape(self.batchsize,config.NUM_OUTPUT_UNITS) 219 | glove_matrix = np.zeros(self.batchsize * self.max_length * GLOVE_EMBEDDING_SIZE).reshape(\ 220 | self.batchsize, self.max_length, GLOVE_EMBEDDING_SIZE) 221 | 222 | for i,qid in enumerate(qid_list): 223 | 224 | # load raw question information 225 | q_str = self.getQuesStr(qid) 226 | q_ans = self.getAnsObj(qid) 227 | q_iid = self.getImgId(qid) 228 | 229 | # convert question to vec 230 | q_list = VQADataProvider.seq_to_list(q_str) 231 | t_qvec, t_cvec, t_glove_matrix = self.qlist_to_vec(self.max_length, q_list) 232 | 233 | try: 234 | qid_split = qid.split(QID_KEY_SEPARATOR) 235 | data_split = qid_split[0] 236 | if data_split == 'genome': 237 | t_ivec = np.load(config.DATA_PATHS['genome']['features_prefix'] + str(q_iid) + '.jpg.npz')['x'] 238 | else: 239 | t_ivec = np.load(config.DATA_PATHS[data_split]['features_prefix'] + str(q_iid).zfill(12) + '.jpg.npz')['x'] 240 | t_ivec = ( t_ivec / np.sqrt((t_ivec**2).sum()) ) 241 | except: 242 | t_ivec = 0. 243 | print 'data not found for qid : ', q_iid, self.mode 244 | 245 | # convert answer to vec 246 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 247 | q_ans_str = self.extract_answer(q_ans) 248 | t_avec = self.answer_to_vec(q_ans_str) 249 | else: 250 | t_avec = self.extract_answer_list(q_ans) 251 | 252 | qvec[i,...] 
= t_qvec 253 | cvec[i,...] = t_cvec 254 | ivec[i,...] = t_ivec 255 | avec[i,...] = t_avec 256 | glove_matrix[i,...] = t_glove_matrix 257 | 258 | return qvec, cvec, ivec, avec, glove_matrix 259 | 260 | 261 | def get_batch_vec(self): 262 | if self.batch_len is None: 263 | self.n_skipped = 0 264 | qid_list = self.getQuesIds() 265 | random.shuffle(qid_list) 266 | self.qid_list = qid_list 267 | self.batch_len = len(qid_list) 268 | self.batch_index = 0 269 | self.epoch_counter = 0 270 | 271 | def has_at_least_one_valid_answer(t_qid): 272 | answer_obj = self.getAnsObj(t_qid) 273 | answer_list = [ans['answer'] for ans in answer_obj] 274 | for ans in answer_list: 275 | if self.adict.has_key(ans): 276 | return True 277 | 278 | counter = 0 279 | t_qid_list = [] 280 | t_iid_list = [] 281 | while counter < self.batchsize: 282 | t_qid = self.qid_list[self.batch_index] 283 | t_iid = self.getImgId(t_qid) 284 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 285 | t_qid_list.append(t_qid) 286 | t_iid_list.append(t_iid) 287 | counter += 1 288 | elif has_at_least_one_valid_answer(t_qid): 289 | t_qid_list.append(t_qid) 290 | t_iid_list.append(t_iid) 291 | counter += 1 292 | else: 293 | self.n_skipped += 1 294 | 295 | if self.batch_index < self.batch_len-1: 296 | self.batch_index += 1 297 | else: 298 | self.epoch_counter += 1 299 | qid_list = self.getQuesIds() 300 | random.shuffle(qid_list) 301 | self.qid_list = qid_list 302 | self.batch_index = 0 303 | print("%d questions were skipped in a single epoch" % self.n_skipped) 304 | self.n_skipped = 0 305 | 306 | t_batch = self.create_batch(t_qid_list) 307 | return t_batch + (t_qid_list, t_iid_list, self.epoch_counter) 308 | 309 | 310 | class VQADataProviderLayer(caffe.Layer): 311 | """ 312 | Provide input data for VQA. 313 | """ 314 | 315 | def setup(self, bottom, top): 316 | self.batchsize = json.loads(self.param_str)['batchsize'] 317 | self.top_names = ['data','cont','feature','label','glove'] 318 | top[0].reshape(15,self.batchsize) 319 | top[1].reshape(15,self.batchsize) 320 | top[2].reshape(self.batchsize,2048,14,14) 321 | top[3].reshape(self.batchsize,config.NUM_OUTPUT_UNITS) 322 | top[4].reshape(15,self.batchsize,GLOVE_EMBEDDING_SIZE) 323 | 324 | self.mode = json.loads(self.param_str)['mode'] 325 | self.folder = json.loads(self.param_str)['folder'] 326 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 327 | pass 328 | else: 329 | self.dp = VQADataProvider(batchsize=self.batchsize, mode=self.mode, folder=self.folder) 330 | 331 | def reshape(self, bottom, top): 332 | pass 333 | 334 | def forward(self, bottom, top): 335 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 336 | pass 337 | else: 338 | word, cont, feature, answer, glove_matrix, _, _, epoch_counter = self.dp.get_batch_vec() 339 | top[0].data[...] = np.transpose(word,(1,0)) # N x T -> T x N 340 | top[1].data[...] = np.transpose(cont,(1,0)) 341 | top[2].data[...] = feature 342 | top[3].data[...] = answer 343 | top[4].data[...] 
= np.transpose(glove_matrix, (1,0,2)) # N x T x 300 -> T x N x 300 344 | 345 | def backward(self, top, propagate_down, bottom): 346 | pass 347 | 348 | -------------------------------------------------------------------------------- /mfh_coatt_glove/vqa_data_layer_kld.py: -------------------------------------------------------------------------------- 1 | import caffe 2 | import numpy as np 3 | import re, json, random 4 | import config 5 | import spacy 6 | 7 | QID_KEY_SEPARATOR = '/' 8 | GLOVE_EMBEDDING_SIZE = 300 9 | 10 | class VQADataProvider: 11 | 12 | def __init__(self, folder='result', batchsize=64, max_length=15, mode='train'): 13 | self.batchsize = batchsize 14 | self.d_vocabulary = None 15 | self.batch_index = None 16 | self.batch_len = None 17 | self.rev_adict = None 18 | self.max_length = max_length 19 | self.mode = mode 20 | self.qdic, self.adic = VQADataProvider.load_data(mode) 21 | 22 | with open('./%s/vdict.json'%folder,'r') as f: 23 | self.vdict = json.load(f) 24 | with open('./%s/adict.json'%folder,'r') as f: 25 | self.adict = json.load(f) 26 | 27 | self.n_ans_vocabulary = len(self.adict) 28 | self.nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors') 29 | self.glove_dict = {} # word -> glove vector 30 | 31 | @staticmethod 32 | def load_vqa_json(data_split): 33 | """ 34 | Parses the question and answer json files for the given data split. 35 | Returns the question dictionary and the answer dictionary. 36 | """ 37 | qdic, adic = {}, {} 38 | 39 | with open(config.DATA_PATHS[data_split]['ques_file'], 'r') as f: 40 | qdata = json.load(f)['questions'] 41 | for q in qdata: 42 | qdic[data_split + QID_KEY_SEPARATOR + str(q['question_id'])] = \ 43 | {'qstr': q['question'], 'iid': q['image_id']} 44 | 45 | if 'test' not in data_split: 46 | with open(config.DATA_PATHS[data_split]['ans_file'], 'r') as f: 47 | adata = json.load(f)['annotations'] 48 | for a in adata: 49 | adic[data_split + QID_KEY_SEPARATOR + str(a['question_id'])] = \ 50 | a['answers'] 51 | 52 | print 'parsed', len(qdic), 'questions for', data_split 53 | return qdic, adic 54 | 55 | @staticmethod 56 | def load_genome_json(): 57 | """ 58 | Parses the genome json file. Returns the question dictionary and the 59 | answer dictionary. 
60 | """ 61 | qdic, adic = {}, {} 62 | 63 | with open(config.DATA_PATHS['genome']['genome_file'], 'r') as f: 64 | qdata = json.load(f) 65 | for q in qdata: 66 | key = 'genome' + QID_KEY_SEPARATOR + str(q['id']) 67 | qdic[key] = {'qstr': q['question'], 'iid': q['image']} 68 | adic[key] = [{'answer': q['answer']}] 69 | 70 | print 'parsed', len(qdic), 'questions for genome' 71 | return qdic, adic 72 | 73 | @staticmethod 74 | def load_data(data_split_str): 75 | all_qdic, all_adic = {}, {} 76 | for data_split in data_split_str.split('+'): 77 | assert data_split in config.DATA_PATHS.keys(), 'unknown data split' 78 | if data_split == 'genome': 79 | qdic, adic = VQADataProvider.load_genome_json() 80 | all_qdic.update(qdic) 81 | all_adic.update(adic) 82 | else: 83 | qdic, adic = VQADataProvider.load_vqa_json(data_split) 84 | all_qdic.update(qdic) 85 | all_adic.update(adic) 86 | return all_qdic, all_adic 87 | 88 | def getQuesIds(self): 89 | return self.qdic.keys() 90 | 91 | def getStrippedQuesId(self, qid): 92 | return qid.split(QID_KEY_SEPARATOR)[1] 93 | 94 | def getImgId(self,qid): 95 | return self.qdic[qid]['iid'] 96 | 97 | def getQuesStr(self,qid): 98 | return self.qdic[qid]['qstr'] 99 | 100 | def getAnsObj(self,qid): 101 | if self.mode == 'test-dev' or self.mode == 'test': 102 | return -1 103 | return self.adic[qid] 104 | 105 | @staticmethod 106 | def seq_to_list(s): 107 | t_str = s.lower() 108 | for i in [r'\?',r'\!',r'\'',r'\"',r'\$',r'\:',r'\@',r'\(',r'\)',r'\,',r'\.',r'\;']: 109 | t_str = re.sub( i, '', t_str) 110 | for i in [r'\-',r'\/']: 111 | t_str = re.sub( i, ' ', t_str) 112 | q_list = re.sub(r'\?','',t_str.lower()).split(' ') 113 | q_list = filter(lambda x: len(x) > 0, q_list) 114 | return q_list 115 | 116 | def extract_answer(self,answer_obj): 117 | """ Return the most popular answer in string.""" 118 | if self.mode == 'test-dev' or self.mode == 'test': 119 | return -1 120 | answer_list = [ answer_obj[i]['answer'] for i in xrange(10)] 121 | dic = {} 122 | for ans in answer_list: 123 | if dic.has_key(ans): 124 | dic[ans] +=1 125 | else: 126 | dic[ans] = 1 127 | max_key = max((v,k) for (k,v) in dic.items())[1] 128 | return max_key 129 | 130 | def extract_answer_prob(self,answer_obj): 131 | """ Return the most popular answer in string.""" 132 | if self.mode == 'test-dev' or self.mode == 'test': 133 | return -1 134 | 135 | answer_list = [ ans['answer'] for ans in answer_obj] 136 | prob_answer_list = [] 137 | for ans in answer_list: 138 | if self.adict.has_key(ans): 139 | prob_answer_list.append(ans) 140 | 141 | if len(prob_answer_list) == 0: 142 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 143 | return 'hoge' 144 | else: 145 | raise Exception("This should not happen.") 146 | else: 147 | return random.choice(prob_answer_list) 148 | 149 | def extract_answer_list(self,answer_obj): 150 | answer_list = [ ans['answer'] for ans in answer_obj] 151 | prob_answer_vec = np.zeros(config.NUM_OUTPUT_UNITS) 152 | for ans in answer_list: 153 | if self.adict.has_key(ans): 154 | index = self.adict[ans] 155 | prob_answer_vec[index] += 1 156 | prob_answer_vec = prob_answer_vec / np.sum(prob_answer_vec) 157 | return prob_answer_vec 158 | 159 | def qlist_to_vec(self, max_length, q_list): 160 | """ 161 | Converts a list of words into a format suitable for the embedding layer. 
159 | def qlist_to_vec(self, max_length, q_list): 160 | """ 161 | Converts a list of words into a format suitable for the embedding layer. 162 | 163 | Arguments: 164 | max_length -- the maximum length of a question sequence 165 | q_list -- a list of words which are the tokens in the question 166 | 167 | Returns: 168 | qvec -- A max_length vector of vocabulary indices, one per word (0 for padding) 169 | cvec -- A max_length sequence continuation indicator vector 170 | glove_matrix -- A max_length x GLOVE_EMBEDDING_SIZE matrix containing the glove embedding of 171 | each word 172 | """ 173 | qvec = np.zeros(max_length) 174 | cvec = np.zeros(max_length) 175 | glove_matrix = np.zeros(max_length * GLOVE_EMBEDDING_SIZE).reshape(max_length, GLOVE_EMBEDDING_SIZE) 176 | for i in xrange(max_length): 177 | if i < max_length - len(q_list): # left-pad: a 3-word question in a 15-slot window occupies slots 12-14 178 | cvec[i] = 0 179 | else: 180 | w = q_list[i-(max_length-len(q_list))] 181 | if w not in self.glove_dict: # cache glove vectors; the spacy lookup is slow 182 | self.glove_dict[w] = self.nlp(u'%s' % w).vector 183 | glove_matrix[i] = self.glove_dict[w] 184 | # is the word in the vocabulary? 185 | if self.vdict.has_key(w) is False: 186 | w = '' 187 | qvec[i] = self.vdict[w] 188 | cvec[i] = 0 if i == max_length - len(q_list) else 1 # Caffe LSTM convention: 0 starts a sequence, 1 continues it 189 | 190 | return qvec, cvec, glove_matrix 191 | 192 | def answer_to_vec(self, ans_str): 193 | """ Return the answer id if the answer is in the vocabulary, otherwise the id of ''.""" 194 | if self.mode =='test-dev' or self.mode == 'test': 195 | return -1 196 | 197 | if self.adict.has_key(ans_str): 198 | ans = self.adict[ans_str] 199 | else: 200 | ans = self.adict[''] 201 | return ans 202 | 203 | def vec_to_answer(self, ans_symbol): 204 | """ Return the answer string for the given answer id.""" 205 | if self.rev_adict is None: 206 | rev_adict = {} 207 | for k,v in self.adict.items(): 208 | rev_adict[v] = k 209 | self.rev_adict = rev_adict 210 | 211 | return self.rev_adict[ans_symbol] 212 | 213 | def create_batch(self,qid_list): 214 | 215 | qvec = (np.zeros(self.batchsize*self.max_length)).reshape(self.batchsize,self.max_length) 216 | cvec = (np.zeros(self.batchsize*self.max_length)).reshape(self.batchsize,self.max_length) 217 | ivec = (np.zeros(self.batchsize*2048*14*14)).reshape(self.batchsize,2048,14,14) 218 | avec = (np.zeros(self.batchsize*config.NUM_OUTPUT_UNITS)).reshape(self.batchsize,config.NUM_OUTPUT_UNITS) 219 | glove_matrix = np.zeros(self.batchsize * self.max_length * GLOVE_EMBEDDING_SIZE).reshape(\ 220 | self.batchsize, self.max_length, GLOVE_EMBEDDING_SIZE) 221 | 222 | for i,qid in enumerate(qid_list): 223 | 224 | # load raw question information 225 | q_str = self.getQuesStr(qid) 226 | q_ans = self.getAnsObj(qid) 227 | q_iid = self.getImgId(qid) 228 | 229 | # convert question to vec 230 | q_list = VQADataProvider.seq_to_list(q_str) 231 | t_qvec, t_cvec, t_glove_matrix = self.qlist_to_vec(self.max_length, q_list) 232 | 233 | try: 234 | qid_split = qid.split(QID_KEY_SEPARATOR) 235 | data_split = qid_split[0] 236 | if data_split == 'genome': 237 | t_ivec = np.load(config.DATA_PATHS['genome']['features_prefix'] + str(q_iid) + '.jpg.npz')['x'] 238 | else: 239 | t_ivec = np.load(config.DATA_PATHS[data_split]['features_prefix'] + str(q_iid).zfill(12) + '.jpg.npz')['x'] 240 | t_ivec = ( t_ivec / np.sqrt((t_ivec**2).sum()) ) # L2-normalize the image feature 241 | except: 242 | t_ivec = 0. 243 | print 'image feature not found for iid : ', q_iid, self.mode 244 | 245 | # convert answer to vec 246 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 247 | q_ans_str = self.extract_answer(q_ans) 248 | t_avec = self.answer_to_vec(q_ans_str) 249 | else: 250 | t_avec = self.extract_answer_list(q_ans) 251 | 252 | qvec[i,...] = t_qvec
253 | cvec[i,...] = t_cvec 254 | ivec[i,...] = t_ivec 255 | avec[i,...] = t_avec 256 | glove_matrix[i,...] = t_glove_matrix 257 | 258 | return qvec, cvec, ivec, avec, glove_matrix 259 | 260 | 261 | def get_batch_vec(self): 262 | if self.batch_len is None: 263 | self.n_skipped = 0 264 | qid_list = self.getQuesIds() 265 | random.shuffle(qid_list) 266 | self.qid_list = qid_list 267 | self.batch_len = len(qid_list) 268 | self.batch_index = 0 269 | self.epoch_counter = 0 270 | 271 | def has_at_least_one_valid_answer(t_qid): 272 | answer_obj = self.getAnsObj(t_qid) 273 | answer_list = [ans['answer'] for ans in answer_obj] 274 | for ans in answer_list: 275 | if self.adict.has_key(ans): 276 | return True 277 | return False 278 | counter = 0 279 | t_qid_list = [] 280 | t_iid_list = [] 281 | while counter < self.batchsize: 282 | t_qid = self.qid_list[self.batch_index] 283 | t_iid = self.getImgId(t_qid) 284 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 285 | t_qid_list.append(t_qid) 286 | t_iid_list.append(t_iid) 287 | counter += 1 288 | elif has_at_least_one_valid_answer(t_qid): 289 | t_qid_list.append(t_qid) 290 | t_iid_list.append(t_iid) 291 | counter += 1 292 | else: 293 | self.n_skipped += 1 294 | 295 | if self.batch_index < self.batch_len-1: 296 | self.batch_index += 1 297 | else: 298 | self.epoch_counter += 1 299 | qid_list = self.getQuesIds() 300 | random.shuffle(qid_list) 301 | self.qid_list = qid_list 302 | self.batch_index = 0 303 | print("%d questions were skipped in a single epoch" % self.n_skipped) 304 | self.n_skipped = 0 305 | 306 | t_batch = self.create_batch(t_qid_list) 307 | return t_batch + (t_qid_list, t_iid_list, self.epoch_counter) 308 | 309 | 310 | class VQADataProviderLayer(caffe.Layer): 311 | """ 312 | Provide input data for VQA. 313 | """ 314 | 315 | def setup(self, bottom, top): 316 | self.batchsize = json.loads(self.param_str)['batchsize'] 317 | self.top_names = ['data','cont','feature','label','glove'] 318 | top[0].reshape(15,self.batchsize) # 15 == config.MAX_WORDS_IN_QUESTION, laid out T x N 319 | top[1].reshape(15,self.batchsize) 320 | top[2].reshape(self.batchsize,2048,14,14) 321 | top[3].reshape(self.batchsize,config.NUM_OUTPUT_UNITS) 322 | top[4].reshape(15,self.batchsize,GLOVE_EMBEDDING_SIZE) 323 | 324 | self.mode = json.loads(self.param_str)['mode'] 325 | self.folder = json.loads(self.param_str)['folder'] 326 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 327 | pass # evaluation blobs are filled externally (see utils.exec_validation) 328 | else: 329 | self.dp = VQADataProvider(batchsize=self.batchsize, mode=self.mode, folder=self.folder) 330 | 331 | def reshape(self, bottom, top): 332 | pass 333 | 334 | def forward(self, bottom, top): 335 | if self.mode == 'val' or self.mode == 'test-dev' or self.mode == 'test': 336 | pass 337 | else: 338 | word, cont, feature, answer, glove_matrix, _, _, epoch_counter = self.dp.get_batch_vec() 339 | top[0].data[...] = np.transpose(word,(1,0)) # N x T -> T x N 340 | top[1].data[...] = np.transpose(cont,(1,0)) 341 | top[2].data[...] = feature 342 | top[3].data[...] = answer 343 | top[4].data[...] = np.transpose(glove_matrix, (1,0,2)) # N x T x 300 -> T x N x 300 344 | 345 | def backward(self, top, propagate_down, bottom): 346 | pass 347 | 348 |
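For orientation, the provider can also be exercised on its own, outside Caffe. A minimal sketch, assuming vdict.json and adict.json already exist under ./result and the feature paths in config.py are valid:

from vqa_data_layer_kld import VQADataProvider

dp = VQADataProvider(folder='result', batchsize=2, mode='train')
qvec, cvec, ivec, avec, glove, qids, iids, epoch = dp.get_batch_vec()
print qvec.shape   # (2, 15)   word indices, left-padded
print ivec.shape   # (2, 2048, 14, 14)   ResNet feature maps
print avec.shape   # (2, config.NUM_OUTPUT_UNITS)   soft answer distributions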
-------------------------------------------------------------------------------- /mfb_coatt_glove/train_mfb_coatt_glove.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | matplotlib.use('Agg') 3 | import os 4 | import sys 5 | import numpy as np 6 | import json 7 | import matplotlib.pyplot as plt 8 | 9 | import caffe 10 | from caffe import layers as L 11 | from caffe import params as P 12 | from caffe.proto import caffe_pb2 13 | 14 | from vqa_data_layer_kld import VQADataProvider 15 | from utils import exec_validation, drawgraph 16 | import config 17 | import time 18 | 19 | def get_solver(folder): 20 | s = caffe_pb2.SolverParameter() 21 | s.train_net = './%s/proto_train.prototxt'%folder 22 | s.snapshot = int(config.VALIDATE_INTERVAL) 23 | s.snapshot_prefix = './%s/'%folder 24 | s.max_iter = int(config.MAX_ITERATIONS) 25 | s.display = int(config.VALIDATE_INTERVAL) 26 | s.type = 'Adam' 27 | s.stepsize = int(config.MAX_ITERATIONS*0.4) # with gamma = 0.5 below, the learning rate halves every 40% of training 28 | s.gamma = 0.5 29 | s.lr_policy = "step" 30 | s.base_lr = 0.0007 31 | s.momentum = 0.9 32 | s.momentum2 = 0.999 33 | s.weight_decay = 0.000 34 | s.clip_gradients = 10 35 | return s 36 | 37 | def get_auxiliary_json(): 38 | aux = {} 39 | aux["batch_size"] = int(config.VAL_BATCH_SIZE) 40 | aux["data_shape"] = [2048,14,14] 41 | aux["img_feature_prefix"] = config.DATA_PATHS['test']['features_prefix'] 42 | aux["glove"] = True 43 | return aux 44 | 45 | 46 | def mfb_coatt(mode, batchsize, T, question_vocab_size, folder): # T (max question length) is unused; the layers read config.MAX_WORDS_IN_QUESTION directly 47 | n = caffe.NetSpec() 48 | mode_str = json.dumps({'mode':mode, 'batchsize':batchsize,'folder':folder}) 49 | if mode == 'val': # validation uses hard labels, training uses soft KLD targets 50 | n.data, n.cont, n.img_feature, n.label, n.glove = L.Python( \ 51 | module='vqa_data_layer', layer='VQADataProviderLayer', \ 52 | param_str=mode_str, ntop=5 ) 53 | else: 54 | n.data, n.cont, n.img_feature, n.label, n.glove = L.Python(\ 55 | module='vqa_data_layer_kld', layer='VQADataProviderLayer', \ 56 | param_str=mode_str, ntop=5 ) 57 | n.embed = L.Embed(n.data, input_dim=question_vocab_size, num_output=300, \ 58 | weight_filler=dict(type='xavier')) 59 | n.embed_tanh = L.TanH(n.embed) 60 | concat_word_embed = [n.embed_tanh, n.glove] 61 | n.concat_embed = L.Concat(*concat_word_embed, concat_param={'axis': 2}) # T x N x 600 62 | 63 | # LSTM 64 | n.lstm1 = L.LSTM(\ 65 | n.concat_embed, n.cont,\ 66 | recurrent_param=dict(\ 67 | num_output=config.LSTM_UNIT_NUM,\ 68 | weight_filler=dict(type='xavier'))) 69 | n.lstm1_droped = L.Dropout(n.lstm1,dropout_param={'dropout_ratio':config.LSTM_DROPOUT_RATIO}) 70 | n.lstm1_resh = L.Permute(n.lstm1_droped, permute_param=dict(order=[1,2,0])) 71 | n.lstm1_resh2 = L.Reshape(n.lstm1_resh, \ 72 | reshape_param=dict(shape=dict(dim=[0,0,0,1]))) 73 | 74 | ''' 75 | Question Attention 76 | ''' 77 | n.qatt_conv1 = L.Convolution(n.lstm1_resh2, kernel_size=1, stride=1, num_output=512, pad=0, 78 | weight_filler=dict(type='xavier')) 79 | n.qatt_relu = L.ReLU(n.qatt_conv1) 80 | n.qatt_conv2 = L.Convolution(n.qatt_relu, kernel_size=1, stride=1, num_output=config.NUM_QUESTION_GLIMPSE, pad=0, 81 | weight_filler=dict(type='xavier')) 82 | n.qatt_reshape = L.Reshape(n.qatt_conv2, reshape_param=dict(shape=dict(dim=[-1,config.NUM_QUESTION_GLIMPSE,config.MAX_WORDS_IN_QUESTION,1]))) # N x NUM_QUESTION_GLIMPSE x 15 x 1 83 | n.qatt_softmax = L.Softmax(n.qatt_reshape, axis=2) 84 |
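# The softmax above yields NUM_QUESTION_GLIMPSE attention maps over the question's word
# positions. Each glimpse is sliced off below and handed to SoftAttention, which takes an
# attention-weighted sum of the LSTM states; the per-glimpse vectors are then concatenated.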
85 | qatt_maps = L.Slice(n.qatt_softmax,ntop=config.NUM_QUESTION_GLIMPSE,slice_param={'axis':1}) 86 | dummy_lstm = L.DummyData(shape=dict(dim=[batchsize, 1]), data_filler=dict(type='constant', value=1), ntop=1) 87 | qatt_feature_list = [] 88 | for i in xrange(config.NUM_QUESTION_GLIMPSE): 89 | if config.NUM_QUESTION_GLIMPSE == 1: 90 | n.__setattr__('qatt_feat%d'%i, L.SoftAttention(n.lstm1_resh2, qatt_maps, dummy_lstm)) 91 | else: 92 | n.__setattr__('qatt_feat%d'%i, L.SoftAttention(n.lstm1_resh2, qatt_maps[i], dummy_lstm)) 93 | qatt_feature_list.append(n.__getattr__('qatt_feat%d'%i)) 94 | n.qatt_feat_concat = L.Concat(*qatt_feature_list) 95 | ''' 96 | Image Attention with MFB 97 | ''' 98 | n.q_feat_resh = L.Reshape(n.qatt_feat_concat,reshape_param=dict(shape=dict(dim=[0,-1,1,1]))) 99 | n.i_feat_resh = L.Reshape(n.img_feature,reshape_param=dict(shape=dict(dim=[0,-1,config.IMG_FEAT_WIDTH,config.IMG_FEAT_WIDTH]))) 100 | 101 | n.iatt_q_proj = L.InnerProduct(n.q_feat_resh, num_output = config.JOINT_EMB_SIZE, 102 | weight_filler=dict(type='xavier')) 103 | n.iatt_q_resh = L.Reshape(n.iatt_q_proj, reshape_param=dict(shape=dict(dim=[-1,config.JOINT_EMB_SIZE,1,1]))) 104 | n.iatt_q_tile1 = L.Tile(n.iatt_q_resh, axis=2, tiles=config.IMG_FEAT_WIDTH) 105 | n.iatt_q_tile2 = L.Tile(n.iatt_q_tile1, axis=3, tiles=config.IMG_FEAT_WIDTH) # broadcast the question feature over the 14 x 14 grid 106 | 107 | 108 | n.iatt_i_conv = L.Convolution(n.i_feat_resh, kernel_size=1, stride=1, num_output=config.JOINT_EMB_SIZE, pad=0, 109 | weight_filler=dict(type='xavier')) 110 | n.iatt_i_resh1 = L.Reshape(n.iatt_i_conv, reshape_param=dict(shape=dict(dim=[-1,config.JOINT_EMB_SIZE, 111 | config.IMG_FEAT_WIDTH,config.IMG_FEAT_WIDTH]))) 112 | n.iatt_iq_eltwise = L.Eltwise(n.iatt_q_tile2, n.iatt_i_resh1, eltwise_param=dict(operation=0)) # operation=0 is element-wise product (PROD) 113 | n.iatt_iq_droped = L.Dropout(n.iatt_iq_eltwise, dropout_param={'dropout_ratio':config.MFB_DROPOUT_RATIO}) 114 | n.iatt_iq_resh2 = L.Reshape(n.iatt_iq_droped, reshape_param=dict(shape=dict(dim=[-1,config.JOINT_EMB_SIZE,196,1]))) # 196 = 14 x 14 spatial positions 115 | n.iatt_iq_permute1 = L.Permute(n.iatt_iq_resh2, permute_param=dict(order=[0,2,1,3])) 116 | n.iatt_iq_resh3 = L.Reshape(n.iatt_iq_permute1, reshape_param=dict(shape=dict(dim=[-1,config.IMG_FEAT_SIZE, 117 | config.MFB_OUT_DIM,config.MFB_FACTOR_NUM]))) # fresh blob name: assigning iatt_iq_resh2 again would overwrite the reshape above 118 | n.iatt_iq_sumpool = L.Pooling(n.iatt_iq_resh3, pool=P.Pooling.SUM, \ 119 | pooling_param=dict(kernel_w=config.MFB_FACTOR_NUM, kernel_h=1)) 120 | n.iatt_iq_permute2 = L.Permute(n.iatt_iq_sumpool, permute_param=dict(order=[0,2,1,3])) 121 | 122 | n.iatt_iq_sqrt = L.SignedSqrt(n.iatt_iq_permute2) 123 | n.iatt_iq_l2 = L.L2Normalize(n.iatt_iq_sqrt) 124 | 125 | 126 | ## 2 conv layers: MFB_OUT_DIM (1000) -> 512 -> NUM_IMG_GLIMPSE 127 | n.iatt_conv1 = L.Convolution(n.iatt_iq_l2, kernel_size=1, stride=1, num_output=512, pad=0, 128 | weight_filler=dict(type='xavier')) 129 | n.iatt_relu = L.ReLU(n.iatt_conv1) 130 | n.iatt_conv2 = L.Convolution(n.iatt_relu, kernel_size=1, stride=1, num_output=config.NUM_IMG_GLIMPSE, pad=0, 131 | weight_filler=dict(type='xavier')) 132 | n.iatt_resh = L.Reshape(n.iatt_conv2, reshape_param=dict(shape=dict(dim=[-1,config.NUM_IMG_GLIMPSE,config.IMG_FEAT_SIZE]))) 133 | n.iatt_softmax = L.Softmax(n.iatt_resh, axis=2) 134 | n.iatt_softmax_resh = L.Reshape(n.iatt_softmax,reshape_param=dict(shape=dict(dim=[-1,config.NUM_IMG_GLIMPSE,config.IMG_FEAT_WIDTH,config.IMG_FEAT_WIDTH]))) 135 | iatt_maps = L.Slice(n.iatt_softmax_resh, ntop=config.NUM_IMG_GLIMPSE,slice_param={'axis':1}) 136 | dummy = L.DummyData(shape=dict(dim=[batchsize, 1]), data_filler=dict(type='constant', value=1), ntop=1)
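# The same glimpse pattern as the question attention, now over the 14 x 14 image grid:
# each sliced map weights the image features via SoftAttention, the glimpse features
# are flattened, and the NUM_IMG_GLIMPSE results are concatenated.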
137 | iatt_feature_list = [] 138 | for i in xrange(config.NUM_IMG_GLIMPSE): 139 | if config.NUM_IMG_GLIMPSE == 1: 140 | n.__setattr__('iatt_feat%d'%i, L.SoftAttention(n.i_feat_resh, iatt_maps, dummy)) 141 | else: 142 | n.__setattr__('iatt_feat%d'%i, L.SoftAttention(n.i_feat_resh, iatt_maps[i], dummy)) 143 | n.__setattr__('iatt_feat%d_resh'%i, L.Reshape(n.__getattr__('iatt_feat%d'%i), \ 144 | reshape_param=dict(shape=dict(dim=[0,-1])))) 145 | iatt_feature_list.append(n.__getattr__('iatt_feat%d_resh'%i)) 146 | n.iatt_feat_concat = L.Concat(*iatt_feature_list) 147 | n.iatt_feat_concat_resh = L.Reshape(n.iatt_feat_concat, reshape_param=dict(shape=dict(dim=[0,-1,1,1]))) 148 | 149 | ''' 150 | Fine-grained Image-Question MFB fusion 151 | ''' 152 | 153 | n.mfb_q_proj = L.InnerProduct(n.q_feat_resh, num_output=config.JOINT_EMB_SIZE, 154 | weight_filler=dict(type='xavier')) 155 | n.mfb_i_proj = L.InnerProduct(n.iatt_feat_concat_resh, num_output=config.JOINT_EMB_SIZE, 156 | weight_filler=dict(type='xavier')) 157 | n.mfb_iq_eltwise = L.Eltwise(n.mfb_q_proj, n.mfb_i_proj, eltwise_param=dict(operation=0)) # element-wise product 158 | n.mfb_iq_drop = L.Dropout(n.mfb_iq_eltwise, dropout_param={'dropout_ratio':config.MFB_DROPOUT_RATIO}) 159 | n.mfb_iq_resh = L.Reshape(n.mfb_iq_drop, reshape_param=dict(shape=dict(dim=[-1,1,config.MFB_OUT_DIM,config.MFB_FACTOR_NUM]))) 160 | n.mfb_iq_sumpool = L.Pooling(n.mfb_iq_resh, pool=P.Pooling.SUM, \ 161 | pooling_param=dict(kernel_w=config.MFB_FACTOR_NUM, kernel_h=1)) # sum over the MFB_FACTOR_NUM factors 162 | n.mfb_out = L.Reshape(n.mfb_iq_sumpool,\ 163 | reshape_param=dict(shape=dict(dim=[-1,config.MFB_OUT_DIM]))) 164 | n.mfb_sign_sqrt = L.SignedSqrt(n.mfb_out) # power normalization 165 | n.mfb_l2 = L.L2Normalize(n.mfb_sign_sqrt) 166 | 167 | n.prediction = L.InnerProduct(n.mfb_l2, num_output=config.NUM_OUTPUT_UNITS, 168 | weight_filler=dict(type='xavier')) 169 | if mode == 'val': 170 | n.loss = L.SoftmaxWithLoss(n.prediction, n.label) # hard labels for validation 171 | else: 172 | n.loss = L.SoftmaxKLDLoss(n.prediction, n.label) # KL divergence against the soft answer distribution 173 | return n.to_proto() 174 |
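The block from mfb_iq_eltwise to mfb_l2 is the MFB pooling itself: project both modalities to JOINT_EMB_SIZE, multiply element-wise, sum-pool groups of MFB_FACTOR_NUM values down to MFB_OUT_DIM, then apply power and L2 normalization. A numpy sketch of the same computation for a single sample, assuming the config defaults (JOINT_EMB_SIZE = MFB_FACTOR_NUM * MFB_OUT_DIM):

import numpy as np

k, o = 5, 1000                        # MFB_FACTOR_NUM, MFB_OUT_DIM
q_proj = np.random.randn(k * o)       # question feature after its InnerProduct
i_proj = np.random.randn(k * o)       # image feature after its InnerProduct
z = q_proj * i_proj                   # Eltwise with operation=0 (PROD)
z = z.reshape(o, k).sum(axis=1)       # sum-pool each group of k factors -> 1000-d
z = np.sign(z) * np.sqrt(np.abs(z))   # SignedSqrt
z = z / np.linalg.norm(z)             # L2Normalize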
235 | """ 236 | print 'making question vocab...', config.QUESTION_VOCAB_SPACE 237 | qdic, _ = VQADataProvider.load_data(config.QUESTION_VOCAB_SPACE) 238 | question_vocab = make_question_vocab(qdic) 239 | print 'making answer vocab...', config.ANSWER_VOCAB_SPACE 240 | _, adic = VQADataProvider.load_data(config.ANSWER_VOCAB_SPACE) 241 | answer_vocab = make_answer_vocab(adic, config.NUM_OUTPUT_UNITS) 242 | return question_vocab, answer_vocab 243 | 244 | def main(): 245 | folder = 'mfb_coatt_glove_q%dv%d_%s'%(config.NUM_QUESTION_GLIMPSE, config.NUM_IMG_GLIMPSE,config.TRAIN_DATA_SPLITS) 246 | if not os.path.exists('./%s'%folder): 247 | os.makedirs('./%s'%folder) 248 | 249 | question_vocab, answer_vocab = {}, {} 250 | if os.path.exists('./%s/vdict.json'%folder) and os.path.exists('./%s/adict.json'%folder): 251 | print 'restoring vocab' 252 | with open('./%s/vdict.json'%folder,'r') as f: 253 | question_vocab = json.load(f) 254 | with open('./%s/adict.json'%folder,'r') as f: 255 | answer_vocab = json.load(f) 256 | else: 257 | question_vocab, answer_vocab = make_vocab_files() 258 | with open('./%s/vdict.json'%folder,'w') as f: 259 | json.dump(question_vocab, f) 260 | with open('./%s/adict.json'%folder,'w') as f: 261 | json.dump(answer_vocab, f) 262 | 263 | print 'question vocab size:', len(question_vocab) 264 | print 'answer vocab size:', len(answer_vocab) 265 | 266 | with open('./%s/proto_train.prototxt'%folder, 'w') as f: 267 | f.write(str(mfb_coatt(config.TRAIN_DATA_SPLITS, config.BATCH_SIZE, \ 268 | config.MAX_WORDS_IN_QUESTION, len(question_vocab), folder))) 269 | 270 | with open('./%s/proto_test.prototxt'%folder, 'w') as f: 271 | f.write(str(mfb_coatt('val', config.VAL_BATCH_SIZE, \ 272 | config.MAX_WORDS_IN_QUESTION, len(question_vocab), folder))) 273 | 274 | with open('./%s/solver.prototxt'%folder, 'w') as f: 275 | f.write(str(get_solver(folder))) 276 | with open('./%s/auxiliary.json'%folder, 'w') as f: 277 | json.dump(get_auxiliary_json(),f, indent=2) 278 | 279 | caffe.set_device(config.TRAIN_GPU_ID) 280 | caffe.set_mode_gpu() 281 | solver = caffe.get_solver('./%s/solver.prototxt'%folder) 282 | 283 | train_loss = np.zeros(config.MAX_ITERATIONS+1) 284 | results = [] 285 | 286 | if config.RESTORE_ITER: 287 | restore_iter = config.RESTORE_ITER 288 | solver.restore('./%s/_iter_%d.solverstate'%(folder,restore_iter)) 289 | else: 290 | restore_iter = 0 291 | 292 | start = time.clock() 293 | for it in range(restore_iter, config.MAX_ITERATIONS+1): 294 | solver.step(1) 295 | 296 | # store the train loss 297 | train_loss[it] = solver.net.blobs['loss'].data 298 | 299 | if it % config.PRINT_INTERVAL == 0 and it != 0: 300 | elapsed = (time.clock() - start) 301 | print 'Iteration:', it 302 | c_mean_loss = train_loss[it-config.PRINT_INTERVAL:it].mean() 303 | print 'Train loss:', c_mean_loss, ' Elapsed seconds:', elapsed 304 | start = time.clock() 305 | if it % config.VALIDATE_INTERVAL == 0 and it != restore_iter: 306 | model_name = './%s/tmp.caffemodel'%(folder) 307 | solver.net.save(model_name) 308 | print 'Validating...' 309 | 310 | # for test-dev /test set. the json file will be generated under the file 311 | exec_validation(config.TEST_GPU_ID, 'test-dev', model_name, it=it, folder=folder) 312 | caffe.set_device(config.TRAIN_GPU_ID) 313 | ''' 314 | #for val set. 
314 | # for the val set, accuracy is computed and plotted 315 | test_loss, acc_overall, acc_per_ques, acc_per_ans = exec_validation(config.TEST_GPU_ID, 'val', model_name, it=it, folder=folder) 316 | caffe.set_device(config.TRAIN_GPU_ID) 317 | print 'Test loss:', test_loss 318 | print 'Accuracy:', acc_overall 319 | print 'Accuracy per answer:', acc_per_ans 320 | results.append([it, c_mean_loss, test_loss, acc_overall, acc_per_ques, acc_per_ans]) 321 | best_result_idx = np.array([x[3] for x in results]).argmax() 322 | print 'Best accuracy of', results[best_result_idx][3], 'was at iteration', results[best_result_idx][0] 323 | drawgraph(results,folder,config.MFB_FACTOR_NUM,config.MFB_OUT_DIM,prefix='mfb_coatt_glove') 324 | ''' 325 | if __name__ == '__main__': 326 | main() 327 | --------------------------------------------------------------------------------
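Once training has produced a snapshot, the generated proto_test.prototxt can be loaded for scoring. A minimal sketch, assuming a saved tmp.caffemodel in a hypothetical run folder; in val mode the Python data layer leaves the input blobs for the caller to fill, which is what utils.exec_validation does:

import caffe
import json

folder = 'mfb_coatt_glove_q2v2_train'   # hypothetical run-folder name
net = caffe.Net('./%s/proto_test.prototxt' % folder,
                './%s/tmp.caffemodel' % folder, caffe.TEST)
# ... fill net.blobs['data'], ['cont'], ['img_feature'] and ['glove'] here ...
net.forward()
scores = net.blobs['prediction'].data   # (VAL_BATCH_SIZE, NUM_OUTPUT_UNITS)

# decode the top answer for the first sample via the saved answer vocabulary
adict = json.load(open('./%s/adict.json' % folder))
rev_adict = {v: k for k, v in adict.items()}
print rev_adict[int(scores[0].argmax())]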