├── .gitignore ├── .gitmodules ├── Dockerfile ├── LICENSE ├── README.md ├── cfgs ├── noc_coco_res101.yml ├── noc_coco_vgg16.yml ├── normal_coco_res101.yml └── robust_coco.yml ├── data ├── README.md ├── coco │ └── coco_class_name.txt ├── flickr30k │ └── flickr30k_class_name.txt ├── noc_coco │ └── split_noc_coco.json └── robust_coco │ └── split_robust_coco.json ├── demo.py ├── demo ├── img1.png └── img2.png ├── generate_robust_split.py ├── main.py ├── misc ├── AttModel.py ├── CaptionModel.py ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-36.pyc │ └── eval_utils.cpython-36.pyc ├── bak.py ├── bbox_transform.py ├── dataloader_coco.py ├── dataloader_flickr30k.py ├── dataloader_hdf.py ├── eval_utils.py ├── model.py ├── resnet.py ├── rewards.py ├── utils.py └── vgg16.py ├── opts.py ├── pooling ├── __init__.py ├── make.sh └── roi_align │ ├── __init__.py │ ├── _ext │ ├── __init__.py │ └── roi_align │ │ ├── __init__.py │ │ └── _roi_align.so │ ├── build.py │ ├── functions │ ├── __init__.py │ └── roi_align.py │ ├── make.sh │ ├── modules │ ├── __init__.py │ └── roi_align.py │ └── src │ ├── roi_align.c │ ├── roi_align.h │ ├── roi_align_cuda.c │ ├── roi_align_cuda.h │ ├── roi_align_kernel.cu │ └── roi_align_kernel.h ├── prepro ├── prepro_det.py ├── prepro_dic_coco.py ├── prepro_dic_flickr.py ├── prepro_ngrams.py ├── prepro_ngrams_bak.py └── prepro_ngrams_flickr30k.py └── tools ├── pycider ├── PyDataFormat │ ├── __init__.py │ ├── __init__.pyc │ ├── jsonify_refs.py │ ├── loadData.py │ └── loadData.pyc ├── README.md ├── cidereval.ipynb ├── cidereval.py ├── license.txt ├── params.json └── pyciderevalcap │ ├── __init__.py │ ├── __init__.pyc │ ├── cider │ ├── __init__.py │ ├── __init__.pyc │ ├── cider.py │ ├── cider.pyc │ ├── cider_scorer.py │ └── cider_scorer.pyc │ ├── ciderD │ ├── __init__.py │ ├── __init__.pyc │ ├── ciderD.py │ ├── ciderD.pyc │ ├── ciderD_scorer.py │ └── ciderD_scorer.pyc │ ├── eval.py │ ├── eval.pyc │ └── tokenizer │ ├── __init__.py │ ├── __init__.pyc │ ├── ptbtokenizer.py │ ├── ptbtokenizer.pyc │ ├── stanford-corenlp-3.4.1.jar │ ├── tmpBF49XX │ ├── tmpql9uU7 │ ├── tmpuCp_T0 │ ├── tmpxAmV_C │ └── tmpzNW4I2 └── sentence_gen_tools ├── __init__.py └── coco_eval.py /.gitignore: -------------------------------------------------------------------------------- 1 | # project specific 2 | save/ 3 | 4 | # remote ftp files 5 | .ftpconfig 6 | 7 | # Byte-compiled / optimized / DLL files 8 | __pycache__/ 9 | *.py[cod] 10 | *$py.class 11 | 12 | # C extensions 13 | *.so 14 | 15 | # Distribution / packaging 16 | .Python 17 | .vector_cache/ 18 | env/ 19 | env3/ 20 | build/ 21 | data/ 22 | logs/ 23 | develop-eggs/ 24 | dist/ 25 | downloads/ 26 | eggs/ 27 | .eggs/ 28 | lib/ 29 | lib64/ 30 | parts/ 31 | sdist/ 32 | var/ 33 | *.egg-info/ 34 | .installed.cfg 35 | *.egg 36 | 37 | # PyInstaller 38 | # Usually these files are written by a python script from a template 39 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
40 | *.manifest 41 | *.spec 42 | 43 | # logs 44 | pip-log.txt 45 | pip-delete-this-directory.txt 46 | *.log 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # PyBuilder 53 | target/ 54 | 55 | # IPython Notebook 56 | .ipynb_checkpoints 57 | 58 | # pyenv 59 | .python-version 60 | 61 | # celery beat schedule file 62 | celerybeat-schedule 63 | 64 | # dotenv 65 | .env 66 | 67 | # virtualenv 68 | venv/ 69 | ENV/ 70 | 71 | # Spyder project settings 72 | .spyderproject 73 | 74 | # Rope project settings 75 | .ropeproject 76 | 77 | # OS X files 78 | .DS_Store 79 | 80 | .idea 81 | media/ 82 | db.sqlite3 83 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "tools/coco-caption"] 2 | path = tools/coco-caption 3 | url = https://www.github.com/kdexd/coco-caption 4 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:0.4-cuda9-cudnn7-devel 2 | 3 | COPY . /workspace/neuralbabytalk 4 | 5 | # ---------------------------------------------------------------------------- 6 | # -- install apt and pip dependencies 7 | # ---------------------------------------------------------------------------- 8 | 9 | RUN apt-get update && \ 10 | apt-get install -y \ 11 | ant \ 12 | ca-certificates-java \ 13 | nano \ 14 | openjdk-8-jdk \ 15 | python2.7 \ 16 | unzip \ 17 | wget && \ 18 | apt-get clean 19 | 20 | ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64/ 21 | RUN update-ca-certificates -f && export JAVA_HOME 22 | 23 | RUN pip install Cython && pip install h5py \ 24 | matplotlib \ 25 | nltk \ 26 | numpy \ 27 | pycocotools \ 28 | scikit-image \ 29 | stanfordcorenlp \ 30 | tensorflow \ 31 | torchtext \ 32 | tqdm && python -c "import nltk; nltk.download('punkt')" 33 | 34 | 35 | # ---------------------------------------------------------------------------- 36 | # -- download pretrained imagenet weights for resnet-101 37 | # ---------------------------------------------------------------------------- 38 | 39 | RUN mkdir /workspace/neuralbabytalk/data/imagenet_weights && \ 40 | cd /workspace/neuralbabytalk/data/imagenet_weights && \ 41 | wget --quiet https://www.dropbox.com/sh/67fc8n6ddo3qp47/AAACkO4QntI0RPvYic5voWHFa/resnet101.pth 42 | 43 | 44 | # ---------------------------------------------------------------------------- 45 | # -- download Karpathy's preprocessed captions datasets and corenlp jar 46 | # ---------------------------------------------------------------------------- 47 | 48 | RUN cd /workspace/neuralbabytalk/data && \ 49 | wget --quiet http://cs.stanford.edu/people/karpathy/deepimagesent/caption_datasets.zip && \ 50 | unzip caption_datasets.zip && \ 51 | mv dataset_coco.json coco/ && \ 52 | mv dataset_flickr30k.json flickr30k/ && \ 53 | rm caption_datasets.zip dataset_flickr8k.json 54 | 55 | RUN cd /workspace/neuralbabytalk/prepro && \ 56 | wget --quiet https://nlp.stanford.edu/software/stanford-corenlp-full-2017-06-09.zip && \ 57 | unzip stanford-corenlp-full-2017-06-09.zip && \ 58 | rm stanford-corenlp-full-2017-06-09.zip 59 | 60 | RUN cd /workspace/neuralbabytalk/tools/coco-caption && \ 61 | sh get_stanford_models.sh 62 | 63 | # ---------------------------------------------------------------------------- 64 | # -- download preprocessed COCO detection output HDF file and pretrained model 65 | # 
---------------------------------------------------------------------------- 66 | 67 | RUN cd /workspace/neuralbabytalk/data/coco && \ 68 | wget --quiet https://www.dropbox.com/s/2gzo4ops5gbjx5h/coco_detection.h5.tar.gz && \ 69 | tar -xzvf coco_detection.h5.tar.gz && \ 70 | rm coco_detection.h5.tar.gz 71 | 72 | RUN mkdir -p /workspace/neuralbabytalk/save && \ 73 | cd /workspace/neuralbabytalk/save && \ 74 | wget --quiet https://www.dropbox.com/s/6buajkxm9oed1jp/coco_nbt_1024.tar.gz && \ 75 | tar -xzvf coco_nbt_1024.tar.gz && \ 76 | rm coco_nbt_1024.tar.gz 77 | 78 | WORKDIR /workspace/neuralbabytalk 79 | RUN python prepro/prepro_dic_coco.py \ 80 | --input_json data/coco/dataset_coco.json \ 81 | --split normal \ 82 | --output_dic_json data/coco/dic_coco.json \ 83 | --output_cap_json data/coco/cap_coco.json && \ 84 | python prepro/prepro_dic_coco.py \ 85 | --input_json data/coco/dataset_coco.json \ 86 | --split robust \ 87 | --output_dic_json data/robust_coco/dic_coco.json \ 88 | --output_cap_json data/robust_coco/cap_coco.json && \ 89 | python prepro/prepro_dic_coco.py \ 90 | --input_json data/coco/dataset_coco.json \ 91 | --split noc \ 92 | --output_dic_json data/noc_coco/dic_coco.json \ 93 | --output_cap_json data/noc_coco/cap_coco.json 94 | 95 | EXPOSE 8888 96 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Jiasen Lu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Neural Baby Talk 2 | 3 | ![teaser results](demo/img1.png) 4 | 5 | 6 | ## Docker Setup 7 | 8 | This repository provides a Dockerfile for setting up all dependencies and preprocessed data for COCO experiments (normal / robust / NOC). Docker support for Flickr30k experiments is not yet supported. To build the Dockerfile, just execute this from project root: 9 | 10 | ```shell 11 | docker build -t nbt . 12 | ``` 13 | 14 | Before running the container, you need to get COCO dataset downloaded and kept somewhere in your filesystem. Declare two environment variables: 15 | 1. `$COCO_IMAGES`: path to a directory with sub-directories of images as `train2014`, `val2014`, `test2015`, etc... 16 | 2. 
`$COCO_ANNOTATIONS`: path to a directory with annotation files like `instances_train2014.json`, `captions_train2014.json`, etc... 17 | 18 | These directories will be attached as "volumes" to our docker container for Neural Baby Talk to use. Run the docker image within a container in interactive mode (bash session): get [nvidia-docker](https://www.github.com/NVIDIA/nvidia-docker) and execute this command to run the freshly built docker image. 19 | 20 | ```shell 21 | nvidia-docker run --name nbt_container -it \ 22 | -v $COCO_IMAGES:/workspace/neuralbabytalk/data/coco/images \ 23 | -v $COCO_ANNOTATIONS:/workspace/neuralbabytalk/data/coco/annotations \ 24 | --shm-size 8G -p 8888:8888 nbt /bin/bash 25 | ``` 26 | 27 | Ideally, a shared memory size (`--shm-size`) of 8GB should be enough. Tune it according to your requirements / machine specifications. 28 | 29 | **Saved Checkpoints:** All checkpoints will be saved in `/workspace/neuralbabytalk/save`. The container exposes port 8888, which can be used to host tensorboard visualizations. From outside the container, execute this to copy your checkpoints from the container into your local filesystem: 30 | 31 | 32 | ```shell 33 | docker container cp nbt_container:workspace/neuralbabytalk/save /path/to/local/filesystem/save 34 | ``` 35 | 36 | Skip directly to the **Training and Evaluation** section to execute the specified commands within the container. 37 | 38 | 39 | ## Requirements 40 | 41 | Inference: 42 | 43 | - [pytorch](http://pytorch.org/) 44 | - [torchvision](https://github.com/pytorch/vision) 45 | - [torchtext](https://github.com/pytorch/text) 46 | 47 | Data Preparation: 48 | 49 | - [stanford-corenlp-wrapper](https://github.com/Lynten/stanford-corenlp) 50 | - [stanford-corenlp](https://stanfordnlp.github.io/CoreNLP/) 51 | 52 | Evaluation: 53 | 54 | - [coco-caption](https://github.com/jiasenlu/coco-caption): Download this modified version of coco-caption and put it under `tools/` 55 | 56 | 57 | ## Demo 58 | 59 | #### Without detection bbox 60 | 61 | 62 | #### With detection bbox 63 | 64 | #### Constrained beam search 65 | This code also includes an implementation of the constrained beam search proposed by Peter Anderson. I'm not sure my implementation is 100% correct, but it works well in conjunction with the neural baby talk code. You can refer to [this](http://users.cecs.anu.edu.au/~sgould/papers/emnlp17-constrained-beam-search.pdf) paper for more details. To enable CBS while decoding, set the following flags: 66 | ``` 67 | --cbs True|False : Whether to use constrained beam search. 68 | --cbs_tag_size 3 : How many detection bboxes to include in the decoded caption. 69 | --cbs_mode all|unique|novel : Whether to allow repeated bounding boxes. `novel` is an option only for the novel object captioning task. 70 | ``` 71 | 72 | ## Training and Evaluation 73 | ### Data Preparation 74 | Head to `data/README.md` and prepare the data for training and evaluation.
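For reference, after the preparation steps in `data/README.md` the `data/` directory should look roughly like the sketch below. This layout is inferred from the default paths in `cfgs/*.yml` and the Dockerfile, so treat it as a guide rather than an exact listing; only the splits you plan to run are needed, and Flickr30k follows the same pattern under `data/flickr30k/`.

```
data/
├── coco/
│   ├── annotations/          # instances_*2014.json, captions_*2014.json
│   ├── images/               # train2014/, val2014/
│   ├── dataset_coco.json     # Karpathy split
│   ├── dic_coco.json         # output of prepro_dic_coco.py --split normal
│   ├── cap_coco.json
│   └── coco_detection.h5     # pre-extracted detection proposals
├── robust_coco/
│   ├── dic_coco.json         # output of prepro_dic_coco.py --split robust
│   └── cap_coco.json
├── noc_coco/
│   ├── dic_coco.json         # output of prepro_dic_coco.py --split noc
│   └── cap_coco.json
└── imagenet_weights/
    └── resnet101.pth         # pretrained CNN weights
```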
75 | 76 | ### Pretrained model 77 | | Task | Dataset | Backend | Batch size | Link | 78 | | ---- | :----:| :----:| :----:|:----:| 79 | | Standard image captioning | COCO | Res-101 | 100 | [Pre-trained Model](https://www.dropbox.com/s/6buajkxm9oed1jp/coco_nbt_1024.tar.gz?dl=0) | 80 | | Standard image captioning | Flickr30k | Res-101 | 50 | [Pre-trained Model](https://www.dropbox.com/s/cirzj1b2jul6yzx/flickr30k_nbt_1024.tar.gz?dl=0) | 81 | | Robust image captioning | COCO | Res-101 | 100 | [Pre-trained Model](https://www.dropbox.com/s/sxuodvob0ftesm9/robust_coco_nbt_1024.tar.gz?dl=0) | 82 | | Novel object captioning | COCO | Res-101 | 100 | [Pre-trained Model](https://www.dropbox.com/s/b7i6vx5pf98540l/noc_coco_nbt_1024.tar.gz?dl=0) | 83 | 84 | 85 | ### Standard Image Captioning 86 | ##### Training (COCO) 87 | 88 | First, modify the config file `cfgs/normal_coco_res101.yml` with the correct file paths. 89 | 90 | ``` 91 | python main.py --path_opt cfgs/normal_coco_res101.yml --batch_size 20 --cuda True --num_workers 20 --max_epoch 30 92 | ``` 93 | ##### Evaluation (COCO) 94 | Download the pre-trained model, extract the tar.gz file, and put it under `save/`. 95 | 96 | ``` 97 | python main.py --path_opt cfgs/normal_coco_res101.yml --batch_size 20 --cuda True --num_workers 20 --max_epoch 30 --inference_only True --beam_size 3 --start_from save/coco_nbt_1024 98 | ``` 99 | 100 | ##### Training (Flickr30k) 101 | Modify the config file `cfgs/normal_flickr_res101.yml` with the correct file paths. 102 | 103 | ``` 104 | python main.py --path_opt cfgs/normal_flickr_res101.yml --batch_size 20 --cuda True --num_workers 20 --max_epoch 30 105 | ``` 106 | 107 | ##### Evaluation (Flickr30k) 108 | Download the pre-trained model, extract the tar.gz file, and put it under `save/`. 109 | 110 | ``` 111 | python main.py --path_opt cfgs/normal_flickr_res101.yml --batch_size 20 --cuda True --num_workers 20 --max_epoch 30 --inference_only True --beam_size 3 --start_from save/flickr30k_nbt_1024 112 | ``` 113 | 114 | ### Robust Image Captioning 115 | 116 | ##### Training 117 | Modify the config file `cfgs/robust_coco.yml` with the correct file paths. 118 | 119 | ``` 120 | python main.py --path_opt cfgs/robust_coco.yml --batch_size 20 --cuda True --num_workers 20 --max_epoch 30 121 | ``` 122 | ##### Evaluation (robust-coco) 123 | Download the pre-trained model, extract the tar.gz file, and put it under `save/`. 124 | 125 | ``` 126 | python main.py --path_opt cfgs/robust_coco.yml --batch_size 20 --cuda True --num_workers 20 --max_epoch 30 --inference_only True --beam_size 3 --start_from save/robust_coco_nbt_1024 127 | ``` 128 | 129 | ### Novel Object Captioning 130 | 131 | ##### Training 132 | Modify the config file `cfgs/noc_coco_res101.yml` with the correct file paths. 133 | 134 | ``` 135 | python main.py --path_opt cfgs/noc_coco_res101.yml --batch_size 20 --cuda True --num_workers 20 --max_epoch 30 136 | ``` 137 | ##### Evaluation (noc-coco) 138 | Download the pre-trained model, extract the tar.gz file, and put it under `save/`. 139 | 140 | ``` 141 | python main.py --path_opt cfgs/noc_coco_res101.yml --batch_size 20 --cuda True --num_workers 20 --max_epoch 30 --inference_only True --beam_size 3 --start_from save/noc_coco_nbt_1024 142 | ``` 143 | 144 | ### Multi-GPU Training 145 | This codebase also supports training with multiple GPUs. To enable this feature, simply add `--mGPUs True` to the command. 146 | 147 | ### Self-Critical Training and Fine-Tuning CNN 148 | 149 | This codebase also supports self-critical training and fine-tuning the CNN.
You are welcome to try this part and upload your trained model to the repo! 150 | 151 | ## More Visualization Results 152 | ![teaser results](demo/img2.png) 153 | 154 | ## Reference 155 | If you use this code as part of any published research, please acknowledge the following paper 156 | 157 | ``` 158 | @inproceedings{Lu2018Neural, 159 | author = {Lu, Jiasen and Yang, Jianwei and Batra, Dhruv and Parikh, Devi}, 160 | title = {Neural Baby Talk}, 161 | booktitle = {CVPR}, 162 | year = {2018} 163 | } 164 | ``` 165 | ## Acknowledgement 166 | We thank Ruotian Luo for his [self-critical.pytorch](https://github.com/ruotianluo/self-critical.pytorch) repo. 167 | -------------------------------------------------------------------------------- /cfgs/noc_coco_res101.yml: -------------------------------------------------------------------------------- 1 | # dataset setting 2 | dataset: coco 3 | input_json: '/srv/share/jiasenlu/nbtv2/data/coco/cap_coco.json' 4 | input_dic: '/srv/share/jiasenlu/nbtv2/data/coco_noc/dic_coco.json' 5 | image_path: '/srv/share/datasets/coco/images' 6 | proposal_h5: '/srv/share/jiasenlu/nbtv2/data/coco_noc/coco_noc_detection.h5' 7 | data_path: '/srv/share/jiasenlu/nbtv2/data' 8 | # language model 9 | cnn_backend: res101 10 | att_model: topdown 11 | rnn_size: 1024 12 | num_layers: 1 13 | seq_length: 20 14 | # image model 15 | image_size: 576 16 | image_crop_size: 512 17 | # decode setting 18 | decode_noc: True 19 | cached_tokens: 'coco-train-idxs' 20 | val_split: 'test' 21 | val_images_use: -1 22 | cider_df: 'noc_test_freq' 23 | optim: 'adam' -------------------------------------------------------------------------------- /cfgs/noc_coco_vgg16.yml: -------------------------------------------------------------------------------- 1 | # dataset setting 2 | dataset: coco 3 | input_json: '/srv/share/jiasenlu/nbtv2/data/coco/cap_coco.json' 4 | input_dic: '/srv/share/jiasenlu/nbtv2/data/coco_noc/dic_coco.json' 5 | image_path: '/srv/share/datasets/coco/images' 6 | proposal_h5: '/srv/share/jiasenlu/nbtv2/data/coco_noc/coco_noc_detection.h5' 7 | checkpoint_path: '/srv/share/jiasenlu/nbtv2/model/coco_noc_vgg16' 8 | data_path: '/srv/share/jiasenlu/nbtv2/data' 9 | # language model 10 | cnn_backend: vgg16 11 | att_feat_size: 512 12 | fc_feat_size: 4096 13 | att_model: topdown 14 | rnn_size: 1024 15 | num_layers: 1 16 | seq_length: 20 17 | # image model 18 | image_size: 576 19 | image_crop_size: 512 20 | # decode setting 21 | decode_noc: True 22 | cached_tokens: 'coco-train-idxs' 23 | val_split: 'test' 24 | val_images_use: -1 25 | cider_df: 'noc_test_freq' -------------------------------------------------------------------------------- /cfgs/normal_coco_res101.yml: -------------------------------------------------------------------------------- 1 | # dataset setting 2 | dataset: coco 3 | input_json: 'data/coco/cap_coco.json' 4 | input_dic: 'data/coco/dic_coco.json' 5 | image_path: 'data/coco/images' 6 | proposal_h5: 'data/coco/coco_detection.h5' 7 | data_path: 'data' 8 | # language model 9 | cnn_backend: res101 10 | att_model: topdown 11 | rnn_size: 1024 12 | num_layers: 1 13 | seq_length: 20 14 | # image model 15 | image_size: 576 16 | image_crop_size: 512 17 | # decode setting 18 | decode_noc: False 19 | cached_tokens: 'coco-train-idxs' 20 | val_split: 'test' 21 | val_images_use: -1 22 | cider_df: 'corpus' 23 | optim: 'adam' 24 | checkpoint_path: 'save/normal_coco_1024_adam' 25 | -------------------------------------------------------------------------------- /cfgs/robust_coco.yml: 
-------------------------------------------------------------------------------- 1 | # dataset setting 2 | dataset: coco 3 | input_json: 'data/robust_coco/cap_coco.json' 4 | input_dic: 'data/robust_coco/dic_coco.json' 5 | image_path: 'data/coco/images' 6 | proposal_h5: 'data/coco/coco_detection.h5' 7 | data_path: 'data' 8 | # language model 9 | cnn_backend: res101 10 | att_model: topdown 11 | rnn_size: 1024 12 | num_layers: 1 13 | seq_length: 20 14 | # image model 15 | image_size: 576 16 | image_crop_size: 512 17 | # decode setting 18 | decode_noc: False 19 | cached_tokens: 'coco-all-idxs' 20 | val_split: 'test' 21 | val_images_use: -1 22 | cider_df: 'corpus' 23 | optim: 'adam' 24 | checkpoint_path: 'save/robust_coco_1024' 25 | -------------------------------------------------------------------------------- /data/README.md: -------------------------------------------------------------------------------- 1 | ## Data Preparation for Neural Baby Talk 2 | ### Image Dataset 3 | 4 | - COCO: Download the COCO images from [link](http://cocodataset.org/#download); we need the `2014 training` and `2014 val` images. You should put the images in some directory, denoted as `$IMAGE_ROOT`. 5 | 6 | - Flickr30k: Download the Flickr30k Entities images from [link](http://web.engr.illinois.edu/~bplumme2/Flickr30kEntities/); you may need to fill out a form to get the annotations. 7 | 8 | ### Pretrained CNN weight 9 | - Download the pretrained CNN weights from [link](https://www.dropbox.com/sh/67fc8n6ddo3qp47/AADUMRqlcvjv4zqBX6K2L8c2a?dl=0) and put them into `data/`. 10 | 11 | ### COCO 12 | - Download Karpathy's preprocessed split of the COCO captions from [link](http://cs.stanford.edu/people/karpathy/deepimagesent/caption_datasets.zip). Extract `dataset_coco.json` from the zip file and copy it into `coco/`. 13 | - Download the COCO 2014 Train/Val annotations from [link](http://images.cocodataset.org/annotations/annotations_trainval2014.zip). Extract the zip file and put the json files under `coco/annotations/`. 14 | - Download the Stanford CoreNLP tools and modify `prepro/prepro_dic_coco.py` with the correct Stanford CoreNLP location. (In my experiment, I use the `stanford-corenlp-full-2017-06-09` version [link](https://nlp.stanford.edu/software/stanford-corenlp-full-2017-06-09.zip)) 15 | - You can either download the preprocessed data from [here](https://www.dropbox.com/s/1t9nrbevzqn93to/coco.tar.gz?dl=0) or use the pre-process script to generate the data. Under the root directory, run the following command to pre-process the data. 16 | ``` 17 | python prepro/prepro_dic_coco.py --input_json data/coco/dataset_coco.json --split normal --output_dic_json data/coco/dic_coco.json --output_cap_json data/coco/cap_coco.json 18 | ``` 19 | - Download the pre-extracted COCO detection result from [link](https://www.dropbox.com/s/2gzo4ops5gbjx5h/coco_detection.h5.tar.gz?dl=0), extract the tar.gz file, and copy it into `coco/`. You can also extract detections using our reimplementation of the Faster R-CNN code, or any existing detection framework. The format of the bounding box data will be added later. 20 | - After all these steps, we are ready to train the model for coco :) 21 | 22 | ### Flickr30k 23 | - Download Karpathy's preprocessed caption splits from [link](http://cs.stanford.edu/people/karpathy/deepimagesent/caption_datasets.zip). Extract `dataset_flickr30k.json` from the zip file and copy it into `flickr30k/`.
24 | - Download the preprocessed Flickr30k annotations for NeuralBabyTalk (annotations that link the nouns to specific bounding boxes) from [link](https://www.dropbox.com/s/h4ru86ocb10axa1/flickr30k_cleaned_class.json.tar.gz?dl=0). Extract the tar.gz file and copy it into `flickr30k/`. 25 | - Download Stanford CoreNLP as in the COCO instructions. 26 | - You can either download the preprocessed data from [here](https://www.dropbox.com/s/twve5exs8qj9xgd/flickr30k.tar.gz?dl=0) or use the pre-process script to generate the data. Under the root directory, run the following command to pre-process the data. 27 | ``` 28 | python prepro/prepro_dic_flickr.py --input_json data/flickr30k/dataset_flickr30k.json --input_class_name data/flickr30k/flickr30k_class_name.txt 29 | ``` 30 | - Download the pre-extracted Flickr30k detection result from [link](https://www.dropbox.com/s/5o6so7h4xq5ki1t/flickr30k_detection.h5.tar.gz?dl=0), extract the tar.gz file, and copy it into `flickr30k/`. You can also extract detections using our reimplementation of the Faster R-CNN code, or any existing detection framework. The format of the bounding box data will be added later. 31 | - After all these steps, we are ready to train the model for flickr30k :) 32 | 33 | ### Robust-COCO 34 | - Follow steps 1-3 and 5 of the COCO instructions. 35 | - You can either download the preprocessed data from [here](https://www.dropbox.com/s/tevyub9rxz6d22l/coco_robust.tar.gz?dl=0) or use the pre-process script to generate the data. Under the root directory, run the following command to pre-process the data. 36 | ``` 37 | python prepro/prepro_dic_coco.py --input_json data/coco/dataset_coco.json --split robust --output_dic_json data/robust_coco/dic_coco.json --output_cap_json data/robust_coco/cap_coco.json 38 | ``` 39 | 40 | ### NOC-COCO 41 | - Follow steps 1-3 of the COCO instructions. 42 | - You can either download the preprocessed data from [here](https://www.dropbox.com/s/tevyub9rxz6d22l/coco_robust.tar.gz?dl=0) or use the pre-process script to generate the data. Under the root directory, run the following command to pre-process the data. 43 | ``` 44 | python prepro/prepro_dic_coco.py --input_json data/coco/dataset_coco.json --split noc --output_dic_json data/noc_coco/dic_coco.json --output_cap_json data/noc_coco/cap_coco.json 45 | ``` 46 | - Download the pre-extracted COCO detection result trained on `train2014` from [link](https://www.dropbox.com/s/2gzo4ops5gbjx5h/coco_detection.h5.tar.gz?dl=0), extract the tar.gz file, and copy it into `coco/`. You can also extract detections using our reimplementation of the Faster R-CNN code, or any existing detection framework. The format of the bounding box data will be added later.
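As a quick sanity check after preprocessing, the generated files can be opened directly. The snippet below is a minimal sketch, assuming the default output paths used in the commands above; it only prints top-level keys, since the exact internal layout of the detection HDF5 file is not documented here.

```python
import json
import h5py  # listed in the Dockerfile's pip dependencies

# Dictionary / caption files produced by prepro/prepro_dic_coco.py (normal split).
dic = json.load(open('data/coco/dic_coco.json'))
cap = json.load(open('data/coco/cap_coco.json'))
print('dic_coco.json keys:', list(dic.keys()))
print('cap_coco.json entries:', len(cap))

# Pre-extracted detection proposals.
with h5py.File('data/coco/coco_detection.h5', 'r') as det:
    print('coco_detection.h5 datasets:', list(det.keys()))
```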
47 | 48 | -------------------------------------------------------------------------------- /data/coco/coco_class_name.txt: -------------------------------------------------------------------------------- 1 | person, girl, boy, man, woman, kid, child, chef, baker, people, adult, rider, children, baby, worker, passenger, sister, biker, policeman, cop, officer, lady, cowboy, bride, groom, male, female, guy, traveler, mother, father, gentleman, pitcher, player, skier, snowboarder, skater, skateboarder, person, woman, guy, foreigner, child, gentleman, caller, offender, coworker, trespasser, patient, politician, soldier, grandchild, serviceman, walker, drinker, doctor, bicyclist, thief, buyer, teenager, student, camper, driver, solider, hunter, shopper, villager 2 | bicycle, bike, bicycle, bike, unicycle, minibike, trike 3 | car, automobile, van, minivan, sedan, suv, hatchback, cab, jeep, coupe, taxicab, limo, taxi 4 | motorcycle, scooter, motor bike, motor cycle, motorbike, scooter, moped 5 | airplane, jetliner, plane, air plane, monoplane, aircraft, jet, jetliner, airbus, biplane, seaplane 6 | bus, minibus, trolley 7 | train, locomotive, tramway, caboose 8 | truck, pickup, lorry, hauler, firetruck 9 | boat, ship, liner, sailboat, motorboat, dinghy, powerboat, speedboat, canoe, skiff, yacht, kayak, catamaran, pontoon, houseboat, vessel, rowboat, trawler, ferryboat, watercraft, tugboat, schooner, barge, ferry, sailboard, paddleboat, lifeboat, freighter, steamboat, riverboat, surfboard, battleship, steamship 10 | traffic light, street light, traffic signal, stop light, streetlight, stoplight 11 | fire hydrant, hydrant 12 | stop sign, street sign 13 | parking meter 14 | bench, pew 15 | bird, ostrich, owl, seagull, goose, duck, parakeet, falcon, robin, pelican, waterfowl, heron, hummingbird, mallard, finch, pigeon, sparrow, seabird, osprey, blackbird, fowl, shorebird, woodpecker, egret, chickadee, quail, bluebird, kingfisher, buzzard, willet, gull, swan, bluejay, flamingo, cormorant, parrot, loon, gosling, waterbird, pheasant, rooster, sandpiper, crow, raven, turkey, oriole, cowbird, warbler, magpie, peacock, cockatiel, lorikeet, puffin, vulture, condor, macaw, peafowl, cockatoo, songbird 16 | cat, kitten, feline, tabby 17 | dog, puppy, beagle, pup, chihuahua, schnauzer, dachshund, rottweiler, canine, pitbull, collie, pug, terrier, poodle, labrador, doggie, doberman, mutt, doggy, spaniel, bulldog, sheepdog, weimaraner, corgi, cocker, greyhound, retriever, brindle, hound, whippet, husky 18 | horse, colt, pony, racehorse, stallion, equine, mare, foal, palomino, mustang, clydesdale, bronc, bronco 19 | sheep, lamb, goat, ram, cattle, lamb, goat, ewe 20 | cow, cattle, oxen, ox, calf, cattle, ewe, holstein, heifer, buffalo, bull, zebu, bison 21 | elephant 22 | bear, panda 23 | zebra 24 | giraffe 25 | backpack, knapsack 26 | umbrella 27 | handbag, wallet, purse, briefcase 28 | tie 29 | suitcase, suit case, luggage 30 | frisbee 31 | skis, ski 32 | snowboard 33 | sports ball, baseball, ball, football, soccer, basketball, softball, volleyball, pinball, fastball, racquetball 34 | kite 35 | baseball bat 36 | baseball glove 37 | skateboard 38 | surfboard, longboard, skimboard, shortboard, wakeboard 39 | tennis racket 40 | bottle 41 | wine glass 42 | cup 43 | fork 44 | knife, pocketknife, knive 45 | spoon 46 | bowl, container, plate 47 | banana 48 | apple 49 | sandwich, burger, sub, cheeseburger, hamburger 50 | orange, lemons 51 | broccoli 52 | carrot 53 | hot dog 54 | pizza 55 | donut, doughnut, bagel 56 | 
cake, cheesecake, cupcake, shortcake, coffeecake, pancake 57 | chair, seat, recliner, stool 58 | couch, sofa, recliner, futon, loveseat, settee, chesterfield 59 | potted plant, houseplant 60 | bed 61 | dining table, table 62 | toilet, urinal, commode, toilet, lavatory, potty 63 | tv, monitor, televison, television 64 | laptop, computer, notebook, netbook, lenovo, macbook 65 | mouse 66 | remote 67 | keyboard 68 | cell phone, mobile phone, phone, cellphone, telephone, phon, smartphone, iPhone 69 | microwave 70 | oven, stovetop, stove 71 | toaster 72 | sink 73 | refrigerator, fridge, fridge, freezer 74 | book 75 | clock 76 | vase 77 | scissors 78 | teddy bear, teddybear 79 | hair drier, hairdryer 80 | toothbrush -------------------------------------------------------------------------------- /data/flickr30k/flickr30k_class_name.txt: -------------------------------------------------------------------------------- 1 | railing 2 | blouse 3 | puppy 4 | climber 5 | glass 6 | hole 7 | machine 8 | wine 9 | backpack 10 | telescope 11 | basketball 12 | tree 13 | runner 14 | escalator 15 | concrete 16 | object 17 | mouth 18 | singer 19 | bush 20 | lady 21 | plate 22 | hammer 23 | jumpsuit 24 | worker 25 | child 26 | player 27 | violin 28 | hay 29 | hat 30 | pavement 31 | crowd 32 | people 33 | trampoline 34 | son 35 | fabric 36 | onlooker 37 | passenger 38 | floor 39 | chair 40 | team 41 | sign 42 | blond-hair 43 | wheelchair 44 | truck 45 | accordion 46 | apron 47 | container 48 | city 49 | alley 50 | knee 51 | lawn 52 | robe 53 | artist 54 | screen 55 | boat 56 | newspaper 57 | canoe 58 | purse 59 | advertisement 60 | instrument 61 | skier 62 | tube 63 | stone 64 | bubble 65 | cream 66 | beverage 67 | groom 68 | mask 69 | smile 70 | door 71 | male 72 | dress 73 | plant 74 | plane 75 | volleyball 76 | paper 77 | swimsuit 78 | number 79 | pedestrian 80 | pajamas 81 | banner 82 | set 83 | crosswalk 84 | church 85 | belt 86 | fire 87 | racer 88 | person 89 | sandal 90 | couple 91 | sock 92 | suit 93 | poster 94 | t-shirt 95 | line 96 | lane 97 | race 98 | athlete 99 | bird 100 | leg 101 | baby 102 | customer 103 | trumpet 104 | animal 105 | counter 106 | dock 107 | obstacle 108 | graffitus 109 | scarf 110 | shorts 111 | device 112 | face 113 | mustache 114 | painting 115 | biker 116 | motorcycle 117 | wire 118 | drum 119 | ramp 120 | doorway 121 | ball 122 | drink 123 | overalls 124 | desk 125 | pier 126 | stage 127 | ponytail 128 | bike 129 | blanket 130 | daughter 131 | sweater 132 | work 133 | beach 134 | ladder 135 | lap 136 | coffee 137 | band 138 | bread 139 | hurdle 140 | train 141 | sled 142 | goalie 143 | gentleman 144 | kitchen 145 | cow 146 | cone 147 | wheel 148 | rail 149 | hand 150 | goggles 151 | board 152 | gun 153 | sidewalk 154 | uniform 155 | teacher 156 | pillow 157 | snowboard 158 | market 159 | car 160 | cap 161 | cat 162 | clothing 163 | airplane 164 | staircase 165 | haircut 166 | window 167 | cart 168 | card 169 | ring 170 | sheep 171 | friend 172 | equipment 173 | tooth 174 | snack 175 | stair 176 | jeans 177 | fruit 178 | snowboarder 179 | log 180 | area 181 | lot 182 | pitcher 183 | bucket 184 | podium 185 | pool 186 | building 187 | gymnast 188 | fountain 189 | fence 190 | trunk 191 | soldier 192 | family 193 | grill 194 | tattoo 195 | food 196 | foot 197 | dirt 198 | base 199 | horse 200 | station 201 | bride 202 | scooter 203 | lake 204 | rope 205 | bikini 206 | camera 207 | game 208 | parade 209 | step 210 | block 211 | structure 212 | wave 213 | booth 214 | vehicle 215 | 
ride 216 | skirt 217 | costume 218 | broom 219 | skater 220 | slide 221 | umbrella 222 | art 223 | beard 224 | neck 225 | bed 226 | basket 227 | cellphone 228 | computer 229 | tent 230 | group 231 | jersey 232 | surface 233 | balloon 234 | fireman 235 | tongue 236 | carriage 237 | display 238 | balcony 239 | clothes 240 | net 241 | red-hair 242 | adult 243 | room 244 | roof 245 | deck 246 | keyboard 247 | crew 248 | meat 249 | meal 250 | hoodie 251 | hoop 252 | chef 253 | chest 254 | dog 255 | bat 256 | bar 257 | bag 258 | microscope 259 | wetsuit 260 | cane 261 | vegetable 262 | waterfall 263 | kid 264 | shoulder 265 | skateboarder 266 | magazine 267 | ship 268 | jacket 269 | father 270 | item 271 | makeup 272 | box 273 | boy 274 | kayak 275 | pink 276 | sword 277 | map 278 | mat 279 | man 280 | rock 281 | girl 282 | headband 283 | tractor 284 | track 285 | sunglass 286 | shop 287 | shoe 288 | corner 289 | seat 290 | doctor 291 | pan 292 | bottle 293 | audience 294 | nose 295 | sneaker 296 | knife 297 | road 298 | harness 299 | walkway 300 | field 301 | ribbon 302 | eye 303 | raft 304 | coat 305 | infant 306 | house 307 | fish 308 | flower 309 | pigeon 310 | paint 311 | leash 312 | park 313 | mountain 314 | couch 315 | individual 316 | restaurant 317 | dancer 318 | stripe 319 | cup 320 | glove 321 | cheerleader 322 | back 323 | mirror 324 | candle 325 | goods 326 | jockey 327 | opponent 328 | curb 329 | firetruck 330 | stroller 331 | mural 332 | trail 333 | forest 334 | sweatshirt 335 | yard 336 | skateboard 337 | gear 338 | beam 339 | puddle 340 | racket 341 | swimmer 342 | orange 343 | bull 344 | bench 345 | heel 346 | hair 347 | hose 348 | guard 349 | female 350 | firefighter 351 | bus 352 | ledge 353 | hiker 354 | motorcyclist 355 | bicycle 356 | street 357 | path 358 | luggage 359 | scaffolding 360 | phone 361 | drummer 362 | gate 363 | tourist 364 | sand 365 | outfits 366 | toy 367 | top 368 | tool 369 | bridge 370 | snow 371 | rider 372 | stool 373 | mud 374 | finger 375 | metal 376 | surfer 377 | beer 378 | microphone 379 | ocean 380 | mother 381 | laptop 382 | teenager 383 | officer 384 | ice 385 | cowboy 386 | head 387 | papers 388 | tie 389 | picture 390 | football 391 | policeman 392 | water 393 | baseball 394 | tire 395 | post 396 | piano 397 | performer 398 | figure 399 | platform 400 | wagon 401 | swing 402 | slope 403 | wood 404 | bicyclist 405 | guitar 406 | color 407 | pot 408 | pole 409 | teammate 410 | vest 411 | someone 412 | helmet 413 | bowl 414 | driver 415 | statue 416 | towel 417 | table 418 | stand 419 | garb 420 | grass 421 | dish 422 | woman 423 | fan 424 | saxophone 425 | sun 426 | flag 427 | stick 428 | pond 429 | court 430 | goal 431 | shore 432 | hill 433 | guy 434 | store 435 | surfboard 436 | cigarette 437 | arm 438 | outfit 439 | referee 440 | shirt 441 | machinery 442 | vendor 443 | clown 444 | cloth 445 | attire 446 | cyclist 447 | shovel 448 | duck 449 | stream 450 | musician 451 | something 452 | toddler 453 | light 454 | necklace 455 | van 456 | river 457 | sculpture 458 | class 459 | pipe 460 | ear 461 | pants 462 | wall 463 | motorbike 464 | member 465 | student 466 | collar 467 | spectator 468 | bandanna 469 | camel 470 | boot 471 | sky 472 | book 473 | ski 474 | leaf 475 | headphone 476 | cliff 477 | cake 478 | guitarist 479 | other 480 | branch 481 | -------------------------------------------------------------------------------- /demo.py: -------------------------------------------------------------------------------- 1 | from __future__ 
import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import torch 6 | import torch.nn as nn 7 | from torch.autograd import Variable 8 | import torch.optim as optim 9 | 10 | import numpy as np 11 | import time 12 | import os 13 | from six.moves import cPickle 14 | import torch.backends.cudnn as cudnn 15 | import yaml 16 | 17 | import opts 18 | import misc.eval_utils 19 | import misc.utils as utils 20 | import misc.AttModel as AttModel 21 | import yaml 22 | 23 | # from misc.rewards import get_self_critical_reward 24 | import torchvision.transforms as transforms 25 | import pdb 26 | import argparse 27 | import torch.nn.functional as F 28 | import matplotlib.pyplot as plt 29 | from PIL import Image 30 | plt.switch_backend('agg') 31 | import json 32 | def demo(opt): 33 | model.eval() 34 | ######################################################################################### 35 | # eval begins here 36 | ######################################################################################### 37 | data_iter_val = iter(dataloader_val) 38 | loss_temp = 0 39 | start = time.time() 40 | 41 | num_show = 0 42 | predictions = [] 43 | count = 0 44 | for step in range(1000): 45 | data = data_iter_val.next() 46 | img, iseq, gts_seq, num, proposals, bboxs, box_mask, img_id = data 47 | 48 | # if img_id[0] != 134688: 49 | # continue 50 | 51 | # # for i in range(proposals.size(1)): print(opt.itoc[proposals[0][i][4]], i) 52 | 53 | # # list1 = [6, 10] 54 | # list1 = [0, 1, 10, 2, 3, 4, 5, 6, 7, 8, 9] 55 | # proposals = proposals[:,list1] 56 | # num[0,1] = len(list1) 57 | proposals = proposals[:,:max(int(max(num[:,1])),1),:] 58 | 59 | input_imgs.data.resize_(img.size()).copy_(img) 60 | input_seqs.data.resize_(iseq.size()).copy_(iseq) 61 | gt_seqs.data.resize_(gts_seq.size()).copy_(gts_seq) 62 | input_num.data.resize_(num.size()).copy_(num) 63 | input_ppls.data.resize_(proposals.size()).copy_(proposals) 64 | gt_bboxs.data.resize_(bboxs.size()).copy_(bboxs) 65 | mask_bboxs.data.resize_(box_mask.size()).copy_(box_mask) 66 | input_imgs.data.resize_(img.size()).copy_(img) 67 | 68 | eval_opt = {'sample_max':1, 'beam_size': opt.beam_size, 'inference_mode' : True, 'tag_size' : opt.cbs_tag_size} 69 | seq, bn_seq, fg_seq, _, _, _ = model._sample(input_imgs, input_ppls, input_num, eval_opt) 70 | 71 | sents, det_idx, det_word = utils.decode_sequence_det(dataset_val.itow, dataset_val.itod, dataset_val.ltow, dataset_val.itoc, dataset_val.wtod, \ 72 | seq, bn_seq, fg_seq, opt.vocab_size, opt) 73 | 74 | if opt.dataset == 'flickr30k': 75 | im2show = Image.open(os.path.join(opt.image_path, '%d.jpg' % img_id[0])).convert('RGB') 76 | else: 77 | 78 | if os.path.isfile(os.path.join(opt.image_path, 'val2014/COCO_val2014_%012d.jpg' % img_id[0])): 79 | im2show = Image.open(os.path.join(opt.image_path, 'val2014/COCO_val2014_%012d.jpg' % img_id[0])).convert('RGB') 80 | else: 81 | im2show = Image.open(os.path.join(opt.image_path, 'train2014/COCO_train2014_%012d.jpg' % img_id[0])).convert('RGB') 82 | 83 | w, h = im2show.size 84 | 85 | rest_idx = [] 86 | for i in range(proposals[0].shape[0]): 87 | if i not in det_idx: 88 | rest_idx.append(i) 89 | 90 | 91 | if len(det_idx) > 0: 92 | # for visulization 93 | proposals = proposals[0].numpy() 94 | proposals[:,0] = proposals[:,0] * w / float(opt.image_crop_size) 95 | proposals[:,2] = proposals[:,2] * w / float(opt.image_crop_size) 96 | proposals[:,1] = proposals[:,1] * h / float(opt.image_crop_size) 97 | proposals[:,3] = proposals[:,3] * h / 
float(opt.image_crop_size) 98 | 99 | cls_dets = proposals[det_idx] 100 | rest_dets = proposals[rest_idx] 101 | 102 | # fig = plt.figure() 103 | # fig = plt.figure(frameon=False) 104 | # ax = plt.Axes(fig, [0., 0., 1., 1.]) 105 | fig = plt.figure(frameon=False) 106 | # fig.set_size_inches(5,5*h/w) 107 | ax = plt.Axes(fig, [0., 0., 1., 1.]) 108 | ax.set_axis_off() 109 | fig.add_axes(ax) 110 | a=fig.gca() 111 | a.set_frame_on(False) 112 | a.set_xticks([]); a.set_yticks([]) 113 | plt.axis('off') 114 | plt.xlim(0,w); plt.ylim(h,0) 115 | # fig, ax = plt.subplots(1) 116 | 117 | # show other box in grey. 118 | 119 | plt.imshow(im2show) 120 | 121 | if len(rest_idx) > 0: 122 | for i in range(len(rest_dets)): 123 | ax = utils.vis_detections(ax, dataset_val.itoc[int(rest_dets[i,4])], rest_dets[i,:5], i, 1) 124 | 125 | if len(det_idx) > 0: 126 | for i in range(len(cls_dets)): 127 | ax = utils.vis_detections(ax, dataset_val.itoc[int(cls_dets[i,4])], cls_dets[i,:5], i, 0) 128 | 129 | # plt.axis('off') 130 | # plt.axis('tight') 131 | # plt.tight_layout() 132 | fig.savefig('visu/%d.jpg' %(img_id[0]), bbox_inches='tight', pad_inches=0, dpi=150) 133 | print(str(img_id[0]) + ': ' + sents[0]) 134 | 135 | entry = {'image_id': img_id[0], 'caption': sents[0]} 136 | predictions.append(entry) 137 | 138 | return predictions 139 | #################################################################################### 140 | # Main 141 | #################################################################################### 142 | # initialize the data holder. 143 | if __name__ == '__main__': 144 | 145 | parser = argparse.ArgumentParser() 146 | parser.add_argument('--start_from', type=str, default='', help='') 147 | parser.add_argument('--load_best_score', type=int, default=1, 148 | help='Do we load previous best score when resuming training.') 149 | parser.add_argument('--id', type=str, default='', 150 | help='an id identifying this run/job. used in cross-val and appended when writing progress files') 151 | parser.add_argument('--image_path', type=str, default='/home/jiasen/data/coco/images/', 152 | help='path to the h5file containing the image data') 153 | parser.add_argument('--cbs', type=bool, default=False, 154 | help='whether use constraint beam search.') 155 | parser.add_argument('--cbs_tag_size', type=int, default=3, 156 | help='whether use constraint beam search.') 157 | parser.add_argument('--cbs_mode', type=str, default='all', 158 | help='which cbs mode to use in the decoding stage. 
cbs_mode: all|unique|novel') 159 | parser.add_argument('--det_oracle', type=bool, default=False, 160 | help='whether use oracle bounding box.') 161 | parser.add_argument('--cnn_backend', type=str, default='res101', 162 | help='res101 or vgg16') 163 | parser.add_argument('--data_path', type=str, default='') 164 | parser.add_argument('--beam_size', type=int, default=1) 165 | 166 | args = parser.parse_args() 167 | 168 | infos = {} 169 | histories = {} 170 | if args.start_from is not None: 171 | if args.load_best_score == 1: 172 | model_path = os.path.join(args.start_from, 'model-best.pth') 173 | info_path = os.path.join(args.start_from, 'infos_'+args.id+'-best.pkl') 174 | else: 175 | model_path = os.path.join(args.start_from, 'model.pth') 176 | info_path = os.path.join(args.start_from, 'infos_'+args.id+'.pkl') 177 | 178 | # open old infos and check if models are compatible 179 | with open(info_path) as f: 180 | infos = cPickle.load(f) 181 | opt = infos['opt'] 182 | opt.image_path = args.image_path 183 | opt.cbs = args.cbs 184 | opt.cbs_tag_size = args.cbs_tag_size 185 | opt.cbs_mode = args.cbs_mode 186 | opt.det_oracle = args.det_oracle 187 | opt.cnn_backend = args.cnn_backend 188 | opt.data_path = args.data_path 189 | opt.beam_size = args.beam_size 190 | else: 191 | print("please specify the model path...") 192 | pdb.set_trace() 193 | 194 | cudnn.benchmark = True 195 | 196 | if opt.dataset == 'flickr30k': 197 | from misc.dataloader_flickr30k import DataLoader 198 | else: 199 | from misc.dataloader_coco import DataLoader 200 | 201 | 202 | #################################################################################### 203 | # Data Loader 204 | #################################################################################### 205 | dataset_val = DataLoader(opt, split='test') 206 | dataloader_val = torch.utils.data.DataLoader(dataset_val, batch_size=1, 207 | shuffle=False, num_workers=0) 208 | 209 | input_imgs = torch.FloatTensor(1) 210 | input_seqs = torch.LongTensor(1) 211 | input_ppls = torch.FloatTensor(1) 212 | gt_bboxs = torch.FloatTensor(1) 213 | mask_bboxs = torch.ByteTensor(1) 214 | gt_seqs = torch.LongTensor(1) 215 | input_num = torch.LongTensor(1) 216 | 217 | if opt.cuda: 218 | input_imgs = input_imgs.cuda() 219 | input_seqs = input_seqs.cuda() 220 | gt_seqs = gt_seqs.cuda() 221 | input_num = input_num.cuda() 222 | input_ppls = input_ppls.cuda() 223 | gt_bboxs = gt_bboxs.cuda() 224 | mask_bboxs = mask_bboxs.cuda() 225 | 226 | input_imgs = Variable(input_imgs) 227 | input_seqs = Variable(input_seqs) 228 | gt_seqs = Variable(gt_seqs) 229 | input_num = Variable(input_num) 230 | input_ppls = Variable(input_ppls) 231 | gt_bboxs = Variable(gt_bboxs) 232 | mask_bboxs = Variable(mask_bboxs) 233 | 234 | #################################################################################### 235 | # Build the Model 236 | #################################################################################### 237 | opt.vocab_size = dataset_val.vocab_size 238 | opt.detect_size = dataset_val.detect_size 239 | opt.seq_length = opt.seq_length 240 | opt.fg_size = dataset_val.fg_size 241 | opt.fg_mask = torch.from_numpy(dataset_val.fg_mask).byte() 242 | opt.glove_fg = torch.from_numpy(dataset_val.glove_fg).float() 243 | opt.glove_clss = torch.from_numpy(dataset_val.glove_clss).float() 244 | opt.st2towidx = torch.from_numpy(dataset_val.st2towidx).long() 245 | 246 | opt.itow = dataset_val.itow 247 | opt.itod = dataset_val.itod 248 | opt.ltow = dataset_val.ltow 249 | opt.itoc = dataset_val.itoc 250 | 251 
| pdb.set_trace() 252 | if opt.att_model == 'topdown': 253 | model = AttModel.TopDownModel(opt) 254 | elif opt.att_model == 'att2in2': 255 | model = AttModel.Att2in2Model(opt) 256 | 257 | if opt.decode_noc: 258 | model._reinit_word_weight(opt, dataset_val.ctoi, dataset_val.wtoi) 259 | 260 | if args.start_from != None: 261 | # opt.learning_rate = saved_model_opt.learning_rate 262 | print('Loading the model %s...' %(model_path)) 263 | model.load_state_dict(torch.load(model_path)) 264 | if os.path.isfile(os.path.join(args.start_from, 'histories_'+opt.id+'.pkl')): 265 | with open(os.path.join(args.start_from, 'histories_'+opt.id+'.pkl')) as f: 266 | histories = cPickle.load(f) 267 | 268 | if opt.cuda: 269 | model.cuda() 270 | 271 | predictions = demo(opt) 272 | 273 | print('saving...') 274 | json.dump(predictions, open('visu.json', 'w')) 275 | 276 | -------------------------------------------------------------------------------- /demo/img1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/demo/img1.png -------------------------------------------------------------------------------- /demo/img2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/demo/img2.png -------------------------------------------------------------------------------- /generate_robust_split.py: -------------------------------------------------------------------------------- 1 | # import _init_paths 2 | import copy 3 | import json 4 | import operator 5 | from random import seed, shuffle 6 | 7 | import numpy as np 8 | from six.moves import xrange 9 | 10 | from pycocotools.coco import COCO 11 | 12 | 13 | def get_det_word(bbox_ann, captions, wtoi, wtod, dtoi, wtol, ngram=2): 14 | 15 | # get the present category. 16 | pcats = [box['label'] for box in bbox_ann] 17 | 18 | # get the orginial form of the caption. 19 | indicator = [] 20 | stem_caption = [] 21 | for s in captions: 22 | tmp = [] 23 | for w in s: 24 | if w in wtol: 25 | tmp.append(wtol[w]) 26 | else: 27 | tmp.append(w) 28 | 29 | stem_caption.append(tmp) 30 | indicator.append([(0, 0, 0)]*len(s)) # category class, binary class, fine-grain class. 31 | 32 | ngram_indicator = {i+1:copy.deepcopy(indicator) for i in range(ngram)} 33 | # get the 2 gram of the caption. 34 | for n in range(ngram,0,-1): 35 | for i, s in enumerate(stem_caption): 36 | for j in xrange(len(s)-n+1): 37 | ng = ' '.join(s[j:j+n]) 38 | # if the n-gram exist in word_to_detection dictionary. 39 | if ng in wtod and indicator[i][j][0] == 0 and wtod[ng] in pcats: # make sure that larger gram not overwright with lower gram. 
40 | bn = (ng != ' '.join(captions[i][j:j+n])) + 1 41 | fg = dtoi[ng] 42 | ngram_indicator[n][i][j] = (wtod[ng], bn, fg) 43 | indicator[i][j:j+n] = [(wtod[ng], bn, fg)] * n 44 | 45 | return ngram_indicator 46 | 47 | def get_stats(imgs, wtoi, wtod, dtoi, wtol, ctol, coco_det_train, coco_det_val): 48 | 49 | train_matrix = np.zeros((len(wtod),len(wtod))) 50 | test_matrix = np.zeros((len(wtod),len(wtod))) 51 | test_num = 0 52 | coco_stats = [] 53 | 54 | for idx, img in enumerate(imgs): 55 | 56 | image_id = info['images'][idx]['id'] 57 | file_path = info['images'][idx]['file_path'].split('/')[0] 58 | 59 | if file_path == 'train2014': 60 | coco = coco_det_train 61 | else: 62 | coco = coco_det_val 63 | bbox_ann_ids = coco.getAnnIds(imgIds=image_id) 64 | bbox_ann = [{'label': ctol[i['category_id']], 'bbox': i['bbox']} for i in coco.loadAnns(bbox_ann_ids)] 65 | captions = [] 66 | for sent in img['sentences']: 67 | captions.append(sent['tokens']) 68 | det_indicator = get_det_word(bbox_ann, captions, wtoi, wtod, dtoi, wtol) 69 | 70 | present_clss = [] 71 | 72 | for i, caption in enumerate(captions): 73 | for j in range(len(caption)): 74 | for n in range(2, 0, -1): 75 | if det_indicator[n][i][j][0] != 0 and det_indicator[n][i][j][0] not in present_clss: 76 | present_clss.append(det_indicator[n][i][j][0]) 77 | coco_stats.append({'pclss':present_clss, 'image_id':image_id}) 78 | 79 | return coco_stats 80 | 81 | imgs = json.load(open('data/robust_coco_creation/dataset_coco.json', 'r')) 82 | 83 | det_train_path = 'data/robust_coco_creation/annotations/instances_train2014.json' 84 | det_val_path = 'data/robust_coco_creation/annotations/instances_val2014.json' 85 | 86 | coco_det_train = COCO(det_train_path) 87 | coco_det_val = COCO(det_val_path) 88 | 89 | info = json.load(open('data/robust_coco_creation/dic_coco.json', 'r')) 90 | itow = info['ix_to_word'] 91 | wtoi = {w:i for i,w in itow.items()} 92 | wtod = {w:i+1 for w,i in info['wtod'].items()} # word to detection 93 | dtoi = {w:i+1 for i,w in enumerate(wtod.keys())} # detection to index 94 | wtol = info['wtol'] 95 | ctol = {c:i+1 for i, c in enumerate(coco_det_train.cats.keys())} 96 | imgs = imgs['images'] 97 | coco_stats = get_stats(imgs, wtoi, wtod, dtoi, wtol, ctol, coco_det_train, coco_det_val) 98 | class_total = np.zeros(80) 99 | # get the sum for each category. 100 | for img in coco_stats: 101 | img['pclss'] = [i-1 for i in img['pclss']] 102 | for idx in img['pclss']: 103 | class_total[idx] += 1 104 | 105 | json.dump(coco_stats, open('coco_obj_stats.json', 'w')) 106 | pair_list = {} 107 | for img in coco_stats: 108 | for i in range(len(img['pclss'])): 109 | for j in range(len(img['pclss'])): 110 | if i != j: 111 | idx_i = img['pclss'][i] 112 | idx_j = img['pclss'][j] 113 | if idx_i < idx_j: 114 | idx_ij = (idx_i, idx_j) 115 | else: 116 | idx_ij = (idx_j, idx_i) 117 | if idx_ij not in pair_list: 118 | pair_list[idx_ij] = 0 119 | else: 120 | pair_list[idx_ij] += 1 121 | 122 | pair_list_sort = sorted(pair_list.items(), key=operator.itemgetter(1)) 123 | 124 | pair_list = [] 125 | for pair in pair_list_sort: 126 | pair_list.append([pair[0][0], pair[0][1], pair[1]]) 127 | 128 | # for each pair, go throughall the images 129 | testing_total = np.zeros(80) 130 | test_pair = [] 131 | count = 0 132 | test_img_num = 0 133 | for pair in pair_list: 134 | tmp_num = 0 135 | testing_total_copy = copy.deepcopy(testing_total) 136 | for img in coco_stats: 137 | if pair[0] in img['pclss'] and pair[1] in img['pclss']: 138 | # also accumulate other class. 
139 | for idx in img['pclss']: 140 | testing_total_copy[idx] += 1 141 | tmp_num += 1 142 | 143 | # if the testing data exceed half of the total data, don't count this pair. 144 | drop_flag = False 145 | for i in range(80): 146 | if testing_total_copy[i] > (class_total[i] / 2): 147 | drop_flag = True 148 | print("drop pair " + str(pair[0]) + '_' + str(pair[1])) 149 | break 150 | 151 | if drop_flag == False: 152 | test_pair.append(pair) 153 | testing_total = copy.deepcopy(testing_total_copy) 154 | test_img_num += tmp_num 155 | 156 | count += 1 157 | print(count, test_img_num) 158 | if test_img_num > 15000: 159 | break 160 | 161 | print('saving the test pair list....') 162 | json.dump(test_pair, open('test_pair_list.json', 'w')) 163 | 164 | test_pair_dic = {} 165 | for pair in test_pair: 166 | test_pair_dic[str(pair[0])+'_'+str(pair[1])] = 0 167 | 168 | train_img_id = [] 169 | test_img_id = [] 170 | for img in coco_stats: 171 | present_clss = img['pclss'] 172 | 173 | # generate the pair. 174 | tmp = [] 175 | for i in range(len(present_clss)): 176 | for j in range(len(present_clss)): 177 | if i != j: 178 | tmp.append(str(present_clss[i]) + '_' + str(present_clss[j])) 179 | 180 | test_flag = False 181 | for i in tmp: 182 | if i in test_pair_dic: 183 | test_flag = True 184 | if test_flag == True: 185 | test_img_id.append({'img_id': img['image_id']}) 186 | else: 187 | train_img_id.append({'img_id': img['image_id']}) 188 | 189 | seed(123) # make reproducible 190 | shuffle(test_img_id) # shuffle the order 191 | 192 | num_val = int(0.3 * len(test_img_id)) 193 | 194 | train_id = train_img_id 195 | val_id = test_img_id[:num_val] 196 | test_id = test_img_id[num_val:] 197 | 198 | print("train, val, test", len(train_id), len(val_id), len(test_id)) 199 | robust_split = {'train_id':train_id, 'val_id':val_id, 'test_id':test_id} 200 | json.dump(robust_split, open('split_robust_coco.json', 'w')) 201 | -------------------------------------------------------------------------------- /misc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/misc/__init__.py -------------------------------------------------------------------------------- /misc/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/misc/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /misc/__pycache__/eval_utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/misc/__pycache__/eval_utils.cpython-36.pyc -------------------------------------------------------------------------------- /misc/bbox_transform.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | # -------------------------------------------------------- 8 | # Reorganized and modified by Jianwei Yang and Jiasen Lu 9 | # -------------------------------------------------------- 10 
| 11 | import torch 12 | import numpy as np 13 | import pdb 14 | 15 | def bbox_transform(ex_rois, gt_rois): 16 | ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + 1.0 17 | ex_heights = ex_rois[:, 3] - ex_rois[:, 1] + 1.0 18 | ex_ctr_x = ex_rois[:, 0] + 0.5 * ex_widths 19 | ex_ctr_y = ex_rois[:, 1] + 0.5 * ex_heights 20 | 21 | gt_widths = gt_rois[:, 2] - gt_rois[:, 0] + 1.0 22 | gt_heights = gt_rois[:, 3] - gt_rois[:, 1] + 1.0 23 | gt_ctr_x = gt_rois[:, 0] + 0.5 * gt_widths 24 | gt_ctr_y = gt_rois[:, 1] + 0.5 * gt_heights 25 | 26 | targets_dx = (gt_ctr_x - ex_ctr_x) / ex_widths 27 | targets_dy = (gt_ctr_y - ex_ctr_y) / ex_heights 28 | targets_dw = torch.log(gt_widths / ex_widths) 29 | targets_dh = torch.log(gt_heights / ex_heights) 30 | 31 | targets = torch.stack( 32 | (targets_dx, targets_dy, targets_dw, targets_dh),1) 33 | 34 | return targets 35 | 36 | def bbox_transform_batch(ex_rois, gt_rois): 37 | 38 | if ex_rois.dim() == 2: 39 | ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + 1.0 40 | ex_heights = ex_rois[:, 3] - ex_rois[:, 1] + 1.0 41 | ex_ctr_x = ex_rois[:, 0] + 0.5 * ex_widths 42 | ex_ctr_y = ex_rois[:, 1] + 0.5 * ex_heights 43 | 44 | gt_widths = gt_rois[:, :, 2] - gt_rois[:, :, 0] + 1.0 45 | gt_heights = gt_rois[:, :, 3] - gt_rois[:, :, 1] + 1.0 46 | gt_ctr_x = gt_rois[:, :, 0] + 0.5 * gt_widths 47 | gt_ctr_y = gt_rois[:, :, 1] + 0.5 * gt_heights 48 | 49 | targets_dx = (gt_ctr_x - ex_ctr_x.view(1,-1).expand_as(gt_ctr_x)) / ex_widths 50 | targets_dy = (gt_ctr_y - ex_ctr_y.view(1,-1).expand_as(gt_ctr_y)) / ex_heights 51 | targets_dw = torch.log(gt_widths / ex_widths.view(1,-1).expand_as(gt_widths)) 52 | targets_dh = torch.log(gt_heights / ex_heights.view(1,-1).expand_as(gt_heights)) 53 | 54 | elif ex_rois.dim() == 3: 55 | ex_widths = ex_rois[:, :, 2] - ex_rois[:, :, 0] + 1.0 56 | ex_heights = ex_rois[:,:, 3] - ex_rois[:,:, 1] + 1.0 57 | ex_ctr_x = ex_rois[:, :, 0] + 0.5 * ex_widths 58 | ex_ctr_y = ex_rois[:, :, 1] + 0.5 * ex_heights 59 | 60 | gt_widths = gt_rois[:, :, 2] - gt_rois[:, :, 0] + 1.0 61 | gt_heights = gt_rois[:, :, 3] - gt_rois[:, :, 1] + 1.0 62 | gt_ctr_x = gt_rois[:, :, 0] + 0.5 * gt_widths 63 | gt_ctr_y = gt_rois[:, :, 1] + 0.5 * gt_heights 64 | 65 | targets_dx = (gt_ctr_x - ex_ctr_x) / ex_widths 66 | targets_dy = (gt_ctr_y - ex_ctr_y) / ex_heights 67 | targets_dw = torch.log(gt_widths / ex_widths) 68 | targets_dh = torch.log(gt_heights / ex_heights) 69 | else: 70 | raise ValueError('ex_roi input dimension is not correct.') 71 | 72 | targets = torch.stack( 73 | (targets_dx, targets_dy, targets_dw, targets_dh),2) 74 | 75 | return targets 76 | 77 | def bbox_transform_inv(boxes, deltas, batch_size): 78 | widths = boxes[:, :, 2] - boxes[:, :, 0] + 1.0 79 | heights = boxes[:, :, 3] - boxes[:, :, 1] + 1.0 80 | ctr_x = boxes[:, :, 0] + 0.5 * widths 81 | ctr_y = boxes[:, :, 1] + 0.5 * heights 82 | 83 | dx = deltas[:, :, 0::4] 84 | dy = deltas[:, :, 1::4] 85 | dw = deltas[:, :, 2::4] 86 | dh = deltas[:, :, 3::4] 87 | 88 | pred_ctr_x = dx * widths.unsqueeze(2) + ctr_x.unsqueeze(2) 89 | pred_ctr_y = dy * heights.unsqueeze(2) + ctr_y.unsqueeze(2) 90 | pred_w = np.exp(dw) * widths.unsqueeze(2) 91 | pred_h = np.exp(dh) * heights.unsqueeze(2) 92 | 93 | pred_boxes = deltas.clone() 94 | # x1 95 | pred_boxes[:, :, 0::4] = pred_ctr_x - 0.5 * pred_w 96 | # y1 97 | pred_boxes[:, :, 1::4] = pred_ctr_y - 0.5 * pred_h 98 | # x2 99 | pred_boxes[:, :, 2::4] = pred_ctr_x + 0.5 * pred_w 100 | # y2 101 | pred_boxes[:, :, 3::4] = pred_ctr_y + 0.5 * pred_h 102 | 103 | return pred_boxes 104 | 105 | def 
clip_boxes_batch(boxes, im_shape, batch_size): 106 | """ 107 | Clip boxes to image boundaries. 108 | """ 109 | num_rois = boxes.size(1) 110 | 111 | boxes[boxes < 0] = 0 112 | # batch_x = (im_shape[:,0]-1).view(batch_size, 1).expand(batch_size, num_rois) 113 | # batch_y = (im_shape[:,1]-1).view(batch_size, 1).expand(batch_size, num_rois) 114 | 115 | batch_x = im_shape[:, 1] - 1 116 | batch_y = im_shape[:, 0] - 1 117 | 118 | boxes[:,:,0][boxes[:,:,0] > batch_x] = batch_x 119 | boxes[:,:,1][boxes[:,:,1] > batch_y] = batch_y 120 | boxes[:,:,2][boxes[:,:,2] > batch_x] = batch_x 121 | boxes[:,:,3][boxes[:,:,3] > batch_y] = batch_y 122 | 123 | return boxes 124 | 125 | def clip_boxes(boxes, im_shape, batch_size): 126 | 127 | for i in range(batch_size): 128 | boxes[i,:,0::4].clamp_(0, im_shape[i, 1]-1) 129 | boxes[i,:,1::4].clamp_(0, im_shape[i, 0]-1) 130 | boxes[i,:,2::4].clamp_(0, im_shape[i, 1]-1) 131 | boxes[i,:,3::4].clamp_(0, im_shape[i, 0]-1) 132 | 133 | return boxes 134 | 135 | 136 | def bbox_overlaps(anchors, gt_boxes): 137 | """ 138 | anchors: (N, 4) ndarray of float 139 | gt_boxes: (K, 4) ndarray of float 140 | 141 | overlaps: (N, K) ndarray of overlap between boxes and query_boxes 142 | """ 143 | N = anchors.size(0) 144 | K = gt_boxes.size(0) 145 | 146 | gt_boxes_area = ((gt_boxes[:,2] - gt_boxes[:,0] + 1) * 147 | (gt_boxes[:,3] - gt_boxes[:,1] + 1)).view(1, K) 148 | 149 | anchors_area = ((anchors[:,2] - anchors[:,0] + 1) * 150 | (anchors[:,3] - anchors[:,1] + 1)).view(N, 1) 151 | 152 | boxes = anchors.view(N, 1, 4).expand(N, K, 4) 153 | query_boxes = gt_boxes.view(1, K, 4).expand(N, K, 4) 154 | 155 | iw = (torch.min(boxes[:,:,2], query_boxes[:,:,2]) - 156 | torch.max(boxes[:,:,0], query_boxes[:,:,0]) + 1) 157 | iw[iw < 0] = 0 158 | 159 | ih = (torch.min(boxes[:,:,3], query_boxes[:,:,3]) - 160 | torch.max(boxes[:,:,1], query_boxes[:,:,1]) + 1) 161 | ih[ih < 0] = 0 162 | 163 | ua = anchors_area + gt_boxes_area - (iw * ih) 164 | overlaps = iw * ih / ua 165 | 166 | return overlaps 167 | 168 | def bbox_overlaps_batch(anchors, gt_boxes): 169 | """ 170 | anchors: (N, 4) ndarray of float 171 | gt_boxes: (b, K, 5) ndarray of float 172 | 173 | overlaps: (N, K) ndarray of overlap between boxes and query_boxes 174 | """ 175 | batch_size = gt_boxes.size(0) 176 | 177 | 178 | if anchors.dim() == 2: 179 | 180 | N = anchors.size(0) 181 | K = gt_boxes.size(1) 182 | 183 | anchors = anchors.view(1, N, 4).expand(batch_size, N, 4).contiguous() 184 | gt_boxes = gt_boxes[:,:,:4].contiguous() 185 | 186 | 187 | gt_boxes_x = (gt_boxes[:,:,2] - gt_boxes[:,:,0] + 1) 188 | gt_boxes_y = (gt_boxes[:,:,3] - gt_boxes[:,:,1] + 1) 189 | gt_boxes_area = (gt_boxes_x * gt_boxes_y).view(batch_size, 1, K) 190 | 191 | anchors_boxes_x = (anchors[:,:,2] - anchors[:,:,0] + 1) 192 | anchors_boxes_y = (anchors[:,:,3] - anchors[:,:,1] + 1) 193 | anchors_area = (anchors_boxes_x * anchors_boxes_y).view(batch_size, N, 1) 194 | 195 | gt_area_zero = (gt_boxes_x == 1) & (gt_boxes_y == 1) 196 | anchors_area_zero = (anchors_boxes_x == 1) & (anchors_boxes_y == 1) 197 | 198 | boxes = anchors.view(batch_size, N, 1, 4).expand(batch_size, N, K, 4) 199 | query_boxes = gt_boxes.view(batch_size, 1, K, 4).expand(batch_size, N, K, 4) 200 | 201 | iw = (torch.min(boxes[:,:,:,2], query_boxes[:,:,:,2]) - 202 | torch.max(boxes[:,:,:,0], query_boxes[:,:,:,0]) + 1) 203 | iw[iw < 0] = 0 204 | 205 | ih = (torch.min(boxes[:,:,:,3], query_boxes[:,:,:,3]) - 206 | torch.max(boxes[:,:,:,1], query_boxes[:,:,:,1]) + 1) 207 | ih[ih < 0] = 0 208 | ua = anchors_area + 
gt_boxes_area - (iw * ih) 209 | overlaps = iw * ih / ua 210 | 211 | # mask the overlap here. 212 | overlaps.masked_fill_(gt_area_zero.view(batch_size, 1, K).expand(batch_size, N, K), 0) 213 | overlaps.masked_fill_(anchors_area_zero.view(batch_size, N, 1).expand(batch_size, N, K), -1) 214 | 215 | elif anchors.dim() == 3: 216 | N = anchors.size(1) 217 | K = gt_boxes.size(1) 218 | 219 | if anchors.size(2) == 4: 220 | anchors = anchors[:,:,:4].contiguous() 221 | else: 222 | anchors = anchors[:,:,1:5].contiguous() 223 | 224 | gt_boxes = gt_boxes[:,:,:4].contiguous() 225 | 226 | gt_boxes_x = (gt_boxes[:,:,2] - gt_boxes[:,:,0] + 1) 227 | gt_boxes_y = (gt_boxes[:,:,3] - gt_boxes[:,:,1] + 1) 228 | gt_boxes_area = (gt_boxes_x * gt_boxes_y).view(batch_size, 1, K) 229 | 230 | anchors_boxes_x = (anchors[:,:,2] - anchors[:,:,0] + 1) 231 | anchors_boxes_y = (anchors[:,:,3] - anchors[:,:,1] + 1) 232 | anchors_area = (anchors_boxes_x * anchors_boxes_y).view(batch_size, N, 1) 233 | 234 | gt_area_zero = (gt_boxes_x == 1) & (gt_boxes_y == 1) 235 | anchors_area_zero = (anchors_boxes_x == 1) & (anchors_boxes_y == 1) 236 | 237 | boxes = anchors.view(batch_size, N, 1, 4).expand(batch_size, N, K, 4) 238 | query_boxes = gt_boxes.view(batch_size, 1, K, 4).expand(batch_size, N, K, 4) 239 | 240 | iw = (torch.min(boxes[:,:,:,2], query_boxes[:,:,:,2]) - 241 | torch.max(boxes[:,:,:,0], query_boxes[:,:,:,0]) + 1) 242 | iw[iw < 0] = 0 243 | 244 | ih = (torch.min(boxes[:,:,:,3], query_boxes[:,:,:,3]) - 245 | torch.max(boxes[:,:,:,1], query_boxes[:,:,:,1]) + 1) 246 | ih[ih < 0] = 0 247 | ua = anchors_area + gt_boxes_area - (iw * ih) 248 | 249 | overlaps = iw * ih / ua 250 | 251 | # mask the overlap here. 252 | overlaps.masked_fill_(gt_area_zero.view(batch_size, 1, K).expand(batch_size, N, K), 0) 253 | overlaps.masked_fill_(anchors_area_zero.view(batch_size, N, 1).expand(batch_size, N, K), -1) 254 | else: 255 | raise ValueError('anchors input dimension is not correct.') 256 | 257 | return overlaps 258 | 259 | -------------------------------------------------------------------------------- /misc/dataloader_hdf.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import h5py 4 | from torch.utils.data import Dataset 5 | 6 | 7 | class HDFShardDataset(Dataset): 8 | def __init__(self, shard_dir, shard_names=None, primary_key=None, stride=1): 9 | super().__init__() 10 | self.shard_dir = shard_dir 11 | self.shard_names = shard_names 12 | if not shard_names: 13 | self.shard_names = sorted(os.listdir(shard_dir)) 14 | self.primary_key = self.__primary_key(primary_key) 15 | self.stride = stride 16 | 17 | # length is expressed as per items, not rows (#items * stride = #rows) 18 | self.shard_len, self.dataset_len = self.__shard_len_dataset_len() 19 | 20 | def __len__(self): 21 | return self.dataset_len 22 | 23 | def __getitem__(self, idx): 24 | shard_num = idx // self.shard_len 25 | idx -= shard_num * self.shard_len 26 | nth_shard = h5py.File(os.path.join(self.shard_dir, self.shard_names[shard_num]), 'r') 27 | keys = list(nth_shard.keys()) 28 | item = {} 29 | for key in keys: 30 | item[key] = nth_shard[key][idx * self.stride : (idx + 1) * self.stride] 31 | nth_shard.close() 32 | return item 33 | 34 | def __primary_key(self, primary_key): 35 | first_shard = h5py.File(os.path.join(self.shard_dir, self.shard_names[0]), 'r') 36 | if not primary_key: 37 | primary_key = list(first_shard.keys())[0] 38 | first_shard.close() 39 | return primary_key 40 | 41 | def __shard_len_dataset_len(self): 42 
| # check number of items per shard by opening one shard 43 | # check remainder number of items in last shard 44 | first_shard = h5py.File(os.path.join(self.shard_dir, self.shard_names[0]), 'r') 45 | last_shard = h5py.File(os.path.join(self.shard_dir, self.shard_names[-1]), 'r') 46 | rows_per_shard = len(first_shard[self.primary_key]) 47 | rows_per_last_shard = len(last_shard[self.primary_key]) 48 | 49 | dataset_len = rows_per_shard * (len(self.shard_names) - 1) // self.stride 50 | dataset_len += rows_per_last_shard // self.stride 51 | shard_len = rows_per_shard // self.stride 52 | first_shard.close() 53 | last_shard.close() 54 | return shard_len, dataset_len 55 | 56 | 57 | class HDFSingleDataset(HDFShardDataset): 58 | def __init__(self, hdf_path, primary_key=None, stride=1): 59 | super().__init__( 60 | os.path.dirname(hdf_path), 61 | shard_names=[os.path.basename(hdf_path)], 62 | primary_key=primary_key, 63 | stride=stride, 64 | ) 65 | -------------------------------------------------------------------------------- /misc/eval_utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import torch 6 | import torch.nn as nn 7 | from torch.autograd import Variable 8 | 9 | import numpy as np 10 | import json 11 | from json import encoder 12 | import random 13 | import string 14 | import time 15 | import os 16 | import sys 17 | import misc.utils as utils 18 | 19 | def language_eval(dataset, preds, model_id, split): 20 | import sys 21 | sys.path.append("coco-caption") 22 | annFile = 'coco-caption/annotations/captions_val2014.json' 23 | from pycocotools.coco import COCO 24 | from pycocoevalcap.eval import COCOEvalCap 25 | 26 | encoder.FLOAT_REPR = lambda o: format(o, '.3f') 27 | 28 | if not os.path.isdir('eval_results'): 29 | os.mkdir('eval_results') 30 | cache_path = os.path.join('eval_results/', model_id + '_' + split + '.json') 31 | 32 | coco = COCO(annFile) 33 | valids = coco.getImgIds() 34 | 35 | # filter results to only those in MSCOCO validation set (will be about a third) 36 | preds_filt = [p for p in preds if p['image_id'] in valids] 37 | print('using %d/%d predictions' % (len(preds_filt), len(preds))) 38 | json.dump(preds_filt, open(cache_path, 'w')) # serialize to temporary json file. Sigh, COCO API... 
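    # Each entry of `preds` is a dict of the form
    # {'image_id': <COCO image id>, 'caption': <generated sentence>} (built in eval_split below);
    # loadRes() re-reads the cached file and COCOEvalCap scores it against the val2014 references.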
39 | 40 | cocoRes = coco.loadRes(cache_path) 41 | cocoEval = COCOEvalCap(coco, cocoRes) 42 | cocoEval.params['image_id'] = cocoRes.getImgIds() 43 | cocoEval.evaluate() 44 | 45 | # create output dictionary 46 | out = {} 47 | for metric, score in cocoEval.eval.items(): 48 | out[metric] = score 49 | 50 | imgToEval = cocoEval.imgToEval 51 | for p in preds_filt: 52 | image_id, caption = p['image_id'], p['caption'] 53 | imgToEval[image_id]['caption'] = caption 54 | with open(cache_path, 'w') as outfile: 55 | json.dump({'overall': out, 'imgToEval': imgToEval}, outfile) 56 | 57 | return out 58 | 59 | def eval_split(model, crit, loader, eval_kwargs={}): 60 | verbose = eval_kwargs.get('verbose', True) 61 | num_images = eval_kwargs.get('num_images', eval_kwargs.get('val_images_use', -1)) 62 | split = eval_kwargs.get('split', 'val') 63 | lang_eval = eval_kwargs.get('language_eval', 0) 64 | dataset = eval_kwargs.get('dataset', 'coco') 65 | beam_size = eval_kwargs.get('beam_size', 1) 66 | 67 | # Make sure in the evaluation mode 68 | model.eval() 69 | 70 | loader.reset_iterator(split) 71 | 72 | n = 0 73 | loss = 0 74 | loss_sum = 0 75 | loss_evals = 1e-8 76 | predictions = [] 77 | while True: 78 | data = loader.get_batch(split) 79 | n = n + loader.batch_size 80 | 81 | if data.get('labels', None) is not None: 82 | # forward the model to get loss 83 | tmp = [data['fc_feats'], data['att_feats'], data['labels'], data['masks']] 84 | tmp = [Variable(torch.from_numpy(_), volatile=True).cuda() for _ in tmp] 85 | fc_feats, att_feats, labels, masks = tmp 86 | 87 | loss = crit(model(fc_feats, att_feats, labels), labels[:,1:], masks[:,1:]).data[0] 88 | loss_sum = loss_sum + loss 89 | loss_evals = loss_evals + 1 90 | 91 | # forward the model to also get generated samples for each image 92 | # Only leave one feature for each image, in case duplicate sample 93 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img], 94 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]] 95 | tmp = [Variable(torch.from_numpy(_), volatile=True).cuda() for _ in tmp] 96 | fc_feats, att_feats = tmp 97 | # forward the model to also get generated samples for each image 98 | seq, _ = model.sample(fc_feats, att_feats, eval_kwargs) 99 | 100 | #set_trace() 101 | sents = utils.decode_sequence(loader.get_vocab(), seq) 102 | 103 | for k, sent in enumerate(sents): 104 | entry = {'image_id': data['infos'][k]['id'], 'caption': sent} 105 | if eval_kwargs.get('dump_path', 0) == 1: 106 | entry['file_name'] = data['infos'][k]['file_path'] 107 | predictions.append(entry) 108 | if eval_kwargs.get('dump_images', 0) == 1: 109 | # dump the raw image to vis/ folder 110 | cmd = 'cp "' + os.path.join(eval_kwargs['image_root'], data['infos'][k]['file_path']) + '" vis/imgs/img' + str(len(predictions)) + '.jpg' # bit gross 111 | print(cmd) 112 | os.system(cmd) 113 | 114 | if verbose: 115 | print('image %s: %s' %(entry['image_id'], entry['caption'])) 116 | 117 | # if we wrapped around the split or used up val imgs budget then bail 118 | ix0 = data['bounds']['it_pos_now'] 119 | ix1 = data['bounds']['it_max'] 120 | if num_images != -1: 121 | ix1 = min(ix1, num_images) 122 | for i in range(n - ix1): 123 | predictions.pop() 124 | 125 | if verbose: 126 | print('evaluating validation preformance... 
%d/%d (%f)' %(ix0 - 1, ix1, loss)) 127 | 128 | if data['bounds']['wrapped']: 129 | break 130 | if num_images >= 0 and n >= num_images: 131 | break 132 | 133 | lang_stats = None 134 | if lang_eval == 1: 135 | lang_stats = language_eval(dataset, predictions, eval_kwargs['id'], split) 136 | 137 | # Switch back to training mode 138 | model.train() 139 | return loss_sum/loss_evals, predictions, lang_stats 140 | -------------------------------------------------------------------------------- /misc/resnet.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | import torchvision.models as models 9 | import math 10 | import pdb 11 | import torch.utils.model_zoo as model_zoo 12 | 13 | 14 | __all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 15 | 'resnet152'] 16 | 17 | 18 | model_urls = { 19 | 'resnet18': 'https://s3.amazonaws.com/pytorch/models/resnet18-5c106cde.pth', 20 | 'resnet34': 'https://s3.amazonaws.com/pytorch/models/resnet34-333f7ec4.pth', 21 | 'resnet50': 'https://s3.amazonaws.com/pytorch/models/resnet50-19c8e357.pth', 22 | 'resnet101': 'https://s3.amazonaws.com/pytorch/models/resnet101-5d3b4d8f.pth', 23 | 'resnet152': 'https://s3.amazonaws.com/pytorch/models/resnet152-b121ed2d.pth', 24 | } 25 | 26 | def conv3x3(in_planes, out_planes, stride=1): 27 | "3x3 convolution with padding" 28 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, 29 | padding=1, bias=False) 30 | 31 | 32 | class BasicBlock(nn.Module): 33 | expansion = 1 34 | 35 | def __init__(self, inplanes, planes, stride=1, downsample=None): 36 | super(BasicBlock, self).__init__() 37 | self.conv1 = conv3x3(inplanes, planes, stride) 38 | self.bn1 = nn.BatchNorm2d(planes) 39 | self.relu = nn.ReLU(inplace=True) 40 | self.conv2 = conv3x3(planes, planes) 41 | self.bn2 = nn.BatchNorm2d(planes) 42 | self.downsample = downsample 43 | self.stride = stride 44 | 45 | def forward(self, x): 46 | residual = x 47 | 48 | out = self.conv1(x) 49 | out = self.bn1(out) 50 | out = self.relu(out) 51 | 52 | out = self.conv2(out) 53 | out = self.bn2(out) 54 | 55 | if self.downsample is not None: 56 | residual = self.downsample(x) 57 | 58 | out += residual 59 | out = self.relu(out) 60 | 61 | return out 62 | 63 | 64 | class Bottleneck(nn.Module): 65 | expansion = 4 66 | 67 | def __init__(self, inplanes, planes, stride=1, downsample=None): 68 | super(Bottleneck, self).__init__() 69 | self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, stride=stride, bias=False) # change 70 | self.bn1 = nn.BatchNorm2d(planes) 71 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, # change 72 | padding=1, bias=False) 73 | self.bn2 = nn.BatchNorm2d(planes) 74 | self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) 75 | self.bn3 = nn.BatchNorm2d(planes * 4) 76 | self.relu = nn.ReLU(inplace=True) 77 | self.downsample = downsample 78 | self.stride = stride 79 | 80 | def forward(self, x): 81 | residual = x 82 | 83 | out = self.conv1(x) 84 | out = self.bn1(out) 85 | out = self.relu(out) 86 | 87 | out = self.conv2(out) 88 | out = self.bn2(out) 89 | out = self.relu(out) 90 | 91 | out = self.conv3(out) 92 | out = self.bn3(out) 93 | 94 | if self.downsample is not None: 95 | residual = self.downsample(x) 96 | 97 | out += residual 98 | out = self.relu(out) 99 | 100 | return out 101 | 
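# A Bottleneck block maps `inplanes` channels to `planes * expansion` (= planes * 4)
# channels through a 1x1 reduce -> 3x3 -> 1x1 expand stack; the spatial stride sits
# on the first 1x1 conv (see the "# change" comments above), matching the Caffe-style
# ResNet layout commonly used in Faster R-CNN ports.
# Minimal usage sketch (illustrative only, not part of the original file): with
# inplanes == planes * expansion and stride 1, no downsample branch is required, e.g.
#   Bottleneck(256, 64)(torch.randn(2, 256, 56, 56))  # -> shape (2, 256, 56, 56)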
102 | 103 | class ResNet(nn.Module): 104 | def __init__(self, block, layers, num_classes=1000): 105 | self.inplanes = 64 106 | super(ResNet, self).__init__() 107 | self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, 108 | bias=False) 109 | self.bn1 = nn.BatchNorm2d(64) 110 | self.relu = nn.ReLU(inplace=True) 111 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=0, ceil_mode=True) # change 112 | self.layer1 = self._make_layer(block, 64, layers[0]) 113 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2) 114 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2) 115 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2) 116 | self.avgpool = nn.AvgPool2d(7) 117 | self.fc = nn.Linear(512 * block.expansion, num_classes) 118 | 119 | for m in self.modules(): 120 | if isinstance(m, nn.Conv2d): 121 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 122 | m.weight.data.normal_(0, math.sqrt(2. / n)) 123 | elif isinstance(m, nn.BatchNorm2d): 124 | m.weight.data.fill_(1) 125 | m.bias.data.zero_() 126 | 127 | def _make_layer(self, block, planes, blocks, stride=1): 128 | downsample = None 129 | if stride != 1 or self.inplanes != planes * block.expansion: 130 | downsample = nn.Sequential( 131 | nn.Conv2d(self.inplanes, planes * block.expansion, 132 | kernel_size=1, stride=stride, bias=False), 133 | nn.BatchNorm2d(planes * block.expansion), 134 | ) 135 | 136 | layers = [] 137 | layers.append(block(self.inplanes, planes, stride, downsample)) 138 | self.inplanes = planes * block.expansion 139 | for i in range(1, blocks): 140 | layers.append(block(self.inplanes, planes)) 141 | 142 | return nn.Sequential(*layers) 143 | 144 | def forward(self, x): 145 | x = self.conv1(x) 146 | x = self.bn1(x) 147 | x = self.relu(x) 148 | x = self.maxpool(x) 149 | 150 | x = self.layer1(x) 151 | x = self.layer2(x) 152 | x = self.layer3(x) 153 | x = self.layer4(x) 154 | 155 | x = self.avgpool(x) 156 | x = x.view(x.size(0), -1) 157 | x = self.fc(x) 158 | 159 | return x 160 | 161 | 162 | def resnet18(pretrained=False): 163 | """Constructs a ResNet-18 model. 164 | Args: 165 | pretrained (bool): If True, returns a model pre-trained on ImageNet 166 | """ 167 | model = ResNet(BasicBlock, [2, 2, 2, 2]) 168 | if pretrained: 169 | model.load_state_dict(model_zoo.load_url(model_urls['resnet18'])) 170 | return model 171 | 172 | 173 | def resnet34(pretrained=False): 174 | """Constructs a ResNet-34 model. 175 | Args: 176 | pretrained (bool): If True, returns a model pre-trained on ImageNet 177 | """ 178 | model = ResNet(BasicBlock, [3, 4, 6, 3]) 179 | if pretrained: 180 | model.load_state_dict(model_zoo.load_url(model_urls['resnet34'])) 181 | return model 182 | 183 | 184 | def resnet50(pretrained=False): 185 | """Constructs a ResNet-50 model. 186 | Args: 187 | pretrained (bool): If True, returns a model pre-trained on ImageNet 188 | """ 189 | model = ResNet(Bottleneck, [3, 4, 6, 3]) 190 | if pretrained: 191 | model.load_state_dict(model_zoo.load_url(model_urls['resnet50'])) 192 | return model 193 | 194 | 195 | def resnet101(pretrained=False): 196 | """Constructs a ResNet-101 model. 197 | Args: 198 | pretrained (bool): If True, returns a model pre-trained on ImageNet 199 | """ 200 | model = ResNet(Bottleneck, [3, 4, 23, 3]) 201 | if pretrained: 202 | model.load_state_dict(model_zoo.load_url(model_urls['resnet101'])) 203 | return model 204 | 205 | 206 | def resnet152(pretrained=False): 207 | """Constructs a ResNet-152 model. 
208 | Args: 209 | pretrained (bool): If True, returns a model pre-trained on ImageNet 210 | """ 211 | model = ResNet(Bottleneck, [3, 8, 36, 3]) 212 | if pretrained: 213 | model.load_state_dict(model_zoo.load_url(model_urls['resnet152'])) 214 | return model 215 | 216 | 217 | class resnet(nn.Module): 218 | def __init__(self, opt, _num_layers=101, _fixed_block=1, pretrained=True): 219 | super(resnet, self).__init__() 220 | self._num_layers = _num_layers 221 | self._fixed_block = _fixed_block 222 | self.pretrained = pretrained 223 | self.model_path = '%s/imagenet_weights/resnet' %(opt.data_path) + str(_num_layers) + '.pth' 224 | 225 | if self._num_layers == 50: 226 | self.resnet = resnet50(pretrained=False) 227 | 228 | elif self._num_layers == 101: 229 | self.resnet = resnet101(pretrained=False) 230 | 231 | elif self._num_layers == 152: 232 | self.resnet = resnet152(pretrained=False) 233 | else: 234 | raise NotImplementedError 235 | 236 | if self.pretrained == True: 237 | print("Loading pretrained weights from %s" %(self.model_path)) 238 | state_dict = torch.load(self.model_path) 239 | self.resnet.load_state_dict({k:v for k,v in state_dict.items() if k in self.resnet.state_dict()}) 240 | 241 | # Fix blocks 242 | for p in self.resnet.bn1.parameters(): p.requires_grad=False 243 | for p in self.resnet.conv1.parameters(): p.requires_grad=False 244 | assert (0 <= _fixed_block <= 4) 245 | if _fixed_block >= 4: 246 | for p in self.resnet.layer4.parameters(): p.requires_grad=False 247 | if _fixed_block >= 3: 248 | for p in self.resnet.layer3.parameters(): p.requires_grad=False 249 | if _fixed_block >= 2: 250 | for p in self.resnet.layer2.parameters(): p.requires_grad=False 251 | if _fixed_block >= 1: 252 | for p in self.resnet.layer1.parameters(): p.requires_grad=False 253 | 254 | def set_bn_fix(m): 255 | classname = m.__class__.__name__ 256 | if classname.find('BatchNorm') != -1: 257 | for p in m.parameters(): p.requires_grad=False 258 | 259 | self.resnet.apply(set_bn_fix) 260 | 261 | self.cnn_net = nn.Sequential(self.resnet.conv1, self.resnet.bn1,self.resnet.relu, 262 | self.resnet.maxpool,self.resnet.layer1,self.resnet.layer2,self.resnet.layer3, self.resnet.layer4) 263 | 264 | def forward(self, img): 265 | conv_feat = self.cnn_net(img) 266 | fc_feat = conv_feat.mean(3).mean(2) 267 | 268 | return conv_feat, fc_feat 269 | 270 | def train(self, mode=True): 271 | # Override train so that the training mode is set as we want 272 | nn.Module.train(self, mode) 273 | if mode: 274 | # Set fixed blocks to be in eval mode 275 | self.resnet.eval() 276 | if self._fixed_block <= 3: 277 | self.resnet.layer4.train() 278 | if self._fixed_block <= 2: 279 | self.resnet.layer3.train() 280 | if self._fixed_block <= 1: 281 | self.resnet.layer2.train() 282 | if self._fixed_block <= 0: 283 | self.resnet.layer1.train() 284 | 285 | def set_bn_eval(m): 286 | classname = m.__class__.__name__ 287 | if classname.find('BatchNorm') != -1: 288 | m.eval() 289 | 290 | self.resnet.apply(set_bn_eval) -------------------------------------------------------------------------------- /misc/rewards.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import numpy as np 6 | import time 7 | import misc.utils as utils 8 | from collections import OrderedDict 9 | import torch 10 | from torch.autograd import Variable 11 | import torch.nn as nn 12 | from torch.nn.parameter import Parameter 
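# The CiderD scorer imported a few lines below is shipped in the bundled
# tools/pycider package (tools/pycider/pyciderevalcap/ciderD/), hence the
# sys.path.append("tools/pycider") that follows.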
13 | 14 | import sys 15 | sys.path.append("tools/pycider") 16 | from pyciderevalcap.ciderD.ciderD import CiderD 17 | import pdb 18 | 19 | #CiderD_scorer = CiderD(df='corpus') 20 | 21 | def array_to_str(arr): 22 | out = '' 23 | for i in range(len(arr)): 24 | out += str(arr[i]) + ' ' 25 | if arr[i] == 0: 26 | break 27 | return out.strip() 28 | 29 | class get_self_critical_reward(nn.Module): 30 | def __init__(self, opt): 31 | super(get_self_critical_reward, self).__init__() 32 | self.vocab_size = opt.vocab_size 33 | self.st2towidx = opt.st2towidx 34 | self.opt = opt 35 | # self.st2towidx.requires_grad=False 36 | self.CiderD_scorer = CiderD(df=opt.cached_tokens) 37 | 38 | def forward(self, gen_input, greedy_input, gt_gts, ncap): 39 | 40 | gen_txt_seq, gen_bn_seq, gen_vis_seq = gen_input 41 | greedy_txt_seq, greedy_bn_seq, greedy_vis_seq = greedy_input 42 | 43 | self.st2towidx = self.st2towidx.type_as(gen_txt_seq) 44 | batch_size = gen_txt_seq.size(0) 45 | seq_per_img = batch_size // gt_gts.size(0) 46 | 47 | gen_result = gen_txt_seq.new(gen_txt_seq.size()).zero_() 48 | greedy_result = greedy_txt_seq.new(greedy_txt_seq.size()).zero_() 49 | 50 | gen_mask = gen_txt_seq < self.vocab_size 51 | gen_vis_seq = gen_vis_seq.view(batch_size,-1) 52 | gen_bn_seq = gen_bn_seq.view(batch_size, -1) 53 | 54 | # compose the seq 55 | gen_result[gen_mask] = gen_txt_seq[gen_mask] 56 | gen_vis_idx = gen_vis_seq[gen_mask==0]*2 + gen_bn_seq[gen_mask==0] - 1 57 | 58 | gen_result[gen_mask==0] = self.st2towidx[gen_vis_idx] 59 | 60 | greedy_mask = greedy_txt_seq < self.vocab_size 61 | greedy_vis_seq = greedy_vis_seq.view(batch_size,-1) 62 | greedy_bn_seq = greedy_bn_seq.view(batch_size, -1) 63 | 64 | # compose the seq 65 | greedy_result[greedy_mask] = greedy_txt_seq[greedy_txt_seq < self.vocab_size] 66 | greedy_vis_idx = greedy_vis_seq[greedy_mask==0]*2 + greedy_bn_seq[greedy_mask==0] - 1 67 | greedy_result[greedy_mask==0] = self.st2towidx[greedy_vis_idx] 68 | 69 | res = OrderedDict() 70 | gen_result = gen_result.cpu().numpy() 71 | greedy_result = greedy_result.cpu().numpy() 72 | 73 | for i in range(batch_size): 74 | res[i] = [array_to_str(gen_result[i])] 75 | for i in range(batch_size): 76 | res[batch_size + i] = [array_to_str(greedy_result[i])] 77 | 78 | gts = OrderedDict() 79 | for i in range(batch_size): 80 | gts_np = gt_gts[i][:ncap.data[i]].data.cpu().numpy() 81 | gts[i] = [array_to_str(gts_np[j]) for j in range(len(gts_np))] 82 | 83 | # caption = utils.decode_normal(self.opt.itow, torch.from_numpy(gen_result)) 84 | # pdb.set_trace() 85 | # print(caption[0]) 86 | 87 | # utils.decode_normal(self.opt.itow, gt_gts.data.view(-1,20)) 88 | #_, scores = Bleu(4).compute_score(gts, res) 89 | #scores = np.array(scores[3]) 90 | res = [{'image_id':i, 'caption': res[i]} for i in range(2 * batch_size)] 91 | gts = {i: gts[i % batch_size // seq_per_img] for i in range(2 * batch_size)} 92 | _, scores = self.CiderD_scorer.compute_score(gts, res) 93 | # print(_) 94 | 95 | scores = scores[:batch_size] - scores[batch_size:] 96 | rewards = np.repeat(scores[:, np.newaxis], gen_result.shape[1], 1) 97 | 98 | return rewards, _ 99 | -------------------------------------------------------------------------------- /misc/vgg16.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Tensorflow Faster R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Xinlei Chen 5 | # 
-------------------------------------------------------- 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import torch 11 | import torch.nn as nn 12 | import torch.nn.functional as F 13 | from torch.autograd import Variable 14 | import math 15 | import pdb 16 | import torchvision.models as models 17 | 18 | class vgg16(nn.Module): 19 | def __init__(self, opt, pretrained=True): 20 | super(vgg16, self).__init__() 21 | 22 | self.model_path = '%s/imagenet_weights/vgg16_caffe.pth' %(opt.data_path) 23 | self.pretrained = pretrained 24 | 25 | vgg = models.vgg16() 26 | vgg.classifier = nn.Sequential(*list(vgg.classifier._modules.values())[:-1]) 27 | self.fc = vgg.classifier 28 | self.pooling = nn.AdaptiveAvgPool2d((7,7)) 29 | if self.pretrained: 30 | print("Loading pretrained weights from %s" %(self.model_path)) 31 | state_dict = torch.load(self.model_path) 32 | vgg.load_state_dict({k:v for k,v in state_dict.items() if k in vgg.state_dict()}) 33 | 34 | # not using the last maxpool layer 35 | self.cnn_net = nn.Sequential(*list(vgg.features._modules.values())[:-1]) 36 | 37 | def forward(self, img): 38 | 39 | conv_feat = self.cnn_net(img) 40 | pooled_conv_feat = self.pooling(conv_feat) 41 | 42 | pooled_conv_feat_flat = pooled_conv_feat.view(pooled_conv_feat.size(0), -1) 43 | fc_feat = self.fc(pooled_conv_feat_flat) 44 | 45 | return conv_feat, fc_feat -------------------------------------------------------------------------------- /opts.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | def parse_opt(): 4 | parser = argparse.ArgumentParser() 5 | # # Data input settings 6 | parser.add_argument('--path_opt', type=str, default='cfgs/coco.yml', 7 | help='') 8 | parser.add_argument('--dataset', type=str, default='coco', 9 | help='') 10 | parser.add_argument('--input_json', type=str, default='data/coco/cap_coco.json', 11 | help='path to the json file containing additional info and vocab') 12 | parser.add_argument('--input_dic', type=str, default='data/coco/dic_coco.json', 13 | help='path to the json containing the preprocessed dataset') 14 | parser.add_argument('--image_path', type=str, default='/srv/share/datasets/coco/images', 15 | help='path to the h5file containing the image data') 16 | parser.add_argument('--proposal_h5', type=str, default='data/coco/coco_detection.h5', 17 | help='path to the json containing the detection result.') 18 | parser.add_argument('--cnn_backend', type=str, default='res101', 19 | help='res101 or vgg16') 20 | parser.add_argument('--data_path', type=str, default='', 21 | help='') 22 | 23 | parser.add_argument('--decode_noc', type=bool, default=True, 24 | help='decoding option: normal | noc') 25 | parser.add_argument('--att_model', type=str, default='topdown', 26 | help='different attention model, now supporting topdown | att2in2') 27 | parser.add_argument('--num_workers', dest='num_workers', 28 | help='number of worker to load data', 29 | default=10, type=int) 30 | parser.add_argument('--cuda', type=bool, default=True, 31 | help='whether use cuda') 32 | parser.add_argument('--mGPUs', type=bool, default=False, 33 | help='whether use multiple GPUs') 34 | parser.add_argument('--cached_tokens', type=str, default='dataset/coco-train-idxs', 35 | help='Cached token file for calculating cider score during self critical training.') 36 | 37 | # Model settings 38 | parser.add_argument('--rnn_size', type=int, default=1024, 39 | help='size of the rnn 
in number of hidden nodes in each layer') 40 | parser.add_argument('--num_layers', type=int, default=1, 41 | help='number of layers in the RNN') 42 | parser.add_argument('--rnn_type', type=str, default='lstm', 43 | help='rnn, gru, or lstm') 44 | parser.add_argument('--input_encoding_size', type=int, default=512, 45 | help='the encoding size of each token in the vocabulary, and the image.') 46 | parser.add_argument('--att_hid_size', type=int, default=512, 47 | help='the hidden size of the attention MLP; only useful in show_attend_tell; 0 if not using hidden layer') 48 | parser.add_argument('--fc_feat_size', type=int, default=2048, 49 | help='2048 for resnet, 4096 for vgg') 50 | parser.add_argument('--att_feat_size', type=int, default=2048, 51 | help='2048 for resnet, 512 for vgg') 52 | parser.add_argument('--image_size', type=int, default=576, 53 | help='image random crop size') 54 | parser.add_argument('--image_crop_size', type=int, default=512, 55 | help='image random crop size') 56 | 57 | # Optimization: General 58 | parser.add_argument('--max_epochs', type=int, default=30, 59 | help='number of epochs') 60 | parser.add_argument('--batch_size', type=int, default=10, 61 | help='minibatch size') 62 | parser.add_argument('--grad_clip', type=float, default=0.1, #5., 63 | help='clip gradients at this value') 64 | parser.add_argument('--drop_prob_lm', type=float, default=0.5, 65 | help='strength of dropout in the Language Model RNN') 66 | parser.add_argument('--self_critical', type=bool, default=False, 67 | help='whether use self critical training.') 68 | parser.add_argument('--seq_per_img', type=int, default=5, 69 | help='number of captions to sample for each image during training. Done for efficiency since CNN forward pass is expensive. E.g. coco has 5 sents/image') 70 | parser.add_argument('--seq_length', type=int, default=20, help='') 71 | parser.add_argument('--beam_size', type=int, default=1, 72 | help='used when sample_max = 1, indicates number of beams in beam search. Usually 2 or 3 works well. More is not better. Set this to 1 for faster runtime but a bit worse performance.') 73 | 74 | # Schedule Sampling. 75 | parser.add_argument('--scheduled_sampling_start', type=int, default=-1, 76 | help='at what iteration to start decay gt probability') 77 | parser.add_argument('--scheduled_sampling_increase_every', type=int, default=5, 78 | help='every how many iterations thereafter to gt probability') 79 | parser.add_argument('--scheduled_sampling_increase_prob', type=float, default=0.05, 80 | help='How much to update the prob') 81 | parser.add_argument('--scheduled_sampling_max_prob', type=float, default=0.25, 82 | help='Maximum scheduled sampling prob.') 83 | 84 | #Optimization: for the Language Model 85 | parser.add_argument('--optim', type=str, default='adam', 86 | help='what update to use? rmsprop|sgd|sgdmom|adagrad|adam') 87 | parser.add_argument('--learning_rate', type=float, default=5e-4, 88 | help='learning rate') 89 | parser.add_argument('--learning_rate_decay_start', type=int, default=1, 90 | help='at what iteration to start decaying learning rate? 
(-1 = dont) (in epoch)') 91 | parser.add_argument('--learning_rate_decay_every', type=int, default=3, 92 | help='every how many iterations thereafter to drop LR?(in epoch)') 93 | parser.add_argument('--learning_rate_decay_rate', type=float, default=0.8, 94 | help='every how many iterations thereafter to drop LR?(in epoch)') 95 | parser.add_argument('--optim_alpha', type=float, default=0.9, 96 | help='alpha for adam') 97 | parser.add_argument('--optim_beta', type=float, default=0.999, 98 | help='beta used for adam') 99 | parser.add_argument('--optim_epsilon', type=float, default=1e-8, 100 | help='epsilon that goes into denominator for smoothing') 101 | parser.add_argument('--weight_decay', type=float, default=0, 102 | help='weight_decay') 103 | 104 | # Optimization: for the CNN 105 | parser.add_argument('--finetune_cnn', action='store_true', 106 | help='finetune CNN') 107 | parser.add_argument('--fixed_block', type=float, default=1, 108 | help='fixed cnn block when training. [0-4] \ 109 | 0:finetune all block, 4: fix all block') 110 | parser.add_argument('--cnn_optim', type=str, default='adam', 111 | help='what update to use? rmsprop|sgd|sgdmom|adagrad|adam') 112 | parser.add_argument('--cnn_optim_alpha', type=float, default=0.8, 113 | help='cnn alpha for adam') 114 | parser.add_argument('--cnn_optim_beta', type=float, default=0.999, 115 | help='beta used for adam') 116 | parser.add_argument('--cnn_learning_rate', type=float, default=1e-5, 117 | help='cnn learning rate') 118 | parser.add_argument('--cnn_weight_decay', type=float, default=0, 119 | help='weight_decay') 120 | # set training session 121 | parser.add_argument('--start_from', type=str, default=None, 122 | help="""continue training from saved model at this path. Path must contain files saved by previous training process: 123 | 'infos.pkl' : configuration; 124 | 'checkpoint' : paths to model file(s) (created by tf). 125 | Note: this file contains absolute paths, be careful when moving files around; 126 | 'model.ckpt-*' : file(s) with model definition (created by tf) 127 | """) 128 | parser.add_argument('--id', type=str, default='', 129 | help='an id identifying this run/job. used in cross-val and appended when writing progress files') 130 | # Evaluation/Checkpointing 131 | parser.add_argument('--cider_df', type=str, default='corpus', 132 | help='') 133 | parser.add_argument('--val_split', type=str, default='test', 134 | help='') 135 | parser.add_argument('--inference_only', type=bool, default=False, 136 | help='') 137 | parser.add_argument('--val_images_use', type=int, default=5000, 138 | help='how many images to use when periodically evaluating the validation loss? (-1 = all)') 139 | parser.add_argument('--val_every_epoch', type=int, default=3, 140 | help='how many images to use when periodically evaluating the validation loss? (-1 = all)') 141 | parser.add_argument('--checkpoint_path', type=str, default='save', 142 | help='directory to store checkpointed models') 143 | parser.add_argument('--language_eval', type=int, default=1, 144 | help='Evaluate language as well (1 = yes, 0 = no)? BLEU/CIDEr/METEOR/ROUGE_L? 
requires coco-caption code from Github.') 145 | parser.add_argument('--load_best_score', type=int, default=1, 146 | help='Do we load previous best score when resuming training.') 147 | parser.add_argument('--disp_interval', type=int, default=100, 148 | help='how many iteration to display an loss.') 149 | parser.add_argument('--losses_log_every', type=int, default=10, 150 | help='how many iteration for log.') 151 | parser.add_argument('--cbs', type=bool, default=False, 152 | help='whether use constraint beam search.') 153 | parser.add_argument('--cbs_tag_size', type=int, default=3, 154 | help='whether use constraint beam search.') 155 | parser.add_argument('--cbs_mode', type=str, default='all', 156 | help='which cbs mode to use in the decoding stage. cbs_mode: all|unique|novel') 157 | parser.add_argument('--det_oracle', type=bool, default=False, 158 | help='whether use oracle bounding box.') 159 | args = parser.parse_args() 160 | 161 | return args 162 | -------------------------------------------------------------------------------- /pooling/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/pooling/__init__.py -------------------------------------------------------------------------------- /pooling/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # CUDA_PATH=/usr/local/cuda/ 4 | 5 | export CUDA_PATH=/usr/local/cuda/ 6 | #You may also want to ad the following 7 | #export C_INCLUDE_PATH=/opt/cuda/include 8 | 9 | export CXXFLAGS="-std=c++11" 10 | export CFLAGS="-std=c99" 11 | 12 | CUDA_ARCH="-gencode arch=compute_30,code=sm_30 \ 13 | -gencode arch=compute_35,code=sm_35 \ 14 | -gencode arch=compute_50,code=sm_50 \ 15 | -gencode arch=compute_52,code=sm_52 \ 16 | -gencode arch=compute_60,code=sm_60 \ 17 | -gencode arch=compute_61,code=sm_61 " 18 | 19 | # compile roi_align 20 | cd roi_align/src 21 | echo "Compiling roi align kernels by nvcc..." 
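# The nvcc call below compiles the ROI Align kernel into a position-independent
# object file for the -gencode targets listed above; build.py then links it into
# the _ext.roi_align FFI extension used by roi_align/functions/roi_align.py.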
22 | nvcc -c -o roi_align_kernel.cu.o roi_align_kernel.cu \ 23 | -D GOOGLE_CUDA=1 -x cu -Xcompiler -fPIC -arch=$CUDA_ARCH 24 | cd ../ 25 | python build.py 26 | cd ../ 27 | -------------------------------------------------------------------------------- /pooling/roi_align/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/pooling/roi_align/__init__.py -------------------------------------------------------------------------------- /pooling/roi_align/_ext/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/pooling/roi_align/_ext/__init__.py -------------------------------------------------------------------------------- /pooling/roi_align/_ext/roi_align/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from torch.utils.ffi import _wrap_function 3 | from ._roi_align import lib as _lib, ffi as _ffi 4 | 5 | __all__ = [] 6 | def _import_symbols(locals): 7 | for symbol in dir(_lib): 8 | fn = getattr(_lib, symbol) 9 | if callable(fn): 10 | locals[symbol] = _wrap_function(fn, _ffi) 11 | else: 12 | locals[symbol] = fn 13 | __all__.append(symbol) 14 | 15 | _import_symbols(locals()) 16 | -------------------------------------------------------------------------------- /pooling/roi_align/_ext/roi_align/_roi_align.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/pooling/roi_align/_ext/roi_align/_roi_align.so -------------------------------------------------------------------------------- /pooling/roi_align/build.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | import torch 4 | from torch.utils.ffi import create_extension 5 | 6 | # sources = ['src/roi_align.c'] 7 | # headers = ['src/roi_align.h'] 8 | sources = [] 9 | headers = [] 10 | defines = [] 11 | with_cuda = False 12 | 13 | if torch.cuda.is_available(): 14 | print('Including CUDA code.') 15 | sources += ['src/roi_align_cuda.c'] 16 | headers += ['src/roi_align_cuda.h'] 17 | defines += [('WITH_CUDA', None)] 18 | with_cuda = True 19 | 20 | this_file = os.path.dirname(os.path.realpath(__file__)) 21 | print(this_file) 22 | extra_objects = ['src/roi_align_kernel.cu.o'] 23 | extra_objects = [os.path.join(this_file, fname) for fname in extra_objects] 24 | 25 | ffi = create_extension( 26 | '_ext.roi_align', 27 | headers=headers, 28 | sources=sources, 29 | define_macros=defines, 30 | relative_to=__file__, 31 | with_cuda=with_cuda, 32 | extra_objects=extra_objects 33 | ) 34 | 35 | if __name__ == '__main__': 36 | ffi.build() 37 | -------------------------------------------------------------------------------- /pooling/roi_align/functions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/pooling/roi_align/functions/__init__.py -------------------------------------------------------------------------------- /pooling/roi_align/functions/roi_align.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from 
torch.autograd import Function 3 | from .._ext import roi_align 4 | 5 | 6 | # TODO use save_for_backward instead 7 | class RoIAlignFunction(Function): 8 | def __init__(self, aligned_height, aligned_width, spatial_scale): 9 | self.aligned_width = int(aligned_width) 10 | self.aligned_height = int(aligned_height) 11 | self.spatial_scale = float(spatial_scale) 12 | self.rois = None 13 | self.feature_size = None 14 | 15 | def forward(self, features, rois): 16 | self.rois = rois 17 | self.feature_size = features.size() 18 | 19 | batch_size, num_channels, data_height, data_width = features.size() 20 | num_rois = rois.size(0) 21 | 22 | output = features.new(num_rois, num_channels, self.aligned_height, self.aligned_width).zero_() 23 | if features.is_cuda: 24 | roi_align.roi_align_forward_cuda(self.aligned_height, 25 | self.aligned_width, 26 | self.spatial_scale, features, 27 | rois, output) 28 | else: 29 | roi_align.roi_align_forward(self.aligned_height, 30 | self.aligned_width, 31 | self.spatial_scale, features, 32 | rois, output) 33 | # raise NotImplementedError 34 | 35 | return output 36 | 37 | def backward(self, grad_output): 38 | assert(self.feature_size is not None and grad_output.is_cuda) 39 | 40 | batch_size, num_channels, data_height, data_width = self.feature_size 41 | 42 | grad_input = self.rois.new(batch_size, num_channels, data_height, 43 | data_width).zero_() 44 | roi_align.roi_align_backward_cuda(self.aligned_height, 45 | self.aligned_width, 46 | self.spatial_scale, grad_output, 47 | self.rois, grad_input) 48 | 49 | # print grad_input 50 | 51 | return grad_input, None 52 | -------------------------------------------------------------------------------- /pooling/roi_align/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # CUDA_PATH=/usr/local/cuda/ 4 | 5 | export CUDA_PATH=/usr/local/cuda/ 6 | #You may also want to ad the following 7 | #export C_INCLUDE_PATH=/opt/cuda/include 8 | 9 | export CXXFLAGS="-std=c++11" 10 | export CFLAGS="-std=c99" 11 | 12 | CUDA_ARCH="-gencode arch=compute_30,code=sm_30 \ 13 | -gencode arch=compute_35,code=sm_35 \ 14 | -gencode arch=compute_50,code=sm_50 \ 15 | -gencode arch=compute_52,code=sm_52 \ 16 | -gencode arch=compute_60,code=sm_60 \ 17 | -gencode arch=compute_61,code=sm_61 " 18 | 19 | # compile roi_align 20 | cd src 21 | echo "Compiling roi align kernels by nvcc..." 
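# Same kernel build as pooling/make.sh one level up, except that the -gencode
# flags in $CUDA_ARCH are passed to nvcc directly here rather than via -arch=.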
22 | nvcc -c -o roi_align_kernel.cu.o roi_align_kernel.cu \ 23 | -D GOOGLE_CUDA=1 -x cu -Xcompiler -fPIC $CUDA_ARCH 24 | cd ../ 25 | python build.py 26 | -------------------------------------------------------------------------------- /pooling/roi_align/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/pooling/roi_align/modules/__init__.py -------------------------------------------------------------------------------- /pooling/roi_align/modules/roi_align.py: -------------------------------------------------------------------------------- 1 | from torch.nn.modules.module import Module 2 | from torch.nn.functional import avg_pool2d, max_pool2d 3 | from ..functions.roi_align import RoIAlignFunction 4 | 5 | 6 | class RoIAlign(Module): 7 | def __init__(self, aligned_height, aligned_width, spatial_scale): 8 | super(RoIAlign, self).__init__() 9 | 10 | self.aligned_width = int(aligned_width) 11 | self.aligned_height = int(aligned_height) 12 | self.spatial_scale = float(spatial_scale) 13 | 14 | def forward(self, features, rois): 15 | return RoIAlignFunction(self.aligned_height, self.aligned_width, 16 | self.spatial_scale)(features, rois) 17 | 18 | class RoIAlignAvg(Module): 19 | def __init__(self, aligned_height, aligned_width, spatial_scale): 20 | super(RoIAlignAvg, self).__init__() 21 | 22 | self.aligned_width = int(aligned_width) 23 | self.aligned_height = int(aligned_height) 24 | self.spatial_scale = float(spatial_scale) 25 | 26 | def forward(self, features, rois): 27 | x = RoIAlignFunction(self.aligned_height+1, self.aligned_width+1, 28 | self.spatial_scale)(features, rois) 29 | return avg_pool2d(x, kernel_size=2, stride=1) 30 | 31 | class RoIAlignMax(Module): 32 | def __init__(self, aligned_height, aligned_width, spatial_scale): 33 | super(RoIAlignMax, self).__init__() 34 | 35 | self.aligned_width = int(aligned_width) 36 | self.aligned_height = int(aligned_height) 37 | self.spatial_scale = float(spatial_scale) 38 | 39 | def forward(self, features, rois): 40 | x = RoIAlignFunction(self.aligned_height+1, self.aligned_width+1, 41 | self.spatial_scale)(features, rois) 42 | return max_pool2d(x, kernel_size=2, stride=1) 43 | -------------------------------------------------------------------------------- /pooling/roi_align/src/roi_align.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | 6 | void ROIAlignForwardCpu(const float* bottom_data, const float spatial_scale, const int num_rois, 7 | const int height, const int width, const int channels, 8 | const int aligned_height, const int aligned_width, const float * bottom_rois, 9 | float* top_data); 10 | 11 | void ROIAlignBackwardCpu(const float* top_diff, const float spatial_scale, const int num_rois, 12 | const int height, const int width, const int channels, 13 | const int aligned_height, const int aligned_width, const float * bottom_rois, 14 | float* top_data); 15 | 16 | int roi_align_forward(int aligned_height, int aligned_width, float spatial_scale, 17 | THFloatTensor * features, THFloatTensor * rois, THFloatTensor * output) 18 | { 19 | //Grab the input tensor 20 | float * data_flat = THFloatTensor_data(features); 21 | float * rois_flat = THFloatTensor_data(rois); 22 | 23 | float * output_flat = THFloatTensor_data(output); 24 | 25 | // Number of ROIs 26 | int num_rois = THFloatTensor_size(rois, 0); 27 | int size_rois = 
THFloatTensor_size(rois, 1); 28 | if (size_rois != 5) 29 | { 30 | return 0; 31 | } 32 | 33 | // data height 34 | int data_height = THFloatTensor_size(features, 2); 35 | // data width 36 | int data_width = THFloatTensor_size(features, 3); 37 | // Number of channels 38 | int num_channels = THFloatTensor_size(features, 1); 39 | 40 | // do ROIAlignForward 41 | ROIAlignForwardCpu(data_flat, spatial_scale, num_rois, data_height, data_width, num_channels, 42 | aligned_height, aligned_width, rois_flat, output_flat); 43 | 44 | return 1; 45 | } 46 | 47 | int roi_align_backward(int aligned_height, int aligned_width, float spatial_scale, 48 | THFloatTensor * top_grad, THFloatTensor * rois, THFloatTensor * bottom_grad) 49 | { 50 | //Grab the input tensor 51 | float * top_grad_flat = THFloatTensor_data(top_grad); 52 | float * rois_flat = THFloatTensor_data(rois); 53 | 54 | float * bottom_grad_flat = THFloatTensor_data(bottom_grad); 55 | 56 | // Number of ROIs 57 | int num_rois = THFloatTensor_size(rois, 0); 58 | int size_rois = THFloatTensor_size(rois, 1); 59 | if (size_rois != 5) 60 | { 61 | return 0; 62 | } 63 | 64 | // batch size 65 | // int batch_size = THFloatTensor_size(bottom_grad, 0); 66 | // data height 67 | int data_height = THFloatTensor_size(bottom_grad, 2); 68 | // data width 69 | int data_width = THFloatTensor_size(bottom_grad, 3); 70 | // Number of channels 71 | int num_channels = THFloatTensor_size(bottom_grad, 1); 72 | 73 | // do ROIAlignBackward 74 | ROIAlignBackwardCpu(top_grad_flat, spatial_scale, num_rois, data_height, 75 | data_width, num_channels, aligned_height, aligned_width, rois_flat, bottom_grad_flat); 76 | 77 | return 1; 78 | } 79 | 80 | void ROIAlignForwardCpu(const float* bottom_data, const float spatial_scale, const int num_rois, 81 | const int height, const int width, const int channels, 82 | const int aligned_height, const int aligned_width, const float * bottom_rois, 83 | float* top_data) 84 | { 85 | const int output_size = num_rois * aligned_height * aligned_width * channels; 86 | 87 | int idx = 0; 88 | for (idx = 0; idx < output_size; ++idx) 89 | { 90 | // (n, c, ph, pw) is an element in the aligned output 91 | int pw = idx % aligned_width; 92 | int ph = (idx / aligned_width) % aligned_height; 93 | int c = (idx / aligned_width / aligned_height) % channels; 94 | int n = idx / aligned_width / aligned_height / channels; 95 | 96 | float roi_batch_ind = bottom_rois[n * 5 + 0]; 97 | float roi_start_w = bottom_rois[n * 5 + 1] * spatial_scale; 98 | float roi_start_h = bottom_rois[n * 5 + 2] * spatial_scale; 99 | float roi_end_w = bottom_rois[n * 5 + 3] * spatial_scale; 100 | float roi_end_h = bottom_rois[n * 5 + 4] * spatial_scale; 101 | 102 | // Force malformed ROI to be 1x1 103 | float roi_width = fmaxf(roi_end_w - roi_start_w + 1., 0.); 104 | float roi_height = fmaxf(roi_end_h - roi_start_h + 1., 0.); 105 | float bin_size_h = roi_height / (aligned_height - 1.); 106 | float bin_size_w = roi_width / (aligned_width - 1.); 107 | 108 | float h = (float)(ph) * bin_size_h + roi_start_h; 109 | float w = (float)(pw) * bin_size_w + roi_start_w; 110 | 111 | int hstart = fminf(floor(h), height - 2); 112 | int wstart = fminf(floor(w), width - 2); 113 | 114 | int img_start = roi_batch_ind * channels * height * width; 115 | 116 | // bilinear interpolation 117 | if (h < 0 || h >= height || w < 0 || w >= width) 118 | { 119 | top_data[idx] = 0.; 120 | } 121 | else 122 | { 123 | float h_ratio = h - (float)(hstart); 124 | float w_ratio = w - (float)(wstart); 125 | int upleft = img_start + (c 
* height + hstart) * width + wstart; 126 | int upright = upleft + 1; 127 | int downleft = upleft + width; 128 | int downright = downleft + 1; 129 | 130 | top_data[idx] = bottom_data[upleft] * (1. - h_ratio) * (1. - w_ratio) 131 | + bottom_data[upright] * (1. - h_ratio) * w_ratio 132 | + bottom_data[downleft] * h_ratio * (1. - w_ratio) 133 | + bottom_data[downright] * h_ratio * w_ratio; 134 | } 135 | } 136 | } 137 | 138 | void ROIAlignBackwardCpu(const float* top_diff, const float spatial_scale, const int num_rois, 139 | const int height, const int width, const int channels, 140 | const int aligned_height, const int aligned_width, const float * bottom_rois, 141 | float* bottom_diff) 142 | { 143 | const int output_size = num_rois * aligned_height * aligned_width * channels; 144 | 145 | int idx = 0; 146 | for (idx = 0; idx < output_size; ++idx) 147 | { 148 | // (n, c, ph, pw) is an element in the aligned output 149 | int pw = idx % aligned_width; 150 | int ph = (idx / aligned_width) % aligned_height; 151 | int c = (idx / aligned_width / aligned_height) % channels; 152 | int n = idx / aligned_width / aligned_height / channels; 153 | 154 | float roi_batch_ind = bottom_rois[n * 5 + 0]; 155 | float roi_start_w = bottom_rois[n * 5 + 1] * spatial_scale; 156 | float roi_start_h = bottom_rois[n * 5 + 2] * spatial_scale; 157 | float roi_end_w = bottom_rois[n * 5 + 3] * spatial_scale; 158 | float roi_end_h = bottom_rois[n * 5 + 4] * spatial_scale; 159 | 160 | // Force malformed ROI to be 1x1 161 | float roi_width = fmaxf(roi_end_w - roi_start_w + 1., 0.); 162 | float roi_height = fmaxf(roi_end_h - roi_start_h + 1., 0.); 163 | float bin_size_h = roi_height / (aligned_height - 1.); 164 | float bin_size_w = roi_width / (aligned_width - 1.); 165 | 166 | float h = (float)(ph) * bin_size_h + roi_start_h; 167 | float w = (float)(pw) * bin_size_w + roi_start_w; 168 | 169 | int hstart = fminf(floor(h), height - 2); 170 | int wstart = fminf(floor(w), width - 2); 171 | 172 | int img_start = roi_batch_ind * channels * height * width; 173 | 174 | // bilinear interpolation 175 | if (h < 0 || h >= height || w < 0 || w >= width) 176 | { 177 | float h_ratio = h - (float)(hstart); 178 | float w_ratio = w - (float)(wstart); 179 | int upleft = img_start + (c * height + hstart) * width + wstart; 180 | int upright = upleft + 1; 181 | int downleft = upleft + width; 182 | int downright = downleft + 1; 183 | 184 | bottom_diff[upleft] += top_diff[idx] * (1. - h_ratio) * (1. - w_ratio); 185 | bottom_diff[upright] += top_diff[idx] * (1. - h_ratio) * w_ratio; 186 | bottom_diff[downleft] += top_diff[idx] * h_ratio * (1. 
- w_ratio); 187 | bottom_diff[downright] += top_diff[idx] * h_ratio * w_ratio; 188 | } 189 | } 190 | } 191 | -------------------------------------------------------------------------------- /pooling/roi_align/src/roi_align.h: -------------------------------------------------------------------------------- 1 | int roi_align_forward(int aligned_height, int aligned_width, float spatial_scale, 2 | THFloatTensor * features, THFloatTensor * rois, THFloatTensor * output); 3 | 4 | int roi_align_backward(int aligned_height, int aligned_width, float spatial_scale, 5 | THFloatTensor * top_grad, THFloatTensor * rois, THFloatTensor * bottom_grad); 6 | -------------------------------------------------------------------------------- /pooling/roi_align/src/roi_align_cuda.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "roi_align_kernel.h" 4 | 5 | extern THCState *state; 6 | 7 | int roi_align_forward_cuda(int aligned_height, int aligned_width, float spatial_scale, 8 | THCudaTensor * features, THCudaTensor * rois, THCudaTensor * output) 9 | { 10 | // Grab the input tensor 11 | float * data_flat = THCudaTensor_data(state, features); 12 | float * rois_flat = THCudaTensor_data(state, rois); 13 | 14 | float * output_flat = THCudaTensor_data(state, output); 15 | 16 | // Number of ROIs 17 | int num_rois = THCudaTensor_size(state, rois, 0); 18 | int size_rois = THCudaTensor_size(state, rois, 1); 19 | if (size_rois != 5) 20 | { 21 | return 0; 22 | } 23 | 24 | // data height 25 | int data_height = THCudaTensor_size(state, features, 2); 26 | // data width 27 | int data_width = THCudaTensor_size(state, features, 3); 28 | // Number of channels 29 | int num_channels = THCudaTensor_size(state, features, 1); 30 | 31 | cudaStream_t stream = THCState_getCurrentStream(state); 32 | 33 | ROIAlignForwardLaucher( 34 | data_flat, spatial_scale, num_rois, data_height, 35 | data_width, num_channels, aligned_height, 36 | aligned_width, rois_flat, 37 | output_flat, stream); 38 | 39 | return 1; 40 | } 41 | 42 | int roi_align_backward_cuda(int aligned_height, int aligned_width, float spatial_scale, 43 | THCudaTensor * top_grad, THCudaTensor * rois, THCudaTensor * bottom_grad) 44 | { 45 | // Grab the input tensor 46 | float * top_grad_flat = THCudaTensor_data(state, top_grad); 47 | float * rois_flat = THCudaTensor_data(state, rois); 48 | 49 | float * bottom_grad_flat = THCudaTensor_data(state, bottom_grad); 50 | 51 | // Number of ROIs 52 | int num_rois = THCudaTensor_size(state, rois, 0); 53 | int size_rois = THCudaTensor_size(state, rois, 1); 54 | if (size_rois != 5) 55 | { 56 | return 0; 57 | } 58 | 59 | // batch size 60 | int batch_size = THCudaTensor_size(state, bottom_grad, 0); 61 | // data height 62 | int data_height = THCudaTensor_size(state, bottom_grad, 2); 63 | // data width 64 | int data_width = THCudaTensor_size(state, bottom_grad, 3); 65 | // Number of channels 66 | int num_channels = THCudaTensor_size(state, bottom_grad, 1); 67 | 68 | cudaStream_t stream = THCState_getCurrentStream(state); 69 | ROIAlignBackwardLaucher( 70 | top_grad_flat, spatial_scale, batch_size, num_rois, data_height, 71 | data_width, num_channels, aligned_height, 72 | aligned_width, rois_flat, 73 | bottom_grad_flat, stream); 74 | 75 | return 1; 76 | } 77 | -------------------------------------------------------------------------------- /pooling/roi_align/src/roi_align_cuda.h: -------------------------------------------------------------------------------- 1 | int 
roi_align_forward_cuda(int aligned_height, int aligned_width, float spatial_scale, 2 | THCudaTensor * features, THCudaTensor * rois, THCudaTensor * output); 3 | 4 | int roi_align_backward_cuda(int aligned_height, int aligned_width, float spatial_scale, 5 | THCudaTensor * top_grad, THCudaTensor * rois, THCudaTensor * bottom_grad); 6 | -------------------------------------------------------------------------------- /pooling/roi_align/src/roi_align_kernel.cu: -------------------------------------------------------------------------------- 1 | #ifdef __cplusplus 2 | extern "C" { 3 | #endif 4 | 5 | #include 6 | #include 7 | #include 8 | #include "roi_align_kernel.h" 9 | 10 | #define CUDA_1D_KERNEL_LOOP(i, n) \ 11 | for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \ 12 | i += blockDim.x * gridDim.x) 13 | 14 | 15 | __global__ void ROIAlignForward(const int nthreads, const float* bottom_data, const float spatial_scale, const int height, const int width, 16 | const int channels, const int aligned_height, const int aligned_width, const float* bottom_rois, float* top_data) { 17 | CUDA_1D_KERNEL_LOOP(index, nthreads) { 18 | // (n, c, ph, pw) is an element in the aligned output 19 | // int n = index; 20 | // int pw = n % aligned_width; 21 | // n /= aligned_width; 22 | // int ph = n % aligned_height; 23 | // n /= aligned_height; 24 | // int c = n % channels; 25 | // n /= channels; 26 | 27 | int pw = index % aligned_width; 28 | int ph = (index / aligned_width) % aligned_height; 29 | int c = (index / aligned_width / aligned_height) % channels; 30 | int n = index / aligned_width / aligned_height / channels; 31 | 32 | // bottom_rois += n * 5; 33 | float roi_batch_ind = bottom_rois[n * 5 + 0]; 34 | float roi_start_w = bottom_rois[n * 5 + 1] * spatial_scale; 35 | float roi_start_h = bottom_rois[n * 5 + 2] * spatial_scale; 36 | float roi_end_w = bottom_rois[n * 5 + 3] * spatial_scale; 37 | float roi_end_h = bottom_rois[n * 5 + 4] * spatial_scale; 38 | 39 | // Force malformed ROIs to be 1x1 40 | float roi_width = fmaxf(roi_end_w - roi_start_w + 1., 0.); 41 | float roi_height = fmaxf(roi_end_h - roi_start_h + 1., 0.); 42 | float bin_size_h = roi_height / (aligned_height - 1.); 43 | float bin_size_w = roi_width / (aligned_width - 1.); 44 | 45 | float h = (float)(ph) * bin_size_h + roi_start_h; 46 | float w = (float)(pw) * bin_size_w + roi_start_w; 47 | 48 | int hstart = fminf(floor(h), height - 2); 49 | int wstart = fminf(floor(w), width - 2); 50 | 51 | int img_start = roi_batch_ind * channels * height * width; 52 | 53 | // bilinear interpolation 54 | if (h < 0 || h >= height || w < 0 || w >= width) { 55 | top_data[index] = 0.; 56 | } else { 57 | float h_ratio = h - (float)(hstart); 58 | float w_ratio = w - (float)(wstart); 59 | int upleft = img_start + (c * height + hstart) * width + wstart; 60 | int upright = upleft + 1; 61 | int downleft = upleft + width; 62 | int downright = downleft + 1; 63 | 64 | top_data[index] = bottom_data[upleft] * (1. - h_ratio) * (1. - w_ratio) 65 | + bottom_data[upright] * (1. - h_ratio) * w_ratio 66 | + bottom_data[downleft] * h_ratio * (1. 
- w_ratio) 67 | + bottom_data[downright] * h_ratio * w_ratio; 68 | } 69 | } 70 | } 71 | 72 | 73 | int ROIAlignForwardLaucher(const float* bottom_data, const float spatial_scale, const int num_rois, const int height, const int width, 74 | const int channels, const int aligned_height, const int aligned_width, const float* bottom_rois, float* top_data, cudaStream_t stream) { 75 | const int kThreadsPerBlock = 1024; 76 | const int output_size = num_rois * aligned_height * aligned_width * channels; 77 | cudaError_t err; 78 | 79 | 80 | ROIAlignForward<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock, kThreadsPerBlock, 0, stream>>>( 81 | output_size, bottom_data, spatial_scale, height, width, channels, 82 | aligned_height, aligned_width, bottom_rois, top_data); 83 | 84 | err = cudaGetLastError(); 85 | if(cudaSuccess != err) { 86 | fprintf( stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString( err ) ); 87 | exit( -1 ); 88 | } 89 | 90 | return 1; 91 | } 92 | 93 | 94 | __global__ void ROIAlignBackward(const int nthreads, const float* top_diff, const float spatial_scale, const int height, const int width, 95 | const int channels, const int aligned_height, const int aligned_width, float* bottom_diff, const float* bottom_rois) { 96 | CUDA_1D_KERNEL_LOOP(index, nthreads) { 97 | 98 | // (n, c, ph, pw) is an element in the aligned output 99 | int pw = index % aligned_width; 100 | int ph = (index / aligned_width) % aligned_height; 101 | int c = (index / aligned_width / aligned_height) % channels; 102 | int n = index / aligned_width / aligned_height / channels; 103 | 104 | float roi_batch_ind = bottom_rois[n * 5 + 0]; 105 | float roi_start_w = bottom_rois[n * 5 + 1] * spatial_scale; 106 | float roi_start_h = bottom_rois[n * 5 + 2] * spatial_scale; 107 | float roi_end_w = bottom_rois[n * 5 + 3] * spatial_scale; 108 | float roi_end_h = bottom_rois[n * 5 + 4] * spatial_scale; 109 | /* int roi_start_w = round(bottom_rois[1] * spatial_scale); */ 110 | /* int roi_start_h = round(bottom_rois[2] * spatial_scale); */ 111 | /* int roi_end_w = round(bottom_rois[3] * spatial_scale); */ 112 | /* int roi_end_h = round(bottom_rois[4] * spatial_scale); */ 113 | 114 | // Force malformed ROIs to be 1x1 115 | float roi_width = fmaxf(roi_end_w - roi_start_w + 1., 0.); 116 | float roi_height = fmaxf(roi_end_h - roi_start_h + 1., 0.); 117 | float bin_size_h = roi_height / (aligned_height - 1.); 118 | float bin_size_w = roi_width / (aligned_width - 1.); 119 | 120 | float h = (float)(ph) * bin_size_h + roi_start_h; 121 | float w = (float)(pw) * bin_size_w + roi_start_w; 122 | 123 | int hstart = fminf(floor(h), height - 2); 124 | int wstart = fminf(floor(w), width - 2); 125 | 126 | int img_start = roi_batch_ind * channels * height * width; 127 | 128 | // bilinear interpolation 129 | if (!(h < 0 || h >= height || w < 0 || w >= width)) { 130 | float h_ratio = h - (float)(hstart); 131 | float w_ratio = w - (float)(wstart); 132 | int upleft = img_start + (c * height + hstart) * width + wstart; 133 | int upright = upleft + 1; 134 | int downleft = upleft + width; 135 | int downright = downleft + 1; 136 | 137 | atomicAdd(bottom_diff + upleft, top_diff[index] * (1. - h_ratio) * (1 - w_ratio)); 138 | atomicAdd(bottom_diff + upright, top_diff[index] * (1. 
- h_ratio) * w_ratio); 139 | atomicAdd(bottom_diff + downleft, top_diff[index] * h_ratio * (1 - w_ratio)); 140 | atomicAdd(bottom_diff + downright, top_diff[index] * h_ratio * w_ratio); 141 | } 142 | } 143 | } 144 | 145 | int ROIAlignBackwardLaucher(const float* top_diff, const float spatial_scale, const int batch_size, const int num_rois, const int height, const int width, 146 | const int channels, const int aligned_height, const int aligned_width, const float* bottom_rois, float* bottom_diff, cudaStream_t stream) { 147 | const int kThreadsPerBlock = 1024; 148 | const int output_size = num_rois * aligned_height * aligned_width * channels; 149 | cudaError_t err; 150 | 151 | ROIAlignBackward<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock, kThreadsPerBlock, 0, stream>>>( 152 | output_size, top_diff, spatial_scale, height, width, channels, 153 | aligned_height, aligned_width, bottom_diff, bottom_rois); 154 | 155 | err = cudaGetLastError(); 156 | if(cudaSuccess != err) { 157 | fprintf( stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString( err ) ); 158 | exit( -1 ); 159 | } 160 | 161 | return 1; 162 | } 163 | 164 | 165 | #ifdef __cplusplus 166 | } 167 | #endif 168 | -------------------------------------------------------------------------------- /pooling/roi_align/src/roi_align_kernel.h: -------------------------------------------------------------------------------- 1 | #ifndef _ROI_ALIGN_KERNEL 2 | #define _ROI_ALIGN_KERNEL 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 8 | __global__ void ROIAlignForward(const int nthreads, const float* bottom_data, 9 | const float spatial_scale, const int height, const int width, 10 | const int channels, const int aligned_height, const int aligned_width, 11 | const float* bottom_rois, float* top_data); 12 | 13 | int ROIAlignForwardLaucher( 14 | const float* bottom_data, const float spatial_scale, const int num_rois, const int height, 15 | const int width, const int channels, const int aligned_height, 16 | const int aligned_width, const float* bottom_rois, 17 | float* top_data, cudaStream_t stream); 18 | 19 | __global__ void ROIAlignBackward(const int nthreads, const float* top_diff, 20 | const float spatial_scale, const int height, const int width, 21 | const int channels, const int aligned_height, const int aligned_width, 22 | float* bottom_diff, const float* bottom_rois); 23 | 24 | int ROIAlignBackwardLaucher(const float* top_diff, const float spatial_scale, const int batch_size, const int num_rois, 25 | const int height, const int width, const int channels, const int aligned_height, 26 | const int aligned_width, const float* bottom_rois, 27 | float* bottom_diff, cudaStream_t stream); 28 | 29 | #ifdef __cplusplus 30 | } 31 | #endif 32 | 33 | #endif 34 | 35 | -------------------------------------------------------------------------------- /prepro/prepro_det.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pdb 3 | import numpy as np 4 | import h5py 5 | 6 | dataset = 'coco' 7 | 8 | if dataset == 'coco': 9 | det_train = json.load(open('data/coco_noc/coco_detection_noc_train.json')) 10 | det_val = json.load(open('data/coco_noc/coco_detection_noc_val.json')) 11 | info = json.load(open('data/coco_noc/dic_coco.json')) 12 | 13 | det = [] 14 | for img in det_train: 15 | img['split'] = 'train2014' 16 | det.append(img) 17 | 18 | for img in det_val: 19 | img['split'] = 'val2014' 20 | det.append(img) 21 | elif dataset == 'flickr30k': 22 | det_file = 
json.load(open('data/flickr30k/flickr30k_detection.json')) 23 | info = json.load(open('data/flickr30k/dic_flickr30k.json')) 24 | det = [] 25 | for img in det_file: 26 | det.append(img) 27 | 28 | proposal_file = {} 29 | for img in det: 30 | proposal_file[img['image_id']] = img 31 | 32 | N = len(det) 33 | dets_labels = np.zeros((N, 100, 6)) 34 | dets_num = np.zeros((N)) 35 | nms_num = np.zeros((N)) 36 | 37 | for idx, img in enumerate(info['images']): 38 | image_id = img['id'] 39 | proposal = proposal_file[image_id] 40 | 41 | num_proposal = len(proposal['detection']) 42 | 43 | num_nms = proposal['num_boxes'] 44 | proposals = np.zeros([num_proposal, 6]) 45 | for i in range(num_proposal): 46 | proposals[i, :4] = proposal['detection'][i]['location'] 47 | proposals[i, 4] = proposal['detection'][i]['label'] 48 | proposals[i, 5] = proposal['detection'][i]['score'] 49 | 50 | dets_labels[idx,:num_proposal] = proposals 51 | dets_num[idx] = num_proposal 52 | nms_num[idx] = num_nms 53 | 54 | if dataset == 'coco': 55 | f = h5py.File('coco_noc_detection.h5', "w") 56 | elif dataset == 'flickr30k': 57 | f = h5py.File('flickr30k_detection.h5', "w") 58 | 59 | f.create_dataset("dets_labels", data=dets_labels) 60 | f.create_dataset("dets_num", data=dets_num) 61 | f.create_dataset("nms_num", data=nms_num) 62 | f.close() 63 | -------------------------------------------------------------------------------- /prepro/prepro_dic_coco.py: -------------------------------------------------------------------------------- 1 | """ 2 | Preprocess a raw json dataset into hdf5/json files for use in data_loader.lua 3 | 4 | Input: json file that has the form 5 | [{ file_path: 'path/img.jpg', captions: ['a caption', ...] }, ...] 6 | example element in this list would look like 7 | {'captions': [u'A man with a red helmet on a small moped on a dirt road. ', u'Man riding a motor bike on a dirt road on the countryside.', u'A man riding on the back of a motorcycle.', u'A dirt path with a young person on a motor bike rests to the foreground of a verdant area with a bridge and a background of cloud-wreathed mountains. ', u'A man in a red shirt and a red hat is on a motorcycle on a hill side.'], 'file_path': u'val2014/COCO_val2014_000000391895.jpg', 'id': 391895} 8 | 9 | This script reads this json, does some basic preprocessing on the captions 10 | (e.g. lowercase, etc.), creates a special UNK token, and encodes everything to arrays 11 | 12 | Output: a json file and an hdf5 file 13 | The hdf5 file contains several fields: 14 | /images is (N,3,256,256) uint8 array of raw image data in RGB format 15 | /labels is (M,max_length) uint32 array of encoded labels, zero padded 16 | /label_start_ix and /label_end_ix are (N,) uint32 arrays of pointers to the 17 | first and last indices (in range 1..M) of labels for each image 18 | /label_length stores the length of the sequence for each of the M sequences 19 | 20 | The json file has a dict that contains: 21 | - an 'ix_to_word' field storing the vocab in form {ix:'word'}, where ix is 1-indexed 22 | - an 'images' field that is a list holding auxiliary information for each image, 23 | such as in particular the 'split' it was assigned to. 24 | """ 25 | """ 26 | to get the prepro file for neural baby talk. we need 2 additional dictionaries. 27 | wtol: word to lemma, find the orignial form of the word. 28 | wtod: word to detection, find the detection label for the word. 
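(for example, wtol would map 'riding' to its lemma 'ride', and wtod maps every synonym listed on one line of data/coco/coco_class_name.txt to that line's detection label index, as built in main() below)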
29 | """ 30 | import os 31 | import json 32 | import argparse 33 | from random import shuffle, seed 34 | import string 35 | # non-standard dependencies: 36 | import h5py 37 | import numpy as np 38 | import torch 39 | import torchvision.models as models 40 | from torch.autograd import Variable 41 | import skimage.io 42 | import pdb 43 | from stanfordcorenlp import StanfordCoreNLP 44 | from nltk.tokenize import word_tokenize 45 | 46 | nlp = StanfordCoreNLP( 47 | os.path.join(os.path.dirname(os.path.realpath(__file__)), 48 | 'stanford-corenlp-full-2017-06-09'), memory='8g') 49 | props={'annotators': 'ssplit, tokenize, lemma','pipelineLanguage':'en', 'outputFormat':'json'} 50 | 51 | def build_vocab(imgs, params): 52 | count_thr = params['word_count_threshold'] 53 | 54 | # count up the number of words 55 | counts = {} 56 | for img in imgs: 57 | for sent in img['sentences']: 58 | # sent['tokens'] = word_tokenize(sent['raw'].lower()) 59 | for w in sent['tokens']: 60 | counts[w] = counts.get(w, 0) + 1 61 | cw = sorted([(count,w) for w,count in counts.items()], reverse=True) 62 | print('top words and their counts:') 63 | print('\n'.join(map(str,cw[:20]))) 64 | 65 | # print some stats 66 | total_words = sum(counts.values()) 67 | print('total words:', total_words) 68 | bad_words = [w for w,n in counts.items() if n <= count_thr] 69 | vocab = [w for w,n in counts.items() if n > count_thr] 70 | bad_count = sum(counts[w] for w in bad_words) 71 | print('number of bad words: %d/%d = %.2f%%' % (len(bad_words), len(counts), len(bad_words)*100.0/len(counts))) 72 | print('number of words in vocab would be %d' % (len(vocab), )) 73 | print('number of UNKs: %d/%d = %.2f%%' % (bad_count, total_words, bad_count*100.0/total_words)) 74 | 75 | # lets look at the distribution of lengths as well 76 | sent_lengths = {} 77 | for img in imgs: 78 | for sent in img['sentences']: 79 | txt = sent['tokens'] 80 | nw = len(txt) 81 | sent_lengths[nw] = sent_lengths.get(nw, 0) + 1 82 | max_len = max(sent_lengths.keys()) 83 | print('max length sentence in raw data: ', max_len) 84 | print('sentence length distribution (count, number of words):') 85 | sum_len = sum(sent_lengths.values()) 86 | for i in range(max_len+1): 87 | print('%2d: %10d %f%%' % (i, sent_lengths.get(i,0), sent_lengths.get(i,0)*100.0/sum_len)) 88 | 89 | # lets now produce the final annotations 90 | if bad_count > 0: 91 | # additional special UNK token we will use below to map infrequent words to 92 | print('inserting the special UNK token') 93 | vocab.append('UNK') 94 | 95 | imgs_new = [] 96 | for img in imgs: 97 | img['final_captions'] = [] 98 | for sent in img['sentences']: 99 | txt = sent['tokens'] 100 | caption = [w if counts.get(w,0) > count_thr else 'UNK' for w in txt] 101 | img['final_captions'].append(caption) 102 | imgs_new.append(img['final_captions']) 103 | 104 | return vocab, imgs_new 105 | 106 | def main(params): 107 | 108 | coco_class_all = [] 109 | coco_class_name = open('data/coco/coco_class_name.txt', 'r') 110 | for line in coco_class_name: 111 | coco_class = line.rstrip("\n").split(', ') 112 | coco_class_all.append(coco_class) 113 | 114 | # word to detection label 115 | wtod = {} 116 | for i in range(len(coco_class_all)): 117 | for w in coco_class_all[i]: 118 | wtod[w] = i 119 | 120 | imgs = json.load(open(params['input_json'], 'r')) 121 | imgs = imgs['images'] 122 | 123 | seed(123) # make reproducible 124 | 125 | # create the vocab 126 | vocab, imgs_new = build_vocab(imgs, params) 127 | itow = {i+1:w for i,w in enumerate(vocab)} # a 1-indexed 
vocab translation table 128 | wtoi = {w:i+1 for i,w in enumerate(vocab)} # inverse table 129 | 130 | wtol = {} 131 | for w in vocab: 132 | out = json.loads(nlp.annotate(w, properties=props)) 133 | lemma_w = out['sentences'][0]['tokens'][0]['lemma'] 134 | wtol[w] = lemma_w 135 | 136 | if params['split'] == 'robust': 137 | split_path = 'data/robust_coco/split_robust_coco.json' 138 | split_file = json.load(open(split_path, 'r')) 139 | split_map = {} 140 | split_map['train'] = {} 141 | split_map['val'] = {} 142 | split_map['test'] = {} 143 | 144 | for img in split_file['train_id']: 145 | split_map['train'][str(img['img_id'])] = 1 146 | for img in split_file['val_id']: 147 | split_map['val'][str(img['img_id'])] = 1 148 | for img in split_file['test_id']: 149 | split_map['test'][str(img['img_id'])] = 1 150 | 151 | elif params['split'] == 'noc': 152 | split_path = 'data/noc_coco/split_noc_coco.json' 153 | split_file = json.load(open(split_path, 'r')) 154 | split_map = {} 155 | split_map['train'] = {} 156 | split_map['val'] = {} 157 | split_map['test'] = {} 158 | 159 | for img in split_file['train']: 160 | split_map['train'][img] = 1 161 | for img in split_file['val']: 162 | split_map['val'][img] = 1 163 | for img in split_file['val_train']: 164 | split_map['val'][img] = 1 165 | for img in split_file['test']: 166 | split_map['test'][img] = 1 167 | for img in split_file['test_train']: 168 | split_map['test'][img] = 1 169 | 170 | # create output json file 171 | out = {} 172 | out['ix_to_word'] = itow # encode the (1-indexed) vocab 173 | out['wtod'] = wtod 174 | out['wtol'] = wtol 175 | out['images'] = [] 176 | count = 0 177 | for i,img in enumerate(imgs): 178 | jimg = {} 179 | 180 | if params['split'] == 'robust' or params['split'] == 'noc': 181 | img_id = str(img['cocoid']) 182 | if img_id in split_map['train']: 183 | jimg['split'] = 'train' 184 | elif img_id in split_map['val']: 185 | jimg['split'] = 'val' 186 | elif img_id in split_map['test']: 187 | jimg['split'] = 'test' 188 | else: 189 | jimg['split'] = 'rest' 190 | 191 | elif params['split'] == 'challenge': 192 | if img['split'] == 'val' and count < 1000: # we use 1000 image from val as validation, and the rest as train. 193 | jimg['split'] = img['split'] 194 | count += 1 195 | else: 196 | jimg['split'] = 'train' # put restrl into train. 197 | else: 198 | if img['split'] == 'val' or img['split'] == 'test': 199 | jimg['split'] = img['split'] 200 | else: 201 | jimg['split'] = 'train' # put restrl into train. 202 | 203 | if 'filename' in img: jimg['file_path'] = os.path.join(img['filepath'], img['filename']) # copy it over, might need 204 | if 'cocoid' in img: jimg['id'] = img['cocoid'] # copy over & mantain an id, if present (e.g. 
coco ids, useful) 205 | out['images'].append(jimg) 206 | 207 | json.dump(out, open(params['output_dic_json'], 'w')) 208 | print('wrote ', params['output_dic_json']) 209 | 210 | json.dump(imgs_new, open(params['output_cap_json'], 'w')) 211 | print('wrote ', params['output_cap_json']) 212 | 213 | if __name__ == "__main__": 214 | parser = argparse.ArgumentParser() 215 | 216 | # input json 217 | parser.add_argument('--input_json', default='data/coco/dataset_coco.json', help='input json file to process into hdf5') 218 | parser.add_argument('--split', default='normal', help='different split for different task.') 219 | 220 | parser.add_argument('--output_dic_json', default='data/coco_noc/dic_coco_noc_only.json', help='output json file') 221 | parser.add_argument('--output_cap_json', default='data/coco_noc/cap_coco_noc_only.json', help='output json file') 222 | 223 | # options 224 | parser.add_argument('--max_length', default=16, type=int, help='max length of a caption, in number of words. captions longer than this get clipped.') 225 | parser.add_argument('--word_count_threshold', default=5, type=int, help='only words that occur more than this number of times will be put in vocab') 226 | 227 | args = parser.parse_args() 228 | params = vars(args) # convert to ordinary dict 229 | print('parsed input parameters:') 230 | print(json.dumps(params, indent = 2)) 231 | main(params) 232 | -------------------------------------------------------------------------------- /prepro/prepro_dic_flickr.py: -------------------------------------------------------------------------------- 1 | """ 2 | Preprocess a raw json dataset into hdf5/json files for use in data_loader.lua 3 | 4 | Input: json file that has the form 5 | [{ file_path: 'path/img.jpg', captions: ['a caption', ...] }, ...] 6 | example element in this list would look like 7 | {'captions': [u'A man with a red helmet on a small moped on a dirt road. ', u'Man riding a motor bike on a dirt road on the countryside.', u'A man riding on the back of a motorcycle.', u'A dirt path with a young person on a motor bike rests to the foreground of a verdant area with a bridge and a background of cloud-wreathed mountains. ', u'A man in a red shirt and a red hat is on a motorcycle on a hill side.'], 'file_path': u'val2014/COCO_val2014_000000391895.jpg', 'id': 391895} 8 | 9 | This script reads this json, does some basic preprocessing on the captions 10 | (e.g. lowercase, etc.), creates a special UNK token, and encodes everything to arrays 11 | 12 | Output: a json file and an hdf5 file 13 | The hdf5 file contains several fields: 14 | /images is (N,3,256,256) uint8 array of raw image data in RGB format 15 | /labels is (M,max_length) uint32 array of encoded labels, zero padded 16 | /label_start_ix and /label_end_ix are (N,) uint32 arrays of pointers to the 17 | first and last indices (in range 1..M) of labels for each image 18 | /label_length stores the length of the sequence for each of the M sequences 19 | 20 | The json file has a dict that contains: 21 | - an 'ix_to_word' field storing the vocab in form {ix:'word'}, where ix is 1-indexed 22 | - an 'images' field that is a list holding auxiliary information for each image, 23 | such as in particular the 'split' it was assigned to. 24 | """ 25 | """ 26 | to get the prepro file for neural baby talk. we need 2 additional dictionaries. 27 | wtol: word to lemma, find the orignial form of the word. 28 | wtod: word to detection, find the detection label for the word. 
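(in this script wtod is built from the class list given by --input_class_name, by default data/flickr30k/flickr30k_class_name.txt, so words map to Flickr30k detection labels rather than COCO ones)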
29 | """ 30 | import os 31 | import json 32 | import argparse 33 | from random import shuffle, seed 34 | import string 35 | # non-standard dependencies: 36 | import h5py 37 | import numpy as np 38 | import torch 39 | import torchvision.models as models 40 | from torch.autograd import Variable 41 | import skimage.io 42 | import pdb 43 | from stanfordcorenlp import StanfordCoreNLP 44 | from nltk.tokenize import word_tokenize 45 | 46 | nlp = StanfordCoreNLP('../stanford-corenlp-full-2017-06-09', memory='8g') 47 | props={'annotators': 'ssplit, tokenize, lemma','pipelineLanguage':'en', 'outputFormat':'json'} 48 | 49 | def build_vocab(imgs, params): 50 | count_thr = params['word_count_threshold'] 51 | 52 | # count up the number of words 53 | counts = {} 54 | for img in imgs: 55 | for sent in img['captions']: 56 | sent['tokens'] = [w.lower() for w in sent['tokens']] 57 | for w in sent['tokens']: 58 | counts[w] = counts.get(w, 0) + 1 59 | cw = sorted([(count,w) for w,count in counts.items()], reverse=True) 60 | print('top words and their counts:') 61 | print('\n'.join(map(str,cw[:20]))) 62 | 63 | counts[''] = 0 64 | # print some stats 65 | total_words = sum(counts.values()) 66 | print('total words:', total_words) 67 | bad_words = [w for w,n in counts.items() if n <= count_thr] 68 | vocab = [w for w,n in counts.items() if n > count_thr] 69 | bad_count = sum(counts[w] for w in bad_words) 70 | print('number of bad words: %d/%d = %.2f%%' % (len(bad_words), len(counts), len(bad_words)*100.0/len(counts))) 71 | print('number of words in vocab would be %d' % (len(vocab), )) 72 | print('number of UNKs: %d/%d = %.2f%%' % (bad_count, total_words, bad_count*100.0/total_words)) 73 | 74 | # lets look at the distribution of lengths as well 75 | sent_lengths = {} 76 | for img in imgs: 77 | for sent in img['captions']: 78 | txt = sent['tokens'] 79 | nw = len(txt) 80 | sent_lengths[nw] = sent_lengths.get(nw, 0) + 1 81 | max_len = max(sent_lengths.keys()) 82 | print('max length sentence in raw data: ', max_len) 83 | print('sentence length distribution (count, number of words):') 84 | sum_len = sum(sent_lengths.values()) 85 | for i in range(max_len+1): 86 | print('%2d: %10d %f%%' % (i, sent_lengths.get(i,0), sent_lengths.get(i,0)*100.0/sum_len)) 87 | 88 | # lets now produce the final annotations 89 | if bad_count > 0: 90 | # additional special UNK token we will use below to map infrequent words to 91 | print('inserting the special UNK token') 92 | vocab.append('UNK') 93 | 94 | imgs_new = [] 95 | for img in imgs: 96 | img['final_captions'] = [] 97 | for sent in img['captions']: 98 | txt = sent['tokens'] 99 | clss = sent['process_clss'] 100 | bbox = sent['process_bnd_box'] 101 | idx = sent['process_idx'] 102 | caption = [w if counts.get(w,0) > count_thr else 'UNK' for w in txt] 103 | img['final_captions'].append({'caption':caption, 'clss':clss, 'bbox':bbox, 'idx':idx}) 104 | imgs_new.append(img['final_captions']) 105 | 106 | return vocab, imgs_new 107 | 108 | def main(params): 109 | 110 | coco_class_all = [] 111 | coco_class_name = open(params['input_class_name'], 'r') 112 | for line in coco_class_name: 113 | coco_class = line.rstrip("\n").split(', ') 114 | coco_class_all.append(coco_class) 115 | 116 | # word to detection label 117 | wtod = {} 118 | for i in range(len(coco_class_all)): 119 | for w in coco_class_all[i]: 120 | wtod[w] = i 121 | 122 | imgs_split = json.load(open(params['input_json'], 'r')) 123 | imgs_split = imgs_split['images'] 124 | 125 | split = {} 126 | for img in imgs_split: 127 | 
split[img['filename'].split('.')[0]] = img['split'] 128 | 129 | imgs_processed = json.load(open('data/flickr30k/flickr30k_cleaned_class.json', 'r')) 130 | imgs_processed = imgs_processed['annotations'] 131 | 132 | for img in imgs_processed: 133 | if str(img['image_id']) in split: 134 | img['split'] = split[str(img['image_id'])] 135 | else: 136 | img['split'] = 'rest' 137 | seed(123) # make reproducible 138 | 139 | # create the vocab 140 | vocab, imgs_new = build_vocab(imgs_processed, params) 141 | itow = {i+1:w for i,w in enumerate(vocab)} # a 1-indexed vocab translation table 142 | wtoi = {w:i+1 for i,w in enumerate(vocab)} # inverse table 143 | 144 | wtol = {} 145 | for w in vocab: 146 | out = json.loads(nlp.annotate(w.encode('utf-8'), properties=props)) 147 | lemma_w = out['sentences'][0]['tokens'][0]['lemma'] 148 | wtol[w] = lemma_w 149 | 150 | # create output json file 151 | out = {} 152 | out['ix_to_word'] = itow # encode the (1-indexed) vocab 153 | out['wtod'] = wtod 154 | out['wtol'] = wtol 155 | out['images'] = [] 156 | for i,img in enumerate(imgs_processed): 157 | jimg = {} 158 | jimg['split'] = img['split'] 159 | if params['dataset'] == 'flickr30k': 160 | if 'image_id' in img: jimg['file_path'] = str(img['image_id']) + '.jpg' # copy it over, might need 161 | if 'image_id' in img: jimg['id'] = img['image_id'] # copy over & mantain an id, if present (e.g. coco ids, useful) 162 | elif params['dataset'] == 'coco': 163 | if 'filename' in img: jimg['file_path'] = os.path.join(img['filepath'], img['filename']) # copy it over, might need 164 | if 'cocoid' in img: jimg['id'] = img['cocoid'] # copy over & mantain an id, if present (e.g. coco ids, useful) 165 | out['images'].append(jimg) 166 | 167 | json.dump(out, open(params['outpu_dic_json'], 'w')) 168 | print('wrote ', params['outpu_dic_json']) 169 | 170 | json.dump(imgs_new, open(params['output_cap_json'], 'w')) 171 | print('wrote ', params['output_cap_json']) 172 | 173 | if __name__ == "__main__": 174 | parser = argparse.ArgumentParser() 175 | 176 | # input json 177 | parser.add_argument('--dataset', default='flickr30k', help='dataset') 178 | parser.add_argument('--input_json', default='data/flickr30k/dataset_flickr30k.json', help='input json file to process into hdf5') 179 | parser.add_argument('--input_class_name', default='data/flickr30k/flickr30k_class_name.txt',help='class name') 180 | parser.add_argument('--outpu_dic_json', default='data/flickr30k/dic_flickr30k.json', help='output json file') 181 | parser.add_argument('--output_cap_json', default='data/flickr30k/cap_flickr30k.json', help='output json file') 182 | 183 | # options 184 | parser.add_argument('--max_length', default=20, type=int, help='max length of a caption, in number of words. captions longer than this get clipped.') 185 | parser.add_argument('--word_count_threshold', default=3, type=int, help='only words that occur more than this number of times will be put in vocab') 186 | 187 | args = parser.parse_args() 188 | params = vars(args) # convert to ordinary dict 189 | print('parsed input parameters:') 190 | print(json.dumps(params, indent = 2)) 191 | main(params) 192 | -------------------------------------------------------------------------------- /prepro/prepro_ngrams.py: -------------------------------------------------------------------------------- 1 | """ 2 | Preprocess a raw json dataset into hdf5/json files for use in data_loader.lua 3 | 4 | Input: json file that has the form 5 | [{ file_path: 'path/img.jpg', captions: ['a caption', ...] }, ...] 
6 | example element in this list would look like 7 | {'captions': [u'A man with a red helmet on a small moped on a dirt road. ', u'Man riding a motor bike on a dirt road on the countryside.', u'A man riding on the back of a motorcycle.', u'A dirt path with a young person on a motor bike rests to the foreground of a verdant area with a bridge and a background of cloud-wreathed mountains. ', u'A man in a red shirt and a red hat is on a motorcycle on a hill side.'], 'file_path': u'val2014/COCO_val2014_000000391895.jpg', 'id': 391895} 8 | 9 | This script reads this json, does some basic preprocessing on the captions 10 | (e.g. lowercase, etc.), creates a special UNK token, and encodes everything to arrays 11 | 12 | Output: a json file and an hdf5 file 13 | The hdf5 file contains several fields: 14 | /images is (N,3,256,256) uint8 array of raw image data in RGB format 15 | /labels is (M,max_length) uint32 array of encoded labels, zero padded 16 | /label_start_ix and /label_end_ix are (N,) uint32 arrays of pointers to the 17 | first and last indices (in range 1..M) of labels for each image 18 | /label_length stores the length of the sequence for each of the M sequences 19 | 20 | The json file has a dict that contains: 21 | - an 'ix_to_word' field storing the vocab in form {ix:'word'}, where ix is 1-indexed 22 | - an 'images' field that is a list holding auxiliary information for each image, 23 | such as in particular the 'split' it was assigned to. 24 | """ 25 | 26 | import os 27 | import json 28 | import argparse 29 | from six.moves import cPickle, xrange 30 | from collections import defaultdict 31 | import pdb 32 | 33 | def precook(s, n=4, out=False): 34 | """ 35 | Takes a string as input and returns an object that can be given to 36 | either cook_refs or cook_test. This is optional: cook_refs and cook_test 37 | can take string arguments as well. 38 | :param s: string : sentence to be converted into ngrams 39 | :param n: int : number of ngrams for which representation is calculated 40 | :return: term frequency vector for occuring ngrams 41 | """ 42 | words = s.split() 43 | counts = defaultdict(int) 44 | for k in xrange(1,n+1): 45 | for i in xrange(len(words)-k+1): 46 | ngram = tuple(words[i:i+k]) 47 | counts[ngram] += 1 48 | return counts 49 | 50 | def cook_refs(refs, n=4): ## lhuang: oracle will call with "average" 51 | '''Takes a list of reference sentences for a single segment 52 | and returns an object that encapsulates everything that BLEU 53 | needs to know about them. 54 | :param refs: list of string : reference sentences for some image 55 | :param n: int : number of ngrams for which (ngram) representation is calculated 56 | :return: result (list of dict) 57 | ''' 58 | return [precook(ref, n) for ref in refs] 59 | 60 | def create_crefs(refs): 61 | crefs = [] 62 | for ref in refs: 63 | # ref is a list of 5 captions 64 | crefs.append(cook_refs(ref)) 65 | return crefs 66 | 67 | def compute_doc_freq(crefs): 68 | ''' 69 | Compute term frequency for reference data. 
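Each n-gram is counted once per image whose reference set contains it, so an n-gram appearing in the references of 3 different images gets a document frequency of 3 even if it repeats within a single image's references.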
70 | This will be used to compute idf (inverse document frequency later) 71 | The term frequency is stored in the object 72 | :return: None 73 | ''' 74 | document_frequency = defaultdict(float) 75 | for refs in crefs: 76 | # refs, k ref captions of one image 77 | for ngram in set([ngram for ref in refs for (ngram,count) in ref.iteritems()]): 78 | document_frequency[ngram] += 1 79 | # maxcounts[ngram] = max(maxcounts.get(ngram,0), count) 80 | return document_frequency 81 | 82 | def build_dict(imgs, wtoi, params): 83 | wtoi[''] = 0 84 | 85 | count_imgs = 0 86 | refs_words = [] 87 | refs_idxs = [] 88 | for img in imgs: 89 | if (params['split'] == img['split']) or \ 90 | (params['split'] == 'train' and img['split'] == 'restval') or \ 91 | (params['split'] == 'all'): 92 | #(params['split'] == 'val' and img['split'] == 'restval') or \ 93 | ref_words = [] 94 | ref_idxs = [] 95 | for sent in img['sentences']: 96 | tmp_tokens = sent['tokens'] + [''] 97 | tmp_tokens = [_ if _ in wtoi else 'UNK' for _ in tmp_tokens] 98 | ref_words.append(' '.join(tmp_tokens)) 99 | ref_idxs.append(' '.join([str(wtoi[_]) for _ in tmp_tokens])) 100 | 101 | refs_words.append(ref_words) 102 | refs_idxs.append(ref_idxs) 103 | count_imgs += 1 104 | print('total imgs:', count_imgs) 105 | 106 | ngram_words = compute_doc_freq(create_crefs(refs_words)) 107 | ngram_idxs = compute_doc_freq(create_crefs(refs_idxs)) 108 | return ngram_words, ngram_idxs, count_imgs 109 | 110 | def main(params): 111 | 112 | imgs = json.load(open(params['input_json'], 'r')) 113 | itow = json.load(open(params['dict_json'], 'r'))['ix_to_word'] 114 | wtoi = {w:i for i,w in itow.items()} 115 | imgs = imgs['images'] 116 | 117 | ngram_words, ngram_idxs, ref_len = build_dict(imgs, wtoi, params) 118 | cPickle.dump({'document_frequency': ngram_words, 'ref_len': ref_len}, open(params['output_pkl']+'-words.p','w'), protocol=cPickle.HIGHEST_PROTOCOL) 119 | cPickle.dump({'document_frequency': ngram_idxs, 'ref_len': ref_len}, open(params['output_pkl']+'-idxs.p','w'), protocol=cPickle.HIGHEST_PROTOCOL) 120 | 121 | if __name__ == "__main__": 122 | 123 | parser = argparse.ArgumentParser() 124 | # input json 125 | parser.add_argument('--input_json', default='data/flickr30k/dataset_flickr30k.json', help='input json file to process into hdf5') 126 | parser.add_argument('--dict_json', default='data/flickr30k/dic_flickr30k.json', help='output json file') 127 | parser.add_argument('--output_pkl', default='data/flickr30k-train', help='output pickle file') 128 | parser.add_argument('--split', default='train', help='test, val, train, all') 129 | args = parser.parse_args() 130 | params = vars(args) # convert to ordinary dict 131 | 132 | main(params) 133 | -------------------------------------------------------------------------------- /prepro/prepro_ngrams_bak.py: -------------------------------------------------------------------------------- 1 | """ 2 | Preprocess a raw json dataset into hdf5/json files for use in data_loader.lua 3 | 4 | Input: json file that has the form 5 | [{ file_path: 'path/img.jpg', captions: ['a caption', ...] }, ...] 6 | example element in this list would look like 7 | {'captions': [u'A man with a red helmet on a small moped on a dirt road. ', u'Man riding a motor bike on a dirt road on the countryside.', u'A man riding on the back of a motorcycle.', u'A dirt path with a young person on a motor bike rests to the foreground of a verdant area with a bridge and a background of cloud-wreathed mountains. 
', u'A man in a red shirt and a red hat is on a motorcycle on a hill side.'], 'file_path': u'val2014/COCO_val2014_000000391895.jpg', 'id': 391895} 8 | 9 | This script reads this json, does some basic preprocessing on the captions 10 | (e.g. lowercase, etc.), creates a special UNK token, and encodes everything to arrays 11 | 12 | Output: a json file and an hdf5 file 13 | The hdf5 file contains several fields: 14 | /images is (N,3,256,256) uint8 array of raw image data in RGB format 15 | /labels is (M,max_length) uint32 array of encoded labels, zero padded 16 | /label_start_ix and /label_end_ix are (N,) uint32 arrays of pointers to the 17 | first and last indices (in range 1..M) of labels for each image 18 | /label_length stores the length of the sequence for each of the M sequences 19 | 20 | The json file has a dict that contains: 21 | - an 'ix_to_word' field storing the vocab in form {ix:'word'}, where ix is 1-indexed 22 | - an 'images' field that is a list holding auxiliary information for each image, 23 | such as in particular the 'split' it was assigned to. 24 | """ 25 | import sys 26 | import os 27 | sys.path.append(os.getcwd()) 28 | 29 | import json 30 | import argparse 31 | from six.moves import cPickle, xrange 32 | from collections import defaultdict 33 | from pycocotools.coco import COCO 34 | import numpy as np 35 | import copy 36 | import pdb 37 | 38 | def precook(s, n=4, out=False): 39 | """ 40 | Takes a string as input and returns an object that can be given to 41 | either cook_refs or cook_test. This is optional: cook_refs and cook_test 42 | can take string arguments as well. 43 | :param s: string : sentence to be converted into ngrams 44 | :param n: int : number of ngrams for which representation is calculated 45 | :return: term frequency vector for occuring ngrams 46 | """ 47 | words = s.split() 48 | counts = defaultdict(int) 49 | for k in xrange(1,n+1): 50 | for i in xrange(len(words)-k+1): 51 | ngram = tuple(words[i:i+k]) 52 | counts[ngram] += 1 53 | return counts 54 | 55 | def cook_refs(refs, n=4): ## lhuang: oracle will call with "average" 56 | '''Takes a list of reference sentences for a single segment 57 | and returns an object that encapsulates everything that BLEU 58 | needs to know about them. 59 | :param refs: list of string : reference sentences for some image 60 | :param n: int : number of ngrams for which (ngram) representation is calculated 61 | :return: result (list of dict) 62 | ''' 63 | return [precook(ref, n) for ref in refs] 64 | 65 | def create_crefs(refs): 66 | crefs = [] 67 | for ref in refs: 68 | # ref is a list of 5 captions 69 | crefs.append(cook_refs(ref)) 70 | return crefs 71 | 72 | def compute_doc_freq(crefs): 73 | ''' 74 | Compute term frequency for reference data. 
75 | This will be used to compute idf (inverse document frequency later) 76 | The term frequency is stored in the object 77 | :return: None 78 | ''' 79 | document_frequency = defaultdict(float) 80 | for refs in crefs: 81 | # refs, k ref captions of one image 82 | for ngram in set([ngram for ref in refs for (ngram,count) in ref.iteritems()]): 83 | document_frequency[ngram] += 1 84 | # maxcounts[ngram] = max(maxcounts.get(ngram,0), count) 85 | return document_frequency 86 | 87 | def build_dict(imgs, info, wtoi, wtod, dtoi, wtol, ctol, coco_det_train, coco_det_val, params): 88 | vocab_size = len(wtoi)+1 89 | seq_length = 16 90 | wtoi[''] = 0 91 | wtol[''] = '' 92 | count_imgs = 0 93 | 94 | refs_words = [] 95 | refs_idxs = [] 96 | for idx, img in enumerate(imgs): 97 | image_id = info['images'][idx]['id'] 98 | # image_id = img['cocoid'] 99 | file_path = info['images'][idx]['file_path'].split('/')[0] 100 | 101 | if file_path == 'train2014': 102 | coco = coco_det_train 103 | else: 104 | coco = coco_det_val 105 | 106 | bbox_ann_ids = coco.getAnnIds(imgIds=image_id) 107 | bbox_ann = [{'label': ctol[i['category_id']], 'bbox': i['bbox']} for i in coco.loadAnns(bbox_ann_ids)] 108 | 109 | if (params['split'] == info['images'][idx]['split']) or \ 110 | (params['split'] == 'train' and info['images'][idx]['split'] == 'restval') or \ 111 | (params['split'] == 'all'): 112 | #(params['split'] == 'val' and img['split'] == 'restval') or \ 113 | ref_words = [] 114 | ref_idxs = [] 115 | 116 | captions = [] 117 | for sent in img: 118 | captions.append(sent + ['']) 119 | det_indicator = get_det_word(bbox_ann, captions, wtoi, wtod, dtoi, wtol) 120 | 121 | ncap = len(captions) # number of captions available for this image 122 | for i, caption in enumerate(captions): 123 | tmp_tokens = [] 124 | j = 0 125 | k = 0 126 | while j < len(caption): 127 | is_det = False 128 | for n in range(2, 0, -1): 129 | if det_indicator[n][i][j][0] != 0: 130 | tmp_tokens.append(vocab_size + det_indicator[n][i][j][2] * 2 + det_indicator[n][i][j][1]) 131 | is_det = True 132 | j += n # skip the ngram. 133 | break 134 | if is_det == False: 135 | tmp_tokens.append(wtoi[caption[j]]) 136 | j += 1 137 | k += 1 138 | ref_idxs.append(' '.join([str(int(_)) for _ in tmp_tokens])) 139 | # refs_words.append(ref_words) 140 | refs_idxs.append(ref_idxs) 141 | count_imgs += 1 142 | 143 | print('total imgs:', count_imgs) 144 | 145 | # ngram_words = compute_doc_freq(create_crefs(refs_words)) 146 | ngram_idxs = compute_doc_freq(create_crefs(refs_idxs)) 147 | return ngram_idxs, count_imgs 148 | 149 | 150 | def get_det_word(bbox_ann, captions, wtoi, wtod, dtoi, wtol, ngram=2): 151 | 152 | # get the present category. 153 | pcats = [box['label'] for box in bbox_ann] 154 | 155 | # get the orginial form of the caption. 156 | indicator = [] 157 | stem_caption = [] 158 | for s in captions: 159 | tmp = [] 160 | for w in s: 161 | tmp.append(wtol[w]) 162 | stem_caption.append(tmp) 163 | indicator.append([(0, 0, 0)]*len(s)) # category class, binary class, fine-grain class. 164 | 165 | ngram_indicator = {i+1:copy.deepcopy(indicator) for i in range(ngram)} 166 | # get the 2 gram of the caption. 167 | for n in range(ngram,0,-1): 168 | for i, s in enumerate(stem_caption): 169 | for j in xrange(len(s)-n+1): 170 | ng = ' '.join(s[j:j+n]) 171 | # if the n-gram exist in word_to_detection dictionary. 172 | if ng in wtod and indicator[i][j][0] == 0 and wtod[ng] in pcats: # make sure that larger gram not overwright with lower gram. 
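# bn is 1 when the caption words already equal the lemmatized n-gram and 2 when they only match after lemmatization; fg is the fine-grained detection index from dtoi.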
173 | bn = (ng != ' '.join(captions[i][j:j+n])) + 1 174 | fg = dtoi[ng] 175 | ngram_indicator[n][i][j] = (wtod[ng], bn, fg) 176 | indicator[i][j:j+n] = [(wtod[ng], bn, fg)] * n 177 | 178 | return ngram_indicator 179 | 180 | def main(params): 181 | 182 | det_train_path = 'data/coco/annotations/instances_train2014.json' 183 | det_val_path = 'data/coco/annotations/instances_val2014.json' 184 | 185 | coco_det_train = COCO(det_train_path) 186 | coco_det_val = COCO(det_val_path) 187 | 188 | info = json.load(open(params['dict_json'], 'r')) 189 | imgs = json.load(open(params['input_json'], 'r')) 190 | 191 | itow = info['ix_to_word'] 192 | wtoi = {w:i for i,w in itow.items()} 193 | wtod = {w:i+1 for w,i in info['wtod'].items()} # word to detection 194 | dtoi = {w:i+1 for i,w in enumerate(wtod.keys())} # detection to index 195 | wtol = info['wtol'] 196 | ctol = {c:i+1 for i, c in enumerate(coco_det_train.cats.keys())} 197 | 198 | # imgs = imgs['images'] 199 | 200 | ngram_idxs, ref_len = build_dict(imgs, info, wtoi, wtod, dtoi, wtol, ctol, coco_det_train, coco_det_val, params) 201 | 202 | # cPickle.dump({'document_frequency': ngram_words, 'ref_len': ref_len}, open(params['output_pkl']+'-words.p','w'), protocol=cPickle.HIGHEST_PROTOCOL) 203 | cPickle.dump({'document_frequency': ngram_idxs, 'ref_len': ref_len}, open(params['output_pkl']+'-idxs.p','w'), protocol=cPickle.HIGHEST_PROTOCOL) 204 | 205 | if __name__ == "__main__": 206 | 207 | parser = argparse.ArgumentParser() 208 | 209 | # input json 210 | parser.add_argument('--input_json', default='data/coco/cap_coco.json', help='input json file to process into hdf5') 211 | parser.add_argument('--dict_json', default='data/coco/dic_coco.json', help='output json file') 212 | parser.add_argument('--output_pkl', default='data/coco-train', help='output pickle file') 213 | parser.add_argument('--split', default='train', help='test, val, train, all') 214 | args = parser.parse_args() 215 | params = vars(args) # convert to ordinary dict 216 | 217 | main(params) 218 | -------------------------------------------------------------------------------- /prepro/prepro_ngrams_flickr30k.py: -------------------------------------------------------------------------------- 1 | """ 2 | Preprocess a raw json dataset into hdf5/json files for use in data_loader.lua 3 | 4 | Input: json file that has the form 5 | [{ file_path: 'path/img.jpg', captions: ['a caption', ...] }, ...] 6 | example element in this list would look like 7 | {'captions': [u'A man with a red helmet on a small moped on a dirt road. ', u'Man riding a motor bike on a dirt road on the countryside.', u'A man riding on the back of a motorcycle.', u'A dirt path with a young person on a motor bike rests to the foreground of a verdant area with a bridge and a background of cloud-wreathed mountains. ', u'A man in a red shirt and a red hat is on a motorcycle on a hill side.'], 'file_path': u'val2014/COCO_val2014_000000391895.jpg', 'id': 391895} 8 | 9 | This script reads this json, does some basic preprocessing on the captions 10 | (e.g. 
lowercase, etc.), creates a special UNK token, and encodes everything to arrays 11 | 12 | Output: a json file and an hdf5 file 13 | The hdf5 file contains several fields: 14 | /images is (N,3,256,256) uint8 array of raw image data in RGB format 15 | /labels is (M,max_length) uint32 array of encoded labels, zero padded 16 | /label_start_ix and /label_end_ix are (N,) uint32 arrays of pointers to the 17 | first and last indices (in range 1..M) of labels for each image 18 | /label_length stores the length of the sequence for each of the M sequences 19 | 20 | The json file has a dict that contains: 21 | - an 'ix_to_word' field storing the vocab in form {ix:'word'}, where ix is 1-indexed 22 | - an 'images' field that is a list holding auxiliary information for each image, 23 | such as in particular the 'split' it was assigned to. 24 | """ 25 | import sys 26 | import os 27 | sys.path.append(os.getcwd()) 28 | 29 | import json 30 | import argparse 31 | from six.moves import cPickle, xrange 32 | from collections import defaultdict 33 | from pycocotools.coco import COCO 34 | import numpy as np 35 | import copy 36 | import pdb 37 | 38 | def precook(s, n=4, out=False): 39 | """ 40 | Takes a string as input and returns an object that can be given to 41 | either cook_refs or cook_test. This is optional: cook_refs and cook_test 42 | can take string arguments as well. 43 | :param s: string : sentence to be converted into ngrams 44 | :param n: int : number of ngrams for which representation is calculated 45 | :return: term frequency vector for occuring ngrams 46 | """ 47 | words = s.split() 48 | counts = defaultdict(int) 49 | for k in xrange(1,n+1): 50 | for i in xrange(len(words)-k+1): 51 | ngram = tuple(words[i:i+k]) 52 | counts[ngram] += 1 53 | return counts 54 | 55 | def cook_refs(refs, n=4): ## lhuang: oracle will call with "average" 56 | '''Takes a list of reference sentences for a single segment 57 | and returns an object that encapsulates everything that BLEU 58 | needs to know about them. 59 | :param refs: list of string : reference sentences for some image 60 | :param n: int : number of ngrams for which (ngram) representation is calculated 61 | :return: result (list of dict) 62 | ''' 63 | return [precook(ref, n) for ref in refs] 64 | 65 | def create_crefs(refs): 66 | crefs = [] 67 | for ref in refs: 68 | # ref is a list of 5 captions 69 | crefs.append(cook_refs(ref)) 70 | return crefs 71 | 72 | def compute_doc_freq(crefs): 73 | ''' 74 | Compute term frequency for reference data. 
75 | This will be used to compute idf (inverse document frequency later) 76 | The term frequency is stored in the object 77 | :return: None 78 | ''' 79 | document_frequency = defaultdict(float) 80 | for refs in crefs: 81 | # refs, k ref captions of one image 82 | for ngram in set([ngram for ref in refs for (ngram,count) in ref.iteritems()]): 83 | document_frequency[ngram] += 1 84 | # maxcounts[ngram] = max(maxcounts.get(ngram,0), count) 85 | return document_frequency 86 | 87 | def build_dict(imgs, info, wtoi, wtod, dtoi, wtol, itod, params): 88 | vocab_size = len(wtoi)+1 89 | seq_length = 16 90 | wtoi[''] = 0 91 | wtol[''] = '' 92 | count_imgs = 0 93 | 94 | refs_words = [] 95 | refs_idxs = [] 96 | for idx, img in enumerate(imgs): 97 | image_id = info['images'][idx]['id'] 98 | # image_id = img['cocoid'] 99 | file_path = info['images'][idx]['file_path'].split('/')[0] 100 | bbox_ann = [] 101 | bbox_idx = 0 102 | for sent in img: 103 | sent['bbox_idx'] = [] 104 | for i, box in enumerate(sent['bbox']): 105 | sent['bbox_idx'].append(bbox_idx) 106 | bbox_ann.append({'bbox':box, 'label': dtoi[sent['clss'][i]], 'bbox_idx':bbox_idx}) 107 | bbox_idx += 1 108 | gt_bboxs = np.zeros((len(bbox_ann), 6)) 109 | for i, bbox in enumerate(bbox_ann): 110 | gt_bboxs[i, :4] = bbox['bbox'] 111 | gt_bboxs[i, 4] = bbox['label'] 112 | gt_bboxs[i, 5] = bbox['bbox_idx'] 113 | 114 | if (params['split'] == info['images'][idx]['split']) or \ 115 | (params['split'] == 'train' and info['images'][idx]['split'] == 'restval') or \ 116 | (params['split'] == 'all'): 117 | #(params['split'] == 'val' and img['split'] == 'restval') or \ 118 | ref_words = [] 119 | ref_idxs = [] 120 | 121 | captions = [] 122 | for sent in img: 123 | sent['caption'] = sent['caption'] + [''] 124 | sent['caption'] = [_ if _ in wtoi else 'UNK' for _ in sent['caption']] 125 | captions.append(sent) 126 | 127 | det_indicator = get_det_word(gt_bboxs, captions, wtod, dtoi) 128 | 129 | ncap = len(captions) # number of captions available for this image 130 | for i, caption in enumerate(captions): 131 | tmp_tokens = [] 132 | j = 0 133 | while j < len(caption['caption']): 134 | if det_indicator[i][j][0] != 0: 135 | tmp_tokens.append(vocab_size + det_indicator[i][j][2] * 2 + det_indicator[i][j][1]-1) 136 | else: 137 | tmp_tokens.append(wtoi[caption['caption'][j]]) 138 | j += 1 139 | ref_idxs.append(' '.join([str(int(_)) for _ in tmp_tokens])) 140 | # refs_words.append(ref_words) 141 | refs_idxs.append(ref_idxs) 142 | count_imgs += 1 143 | 144 | print('total imgs:', count_imgs) 145 | 146 | # ngram_words = compute_doc_freq(create_crefs(refs_words)) 147 | ngram_idxs = compute_doc_freq(create_crefs(refs_idxs)) 148 | return ngram_idxs, count_imgs 149 | 150 | 151 | def get_det_word(gt_bboxs, captions, wtod, dtoi): 152 | 153 | # get the present category. 154 | pcats = [] 155 | for i in range(gt_bboxs.shape[0]): 156 | pcats.append(gt_bboxs[i,5]) 157 | # get the orginial form of the caption. 158 | indicator = [] 159 | for i, sent in enumerate(captions): 160 | indicator.append([(0, 0, 0)]*len(sent['caption'])) # category class, binary class, fine-grain class. 161 | for j, bbox_idx in enumerate(sent['bbox_idx']): 162 | # if the bbox_idx is not filtered out. 
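# pcats holds the bbox indices kept in gt_bboxs; a grounded word gets (detection class, form flag, fine-grained class), where the form flag is 1 if the caption word equals the class name and 2 otherwise.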
163 | if bbox_idx in pcats: 164 | w_idx = sent['idx'][j] 165 | ng = sent['clss'][j] 166 | bn = (ng != sent['caption'][w_idx]) + 1 167 | fg = dtoi[ng] 168 | indicator[i][w_idx] = (wtod[sent['clss'][j]], bn, fg) 169 | 170 | return indicator 171 | 172 | def main(params): 173 | 174 | info = json.load(open(params['dict_json'], 'r')) 175 | imgs = json.load(open(params['input_json'], 'r')) 176 | 177 | itow = info['ix_to_word'] 178 | wtoi = {w:i for i,w in itow.items()} 179 | wtod = {w:i+1 for w,i in info['wtod'].items()} # word to detection 180 | # dtoi = {w:i+1 for i,w in enumerate(wtod.keys())} # detection to index 181 | dtoi = wtod 182 | wtol = info['wtol'] 183 | itod = {i:w for w,i in dtoi.items()} 184 | 185 | # imgs = imgs['images'] 186 | 187 | ngram_idxs, ref_len = build_dict(imgs, info, wtoi, wtod, dtoi, wtol, itod, params) 188 | 189 | # cPickle.dump({'document_frequency': ngram_words, 'ref_len': ref_len}, open(params['output_pkl']+'-words.p','w'), protocol=cPickle.HIGHEST_PROTOCOL) 190 | cPickle.dump({'document_frequency': ngram_idxs, 'ref_len': ref_len}, open(params['output_pkl']+'-idxs.p','w'), protocol=cPickle.HIGHEST_PROTOCOL) 191 | 192 | if __name__ == "__main__": 193 | 194 | parser = argparse.ArgumentParser() 195 | 196 | # input json 197 | parser.add_argument('--input_json', default='data/flickr30k/cap_flickr30k.json', help='input json file to process into hdf5') 198 | parser.add_argument('--dict_json', default='data/flickr30k/dic_flickr30k.json', help='output json file') 199 | parser.add_argument('--output_pkl', default='data/flickr30k-train', help='output pickle file') 200 | parser.add_argument('--split', default='train', help='test, val, train, all') 201 | args = parser.parse_args() 202 | params = vars(args) # convert to ordinary dict 203 | 204 | main(params) 205 | -------------------------------------------------------------------------------- /tools/pycider/PyDataFormat/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'rama' 2 | -------------------------------------------------------------------------------- /tools/pycider/PyDataFormat/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/tools/pycider/PyDataFormat/__init__.pyc -------------------------------------------------------------------------------- /tools/pycider/PyDataFormat/jsonify_refs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Code to convert mat file with structures into json files 3 | Created on : 5/18/15 3:27 PM by rama 4 | """ 5 | 6 | import scipy.io as io 7 | import os 8 | import re 9 | import json 10 | import string 11 | import pdb 12 | 13 | pathToMat = '/Users/rama/Research/data/pyCider/' 14 | matfile = 'pascal_cands.mat' 15 | jsonfile = 'pascal_cands' 16 | 17 | data = io.loadmat(os.path.join(pathToMat, matfile)) 18 | refs = list(data['cands'][0]) 19 | 20 | A = [] 21 | B = [] 22 | 23 | for image in refs: 24 | for sentences in image[1]: 25 | for i, sent in enumerate(sentences): 26 | sent_struct = {} 27 | imname = str(image[0][0]).split('/')[-1] 28 | sent_struct['image_id'] = imname 29 | string_sent = sent[0].strip().split('\\') 30 | if len(string_sent) == 1: 31 | sent_struct['caption'] = string_sent[0] 32 | else: 33 | sent_struct['caption'] = ' '.join(string_sent[:-1]) 34 | if i == 1: 35 | A.append(sent_struct) 36 | else: 37 | B.append(sent_struct) 38 | 39 | 
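# A holds the sentence at index 1 for each image, B holds the remaining sentences; each set is written to its own json file below.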
with open(os.path.join(pathToMat, jsonfile + 'A.json'), 'w') as outfile: 40 | json.dump(A, outfile) 41 | 42 | with open(os.path.join(pathToMat, jsonfile + 'B.json'), 'w') as outfile: 43 | json.dump(B, outfile) 44 | -------------------------------------------------------------------------------- /tools/pycider/PyDataFormat/loadData.py: -------------------------------------------------------------------------------- 1 | """ 2 | Load the reference and candidate json files, which are to be evaluated using CIDEr. 3 | 4 | Reference file: list of dict('image_id': image_id, 'caption': caption). 5 | Candidate file: list of dict('image_id': image_id, 'caption': caption). 6 | 7 | """ 8 | import json 9 | import os 10 | from collections import defaultdict 11 | 12 | class LoadData(): 13 | def __init__(self, path): 14 | self.pathToData = path 15 | 16 | def readJson(self, refname, candname): 17 | 18 | path_to_ref_file = os.path.join(self.pathToData, refname) 19 | path_to_cand_file = os.path.join(self.pathToData, candname) 20 | 21 | ref_list = json.loads(open(path_to_ref_file, 'r').read()) 22 | cand_list = json.loads(open(path_to_cand_file, 'r').read()) 23 | 24 | gts = defaultdict(list) 25 | res = [] 26 | 27 | for l in ref_list: 28 | gts[l['image_id']].append({"caption": l['caption']}) 29 | 30 | res = cand_list; 31 | return gts, res 32 | -------------------------------------------------------------------------------- /tools/pycider/PyDataFormat/loadData.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/tools/pycider/PyDataFormat/loadData.pyc -------------------------------------------------------------------------------- /tools/pycider/README.md: -------------------------------------------------------------------------------- 1 | Consensus-based Image Description Evaluation (CIDEr Code) 2 | =================== 3 | 4 | Evaluation code for CIDEr metric. Provides CIDEr as well as 5 | CIDEr-D (CIDEr Defended) which is more robust to gaming effects. 6 | 7 | ## Important Note ## 8 | CIDEr by default (with idf parameter set to "corpus" mode) computes IDF values using the reference sentences provided. Thus, CIDEr score for a reference dataset with only 1 image will be zero. When evaluating using one (or few) images, set idf to "coco-val-df" instead, which uses IDF from the MSCOCO Vaildation Dataset for reliable results. 9 | 10 | ## Requirements ## 11 | - java 1.8.0 12 | - python 2.7 13 | 14 | For running the ipython notebook file, update your Ipython to [Jupyter](https://jupyter.org/) 15 | 16 | ## Files ## 17 | ./ 18 | - cidereval.py (demo script) 19 | 20 | ./PyDataFormat 21 | - loadData.py (load the json files for references and candidates) 22 | 23 | - {$result\_file}.json (file with the CIDEr and CIDEr-D scores) 24 | 25 | ./pycocoevalcap: The folder where all evaluation codes are stored. 26 | - evals.py: Performs tokenization and runs both the metrics 27 | - tokenizer: Python wrapper of Stanford CoreNLP PTBTokenizer 28 | - cider: CIDEr evaluation codes 29 | - ciderD: CIDEr-D evaluation codes 30 | 31 | ## Instructions ## 32 | 1. Edit the params.json file to contain path to reference and candidate json files, and the result file where the scores are stored\*. 33 | 2. Set the "idf" value in params.json to "corpus" if not evaluating on a single image/instance. Set the "idf" value to "coco-val-df" if evaluating on a single image. 
In this case IDF values from the MSCOCO dataset are used. If using some other corpus, get the document frequencies into a similar format as "coco-val-df", and put them in the data/ folder as a pickle file. Then set the "idf" value to the name of the document frequency file (without the '.p' extension). 34 | 3. Sample json reference and candidate files are pascal50S.json and pascal_candsB.json. 35 | 4. CIDEr scores are stored in the "scores" variable: scores['CIDEr'] -> CIDEr scores, scores['CIDErD'] -> CIDEr-D scores 36 | 37 | *Even when evaluating with independent candidates/references (e.g., when using "coco-val-df"), put multiple candidate and reference entries into the same json files. This is much faster than having separate candidate and reference files and calling the evaluation code separately on each candidate/reference file. 38 | ## References ## 39 | 40 | - PTBTokenizer: We use the [Stanford Tokenizer](http://nlp.stanford.edu/software/tokenizer.shtml) which is included in [Stanford CoreNLP 3.4.1](http://nlp.stanford.edu/software/corenlp.shtml). 41 | - CIDEr: [CIDEr: Consensus-based Image Description Evaluation](http://arxiv.org/pdf/1411.5726.pdf) 42 | 43 | ## Developers ## 44 | - Ramakrishna Vedantam (Virginia Tech) 45 | 46 | ## Acknowledgments ## 47 | - MS COCO Caption Evaluation Team 48 | -------------------------------------------------------------------------------- /tools/pycider/cidereval.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "# demo script for running CIDEr\n", 12 | "from PyDataFormat.loadData import LoadData\n", 13 | "import pdb\n", 14 | "import json\n", 15 | "from pyciderevalcap.eval import CIDErEvalCap as ciderEval\n", 16 | "from collections import defaultdict\n", 17 | "\n", 18 | "pathToData = './data/'\n", 19 | "\n", 20 | "refName = 'pascal50S.json'\n", 21 | "candName = 'pascal_candsB.json'\n", 22 | "\n", 23 | "result_file = 'results.json'" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 13, 29 | "metadata": { 30 | "collapsed": false 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "# load reference and candidate sentences\n", 35 | "loadDat = LoadData(pathToData)\n", 36 | "gts, res = loadDat.readJson(refName, candName)\n", 37 | "\n", 38 | "#res = res[:100]\n", 39 | "#gts = {img['image_id']: gts[img['image_id']] for img in res}" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 15, 45 | "metadata": { 46 | "collapsed": false 47 | }, 48 | "outputs": [], 49 | "source": [ 50 | "from pyciderevalcap.ciderD.ciderD import CiderD\n", 51 | "from pyciderevalcap.cider.cider import Cider\n", 52 | "from pyciderevalcap.tokenizer.ptbtokenizer import PTBTokenizer\n", 53 | "tokenizer = PTBTokenizer('gts')\n", 54 | "_gts = tokenizer.tokenize(gts)\n", 55 | "tokenizer = PTBTokenizer('res')\n", 56 | "_res = tokenizer.tokenize(res)\n" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": { 63 | "collapsed": true 64 | }, 65 | "outputs": [], 66 | "source": [ 67 | "scorer = Cider(df='coco-val')\n", 68 | "scorerD = CiderD(df='coco-val')" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 16, 74 | "metadata": { 75 | "collapsed": false 76 | }, 77 | "outputs": [ 78 | { 79 | "name": "stdout", 80 | "output_type": "stream", 81 | "text": [ 82 | "0.535560513246\n", 83 | "0.448542862876\n" 84 | ] 85
| } 86 | ], 87 | "source": [ 88 | "score, scores = scorer.compute_score(_gts, _res)\n", 89 | "scoreD, scoresD = scorerD.compute_score(_gts, _res)\n", 90 | "print score\n", 91 | "print scoreD" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": { 98 | "collapsed": true 99 | }, 100 | "outputs": [], 101 | "source": [] 102 | } 103 | ], 104 | "metadata": { 105 | "kernelspec": { 106 | "display_name": "Python 2", 107 | "language": "python", 108 | "name": "python2" 109 | }, 110 | "language_info": { 111 | "codemirror_mode": { 112 | "name": "ipython", 113 | "version": 2 114 | }, 115 | "file_extension": ".py", 116 | "mimetype": "text/x-python", 117 | "name": "python", 118 | "nbconvert_exporter": "python", 119 | "pygments_lexer": "ipython2", 120 | "version": "2.7.11" 121 | } 122 | }, 123 | "nbformat": 4, 124 | "nbformat_minor": 0 125 | } 126 | -------------------------------------------------------------------------------- /tools/pycider/cidereval.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # In[1]: 4 | 5 | # demo script for running CIDEr 6 | import json 7 | from PyDataFormat.loadData import LoadData 8 | from pyciderevalcap.eval import CIDErEvalCap as ciderEval 9 | 10 | # load the configuration file 11 | config = json.loads(open('params.json', 'r').read()) 12 | 13 | # Print the parameters 14 | print("""Running CIDEr with the following settings 15 | ***************************** 16 | Reference File:{refName} 17 | Candidate File:{candName} 18 | Result File:{resultFile} 19 | IDF:{idf} 20 | *****************************""".format(**config)) 21 | 22 | pathToData = config['pathToData'] 23 | refName = config['refName'] 24 | candName = config['candName'] 25 | resultFile = config['resultFile'] 26 | df_mode = config['idf'] 27 | 28 | # In[2]: 29 | 30 | # load reference and candidate sentences 31 | loadDat = LoadData(pathToData) 32 | gts, res = loadDat.readJson(refName, candName) 33 | 34 | 35 | # In[3]: 36 | 37 | # calculate cider scores 38 | scorer = ciderEval(gts, res, df_mode) 39 | # scores: dict of list with key = metric and value = score given to each 40 | # candidate 41 | scores = scorer.evaluate() 42 | 43 | 44 | # In[7]: 45 | 46 | # scores['CIDEr'] contains CIDEr scores in a list for each candidate 47 | # scores['CIDErD'] contains CIDEr-D scores in a list for each candidate 48 | 49 | with open(resultFile, 'w') as outfile: 50 | json.dump(scores, outfile) 51 | -------------------------------------------------------------------------------- /tools/pycider/license.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015, Xinlei Chen, Hao Fang, Tsung-Yi Lin, and Ramakrishna Vedantam 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 2. Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution.
12 | 13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 14 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 17 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 20 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | 24 | The views and conclusions contained in the software and documentation are those 25 | of the authors and should not be interpreted as representing official policies, 26 | either expressed or implied, of the FreeBSD Project. 27 | -------------------------------------------------------------------------------- /tools/pycider/params.json: -------------------------------------------------------------------------------- 1 | { 2 | "pathToData" : "data/", 3 | "refName" : "pascal50S.json", 4 | "candName" : "pascal_candsB.json", 5 | "resultFile" : "results.json", 6 | "idf" : "coco-val-df" 7 | } 8 | -------------------------------------------------------------------------------- /tools/pycider/pyciderevalcap/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /tools/pycider/pyciderevalcap/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/tools/pycider/pyciderevalcap/__init__.pyc -------------------------------------------------------------------------------- /tools/pycider/pyciderevalcap/cider/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /tools/pycider/pyciderevalcap/cider/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/tools/pycider/pyciderevalcap/cider/__init__.pyc -------------------------------------------------------------------------------- /tools/pycider/pyciderevalcap/cider/cider.py: -------------------------------------------------------------------------------- 1 | # Filename: cider.py 2 | # 3 | # 4 | # Description: Describes the class to compute the CIDEr 5 | # (Consensus-Based Image Description Evaluation) Metric 6 | # by Vedantam, Zitnick, and Parikh (http://arxiv.org/abs/1411.5726) 7 | # 8 | # Creation Date: Sun Feb 8 14:16:54 2015 9 | # 10 | # Authors: Ramakrishna Vedantam and 11 | # Tsung-Yi Lin 12 | 13 | from cider_scorer import CiderScorer 14 | 15 | 16 | class Cider: 17 | """ 18 | Main Class to compute the CIDEr metric 19 | 20 | """ 21 | def __init__(self, n=4, df="corpus"): 22 | """ 23 | Initialize the CIDEr scoring function 24 | : param n (int): n-gram size 25 | : param df (string): specifies where to get the IDF values from 26 | takes values 'corpus', 'coco-train' 27 | : return: None 28 
| """ 29 | # set cider to sum over 1 to 4-grams 30 | self._n = n 31 | self._df = df 32 | self.cider_scorer = CiderScorer(n=self._n, df_mode=self._df) 33 | 34 | def compute_score(self, gts, res): 35 | """ 36 | Main function to compute CIDEr score 37 | : param gts (dict) : {image:tokenized reference sentence} 38 | : param res (dict) : {image:tokenized candidate sentence} 39 | : return: cider (float) : computed CIDEr score for the corpus 40 | """ 41 | 42 | # clear all the previous hypos and refs 43 | self.cider_scorer.clear() 44 | 45 | for res_id in res: 46 | 47 | hypo = res_id['caption'] 48 | ref = gts[res_id['image_id']] 49 | 50 | # Sanity check. 51 | assert(type(hypo) is list) 52 | assert(len(hypo) == 1) 53 | assert(type(ref) is list) 54 | assert(len(ref) > 0) 55 | self.cider_scorer += (hypo[0], ref) 56 | 57 | (score, scores) = self.cider_scorer.compute_score() 58 | 59 | return score, scores 60 | 61 | def method(self): 62 | return "CIDEr" 63 | -------------------------------------------------------------------------------- /tools/pycider/pyciderevalcap/cider/cider.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/tools/pycider/pyciderevalcap/cider/cider.pyc -------------------------------------------------------------------------------- /tools/pycider/pyciderevalcap/cider/cider_scorer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Tsung-Yi Lin 3 | # Ramakrishna Vedantam 4 | 5 | import copy 6 | import pickle 7 | from collections import defaultdict 8 | import numpy as np 9 | import math 10 | import os 11 | 12 | from six.moves import xrange 13 | 14 | def precook(s, n=4, out=False): 15 | """ 16 | Takes a string as input and returns an object that can be given to 17 | either cook_refs or cook_test. This is optional: cook_refs and cook_test 18 | can take string arguments as well. 19 | :param s: string : sentence to be converted into ngrams 20 | :param n: int : number of ngrams for which representation is calculated 21 | :return: term frequency vector for occuring ngrams 22 | """ 23 | words = s.split() 24 | counts = defaultdict(int) 25 | for k in xrange(1,n+1): 26 | for i in xrange(len(words)-k+1): 27 | ngram = tuple(words[i:i+k]) 28 | counts[ngram] += 1 29 | return counts 30 | 31 | def cook_refs(refs, n=4): ## lhuang: oracle will call with "average" 32 | '''Takes a list of reference sentences for a single segment 33 | and returns an object that encapsulates everything that BLEU 34 | needs to know about them. 35 | :param refs: list of string : reference sentences for some image 36 | :param n: int : number of ngrams for which (ngram) representation is calculated 37 | :return: result (list of dict) 38 | ''' 39 | return [precook(ref, n) for ref in refs] 40 | 41 | def cook_test(test, n=4): 42 | '''Takes a test sentence and returns an object that 43 | encapsulates everything that BLEU needs to know about it. 44 | :param test: list of string : hypothesis sentence for some image 45 | :param n: int : number of ngrams for which (ngram) representation is calculated 46 | :return: result (dict) 47 | ''' 48 | return precook(test, n, True) 49 | 50 | class CiderScorer(object): 51 | """CIDEr scorer. 
52 | """ 53 | 54 | def copy(self): 55 | ''' copy the refs.''' 56 | new = CiderScorer(n=self.n) 57 | new.ctest = copy.copy(self.ctest) 58 | new.crefs = copy.copy(self.crefs) 59 | return new 60 | 61 | def __init__(self, df_mode="corpus", test=None, refs=None, n=4, sigma=6.0): 62 | ''' singular instance ''' 63 | self.n = n 64 | self.sigma = sigma 65 | self.crefs = [] 66 | self.ctest = [] 67 | self.df_mode = df_mode 68 | if self.df_mode != "corpus": 69 | self.document_frequency = pickle.load(open(os.path.join('data', df_mode + '.p'),'r')) 70 | self.cook_append(test, refs) 71 | self.ref_len = None 72 | 73 | def clear(self): 74 | self.crefs = [] 75 | self.ctest = [] 76 | 77 | def cook_append(self, test, refs): 78 | '''called by constructor and __iadd__ to avoid creating new instances.''' 79 | 80 | if refs is not None: 81 | self.crefs.append(cook_refs(refs)) 82 | if test is not None: 83 | self.ctest.append(cook_test(test)) ## N.B.: -1 84 | else: 85 | self.ctest.append(None) # lens of crefs and ctest have to match 86 | 87 | def size(self): 88 | assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest)) 89 | return len(self.crefs) 90 | 91 | def __iadd__(self, other): 92 | '''add an instance (e.g., from another sentence).''' 93 | 94 | if type(other) is tuple: 95 | ## avoid creating new CiderScorer instances 96 | self.cook_append(other[0], other[1]) 97 | else: 98 | self.ctest.extend(other.ctest) 99 | self.crefs.extend(other.crefs) 100 | 101 | return self 102 | def compute_doc_freq(self): 103 | ''' 104 | Compute term frequency for reference data. 105 | This will be used to compute idf (inverse document frequency later) 106 | The term frequency is stored in the object 107 | :return: None 108 | ''' 109 | for refs in self.crefs: 110 | # refs, k ref captions of one image 111 | for ngram in set([ngram for ref in refs for (ngram,count) in ref.iteritems()]): 112 | self.document_frequency[ngram] += 1 113 | # maxcounts[ngram] = max(maxcounts.get(ngram,0), count) 114 | 115 | def compute_cider(self): 116 | def counts2vec(cnts): 117 | """ 118 | Function maps counts of ngram to vector of tfidf weights. 119 | The function returns vec, an array of dictionary that store mapping of n-gram and tf-idf weights. 120 | The n-th entry of array denotes length of n-grams. 121 | :param cnts: 122 | :return: vec (array of dict), norm (array of float), length (int) 123 | """ 124 | vec = [defaultdict(float) for _ in range(self.n)] 125 | length = 0 126 | norm = [0.0 for _ in range(self.n)] 127 | for (ngram,term_freq) in cnts.iteritems(): 128 | # give word count 1 if it doesn't appear in reference corpus 129 | df = np.log(max(1.0, self.document_frequency[ngram])) 130 | # ngram index 131 | n = len(ngram)-1 132 | # tf (term_freq) * idf (precomputed idf) for n-grams 133 | vec[n][ngram] = float(term_freq)*(self.ref_len - df) 134 | # compute norm for the vector. the norm will be used for 135 | # computing similarity 136 | norm[n] += pow(vec[n][ngram], 2) 137 | 138 | if n == 1: 139 | length += term_freq 140 | norm = [np.sqrt(n) for n in norm] 141 | return vec, norm, length 142 | 143 | def sim(vec_hyp, vec_ref, norm_hyp, norm_ref, length_hyp, length_ref): 144 | ''' 145 | Compute the cosine similarity of two vectors. 
146 | :param vec_hyp: array of dictionary for vector corresponding to hypothesis 147 | :param vec_ref: array of dictionary for vector corresponding to reference 148 | :param norm_hyp: array of float for vector corresponding to hypothesis 149 | :param norm_ref: array of float for vector corresponding to reference 150 | :param length_hyp: int containing length of hypothesis 151 | :param length_ref: int containing length of reference 152 | :return: array of score for each n-grams cosine similarity 153 | ''' 154 | delta = float(length_hyp - length_ref) 155 | # measure consine similarity 156 | val = np.array([0.0 for _ in range(self.n)]) 157 | for n in range(self.n): 158 | # ngram 159 | for (ngram,count) in vec_hyp[n].iteritems(): 160 | val[n] += vec_hyp[n][ngram] * vec_ref[n][ngram] 161 | 162 | if (norm_hyp[n] != 0) and (norm_ref[n] != 0): 163 | val[n] /= (norm_hyp[n]*norm_ref[n]) 164 | 165 | assert(not math.isnan(val[n])) 166 | return val 167 | 168 | # compute log reference length 169 | if self.df_mode == "corpus": 170 | self.ref_len = np.log(float(len(self.crefs))) 171 | elif self.df_mode == "coco-val": 172 | # if coco option selected, use length of coco-val set 173 | self.ref_len = np.log(float(40504)) 174 | 175 | scores = [] 176 | for test, refs in zip(self.ctest, self.crefs): 177 | # compute vector for test captions 178 | vec, norm, length = counts2vec(test) 179 | # compute vector for ref captions 180 | score = np.array([0.0 for _ in range(self.n)]) 181 | for ref in refs: 182 | vec_ref, norm_ref, length_ref = counts2vec(ref) 183 | score += sim(vec, vec_ref, norm, norm_ref, length, length_ref) 184 | # change by vrama91 - mean of ngram scores, instead of sum 185 | score_avg = np.mean(score) 186 | # divide by number of references 187 | score_avg /= len(refs) 188 | # multiply score by 10 189 | score_avg *= 10.0 190 | # append score of an image to the score list 191 | scores.append(score_avg) 192 | return scores 193 | 194 | def compute_score(self, option=None, verbose=0): 195 | # compute idf 196 | if self.df_mode == "corpus": 197 | self.document_frequency = defaultdict(float) 198 | self.compute_doc_freq() 199 | # assert to check document frequency 200 | assert(len(self.ctest) >= max(self.document_frequency.values())) 201 | # import json for now and write the corresponding files 202 | # compute cider score 203 | score = self.compute_cider() 204 | # debug 205 | # print score 206 | return np.mean(np.array(score)), np.array(score) 207 | -------------------------------------------------------------------------------- /tools/pycider/pyciderevalcap/cider/cider_scorer.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/tools/pycider/pyciderevalcap/cider/cider_scorer.pyc -------------------------------------------------------------------------------- /tools/pycider/pyciderevalcap/ciderD/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /tools/pycider/pyciderevalcap/ciderD/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/tools/pycider/pyciderevalcap/ciderD/__init__.pyc -------------------------------------------------------------------------------- 
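The CiderScorer class above is normally driven through Cider.compute_score, but its incremental += interface can also be exercised directly. Below is a minimal sketch (not part of the repository) in the default "corpus" IDF mode; it assumes Python 2 and that it is run from the cider/ directory so the implicit relative import resolves, and the sentences are made-up, pre-tokenized examples rather than repository data:

# sketch only: assumes Python 2 and the working directory tools/pycider/pyciderevalcap/cider/
from cider_scorer import CiderScorer

scorer = CiderScorer(n=4)  # df_mode defaults to "corpus", so IDF is computed from the refs below
# each += takes a (hypothesis string, list of reference strings) tuple
scorer += ('a man rides a brown horse', ['a man is riding a horse', 'a person rides a horse'])
scorer += ('a dog runs on the grass', ['a dog running through green grass', 'a dog plays outside'])
mean_score, per_image_scores = scorer.compute_score()
print(mean_score)        # corpus-level CIDEr
print(per_image_scores)  # numpy array with one score per hypothesis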
/tools/pycider/pyciderevalcap/ciderD/ciderD.py: -------------------------------------------------------------------------------- 1 | # Filename: ciderD.py 2 | # 3 | # Description: Describes the class to compute the CIDEr-D (Consensus-Based Image Description Evaluation) Metric 4 | # by Vedantam, Zitnick, and Parikh (http://arxiv.org/abs/1411.5726) 5 | # 6 | # Creation Date: Sun Feb 8 14:16:54 2015 7 | # 8 | # Authors: Ramakrishna Vedantam and Tsung-Yi Lin 9 | 10 | from .ciderD_scorer import CiderScorer 11 | import torch.nn as nn 12 | import pdb 13 | 14 | class CiderD(nn.Module): 15 | """ 16 | Main Class to compute the CIDEr metric 17 | 18 | """ 19 | def __init__(self, n=4, sigma=6.0, df="corpus"): 20 | super(CiderD, self).__init__() 21 | 22 | # set cider to sum over 1 to 4-grams 23 | self._n = n 24 | # set the standard deviation parameter for gaussian penalty 25 | self._sigma = sigma 26 | # set which where to compute document frequencies from 27 | self._df = df 28 | self.cider_scorer = CiderScorer(n=self._n, df_mode=self._df) 29 | 30 | def compute_score(self, gts, res): 31 | """ 32 | Main function to compute CIDEr score 33 | :param hypo_for_image (dict) : dictionary with key and value 34 | ref_for_image (dict) : dictionary with key and value 35 | :return: cider (float) : computed CIDEr score for the corpus 36 | """ 37 | 38 | # clear all the previous hypos and refs 39 | self.cider_scorer.clear() 40 | for res_id in res: 41 | 42 | hypo = res_id['caption'] 43 | ref = gts[res_id['image_id']] 44 | 45 | # Sanity check. 46 | assert(type(hypo) is list) 47 | assert(len(hypo) == 1) 48 | assert(type(ref) is list) 49 | assert(len(ref) > 0) 50 | self.cider_scorer += (hypo[0], ref) 51 | 52 | (score, scores) = self.cider_scorer.compute_score() 53 | 54 | return score, scores 55 | 56 | def method(self): 57 | return "CIDEr-D" 58 | -------------------------------------------------------------------------------- /tools/pycider/pyciderevalcap/ciderD/ciderD.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/tools/pycider/pyciderevalcap/ciderD/ciderD.pyc -------------------------------------------------------------------------------- /tools/pycider/pyciderevalcap/ciderD/ciderD_scorer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Tsung-Yi Lin 3 | # Ramakrishna Vedantam 4 | 5 | import copy 6 | import math 7 | import os 8 | import pickle 9 | from collections import defaultdict 10 | 11 | import numpy as np 12 | from six.moves import xrange 13 | 14 | import torch.nn as nn 15 | 16 | 17 | def precook(s, n=4, out=False): 18 | """ 19 | Takes a string as input and returns an object that can be given to 20 | either cook_refs or cook_test. This is optional: cook_refs and cook_test 21 | can take string arguments as well. 
22 | :param s: string : sentence to be converted into ngrams 23 | :param n: int : number of ngrams for which representation is calculated 24 | :return: term frequency vector for occuring ngrams 25 | """ 26 | words = s.split() 27 | counts = defaultdict(int) 28 | for k in xrange(1,n+1): 29 | for i in xrange(len(words)-k+1): 30 | ngram = tuple(words[i:i+k]) 31 | counts[ngram] += 1 32 | return counts 33 | 34 | def cook_refs(refs, n=4): ## lhuang: oracle will call with "average" 35 | '''Takes a list of reference sentences for a single segment 36 | and returns an object that encapsulates everything that BLEU 37 | needs to know about them. 38 | :param refs: list of string : reference sentences for some image 39 | :param n: int : number of ngrams for which (ngram) representation is calculated 40 | :return: result (list of dict) 41 | ''' 42 | return [precook(ref, n) for ref in refs] 43 | 44 | def cook_test(test, n=4): 45 | '''Takes a test sentence and returns an object that 46 | encapsulates everything that BLEU needs to know about it. 47 | :param test: list of string : hypothesis sentence for some image 48 | :param n: int : number of ngrams for which (ngram) representation is calculated 49 | :return: result (dict) 50 | ''' 51 | return precook(test, n, True) 52 | 53 | class CiderScorer(nn.Module): 54 | """CIDEr scorer. 55 | """ 56 | def __init__(self, df_mode="corpus", test=None, refs=None, n=4, sigma=6.0): 57 | ''' singular instance ''' 58 | super(CiderScorer, self).__init__() 59 | self.n = n 60 | self.sigma = sigma 61 | self.crefs = [] 62 | self.ctest = [] 63 | self.df_mode = df_mode 64 | self.ref_len = None 65 | if self.df_mode != "corpus": 66 | pkl_file = pickle.load(open(os.path.join('data', df_mode + '.p'),'r')) 67 | self.ref_len = pkl_file['ref_len'] 68 | self.document_frequency = pkl_file['document_frequency'] 69 | self.cook_append(test, refs) 70 | 71 | def clear(self): 72 | self.crefs = [] 73 | self.ctest = [] 74 | 75 | def copy(self): 76 | ''' copy the refs.''' 77 | new = CiderScorer(n=self.n) 78 | new.ctest = copy.copy(self.ctest) 79 | new.crefs = copy.copy(self.crefs) 80 | return new 81 | 82 | def cook_append(self, test, refs): 83 | '''called by constructor and __iadd__ to avoid creating new instances.''' 84 | 85 | if refs is not None: 86 | self.crefs.append(cook_refs(refs)) 87 | if test is not None: 88 | self.ctest.append(cook_test(test)) ## N.B.: -1 89 | else: 90 | self.ctest.append(None) # lens of crefs and ctest have to match 91 | 92 | def size(self): 93 | assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest)) 94 | return len(self.crefs) 95 | 96 | def __iadd__(self, other): 97 | '''add an instance (e.g., from another sentence).''' 98 | 99 | if type(other) is tuple: 100 | ## avoid creating new CiderScorer instances 101 | self.cook_append(other[0], other[1]) 102 | else: 103 | self.ctest.extend(other.ctest) 104 | self.crefs.extend(other.crefs) 105 | 106 | return self 107 | def compute_doc_freq(self): 108 | ''' 109 | Compute term frequency for reference data. 
110 | This will be used to compute idf (inverse document frequency later) 111 | The term frequency is stored in the object 112 | :return: None 113 | ''' 114 | for refs in self.crefs: 115 | # refs, k ref captions of one image 116 | for ngram in set([ngram for ref in refs for (ngram,count) in ref.iteritems()]): 117 | self.document_frequency[ngram] += 1 118 | # maxcounts[ngram] = max(maxcounts.get(ngram,0), count) 119 | 120 | def compute_cider(self): 121 | def counts2vec(cnts): 122 | """ 123 | Function maps counts of ngram to vector of tfidf weights. 124 | The function returns vec, an array of dictionary that store mapping of n-gram and tf-idf weights. 125 | The n-th entry of array denotes length of n-grams. 126 | :param cnts: 127 | :return: vec (array of dict), norm (array of float), length (int) 128 | """ 129 | vec = [defaultdict(float) for _ in range(self.n)] 130 | length = 0 131 | norm = [0.0 for _ in range(self.n)] 132 | for (ngram,term_freq) in cnts.iteritems(): 133 | # give word count 1 if it doesn't appear in reference corpus 134 | df = np.log(max(1.0, self.document_frequency[ngram])) 135 | # ngram index 136 | n = len(ngram)-1 137 | # tf (term_freq) * idf (precomputed idf) for n-grams 138 | vec[n][ngram] = float(term_freq)*(self.ref_len - df) 139 | # compute norm for the vector. the norm will be used for computing similarity 140 | norm[n] += pow(vec[n][ngram], 2) 141 | 142 | if n == 1: 143 | length += term_freq 144 | norm = [np.sqrt(n) for n in norm] 145 | return vec, norm, length 146 | 147 | def sim(vec_hyp, vec_ref, norm_hyp, norm_ref, length_hyp, length_ref): 148 | ''' 149 | Compute the cosine similarity of two vectors. 150 | :param vec_hyp: array of dictionary for vector corresponding to hypothesis 151 | :param vec_ref: array of dictionary for vector corresponding to reference 152 | :param norm_hyp: array of float for vector corresponding to hypothesis 153 | :param norm_ref: array of float for vector corresponding to reference 154 | :param length_hyp: int containing length of hypothesis 155 | :param length_ref: int containing length of reference 156 | :return: array of score for each n-grams cosine similarity 157 | ''' 158 | delta = float(length_hyp - length_ref) 159 | # measure consine similarity 160 | val = np.array([0.0 for _ in range(self.n)]) 161 | for n in range(self.n): 162 | # ngram 163 | for (ngram,count) in vec_hyp[n].iteritems(): 164 | # vrama91 : added clipping 165 | val[n] += min(vec_hyp[n][ngram], vec_ref[n][ngram]) * vec_ref[n][ngram] 166 | 167 | if (norm_hyp[n] != 0) and (norm_ref[n] != 0): 168 | val[n] /= (norm_hyp[n]*norm_ref[n]) 169 | 170 | assert(not math.isnan(val[n])) 171 | # vrama91: added a length based gaussian penalty 172 | val[n] *= np.e**(-(delta**2)/(2*self.sigma**2)) 173 | return val 174 | 175 | # compute log reference length 176 | if self.df_mode == "corpus": 177 | self.ref_len = np.log(float(len(self.crefs))) 178 | #elif self.df_mode == "coco-val": 179 | # if coco option selected, use length of coco-val set 180 | # self.ref_len = np.log(float(40504)) 181 | 182 | scores = [] 183 | for test, refs in zip(self.ctest, self.crefs): 184 | # compute vector for test captions 185 | vec, norm, length = counts2vec(test) 186 | # compute vector for ref captions 187 | score = np.array([0.0 for _ in range(self.n)]) 188 | for ref in refs: 189 | vec_ref, norm_ref, length_ref = counts2vec(ref) 190 | score += sim(vec, vec_ref, norm, norm_ref, length, length_ref) 191 | # change by vrama91 - mean of ngram scores, instead of sum 192 | score_avg = np.mean(score) 193 | # 
divide by number of references 194 | score_avg /= len(refs) 195 | # multiply score by 10 196 | score_avg *= 10.0 197 | # append score of an image to the score list 198 | scores.append(score_avg) 199 | return scores 200 | 201 | def compute_score(self, option=None, verbose=0): 202 | # compute idf 203 | if self.df_mode == "corpus": 204 | self.document_frequency = defaultdict(float) 205 | self.compute_doc_freq() 206 | # assert to check document frequency 207 | assert(len(self.ctest) >= max(self.document_frequency.values())) 208 | # import json for now and write the corresponding files 209 | # compute cider score 210 | score = self.compute_cider() 211 | # debug 212 | # print score 213 | return np.mean(np.array(score)), np.array(score) 214 | -------------------------------------------------------------------------------- /tools/pycider/pyciderevalcap/ciderD/ciderD_scorer.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/tools/pycider/pyciderevalcap/ciderD/ciderD_scorer.pyc -------------------------------------------------------------------------------- /tools/pycider/pyciderevalcap/eval.py: -------------------------------------------------------------------------------- 1 | __author__ = 'rama' 2 | from tokenizer.ptbtokenizer import PTBTokenizer 3 | from cider.cider import Cider 4 | from ciderD.ciderD import CiderD 5 | 6 | 7 | class CIDErEvalCap: 8 | def __init__(self, gts, res, df): 9 | print('tokenization...') 10 | tokenizer = PTBTokenizer('gts') 11 | _gts = tokenizer.tokenize(gts) 12 | print('tokenized refs') 13 | tokenizer = PTBTokenizer('res') 14 | _res = tokenizer.tokenize(res) 15 | print('tokenized cands') 16 | 17 | self.gts = _gts 18 | self.res = _res 19 | self.df = df 20 | 21 | def evaluate(self): 22 | # ================================================= 23 | # Set up scorers 24 | # ================================================= 25 | 26 | print('setting up scorers...') 27 | scorers = [ 28 | (Cider(df=self.df), "CIDEr"), (CiderD(df=self.df), "CIDErD") 29 | ] 30 | 31 | # ================================================= 32 | # Compute scores 33 | # ================================================= 34 | metric_scores = {} 35 | for scorer, method in scorers: 36 | print('computing %s score...' 
% (scorer.method())) 37 | score, scores = scorer.compute_score(self.gts, self.res) 38 | print("Mean %s score: %0.3f" % (method, score)) 39 | metric_scores[method] = list(scores) 40 | return metric_scores 41 | -------------------------------------------------------------------------------- /tools/pycider/pyciderevalcap/eval.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/tools/pycider/pyciderevalcap/eval.pyc -------------------------------------------------------------------------------- /tools/pycider/pyciderevalcap/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'hfang' 2 | -------------------------------------------------------------------------------- /tools/pycider/pyciderevalcap/tokenizer/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/tools/pycider/pyciderevalcap/tokenizer/__init__.pyc -------------------------------------------------------------------------------- /tools/pycider/pyciderevalcap/tokenizer/ptbtokenizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # File Name : ptbtokenizer.py 4 | # 5 | # Description : Do the PTB Tokenization and remove punctuations. 6 | # 7 | # Creation Date : 29-12-2014 8 | # Last Modified : Thu Mar 19 09:53:35 2015 9 | # Authors : Hao Fang and Tsung-Yi Lin 10 | 11 | import os 12 | import pdb # python debugger 13 | import sys 14 | import subprocess 15 | import re 16 | import tempfile 17 | import itertools 18 | 19 | # path to the stanford corenlp jar 20 | STANFORD_CORENLP_3_4_1_JAR = 'stanford-corenlp-3.4.1.jar' 21 | 22 | # punctuations to be removed from the sentences 23 | PUNCTUATIONS = ["''", "'", "``", "`", "-LRB-", "-RRB-", "-LCB-", "-RCB-", \ 24 | ".", "?", "!", ",", ":", "-", "--", "...", ";"] 25 | 26 | class PTBTokenizer: 27 | """Python wrapper of Stanford PTBTokenizer""" 28 | def __init__(self, _source='gts'): 29 | self.source = _source 30 | 31 | def tokenize(self, captions_for_image): 32 | cmd = ['java', '-cp', STANFORD_CORENLP_3_4_1_JAR, \ 33 | 'edu.stanford.nlp.process.PTBTokenizer', \ 34 | '-preserveLines', '-lowerCase'] 35 | 36 | # ====================================================== 37 | # prepare data for PTB Tokenizer 38 | # ====================================================== 39 | 40 | if self.source == 'gts': 41 | image_id = [k for k, v in captions_for_image.items() for _ in range(len(v))] 42 | sentences = '\n'.join([c['caption'].replace('\n', ' ') for k, v in captions_for_image.items() for c in v]) 43 | final_tokenized_captions_for_image = {} 44 | 45 | elif self.source == 'res': 46 | index = [i for i, v in enumerate(captions_for_image)] 47 | image_id = [v["image_id"] for v in captions_for_image] 48 | sentences = '\n'.join(v["caption"].replace('\n', ' ') for v in captions_for_image ) 49 | final_tokenized_captions_for_index = [] 50 | 51 | # ====================================================== 52 | # save sentences to temporary file 53 | # ====================================================== 54 | path_to_jar_dirname=os.path.dirname(os.path.abspath(__file__)) 55 | tmp_file = tempfile.NamedTemporaryFile(delete=False, dir=path_to_jar_dirname) 56 | tmp_file.write(sentences) 57 | tmp_file.close() 58 | 59 | # 
====================================================== 60 | # tokenize sentence 61 | # ====================================================== 62 | cmd.append(os.path.basename(tmp_file.name)) 63 | p_tokenizer = subprocess.Popen(cmd, cwd=path_to_jar_dirname, \ 64 | stdout=subprocess.PIPE) 65 | token_lines = p_tokenizer.communicate(input=sentences.rstrip())[0] 66 | lines = token_lines.split('\n') 67 | # remove temp file 68 | os.remove(tmp_file.name) 69 | 70 | # ====================================================== 71 | # create dictionary for tokenized captions 72 | # ====================================================== 73 | if self.source == 'gts': 74 | for k, line in zip(image_id, lines): 75 | if not k in final_tokenized_captions_for_image: 76 | final_tokenized_captions_for_image[k] = [] 77 | tokenized_caption = ' '.join([w for w in line.rstrip().split(' ') \ 78 | if w not in PUNCTUATIONS]) 79 | final_tokenized_captions_for_image[k].append(tokenized_caption) 80 | 81 | return final_tokenized_captions_for_image 82 | 83 | elif self.source == 'res': 84 | for k, img, line in zip(index, image_id, lines): 85 | tokenized_caption = ' '.join([w for w in line.rstrip().split(' ') \ 86 | if w not in PUNCTUATIONS]) 87 | final_tokenized_captions_for_index.append({'image_id': img, 'caption': [tokenized_caption]}) 88 | 89 | return final_tokenized_captions_for_index 90 | -------------------------------------------------------------------------------- /tools/pycider/pyciderevalcap/tokenizer/ptbtokenizer.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/tools/pycider/pyciderevalcap/tokenizer/ptbtokenizer.pyc -------------------------------------------------------------------------------- /tools/pycider/pyciderevalcap/tokenizer/stanford-corenlp-3.4.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/tools/pycider/pyciderevalcap/tokenizer/stanford-corenlp-3.4.1.jar -------------------------------------------------------------------------------- /tools/pycider/pyciderevalcap/tokenizer/tmpBF49XX: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/tools/pycider/pyciderevalcap/tokenizer/tmpBF49XX -------------------------------------------------------------------------------- /tools/pycider/pyciderevalcap/tokenizer/tmpql9uU7: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/tools/pycider/pyciderevalcap/tokenizer/tmpql9uU7 -------------------------------------------------------------------------------- /tools/pycider/pyciderevalcap/tokenizer/tmpuCp_T0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/tools/pycider/pyciderevalcap/tokenizer/tmpuCp_T0 -------------------------------------------------------------------------------- /tools/pycider/pyciderevalcap/tokenizer/tmpxAmV_C: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/tools/pycider/pyciderevalcap/tokenizer/tmpxAmV_C -------------------------------------------------------------------------------- /tools/pycider/pyciderevalcap/tokenizer/tmpzNW4I2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/tools/pycider/pyciderevalcap/tokenizer/tmpzNW4I2 -------------------------------------------------------------------------------- /tools/sentence_gen_tools/__init__.py: -------------------------------------------------------------------------------- 1 | #init 2 | -------------------------------------------------------------------------------- /tools/sentence_gen_tools/coco_eval.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | import sys 4 | 5 | # from python_utils import * 6 | sys.path.append('tools/coco-caption/') 7 | COCO_EVAL_PATH = '.tools/coco-caption/pycocotools' 8 | sys.path.insert(0, COCO_EVAL_PATH) 9 | from pycocoevalcap.bleu.bleu import Bleu 10 | from pycocoevalcap.cider.cider import Cider 11 | from pycocoevalcap.eval import COCOEvalCap 12 | from pycocoevalcap.meteor.meteor import Meteor 13 | from pycocoevalcap.rouge.rouge import Rouge 14 | from pycocoevalcap.spice.spice import Spice 15 | from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer 16 | from pycocotools.coco import COCO 17 | 18 | rm_word_dict = {'bus': ['bus', 'busses'], 19 | 'bottle': ['bottle', 'bottles'], 20 | 'couch': ['couch', 'couches', 'sofa', 'sofas'], 21 | 'microwave': ['microwave', 'microwaves'], 22 | 'pizza': ['pizza', 'pizzas'], 23 | 'racket': ['racket', 'rackets', 'racquet', 'racquets'], 24 | 'suitcase': ['luggage', 'luggages', 'suitcase', 'suitcases'], 25 | 'zebra': ['zebra', 'zebras']} 26 | 27 | 28 | def read_json(t_file): 29 | j_file = open(t_file).read() 30 | return json.loads(j_file) 31 | 32 | 33 | class DCCScorer(COCOEvalCap): 34 | 35 | def get_dcc_scores(self): 36 | 37 | imgIds = self.params['image_id'] 38 | # imgIds = self.coco.getImgIds() 39 | gts = {} 40 | res = {} 41 | for imgId in imgIds: 42 | gts[imgId] = self.coco.imgToAnns[imgId] 43 | res[imgId] = self.cocoRes.imgToAnns[imgId] 44 | 45 | tokenizer = PTBTokenizer() 46 | gts = tokenizer.tokenize(gts) 47 | res = tokenizer.tokenize(res) 48 | scorers = [ 49 | (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), 50 | (Meteor(), "METEOR"), 51 | (Rouge(), "ROUGE_L"), 52 | (Cider(df='noc_test_freq'), "CIDEr"), 53 | (Spice(), "SPICE") 54 | ] 55 | score_dict = {} 56 | for scorer, method in scorers: 57 | print('computing %s score...' 
% (scorer.method())) 58 | score, scores = scorer.compute_score(gts, res) 59 | if type(method) == list: 60 | for sc, scs, m in zip(score, scores, method): 61 | score_dict[m] = sc 62 | print("%s: %0.3f" % (m, sc)) 63 | else: 64 | score_dict[method] = score 65 | print("%s: %0.3f" % (method, score)) 66 | 67 | return score_dict 68 | 69 | 70 | def split_sent(sent): 71 | sent = sent.lower() 72 | sent = re.sub('[^A-Za-z0-9\s]+', '', sent) 73 | return sent.split() 74 | 75 | 76 | def F1(generated_json, novel_ids, train_ids, word): 77 | set_rm_words = set(rm_word_dict[word]) 78 | gen_dict = {} 79 | for c in generated_json: 80 | gen_dict[c['image_id']] = c['caption'] 81 | 82 | # true positives: captions for novel-object images that do mention the target words 83 | tp = sum([1 for c in novel_ids if len(set_rm_words.intersection(set(split_sent(gen_dict[c])))) > 0]) 84 | # false positives: captions for training images that mention the target words but should not 85 | fp = sum([1 for c in train_ids if len(set_rm_words.intersection(set(split_sent(gen_dict[c])))) > 0]) 86 | # false negatives: captions for novel-object images that fail to mention the target words 87 | fn = sum([1 for c in novel_ids if len(set_rm_words.intersection(set(split_sent(gen_dict[c])))) == 0]) 88 | 89 | # precision = tp/(tp+fp) 90 | if tp > 0: 91 | precision = float(tp)/(tp+fp) 92 | # recall = tp/(tp+fn) 93 | recall = float(tp)/(tp+fn) 94 | # f1 = 2* (precision*recall)/(precision+recall) 95 | return 2*(precision*recall)/(precision+recall) 96 | else: 97 | return 0. 98 | 99 | 100 | def score_dcc(gt_template_novel, gt_template_train, 101 | generation_result, words, dset, cache_path): 102 | 103 | score_dict_dcc = {} 104 | generated_sentences = generation_result 105 | f1_scores = 0 106 | 107 | for word in words: 108 | gt_file = gt_template_novel % (word, dset) 109 | gt_json_novel = read_json(gt_template_novel % (word, dset)) 110 | gt_json_train = read_json(gt_template_train % (word, dset)) 111 | gt_ids_novel = [c['image_id'] for c in gt_json_novel['annotations']] 112 | gt_ids_train = [c['image_id'] for c in gt_json_train['annotations']] 113 | gen = [] 114 | for c in generated_sentences: 115 | if c['image_id'] in gt_ids_novel: 116 | gen.append(c) 117 | 118 | json.dump(gen, open(cache_path, 'w')) 119 | # save_json(gen, 'tmp_gen.json') 120 | coco = COCO(gt_file) 121 | generation_coco = coco.loadRes(cache_path) 122 | dcc_evaluator = DCCScorer(coco, generation_coco, 'noc_test_freq') 123 | score_dict = dcc_evaluator.get_dcc_scores() 124 | # os.remove(cache_path) 125 | 126 | for key in score_dict.keys(): 127 | if key not in score_dict_dcc.keys(): 128 | score_dict_dcc[key] = 0 129 | score_dict_dcc[key] += score_dict[key] 130 | 131 | f1_score = F1(generated_sentences, gt_ids_novel, gt_ids_train, word) 132 | print("F1 score for %s: %f" % (word, f1_score)) 133 | f1_scores += f1_score 134 | 135 | print("########################################################################") 136 | for key in sorted(score_dict_dcc.keys()): 137 | score_dict_dcc[key] = score_dict_dcc[key]/len(words) 138 | print("Average %s: %0.3f" % (key, score_dict_dcc[key])) 139 | print("Average F1 score: %f" % (f1_scores/len(words))) 140 | out = {} 141 | for key in sorted(score_dict_dcc.keys()): 142 | out[key] = score_dict_dcc[key] 143 | out['F1'] = f1_scores / len(words) 144 | 145 | return out 146 | 147 | 148 | def score_generation(gt_filename=None, generation_result=None): 149 | 150 | coco = COCO(gt_filename) 151 | generation_coco = coco.loadRes(generation_result) 152 | coco_evaluator = COCOEvalCap(coco, generation_coco, 
'noc_test_freq') 153 | coco_evaluator.evaluate() 154 | 155 | 156 | def save_json_coco_format(caps, save_name): 157 | 158 | def get_coco_id(im_name): 159 | coco_id = int(im_name.split('/')[-1].split('_')[-1].split('.jpg')[0]) 160 | return coco_id 161 | 162 | coco_format_caps = [{'caption': value, 'image_id': get_coco_id(key)} 163 | for value, key in zip(caps.values(), caps.keys())] 164 | 165 | json.dump(coco_format_caps, open(save_name, 'w')) 166 | # save_json(coco_format_caps, save_name) 167 | 168 | 169 | def save_json_other_format(caps, save_name): 170 | 171 | format_caps = [{'caption': value, 'image_id': key} 172 | for value, key in zip(caps.values(), caps.keys())] 173 | 174 | # save_json(format_caps, save_name) 175 | json.dump(format_caps, open(save_name, 'w')) 176 | --------------------------------------------------------------------------------
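The scoring helpers in coco_eval.py above are imported by the evaluation driver, but score_generation can also be called on its own to run a standard COCO-caption evaluation of a generated-caption json (a list of {'image_id': ..., 'caption': ...} entries). A minimal sketch follows; it assumes it is run from the repository root so the relative tools/coco-caption path resolves, and the two file paths are placeholders rather than files shipped with the repository:

# sketch only: the gt/result paths below are hypothetical examples
import sys
sys.path.append('tools/sentence_gen_tools')
from coco_eval import score_generation

score_generation(gt_filename='data/coco/annotations/captions_val2014.json',
                 generation_result='save/generated_captions.json')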