├── .gitignore
├── .gitmodules
├── Dockerfile
├── LICENSE
├── README.md
├── cfgs
│   ├── noc_coco_res101.yml
│   ├── noc_coco_vgg16.yml
│   ├── normal_coco_res101.yml
│   └── robust_coco.yml
├── data
│   ├── README.md
│   ├── coco
│   │   └── coco_class_name.txt
│   ├── flickr30k
│   │   └── flickr30k_class_name.txt
│   ├── noc_coco
│   │   └── split_noc_coco.json
│   └── robust_coco
│       └── split_robust_coco.json
├── demo.py
├── demo
│   ├── img1.png
│   └── img2.png
├── generate_robust_split.py
├── main.py
├── misc
│   ├── AttModel.py
│   ├── CaptionModel.py
│   ├── __init__.py
│   ├── __pycache__
│   │   ├── __init__.cpython-36.pyc
│   │   └── eval_utils.cpython-36.pyc
│   ├── bak.py
│   ├── bbox_transform.py
│   ├── dataloader_coco.py
│   ├── dataloader_flickr30k.py
│   ├── dataloader_hdf.py
│   ├── eval_utils.py
│   ├── model.py
│   ├── resnet.py
│   ├── rewards.py
│   ├── utils.py
│   └── vgg16.py
├── opts.py
├── pooling
│   ├── __init__.py
│   ├── make.sh
│   └── roi_align
│       ├── __init__.py
│       ├── _ext
│       │   ├── __init__.py
│       │   └── roi_align
│       │       ├── __init__.py
│       │       └── _roi_align.so
│       ├── build.py
│       ├── functions
│       │   ├── __init__.py
│       │   └── roi_align.py
│       ├── make.sh
│       ├── modules
│       │   ├── __init__.py
│       │   └── roi_align.py
│       └── src
│           ├── roi_align.c
│           ├── roi_align.h
│           ├── roi_align_cuda.c
│           ├── roi_align_cuda.h
│           ├── roi_align_kernel.cu
│           └── roi_align_kernel.h
├── prepro
│   ├── prepro_det.py
│   ├── prepro_dic_coco.py
│   ├── prepro_dic_flickr.py
│   ├── prepro_ngrams.py
│   ├── prepro_ngrams_bak.py
│   └── prepro_ngrams_flickr30k.py
└── tools
    ├── pycider
    │   ├── PyDataFormat
    │   │   ├── __init__.py
    │   │   ├── __init__.pyc
    │   │   ├── jsonify_refs.py
    │   │   ├── loadData.py
    │   │   └── loadData.pyc
    │   ├── README.md
    │   ├── cidereval.ipynb
    │   ├── cidereval.py
    │   ├── license.txt
    │   ├── params.json
    │   └── pyciderevalcap
    │       ├── __init__.py
    │       ├── __init__.pyc
    │       ├── cider
    │       │   ├── __init__.py
    │       │   ├── __init__.pyc
    │       │   ├── cider.py
    │       │   ├── cider.pyc
    │       │   ├── cider_scorer.py
    │       │   └── cider_scorer.pyc
    │       ├── ciderD
    │       │   ├── __init__.py
    │       │   ├── __init__.pyc
    │       │   ├── ciderD.py
    │       │   ├── ciderD.pyc
    │       │   ├── ciderD_scorer.py
    │       │   └── ciderD_scorer.pyc
    │       ├── eval.py
    │       ├── eval.pyc
    │       └── tokenizer
    │           ├── __init__.py
    │           ├── __init__.pyc
    │           ├── ptbtokenizer.py
    │           ├── ptbtokenizer.pyc
    │           ├── stanford-corenlp-3.4.1.jar
    │           ├── tmpBF49XX
    │           ├── tmpql9uU7
    │           ├── tmpuCp_T0
    │           ├── tmpxAmV_C
    │           └── tmpzNW4I2
    └── sentence_gen_tools
        ├── __init__.py
        └── coco_eval.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # project specific
2 | save/
3 |
4 | # remote ftp files
5 | .ftpconfig
6 |
7 | # Byte-compiled / optimized / DLL files
8 | __pycache__/
9 | *.py[cod]
10 | *$py.class
11 |
12 | # C extensions
13 | *.so
14 |
15 | # Distribution / packaging
16 | .Python
17 | .vector_cache/
18 | env/
19 | env3/
20 | build/
21 | data/
22 | logs/
23 | develop-eggs/
24 | dist/
25 | downloads/
26 | eggs/
27 | .eggs/
28 | lib/
29 | lib64/
30 | parts/
31 | sdist/
32 | var/
33 | *.egg-info/
34 | .installed.cfg
35 | *.egg
36 |
37 | # PyInstaller
38 | # Usually these files are written by a python script from a template
39 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
40 | *.manifest
41 | *.spec
42 |
43 | # logs
44 | pip-log.txt
45 | pip-delete-this-directory.txt
46 | *.log
47 |
48 | # Translations
49 | *.mo
50 | *.pot
51 |
52 | # PyBuilder
53 | target/
54 |
55 | # IPython Notebook
56 | .ipynb_checkpoints
57 |
58 | # pyenv
59 | .python-version
60 |
61 | # celery beat schedule file
62 | celerybeat-schedule
63 |
64 | # dotenv
65 | .env
66 |
67 | # virtualenv
68 | venv/
69 | ENV/
70 |
71 | # Spyder project settings
72 | .spyderproject
73 |
74 | # Rope project settings
75 | .ropeproject
76 |
77 | # OS X files
78 | .DS_Store
79 |
80 | .idea
81 | media/
82 | db.sqlite3
83 |
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "tools/coco-caption"]
2 | path = tools/coco-caption
3 | url = https://www.github.com/kdexd/coco-caption
4 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM pytorch/pytorch:0.4-cuda9-cudnn7-devel
2 |
3 | COPY . /workspace/neuralbabytalk
4 |
5 | # ----------------------------------------------------------------------------
6 | # -- install apt and pip dependencies
7 | # ----------------------------------------------------------------------------
8 |
9 | RUN apt-get update && \
10 | apt-get install -y \
11 | ant \
12 | ca-certificates-java \
13 | nano \
14 | openjdk-8-jdk \
15 | python2.7 \
16 | unzip \
17 | wget && \
18 | apt-get clean
19 |
20 | ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64/
21 | RUN update-ca-certificates -f && export JAVA_HOME
22 |
23 | RUN pip install Cython && pip install h5py \
24 | matplotlib \
25 | nltk \
26 | numpy \
27 | pycocotools \
28 | scikit-image \
29 | stanfordcorenlp \
30 | tensorflow \
31 | torchtext \
32 | tqdm && python -c "import nltk; nltk.download('punkt')"
33 |
34 |
35 | # ----------------------------------------------------------------------------
36 | # -- download pretrained imagenet weights for resnet-101
37 | # ----------------------------------------------------------------------------
38 |
39 | RUN mkdir /workspace/neuralbabytalk/data/imagenet_weights && \
40 | cd /workspace/neuralbabytalk/data/imagenet_weights && \
41 | wget --quiet https://www.dropbox.com/sh/67fc8n6ddo3qp47/AAACkO4QntI0RPvYic5voWHFa/resnet101.pth
42 |
43 |
44 | # ----------------------------------------------------------------------------
45 | # -- download Karpathy's preprocessed captions datasets and corenlp jar
46 | # ----------------------------------------------------------------------------
47 |
48 | RUN cd /workspace/neuralbabytalk/data && \
49 | wget --quiet http://cs.stanford.edu/people/karpathy/deepimagesent/caption_datasets.zip && \
50 | unzip caption_datasets.zip && \
51 | mv dataset_coco.json coco/ && \
52 | mv dataset_flickr30k.json flickr30k/ && \
53 | rm caption_datasets.zip dataset_flickr8k.json
54 |
55 | RUN cd /workspace/neuralbabytalk/prepro && \
56 | wget --quiet https://nlp.stanford.edu/software/stanford-corenlp-full-2017-06-09.zip && \
57 | unzip stanford-corenlp-full-2017-06-09.zip && \
58 | rm stanford-corenlp-full-2017-06-09.zip
59 |
60 | RUN cd /workspace/neuralbabytalk/tools/coco-caption && \
61 | sh get_stanford_models.sh
62 |
63 | # ----------------------------------------------------------------------------
64 | # -- download preprocessed COCO detection output HDF file and pretrained model
65 | # ----------------------------------------------------------------------------
66 |
67 | RUN cd /workspace/neuralbabytalk/data/coco && \
68 | wget --quiet https://www.dropbox.com/s/2gzo4ops5gbjx5h/coco_detection.h5.tar.gz && \
69 | tar -xzvf coco_detection.h5.tar.gz && \
70 | rm coco_detection.h5.tar.gz
71 |
72 | RUN mkdir -p /workspace/neuralbabytalk/save && \
73 | cd /workspace/neuralbabytalk/save && \
74 | wget --quiet https://www.dropbox.com/s/6buajkxm9oed1jp/coco_nbt_1024.tar.gz && \
75 | tar -xzvf coco_nbt_1024.tar.gz && \
76 | rm coco_nbt_1024.tar.gz
77 |
78 | WORKDIR /workspace/neuralbabytalk
79 | RUN python prepro/prepro_dic_coco.py \
80 | --input_json data/coco/dataset_coco.json \
81 | --split normal \
82 | --output_dic_json data/coco/dic_coco.json \
83 | --output_cap_json data/coco/cap_coco.json && \
84 | python prepro/prepro_dic_coco.py \
85 | --input_json data/coco/dataset_coco.json \
86 | --split robust \
87 | --output_dic_json data/robust_coco/dic_coco.json \
88 | --output_cap_json data/robust_coco/cap_coco.json && \
89 | python prepro/prepro_dic_coco.py \
90 | --input_json data/coco/dataset_coco.json \
91 | --split noc \
92 | --output_dic_json data/noc_coco/dic_coco.json \
93 | --output_cap_json data/noc_coco/cap_coco.json
94 |
95 | EXPOSE 8888
96 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2017 Jiasen Lu
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Neural Baby Talk
2 |
3 | 
4 |
5 |
6 | ## Docker Setup
7 |
8 | This repository provides a Dockerfile for setting up all dependencies and preprocessed data for the COCO experiments (normal / robust / NOC). Docker support for the Flickr30k experiments is not yet available. To build the image, execute this from the project root:
9 |
10 | ```shell
11 | docker build -t nbt .
12 | ```
13 |
14 | Before running the container, you need to have the COCO dataset downloaded somewhere on your filesystem. Declare two environment variables:
15 | 1. `$COCO_IMAGES`: path to a directory with sub-directories of images such as `train2014`, `val2014`, `test2015`, etc.
16 | 2. `$COCO_ANNOTATIONS`: path to a directory with annotation files such as `instances_train2014.json`, `captions_train2014.json`, etc.
17 |
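For example, assuming a local COCO download (these paths are placeholders; point them at your own copies):

```shell
# hypothetical local paths -- adjust to wherever you keep COCO
export COCO_IMAGES=/path/to/coco/images
export COCO_ANNOTATIONS=/path/to/coco/annotations
```
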
18 | These directories will be attached as "volumes" to the docker container for Neural Baby Talk to use. Run the docker image in an interactive container (bash session): get [nvidia-docker](https://www.github.com/NVIDIA/nvidia-docker) and execute this command to run the freshly built image.
19 |
20 | ```shell
21 | nvidia-docker run --name nbt_container -it \
22 | -v $COCO_IMAGES:/workspace/neuralbabytalk/data/coco/images \
23 | -v $COCO_ANNOTATIONS:/workspace/neuralbabytalk/data/coco/annotations \
24 | --shm-size 8G -p 8888:8888 nbt /bin/bash
25 | ```
26 |
27 | A shared memory size (`--shm-size`) of 8GB should be enough; tune it according to your requirements / machine specifications.
28 |
29 | **Saved Checkpoints:** All checkpoints will be saved in `/workspace/neuralbabytalk/save`. From outside the container, execute this to copy your checkpoints from the container into your local filesystem:
30 | 
31 | ```shell
32 | docker container cp nbt_container:/workspace/neuralbabytalk/save /path/to/local/filesystem/save
33 | ```
34 | The container exposes port 8888, which can be used to host tensorboard visualizations.
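
For example, to serve tensorboard on that port from inside the container (assuming your runs write tensorboard event files under `save/`, which may differ on your setup):

```shell
# run inside the container; the --logdir location is an assumption
tensorboard --logdir save/ --host 0.0.0.0 --port 8888
```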
35 |
36 | Skip directly to the **Training and Evaluation** section to execute the specified commands within the container.
37 |
38 |
39 | ## Requirements
40 |
41 | Inference:
42 |
43 | - [pytorch](http://pytorch.org/)
44 | - [torchvision](https://github.com/pytorch/vision)
45 | - [torchtext](https://github.com/pytorch/text)
46 |
47 | Data Preparation:
48 |
49 | - [stanford-corenlp-wrapper](https://github.com/Lynten/stanford-corenlp)
50 | - [stanford-corenlp](https://stanfordnlp.github.io/CoreNLP/)
51 |
52 | Evaluation:
53 |
54 | - [coco-caption](https://github.com/jiasenlu/coco-caption): Download the modified version of coco-caption and put it under `tools/`
55 |
56 |
57 | ## Demo
58 |
59 | #### Without detection bbox
60 |
61 |
62 | #### With detection bbox
63 |
64 | #### Constraint beam search
65 | This code also includes an implementation of the constrained beam search proposed by Peter Anderson. I'm not sure my implementation is 100% correct, but it works well in conjunction with the Neural Baby Talk code. You can refer to [this](http://users.cecs.anu.edu.au/~sgould/papers/emnlp17-constrained-beam-search.pdf) paper for more details. To enable CBS while decoding, set the following flags:
66 | ```
67 | --cbs True|False : Whether to use constrained beam search.
68 | --cbs_tag_size 3 : How many detection bboxes to include in the decoded caption.
69 | --cbs_mode all|unique|novel : Whether to allow repetitive bounding boxes. `novel` is an option only for the novel object captioning task.
70 | ```
71 |
72 | ## Training and Evaluation
73 | ### Data Preparation
74 | Head to `data/README.md`, and prepare the data for training and evaluation.
75 |
76 | ### Pretrained model
77 | | Task | Dataset | Backend | Batch size | Link |
78 | | ---- | :----:| :----:| :----:|:----:|
79 | | Standard image captioning | COCO | Res-101 | 100 | [Pre-trained Model](https://www.dropbox.com/s/6buajkxm9oed1jp/coco_nbt_1024.tar.gz?dl=0) |
80 | | Standard image captioning | Flickr30k | Res-101 | 50 | [Pre-trained Model](https://www.dropbox.com/s/cirzj1b2jul6yzx/flickr30k_nbt_1024.tar.gz?dl=0) |
81 | | Robust image captioning | COCO | Res-101 | 100 | [Pre-trained Model](https://www.dropbox.com/s/sxuodvob0ftesm9/robust_coco_nbt_1024.tar.gz?dl=0) |
82 | | Novel object captioning | COCO | Res-101 | 100 | [Pre-trained Model](https://www.dropbox.com/s/b7i6vx5pf98540l/noc_coco_nbt_1024.tar.gz?dl=0) |
83 |
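For example, to fetch and unpack the standard COCO model into `save/` (this mirrors the corresponding step in the Dockerfile; swap in the link for whichever model in the table you need):

```
mkdir -p save && cd save
wget https://www.dropbox.com/s/6buajkxm9oed1jp/coco_nbt_1024.tar.gz
tar -xzvf coco_nbt_1024.tar.gz && rm coco_nbt_1024.tar.gz
```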
84 |
85 | ### Standard Image Captioning
86 | ##### Training (COCO)
87 |
88 | First, modify the config file `cfgs/normal_coco_res101.yml` with the correct file paths.
89 |
90 | ```
91 | python main.py --path_opt cfgs/normal_coco_res101.yml --batch_size 20 --cuda True --num_workers 20 --max_epoch 30
92 | ```
93 | ##### Evaluation (COCO)
94 | Download the pre-trained model, extract the tar.gz file, and put it under `save/`.
95 |
96 | ```
97 | python main.py --path_opt cfgs/normal_coco_res101.yml --batch_size 20 --cuda True --num_workers 20 --max_epoch 30 --inference_only True --beam_size 3 --start_from save/coco_nbt_1024
98 | ```
99 |
100 | ##### Training (Flickr30k)
101 | Modify the config file `cfgs/normal_flickr_res101.yml` with the correct file paths.
102 |
103 | ```
104 | python main.py --path_opt cfgs/normal_flickr_res101.yml --batch_size 20 --cuda True --num_workers 20 --max_epoch 30
105 | ```
106 |
107 | ##### Evaluation (Flickr30k)
108 | Download the pre-trained model, extract the tar.gz file, and put it under `save/`.
109 |
110 | ```
111 | python main.py --path_opt cfgs/normal_flickr_res101.yml --batch_size 20 --cuda True --num_workers 20 --max_epoch 30 --inference_only True --beam_size 3 --start_from save/flickr30k_nbt_1024
112 | ```
113 |
114 | ### Robust Image Captioning
115 |
116 | ##### Training
117 | Modify the config file `cfgs/robust_coco.yml` with the correct file paths.
118 |
119 | ```
120 | python main.py --path_opt cfgs/robust_coco.yml --batch_size 20 --cuda True --num_workers 20 --max_epoch 30
121 | ```
122 | ##### Evaluation (robust-coco)
123 | Download the pre-trained model, extract the tar.gz file, and put it under `save/`.
124 |
125 | ```
126 | python main.py --path_opt cfgs/robust_coco.yml --batch_size 20 --cuda True --num_workers 20 --max_epoch 30 --inference_only True --beam_size 3 --start_from save/robust_coco_nbt_1024
127 | ```
128 |
129 | ### Novel Object Captioning
130 |
131 | ##### Training
132 | Modify the config file `cfgs/noc_coco_res101.yml` with the correct file paths.
133 |
134 | ```
135 | python main.py --path_opt cfgs/noc_coco_res101.yml --batch_size 20 --cuda True --num_workers 20 --max_epoch 30
136 | ```
137 | ##### Evaluation (noc-coco)
138 | Download the pre-trained model, extract the tar.gz file, and put it under `save/`.
139 |
140 | ```
141 | python main.py --path_opt cfgs/noc_coco_res101.yml --batch_size 20 --cuda True --num_workers 20 --max_epoch 30 --inference_only True --beam_size 3 --start_from save/noc_coco_nbt_1024
142 | ```
143 |
144 | ### Multi-GPU Training
145 | This codebase also supports training with multiple GPUs. To enable this feature, simply add `--mGPUs True` to the command, as in the example below.
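
For example, multi-GPU training on standard COCO (the base command is the one shown in the Standard Image Captioning section above):

```
python main.py --path_opt cfgs/normal_coco_res101.yml --batch_size 20 --cuda True --num_workers 20 --max_epoch 30 --mGPUs True
```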
146 |
147 | ### Self-Critic Training and Fine-Tuning CNN
148 |
149 | This codebase also supports self-critical training and fine-tuning the CNN. You are welcome to try this part and upload your trained model to the repo!
150 |
151 | ## More Visualization Results
152 | 
153 |
154 | ## Reference
155 | If you use this code as part of any published research, please acknowledge the following paper:
156 |
157 | ```
158 | @inproceedings{Lu2018Neural,
159 | author = {Lu, Jiasen and Yang, Jianwei and Batra, Dhruv and Parikh, Devi},
160 | title = {Neural Baby Talk},
161 | booktitle = {CVPR},
162 | year = {2018}
163 | }
164 | ```
165 | ## Acknowledgement
166 | We thank Ruotian Luo for his [self-critical.pytorch](https://github.com/ruotianluo/self-critical.pytorch) repo.
167 |
--------------------------------------------------------------------------------
/cfgs/noc_coco_res101.yml:
--------------------------------------------------------------------------------
1 | # dataset setting
2 | dataset: coco
3 | input_json: '/srv/share/jiasenlu/nbtv2/data/coco/cap_coco.json'
4 | input_dic: '/srv/share/jiasenlu/nbtv2/data/coco_noc/dic_coco.json'
5 | image_path: '/srv/share/datasets/coco/images'
6 | proposal_h5: '/srv/share/jiasenlu/nbtv2/data/coco_noc/coco_noc_detection.h5'
7 | data_path: '/srv/share/jiasenlu/nbtv2/data'
8 | # language model
9 | cnn_backend: res101
10 | att_model: topdown
11 | rnn_size: 1024
12 | num_layers: 1
13 | seq_length: 20
14 | # image model
15 | image_size: 576
16 | image_crop_size: 512
17 | # decode setting
18 | decode_noc: True
19 | cached_tokens: 'coco-train-idxs'
20 | val_split: 'test'
21 | val_images_use: -1
22 | cider_df: 'noc_test_freq'
23 | optim: 'adam'
--------------------------------------------------------------------------------
/cfgs/noc_coco_vgg16.yml:
--------------------------------------------------------------------------------
1 | # dataset setting
2 | dataset: coco
3 | input_json: '/srv/share/jiasenlu/nbtv2/data/coco/cap_coco.json'
4 | input_dic: '/srv/share/jiasenlu/nbtv2/data/coco_noc/dic_coco.json'
5 | image_path: '/srv/share/datasets/coco/images'
6 | proposal_h5: '/srv/share/jiasenlu/nbtv2/data/coco_noc/coco_noc_detection.h5'
7 | checkpoint_path: '/srv/share/jiasenlu/nbtv2/model/coco_noc_vgg16'
8 | data_path: '/srv/share/jiasenlu/nbtv2/data'
9 | # language model
10 | cnn_backend: vgg16
11 | att_feat_size: 512
12 | fc_feat_size: 4096
13 | att_model: topdown
14 | rnn_size: 1024
15 | num_layers: 1
16 | seq_length: 20
17 | # image model
18 | image_size: 576
19 | image_crop_size: 512
20 | # decode setting
21 | decode_noc: True
22 | cached_tokens: 'coco-train-idxs'
23 | val_split: 'test'
24 | val_images_use: -1
25 | cider_df: 'noc_test_freq'
--------------------------------------------------------------------------------
/cfgs/normal_coco_res101.yml:
--------------------------------------------------------------------------------
1 | # dataset setting
2 | dataset: coco
3 | input_json: 'data/coco/cap_coco.json'
4 | input_dic: 'data/coco/dic_coco.json'
5 | image_path: 'data/coco/images'
6 | proposal_h5: 'data/coco/coco_detection.h5'
7 | data_path: 'data'
8 | # language model
9 | cnn_backend: res101
10 | att_model: topdown
11 | rnn_size: 1024
12 | num_layers: 1
13 | seq_length: 20
14 | # image model
15 | image_size: 576
16 | image_crop_size: 512
17 | # decode setting
18 | decode_noc: False
19 | cached_tokens: 'coco-train-idxs'
20 | val_split: 'test'
21 | val_images_use: -1
22 | cider_df: 'corpus'
23 | optim: 'adam'
24 | checkpoint_path: 'save/normal_coco_1024_adam'
25 |
--------------------------------------------------------------------------------
/cfgs/robust_coco.yml:
--------------------------------------------------------------------------------
1 | # dataset setting
2 | dataset: coco
3 | input_json: 'data/robust_coco/cap_coco.json'
4 | input_dic: 'data/robust_coco/dic_coco.json'
5 | image_path: 'data/coco/images'
6 | proposal_h5: 'data/coco/coco_detection.h5'
7 | data_path: 'data'
8 | # language model
9 | cnn_backend: res101
10 | att_model: topdown
11 | rnn_size: 1024
12 | num_layers: 1
13 | seq_length: 20
14 | # image model
15 | image_size: 576
16 | image_crop_size: 512
17 | # decode setting
18 | decode_noc: False
19 | cached_tokens: 'coco-all-idxs'
20 | val_split: 'test'
21 | val_images_use: -1
22 | cider_df: 'corpus'
23 | optim: 'adam'
24 | checkpoint_path: 'save/robust_coco_1024'
25 |
--------------------------------------------------------------------------------
/data/README.md:
--------------------------------------------------------------------------------
1 | ## Data Preparation for Neural Baby Talk
2 | ### Image Dataset
3 |
4 | - COCO: Download COCO images from [link](http://cocodataset.org/#download); we need the `2014 training` and `2014 val` images. Put the images in some directory, denoted as `$IMAGE_ROOT`.
5 |
6 | - Flickr30k: Download Flickr30k Entities images from [link](http://web.engr.illinois.edu/~bplumme2/Flickr30kEntities/); you may need to fill out a form to get the annotations.
7 |
8 | ### Pretrained CNN weight
9 | - Download the pretrained CNN weights from [link](https://www.dropbox.com/sh/67fc8n6ddo3qp47/AADUMRqlcvjv4zqBX6K2L8c2a?dl=0) and put them into `data/`.
10 |
11 | ### COCO
12 | - Download Karpathy's preprocessed split of the COCO captions from [link](http://cs.stanford.edu/people/karpathy/deepimagesent/caption_datasets.zip). Extract `dataset_coco.json` from the zip file and copy it into `coco/`.
13 | - Download the COCO 2014 Train/Val annotations from [link](http://images.cocodataset.org/annotations/annotations_trainval2014.zip). Extract the zip file and put the json files under `coco/annotations/`.
14 | - Download the Stanford CoreNLP tools and modify `prepro/prepro_dic_coco.py` with the correct Stanford CoreNLP location. (In my experiments, I use the `stanford-corenlp-full-2017-06-09` version, [link](https://nlp.stanford.edu/software/stanford-corenlp-full-2017-06-09.zip).)
15 | - You can either download the preprocessed data from [here](https://www.dropbox.com/s/1t9nrbevzqn93to/coco.tar.gz?dl=0) or you can use the pre-process script to generate the data. Under the `root` directory, run the following command to pre-process the data.
16 | ```
17 | python prepro/prepro_dic_coco.py --input_json data/coco/dataset_coco.json --split normal --output_dic_json data/coco/dic_coco.json --output_cap_json data/coco/cap_coco.json
18 | ```
19 | - Download the pre-extracted COCO detection results from [link](https://www.dropbox.com/s/2gzo4ops5gbjx5h/coco_detection.h5.tar.gz?dl=0), extract the tar.gz file, and copy it into `coco/`. You can also extract detections using our reimplementation of the Faster R-CNN code, or any existing detection framework. The format of the bounding box data will be added later.
20 | - After all these steps, we are ready to train the model for coco :)
21 |
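As a quick sanity check, `data/coco/` should end up containing roughly the files referenced by `cfgs/normal_coco_res101.yml` and the Dockerfile (exact contents may differ depending on which steps you followed):

```
ls data/coco
# annotations/  images/  dataset_coco.json  dic_coco.json  cap_coco.json  coco_detection.h5
```
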
22 | ### Flickr30k
23 | - Download Karpathy's preprocessed caption splits from [link](http://cs.stanford.edu/people/karpathy/deepimagesent/caption_datasets.zip). Extract `dataset_flickr30k.json` from the zip file and copy it into `flickr30k/`.
24 | - Download the preprocessed Flickr30k annotations for Neural Baby Talk (annotations that link the nouns to specific bounding boxes) from [link](https://www.dropbox.com/s/h4ru86ocb10axa1/flickr30k_cleaned_class.json.tar.gz?dl=0). Extract the tar.gz file and copy it into `flickr30k/`.
25 | - Download Stanford CoreNLP as in the COCO instructions.
26 | - You can either download the preprocessed data from [here](https://www.dropbox.com/s/twve5exs8qj9xgd/flickr30k.tar.gz?dl=0) or you can use the pre-process script to generate the data. Under the `root` directory, run the following command to pre-process the data.
27 | ```
28 | python prepro/prepro_dic_flickr.py --input_json data/flickr30k/dataset_flickr30k.json --input_class_name data/flickr30k/flickr30k_class_name.txt
29 | ```
30 | - Download the pre-extracted Flickr30k detection results from [link](https://www.dropbox.com/s/5o6so7h4xq5ki1t/flickr30k_detection.h5.tar.gz?dl=0), extract the tar.gz file, and copy it into `flickr30k/`. You can also extract detections using our reimplementation of the Faster R-CNN code, or any existing detection framework. The format of the bounding box data will be added later.
31 | - After all these steps, we are ready to train the model for flickr30k :)
32 |
33 | ### Robust-COCO
34 | - Follow steps 1-3 and 5 of the COCO instructions.
35 | - You can either download the preprocessed data from [here](https://www.dropbox.com/s/tevyub9rxz6d22l/coco_robust.tar.gz?dl=0) or you can use the pre-process script to generate the data. Under the `root` directory, run the following command to pre-process the data.
36 | ```
37 | python prepro/prepro_dic_coco.py --input_json data/coco/dataset_coco.json --split robust --output_dic_json data/robust_coco/dic_coco.json --output_cap_json data/robust_coco/cap_coco.json
38 | ```
39 |
40 | ### NOC-COCO
41 | - Follow steps 1-3 of the COCO instructions.
42 | - You can either download the preprocessed data from [here](https://www.dropbox.com/s/tevyub9rxz6d22l/coco_robust.tar.gz?dl=0) or you can use the pre-process script to generate the data. Under the `root` directory, run the following command to pre-process the data.
43 | ```
44 | python prepro/prepro_dic_coco.py --input_json data/coco/dataset_coco.json --split noc --output_dic_json data/noc_coco/dic_coco.json --output_cap_json data/noc_coco/cap_coco.json
45 | ```
46 | - Download the pre-extracted COCO detection results trained on `train2014` from [link](https://www.dropbox.com/s/2gzo4ops5gbjx5h/coco_detection.h5.tar.gz?dl=0), extract the tar.gz file, and copy it into `coco/`. You can also extract detections using our reimplementation of the Faster R-CNN code, or any existing detection framework. The format of the bounding box data will be added later.
47 |
48 |
--------------------------------------------------------------------------------
/data/coco/coco_class_name.txt:
--------------------------------------------------------------------------------
1 | person, girl, boy, man, woman, kid, child, chef, baker, people, adult, rider, children, baby, worker, passenger, sister, biker, policeman, cop, officer, lady, cowboy, bride, groom, male, female, guy, traveler, mother, father, gentleman, pitcher, player, skier, snowboarder, skater, skateboarder, person, woman, guy, foreigner, child, gentleman, caller, offender, coworker, trespasser, patient, politician, soldier, grandchild, serviceman, walker, drinker, doctor, bicyclist, thief, buyer, teenager, student, camper, driver, solider, hunter, shopper, villager
2 | bicycle, bike, bicycle, bike, unicycle, minibike, trike
3 | car, automobile, van, minivan, sedan, suv, hatchback, cab, jeep, coupe, taxicab, limo, taxi
4 | motorcycle, scooter, motor bike, motor cycle, motorbike, scooter, moped
5 | airplane, jetliner, plane, air plane, monoplane, aircraft, jet, jetliner, airbus, biplane, seaplane
6 | bus, minibus, trolley
7 | train, locomotive, tramway, caboose
8 | truck, pickup, lorry, hauler, firetruck
9 | boat, ship, liner, sailboat, motorboat, dinghy, powerboat, speedboat, canoe, skiff, yacht, kayak, catamaran, pontoon, houseboat, vessel, rowboat, trawler, ferryboat, watercraft, tugboat, schooner, barge, ferry, sailboard, paddleboat, lifeboat, freighter, steamboat, riverboat, surfboard, battleship, steamship
10 | traffic light, street light, traffic signal, stop light, streetlight, stoplight
11 | fire hydrant, hydrant
12 | stop sign, street sign
13 | parking meter
14 | bench, pew
15 | bird, ostrich, owl, seagull, goose, duck, parakeet, falcon, robin, pelican, waterfowl, heron, hummingbird, mallard, finch, pigeon, sparrow, seabird, osprey, blackbird, fowl, shorebird, woodpecker, egret, chickadee, quail, bluebird, kingfisher, buzzard, willet, gull, swan, bluejay, flamingo, cormorant, parrot, loon, gosling, waterbird, pheasant, rooster, sandpiper, crow, raven, turkey, oriole, cowbird, warbler, magpie, peacock, cockatiel, lorikeet, puffin, vulture, condor, macaw, peafowl, cockatoo, songbird
16 | cat, kitten, feline, tabby
17 | dog, puppy, beagle, pup, chihuahua, schnauzer, dachshund, rottweiler, canine, pitbull, collie, pug, terrier, poodle, labrador, doggie, doberman, mutt, doggy, spaniel, bulldog, sheepdog, weimaraner, corgi, cocker, greyhound, retriever, brindle, hound, whippet, husky
18 | horse, colt, pony, racehorse, stallion, equine, mare, foal, palomino, mustang, clydesdale, bronc, bronco
19 | sheep, lamb, goat, ram, cattle, lamb, goat, ewe
20 | cow, cattle, oxen, ox, calf, cattle, ewe, holstein, heifer, buffalo, bull, zebu, bison
21 | elephant
22 | bear, panda
23 | zebra
24 | giraffe
25 | backpack, knapsack
26 | umbrella
27 | handbag, wallet, purse, briefcase
28 | tie
29 | suitcase, suit case, luggage
30 | frisbee
31 | skis, ski
32 | snowboard
33 | sports ball, baseball, ball, football, soccer, basketball, softball, volleyball, pinball, fastball, racquetball
34 | kite
35 | baseball bat
36 | baseball glove
37 | skateboard
38 | surfboard, longboard, skimboard, shortboard, wakeboard
39 | tennis racket
40 | bottle
41 | wine glass
42 | cup
43 | fork
44 | knife, pocketknife, knive
45 | spoon
46 | bowl, container, plate
47 | banana
48 | apple
49 | sandwich, burger, sub, cheeseburger, hamburger
50 | orange, lemons
51 | broccoli
52 | carrot
53 | hot dog
54 | pizza
55 | donut, doughnut, bagel
56 | cake, cheesecake, cupcake, shortcake, coffeecake, pancake
57 | chair, seat, recliner, stool
58 | couch, sofa, recliner, futon, loveseat, settee, chesterfield
59 | potted plant, houseplant
60 | bed
61 | dining table, table
62 | toilet, urinal, commode, toilet, lavatory, potty
63 | tv, monitor, televison, television
64 | laptop, computer, notebook, netbook, lenovo, macbook
65 | mouse
66 | remote
67 | keyboard
68 | cell phone, mobile phone, phone, cellphone, telephone, phon, smartphone, iPhone
69 | microwave
70 | oven, stovetop, stove
71 | toaster
72 | sink
73 | refrigerator, fridge, fridge, freezer
74 | book
75 | clock
76 | vase
77 | scissors
78 | teddy bear, teddybear
79 | hair drier, hairdryer
80 | toothbrush
--------------------------------------------------------------------------------
/data/flickr30k/flickr30k_class_name.txt:
--------------------------------------------------------------------------------
1 | railing
2 | blouse
3 | puppy
4 | climber
5 | glass
6 | hole
7 | machine
8 | wine
9 | backpack
10 | telescope
11 | basketball
12 | tree
13 | runner
14 | escalator
15 | concrete
16 | object
17 | mouth
18 | singer
19 | bush
20 | lady
21 | plate
22 | hammer
23 | jumpsuit
24 | worker
25 | child
26 | player
27 | violin
28 | hay
29 | hat
30 | pavement
31 | crowd
32 | people
33 | trampoline
34 | son
35 | fabric
36 | onlooker
37 | passenger
38 | floor
39 | chair
40 | team
41 | sign
42 | blond-hair
43 | wheelchair
44 | truck
45 | accordion
46 | apron
47 | container
48 | city
49 | alley
50 | knee
51 | lawn
52 | robe
53 | artist
54 | screen
55 | boat
56 | newspaper
57 | canoe
58 | purse
59 | advertisement
60 | instrument
61 | skier
62 | tube
63 | stone
64 | bubble
65 | cream
66 | beverage
67 | groom
68 | mask
69 | smile
70 | door
71 | male
72 | dress
73 | plant
74 | plane
75 | volleyball
76 | paper
77 | swimsuit
78 | number
79 | pedestrian
80 | pajamas
81 | banner
82 | set
83 | crosswalk
84 | church
85 | belt
86 | fire
87 | racer
88 | person
89 | sandal
90 | couple
91 | sock
92 | suit
93 | poster
94 | t-shirt
95 | line
96 | lane
97 | race
98 | athlete
99 | bird
100 | leg
101 | baby
102 | customer
103 | trumpet
104 | animal
105 | counter
106 | dock
107 | obstacle
108 | graffitus
109 | scarf
110 | shorts
111 | device
112 | face
113 | mustache
114 | painting
115 | biker
116 | motorcycle
117 | wire
118 | drum
119 | ramp
120 | doorway
121 | ball
122 | drink
123 | overalls
124 | desk
125 | pier
126 | stage
127 | ponytail
128 | bike
129 | blanket
130 | daughter
131 | sweater
132 | work
133 | beach
134 | ladder
135 | lap
136 | coffee
137 | band
138 | bread
139 | hurdle
140 | train
141 | sled
142 | goalie
143 | gentleman
144 | kitchen
145 | cow
146 | cone
147 | wheel
148 | rail
149 | hand
150 | goggles
151 | board
152 | gun
153 | sidewalk
154 | uniform
155 | teacher
156 | pillow
157 | snowboard
158 | market
159 | car
160 | cap
161 | cat
162 | clothing
163 | airplane
164 | staircase
165 | haircut
166 | window
167 | cart
168 | card
169 | ring
170 | sheep
171 | friend
172 | equipment
173 | tooth
174 | snack
175 | stair
176 | jeans
177 | fruit
178 | snowboarder
179 | log
180 | area
181 | lot
182 | pitcher
183 | bucket
184 | podium
185 | pool
186 | building
187 | gymnast
188 | fountain
189 | fence
190 | trunk
191 | soldier
192 | family
193 | grill
194 | tattoo
195 | food
196 | foot
197 | dirt
198 | base
199 | horse
200 | station
201 | bride
202 | scooter
203 | lake
204 | rope
205 | bikini
206 | camera
207 | game
208 | parade
209 | step
210 | block
211 | structure
212 | wave
213 | booth
214 | vehicle
215 | ride
216 | skirt
217 | costume
218 | broom
219 | skater
220 | slide
221 | umbrella
222 | art
223 | beard
224 | neck
225 | bed
226 | basket
227 | cellphone
228 | computer
229 | tent
230 | group
231 | jersey
232 | surface
233 | balloon
234 | fireman
235 | tongue
236 | carriage
237 | display
238 | balcony
239 | clothes
240 | net
241 | red-hair
242 | adult
243 | room
244 | roof
245 | deck
246 | keyboard
247 | crew
248 | meat
249 | meal
250 | hoodie
251 | hoop
252 | chef
253 | chest
254 | dog
255 | bat
256 | bar
257 | bag
258 | microscope
259 | wetsuit
260 | cane
261 | vegetable
262 | waterfall
263 | kid
264 | shoulder
265 | skateboarder
266 | magazine
267 | ship
268 | jacket
269 | father
270 | item
271 | makeup
272 | box
273 | boy
274 | kayak
275 | pink
276 | sword
277 | map
278 | mat
279 | man
280 | rock
281 | girl
282 | headband
283 | tractor
284 | track
285 | sunglass
286 | shop
287 | shoe
288 | corner
289 | seat
290 | doctor
291 | pan
292 | bottle
293 | audience
294 | nose
295 | sneaker
296 | knife
297 | road
298 | harness
299 | walkway
300 | field
301 | ribbon
302 | eye
303 | raft
304 | coat
305 | infant
306 | house
307 | fish
308 | flower
309 | pigeon
310 | paint
311 | leash
312 | park
313 | mountain
314 | couch
315 | individual
316 | restaurant
317 | dancer
318 | stripe
319 | cup
320 | glove
321 | cheerleader
322 | back
323 | mirror
324 | candle
325 | goods
326 | jockey
327 | opponent
328 | curb
329 | firetruck
330 | stroller
331 | mural
332 | trail
333 | forest
334 | sweatshirt
335 | yard
336 | skateboard
337 | gear
338 | beam
339 | puddle
340 | racket
341 | swimmer
342 | orange
343 | bull
344 | bench
345 | heel
346 | hair
347 | hose
348 | guard
349 | female
350 | firefighter
351 | bus
352 | ledge
353 | hiker
354 | motorcyclist
355 | bicycle
356 | street
357 | path
358 | luggage
359 | scaffolding
360 | phone
361 | drummer
362 | gate
363 | tourist
364 | sand
365 | outfits
366 | toy
367 | top
368 | tool
369 | bridge
370 | snow
371 | rider
372 | stool
373 | mud
374 | finger
375 | metal
376 | surfer
377 | beer
378 | microphone
379 | ocean
380 | mother
381 | laptop
382 | teenager
383 | officer
384 | ice
385 | cowboy
386 | head
387 | papers
388 | tie
389 | picture
390 | football
391 | policeman
392 | water
393 | baseball
394 | tire
395 | post
396 | piano
397 | performer
398 | figure
399 | platform
400 | wagon
401 | swing
402 | slope
403 | wood
404 | bicyclist
405 | guitar
406 | color
407 | pot
408 | pole
409 | teammate
410 | vest
411 | someone
412 | helmet
413 | bowl
414 | driver
415 | statue
416 | towel
417 | table
418 | stand
419 | garb
420 | grass
421 | dish
422 | woman
423 | fan
424 | saxophone
425 | sun
426 | flag
427 | stick
428 | pond
429 | court
430 | goal
431 | shore
432 | hill
433 | guy
434 | store
435 | surfboard
436 | cigarette
437 | arm
438 | outfit
439 | referee
440 | shirt
441 | machinery
442 | vendor
443 | clown
444 | cloth
445 | attire
446 | cyclist
447 | shovel
448 | duck
449 | stream
450 | musician
451 | something
452 | toddler
453 | light
454 | necklace
455 | van
456 | river
457 | sculpture
458 | class
459 | pipe
460 | ear
461 | pants
462 | wall
463 | motorbike
464 | member
465 | student
466 | collar
467 | spectator
468 | bandanna
469 | camel
470 | boot
471 | sky
472 | book
473 | ski
474 | leaf
475 | headphone
476 | cliff
477 | cake
478 | guitarist
479 | other
480 | branch
481 |
--------------------------------------------------------------------------------
/demo.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 |
5 | import torch
6 | import torch.nn as nn
7 | from torch.autograd import Variable
8 | import torch.optim as optim
9 |
10 | import numpy as np
11 | import time
12 | import os
13 | from six.moves import cPickle
14 | import torch.backends.cudnn as cudnn
15 | import yaml
16 |
17 | import opts
18 | import misc.eval_utils
19 | import misc.utils as utils
20 | import misc.AttModel as AttModel
21 | import yaml
22 |
23 | # from misc.rewards import get_self_critical_reward
24 | import torchvision.transforms as transforms
25 | import pdb
26 | import argparse
27 | import torch.nn.functional as F
28 | import matplotlib.pyplot as plt
29 | from PIL import Image
30 | plt.switch_backend('agg')
31 | import json
32 | def demo(opt):
33 | model.eval()
34 | #########################################################################################
35 | # eval begins here
36 | #########################################################################################
37 | data_iter_val = iter(dataloader_val)
38 | loss_temp = 0
39 | start = time.time()
40 |
41 | num_show = 0
42 | predictions = []
43 | count = 0
44 | for step in range(1000):
45 | data = next(data_iter_val)
46 | img, iseq, gts_seq, num, proposals, bboxs, box_mask, img_id = data
47 |
48 | # if img_id[0] != 134688:
49 | # continue
50 |
51 | # # for i in range(proposals.size(1)): print(opt.itoc[proposals[0][i][4]], i)
52 |
53 | # # list1 = [6, 10]
54 | # list1 = [0, 1, 10, 2, 3, 4, 5, 6, 7, 8, 9]
55 | # proposals = proposals[:,list1]
56 | # num[0,1] = len(list1)
57 | proposals = proposals[:,:max(int(max(num[:,1])),1),:] # keep only the valid proposals (num[:,1] holds the per-image proposal count)
58 |
59 | input_imgs.data.resize_(img.size()).copy_(img)
60 | input_seqs.data.resize_(iseq.size()).copy_(iseq)
61 | gt_seqs.data.resize_(gts_seq.size()).copy_(gts_seq)
62 | input_num.data.resize_(num.size()).copy_(num)
63 | input_ppls.data.resize_(proposals.size()).copy_(proposals)
64 | gt_bboxs.data.resize_(bboxs.size()).copy_(bboxs)
65 | mask_bboxs.data.resize_(box_mask.size()).copy_(box_mask)
66 | input_imgs.data.resize_(img.size()).copy_(img)
67 |
68 | eval_opt = {'sample_max':1, 'beam_size': opt.beam_size, 'inference_mode' : True, 'tag_size' : opt.cbs_tag_size}
69 | seq, bn_seq, fg_seq, _, _, _ = model._sample(input_imgs, input_ppls, input_num, eval_opt)
70 |
71 | sents, det_idx, det_word = utils.decode_sequence_det(dataset_val.itow, dataset_val.itod, dataset_val.ltow, dataset_val.itoc, dataset_val.wtod, \
72 | seq, bn_seq, fg_seq, opt.vocab_size, opt)
73 |
74 | if opt.dataset == 'flickr30k':
75 | im2show = Image.open(os.path.join(opt.image_path, '%d.jpg' % img_id[0])).convert('RGB')
76 | else:
77 |
78 | if os.path.isfile(os.path.join(opt.image_path, 'val2014/COCO_val2014_%012d.jpg' % img_id[0])):
79 | im2show = Image.open(os.path.join(opt.image_path, 'val2014/COCO_val2014_%012d.jpg' % img_id[0])).convert('RGB')
80 | else:
81 | im2show = Image.open(os.path.join(opt.image_path, 'train2014/COCO_train2014_%012d.jpg' % img_id[0])).convert('RGB')
82 |
83 | w, h = im2show.size
84 |
85 | rest_idx = []
86 | for i in range(proposals[0].shape[0]):
87 | if i not in det_idx:
88 | rest_idx.append(i)
89 |
90 |
91 | if len(det_idx) > 0:
92 | # for visualization
93 | proposals = proposals[0].numpy()
94 | proposals[:,0] = proposals[:,0] * w / float(opt.image_crop_size)
95 | proposals[:,2] = proposals[:,2] * w / float(opt.image_crop_size)
96 | proposals[:,1] = proposals[:,1] * h / float(opt.image_crop_size)
97 | proposals[:,3] = proposals[:,3] * h / float(opt.image_crop_size)
98 |
99 | cls_dets = proposals[det_idx]
100 | rest_dets = proposals[rest_idx]
101 |
102 | # fig = plt.figure()
103 | # fig = plt.figure(frameon=False)
104 | # ax = plt.Axes(fig, [0., 0., 1., 1.])
105 | fig = plt.figure(frameon=False)
106 | # fig.set_size_inches(5,5*h/w)
107 | ax = plt.Axes(fig, [0., 0., 1., 1.])
108 | ax.set_axis_off()
109 | fig.add_axes(ax)
110 | a=fig.gca()
111 | a.set_frame_on(False)
112 | a.set_xticks([]); a.set_yticks([])
113 | plt.axis('off')
114 | plt.xlim(0,w); plt.ylim(h,0)
115 | # fig, ax = plt.subplots(1)
116 |
117 | # show other box in grey.
118 |
119 | plt.imshow(im2show)
120 |
121 | if len(rest_idx) > 0:
122 | for i in range(len(rest_dets)):
123 | ax = utils.vis_detections(ax, dataset_val.itoc[int(rest_dets[i,4])], rest_dets[i,:5], i, 1)
124 |
125 | if len(det_idx) > 0:
126 | for i in range(len(cls_dets)):
127 | ax = utils.vis_detections(ax, dataset_val.itoc[int(cls_dets[i,4])], cls_dets[i,:5], i, 0)
128 |
129 | # plt.axis('off')
130 | # plt.axis('tight')
131 | # plt.tight_layout()
132 | fig.savefig('visu/%d.jpg' %(img_id[0]), bbox_inches='tight', pad_inches=0, dpi=150)
133 | print(str(img_id[0]) + ': ' + sents[0])
134 |
135 | entry = {'image_id': img_id[0], 'caption': sents[0]}
136 | predictions.append(entry)
137 |
138 | return predictions
139 | ####################################################################################
140 | # Main
141 | ####################################################################################
142 | # initialize the data holder.
143 | if __name__ == '__main__':
144 |
145 | parser = argparse.ArgumentParser()
146 | parser.add_argument('--start_from', type=str, default='', help='')
147 | parser.add_argument('--load_best_score', type=int, default=1,
148 | help='Do we load previous best score when resuming training.')
149 | parser.add_argument('--id', type=str, default='',
150 | help='an id identifying this run/job. used in cross-val and appended when writing progress files')
151 | parser.add_argument('--image_path', type=str, default='/home/jiasen/data/coco/images/',
152 | help='path to the h5file containing the image data')
153 | parser.add_argument('--cbs', type=bool, default=False,
154 | help='whether to use constrained beam search.')
155 | parser.add_argument('--cbs_tag_size', type=int, default=3,
156 | help='how many detection bboxes to include in the decoded caption.')
157 | parser.add_argument('--cbs_mode', type=str, default='all',
158 | help='which cbs mode to use in the decoding stage. cbs_mode: all|unique|novel')
159 | parser.add_argument('--det_oracle', type=bool, default=False,
160 | help='whether to use oracle bounding boxes.')
161 | parser.add_argument('--cnn_backend', type=str, default='res101',
162 | help='res101 or vgg16')
163 | parser.add_argument('--data_path', type=str, default='')
164 | parser.add_argument('--beam_size', type=int, default=1)
165 |
166 | args = parser.parse_args()
167 |
168 | infos = {}
169 | histories = {}
170 | if args.start_from is not None:
171 | if args.load_best_score == 1:
172 | model_path = os.path.join(args.start_from, 'model-best.pth')
173 | info_path = os.path.join(args.start_from, 'infos_'+args.id+'-best.pkl')
174 | else:
175 | model_path = os.path.join(args.start_from, 'model.pth')
176 | info_path = os.path.join(args.start_from, 'infos_'+args.id+'.pkl')
177 |
178 | # open old infos and check if models are compatible
179 | with open(info_path, 'rb') as f:
180 | infos = cPickle.load(f)
181 | opt = infos['opt']
182 | opt.image_path = args.image_path
183 | opt.cbs = args.cbs
184 | opt.cbs_tag_size = args.cbs_tag_size
185 | opt.cbs_mode = args.cbs_mode
186 | opt.det_oracle = args.det_oracle
187 | opt.cnn_backend = args.cnn_backend
188 | opt.data_path = args.data_path
189 | opt.beam_size = args.beam_size
190 | else:
191 | print("please specify the model path...")
192 | pdb.set_trace()
193 |
194 | cudnn.benchmark = True
195 |
196 | if opt.dataset == 'flickr30k':
197 | from misc.dataloader_flickr30k import DataLoader
198 | else:
199 | from misc.dataloader_coco import DataLoader
200 |
201 |
202 | ####################################################################################
203 | # Data Loader
204 | ####################################################################################
205 | dataset_val = DataLoader(opt, split='test')
206 | dataloader_val = torch.utils.data.DataLoader(dataset_val, batch_size=1,
207 | shuffle=False, num_workers=0)
208 |
209 | input_imgs = torch.FloatTensor(1)
210 | input_seqs = torch.LongTensor(1)
211 | input_ppls = torch.FloatTensor(1)
212 | gt_bboxs = torch.FloatTensor(1)
213 | mask_bboxs = torch.ByteTensor(1)
214 | gt_seqs = torch.LongTensor(1)
215 | input_num = torch.LongTensor(1)
216 |
217 | if opt.cuda:
218 | input_imgs = input_imgs.cuda()
219 | input_seqs = input_seqs.cuda()
220 | gt_seqs = gt_seqs.cuda()
221 | input_num = input_num.cuda()
222 | input_ppls = input_ppls.cuda()
223 | gt_bboxs = gt_bboxs.cuda()
224 | mask_bboxs = mask_bboxs.cuda()
225 |
226 | input_imgs = Variable(input_imgs)
227 | input_seqs = Variable(input_seqs)
228 | gt_seqs = Variable(gt_seqs)
229 | input_num = Variable(input_num)
230 | input_ppls = Variable(input_ppls)
231 | gt_bboxs = Variable(gt_bboxs)
232 | mask_bboxs = Variable(mask_bboxs)
233 |
234 | ####################################################################################
235 | # Build the Model
236 | ####################################################################################
237 | opt.vocab_size = dataset_val.vocab_size
238 | opt.detect_size = dataset_val.detect_size
239 | opt.seq_length = opt.seq_length
240 | opt.fg_size = dataset_val.fg_size
241 | opt.fg_mask = torch.from_numpy(dataset_val.fg_mask).byte()
242 | opt.glove_fg = torch.from_numpy(dataset_val.glove_fg).float()
243 | opt.glove_clss = torch.from_numpy(dataset_val.glove_clss).float()
244 | opt.st2towidx = torch.from_numpy(dataset_val.st2towidx).long()
245 |
246 | opt.itow = dataset_val.itow
247 | opt.itod = dataset_val.itod
248 | opt.ltow = dataset_val.ltow
249 | opt.itoc = dataset_val.itoc
250 |
251 | # pdb.set_trace()  # leftover debugging breakpoint; uncomment to inspect opts before building the model
252 | if opt.att_model == 'topdown':
253 | model = AttModel.TopDownModel(opt)
254 | elif opt.att_model == 'att2in2':
255 | model = AttModel.Att2in2Model(opt)
256 |
257 | if opt.decode_noc:
258 | model._reinit_word_weight(opt, dataset_val.ctoi, dataset_val.wtoi)
259 |
260 | if args.start_from != None:
261 | # opt.learning_rate = saved_model_opt.learning_rate
262 | print('Loading the model %s...' %(model_path))
263 | model.load_state_dict(torch.load(model_path))
264 | if os.path.isfile(os.path.join(args.start_from, 'histories_'+opt.id+'.pkl')):
265 | with open(os.path.join(args.start_from, 'histories_'+opt.id+'.pkl'), 'rb') as f:
266 | histories = cPickle.load(f)
267 |
268 | if opt.cuda:
269 | model.cuda()
270 |
271 | predictions = demo(opt)
272 |
273 | print('saving...')
274 | json.dump(predictions, open('visu.json', 'w'))
275 |
276 |
--------------------------------------------------------------------------------
/demo/img1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/demo/img1.png
--------------------------------------------------------------------------------
/demo/img2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/demo/img2.png
--------------------------------------------------------------------------------
/generate_robust_split.py:
--------------------------------------------------------------------------------
1 | # import _init_paths
2 | import copy
3 | import json
4 | import operator
5 | from random import seed, shuffle
6 |
7 | import numpy as np
8 | from six.moves import xrange
9 |
10 | from pycocotools.coco import COCO
11 |
12 |
13 | def get_det_word(bbox_ann, captions, wtoi, wtod, dtoi, wtol, ngram=2):
14 |
15 | # get the present category.
16 | pcats = [box['label'] for box in bbox_ann]
17 |
18 | # get the original form of the caption.
19 | indicator = []
20 | stem_caption = []
21 | for s in captions:
22 | tmp = []
23 | for w in s:
24 | if w in wtol:
25 | tmp.append(wtol[w])
26 | else:
27 | tmp.append(w)
28 |
29 | stem_caption.append(tmp)
30 | indicator.append([(0, 0, 0)]*len(s)) # category class, binary class, fine-grain class.
31 |
32 | ngram_indicator = {i+1:copy.deepcopy(indicator) for i in range(ngram)}
33 | # get the n-grams (from `ngram` down to 1) of the caption.
34 | for n in range(ngram,0,-1):
35 | for i, s in enumerate(stem_caption):
36 | for j in xrange(len(s)-n+1):
37 | ng = ' '.join(s[j:j+n])
38 | # if the n-gram exist in word_to_detection dictionary.
39 | if ng in wtod and indicator[i][j][0] == 0 and wtod[ng] in pcats: # make sure a larger n-gram is not overwritten by a smaller one.
40 | bn = (ng != ' '.join(captions[i][j:j+n])) + 1
41 | fg = dtoi[ng]
42 | ngram_indicator[n][i][j] = (wtod[ng], bn, fg)
43 | indicator[i][j:j+n] = [(wtod[ng], bn, fg)] * n
44 |
45 | return ngram_indicator
46 |
47 | def get_stats(imgs, wtoi, wtod, dtoi, wtol, ctol, coco_det_train, coco_det_val):
48 |
49 | train_matrix = np.zeros((len(wtod),len(wtod)))
50 | test_matrix = np.zeros((len(wtod),len(wtod)))
51 | test_num = 0
52 | coco_stats = []
53 |
54 | for idx, img in enumerate(imgs):
55 |
56 | image_id = info['images'][idx]['id']
57 | file_path = info['images'][idx]['file_path'].split('/')[0]
58 |
59 | if file_path == 'train2014':
60 | coco = coco_det_train
61 | else:
62 | coco = coco_det_val
63 | bbox_ann_ids = coco.getAnnIds(imgIds=image_id)
64 | bbox_ann = [{'label': ctol[i['category_id']], 'bbox': i['bbox']} for i in coco.loadAnns(bbox_ann_ids)]
65 | captions = []
66 | for sent in img['sentences']:
67 | captions.append(sent['tokens'])
68 | det_indicator = get_det_word(bbox_ann, captions, wtoi, wtod, dtoi, wtol)
69 |
70 | present_clss = []
71 |
72 | for i, caption in enumerate(captions):
73 | for j in range(len(caption)):
74 | for n in range(2, 0, -1):
75 | if det_indicator[n][i][j][0] != 0 and det_indicator[n][i][j][0] not in present_clss:
76 | present_clss.append(det_indicator[n][i][j][0])
77 | coco_stats.append({'pclss':present_clss, 'image_id':image_id})
78 |
79 | return coco_stats
80 |
81 | imgs = json.load(open('data/robust_coco_creation/dataset_coco.json', 'r'))
82 |
83 | det_train_path = 'data/robust_coco_creation/annotations/instances_train2014.json'
84 | det_val_path = 'data/robust_coco_creation/annotations/instances_val2014.json'
85 |
86 | coco_det_train = COCO(det_train_path)
87 | coco_det_val = COCO(det_val_path)
88 |
89 | info = json.load(open('data/robust_coco_creation/dic_coco.json', 'r'))
90 | itow = info['ix_to_word']
91 | wtoi = {w:i for i,w in itow.items()}
92 | wtod = {w:i+1 for w,i in info['wtod'].items()} # word to detection
93 | dtoi = {w:i+1 for i,w in enumerate(wtod.keys())} # detection to index
94 | wtol = info['wtol']
95 | ctol = {c:i+1 for i, c in enumerate(coco_det_train.cats.keys())}
96 | imgs = imgs['images']
97 | coco_stats = get_stats(imgs, wtoi, wtod, dtoi, wtol, ctol, coco_det_train, coco_det_val)
98 | class_total = np.zeros(80)
99 | # get the sum for each category.
100 | for img in coco_stats:
101 | img['pclss'] = [i-1 for i in img['pclss']]
102 | for idx in img['pclss']:
103 | class_total[idx] += 1
104 |
105 | json.dump(coco_stats, open('coco_obj_stats.json', 'w'))
106 | pair_list = {}
107 | for img in coco_stats:
108 | for i in range(len(img['pclss'])):
109 | for j in range(len(img['pclss'])):
110 | if i != j:
111 | idx_i = img['pclss'][i]
112 | idx_j = img['pclss'][j]
113 | if idx_i < idx_j:
114 | idx_ij = (idx_i, idx_j)
115 | else:
116 | idx_ij = (idx_j, idx_i)
117 | if idx_ij not in pair_list:
118 | pair_list[idx_ij] = 0
119 | else:
120 | pair_list[idx_ij] += 1
121 |
122 | pair_list_sort = sorted(pair_list.items(), key=operator.itemgetter(1))
123 |
124 | pair_list = []
125 | for pair in pair_list_sort:
126 | pair_list.append([pair[0][0], pair[0][1], pair[1]])
127 |
128 | # for each pair (rarest first), go through all the images; greedily add pairs to the test split,
# skipping any pair that would push a category past half of its images, until the test set covers more than 15000 images.
129 | testing_total = np.zeros(80)
130 | test_pair = []
131 | count = 0
132 | test_img_num = 0
133 | for pair in pair_list:
134 | tmp_num = 0
135 | testing_total_copy = copy.deepcopy(testing_total)
136 | for img in coco_stats:
137 | if pair[0] in img['pclss'] and pair[1] in img['pclss']:
138 | # also accumulate other class.
139 | for idx in img['pclss']:
140 | testing_total_copy[idx] += 1
141 | tmp_num += 1
142 |
143 | # if the testing data would exceed half of the total data for any class, don't count this pair.
144 | drop_flag = False
145 | for i in range(80):
146 | if testing_total_copy[i] > (class_total[i] / 2):
147 | drop_flag = True
148 | print("drop pair " + str(pair[0]) + '_' + str(pair[1]))
149 | break
150 |
151 | if drop_flag == False:
152 | test_pair.append(pair)
153 | testing_total = copy.deepcopy(testing_total_copy)
154 | test_img_num += tmp_num
155 |
156 | count += 1
157 | print(count, test_img_num)
158 | if test_img_num > 15000:
159 | break
160 |
161 | print('saving the test pair list....')
162 | json.dump(test_pair, open('test_pair_list.json', 'w'))
163 |
164 | test_pair_dic = {}
165 | for pair in test_pair:
166 | test_pair_dic[str(pair[0])+'_'+str(pair[1])] = 0
167 |
168 | train_img_id = []
169 | test_img_id = []
170 | for img in coco_stats:
171 | present_clss = img['pclss']
172 |
173 | # generate the pair.
174 | tmp = []
175 | for i in range(len(present_clss)):
176 | for j in range(len(present_clss)):
177 | if i != j:
178 | tmp.append(str(present_clss[i]) + '_' + str(present_clss[j]))
179 |
180 | test_flag = False
181 | for i in tmp:
182 | if i in test_pair_dic:
183 | test_flag = True
184 | if test_flag == True:
185 | test_img_id.append({'img_id': img['image_id']})
186 | else:
187 | train_img_id.append({'img_id': img['image_id']})
188 |
189 | seed(123) # make reproducible
190 | shuffle(test_img_id) # shuffle the order
191 |
192 | num_val = int(0.3 * len(test_img_id))
193 |
194 | train_id = train_img_id
195 | val_id = test_img_id[:num_val]
196 | test_id = test_img_id[num_val:]
197 |
198 | print("train, val, test", len(train_id), len(val_id), len(test_id))
199 | robust_split = {'train_id':train_id, 'val_id':val_id, 'test_id':test_id}
200 | json.dump(robust_split, open('split_robust_coco.json', 'w'))
201 |
--------------------------------------------------------------------------------
/misc/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/misc/__init__.py
--------------------------------------------------------------------------------
/misc/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/misc/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/misc/__pycache__/eval_utils.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/misc/__pycache__/eval_utils.cpython-36.pyc
--------------------------------------------------------------------------------
/misc/bbox_transform.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Fast R-CNN
3 | # Copyright (c) 2015 Microsoft
4 | # Licensed under The MIT License [see LICENSE for details]
5 | # Written by Ross Girshick
6 | # --------------------------------------------------------
7 | # --------------------------------------------------------
8 | # Reorganized and modified by Jianwei Yang and Jiasen Lu
9 | # --------------------------------------------------------
10 |
11 | import torch
12 | import numpy as np
13 | import pdb
14 |
15 | def bbox_transform(ex_rois, gt_rois):
16 | ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + 1.0
17 | ex_heights = ex_rois[:, 3] - ex_rois[:, 1] + 1.0
18 | ex_ctr_x = ex_rois[:, 0] + 0.5 * ex_widths
19 | ex_ctr_y = ex_rois[:, 1] + 0.5 * ex_heights
20 |
21 | gt_widths = gt_rois[:, 2] - gt_rois[:, 0] + 1.0
22 | gt_heights = gt_rois[:, 3] - gt_rois[:, 1] + 1.0
23 | gt_ctr_x = gt_rois[:, 0] + 0.5 * gt_widths
24 | gt_ctr_y = gt_rois[:, 1] + 0.5 * gt_heights
25 |
26 | targets_dx = (gt_ctr_x - ex_ctr_x) / ex_widths
27 | targets_dy = (gt_ctr_y - ex_ctr_y) / ex_heights
28 | targets_dw = torch.log(gt_widths / ex_widths)
29 | targets_dh = torch.log(gt_heights / ex_heights)
30 |
31 | targets = torch.stack(
32 | (targets_dx, targets_dy, targets_dw, targets_dh),1)
33 |
34 | return targets
35 |
36 | def bbox_transform_batch(ex_rois, gt_rois):
37 |
38 | if ex_rois.dim() == 2:
39 | ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + 1.0
40 | ex_heights = ex_rois[:, 3] - ex_rois[:, 1] + 1.0
41 | ex_ctr_x = ex_rois[:, 0] + 0.5 * ex_widths
42 | ex_ctr_y = ex_rois[:, 1] + 0.5 * ex_heights
43 |
44 | gt_widths = gt_rois[:, :, 2] - gt_rois[:, :, 0] + 1.0
45 | gt_heights = gt_rois[:, :, 3] - gt_rois[:, :, 1] + 1.0
46 | gt_ctr_x = gt_rois[:, :, 0] + 0.5 * gt_widths
47 | gt_ctr_y = gt_rois[:, :, 1] + 0.5 * gt_heights
48 |
49 | targets_dx = (gt_ctr_x - ex_ctr_x.view(1,-1).expand_as(gt_ctr_x)) / ex_widths
50 | targets_dy = (gt_ctr_y - ex_ctr_y.view(1,-1).expand_as(gt_ctr_y)) / ex_heights
51 | targets_dw = torch.log(gt_widths / ex_widths.view(1,-1).expand_as(gt_widths))
52 | targets_dh = torch.log(gt_heights / ex_heights.view(1,-1).expand_as(gt_heights))
53 |
54 | elif ex_rois.dim() == 3:
55 | ex_widths = ex_rois[:, :, 2] - ex_rois[:, :, 0] + 1.0
56 | ex_heights = ex_rois[:,:, 3] - ex_rois[:,:, 1] + 1.0
57 | ex_ctr_x = ex_rois[:, :, 0] + 0.5 * ex_widths
58 | ex_ctr_y = ex_rois[:, :, 1] + 0.5 * ex_heights
59 |
60 | gt_widths = gt_rois[:, :, 2] - gt_rois[:, :, 0] + 1.0
61 | gt_heights = gt_rois[:, :, 3] - gt_rois[:, :, 1] + 1.0
62 | gt_ctr_x = gt_rois[:, :, 0] + 0.5 * gt_widths
63 | gt_ctr_y = gt_rois[:, :, 1] + 0.5 * gt_heights
64 |
65 | targets_dx = (gt_ctr_x - ex_ctr_x) / ex_widths
66 | targets_dy = (gt_ctr_y - ex_ctr_y) / ex_heights
67 | targets_dw = torch.log(gt_widths / ex_widths)
68 | targets_dh = torch.log(gt_heights / ex_heights)
69 | else:
70 | raise ValueError('ex_roi input dimension is not correct.')
71 |
72 | targets = torch.stack(
73 | (targets_dx, targets_dy, targets_dw, targets_dh),2)
74 |
75 | return targets
76 |
77 | def bbox_transform_inv(boxes, deltas, batch_size):
78 | widths = boxes[:, :, 2] - boxes[:, :, 0] + 1.0
79 | heights = boxes[:, :, 3] - boxes[:, :, 1] + 1.0
80 | ctr_x = boxes[:, :, 0] + 0.5 * widths
81 | ctr_y = boxes[:, :, 1] + 0.5 * heights
82 |
83 | dx = deltas[:, :, 0::4]
84 | dy = deltas[:, :, 1::4]
85 | dw = deltas[:, :, 2::4]
86 | dh = deltas[:, :, 3::4]
87 |
88 | pred_ctr_x = dx * widths.unsqueeze(2) + ctr_x.unsqueeze(2)
89 | pred_ctr_y = dy * heights.unsqueeze(2) + ctr_y.unsqueeze(2)
90 | pred_w = np.exp(dw) * widths.unsqueeze(2)
91 | pred_h = np.exp(dh) * heights.unsqueeze(2)
92 |
93 | pred_boxes = deltas.clone()
94 | # x1
95 | pred_boxes[:, :, 0::4] = pred_ctr_x - 0.5 * pred_w
96 | # y1
97 | pred_boxes[:, :, 1::4] = pred_ctr_y - 0.5 * pred_h
98 | # x2
99 | pred_boxes[:, :, 2::4] = pred_ctr_x + 0.5 * pred_w
100 | # y2
101 | pred_boxes[:, :, 3::4] = pred_ctr_y + 0.5 * pred_h
102 |
103 | return pred_boxes
104 |
105 | def clip_boxes_batch(boxes, im_shape, batch_size):
106 | """
107 | Clip boxes to image boundaries.
108 | """
109 | num_rois = boxes.size(1)
110 |
111 | boxes[boxes < 0] = 0
112 | # batch_x = (im_shape[:,0]-1).view(batch_size, 1).expand(batch_size, num_rois)
113 | # batch_y = (im_shape[:,1]-1).view(batch_size, 1).expand(batch_size, num_rois)
114 |
115 | batch_x = im_shape[:, 1] - 1
116 | batch_y = im_shape[:, 0] - 1
117 |
118 | boxes[:,:,0][boxes[:,:,0] > batch_x] = batch_x
119 | boxes[:,:,1][boxes[:,:,1] > batch_y] = batch_y
120 | boxes[:,:,2][boxes[:,:,2] > batch_x] = batch_x
121 | boxes[:,:,3][boxes[:,:,3] > batch_y] = batch_y
122 |
123 | return boxes
124 |
125 | def clip_boxes(boxes, im_shape, batch_size):
126 |
127 | for i in range(batch_size):
128 | boxes[i,:,0::4].clamp_(0, im_shape[i, 1]-1)
129 | boxes[i,:,1::4].clamp_(0, im_shape[i, 0]-1)
130 | boxes[i,:,2::4].clamp_(0, im_shape[i, 1]-1)
131 | boxes[i,:,3::4].clamp_(0, im_shape[i, 0]-1)
132 |
133 | return boxes
134 |
135 |
136 | def bbox_overlaps(anchors, gt_boxes):
137 | """
138 | anchors: (N, 4) ndarray of float
139 | gt_boxes: (K, 4) ndarray of float
140 |
141 | overlaps: (N, K) ndarray of overlap between boxes and query_boxes
142 | """
143 | N = anchors.size(0)
144 | K = gt_boxes.size(0)
145 |
146 | gt_boxes_area = ((gt_boxes[:,2] - gt_boxes[:,0] + 1) *
147 | (gt_boxes[:,3] - gt_boxes[:,1] + 1)).view(1, K)
148 |
149 | anchors_area = ((anchors[:,2] - anchors[:,0] + 1) *
150 | (anchors[:,3] - anchors[:,1] + 1)).view(N, 1)
151 |
152 | boxes = anchors.view(N, 1, 4).expand(N, K, 4)
153 | query_boxes = gt_boxes.view(1, K, 4).expand(N, K, 4)
154 |
155 | iw = (torch.min(boxes[:,:,2], query_boxes[:,:,2]) -
156 | torch.max(boxes[:,:,0], query_boxes[:,:,0]) + 1)
157 | iw[iw < 0] = 0
158 |
159 | ih = (torch.min(boxes[:,:,3], query_boxes[:,:,3]) -
160 | torch.max(boxes[:,:,1], query_boxes[:,:,1]) + 1)
161 | ih[ih < 0] = 0
162 |
163 | ua = anchors_area + gt_boxes_area - (iw * ih)
164 | overlaps = iw * ih / ua
165 |
166 | return overlaps
167 |
168 | def bbox_overlaps_batch(anchors, gt_boxes):
169 | """
170 | anchors: (N, 4) ndarray of float
171 | gt_boxes: (b, K, 5) ndarray of float
172 |
173 | overlaps: (N, K) ndarray of overlap between boxes and query_boxes
174 | """
175 | batch_size = gt_boxes.size(0)
176 |
177 |
178 | if anchors.dim() == 2:
179 |
180 | N = anchors.size(0)
181 | K = gt_boxes.size(1)
182 |
183 | anchors = anchors.view(1, N, 4).expand(batch_size, N, 4).contiguous()
184 | gt_boxes = gt_boxes[:,:,:4].contiguous()
185 |
186 |
187 | gt_boxes_x = (gt_boxes[:,:,2] - gt_boxes[:,:,0] + 1)
188 | gt_boxes_y = (gt_boxes[:,:,3] - gt_boxes[:,:,1] + 1)
189 | gt_boxes_area = (gt_boxes_x * gt_boxes_y).view(batch_size, 1, K)
190 |
191 | anchors_boxes_x = (anchors[:,:,2] - anchors[:,:,0] + 1)
192 | anchors_boxes_y = (anchors[:,:,3] - anchors[:,:,1] + 1)
193 | anchors_area = (anchors_boxes_x * anchors_boxes_y).view(batch_size, N, 1)
194 |
195 | gt_area_zero = (gt_boxes_x == 1) & (gt_boxes_y == 1)
196 | anchors_area_zero = (anchors_boxes_x == 1) & (anchors_boxes_y == 1)
197 |
198 | boxes = anchors.view(batch_size, N, 1, 4).expand(batch_size, N, K, 4)
199 | query_boxes = gt_boxes.view(batch_size, 1, K, 4).expand(batch_size, N, K, 4)
200 |
201 | iw = (torch.min(boxes[:,:,:,2], query_boxes[:,:,:,2]) -
202 | torch.max(boxes[:,:,:,0], query_boxes[:,:,:,0]) + 1)
203 | iw[iw < 0] = 0
204 |
205 | ih = (torch.min(boxes[:,:,:,3], query_boxes[:,:,:,3]) -
206 | torch.max(boxes[:,:,:,1], query_boxes[:,:,:,1]) + 1)
207 | ih[ih < 0] = 0
208 | ua = anchors_area + gt_boxes_area - (iw * ih)
209 | overlaps = iw * ih / ua
210 |
211 | # mask the overlap here.
212 | overlaps.masked_fill_(gt_area_zero.view(batch_size, 1, K).expand(batch_size, N, K), 0)
213 | overlaps.masked_fill_(anchors_area_zero.view(batch_size, N, 1).expand(batch_size, N, K), -1)
214 |
215 | elif anchors.dim() == 3:
216 | N = anchors.size(1)
217 | K = gt_boxes.size(1)
218 |
219 | if anchors.size(2) == 4:
220 | anchors = anchors[:,:,:4].contiguous()
221 | else:
222 | anchors = anchors[:,:,1:5].contiguous()
223 |
224 | gt_boxes = gt_boxes[:,:,:4].contiguous()
225 |
226 | gt_boxes_x = (gt_boxes[:,:,2] - gt_boxes[:,:,0] + 1)
227 | gt_boxes_y = (gt_boxes[:,:,3] - gt_boxes[:,:,1] + 1)
228 | gt_boxes_area = (gt_boxes_x * gt_boxes_y).view(batch_size, 1, K)
229 |
230 | anchors_boxes_x = (anchors[:,:,2] - anchors[:,:,0] + 1)
231 | anchors_boxes_y = (anchors[:,:,3] - anchors[:,:,1] + 1)
232 | anchors_area = (anchors_boxes_x * anchors_boxes_y).view(batch_size, N, 1)
233 |
234 | gt_area_zero = (gt_boxes_x == 1) & (gt_boxes_y == 1)
235 | anchors_area_zero = (anchors_boxes_x == 1) & (anchors_boxes_y == 1)
236 |
237 | boxes = anchors.view(batch_size, N, 1, 4).expand(batch_size, N, K, 4)
238 | query_boxes = gt_boxes.view(batch_size, 1, K, 4).expand(batch_size, N, K, 4)
239 |
240 | iw = (torch.min(boxes[:,:,:,2], query_boxes[:,:,:,2]) -
241 | torch.max(boxes[:,:,:,0], query_boxes[:,:,:,0]) + 1)
242 | iw[iw < 0] = 0
243 |
244 | ih = (torch.min(boxes[:,:,:,3], query_boxes[:,:,:,3]) -
245 | torch.max(boxes[:,:,:,1], query_boxes[:,:,:,1]) + 1)
246 | ih[ih < 0] = 0
247 | ua = anchors_area + gt_boxes_area - (iw * ih)
248 |
249 | overlaps = iw * ih / ua
250 |
251 | # mask the overlap here.
252 | overlaps.masked_fill_(gt_area_zero.view(batch_size, 1, K).expand(batch_size, N, K), 0)
253 | overlaps.masked_fill_(anchors_area_zero.view(batch_size, N, 1).expand(batch_size, N, K), -1)
254 | else:
255 | raise ValueError('anchors input dimension is not correct.')
256 |
257 | return overlaps
258 |
259 |
--------------------------------------------------------------------------------
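A small usage sketch of the pairwise-IoU helper above, with toy boxes in [x1, y1, x2, y2] form; the +1 width/height convention used in the code is what makes the numbers below come out:

import torch
from misc.bbox_transform import bbox_overlaps

anchors  = torch.FloatTensor([[0, 0, 9, 9], [5, 5, 14, 14]])   # (N=2, 4)
gt_boxes = torch.FloatTensor([[0, 0, 9, 9]])                   # (K=1, 4)

iou = bbox_overlaps(anchors, gt_boxes)   # (N, K) = (2, 1)
# Row 0 matches exactly (IoU = 1.0). Row 1 overlaps on a 5x5 patch:
# inter = 25, union = 100 + 100 - 25 = 175, so IoU = 25 / 175 ≈ 0.143.
print(iou)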
/misc/dataloader_hdf.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import h5py
4 | from torch.utils.data import Dataset
5 |
6 |
7 | class HDFShardDataset(Dataset):
8 | def __init__(self, shard_dir, shard_names=None, primary_key=None, stride=1):
9 | super().__init__()
10 | self.shard_dir = shard_dir
11 | self.shard_names = shard_names
12 | if not shard_names:
13 | self.shard_names = sorted(os.listdir(shard_dir))
14 | self.primary_key = self.__primary_key(primary_key)
15 | self.stride = stride
16 |
17 | # length is expressed as per items, not rows (#items * stride = #rows)
18 | self.shard_len, self.dataset_len = self.__shard_len_dataset_len()
19 |
20 | def __len__(self):
21 | return self.dataset_len
22 |
23 | def __getitem__(self, idx):
24 | shard_num = idx // self.shard_len
25 | idx -= shard_num * self.shard_len
26 | nth_shard = h5py.File(os.path.join(self.shard_dir, self.shard_names[shard_num]), 'r')
27 | keys = list(nth_shard.keys())
28 | item = {}
29 | for key in keys:
30 | item[key] = nth_shard[key][idx * self.stride : (idx + 1) * self.stride]
31 | nth_shard.close()
32 | return item
33 |
34 | def __primary_key(self, primary_key):
35 | first_shard = h5py.File(os.path.join(self.shard_dir, self.shard_names[0]), 'r')
36 | if not primary_key:
37 | primary_key = list(first_shard.keys())[0]
38 | first_shard.close()
39 | return primary_key
40 |
41 | def __shard_len_dataset_len(self):
42 | # check number of items per shard by opening one shard
43 | # check remainder number of items in last shard
44 | first_shard = h5py.File(os.path.join(self.shard_dir, self.shard_names[0]), 'r')
45 | last_shard = h5py.File(os.path.join(self.shard_dir, self.shard_names[-1]), 'r')
46 | rows_per_shard = len(first_shard[self.primary_key])
47 | rows_per_last_shard = len(last_shard[self.primary_key])
48 |
49 | dataset_len = rows_per_shard * (len(self.shard_names) - 1) // self.stride
50 | dataset_len += rows_per_last_shard // self.stride
51 | shard_len = rows_per_shard // self.stride
52 | first_shard.close()
53 | last_shard.close()
54 | return shard_len, dataset_len
55 |
56 |
57 | class HDFSingleDataset(HDFShardDataset):
58 | def __init__(self, hdf_path, primary_key=None, stride=1):
59 | super().__init__(
60 | os.path.dirname(hdf_path),
61 | shard_names=[os.path.basename(hdf_path)],
62 | primary_key=primary_key,
63 | stride=stride,
64 | )
65 |
--------------------------------------------------------------------------------
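A minimal usage sketch of HDFSingleDataset; the .h5 path below is only an example (it matches the proposal_h5 default in opts.py), and each returned item is a dict keyed by the HDF5 datasets in the shard:

from torch.utils.data import DataLoader
from misc.dataloader_hdf import HDFSingleDataset

dataset = HDFSingleDataset('data/coco/coco_detection.h5')   # single shard, stride=1
print(len(dataset))                                         # rows of the primary key

loader = DataLoader(dataset, batch_size=4, shuffle=False)
batch = next(iter(loader))
print({k: v.shape for k, v in batch.items()})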
/misc/eval_utils.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 |
5 | import torch
6 | import torch.nn as nn
7 | from torch.autograd import Variable
8 |
9 | import numpy as np
10 | import json
11 | from json import encoder
12 | import random
13 | import string
14 | import time
15 | import os
16 | import sys
17 | import misc.utils as utils
18 |
19 | def language_eval(dataset, preds, model_id, split):
20 | import sys
21 | sys.path.append("coco-caption")
22 | annFile = 'coco-caption/annotations/captions_val2014.json'
23 | from pycocotools.coco import COCO
24 | from pycocoevalcap.eval import COCOEvalCap
25 |
26 | encoder.FLOAT_REPR = lambda o: format(o, '.3f')
27 |
28 | if not os.path.isdir('eval_results'):
29 | os.mkdir('eval_results')
30 | cache_path = os.path.join('eval_results/', model_id + '_' + split + '.json')
31 |
32 | coco = COCO(annFile)
33 | valids = coco.getImgIds()
34 |
35 | # filter results to only those in MSCOCO validation set (will be about a third)
36 | preds_filt = [p for p in preds if p['image_id'] in valids]
37 | print('using %d/%d predictions' % (len(preds_filt), len(preds)))
38 | json.dump(preds_filt, open(cache_path, 'w')) # serialize to temporary json file. Sigh, COCO API...
39 |
40 | cocoRes = coco.loadRes(cache_path)
41 | cocoEval = COCOEvalCap(coco, cocoRes)
42 | cocoEval.params['image_id'] = cocoRes.getImgIds()
43 | cocoEval.evaluate()
44 |
45 | # create output dictionary
46 | out = {}
47 | for metric, score in cocoEval.eval.items():
48 | out[metric] = score
49 |
50 | imgToEval = cocoEval.imgToEval
51 | for p in preds_filt:
52 | image_id, caption = p['image_id'], p['caption']
53 | imgToEval[image_id]['caption'] = caption
54 | with open(cache_path, 'w') as outfile:
55 | json.dump({'overall': out, 'imgToEval': imgToEval}, outfile)
56 |
57 | return out
58 |
59 | def eval_split(model, crit, loader, eval_kwargs={}):
60 | verbose = eval_kwargs.get('verbose', True)
61 | num_images = eval_kwargs.get('num_images', eval_kwargs.get('val_images_use', -1))
62 | split = eval_kwargs.get('split', 'val')
63 | lang_eval = eval_kwargs.get('language_eval', 0)
64 | dataset = eval_kwargs.get('dataset', 'coco')
65 | beam_size = eval_kwargs.get('beam_size', 1)
66 |
67 | # Make sure in the evaluation mode
68 | model.eval()
69 |
70 | loader.reset_iterator(split)
71 |
72 | n = 0
73 | loss = 0
74 | loss_sum = 0
75 | loss_evals = 1e-8
76 | predictions = []
77 | while True:
78 | data = loader.get_batch(split)
79 | n = n + loader.batch_size
80 |
81 | if data.get('labels', None) is not None:
82 | # forward the model to get loss
83 | tmp = [data['fc_feats'], data['att_feats'], data['labels'], data['masks']]
84 | tmp = [Variable(torch.from_numpy(_), volatile=True).cuda() for _ in tmp]
85 | fc_feats, att_feats, labels, masks = tmp
86 |
87 | loss = crit(model(fc_feats, att_feats, labels), labels[:,1:], masks[:,1:]).data[0]
88 | loss_sum = loss_sum + loss
89 | loss_evals = loss_evals + 1
90 |
91 | # forward the model to also get generated samples for each image
92 | # Only leave one feature for each image, in case duplicate sample
93 | tmp = [data['fc_feats'][np.arange(loader.batch_size) * loader.seq_per_img],
94 | data['att_feats'][np.arange(loader.batch_size) * loader.seq_per_img]]
95 | tmp = [Variable(torch.from_numpy(_), volatile=True).cuda() for _ in tmp]
96 | fc_feats, att_feats = tmp
97 | # forward the model to also get generated samples for each image
98 | seq, _ = model.sample(fc_feats, att_feats, eval_kwargs)
99 |
100 | #set_trace()
101 | sents = utils.decode_sequence(loader.get_vocab(), seq)
102 |
103 | for k, sent in enumerate(sents):
104 | entry = {'image_id': data['infos'][k]['id'], 'caption': sent}
105 | if eval_kwargs.get('dump_path', 0) == 1:
106 | entry['file_name'] = data['infos'][k]['file_path']
107 | predictions.append(entry)
108 | if eval_kwargs.get('dump_images', 0) == 1:
109 | # dump the raw image to vis/ folder
110 | cmd = 'cp "' + os.path.join(eval_kwargs['image_root'], data['infos'][k]['file_path']) + '" vis/imgs/img' + str(len(predictions)) + '.jpg' # bit gross
111 | print(cmd)
112 | os.system(cmd)
113 |
114 | if verbose:
115 | print('image %s: %s' %(entry['image_id'], entry['caption']))
116 |
117 | # if we wrapped around the split or used up val imgs budget then bail
118 | ix0 = data['bounds']['it_pos_now']
119 | ix1 = data['bounds']['it_max']
120 | if num_images != -1:
121 | ix1 = min(ix1, num_images)
122 | for i in range(n - ix1):
123 | predictions.pop()
124 |
125 | if verbose:
126 |             print('evaluating validation performance... %d/%d (%f)' %(ix0 - 1, ix1, loss))
127 |
128 | if data['bounds']['wrapped']:
129 | break
130 | if num_images >= 0 and n >= num_images:
131 | break
132 |
133 | lang_stats = None
134 | if lang_eval == 1:
135 | lang_stats = language_eval(dataset, predictions, eval_kwargs['id'], split)
136 |
137 | # Switch back to training mode
138 | model.train()
139 | return loss_sum/loss_evals, predictions, lang_stats
140 |
--------------------------------------------------------------------------------
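A sketch of the eval_kwargs dictionary that eval_split reads. The keys below are the main ones consulted by the code above (dump_path / dump_images / image_root are also read when dumping raw images), and the values shown are illustrative rather than required:

eval_kwargs = {
    'split': 'val',            # which loader split to iterate
    'val_images_use': 5000,    # image budget; -1 means the whole split
    'language_eval': 1,        # 1 = also run COCO caption metrics via language_eval()
    'beam_size': 1,
    'dataset': 'coco',
    'id': 'my_run',            # names the eval_results/<id>_<split>.json cache file
    'verbose': True,
}
# loss, predictions, lang_stats = eval_split(model, crit, loader, eval_kwargs)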
/misc/resnet.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 |
5 | import torch
6 | import torch.nn as nn
7 | import torch.nn.functional as F
8 | import torchvision.models as models
9 | import math
10 | import pdb
11 | import torch.utils.model_zoo as model_zoo
12 |
13 |
14 | __all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101',
15 | 'resnet152']
16 |
17 |
18 | model_urls = {
19 | 'resnet18': 'https://s3.amazonaws.com/pytorch/models/resnet18-5c106cde.pth',
20 | 'resnet34': 'https://s3.amazonaws.com/pytorch/models/resnet34-333f7ec4.pth',
21 | 'resnet50': 'https://s3.amazonaws.com/pytorch/models/resnet50-19c8e357.pth',
22 | 'resnet101': 'https://s3.amazonaws.com/pytorch/models/resnet101-5d3b4d8f.pth',
23 | 'resnet152': 'https://s3.amazonaws.com/pytorch/models/resnet152-b121ed2d.pth',
24 | }
25 |
26 | def conv3x3(in_planes, out_planes, stride=1):
27 | "3x3 convolution with padding"
28 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
29 | padding=1, bias=False)
30 |
31 |
32 | class BasicBlock(nn.Module):
33 | expansion = 1
34 |
35 | def __init__(self, inplanes, planes, stride=1, downsample=None):
36 | super(BasicBlock, self).__init__()
37 | self.conv1 = conv3x3(inplanes, planes, stride)
38 | self.bn1 = nn.BatchNorm2d(planes)
39 | self.relu = nn.ReLU(inplace=True)
40 | self.conv2 = conv3x3(planes, planes)
41 | self.bn2 = nn.BatchNorm2d(planes)
42 | self.downsample = downsample
43 | self.stride = stride
44 |
45 | def forward(self, x):
46 | residual = x
47 |
48 | out = self.conv1(x)
49 | out = self.bn1(out)
50 | out = self.relu(out)
51 |
52 | out = self.conv2(out)
53 | out = self.bn2(out)
54 |
55 | if self.downsample is not None:
56 | residual = self.downsample(x)
57 |
58 | out += residual
59 | out = self.relu(out)
60 |
61 | return out
62 |
63 |
64 | class Bottleneck(nn.Module):
65 | expansion = 4
66 |
67 | def __init__(self, inplanes, planes, stride=1, downsample=None):
68 | super(Bottleneck, self).__init__()
69 | self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, stride=stride, bias=False) # change
70 | self.bn1 = nn.BatchNorm2d(planes)
71 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, # change
72 | padding=1, bias=False)
73 | self.bn2 = nn.BatchNorm2d(planes)
74 | self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
75 | self.bn3 = nn.BatchNorm2d(planes * 4)
76 | self.relu = nn.ReLU(inplace=True)
77 | self.downsample = downsample
78 | self.stride = stride
79 |
80 | def forward(self, x):
81 | residual = x
82 |
83 | out = self.conv1(x)
84 | out = self.bn1(out)
85 | out = self.relu(out)
86 |
87 | out = self.conv2(out)
88 | out = self.bn2(out)
89 | out = self.relu(out)
90 |
91 | out = self.conv3(out)
92 | out = self.bn3(out)
93 |
94 | if self.downsample is not None:
95 | residual = self.downsample(x)
96 |
97 | out += residual
98 | out = self.relu(out)
99 |
100 | return out
101 |
102 |
103 | class ResNet(nn.Module):
104 | def __init__(self, block, layers, num_classes=1000):
105 | self.inplanes = 64
106 | super(ResNet, self).__init__()
107 | self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
108 | bias=False)
109 | self.bn1 = nn.BatchNorm2d(64)
110 | self.relu = nn.ReLU(inplace=True)
111 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=0, ceil_mode=True) # change
112 | self.layer1 = self._make_layer(block, 64, layers[0])
113 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
114 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
115 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
116 | self.avgpool = nn.AvgPool2d(7)
117 | self.fc = nn.Linear(512 * block.expansion, num_classes)
118 |
119 | for m in self.modules():
120 | if isinstance(m, nn.Conv2d):
121 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
122 | m.weight.data.normal_(0, math.sqrt(2. / n))
123 | elif isinstance(m, nn.BatchNorm2d):
124 | m.weight.data.fill_(1)
125 | m.bias.data.zero_()
126 |
127 | def _make_layer(self, block, planes, blocks, stride=1):
128 | downsample = None
129 | if stride != 1 or self.inplanes != planes * block.expansion:
130 | downsample = nn.Sequential(
131 | nn.Conv2d(self.inplanes, planes * block.expansion,
132 | kernel_size=1, stride=stride, bias=False),
133 | nn.BatchNorm2d(planes * block.expansion),
134 | )
135 |
136 | layers = []
137 | layers.append(block(self.inplanes, planes, stride, downsample))
138 | self.inplanes = planes * block.expansion
139 | for i in range(1, blocks):
140 | layers.append(block(self.inplanes, planes))
141 |
142 | return nn.Sequential(*layers)
143 |
144 | def forward(self, x):
145 | x = self.conv1(x)
146 | x = self.bn1(x)
147 | x = self.relu(x)
148 | x = self.maxpool(x)
149 |
150 | x = self.layer1(x)
151 | x = self.layer2(x)
152 | x = self.layer3(x)
153 | x = self.layer4(x)
154 |
155 | x = self.avgpool(x)
156 | x = x.view(x.size(0), -1)
157 | x = self.fc(x)
158 |
159 | return x
160 |
161 |
162 | def resnet18(pretrained=False):
163 | """Constructs a ResNet-18 model.
164 | Args:
165 | pretrained (bool): If True, returns a model pre-trained on ImageNet
166 | """
167 | model = ResNet(BasicBlock, [2, 2, 2, 2])
168 | if pretrained:
169 | model.load_state_dict(model_zoo.load_url(model_urls['resnet18']))
170 | return model
171 |
172 |
173 | def resnet34(pretrained=False):
174 | """Constructs a ResNet-34 model.
175 | Args:
176 | pretrained (bool): If True, returns a model pre-trained on ImageNet
177 | """
178 | model = ResNet(BasicBlock, [3, 4, 6, 3])
179 | if pretrained:
180 | model.load_state_dict(model_zoo.load_url(model_urls['resnet34']))
181 | return model
182 |
183 |
184 | def resnet50(pretrained=False):
185 | """Constructs a ResNet-50 model.
186 | Args:
187 | pretrained (bool): If True, returns a model pre-trained on ImageNet
188 | """
189 | model = ResNet(Bottleneck, [3, 4, 6, 3])
190 | if pretrained:
191 | model.load_state_dict(model_zoo.load_url(model_urls['resnet50']))
192 | return model
193 |
194 |
195 | def resnet101(pretrained=False):
196 | """Constructs a ResNet-101 model.
197 | Args:
198 | pretrained (bool): If True, returns a model pre-trained on ImageNet
199 | """
200 | model = ResNet(Bottleneck, [3, 4, 23, 3])
201 | if pretrained:
202 | model.load_state_dict(model_zoo.load_url(model_urls['resnet101']))
203 | return model
204 |
205 |
206 | def resnet152(pretrained=False):
207 | """Constructs a ResNet-152 model.
208 | Args:
209 | pretrained (bool): If True, returns a model pre-trained on ImageNet
210 | """
211 | model = ResNet(Bottleneck, [3, 8, 36, 3])
212 | if pretrained:
213 | model.load_state_dict(model_zoo.load_url(model_urls['resnet152']))
214 | return model
215 |
216 |
217 | class resnet(nn.Module):
218 | def __init__(self, opt, _num_layers=101, _fixed_block=1, pretrained=True):
219 | super(resnet, self).__init__()
220 | self._num_layers = _num_layers
221 | self._fixed_block = _fixed_block
222 | self.pretrained = pretrained
223 | self.model_path = '%s/imagenet_weights/resnet' %(opt.data_path) + str(_num_layers) + '.pth'
224 |
225 | if self._num_layers == 50:
226 | self.resnet = resnet50(pretrained=False)
227 |
228 | elif self._num_layers == 101:
229 | self.resnet = resnet101(pretrained=False)
230 |
231 | elif self._num_layers == 152:
232 | self.resnet = resnet152(pretrained=False)
233 | else:
234 | raise NotImplementedError
235 |
236 | if self.pretrained == True:
237 | print("Loading pretrained weights from %s" %(self.model_path))
238 | state_dict = torch.load(self.model_path)
239 | self.resnet.load_state_dict({k:v for k,v in state_dict.items() if k in self.resnet.state_dict()})
240 |
241 | # Fix blocks
242 | for p in self.resnet.bn1.parameters(): p.requires_grad=False
243 | for p in self.resnet.conv1.parameters(): p.requires_grad=False
244 | assert (0 <= _fixed_block <= 4)
245 | if _fixed_block >= 4:
246 | for p in self.resnet.layer4.parameters(): p.requires_grad=False
247 | if _fixed_block >= 3:
248 | for p in self.resnet.layer3.parameters(): p.requires_grad=False
249 | if _fixed_block >= 2:
250 | for p in self.resnet.layer2.parameters(): p.requires_grad=False
251 | if _fixed_block >= 1:
252 | for p in self.resnet.layer1.parameters(): p.requires_grad=False
253 |
254 | def set_bn_fix(m):
255 | classname = m.__class__.__name__
256 | if classname.find('BatchNorm') != -1:
257 | for p in m.parameters(): p.requires_grad=False
258 |
259 | self.resnet.apply(set_bn_fix)
260 |
261 | self.cnn_net = nn.Sequential(self.resnet.conv1, self.resnet.bn1,self.resnet.relu,
262 | self.resnet.maxpool,self.resnet.layer1,self.resnet.layer2,self.resnet.layer3, self.resnet.layer4)
263 |
264 | def forward(self, img):
265 | conv_feat = self.cnn_net(img)
266 | fc_feat = conv_feat.mean(3).mean(2)
267 |
268 | return conv_feat, fc_feat
269 |
270 | def train(self, mode=True):
271 | # Override train so that the training mode is set as we want
272 | nn.Module.train(self, mode)
273 | if mode:
274 | # Set fixed blocks to be in eval mode
275 | self.resnet.eval()
276 | if self._fixed_block <= 3:
277 | self.resnet.layer4.train()
278 | if self._fixed_block <= 2:
279 | self.resnet.layer3.train()
280 | if self._fixed_block <= 1:
281 | self.resnet.layer2.train()
282 | if self._fixed_block <= 0:
283 | self.resnet.layer1.train()
284 |
285 | def set_bn_eval(m):
286 | classname = m.__class__.__name__
287 | if classname.find('BatchNorm') != -1:
288 | m.eval()
289 |
290 | self.resnet.apply(set_bn_eval)
--------------------------------------------------------------------------------
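A minimal sketch of using the resnet wrapper above as a feature extractor. pretrained=False skips loading the ImageNet checkpoint so the shape check runs without weights on disk; for real training, pretrained=True expects opt.data_path/imagenet_weights/resnet101.pth as described in opts.py. Shapes assume the 512-pixel crop size from opts.py:

import torch
from opts import parse_opt
from misc.resnet import resnet

opt = parse_opt()
net = resnet(opt, _num_layers=101, _fixed_block=1, pretrained=False).eval()

img = torch.randn(1, 3, 512, 512)
conv_feat, fc_feat = net(img)
# conv_feat: (1, 2048, 16, 16) spatial grid (overall stride 32); fc_feat: (1, 2048) pooled vector.
print(conv_feat.shape, fc_feat.shape)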
/misc/rewards.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 |
5 | import numpy as np
6 | import time
7 | import misc.utils as utils
8 | from collections import OrderedDict
9 | import torch
10 | from torch.autograd import Variable
11 | import torch.nn as nn
12 | from torch.nn.parameter import Parameter
13 |
14 | import sys
15 | sys.path.append("tools/pycider")
16 | from pyciderevalcap.ciderD.ciderD import CiderD
17 | import pdb
18 |
19 | #CiderD_scorer = CiderD(df='corpus')
20 |
21 | def array_to_str(arr):
22 | out = ''
23 | for i in range(len(arr)):
24 | out += str(arr[i]) + ' '
25 | if arr[i] == 0:
26 | break
27 | return out.strip()
28 |
29 | class get_self_critical_reward(nn.Module):
30 | def __init__(self, opt):
31 | super(get_self_critical_reward, self).__init__()
32 | self.vocab_size = opt.vocab_size
33 | self.st2towidx = opt.st2towidx
34 | self.opt = opt
35 | # self.st2towidx.requires_grad=False
36 | self.CiderD_scorer = CiderD(df=opt.cached_tokens)
37 |
38 | def forward(self, gen_input, greedy_input, gt_gts, ncap):
39 |
40 | gen_txt_seq, gen_bn_seq, gen_vis_seq = gen_input
41 | greedy_txt_seq, greedy_bn_seq, greedy_vis_seq = greedy_input
42 |
43 | self.st2towidx = self.st2towidx.type_as(gen_txt_seq)
44 | batch_size = gen_txt_seq.size(0)
45 | seq_per_img = batch_size // gt_gts.size(0)
46 |
47 | gen_result = gen_txt_seq.new(gen_txt_seq.size()).zero_()
48 | greedy_result = greedy_txt_seq.new(greedy_txt_seq.size()).zero_()
49 |
50 | gen_mask = gen_txt_seq < self.vocab_size
51 | gen_vis_seq = gen_vis_seq.view(batch_size,-1)
52 | gen_bn_seq = gen_bn_seq.view(batch_size, -1)
53 |
54 | # compose the seq
55 | gen_result[gen_mask] = gen_txt_seq[gen_mask]
56 | gen_vis_idx = gen_vis_seq[gen_mask==0]*2 + gen_bn_seq[gen_mask==0] - 1
57 |
58 | gen_result[gen_mask==0] = self.st2towidx[gen_vis_idx]
59 |
60 | greedy_mask = greedy_txt_seq < self.vocab_size
61 | greedy_vis_seq = greedy_vis_seq.view(batch_size,-1)
62 | greedy_bn_seq = greedy_bn_seq.view(batch_size, -1)
63 |
64 | # compose the seq
65 | greedy_result[greedy_mask] = greedy_txt_seq[greedy_txt_seq < self.vocab_size]
66 | greedy_vis_idx = greedy_vis_seq[greedy_mask==0]*2 + greedy_bn_seq[greedy_mask==0] - 1
67 | greedy_result[greedy_mask==0] = self.st2towidx[greedy_vis_idx]
68 |
69 | res = OrderedDict()
70 | gen_result = gen_result.cpu().numpy()
71 | greedy_result = greedy_result.cpu().numpy()
72 |
73 | for i in range(batch_size):
74 | res[i] = [array_to_str(gen_result[i])]
75 | for i in range(batch_size):
76 | res[batch_size + i] = [array_to_str(greedy_result[i])]
77 |
78 | gts = OrderedDict()
79 | for i in range(batch_size):
80 | gts_np = gt_gts[i][:ncap.data[i]].data.cpu().numpy()
81 | gts[i] = [array_to_str(gts_np[j]) for j in range(len(gts_np))]
82 |
83 | # caption = utils.decode_normal(self.opt.itow, torch.from_numpy(gen_result))
84 | # pdb.set_trace()
85 | # print(caption[0])
86 |
87 | # utils.decode_normal(self.opt.itow, gt_gts.data.view(-1,20))
88 | #_, scores = Bleu(4).compute_score(gts, res)
89 | #scores = np.array(scores[3])
90 | res = [{'image_id':i, 'caption': res[i]} for i in range(2 * batch_size)]
91 | gts = {i: gts[i % batch_size // seq_per_img] for i in range(2 * batch_size)}
92 | _, scores = self.CiderD_scorer.compute_score(gts, res)
93 | # print(_)
94 |
95 | scores = scores[:batch_size] - scores[batch_size:]
96 | rewards = np.repeat(scores[:, np.newaxis], gen_result.shape[1], 1)
97 |
98 | return rewards, _
99 |
--------------------------------------------------------------------------------
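A small numeric sketch of the self-critical baseline computed at the end of forward() above: the greedy captions' CIDEr-D scores act as the baseline, and the sampled-minus-greedy difference is broadcast across every time step of the sampled sequence (the scores below are made up):

import numpy as np

batch_size, seq_len = 2, 5
scores = np.array([0.9, 0.4,    # CIDEr-D of the two sampled captions
                   0.7, 0.5])   # CIDEr-D of the corresponding greedy captions

advantage = scores[:batch_size] - scores[batch_size:]        # [ 0.2, -0.1]
rewards = np.repeat(advantage[:, np.newaxis], seq_len, 1)    # (2, 5): constant per caption
print(rewards)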
/misc/vgg16.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Tensorflow Faster R-CNN
3 | # Licensed under The MIT License [see LICENSE for details]
4 | # Written by Xinlei Chen
5 | # --------------------------------------------------------
6 | from __future__ import absolute_import
7 | from __future__ import division
8 | from __future__ import print_function
9 |
10 | import torch
11 | import torch.nn as nn
12 | import torch.nn.functional as F
13 | from torch.autograd import Variable
14 | import math
15 | import pdb
16 | import torchvision.models as models
17 |
18 | class vgg16(nn.Module):
19 | def __init__(self, opt, pretrained=True):
20 | super(vgg16, self).__init__()
21 |
22 | self.model_path = '%s/imagenet_weights/vgg16_caffe.pth' %(opt.data_path)
23 | self.pretrained = pretrained
24 |
25 | vgg = models.vgg16()
26 | vgg.classifier = nn.Sequential(*list(vgg.classifier._modules.values())[:-1])
27 | self.fc = vgg.classifier
28 | self.pooling = nn.AdaptiveAvgPool2d((7,7))
29 | if self.pretrained:
30 | print("Loading pretrained weights from %s" %(self.model_path))
31 | state_dict = torch.load(self.model_path)
32 | vgg.load_state_dict({k:v for k,v in state_dict.items() if k in vgg.state_dict()})
33 |
34 | # not using the last maxpool layer
35 | self.cnn_net = nn.Sequential(*list(vgg.features._modules.values())[:-1])
36 |
37 | def forward(self, img):
38 |
39 | conv_feat = self.cnn_net(img)
40 | pooled_conv_feat = self.pooling(conv_feat)
41 |
42 | pooled_conv_feat_flat = pooled_conv_feat.view(pooled_conv_feat.size(0), -1)
43 | fc_feat = self.fc(pooled_conv_feat_flat)
44 |
45 | return conv_feat, fc_feat
--------------------------------------------------------------------------------
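The companion VGG backbone can be exercised the same way; this sketch uses pretrained=False so it runs without vgg16_caffe.pth on disk (with weights, opt.data_path must point at imagenet_weights/ as above):

import torch
from opts import parse_opt
from misc.vgg16 import vgg16

opt = parse_opt()
net = vgg16(opt, pretrained=False).eval()

img = torch.randn(1, 3, 512, 512)
conv_feat, fc_feat = net(img)
# conv_feat: (1, 512, 32, 32) from the last conv block (final max-pool removed);
# fc_feat: (1, 4096) from the truncated classifier.
print(conv_feat.shape, fc_feat.shape)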
/opts.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | def parse_opt():
4 | parser = argparse.ArgumentParser()
5 | # # Data input settings
6 | parser.add_argument('--path_opt', type=str, default='cfgs/coco.yml',
7 | help='')
8 | parser.add_argument('--dataset', type=str, default='coco',
9 | help='')
10 | parser.add_argument('--input_json', type=str, default='data/coco/cap_coco.json',
11 | help='path to the json file containing additional info and vocab')
12 | parser.add_argument('--input_dic', type=str, default='data/coco/dic_coco.json',
13 | help='path to the json containing the preprocessed dataset')
14 | parser.add_argument('--image_path', type=str, default='/srv/share/datasets/coco/images',
15 | help='path to the h5file containing the image data')
16 | parser.add_argument('--proposal_h5', type=str, default='data/coco/coco_detection.h5',
17 | help='path to the json containing the detection result.')
18 | parser.add_argument('--cnn_backend', type=str, default='res101',
19 | help='res101 or vgg16')
20 | parser.add_argument('--data_path', type=str, default='',
21 | help='')
22 |
23 | parser.add_argument('--decode_noc', type=bool, default=True,
24 | help='decoding option: normal | noc')
25 | parser.add_argument('--att_model', type=str, default='topdown',
26 | help='different attention model, now supporting topdown | att2in2')
27 | parser.add_argument('--num_workers', dest='num_workers',
28 |                     help='number of workers to load data',
29 | default=10, type=int)
30 | parser.add_argument('--cuda', type=bool, default=True,
31 |                     help='whether to use CUDA')
32 | parser.add_argument('--mGPUs', type=bool, default=False,
33 |                     help='whether to use multiple GPUs')
34 | parser.add_argument('--cached_tokens', type=str, default='dataset/coco-train-idxs',
35 | help='Cached token file for calculating cider score during self critical training.')
36 |
37 | # Model settings
38 | parser.add_argument('--rnn_size', type=int, default=1024,
39 | help='size of the rnn in number of hidden nodes in each layer')
40 | parser.add_argument('--num_layers', type=int, default=1,
41 | help='number of layers in the RNN')
42 | parser.add_argument('--rnn_type', type=str, default='lstm',
43 | help='rnn, gru, or lstm')
44 | parser.add_argument('--input_encoding_size', type=int, default=512,
45 | help='the encoding size of each token in the vocabulary, and the image.')
46 | parser.add_argument('--att_hid_size', type=int, default=512,
47 | help='the hidden size of the attention MLP; only useful in show_attend_tell; 0 if not using hidden layer')
48 | parser.add_argument('--fc_feat_size', type=int, default=2048,
49 | help='2048 for resnet, 4096 for vgg')
50 | parser.add_argument('--att_feat_size', type=int, default=2048,
51 | help='2048 for resnet, 512 for vgg')
52 | parser.add_argument('--image_size', type=int, default=576,
53 | help='image random crop size')
54 | parser.add_argument('--image_crop_size', type=int, default=512,
55 | help='image random crop size')
56 |
57 | # Optimization: General
58 | parser.add_argument('--max_epochs', type=int, default=30,
59 | help='number of epochs')
60 | parser.add_argument('--batch_size', type=int, default=10,
61 | help='minibatch size')
62 | parser.add_argument('--grad_clip', type=float, default=0.1, #5.,
63 | help='clip gradients at this value')
64 | parser.add_argument('--drop_prob_lm', type=float, default=0.5,
65 | help='strength of dropout in the Language Model RNN')
66 | parser.add_argument('--self_critical', type=bool, default=False,
67 |                     help='whether to use self-critical training.')
68 | parser.add_argument('--seq_per_img', type=int, default=5,
69 | help='number of captions to sample for each image during training. Done for efficiency since CNN forward pass is expensive. E.g. coco has 5 sents/image')
70 | parser.add_argument('--seq_length', type=int, default=20, help='')
71 | parser.add_argument('--beam_size', type=int, default=1,
72 | help='used when sample_max = 1, indicates number of beams in beam search. Usually 2 or 3 works well. More is not better. Set this to 1 for faster runtime but a bit worse performance.')
73 |
74 | # Schedule Sampling.
75 | parser.add_argument('--scheduled_sampling_start', type=int, default=-1,
76 | help='at what iteration to start decay gt probability')
77 | parser.add_argument('--scheduled_sampling_increase_every', type=int, default=5,
78 |                     help='every how many iterations thereafter to increase the scheduled sampling probability')
79 | parser.add_argument('--scheduled_sampling_increase_prob', type=float, default=0.05,
80 | help='How much to update the prob')
81 | parser.add_argument('--scheduled_sampling_max_prob', type=float, default=0.25,
82 | help='Maximum scheduled sampling prob.')
83 |
84 | #Optimization: for the Language Model
85 | parser.add_argument('--optim', type=str, default='adam',
86 | help='what update to use? rmsprop|sgd|sgdmom|adagrad|adam')
87 | parser.add_argument('--learning_rate', type=float, default=5e-4,
88 | help='learning rate')
89 | parser.add_argument('--learning_rate_decay_start', type=int, default=1,
90 | help='at what iteration to start decaying learning rate? (-1 = dont) (in epoch)')
91 | parser.add_argument('--learning_rate_decay_every', type=int, default=3,
92 | help='every how many iterations thereafter to drop LR?(in epoch)')
93 | parser.add_argument('--learning_rate_decay_rate', type=float, default=0.8,
94 |                     help='multiplicative factor by which the LR is decayed at each decay step')
95 | parser.add_argument('--optim_alpha', type=float, default=0.9,
96 | help='alpha for adam')
97 | parser.add_argument('--optim_beta', type=float, default=0.999,
98 | help='beta used for adam')
99 | parser.add_argument('--optim_epsilon', type=float, default=1e-8,
100 | help='epsilon that goes into denominator for smoothing')
101 | parser.add_argument('--weight_decay', type=float, default=0,
102 | help='weight_decay')
103 |
104 | # Optimization: for the CNN
105 | parser.add_argument('--finetune_cnn', action='store_true',
106 | help='finetune CNN')
107 | parser.add_argument('--fixed_block', type=float, default=1,
108 | help='fixed cnn block when training. [0-4] \
109 | 0:finetune all block, 4: fix all block')
110 | parser.add_argument('--cnn_optim', type=str, default='adam',
111 | help='what update to use? rmsprop|sgd|sgdmom|adagrad|adam')
112 | parser.add_argument('--cnn_optim_alpha', type=float, default=0.8,
113 | help='cnn alpha for adam')
114 | parser.add_argument('--cnn_optim_beta', type=float, default=0.999,
115 | help='beta used for adam')
116 | parser.add_argument('--cnn_learning_rate', type=float, default=1e-5,
117 | help='cnn learning rate')
118 | parser.add_argument('--cnn_weight_decay', type=float, default=0,
119 | help='weight_decay')
120 | # set training session
121 | parser.add_argument('--start_from', type=str, default=None,
122 | help="""continue training from saved model at this path. Path must contain files saved by previous training process:
123 | 'infos.pkl' : configuration;
124 | 'checkpoint' : paths to model file(s) (created by tf).
125 | Note: this file contains absolute paths, be careful when moving files around;
126 | 'model.ckpt-*' : file(s) with model definition (created by tf)
127 | """)
128 | parser.add_argument('--id', type=str, default='',
129 | help='an id identifying this run/job. used in cross-val and appended when writing progress files')
130 | # Evaluation/Checkpointing
131 | parser.add_argument('--cider_df', type=str, default='corpus',
132 | help='')
133 | parser.add_argument('--val_split', type=str, default='test',
134 | help='')
135 | parser.add_argument('--inference_only', type=bool, default=False,
136 | help='')
137 | parser.add_argument('--val_images_use', type=int, default=5000,
138 | help='how many images to use when periodically evaluating the validation loss? (-1 = all)')
139 | parser.add_argument('--val_every_epoch', type=int, default=3,
140 |                     help='run validation every this many epochs')
141 | parser.add_argument('--checkpoint_path', type=str, default='save',
142 | help='directory to store checkpointed models')
143 | parser.add_argument('--language_eval', type=int, default=1,
144 | help='Evaluate language as well (1 = yes, 0 = no)? BLEU/CIDEr/METEOR/ROUGE_L? requires coco-caption code from Github.')
145 | parser.add_argument('--load_best_score', type=int, default=1,
146 | help='Do we load previous best score when resuming training.')
147 | parser.add_argument('--disp_interval', type=int, default=100,
148 |                     help='how many iterations between displaying the training loss.')
149 | parser.add_argument('--losses_log_every', type=int, default=10,
150 |                     help='how many iterations between logging the losses.')
151 | parser.add_argument('--cbs', type=bool, default=False,
152 |                     help='whether to use constrained beam search.')
153 | parser.add_argument('--cbs_tag_size', type=int, default=3,
154 |                     help='number of tags used in constrained beam search.')
155 | parser.add_argument('--cbs_mode', type=str, default='all',
156 | help='which cbs mode to use in the decoding stage. cbs_mode: all|unique|novel')
157 | parser.add_argument('--det_oracle', type=bool, default=False,
158 |                     help='whether to use oracle bounding boxes.')
159 | args = parser.parse_args()
160 |
161 | return args
162 |
--------------------------------------------------------------------------------
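A short sketch of how these options are typically consumed (flag values below are illustrative); note the argparse type=bool gotcha that the defaults above rely on:

# e.g.  python main.py --path_opt cfgs/normal_coco_res101.yml --batch_size 20
from opts import parse_opt

opt = parse_opt()
print(opt.dataset, opt.cnn_backend, opt.batch_size, opt.cuda)

# Caveat: for the type=bool flags (--cuda, --mGPUs, --cbs, ...), argparse calls bool()
# on the raw string, so "--cuda False" still evaluates to True; rely on the defaults
# or pass an empty string to get False.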
/pooling/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/pooling/__init__.py
--------------------------------------------------------------------------------
/pooling/make.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # CUDA_PATH=/usr/local/cuda/
4 |
5 | export CUDA_PATH=/usr/local/cuda/
6 | # You may also want to add the following
7 | #export C_INCLUDE_PATH=/opt/cuda/include
8 |
9 | export CXXFLAGS="-std=c++11"
10 | export CFLAGS="-std=c99"
11 |
12 | CUDA_ARCH="-gencode arch=compute_30,code=sm_30 \
13 | -gencode arch=compute_35,code=sm_35 \
14 | -gencode arch=compute_50,code=sm_50 \
15 | -gencode arch=compute_52,code=sm_52 \
16 | -gencode arch=compute_60,code=sm_60 \
17 | -gencode arch=compute_61,code=sm_61 "
18 |
19 | # compile roi_align
20 | cd roi_align/src
21 | echo "Compiling roi align kernels by nvcc..."
22 | nvcc -c -o roi_align_kernel.cu.o roi_align_kernel.cu \
23 | 	 -D GOOGLE_CUDA=1 -x cu -Xcompiler -fPIC $CUDA_ARCH
24 | cd ../
25 | python build.py
26 | cd ../
27 |
--------------------------------------------------------------------------------
/pooling/roi_align/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/pooling/roi_align/__init__.py
--------------------------------------------------------------------------------
/pooling/roi_align/_ext/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/pooling/roi_align/_ext/__init__.py
--------------------------------------------------------------------------------
/pooling/roi_align/_ext/roi_align/__init__.py:
--------------------------------------------------------------------------------
1 |
2 | from torch.utils.ffi import _wrap_function
3 | from ._roi_align import lib as _lib, ffi as _ffi
4 |
5 | __all__ = []
6 | def _import_symbols(locals):
7 | for symbol in dir(_lib):
8 | fn = getattr(_lib, symbol)
9 | if callable(fn):
10 | locals[symbol] = _wrap_function(fn, _ffi)
11 | else:
12 | locals[symbol] = fn
13 | __all__.append(symbol)
14 |
15 | _import_symbols(locals())
16 |
--------------------------------------------------------------------------------
/pooling/roi_align/_ext/roi_align/_roi_align.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/pooling/roi_align/_ext/roi_align/_roi_align.so
--------------------------------------------------------------------------------
/pooling/roi_align/build.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | import os
3 | import torch
4 | from torch.utils.ffi import create_extension
5 |
6 | # sources = ['src/roi_align.c']
7 | # headers = ['src/roi_align.h']
8 | sources = []
9 | headers = []
10 | defines = []
11 | with_cuda = False
12 |
13 | if torch.cuda.is_available():
14 | print('Including CUDA code.')
15 | sources += ['src/roi_align_cuda.c']
16 | headers += ['src/roi_align_cuda.h']
17 | defines += [('WITH_CUDA', None)]
18 | with_cuda = True
19 |
20 | this_file = os.path.dirname(os.path.realpath(__file__))
21 | print(this_file)
22 | extra_objects = ['src/roi_align_kernel.cu.o']
23 | extra_objects = [os.path.join(this_file, fname) for fname in extra_objects]
24 |
25 | ffi = create_extension(
26 | '_ext.roi_align',
27 | headers=headers,
28 | sources=sources,
29 | define_macros=defines,
30 | relative_to=__file__,
31 | with_cuda=with_cuda,
32 | extra_objects=extra_objects
33 | )
34 |
35 | if __name__ == '__main__':
36 | ffi.build()
37 |
--------------------------------------------------------------------------------
/pooling/roi_align/functions/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/pooling/roi_align/functions/__init__.py
--------------------------------------------------------------------------------
/pooling/roi_align/functions/roi_align.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.autograd import Function
3 | from .._ext import roi_align
4 |
5 |
6 | # TODO use save_for_backward instead
7 | class RoIAlignFunction(Function):
8 | def __init__(self, aligned_height, aligned_width, spatial_scale):
9 | self.aligned_width = int(aligned_width)
10 | self.aligned_height = int(aligned_height)
11 | self.spatial_scale = float(spatial_scale)
12 | self.rois = None
13 | self.feature_size = None
14 |
15 | def forward(self, features, rois):
16 | self.rois = rois
17 | self.feature_size = features.size()
18 |
19 | batch_size, num_channels, data_height, data_width = features.size()
20 | num_rois = rois.size(0)
21 |
22 | output = features.new(num_rois, num_channels, self.aligned_height, self.aligned_width).zero_()
23 | if features.is_cuda:
24 | roi_align.roi_align_forward_cuda(self.aligned_height,
25 | self.aligned_width,
26 | self.spatial_scale, features,
27 | rois, output)
28 | else:
29 | roi_align.roi_align_forward(self.aligned_height,
30 | self.aligned_width,
31 | self.spatial_scale, features,
32 | rois, output)
33 | # raise NotImplementedError
34 |
35 | return output
36 |
37 | def backward(self, grad_output):
38 | assert(self.feature_size is not None and grad_output.is_cuda)
39 |
40 | batch_size, num_channels, data_height, data_width = self.feature_size
41 |
42 | grad_input = self.rois.new(batch_size, num_channels, data_height,
43 | data_width).zero_()
44 | roi_align.roi_align_backward_cuda(self.aligned_height,
45 | self.aligned_width,
46 | self.spatial_scale, grad_output,
47 | self.rois, grad_input)
48 |
49 | # print grad_input
50 |
51 | return grad_input, None
52 |
--------------------------------------------------------------------------------
/pooling/roi_align/make.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # CUDA_PATH=/usr/local/cuda/
4 |
5 | export CUDA_PATH=/usr/local/cuda/
6 | # You may also want to add the following
7 | #export C_INCLUDE_PATH=/opt/cuda/include
8 |
9 | export CXXFLAGS="-std=c++11"
10 | export CFLAGS="-std=c99"
11 |
12 | CUDA_ARCH="-gencode arch=compute_30,code=sm_30 \
13 | -gencode arch=compute_35,code=sm_35 \
14 | -gencode arch=compute_50,code=sm_50 \
15 | -gencode arch=compute_52,code=sm_52 \
16 | -gencode arch=compute_60,code=sm_60 \
17 | -gencode arch=compute_61,code=sm_61 "
18 |
19 | # compile roi_align
20 | cd src
21 | echo "Compiling roi align kernels by nvcc..."
22 | nvcc -c -o roi_align_kernel.cu.o roi_align_kernel.cu \
23 | -D GOOGLE_CUDA=1 -x cu -Xcompiler -fPIC $CUDA_ARCH
24 | cd ../
25 | python build.py
26 |
--------------------------------------------------------------------------------
/pooling/roi_align/modules/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/pooling/roi_align/modules/__init__.py
--------------------------------------------------------------------------------
/pooling/roi_align/modules/roi_align.py:
--------------------------------------------------------------------------------
1 | from torch.nn.modules.module import Module
2 | from torch.nn.functional import avg_pool2d, max_pool2d
3 | from ..functions.roi_align import RoIAlignFunction
4 |
5 |
6 | class RoIAlign(Module):
7 | def __init__(self, aligned_height, aligned_width, spatial_scale):
8 | super(RoIAlign, self).__init__()
9 |
10 | self.aligned_width = int(aligned_width)
11 | self.aligned_height = int(aligned_height)
12 | self.spatial_scale = float(spatial_scale)
13 |
14 | def forward(self, features, rois):
15 | return RoIAlignFunction(self.aligned_height, self.aligned_width,
16 | self.spatial_scale)(features, rois)
17 |
18 | class RoIAlignAvg(Module):
19 | def __init__(self, aligned_height, aligned_width, spatial_scale):
20 | super(RoIAlignAvg, self).__init__()
21 |
22 | self.aligned_width = int(aligned_width)
23 | self.aligned_height = int(aligned_height)
24 | self.spatial_scale = float(spatial_scale)
25 |
26 | def forward(self, features, rois):
27 | x = RoIAlignFunction(self.aligned_height+1, self.aligned_width+1,
28 | self.spatial_scale)(features, rois)
29 | return avg_pool2d(x, kernel_size=2, stride=1)
30 |
31 | class RoIAlignMax(Module):
32 | def __init__(self, aligned_height, aligned_width, spatial_scale):
33 | super(RoIAlignMax, self).__init__()
34 |
35 | self.aligned_width = int(aligned_width)
36 | self.aligned_height = int(aligned_height)
37 | self.spatial_scale = float(spatial_scale)
38 |
39 | def forward(self, features, rois):
40 | x = RoIAlignFunction(self.aligned_height+1, self.aligned_width+1,
41 | self.spatial_scale)(features, rois)
42 | return max_pool2d(x, kernel_size=2, stride=1)
43 |
--------------------------------------------------------------------------------
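A usage sketch of RoIAlignAvg defined above, assuming the _roi_align CUDA extension has been built with the make.sh scripts in this directory. Each ROI row is [batch_index, x1, y1, x2, y2] in image coordinates, and spatial_scale maps those coordinates onto the feature map:

import torch
from pooling.roi_align.modules.roi_align import RoIAlignAvg

roi_align = RoIAlignAvg(aligned_height=7, aligned_width=7, spatial_scale=1.0 / 16)

features = torch.randn(1, 1024, 32, 32).cuda()             # e.g. a stride-16 conv feature map
rois = torch.FloatTensor([[0, 0, 0, 255, 255]]).cuda()     # one box covering 256x256 pixels
pooled = roi_align(features, rois)                         # (1, 1024, 7, 7) after the avg pool
print(pooled.shape)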
/pooling/roi_align/src/roi_align.c:
--------------------------------------------------------------------------------
1 | #include <TH/TH.h>
2 | #include <math.h>
3 | #include <omp.h>
4 |
5 |
6 | void ROIAlignForwardCpu(const float* bottom_data, const float spatial_scale, const int num_rois,
7 | const int height, const int width, const int channels,
8 | const int aligned_height, const int aligned_width, const float * bottom_rois,
9 | float* top_data);
10 |
11 | void ROIAlignBackwardCpu(const float* top_diff, const float spatial_scale, const int num_rois,
12 | const int height, const int width, const int channels,
13 | const int aligned_height, const int aligned_width, const float * bottom_rois,
14 | float* top_data);
15 |
16 | int roi_align_forward(int aligned_height, int aligned_width, float spatial_scale,
17 | THFloatTensor * features, THFloatTensor * rois, THFloatTensor * output)
18 | {
19 | //Grab the input tensor
20 | float * data_flat = THFloatTensor_data(features);
21 | float * rois_flat = THFloatTensor_data(rois);
22 |
23 | float * output_flat = THFloatTensor_data(output);
24 |
25 | // Number of ROIs
26 | int num_rois = THFloatTensor_size(rois, 0);
27 | int size_rois = THFloatTensor_size(rois, 1);
28 | if (size_rois != 5)
29 | {
30 | return 0;
31 | }
32 |
33 | // data height
34 | int data_height = THFloatTensor_size(features, 2);
35 | // data width
36 | int data_width = THFloatTensor_size(features, 3);
37 | // Number of channels
38 | int num_channels = THFloatTensor_size(features, 1);
39 |
40 | // do ROIAlignForward
41 | ROIAlignForwardCpu(data_flat, spatial_scale, num_rois, data_height, data_width, num_channels,
42 | aligned_height, aligned_width, rois_flat, output_flat);
43 |
44 | return 1;
45 | }
46 |
47 | int roi_align_backward(int aligned_height, int aligned_width, float spatial_scale,
48 | THFloatTensor * top_grad, THFloatTensor * rois, THFloatTensor * bottom_grad)
49 | {
50 | //Grab the input tensor
51 | float * top_grad_flat = THFloatTensor_data(top_grad);
52 | float * rois_flat = THFloatTensor_data(rois);
53 |
54 | float * bottom_grad_flat = THFloatTensor_data(bottom_grad);
55 |
56 | // Number of ROIs
57 | int num_rois = THFloatTensor_size(rois, 0);
58 | int size_rois = THFloatTensor_size(rois, 1);
59 | if (size_rois != 5)
60 | {
61 | return 0;
62 | }
63 |
64 | // batch size
65 | // int batch_size = THFloatTensor_size(bottom_grad, 0);
66 | // data height
67 | int data_height = THFloatTensor_size(bottom_grad, 2);
68 | // data width
69 | int data_width = THFloatTensor_size(bottom_grad, 3);
70 | // Number of channels
71 | int num_channels = THFloatTensor_size(bottom_grad, 1);
72 |
73 | // do ROIAlignBackward
74 | ROIAlignBackwardCpu(top_grad_flat, spatial_scale, num_rois, data_height,
75 | data_width, num_channels, aligned_height, aligned_width, rois_flat, bottom_grad_flat);
76 |
77 | return 1;
78 | }
79 |
80 | void ROIAlignForwardCpu(const float* bottom_data, const float spatial_scale, const int num_rois,
81 | const int height, const int width, const int channels,
82 | const int aligned_height, const int aligned_width, const float * bottom_rois,
83 | float* top_data)
84 | {
85 | const int output_size = num_rois * aligned_height * aligned_width * channels;
86 |
87 | int idx = 0;
88 | for (idx = 0; idx < output_size; ++idx)
89 | {
90 | // (n, c, ph, pw) is an element in the aligned output
91 | int pw = idx % aligned_width;
92 | int ph = (idx / aligned_width) % aligned_height;
93 | int c = (idx / aligned_width / aligned_height) % channels;
94 | int n = idx / aligned_width / aligned_height / channels;
95 |
96 | float roi_batch_ind = bottom_rois[n * 5 + 0];
97 | float roi_start_w = bottom_rois[n * 5 + 1] * spatial_scale;
98 | float roi_start_h = bottom_rois[n * 5 + 2] * spatial_scale;
99 | float roi_end_w = bottom_rois[n * 5 + 3] * spatial_scale;
100 | float roi_end_h = bottom_rois[n * 5 + 4] * spatial_scale;
101 |
102 | // Force malformed ROI to be 1x1
103 | float roi_width = fmaxf(roi_end_w - roi_start_w + 1., 0.);
104 | float roi_height = fmaxf(roi_end_h - roi_start_h + 1., 0.);
105 | float bin_size_h = roi_height / (aligned_height - 1.);
106 | float bin_size_w = roi_width / (aligned_width - 1.);
107 |
108 | float h = (float)(ph) * bin_size_h + roi_start_h;
109 | float w = (float)(pw) * bin_size_w + roi_start_w;
110 |
111 | int hstart = fminf(floor(h), height - 2);
112 | int wstart = fminf(floor(w), width - 2);
113 |
114 | int img_start = roi_batch_ind * channels * height * width;
115 |
116 | // bilinear interpolation
117 | if (h < 0 || h >= height || w < 0 || w >= width)
118 | {
119 | top_data[idx] = 0.;
120 | }
121 | else
122 | {
123 | float h_ratio = h - (float)(hstart);
124 | float w_ratio = w - (float)(wstart);
125 | int upleft = img_start + (c * height + hstart) * width + wstart;
126 | int upright = upleft + 1;
127 | int downleft = upleft + width;
128 | int downright = downleft + 1;
129 |
130 | top_data[idx] = bottom_data[upleft] * (1. - h_ratio) * (1. - w_ratio)
131 | + bottom_data[upright] * (1. - h_ratio) * w_ratio
132 | + bottom_data[downleft] * h_ratio * (1. - w_ratio)
133 | + bottom_data[downright] * h_ratio * w_ratio;
134 | }
135 | }
136 | }
137 |
138 | void ROIAlignBackwardCpu(const float* top_diff, const float spatial_scale, const int num_rois,
139 | const int height, const int width, const int channels,
140 | const int aligned_height, const int aligned_width, const float * bottom_rois,
141 | float* bottom_diff)
142 | {
143 | const int output_size = num_rois * aligned_height * aligned_width * channels;
144 |
145 | int idx = 0;
146 | for (idx = 0; idx < output_size; ++idx)
147 | {
148 | // (n, c, ph, pw) is an element in the aligned output
149 | int pw = idx % aligned_width;
150 | int ph = (idx / aligned_width) % aligned_height;
151 | int c = (idx / aligned_width / aligned_height) % channels;
152 | int n = idx / aligned_width / aligned_height / channels;
153 |
154 | float roi_batch_ind = bottom_rois[n * 5 + 0];
155 | float roi_start_w = bottom_rois[n * 5 + 1] * spatial_scale;
156 | float roi_start_h = bottom_rois[n * 5 + 2] * spatial_scale;
157 | float roi_end_w = bottom_rois[n * 5 + 3] * spatial_scale;
158 | float roi_end_h = bottom_rois[n * 5 + 4] * spatial_scale;
159 |
160 | // Force malformed ROI to be 1x1
161 | float roi_width = fmaxf(roi_end_w - roi_start_w + 1., 0.);
162 | float roi_height = fmaxf(roi_end_h - roi_start_h + 1., 0.);
163 | float bin_size_h = roi_height / (aligned_height - 1.);
164 | float bin_size_w = roi_width / (aligned_width - 1.);
165 |
166 | float h = (float)(ph) * bin_size_h + roi_start_h;
167 | float w = (float)(pw) * bin_size_w + roi_start_w;
168 |
169 | int hstart = fminf(floor(h), height - 2);
170 | int wstart = fminf(floor(w), width - 2);
171 |
172 | int img_start = roi_batch_ind * channels * height * width;
173 |
174 |         // bilinear interpolation: only scatter gradients back for in-bounds samples
175 |         if (!(h < 0 || h >= height || w < 0 || w >= width))
176 | {
177 | float h_ratio = h - (float)(hstart);
178 | float w_ratio = w - (float)(wstart);
179 | int upleft = img_start + (c * height + hstart) * width + wstart;
180 | int upright = upleft + 1;
181 | int downleft = upleft + width;
182 | int downright = downleft + 1;
183 |
184 | bottom_diff[upleft] += top_diff[idx] * (1. - h_ratio) * (1. - w_ratio);
185 | bottom_diff[upright] += top_diff[idx] * (1. - h_ratio) * w_ratio;
186 | bottom_diff[downleft] += top_diff[idx] * h_ratio * (1. - w_ratio);
187 | bottom_diff[downright] += top_diff[idx] * h_ratio * w_ratio;
188 | }
189 | }
190 | }
191 |
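Editorial note (not part of the repo): the per-cell sampling done by ROIAlignForwardCpu can be sketched in NumPy as below. The function name and the single-image (C, H, W) layout are illustrative; the sampling grid, the clamping to height-2/width-2 and the bilinear weights mirror the C code above.

import numpy as np

def roi_align_forward_single(feat, roi, aligned_h, aligned_w, spatial_scale):
    """Sample one ROI from a (C, H, W) feature map, mirroring ROIAlignForwardCpu."""
    C, H, W = feat.shape
    x1, y1, x2, y2 = (v * spatial_scale for v in roi)     # roi = (x1, y1, x2, y2)
    roi_w = max(x2 - x1 + 1.0, 0.0)
    roi_h = max(y2 - y1 + 1.0, 0.0)
    bin_h = roi_h / (aligned_h - 1.0)
    bin_w = roi_w / (aligned_w - 1.0)
    out = np.zeros((C, aligned_h, aligned_w), dtype=feat.dtype)
    for ph in range(aligned_h):
        for pw in range(aligned_w):
            h = ph * bin_h + y1
            w = pw * bin_w + x1
            if h < 0 or h >= H or w < 0 or w >= W:
                continue                                   # out-of-range samples stay 0
            hs = int(min(np.floor(h), H - 2))
            ws = int(min(np.floor(w), W - 2))
            hr, wr = h - hs, w - ws                        # bilinear weights
            out[:, ph, pw] = (feat[:, hs, ws]         * (1 - hr) * (1 - wr)
                              + feat[:, hs, ws + 1]   * (1 - hr) * wr
                              + feat[:, hs + 1, ws]   * hr * (1 - wr)
                              + feat[:, hs + 1, ws + 1] * hr * wr)
    return out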
--------------------------------------------------------------------------------
/pooling/roi_align/src/roi_align.h:
--------------------------------------------------------------------------------
1 | int roi_align_forward(int aligned_height, int aligned_width, float spatial_scale,
2 | THFloatTensor * features, THFloatTensor * rois, THFloatTensor * output);
3 |
4 | int roi_align_backward(int aligned_height, int aligned_width, float spatial_scale,
5 | THFloatTensor * top_grad, THFloatTensor * rois, THFloatTensor * bottom_grad);
6 |
--------------------------------------------------------------------------------
/pooling/roi_align/src/roi_align_cuda.c:
--------------------------------------------------------------------------------
1 | #include <THC/THC.h>
2 | #include <math.h>
3 | #include "roi_align_kernel.h"
4 |
5 | extern THCState *state;
6 |
7 | int roi_align_forward_cuda(int aligned_height, int aligned_width, float spatial_scale,
8 | THCudaTensor * features, THCudaTensor * rois, THCudaTensor * output)
9 | {
10 | // Grab the input tensor
11 | float * data_flat = THCudaTensor_data(state, features);
12 | float * rois_flat = THCudaTensor_data(state, rois);
13 |
14 | float * output_flat = THCudaTensor_data(state, output);
15 |
16 | // Number of ROIs
17 | int num_rois = THCudaTensor_size(state, rois, 0);
18 | int size_rois = THCudaTensor_size(state, rois, 1);
19 | if (size_rois != 5)
20 | {
21 | return 0;
22 | }
23 |
24 | // data height
25 | int data_height = THCudaTensor_size(state, features, 2);
26 | // data width
27 | int data_width = THCudaTensor_size(state, features, 3);
28 | // Number of channels
29 | int num_channels = THCudaTensor_size(state, features, 1);
30 |
31 | cudaStream_t stream = THCState_getCurrentStream(state);
32 |
33 | ROIAlignForwardLaucher(
34 | data_flat, spatial_scale, num_rois, data_height,
35 | data_width, num_channels, aligned_height,
36 | aligned_width, rois_flat,
37 | output_flat, stream);
38 |
39 | return 1;
40 | }
41 |
42 | int roi_align_backward_cuda(int aligned_height, int aligned_width, float spatial_scale,
43 | THCudaTensor * top_grad, THCudaTensor * rois, THCudaTensor * bottom_grad)
44 | {
45 | // Grab the input tensor
46 | float * top_grad_flat = THCudaTensor_data(state, top_grad);
47 | float * rois_flat = THCudaTensor_data(state, rois);
48 |
49 | float * bottom_grad_flat = THCudaTensor_data(state, bottom_grad);
50 |
51 | // Number of ROIs
52 | int num_rois = THCudaTensor_size(state, rois, 0);
53 | int size_rois = THCudaTensor_size(state, rois, 1);
54 | if (size_rois != 5)
55 | {
56 | return 0;
57 | }
58 |
59 | // batch size
60 | int batch_size = THCudaTensor_size(state, bottom_grad, 0);
61 | // data height
62 | int data_height = THCudaTensor_size(state, bottom_grad, 2);
63 | // data width
64 | int data_width = THCudaTensor_size(state, bottom_grad, 3);
65 | // Number of channels
66 | int num_channels = THCudaTensor_size(state, bottom_grad, 1);
67 |
68 | cudaStream_t stream = THCState_getCurrentStream(state);
69 | ROIAlignBackwardLaucher(
70 | top_grad_flat, spatial_scale, batch_size, num_rois, data_height,
71 | data_width, num_channels, aligned_height,
72 | aligned_width, rois_flat,
73 | bottom_grad_flat, stream);
74 |
75 | return 1;
76 | }
77 |
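Editorial sketch of the tensor layout these entry points assume, inferred from the size_rois check and the n*5 indexing in the kernels: each ROI row carries a batch index followed by box coordinates, and the output has one (C, aligned_h, aligned_w) slab per ROI. The concrete sizes and the 1/16 spatial scale below are made-up examples, not values from the repo.

import numpy as np

B, C, H, W = 2, 512, 38, 50            # backbone feature map
R = 5                                   # ROIs across the whole batch
aligned_h = aligned_w = 7
spatial_scale = 1.0 / 16                # input-image coords -> feature-map coords

features = np.zeros((B, C, H, W), dtype=np.float32)
# Each ROI row is [batch_index, x1, y1, x2, y2] in input-image coordinates;
# roi_align_forward_cuda returns 0 if rois does not have exactly 5 columns.
rois = np.zeros((R, 5), dtype=np.float32)
output = np.zeros((R, C, aligned_h, aligned_w), dtype=np.float32)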
--------------------------------------------------------------------------------
/pooling/roi_align/src/roi_align_cuda.h:
--------------------------------------------------------------------------------
1 | int roi_align_forward_cuda(int aligned_height, int aligned_width, float spatial_scale,
2 | THCudaTensor * features, THCudaTensor * rois, THCudaTensor * output);
3 |
4 | int roi_align_backward_cuda(int aligned_height, int aligned_width, float spatial_scale,
5 | THCudaTensor * top_grad, THCudaTensor * rois, THCudaTensor * bottom_grad);
6 |
--------------------------------------------------------------------------------
/pooling/roi_align/src/roi_align_kernel.cu:
--------------------------------------------------------------------------------
1 | #ifdef __cplusplus
2 | extern "C" {
3 | #endif
4 |
5 | #include <stdio.h>
6 | #include <math.h>
7 | #include <float.h>
8 | #include "roi_align_kernel.h"
9 |
10 | #define CUDA_1D_KERNEL_LOOP(i, n) \
11 | for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; \
12 | i += blockDim.x * gridDim.x)
13 |
14 |
15 | __global__ void ROIAlignForward(const int nthreads, const float* bottom_data, const float spatial_scale, const int height, const int width,
16 | const int channels, const int aligned_height, const int aligned_width, const float* bottom_rois, float* top_data) {
17 | CUDA_1D_KERNEL_LOOP(index, nthreads) {
18 | // (n, c, ph, pw) is an element in the aligned output
19 | // int n = index;
20 | // int pw = n % aligned_width;
21 | // n /= aligned_width;
22 | // int ph = n % aligned_height;
23 | // n /= aligned_height;
24 | // int c = n % channels;
25 | // n /= channels;
26 |
27 | int pw = index % aligned_width;
28 | int ph = (index / aligned_width) % aligned_height;
29 | int c = (index / aligned_width / aligned_height) % channels;
30 | int n = index / aligned_width / aligned_height / channels;
31 |
32 | // bottom_rois += n * 5;
33 | float roi_batch_ind = bottom_rois[n * 5 + 0];
34 | float roi_start_w = bottom_rois[n * 5 + 1] * spatial_scale;
35 | float roi_start_h = bottom_rois[n * 5 + 2] * spatial_scale;
36 | float roi_end_w = bottom_rois[n * 5 + 3] * spatial_scale;
37 | float roi_end_h = bottom_rois[n * 5 + 4] * spatial_scale;
38 |
39 | // Force malformed ROIs to be 1x1
40 | float roi_width = fmaxf(roi_end_w - roi_start_w + 1., 0.);
41 | float roi_height = fmaxf(roi_end_h - roi_start_h + 1., 0.);
42 | float bin_size_h = roi_height / (aligned_height - 1.);
43 | float bin_size_w = roi_width / (aligned_width - 1.);
44 |
45 | float h = (float)(ph) * bin_size_h + roi_start_h;
46 | float w = (float)(pw) * bin_size_w + roi_start_w;
47 |
48 | int hstart = fminf(floor(h), height - 2);
49 | int wstart = fminf(floor(w), width - 2);
50 |
51 | int img_start = roi_batch_ind * channels * height * width;
52 |
53 | // bilinear interpolation
54 | if (h < 0 || h >= height || w < 0 || w >= width) {
55 | top_data[index] = 0.;
56 | } else {
57 | float h_ratio = h - (float)(hstart);
58 | float w_ratio = w - (float)(wstart);
59 | int upleft = img_start + (c * height + hstart) * width + wstart;
60 | int upright = upleft + 1;
61 | int downleft = upleft + width;
62 | int downright = downleft + 1;
63 |
64 | top_data[index] = bottom_data[upleft] * (1. - h_ratio) * (1. - w_ratio)
65 | + bottom_data[upright] * (1. - h_ratio) * w_ratio
66 | + bottom_data[downleft] * h_ratio * (1. - w_ratio)
67 | + bottom_data[downright] * h_ratio * w_ratio;
68 | }
69 | }
70 | }
71 |
72 |
73 | int ROIAlignForwardLaucher(const float* bottom_data, const float spatial_scale, const int num_rois, const int height, const int width,
74 | const int channels, const int aligned_height, const int aligned_width, const float* bottom_rois, float* top_data, cudaStream_t stream) {
75 | const int kThreadsPerBlock = 1024;
76 | const int output_size = num_rois * aligned_height * aligned_width * channels;
77 | cudaError_t err;
78 |
79 |
80 | ROIAlignForward<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock, kThreadsPerBlock, 0, stream>>>(
81 | output_size, bottom_data, spatial_scale, height, width, channels,
82 | aligned_height, aligned_width, bottom_rois, top_data);
83 |
84 | err = cudaGetLastError();
85 | if(cudaSuccess != err) {
86 | fprintf( stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString( err ) );
87 | exit( -1 );
88 | }
89 |
90 | return 1;
91 | }
92 |
93 |
94 | __global__ void ROIAlignBackward(const int nthreads, const float* top_diff, const float spatial_scale, const int height, const int width,
95 | const int channels, const int aligned_height, const int aligned_width, float* bottom_diff, const float* bottom_rois) {
96 | CUDA_1D_KERNEL_LOOP(index, nthreads) {
97 |
98 | // (n, c, ph, pw) is an element in the aligned output
99 | int pw = index % aligned_width;
100 | int ph = (index / aligned_width) % aligned_height;
101 | int c = (index / aligned_width / aligned_height) % channels;
102 | int n = index / aligned_width / aligned_height / channels;
103 |
104 | float roi_batch_ind = bottom_rois[n * 5 + 0];
105 | float roi_start_w = bottom_rois[n * 5 + 1] * spatial_scale;
106 | float roi_start_h = bottom_rois[n * 5 + 2] * spatial_scale;
107 | float roi_end_w = bottom_rois[n * 5 + 3] * spatial_scale;
108 | float roi_end_h = bottom_rois[n * 5 + 4] * spatial_scale;
109 | /* int roi_start_w = round(bottom_rois[1] * spatial_scale); */
110 | /* int roi_start_h = round(bottom_rois[2] * spatial_scale); */
111 | /* int roi_end_w = round(bottom_rois[3] * spatial_scale); */
112 | /* int roi_end_h = round(bottom_rois[4] * spatial_scale); */
113 |
114 | // Force malformed ROIs to be 1x1
115 | float roi_width = fmaxf(roi_end_w - roi_start_w + 1., 0.);
116 | float roi_height = fmaxf(roi_end_h - roi_start_h + 1., 0.);
117 | float bin_size_h = roi_height / (aligned_height - 1.);
118 | float bin_size_w = roi_width / (aligned_width - 1.);
119 |
120 | float h = (float)(ph) * bin_size_h + roi_start_h;
121 | float w = (float)(pw) * bin_size_w + roi_start_w;
122 |
123 | int hstart = fminf(floor(h), height - 2);
124 | int wstart = fminf(floor(w), width - 2);
125 |
126 | int img_start = roi_batch_ind * channels * height * width;
127 |
128 | // bilinear interpolation
129 | if (!(h < 0 || h >= height || w < 0 || w >= width)) {
130 | float h_ratio = h - (float)(hstart);
131 | float w_ratio = w - (float)(wstart);
132 | int upleft = img_start + (c * height + hstart) * width + wstart;
133 | int upright = upleft + 1;
134 | int downleft = upleft + width;
135 | int downright = downleft + 1;
136 |
137 | atomicAdd(bottom_diff + upleft, top_diff[index] * (1. - h_ratio) * (1 - w_ratio));
138 | atomicAdd(bottom_diff + upright, top_diff[index] * (1. - h_ratio) * w_ratio);
139 | atomicAdd(bottom_diff + downleft, top_diff[index] * h_ratio * (1 - w_ratio));
140 | atomicAdd(bottom_diff + downright, top_diff[index] * h_ratio * w_ratio);
141 | }
142 | }
143 | }
144 |
145 | int ROIAlignBackwardLaucher(const float* top_diff, const float spatial_scale, const int batch_size, const int num_rois, const int height, const int width,
146 | const int channels, const int aligned_height, const int aligned_width, const float* bottom_rois, float* bottom_diff, cudaStream_t stream) {
147 | const int kThreadsPerBlock = 1024;
148 | const int output_size = num_rois * aligned_height * aligned_width * channels;
149 | cudaError_t err;
150 |
151 | ROIAlignBackward<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock, kThreadsPerBlock, 0, stream>>>(
152 | output_size, top_diff, spatial_scale, height, width, channels,
153 | aligned_height, aligned_width, bottom_diff, bottom_rois);
154 |
155 | err = cudaGetLastError();
156 | if(cudaSuccess != err) {
157 | fprintf( stderr, "cudaCheckError() failed : %s\n", cudaGetErrorString( err ) );
158 | exit( -1 );
159 | }
160 |
161 | return 1;
162 | }
163 |
164 |
165 | #ifdef __cplusplus
166 | }
167 | #endif
168 |
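For reference, a small editorial sketch (Python, with made-up sizes) of the launch arithmetic used by both launchers and of the grid-stride pattern that CUDA_1D_KERNEL_LOOP expands to.

num_rois, channels, aligned_h, aligned_w = 128, 512, 7, 7
kThreadsPerBlock = 1024
output_size = num_rois * channels * aligned_h * aligned_w              # 3,211,264 output cells
num_blocks = (output_size + kThreadsPerBlock - 1) // kThreadsPerBlock  # ceil division -> 3136 blocks

# CUDA_1D_KERNEL_LOOP(index, nthreads) is a grid-stride loop: each thread starts
# at its global index and advances by the total number of launched threads, so
# every index in [0, output_size) is visited exactly once.
total_threads = num_blocks * kThreadsPerBlock
def indices_for_thread(global_thread_id):
    return range(global_thread_id, output_size, total_threads)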
--------------------------------------------------------------------------------
/pooling/roi_align/src/roi_align_kernel.h:
--------------------------------------------------------------------------------
1 | #ifndef _ROI_ALIGN_KERNEL
2 | #define _ROI_ALIGN_KERNEL
3 |
4 | #ifdef __cplusplus
5 | extern "C" {
6 | #endif
7 |
8 | __global__ void ROIAlignForward(const int nthreads, const float* bottom_data,
9 | const float spatial_scale, const int height, const int width,
10 | const int channels, const int aligned_height, const int aligned_width,
11 | const float* bottom_rois, float* top_data);
12 |
13 | int ROIAlignForwardLaucher(
14 | const float* bottom_data, const float spatial_scale, const int num_rois, const int height,
15 | const int width, const int channels, const int aligned_height,
16 | const int aligned_width, const float* bottom_rois,
17 | float* top_data, cudaStream_t stream);
18 |
19 | __global__ void ROIAlignBackward(const int nthreads, const float* top_diff,
20 | const float spatial_scale, const int height, const int width,
21 | const int channels, const int aligned_height, const int aligned_width,
22 | float* bottom_diff, const float* bottom_rois);
23 |
24 | int ROIAlignBackwardLaucher(const float* top_diff, const float spatial_scale, const int batch_size, const int num_rois,
25 | const int height, const int width, const int channels, const int aligned_height,
26 | const int aligned_width, const float* bottom_rois,
27 | float* bottom_diff, cudaStream_t stream);
28 |
29 | #ifdef __cplusplus
30 | }
31 | #endif
32 |
33 | #endif
34 |
35 |
--------------------------------------------------------------------------------
/prepro/prepro_det.py:
--------------------------------------------------------------------------------
1 | import json
2 | import pdb
3 | import numpy as np
4 | import h5py
5 |
6 | dataset = 'coco'
7 |
8 | if dataset == 'coco':
9 | det_train = json.load(open('data/coco_noc/coco_detection_noc_train.json'))
10 | det_val = json.load(open('data/coco_noc/coco_detection_noc_val.json'))
11 | info = json.load(open('data/coco_noc/dic_coco.json'))
12 |
13 | det = []
14 | for img in det_train:
15 | img['split'] = 'train2014'
16 | det.append(img)
17 |
18 | for img in det_val:
19 | img['split'] = 'val2014'
20 | det.append(img)
21 | elif dataset == 'flickr30k':
22 | det_file = json.load(open('data/flickr30k/flickr30k_detection.json'))
23 | info = json.load(open('data/flickr30k/dic_flickr30k.json'))
24 | det = []
25 | for img in det_file:
26 | det.append(img)
27 |
28 | proposal_file = {}
29 | for img in det:
30 | proposal_file[img['image_id']] = img
31 |
32 | N = len(det)
33 | dets_labels = np.zeros((N, 100, 6))
34 | dets_num = np.zeros((N))
35 | nms_num = np.zeros((N))
36 |
37 | for idx, img in enumerate(info['images']):
38 | image_id = img['id']
39 | proposal = proposal_file[image_id]
40 |
41 | num_proposal = len(proposal['detection'])
42 |
43 | num_nms = proposal['num_boxes']
44 | proposals = np.zeros([num_proposal, 6])
45 | for i in range(num_proposal):
46 | proposals[i, :4] = proposal['detection'][i]['location']
47 | proposals[i, 4] = proposal['detection'][i]['label']
48 | proposals[i, 5] = proposal['detection'][i]['score']
49 |
50 | dets_labels[idx,:num_proposal] = proposals
51 | dets_num[idx] = num_proposal
52 | nms_num[idx] = num_nms
53 |
54 | if dataset == 'coco':
55 | f = h5py.File('coco_noc_detection.h5', "w")
56 | elif dataset == 'flickr30k':
57 | f = h5py.File('flickr30k_detection.h5', "w")
58 |
59 | f.create_dataset("dets_labels", data=dets_labels)
60 | f.create_dataset("dets_num", data=dets_num)
61 | f.create_dataset("nms_num", data=nms_num)
62 | f.close()
63 |
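A minimal editorial sketch of reading back the HDF5 file written above, assuming the coco branch ran; the column layout of dets_labels follows the proposals array built in the loop (four location values, then label, then score), zero-padded to 100 rows per image.

import h5py

with h5py.File('coco_noc_detection.h5', 'r') as f:
    dets_labels = f['dets_labels'][:]   # (N, 100, 6): location (4 values), label, score
    dets_num    = f['dets_num'][:]      # (N,): number of valid detections per image
    nms_num     = f['nms_num'][:]       # (N,): the detector's reported 'num_boxes'

boxes_img0 = dets_labels[0, :int(dets_num[0])]   # drop the padding rows for the first image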
--------------------------------------------------------------------------------
/prepro/prepro_dic_coco.py:
--------------------------------------------------------------------------------
1 | """
2 | Preprocess a raw json dataset into hdf5/json files for use in data_loader.lua
3 |
4 | Input: json file that has the form
5 | [{ file_path: 'path/img.jpg', captions: ['a caption', ...] }, ...]
6 | example element in this list would look like
7 | {'captions': [u'A man with a red helmet on a small moped on a dirt road. ', u'Man riding a motor bike on a dirt road on the countryside.', u'A man riding on the back of a motorcycle.', u'A dirt path with a young person on a motor bike rests to the foreground of a verdant area with a bridge and a background of cloud-wreathed mountains. ', u'A man in a red shirt and a red hat is on a motorcycle on a hill side.'], 'file_path': u'val2014/COCO_val2014_000000391895.jpg', 'id': 391895}
8 |
9 | This script reads this json, does some basic preprocessing on the captions
10 | (e.g. lowercase, etc.), creates a special UNK token, and encodes everything to arrays
11 |
12 | Output: a json file and an hdf5 file
13 | The hdf5 file contains several fields:
14 | /images is (N,3,256,256) uint8 array of raw image data in RGB format
15 | /labels is (M,max_length) uint32 array of encoded labels, zero padded
16 | /label_start_ix and /label_end_ix are (N,) uint32 arrays of pointers to the
17 | first and last indices (in range 1..M) of labels for each image
18 | /label_length stores the length of the sequence for each of the M sequences
19 |
20 | The json file has a dict that contains:
21 | - an 'ix_to_word' field storing the vocab in form {ix:'word'}, where ix is 1-indexed
22 | - an 'images' field that is a list holding auxiliary information for each image,
23 | such as in particular the 'split' it was assigned to.
24 | """
25 | """
26 | To build the prepro files for Neural Baby Talk we need 2 additional dictionaries:
27 | wtol: word to lemma, i.e. the original (uninflected) form of the word.
28 | wtod: word to detection, i.e. the detection label for the word.
29 | """
30 | import os
31 | import json
32 | import argparse
33 | from random import shuffle, seed
34 | import string
35 | # non-standard dependencies:
36 | import h5py
37 | import numpy as np
38 | import torch
39 | import torchvision.models as models
40 | from torch.autograd import Variable
41 | import skimage.io
42 | import pdb
43 | from stanfordcorenlp import StanfordCoreNLP
44 | from nltk.tokenize import word_tokenize
45 |
46 | nlp = StanfordCoreNLP(
47 | os.path.join(os.path.dirname(os.path.realpath(__file__)),
48 | 'stanford-corenlp-full-2017-06-09'), memory='8g')
49 | props={'annotators': 'ssplit, tokenize, lemma','pipelineLanguage':'en', 'outputFormat':'json'}
50 |
51 | def build_vocab(imgs, params):
52 | count_thr = params['word_count_threshold']
53 |
54 | # count up the number of words
55 | counts = {}
56 | for img in imgs:
57 | for sent in img['sentences']:
58 | # sent['tokens'] = word_tokenize(sent['raw'].lower())
59 | for w in sent['tokens']:
60 | counts[w] = counts.get(w, 0) + 1
61 | cw = sorted([(count,w) for w,count in counts.items()], reverse=True)
62 | print('top words and their counts:')
63 | print('\n'.join(map(str,cw[:20])))
64 |
65 | # print some stats
66 | total_words = sum(counts.values())
67 | print('total words:', total_words)
68 | bad_words = [w for w,n in counts.items() if n <= count_thr]
69 | vocab = [w for w,n in counts.items() if n > count_thr]
70 | bad_count = sum(counts[w] for w in bad_words)
71 | print('number of bad words: %d/%d = %.2f%%' % (len(bad_words), len(counts), len(bad_words)*100.0/len(counts)))
72 | print('number of words in vocab would be %d' % (len(vocab), ))
73 | print('number of UNKs: %d/%d = %.2f%%' % (bad_count, total_words, bad_count*100.0/total_words))
74 |
75 | # lets look at the distribution of lengths as well
76 | sent_lengths = {}
77 | for img in imgs:
78 | for sent in img['sentences']:
79 | txt = sent['tokens']
80 | nw = len(txt)
81 | sent_lengths[nw] = sent_lengths.get(nw, 0) + 1
82 | max_len = max(sent_lengths.keys())
83 | print('max length sentence in raw data: ', max_len)
84 | print('sentence length distribution (count, number of words):')
85 | sum_len = sum(sent_lengths.values())
86 | for i in range(max_len+1):
87 | print('%2d: %10d %f%%' % (i, sent_lengths.get(i,0), sent_lengths.get(i,0)*100.0/sum_len))
88 |
89 | # lets now produce the final annotations
90 | if bad_count > 0:
91 | # additional special UNK token we will use below to map infrequent words to
92 | print('inserting the special UNK token')
93 | vocab.append('UNK')
94 |
95 | imgs_new = []
96 | for img in imgs:
97 | img['final_captions'] = []
98 | for sent in img['sentences']:
99 | txt = sent['tokens']
100 | caption = [w if counts.get(w,0) > count_thr else 'UNK' for w in txt]
101 | img['final_captions'].append(caption)
102 | imgs_new.append(img['final_captions'])
103 |
104 | return vocab, imgs_new
105 |
106 | def main(params):
107 |
108 | coco_class_all = []
109 | coco_class_name = open('data/coco/coco_class_name.txt', 'r')
110 | for line in coco_class_name:
111 | coco_class = line.rstrip("\n").split(', ')
112 | coco_class_all.append(coco_class)
113 |
114 | # word to detection label
115 | wtod = {}
116 | for i in range(len(coco_class_all)):
117 | for w in coco_class_all[i]:
118 | wtod[w] = i
119 |
120 | imgs = json.load(open(params['input_json'], 'r'))
121 | imgs = imgs['images']
122 |
123 | seed(123) # make reproducible
124 |
125 | # create the vocab
126 | vocab, imgs_new = build_vocab(imgs, params)
127 | itow = {i+1:w for i,w in enumerate(vocab)} # a 1-indexed vocab translation table
128 | wtoi = {w:i+1 for i,w in enumerate(vocab)} # inverse table
129 |
130 | wtol = {}
131 | for w in vocab:
132 | out = json.loads(nlp.annotate(w, properties=props))
133 | lemma_w = out['sentences'][0]['tokens'][0]['lemma']
134 | wtol[w] = lemma_w
135 |
136 | if params['split'] == 'robust':
137 | split_path = 'data/robust_coco/split_robust_coco.json'
138 | split_file = json.load(open(split_path, 'r'))
139 | split_map = {}
140 | split_map['train'] = {}
141 | split_map['val'] = {}
142 | split_map['test'] = {}
143 |
144 | for img in split_file['train_id']:
145 | split_map['train'][str(img['img_id'])] = 1
146 | for img in split_file['val_id']:
147 | split_map['val'][str(img['img_id'])] = 1
148 | for img in split_file['test_id']:
149 | split_map['test'][str(img['img_id'])] = 1
150 |
151 | elif params['split'] == 'noc':
152 | split_path = 'data/noc_coco/split_noc_coco.json'
153 | split_file = json.load(open(split_path, 'r'))
154 | split_map = {}
155 | split_map['train'] = {}
156 | split_map['val'] = {}
157 | split_map['test'] = {}
158 |
159 | for img in split_file['train']:
160 | split_map['train'][img] = 1
161 | for img in split_file['val']:
162 | split_map['val'][img] = 1
163 | for img in split_file['val_train']:
164 | split_map['val'][img] = 1
165 | for img in split_file['test']:
166 | split_map['test'][img] = 1
167 | for img in split_file['test_train']:
168 | split_map['test'][img] = 1
169 |
170 | # create output json file
171 | out = {}
172 | out['ix_to_word'] = itow # encode the (1-indexed) vocab
173 | out['wtod'] = wtod
174 | out['wtol'] = wtol
175 | out['images'] = []
176 | count = 0
177 | for i,img in enumerate(imgs):
178 | jimg = {}
179 |
180 | if params['split'] == 'robust' or params['split'] == 'noc':
181 | img_id = str(img['cocoid'])
182 | if img_id in split_map['train']:
183 | jimg['split'] = 'train'
184 | elif img_id in split_map['val']:
185 | jimg['split'] = 'val'
186 | elif img_id in split_map['test']:
187 | jimg['split'] = 'test'
188 | else:
189 | jimg['split'] = 'rest'
190 |
191 | elif params['split'] == 'challenge':
192 | if img['split'] == 'val' and count < 1000: # we use 1000 images from val as validation, and the rest as train.
193 | jimg['split'] = img['split']
194 | count += 1
195 | else:
196 | jimg['split'] = 'train' # put the rest into train.
197 | else:
198 | if img['split'] == 'val' or img['split'] == 'test':
199 | jimg['split'] = img['split']
200 | else:
201 | jimg['split'] = 'train' # put restval into train.
202 |
203 | if 'filename' in img: jimg['file_path'] = os.path.join(img['filepath'], img['filename']) # copy it over, might need
204 | if 'cocoid' in img: jimg['id'] = img['cocoid'] # copy over & maintain an id, if present (e.g. coco ids, useful)
205 | out['images'].append(jimg)
206 |
207 | json.dump(out, open(params['output_dic_json'], 'w'))
208 | print('wrote ', params['output_dic_json'])
209 |
210 | json.dump(imgs_new, open(params['output_cap_json'], 'w'))
211 | print('wrote ', params['output_cap_json'])
212 |
213 | if __name__ == "__main__":
214 | parser = argparse.ArgumentParser()
215 |
216 | # input json
217 | parser.add_argument('--input_json', default='data/coco/dataset_coco.json', help='input json file to process into hdf5')
218 | parser.add_argument('--split', default='normal', help='different split for different task.')
219 |
220 | parser.add_argument('--output_dic_json', default='data/coco_noc/dic_coco_noc_only.json', help='output json file')
221 | parser.add_argument('--output_cap_json', default='data/coco_noc/cap_coco_noc_only.json', help='output json file')
222 |
223 | # options
224 | parser.add_argument('--max_length', default=16, type=int, help='max length of a caption, in number of words. captions longer than this get clipped.')
225 | parser.add_argument('--word_count_threshold', default=5, type=int, help='only words that occur more than this number of times will be put in vocab')
226 |
227 | args = parser.parse_args()
228 | params = vars(args) # convert to ordinary dict
229 | print('parsed input parameters:')
230 | print(json.dumps(params, indent = 2))
231 | main(params)
232 |
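To make the two extra dictionaries concrete, here is an editorial sketch with made-up entries (the real values come from data/coco/coco_class_name.txt and the CoreNLP lemmatizer):

wtod = {'person': 0, 'bicycle': 1, 'bike': 1, 'motorcycle': 3}       # word/phrase -> detection class (row in the class-name file)
wtol = {'bikes': 'bike', 'riding': 'ride', 'mountains': 'mountain'}  # surface word -> lemma

# Looking a caption word up through wtol first is what lets an inflected form
# ('bikes') still hit a detection class name ('bike') downstream.
word = 'bikes'
det_class = wtod.get(wtol.get(word, word))   # -> 1 in this toy example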
--------------------------------------------------------------------------------
/prepro/prepro_dic_flickr.py:
--------------------------------------------------------------------------------
1 | """
2 | Preprocess a raw json dataset into hdf5/json files for use in data_loader.lua
3 |
4 | Input: json file that has the form
5 | [{ file_path: 'path/img.jpg', captions: ['a caption', ...] }, ...]
6 | example element in this list would look like
7 | {'captions': [u'A man with a red helmet on a small moped on a dirt road. ', u'Man riding a motor bike on a dirt road on the countryside.', u'A man riding on the back of a motorcycle.', u'A dirt path with a young person on a motor bike rests to the foreground of a verdant area with a bridge and a background of cloud-wreathed mountains. ', u'A man in a red shirt and a red hat is on a motorcycle on a hill side.'], 'file_path': u'val2014/COCO_val2014_000000391895.jpg', 'id': 391895}
8 |
9 | This script reads this json, does some basic preprocessing on the captions
10 | (e.g. lowercase, etc.), creates a special UNK token, and encodes everything to arrays
11 |
12 | Output: a json file and an hdf5 file
13 | The hdf5 file contains several fields:
14 | /images is (N,3,256,256) uint8 array of raw image data in RGB format
15 | /labels is (M,max_length) uint32 array of encoded labels, zero padded
16 | /label_start_ix and /label_end_ix are (N,) uint32 arrays of pointers to the
17 | first and last indices (in range 1..M) of labels for each image
18 | /label_length stores the length of the sequence for each of the M sequences
19 |
20 | The json file has a dict that contains:
21 | - an 'ix_to_word' field storing the vocab in form {ix:'word'}, where ix is 1-indexed
22 | - an 'images' field that is a list holding auxiliary information for each image,
23 | such as in particular the 'split' it was assigned to.
24 | """
25 | """
26 | To build the prepro files for Neural Baby Talk we need 2 additional dictionaries:
27 | wtol: word to lemma, i.e. the original (uninflected) form of the word.
28 | wtod: word to detection, i.e. the detection label for the word.
29 | """
30 | import os
31 | import json
32 | import argparse
33 | from random import shuffle, seed
34 | import string
35 | # non-standard dependencies:
36 | import h5py
37 | import numpy as np
38 | import torch
39 | import torchvision.models as models
40 | from torch.autograd import Variable
41 | import skimage.io
42 | import pdb
43 | from stanfordcorenlp import StanfordCoreNLP
44 | from nltk.tokenize import word_tokenize
45 |
46 | nlp = StanfordCoreNLP('../stanford-corenlp-full-2017-06-09', memory='8g')
47 | props={'annotators': 'ssplit, tokenize, lemma','pipelineLanguage':'en', 'outputFormat':'json'}
48 |
49 | def build_vocab(imgs, params):
50 | count_thr = params['word_count_threshold']
51 |
52 | # count up the number of words
53 | counts = {}
54 | for img in imgs:
55 | for sent in img['captions']:
56 | sent['tokens'] = [w.lower() for w in sent['tokens']]
57 | for w in sent['tokens']:
58 | counts[w] = counts.get(w, 0) + 1
59 | cw = sorted([(count,w) for w,count in counts.items()], reverse=True)
60 | print('top words and their counts:')
61 | print('\n'.join(map(str,cw[:20])))
62 |
63 | counts[''] = 0
64 | # print some stats
65 | total_words = sum(counts.values())
66 | print('total words:', total_words)
67 | bad_words = [w for w,n in counts.items() if n <= count_thr]
68 | vocab = [w for w,n in counts.items() if n > count_thr]
69 | bad_count = sum(counts[w] for w in bad_words)
70 | print('number of bad words: %d/%d = %.2f%%' % (len(bad_words), len(counts), len(bad_words)*100.0/len(counts)))
71 | print('number of words in vocab would be %d' % (len(vocab), ))
72 | print('number of UNKs: %d/%d = %.2f%%' % (bad_count, total_words, bad_count*100.0/total_words))
73 |
74 | # lets look at the distribution of lengths as well
75 | sent_lengths = {}
76 | for img in imgs:
77 | for sent in img['captions']:
78 | txt = sent['tokens']
79 | nw = len(txt)
80 | sent_lengths[nw] = sent_lengths.get(nw, 0) + 1
81 | max_len = max(sent_lengths.keys())
82 | print('max length sentence in raw data: ', max_len)
83 | print('sentence length distribution (count, number of words):')
84 | sum_len = sum(sent_lengths.values())
85 | for i in range(max_len+1):
86 | print('%2d: %10d %f%%' % (i, sent_lengths.get(i,0), sent_lengths.get(i,0)*100.0/sum_len))
87 |
88 | # lets now produce the final annotations
89 | if bad_count > 0:
90 | # additional special UNK token we will use below to map infrequent words to
91 | print('inserting the special UNK token')
92 | vocab.append('UNK')
93 |
94 | imgs_new = []
95 | for img in imgs:
96 | img['final_captions'] = []
97 | for sent in img['captions']:
98 | txt = sent['tokens']
99 | clss = sent['process_clss']
100 | bbox = sent['process_bnd_box']
101 | idx = sent['process_idx']
102 | caption = [w if counts.get(w,0) > count_thr else 'UNK' for w in txt]
103 | img['final_captions'].append({'caption':caption, 'clss':clss, 'bbox':bbox, 'idx':idx})
104 | imgs_new.append(img['final_captions'])
105 |
106 | return vocab, imgs_new
107 |
108 | def main(params):
109 |
110 | coco_class_all = []
111 | coco_class_name = open(params['input_class_name'], 'r')
112 | for line in coco_class_name:
113 | coco_class = line.rstrip("\n").split(', ')
114 | coco_class_all.append(coco_class)
115 |
116 | # word to detection label
117 | wtod = {}
118 | for i in range(len(coco_class_all)):
119 | for w in coco_class_all[i]:
120 | wtod[w] = i
121 |
122 | imgs_split = json.load(open(params['input_json'], 'r'))
123 | imgs_split = imgs_split['images']
124 |
125 | split = {}
126 | for img in imgs_split:
127 | split[img['filename'].split('.')[0]] = img['split']
128 |
129 | imgs_processed = json.load(open('data/flickr30k/flickr30k_cleaned_class.json', 'r'))
130 | imgs_processed = imgs_processed['annotations']
131 |
132 | for img in imgs_processed:
133 | if str(img['image_id']) in split:
134 | img['split'] = split[str(img['image_id'])]
135 | else:
136 | img['split'] = 'rest'
137 | seed(123) # make reproducible
138 |
139 | # create the vocab
140 | vocab, imgs_new = build_vocab(imgs_processed, params)
141 | itow = {i+1:w for i,w in enumerate(vocab)} # a 1-indexed vocab translation table
142 | wtoi = {w:i+1 for i,w in enumerate(vocab)} # inverse table
143 |
144 | wtol = {}
145 | for w in vocab:
146 | out = json.loads(nlp.annotate(w.encode('utf-8'), properties=props))
147 | lemma_w = out['sentences'][0]['tokens'][0]['lemma']
148 | wtol[w] = lemma_w
149 |
150 | # create output json file
151 | out = {}
152 | out['ix_to_word'] = itow # encode the (1-indexed) vocab
153 | out['wtod'] = wtod
154 | out['wtol'] = wtol
155 | out['images'] = []
156 | for i,img in enumerate(imgs_processed):
157 | jimg = {}
158 | jimg['split'] = img['split']
159 | if params['dataset'] == 'flickr30k':
160 | if 'image_id' in img: jimg['file_path'] = str(img['image_id']) + '.jpg' # copy it over, might need
161 | if 'image_id' in img: jimg['id'] = img['image_id'] # copy over & maintain an id, if present (e.g. coco ids, useful)
162 | elif params['dataset'] == 'coco':
163 | if 'filename' in img: jimg['file_path'] = os.path.join(img['filepath'], img['filename']) # copy it over, might need
164 | if 'cocoid' in img: jimg['id'] = img['cocoid'] # copy over & maintain an id, if present (e.g. coco ids, useful)
165 | out['images'].append(jimg)
166 |
167 | json.dump(out, open(params['outpu_dic_json'], 'w'))
168 | print('wrote ', params['outpu_dic_json'])
169 |
170 | json.dump(imgs_new, open(params['output_cap_json'], 'w'))
171 | print('wrote ', params['output_cap_json'])
172 |
173 | if __name__ == "__main__":
174 | parser = argparse.ArgumentParser()
175 |
176 | # input json
177 | parser.add_argument('--dataset', default='flickr30k', help='dataset')
178 | parser.add_argument('--input_json', default='data/flickr30k/dataset_flickr30k.json', help='input json file to process into hdf5')
179 | parser.add_argument('--input_class_name', default='data/flickr30k/flickr30k_class_name.txt',help='class name')
180 | parser.add_argument('--outpu_dic_json', default='data/flickr30k/dic_flickr30k.json', help='output json file')
181 | parser.add_argument('--output_cap_json', default='data/flickr30k/cap_flickr30k.json', help='output json file')
182 |
183 | # options
184 | parser.add_argument('--max_length', default=20, type=int, help='max length of a caption, in number of words. captions longer than this get clipped.')
185 | parser.add_argument('--word_count_threshold', default=3, type=int, help='only words that occur more than this number of times will be put in vocab')
186 |
187 | args = parser.parse_args()
188 | params = vars(args) # convert to ordinary dict
189 | print('parsed input parameters:')
190 | print(json.dumps(params, indent = 2))
191 | main(params)
192 |
--------------------------------------------------------------------------------
/prepro/prepro_ngrams.py:
--------------------------------------------------------------------------------
1 | """
2 | Preprocess a raw json dataset into hdf5/json files for use in data_loader.lua
3 |
4 | Input: json file that has the form
5 | [{ file_path: 'path/img.jpg', captions: ['a caption', ...] }, ...]
6 | example element in this list would look like
7 | {'captions': [u'A man with a red helmet on a small moped on a dirt road. ', u'Man riding a motor bike on a dirt road on the countryside.', u'A man riding on the back of a motorcycle.', u'A dirt path with a young person on a motor bike rests to the foreground of a verdant area with a bridge and a background of cloud-wreathed mountains. ', u'A man in a red shirt and a red hat is on a motorcycle on a hill side.'], 'file_path': u'val2014/COCO_val2014_000000391895.jpg', 'id': 391895}
8 |
9 | This script reads this json, does some basic preprocessing on the captions
10 | (e.g. lowercase, etc.), creates a special UNK token, and encodes everything to arrays
11 |
12 | Output: a json file and an hdf5 file
13 | The hdf5 file contains several fields:
14 | /images is (N,3,256,256) uint8 array of raw image data in RGB format
15 | /labels is (M,max_length) uint32 array of encoded labels, zero padded
16 | /label_start_ix and /label_end_ix are (N,) uint32 arrays of pointers to the
17 | first and last indices (in range 1..M) of labels for each image
18 | /label_length stores the length of the sequence for each of the M sequences
19 |
20 | The json file has a dict that contains:
21 | - an 'ix_to_word' field storing the vocab in form {ix:'word'}, where ix is 1-indexed
22 | - an 'images' field that is a list holding auxiliary information for each image,
23 | such as in particular the 'split' it was assigned to.
24 | """
25 |
26 | import os
27 | import json
28 | import argparse
29 | from six.moves import cPickle, xrange
30 | from collections import defaultdict
31 | import pdb
32 |
33 | def precook(s, n=4, out=False):
34 | """
35 | Takes a string as input and returns an object that can be given to
36 | either cook_refs or cook_test. This is optional: cook_refs and cook_test
37 | can take string arguments as well.
38 | :param s: string : sentence to be converted into ngrams
39 | :param n: int : number of ngrams for which representation is calculated
41 | :return: term frequency vector for occurring ngrams
41 | """
42 | words = s.split()
43 | counts = defaultdict(int)
44 | for k in xrange(1,n+1):
45 | for i in xrange(len(words)-k+1):
46 | ngram = tuple(words[i:i+k])
47 | counts[ngram] += 1
48 | return counts
49 |
50 | def cook_refs(refs, n=4): ## lhuang: oracle will call with "average"
51 | '''Takes a list of reference sentences for a single segment
52 | and returns an object that encapsulates everything that BLEU
53 | needs to know about them.
54 | :param refs: list of string : reference sentences for some image
55 | :param n: int : number of ngrams for which (ngram) representation is calculated
56 | :return: result (list of dict)
57 | '''
58 | return [precook(ref, n) for ref in refs]
59 |
60 | def create_crefs(refs):
61 | crefs = []
62 | for ref in refs:
63 | # ref is a list of 5 captions
64 | crefs.append(cook_refs(ref))
65 | return crefs
66 |
67 | def compute_doc_freq(crefs):
68 | '''
69 | Compute document frequency for the reference data: for each ngram, the
70 | number of images whose reference captions contain it. This is later used
71 | to compute idf (inverse document frequency).
72 | :return: document_frequency (defaultdict mapping ngram tuples to counts)
73 | '''
74 | document_frequency = defaultdict(float)
75 | for refs in crefs:
76 | # refs, k ref captions of one image
77 | for ngram in set([ngram for ref in refs for (ngram,count) in ref.iteritems()]):
78 | document_frequency[ngram] += 1
79 | # maxcounts[ngram] = max(maxcounts.get(ngram,0), count)
80 | return document_frequency
81 |
82 | def build_dict(imgs, wtoi, params):
83 | wtoi['<eos>'] = 0  # reserve index 0 for the end-of-sentence token
84 |
85 | count_imgs = 0
86 | refs_words = []
87 | refs_idxs = []
88 | for img in imgs:
89 | if (params['split'] == img['split']) or \
90 | (params['split'] == 'train' and img['split'] == 'restval') or \
91 | (params['split'] == 'all'):
92 | #(params['split'] == 'val' and img['split'] == 'restval') or \
93 | ref_words = []
94 | ref_idxs = []
95 | for sent in img['sentences']:
96 | tmp_tokens = sent['tokens'] + ['<eos>']
97 | tmp_tokens = [_ if _ in wtoi else 'UNK' for _ in tmp_tokens]
98 | ref_words.append(' '.join(tmp_tokens))
99 | ref_idxs.append(' '.join([str(wtoi[_]) for _ in tmp_tokens]))
100 |
101 | refs_words.append(ref_words)
102 | refs_idxs.append(ref_idxs)
103 | count_imgs += 1
104 | print('total imgs:', count_imgs)
105 |
106 | ngram_words = compute_doc_freq(create_crefs(refs_words))
107 | ngram_idxs = compute_doc_freq(create_crefs(refs_idxs))
108 | return ngram_words, ngram_idxs, count_imgs
109 |
110 | def main(params):
111 |
112 | imgs = json.load(open(params['input_json'], 'r'))
113 | itow = json.load(open(params['dict_json'], 'r'))['ix_to_word']
114 | wtoi = {w:i for i,w in itow.items()}
115 | imgs = imgs['images']
116 |
117 | ngram_words, ngram_idxs, ref_len = build_dict(imgs, wtoi, params)
118 | cPickle.dump({'document_frequency': ngram_words, 'ref_len': ref_len}, open(params['output_pkl']+'-words.p','w'), protocol=cPickle.HIGHEST_PROTOCOL)
119 | cPickle.dump({'document_frequency': ngram_idxs, 'ref_len': ref_len}, open(params['output_pkl']+'-idxs.p','w'), protocol=cPickle.HIGHEST_PROTOCOL)
120 |
121 | if __name__ == "__main__":
122 |
123 | parser = argparse.ArgumentParser()
124 | # input json
125 | parser.add_argument('--input_json', default='data/flickr30k/dataset_flickr30k.json', help='input json file to process into hdf5')
126 | parser.add_argument('--dict_json', default='data/flickr30k/dic_flickr30k.json', help='output json file')
127 | parser.add_argument('--output_pkl', default='data/flickr30k-train', help='output pickle file')
128 | parser.add_argument('--split', default='train', help='test, val, train, all')
129 | args = parser.parse_args()
130 | params = vars(args) # convert to ordinary dict
131 |
132 | main(params)
133 |
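A tiny worked example (editorial) of what precook() produces; it reuses the same counting logic on a five-word sentence, with n limited to bigrams for brevity.

from collections import defaultdict

def precook(s, n=4):
    words = s.split()
    counts = defaultdict(int)
    for k in range(1, n + 1):
        for i in range(len(words) - k + 1):
            counts[tuple(words[i:i + k])] += 1
    return counts

tf = precook('a man riding a bike', n=2)
# The repeated unigram is counted twice, every bigram once:
# tf[('a',)] == 2, tf[('man',)] == 1, tf[('a', 'man')] == 1, tf[('riding', 'a')] == 1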
--------------------------------------------------------------------------------
/prepro/prepro_ngrams_bak.py:
--------------------------------------------------------------------------------
1 | """
2 | Preprocess a raw json dataset into hdf5/json files for use in data_loader.lua
3 |
4 | Input: json file that has the form
5 | [{ file_path: 'path/img.jpg', captions: ['a caption', ...] }, ...]
6 | example element in this list would look like
7 | {'captions': [u'A man with a red helmet on a small moped on a dirt road. ', u'Man riding a motor bike on a dirt road on the countryside.', u'A man riding on the back of a motorcycle.', u'A dirt path with a young person on a motor bike rests to the foreground of a verdant area with a bridge and a background of cloud-wreathed mountains. ', u'A man in a red shirt and a red hat is on a motorcycle on a hill side.'], 'file_path': u'val2014/COCO_val2014_000000391895.jpg', 'id': 391895}
8 |
9 | This script reads this json, does some basic preprocessing on the captions
10 | (e.g. lowercase, etc.), creates a special UNK token, and encodes everything to arrays
11 |
12 | Output: a json file and an hdf5 file
13 | The hdf5 file contains several fields:
14 | /images is (N,3,256,256) uint8 array of raw image data in RGB format
15 | /labels is (M,max_length) uint32 array of encoded labels, zero padded
16 | /label_start_ix and /label_end_ix are (N,) uint32 arrays of pointers to the
17 | first and last indices (in range 1..M) of labels for each image
18 | /label_length stores the length of the sequence for each of the M sequences
19 |
20 | The json file has a dict that contains:
21 | - an 'ix_to_word' field storing the vocab in form {ix:'word'}, where ix is 1-indexed
22 | - an 'images' field that is a list holding auxiliary information for each image,
23 | such as in particular the 'split' it was assigned to.
24 | """
25 | import sys
26 | import os
27 | sys.path.append(os.getcwd())
28 |
29 | import json
30 | import argparse
31 | from six.moves import cPickle, xrange
32 | from collections import defaultdict
33 | from pycocotools.coco import COCO
34 | import numpy as np
35 | import copy
36 | import pdb
37 |
38 | def precook(s, n=4, out=False):
39 | """
40 | Takes a string as input and returns an object that can be given to
41 | either cook_refs or cook_test. This is optional: cook_refs and cook_test
42 | can take string arguments as well.
43 | :param s: string : sentence to be converted into ngrams
44 | :param n: int : number of ngrams for which representation is calculated
45 | :return: term frequency vector for occurring ngrams
46 | """
47 | words = s.split()
48 | counts = defaultdict(int)
49 | for k in xrange(1,n+1):
50 | for i in xrange(len(words)-k+1):
51 | ngram = tuple(words[i:i+k])
52 | counts[ngram] += 1
53 | return counts
54 |
55 | def cook_refs(refs, n=4): ## lhuang: oracle will call with "average"
56 | '''Takes a list of reference sentences for a single segment
57 | and returns an object that encapsulates everything that BLEU
58 | needs to know about them.
59 | :param refs: list of string : reference sentences for some image
60 | :param n: int : number of ngrams for which (ngram) representation is calculated
61 | :return: result (list of dict)
62 | '''
63 | return [precook(ref, n) for ref in refs]
64 |
65 | def create_crefs(refs):
66 | crefs = []
67 | for ref in refs:
68 | # ref is a list of 5 captions
69 | crefs.append(cook_refs(ref))
70 | return crefs
71 |
72 | def compute_doc_freq(crefs):
73 | '''
74 | Compute document frequency for the reference data: for each ngram, the
75 | number of images whose reference captions contain it. This is later used
76 | to compute idf (inverse document frequency).
77 | :return: document_frequency (defaultdict mapping ngram tuples to counts)
78 | '''
79 | document_frequency = defaultdict(float)
80 | for refs in crefs:
81 | # refs, k ref captions of one image
82 | for ngram in set([ngram for ref in refs for (ngram,count) in ref.iteritems()]):
83 | document_frequency[ngram] += 1
84 | # maxcounts[ngram] = max(maxcounts.get(ngram,0), count)
85 | return document_frequency
86 |
87 | def build_dict(imgs, info, wtoi, wtod, dtoi, wtol, ctol, coco_det_train, coco_det_val, params):
88 | vocab_size = len(wtoi)+1
89 | seq_length = 16
90 | wtoi['<eos>'] = 0
91 | wtol['<eos>'] = '<eos>'
92 | count_imgs = 0
93 |
94 | refs_words = []
95 | refs_idxs = []
96 | for idx, img in enumerate(imgs):
97 | image_id = info['images'][idx]['id']
98 | # image_id = img['cocoid']
99 | file_path = info['images'][idx]['file_path'].split('/')[0]
100 |
101 | if file_path == 'train2014':
102 | coco = coco_det_train
103 | else:
104 | coco = coco_det_val
105 |
106 | bbox_ann_ids = coco.getAnnIds(imgIds=image_id)
107 | bbox_ann = [{'label': ctol[i['category_id']], 'bbox': i['bbox']} for i in coco.loadAnns(bbox_ann_ids)]
108 |
109 | if (params['split'] == info['images'][idx]['split']) or \
110 | (params['split'] == 'train' and info['images'][idx]['split'] == 'restval') or \
111 | (params['split'] == 'all'):
112 | #(params['split'] == 'val' and img['split'] == 'restval') or \
113 | ref_words = []
114 | ref_idxs = []
115 |
116 | captions = []
117 | for sent in img:
118 | captions.append(sent + ['<eos>'])
119 | det_indicator = get_det_word(bbox_ann, captions, wtoi, wtod, dtoi, wtol)
120 |
121 | ncap = len(captions) # number of captions available for this image
122 | for i, caption in enumerate(captions):
123 | tmp_tokens = []
124 | j = 0
125 | k = 0
126 | while j < len(caption):
127 | is_det = False
128 | for n in range(2, 0, -1):
129 | if det_indicator[n][i][j][0] != 0:
130 | tmp_tokens.append(vocab_size + det_indicator[n][i][j][2] * 2 + det_indicator[n][i][j][1])
131 | is_det = True
132 | j += n # skip the ngram.
133 | break
134 | if is_det == False:
135 | tmp_tokens.append(wtoi[caption[j]])
136 | j += 1
137 | k += 1
138 | ref_idxs.append(' '.join([str(int(_)) for _ in tmp_tokens]))
139 | # refs_words.append(ref_words)
140 | refs_idxs.append(ref_idxs)
141 | count_imgs += 1
142 |
143 | print('total imgs:', count_imgs)
144 |
145 | # ngram_words = compute_doc_freq(create_crefs(refs_words))
146 | ngram_idxs = compute_doc_freq(create_crefs(refs_idxs))
147 | return ngram_idxs, count_imgs
148 |
149 |
150 | def get_det_word(bbox_ann, captions, wtoi, wtod, dtoi, wtol, ngram=2):
151 |
152 | # get the present category.
153 | pcats = [box['label'] for box in bbox_ann]
154 |
155 | # get the original (lemma) form of the caption.
156 | indicator = []
157 | stem_caption = []
158 | for s in captions:
159 | tmp = []
160 | for w in s:
161 | tmp.append(wtol[w])
162 | stem_caption.append(tmp)
163 | indicator.append([(0, 0, 0)]*len(s)) # category class, binary class, fine-grain class.
164 |
165 | ngram_indicator = {i+1:copy.deepcopy(indicator) for i in range(ngram)}
166 | # get the 2 gram of the caption.
167 | for n in range(ngram,0,-1):
168 | for i, s in enumerate(stem_caption):
169 | for j in xrange(len(s)-n+1):
170 | ng = ' '.join(s[j:j+n])
171 | # if the n-gram exist in word_to_detection dictionary.
172 | if ng in wtod and indicator[i][j][0] == 0 and wtod[ng] in pcats: # make sure a larger n-gram is not overwritten by a smaller one.
173 | bn = (ng != ' '.join(captions[i][j:j+n])) + 1
174 | fg = dtoi[ng]
175 | ngram_indicator[n][i][j] = (wtod[ng], bn, fg)
176 | indicator[i][j:j+n] = [(wtod[ng], bn, fg)] * n
177 |
178 | return ngram_indicator
179 |
180 | def main(params):
181 |
182 | det_train_path = 'data/coco/annotations/instances_train2014.json'
183 | det_val_path = 'data/coco/annotations/instances_val2014.json'
184 |
185 | coco_det_train = COCO(det_train_path)
186 | coco_det_val = COCO(det_val_path)
187 |
188 | info = json.load(open(params['dict_json'], 'r'))
189 | imgs = json.load(open(params['input_json'], 'r'))
190 |
191 | itow = info['ix_to_word']
192 | wtoi = {w:i for i,w in itow.items()}
193 | wtod = {w:i+1 for w,i in info['wtod'].items()} # word to detection
194 | dtoi = {w:i+1 for i,w in enumerate(wtod.keys())} # detection to index
195 | wtol = info['wtol']
196 | ctol = {c:i+1 for i, c in enumerate(coco_det_train.cats.keys())}
197 |
198 | # imgs = imgs['images']
199 |
200 | ngram_idxs, ref_len = build_dict(imgs, info, wtoi, wtod, dtoi, wtol, ctol, coco_det_train, coco_det_val, params)
201 |
202 | # cPickle.dump({'document_frequency': ngram_words, 'ref_len': ref_len}, open(params['output_pkl']+'-words.p','w'), protocol=cPickle.HIGHEST_PROTOCOL)
203 | cPickle.dump({'document_frequency': ngram_idxs, 'ref_len': ref_len}, open(params['output_pkl']+'-idxs.p','w'), protocol=cPickle.HIGHEST_PROTOCOL)
204 |
205 | if __name__ == "__main__":
206 |
207 | parser = argparse.ArgumentParser()
208 |
209 | # input json
210 | parser.add_argument('--input_json', default='data/coco/cap_coco.json', help='input json file to process into hdf5')
211 | parser.add_argument('--dict_json', default='data/coco/dic_coco.json', help='output json file')
212 | parser.add_argument('--output_pkl', default='data/coco-train', help='output pickle file')
213 | parser.add_argument('--split', default='train', help='test, val, train, all')
214 | args = parser.parse_args()
215 | params = vars(args) # convert to ordinary dict
216 |
217 | main(params)
218 |
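As a quick arithmetic illustration (editorial, with made-up numbers) of the extended token id built in build_dict above, where detection words are mapped outside the normal word vocabulary:

vocab_size = 9488   # len(wtoi) + 1 in the real script (value made up here)
fg = 12             # fine-grained detection index, dtoi[ngram]
bn = 1              # 1 = the caption word already equals its lemma, 2 = it is an inflected form
token_id = vocab_size + fg * 2 + bn
print(token_id)     # 9513: one id per (detection word, form) pair, above the word-vocabulary range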
--------------------------------------------------------------------------------
/prepro/prepro_ngrams_flickr30k.py:
--------------------------------------------------------------------------------
1 | """
2 | Preprocess a raw json dataset into hdf5/json files for use in data_loader.lua
3 |
4 | Input: json file that has the form
5 | [{ file_path: 'path/img.jpg', captions: ['a caption', ...] }, ...]
6 | example element in this list would look like
7 | {'captions': [u'A man with a red helmet on a small moped on a dirt road. ', u'Man riding a motor bike on a dirt road on the countryside.', u'A man riding on the back of a motorcycle.', u'A dirt path with a young person on a motor bike rests to the foreground of a verdant area with a bridge and a background of cloud-wreathed mountains. ', u'A man in a red shirt and a red hat is on a motorcycle on a hill side.'], 'file_path': u'val2014/COCO_val2014_000000391895.jpg', 'id': 391895}
8 |
9 | This script reads this json, does some basic preprocessing on the captions
10 | (e.g. lowercase, etc.), creates a special UNK token, and encodes everything to arrays
11 |
12 | Output: a json file and an hdf5 file
13 | The hdf5 file contains several fields:
14 | /images is (N,3,256,256) uint8 array of raw image data in RGB format
15 | /labels is (M,max_length) uint32 array of encoded labels, zero padded
16 | /label_start_ix and /label_end_ix are (N,) uint32 arrays of pointers to the
17 | first and last indices (in range 1..M) of labels for each image
18 | /label_length stores the length of the sequence for each of the M sequences
19 |
20 | The json file has a dict that contains:
21 | - an 'ix_to_word' field storing the vocab in form {ix:'word'}, where ix is 1-indexed
22 | - an 'images' field that is a list holding auxiliary information for each image,
23 | such as in particular the 'split' it was assigned to.
24 | """
25 | import sys
26 | import os
27 | sys.path.append(os.getcwd())
28 |
29 | import json
30 | import argparse
31 | from six.moves import cPickle, xrange
32 | from collections import defaultdict
33 | from pycocotools.coco import COCO
34 | import numpy as np
35 | import copy
36 | import pdb
37 |
38 | def precook(s, n=4, out=False):
39 | """
40 | Takes a string as input and returns an object that can be given to
41 | either cook_refs or cook_test. This is optional: cook_refs and cook_test
42 | can take string arguments as well.
43 | :param s: string : sentence to be converted into ngrams
44 | :param n: int : number of ngrams for which representation is calculated
45 | :return: term frequency vector for occurring ngrams
46 | """
47 | words = s.split()
48 | counts = defaultdict(int)
49 | for k in xrange(1,n+1):
50 | for i in xrange(len(words)-k+1):
51 | ngram = tuple(words[i:i+k])
52 | counts[ngram] += 1
53 | return counts
54 |
55 | def cook_refs(refs, n=4): ## lhuang: oracle will call with "average"
56 | '''Takes a list of reference sentences for a single segment
57 | and returns an object that encapsulates everything that BLEU
58 | needs to know about them.
59 | :param refs: list of string : reference sentences for some image
60 | :param n: int : number of ngrams for which (ngram) representation is calculated
61 | :return: result (list of dict)
62 | '''
63 | return [precook(ref, n) for ref in refs]
64 |
65 | def create_crefs(refs):
66 | crefs = []
67 | for ref in refs:
68 | # ref is a list of 5 captions
69 | crefs.append(cook_refs(ref))
70 | return crefs
71 |
72 | def compute_doc_freq(crefs):
73 | '''
74 | Compute document frequency for the reference data: for each ngram, the
75 | number of images whose reference captions contain it. This is later used
76 | to compute idf (inverse document frequency).
77 | :return: document_frequency (defaultdict mapping ngram tuples to counts)
78 | '''
79 | document_frequency = defaultdict(float)
80 | for refs in crefs:
81 | # refs, k ref captions of one image
82 | for ngram in set([ngram for ref in refs for (ngram,count) in ref.iteritems()]):
83 | document_frequency[ngram] += 1
84 | # maxcounts[ngram] = max(maxcounts.get(ngram,0), count)
85 | return document_frequency
86 |
87 | def build_dict(imgs, info, wtoi, wtod, dtoi, wtol, itod, params):
88 | vocab_size = len(wtoi)+1
89 | seq_length = 16
90 | wtoi['<eos>'] = 0
91 | wtol['<eos>'] = '<eos>'
92 | count_imgs = 0
93 |
94 | refs_words = []
95 | refs_idxs = []
96 | for idx, img in enumerate(imgs):
97 | image_id = info['images'][idx]['id']
98 | # image_id = img['cocoid']
99 | file_path = info['images'][idx]['file_path'].split('/')[0]
100 | bbox_ann = []
101 | bbox_idx = 0
102 | for sent in img:
103 | sent['bbox_idx'] = []
104 | for i, box in enumerate(sent['bbox']):
105 | sent['bbox_idx'].append(bbox_idx)
106 | bbox_ann.append({'bbox':box, 'label': dtoi[sent['clss'][i]], 'bbox_idx':bbox_idx})
107 | bbox_idx += 1
108 | gt_bboxs = np.zeros((len(bbox_ann), 6))
109 | for i, bbox in enumerate(bbox_ann):
110 | gt_bboxs[i, :4] = bbox['bbox']
111 | gt_bboxs[i, 4] = bbox['label']
112 | gt_bboxs[i, 5] = bbox['bbox_idx']
113 |
114 | if (params['split'] == info['images'][idx]['split']) or \
115 | (params['split'] == 'train' and info['images'][idx]['split'] == 'restval') or \
116 | (params['split'] == 'all'):
117 | #(params['split'] == 'val' and img['split'] == 'restval') or \
118 | ref_words = []
119 | ref_idxs = []
120 |
121 | captions = []
122 | for sent in img:
123 | sent['caption'] = sent['caption'] + ['<eos>']
124 | sent['caption'] = [_ if _ in wtoi else 'UNK' for _ in sent['caption']]
125 | captions.append(sent)
126 |
127 | det_indicator = get_det_word(gt_bboxs, captions, wtod, dtoi)
128 |
129 | ncap = len(captions) # number of captions available for this image
130 | for i, caption in enumerate(captions):
131 | tmp_tokens = []
132 | j = 0
133 | while j < len(caption['caption']):
134 | if det_indicator[i][j][0] != 0:
135 | tmp_tokens.append(vocab_size + det_indicator[i][j][2] * 2 + det_indicator[i][j][1]-1)
136 | else:
137 | tmp_tokens.append(wtoi[caption['caption'][j]])
138 | j += 1
139 | ref_idxs.append(' '.join([str(int(_)) for _ in tmp_tokens]))
140 | # refs_words.append(ref_words)
141 | refs_idxs.append(ref_idxs)
142 | count_imgs += 1
143 |
144 | print('total imgs:', count_imgs)
145 |
146 | # ngram_words = compute_doc_freq(create_crefs(refs_words))
147 | ngram_idxs = compute_doc_freq(create_crefs(refs_idxs))
148 | return ngram_idxs, count_imgs
149 |
150 |
151 | def get_det_word(gt_bboxs, captions, wtod, dtoi):
152 |
153 | # get the present category.
154 | pcats = []
155 | for i in range(gt_bboxs.shape[0]):
156 | pcats.append(gt_bboxs[i,5])
157 | # get the original form of the caption.
158 | indicator = []
159 | for i, sent in enumerate(captions):
160 | indicator.append([(0, 0, 0)]*len(sent['caption'])) # category class, binary class, fine-grain class.
161 | for j, bbox_idx in enumerate(sent['bbox_idx']):
162 | # if the bbox_idx is not filtered out.
163 | if bbox_idx in pcats:
164 | w_idx = sent['idx'][j]
165 | ng = sent['clss'][j]
166 | bn = (ng != sent['caption'][w_idx]) + 1
167 | fg = dtoi[ng]
168 | indicator[i][w_idx] = (wtod[sent['clss'][j]], bn, fg)
169 |
170 | return indicator
171 |
172 | def main(params):
173 |
174 | info = json.load(open(params['dict_json'], 'r'))
175 | imgs = json.load(open(params['input_json'], 'r'))
176 |
177 | itow = info['ix_to_word']
178 | wtoi = {w:i for i,w in itow.items()}
179 | wtod = {w:i+1 for w,i in info['wtod'].items()} # word to detection
180 | # dtoi = {w:i+1 for i,w in enumerate(wtod.keys())} # detection to index
181 | dtoi = wtod
182 | wtol = info['wtol']
183 | itod = {i:w for w,i in dtoi.items()}
184 |
185 | # imgs = imgs['images']
186 |
187 | ngram_idxs, ref_len = build_dict(imgs, info, wtoi, wtod, dtoi, wtol, itod, params)
188 |
189 | # cPickle.dump({'document_frequency': ngram_words, 'ref_len': ref_len}, open(params['output_pkl']+'-words.p','w'), protocol=cPickle.HIGHEST_PROTOCOL)
190 | cPickle.dump({'document_frequency': ngram_idxs, 'ref_len': ref_len}, open(params['output_pkl']+'-idxs.p','w'), protocol=cPickle.HIGHEST_PROTOCOL)
191 |
192 | if __name__ == "__main__":
193 |
194 | parser = argparse.ArgumentParser()
195 |
196 | # input json
197 | parser.add_argument('--input_json', default='data/flickr30k/cap_flickr30k.json', help='input json file to process into hdf5')
198 | parser.add_argument('--dict_json', default='data/flickr30k/dic_flickr30k.json', help='output json file')
199 | parser.add_argument('--output_pkl', default='data/flickr30k-train', help='output pickle file')
200 | parser.add_argument('--split', default='train', help='test, val, train, all')
201 | args = parser.parse_args()
202 | params = vars(args) # convert to ordinary dict
203 |
204 | main(params)
205 |
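For reference, a minimal sketch of how this preprocessing step can be driven from Python; the paths simply mirror the argparse defaults above, and the call assumes `main` is importable from this script. Note that caption words matching an annotated detection class are mapped by `get_det_word` to indices beyond the word vocabulary (`vocab_size + 2*class_index + form - 1`), so detection slots get their own n-gram statistics.

```python
# Hedged usage sketch: paths follow the argparse defaults above and may
# need to be adapted to your data layout.
params = {
    'input_json': 'data/flickr30k/cap_flickr30k.json',  # captions with bbox/class annotations
    'dict_json':  'data/flickr30k/dic_flickr30k.json',  # ix_to_word / wtod / wtol dictionaries
    'output_pkl': 'data/flickr30k-train',               # writes data/flickr30k-train-idxs.p
    'split':      'train',                               # one of: train, val, test, all
}
main(params)  # dumps {'document_frequency': ..., 'ref_len': ...} for the CIDEr-D scorer
```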
--------------------------------------------------------------------------------
/tools/pycider/PyDataFormat/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'rama'
2 |
--------------------------------------------------------------------------------
/tools/pycider/PyDataFormat/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/tools/pycider/PyDataFormat/__init__.pyc
--------------------------------------------------------------------------------
/tools/pycider/PyDataFormat/jsonify_refs.py:
--------------------------------------------------------------------------------
1 | """
2 | Code to convert mat file with structures into json files
3 | Created on : 5/18/15 3:27 PM by rama
4 | """
5 |
6 | import scipy.io as io
7 | import os
8 | import re
9 | import json
10 | import string
11 | import pdb
12 |
13 | pathToMat = '/Users/rama/Research/data/pyCider/'
14 | matfile = 'pascal_cands.mat'
15 | jsonfile = 'pascal_cands'
16 |
17 | data = io.loadmat(os.path.join(pathToMat, matfile))
18 | refs = list(data['cands'][0])
19 |
20 | A = []
21 | B = []
22 |
23 | for image in refs:
24 | for sentences in image[1]:
25 | for i, sent in enumerate(sentences):
26 | sent_struct = {}
27 | imname = str(image[0][0]).split('/')[-1]
28 | sent_struct['image_id'] = imname
29 | string_sent = sent[0].strip().split('\\')
30 | if len(string_sent) == 1:
31 | sent_struct['caption'] = string_sent[0]
32 | else:
33 | sent_struct['caption'] = ' '.join(string_sent[:-1])
34 | if i == 1:
35 | A.append(sent_struct)
36 | else:
37 | B.append(sent_struct)
38 |
39 | with open(os.path.join(pathToMat, jsonfile + 'A.json'), 'w') as outfile:
40 | json.dump(A, outfile)
41 |
42 | with open(os.path.join(pathToMat, jsonfile + 'B.json'), 'w') as outfile:
43 | json.dump(B, outfile)
44 |
--------------------------------------------------------------------------------
/tools/pycider/PyDataFormat/loadData.py:
--------------------------------------------------------------------------------
1 | """
2 | Load the reference and candidate json files, which are to be evaluated using CIDEr.
3 |
4 | Reference file: list of dict('image_id': image_id, 'caption': caption).
5 | Candidate file: list of dict('image_id': image_id, 'caption': caption).
6 |
7 | """
8 | import json
9 | import os
10 | from collections import defaultdict
11 |
12 | class LoadData():
13 | def __init__(self, path):
14 | self.pathToData = path
15 |
16 | def readJson(self, refname, candname):
17 |
18 | path_to_ref_file = os.path.join(self.pathToData, refname)
19 | path_to_cand_file = os.path.join(self.pathToData, candname)
20 |
21 | ref_list = json.loads(open(path_to_ref_file, 'r').read())
22 | cand_list = json.loads(open(path_to_cand_file, 'r').read())
23 |
24 | gts = defaultdict(list)
25 | res = []
26 |
27 | for l in ref_list:
28 | gts[l['image_id']].append({"caption": l['caption']})
29 |
30 | res = cand_list;
31 | return gts, res
32 |
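To make the expected layout concrete, here is a small illustrative sketch of the two JSON files described in the docstring above; the file names, image ids, and captions are made up.

```python
import json

# Hypothetical reference file: several entries may share the same image_id;
# readJson() groups them into gts[image_id] = [{'caption': ...}, ...].
refs = [
    {"image_id": "img_001", "caption": "a dog runs across the grass"},
    {"image_id": "img_001", "caption": "a brown dog is running outside"},
]

# Hypothetical candidate file: one entry per generated caption;
# readJson() returns this list unchanged as res.
cands = [
    {"image_id": "img_001", "caption": "a dog running in a field"},
]

with open("data/example_refs.json", "w") as f:
    json.dump(refs, f)
with open("data/example_cands.json", "w") as f:
    json.dump(cands, f)
```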
--------------------------------------------------------------------------------
/tools/pycider/PyDataFormat/loadData.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/tools/pycider/PyDataFormat/loadData.pyc
--------------------------------------------------------------------------------
/tools/pycider/README.md:
--------------------------------------------------------------------------------
1 | Consensus-based Image Description Evaluation (CIDEr Code)
2 | ===================
3 |
4 | Evaluation code for CIDEr metric. Provides CIDEr as well as
5 | CIDEr-D (CIDEr Defended) which is more robust to gaming effects.
6 |
7 | ## Important Note ##
8 | CIDEr by default (with idf parameter set to "corpus" mode) computes IDF values using the reference sentences provided. Thus, the CIDEr score for a reference dataset with only one image will be zero. When evaluating using one (or few) images, set idf to "coco-val-df" instead, which uses IDF from the MSCOCO Validation Dataset for reliable results.
9 |
10 | ## Requirements ##
11 | - java 1.8.0
12 | - python 2.7
13 |
14 | For running the ipython notebook file, update your Ipython to [Jupyter](https://jupyter.org/)
15 |
16 | ## Files ##
17 | ./
18 | - cidereval.py (demo script)
19 |
20 | ./PyDataFormat
21 | - loadData.py (load the json files for references and candidates)
22 |
23 | - {$result\_file}.json (file with the CIDEr and CIDEr-D scores)
24 |
25 | ./pyciderevalcap: The folder where all evaluation code is stored.
26 | - eval.py: Performs tokenization and runs both the metrics
27 | - tokenizer: Python wrapper of Stanford CoreNLP PTBTokenizer
28 | - cider: CIDEr evaluation codes
29 | - ciderD: CIDEr-D evaluation codes
30 |
31 | ## Instructions ##
32 | 1. Edit the params.json file to contain the paths to the reference and candidate json files, and the result file where the scores are stored\*.
33 | 2. Set the "idf" value in params.json to "corpus" if not evaluating on a single image/instance. Set the "idf" value to "coco-val-df" if evaluating on a single image; in this case IDF values from the MSCOCO dataset are used. If using some other corpus, get the document frequencies into a similar format as "coco-val-df", put them in the data/ folder as a pickle file, and set "idf" to the name of that file (without the '.p' extension).
34 | 3. Sample json reference and candidate files are pascal50S.json and pascal_candsB.json
35 | 4. CIDEr scores are stored in "scores" variable: scores['CIDEr'] -> CIDEr scores, scores['CIDErD'] -> CIDEr-D scores
36 |
37 | *Even when evaluating with independent candidates/references (e.g., when using "coco-val-df"), put multiple candidate and reference entries into the same json files. This is much faster than keeping separate candidate and reference files and calling the evaluation code separately on each pair.
38 | ## References ##
39 |
40 | - PTBTokenizer: We use the [Stanford Tokenizer](http://nlp.stanford.edu/software/tokenizer.shtml) which is included in [Stanford CoreNLP 3.4.1](http://nlp.stanford.edu/software/corenlp.shtml).
41 | - CIDEr: [CIDEr: Consensus-based Image Description Evaluation](http://arxiv.org/pdf/1411.5726.pdf)
42 |
43 | ## Developers ##
44 | - Ramakrishna Vedantam (Virginia Tech)
45 |
46 | ## Acknowledgments ##
47 | - MS COCO Caption Evaluation Team
48 |
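A condensed sketch of the workflow in steps 1-4 above, run from the tools/pycider directory under the Python 2 setup listed in the requirements; this is essentially what the bundled cidereval.py demo does, using the sample files named above, and the score values in the comment are purely illustrative.

```python
import json
from PyDataFormat.loadData import LoadData
from pyciderevalcap.eval import CIDErEvalCap

gts, res = LoadData('data/').readJson('pascal50S.json', 'pascal_candsB.json')
scorer = CIDErEvalCap(gts, res, 'coco-val-df')   # idf mode, as set in params.json
scores = scorer.evaluate()
# scores is a dict of per-candidate lists, e.g. (values illustrative only):
# {'CIDEr': [0.53, ...], 'CIDErD': [0.44, ...]}
json.dump(scores, open('results.json', 'w'))
```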
--------------------------------------------------------------------------------
/tools/pycider/cidereval.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "# demo script for running CIDEr\n",
12 | "from PyDataFormat.loadData import LoadData\n",
13 | "import pdb\n",
14 | "import json\n",
15 | "from pyciderevalcap.eval import CIDErEvalCap as ciderEval\n",
16 | "from collections import defaultdict\n",
17 | "\n",
18 | "pathToData = './data/'\n",
19 | "\n",
20 | "refName = 'pascal50S.json'\n",
21 | "candName = 'pascal_candsB.json'\n",
22 | "\n",
23 | "result_file = 'results.json'"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 13,
29 | "metadata": {
30 | "collapsed": false
31 | },
32 | "outputs": [],
33 | "source": [
34 | "# load reference and candidate sentences\n",
35 | "loadDat = LoadData(pathToData)\n",
36 | "gts, res = loadDat.readJson(refName, candName)\n",
37 | "\n",
38 | "#res = res[:100]\n",
39 | "#gts = {img['image_id']: gts[img['image_id']] for img in res}"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 15,
45 | "metadata": {
46 | "collapsed": false
47 | },
48 | "outputs": [],
49 | "source": [
50 | "from pyciderevalcap.ciderD.ciderD import CiderD\n",
51 | "from pyciderevalcap.cider.cider import Cider\n",
52 | "from pyciderevalcap.tokenizer.ptbtokenizer import PTBTokenizer\n",
53 | "tokenizer = PTBTokenizer('gts')\n",
54 | "_gts = tokenizer.tokenize(gts)\n",
55 | "tokenizer = PTBTokenizer('res')\n",
56 | "_res = tokenizer.tokenize(res)\n"
57 | ]
58 | },
59 | {
60 | "cell_type": "code",
61 | "execution_count": null,
62 | "metadata": {
63 | "collapsed": true
64 | },
65 | "outputs": [],
66 | "source": [
67 | "scorer = Cider(df='coco-val')\n",
68 | "scorerD = CiderD(df='coco-val')"
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": 16,
74 | "metadata": {
75 | "collapsed": false
76 | },
77 | "outputs": [
78 | {
79 | "name": "stdout",
80 | "output_type": "stream",
81 | "text": [
82 | "0.535560513246\n",
83 | "0.448542862876\n"
84 | ]
85 | }
86 | ],
87 | "source": [
88 | "score, scores = scorer.compute_score(_gts, _res)\n",
89 | "scoreD, scoresD = scorerD.compute_score(_gts, _res)\n",
90 | "print score\n",
91 | "print scoreD"
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": null,
97 | "metadata": {
98 | "collapsed": true
99 | },
100 | "outputs": [],
101 | "source": []
102 | }
103 | ],
104 | "metadata": {
105 | "kernelspec": {
106 | "display_name": "Python 2",
107 | "language": "python",
108 | "name": "python2"
109 | },
110 | "language_info": {
111 | "codemirror_mode": {
112 | "name": "ipython",
113 | "version": 2
114 | },
115 | "file_extension": ".py",
116 | "mimetype": "text/x-python",
117 | "name": "python",
118 | "nbconvert_exporter": "python",
119 | "pygments_lexer": "ipython2",
120 | "version": "2.7.11"
121 | }
122 | },
123 | "nbformat": 4,
124 | "nbformat_minor": 0
125 | }
126 |
--------------------------------------------------------------------------------
/tools/pycider/cidereval.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | # In[1]:
4 |
5 | # demo script for running CIDEr
6 | import json
7 | from PyDataFormat.loadData import LoadData
8 | from pyciderevalcap.eval import CIDErEvalCap as ciderEval
9 |
10 | # load the configuration file
11 | config = json.loads(open('params.json', 'r').read())
12 |
13 | # Print the parameters
14 | print("""Running CIDEr with the following settings
15 | *****************************
16 | Reference File:{refName}
17 | Candidate File:{candName}
18 | Result File:{resultFile}
19 | IDF:{idf}
20 | *****************************""".format(**config))
21 |
22 | pathToData = config['pathToData']
23 | refName = config['refName']
24 | candName = config['candName']
25 | resultFile = config['resultFile']
26 | df_mode = config['idf']
27 |
28 | # In[2]:
29 |
30 | # load reference and candidate sentences
31 | loadDat = LoadData(pathToData)
32 | gts, res = loadDat.readJson(refName, candName)
33 |
34 |
35 | # In[3]:
36 |
37 | # calculate cider scores
38 | scorer = ciderEval(gts, res, df_mode)
39 | # scores: dict of list with key = metric and value = score given to each
40 | # candidate
41 | scores = scorer.evaluate()
42 |
43 |
44 | # In[7]:
45 |
46 | # scores['CIDEr'] contains CIDEr scores in a list for each candidate
47 | # scores['CIDErD'] contains CIDEr-D scores in a list for each candidate
48 |
49 | with open(resultFile, 'w') as outfile:
50 | json.dump(scores, outfile)
51 |
--------------------------------------------------------------------------------
/tools/pycider/license.txt:
--------------------------------------------------------------------------------
1 | Copyright (c) 2015, Xinlei Chen, Hao Fang, Tsung-Yi Lin, and Ramakrishna Vedantam
2 | All rights reserved.
3 |
4 | Redistribution and use in source and binary forms, with or without
5 | modification, are permitted provided that the following conditions are met:
6 |
7 | 1. Redistributions of source code must retain the above copyright notice, this
8 | list of conditions and the following disclaimer.
9 | 2. Redistributions in binary form must reproduce the above copyright notice,
10 | this list of conditions and the following disclaimer in the documentation
11 | and/or other materials provided with the distribution.
12 |
13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
14 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
15 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
16 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
17 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
18 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
19 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
20 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
21 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
22 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23 |
24 | The views and conclusions contained in the software and documentation are those
25 | of the authors and should not be interpreted as representing official policies,
26 | either expressed or implied, of the FreeBSD Project.
27 |
--------------------------------------------------------------------------------
/tools/pycider/params.json:
--------------------------------------------------------------------------------
1 | {
2 | "pathToData" : "data/",
3 | "refName" : "pascal50S.json",
4 | "candName" : "pascal_candsB.json",
5 | "resultFile" : "results.json",
6 | "idf" : "coco-val-df"
7 | }
8 |
--------------------------------------------------------------------------------
/tools/pycider/pyciderevalcap/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'tylin'
2 |
--------------------------------------------------------------------------------
/tools/pycider/pyciderevalcap/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/tools/pycider/pyciderevalcap/__init__.pyc
--------------------------------------------------------------------------------
/tools/pycider/pyciderevalcap/cider/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'tylin'
2 |
--------------------------------------------------------------------------------
/tools/pycider/pyciderevalcap/cider/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/tools/pycider/pyciderevalcap/cider/__init__.pyc
--------------------------------------------------------------------------------
/tools/pycider/pyciderevalcap/cider/cider.py:
--------------------------------------------------------------------------------
1 | # Filename: cider.py
2 | #
3 | #
4 | # Description: Describes the class to compute the CIDEr
5 | # (Consensus-Based Image Description Evaluation) Metric
6 | # by Vedantam, Zitnick, and Parikh (http://arxiv.org/abs/1411.5726)
7 | #
8 | # Creation Date: Sun Feb 8 14:16:54 2015
9 | #
10 | # Authors: Ramakrishna Vedantam and
11 | # Tsung-Yi Lin
12 |
13 | from cider_scorer import CiderScorer
14 |
15 |
16 | class Cider:
17 | """
18 | Main Class to compute the CIDEr metric
19 |
20 | """
21 | def __init__(self, n=4, df="corpus"):
22 | """
23 | Initialize the CIDEr scoring function
24 | : param n (int): n-gram size
25 | : param df (string): specifies where to get the IDF values from
26 | takes values 'corpus', 'coco-train'
27 | : return: None
28 | """
29 | # set cider to sum over 1 to 4-grams
30 | self._n = n
31 | self._df = df
32 | self.cider_scorer = CiderScorer(n=self._n, df_mode=self._df)
33 |
34 | def compute_score(self, gts, res):
35 | """
36 | Main function to compute CIDEr score
37 | : param gts (dict) : {image:tokenized reference sentence}
38 | : param res (dict) : {image:tokenized candidate sentence}
39 | : return: cider (float) : computed CIDEr score for the corpus
40 | """
41 |
42 | # clear all the previous hypos and refs
43 | self.cider_scorer.clear()
44 |
45 | for res_id in res:
46 |
47 | hypo = res_id['caption']
48 | ref = gts[res_id['image_id']]
49 |
50 | # Sanity check.
51 | assert(type(hypo) is list)
52 | assert(len(hypo) == 1)
53 | assert(type(ref) is list)
54 | assert(len(ref) > 0)
55 | self.cider_scorer += (hypo[0], ref)
56 |
57 | (score, scores) = self.cider_scorer.compute_score()
58 |
59 | return score, scores
60 |
61 | def method(self):
62 | return "CIDEr"
63 |
--------------------------------------------------------------------------------
/tools/pycider/pyciderevalcap/cider/cider.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/tools/pycider/pyciderevalcap/cider/cider.pyc
--------------------------------------------------------------------------------
/tools/pycider/pyciderevalcap/cider/cider_scorer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Tsung-Yi Lin
3 | # Ramakrishna Vedantam
4 |
5 | import copy
6 | import pickle
7 | from collections import defaultdict
8 | import numpy as np
9 | import math
10 | import os
11 |
12 | from six.moves import xrange
13 |
14 | def precook(s, n=4, out=False):
15 | """
16 | Takes a string as input and returns an object that can be given to
17 | either cook_refs or cook_test. This is optional: cook_refs and cook_test
18 | can take string arguments as well.
19 | :param s: string : sentence to be converted into ngrams
20 | :param n: int : number of ngrams for which representation is calculated
21 |     :return: term frequency vector for occurring ngrams
22 | """
23 | words = s.split()
24 | counts = defaultdict(int)
25 | for k in xrange(1,n+1):
26 | for i in xrange(len(words)-k+1):
27 | ngram = tuple(words[i:i+k])
28 | counts[ngram] += 1
29 | return counts
30 |
31 | def cook_refs(refs, n=4): ## lhuang: oracle will call with "average"
32 | '''Takes a list of reference sentences for a single segment
33 | and returns an object that encapsulates everything that BLEU
34 | needs to know about them.
35 | :param refs: list of string : reference sentences for some image
36 | :param n: int : number of ngrams for which (ngram) representation is calculated
37 | :return: result (list of dict)
38 | '''
39 | return [precook(ref, n) for ref in refs]
40 |
41 | def cook_test(test, n=4):
42 | '''Takes a test sentence and returns an object that
43 | encapsulates everything that BLEU needs to know about it.
44 | :param test: list of string : hypothesis sentence for some image
45 | :param n: int : number of ngrams for which (ngram) representation is calculated
46 | :return: result (dict)
47 | '''
48 | return precook(test, n, True)
49 |
50 | class CiderScorer(object):
51 | """CIDEr scorer.
52 | """
53 |
54 | def copy(self):
55 | ''' copy the refs.'''
56 | new = CiderScorer(n=self.n)
57 | new.ctest = copy.copy(self.ctest)
58 | new.crefs = copy.copy(self.crefs)
59 | return new
60 |
61 | def __init__(self, df_mode="corpus", test=None, refs=None, n=4, sigma=6.0):
62 | ''' singular instance '''
63 | self.n = n
64 | self.sigma = sigma
65 | self.crefs = []
66 | self.ctest = []
67 | self.df_mode = df_mode
68 | if self.df_mode != "corpus":
69 | self.document_frequency = pickle.load(open(os.path.join('data', df_mode + '.p'),'r'))
70 | self.cook_append(test, refs)
71 | self.ref_len = None
72 |
73 | def clear(self):
74 | self.crefs = []
75 | self.ctest = []
76 |
77 | def cook_append(self, test, refs):
78 | '''called by constructor and __iadd__ to avoid creating new instances.'''
79 |
80 | if refs is not None:
81 | self.crefs.append(cook_refs(refs))
82 | if test is not None:
83 | self.ctest.append(cook_test(test)) ## N.B.: -1
84 | else:
85 | self.ctest.append(None) # lens of crefs and ctest have to match
86 |
87 | def size(self):
88 | assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest))
89 | return len(self.crefs)
90 |
91 | def __iadd__(self, other):
92 | '''add an instance (e.g., from another sentence).'''
93 |
94 | if type(other) is tuple:
95 | ## avoid creating new CiderScorer instances
96 | self.cook_append(other[0], other[1])
97 | else:
98 | self.ctest.extend(other.ctest)
99 | self.crefs.extend(other.crefs)
100 |
101 | return self
102 | def compute_doc_freq(self):
103 | '''
104 |         Compute document frequency over the reference data.
105 |         This will be used to compute IDF (inverse document frequency) later.
106 |         The document frequency is stored in the object.
107 | :return: None
108 | '''
109 | for refs in self.crefs:
110 | # refs, k ref captions of one image
111 | for ngram in set([ngram for ref in refs for (ngram,count) in ref.iteritems()]):
112 | self.document_frequency[ngram] += 1
113 | # maxcounts[ngram] = max(maxcounts.get(ngram,0), count)
114 |
115 | def compute_cider(self):
116 | def counts2vec(cnts):
117 | """
118 |             Maps n-gram counts to a vector of tf-idf weights.
119 |             Returns vec, an array of dictionaries mapping each n-gram to its tf-idf weight.
120 |             The entry at index n holds the weights for n-grams of length n+1.
121 |             :param cnts: dict mapping an n-gram tuple to its term frequency
122 | :return: vec (array of dict), norm (array of float), length (int)
123 | """
124 | vec = [defaultdict(float) for _ in range(self.n)]
125 | length = 0
126 | norm = [0.0 for _ in range(self.n)]
127 | for (ngram,term_freq) in cnts.iteritems():
128 | # give word count 1 if it doesn't appear in reference corpus
129 | df = np.log(max(1.0, self.document_frequency[ngram]))
130 | # ngram index
131 | n = len(ngram)-1
132 | # tf (term_freq) * idf (precomputed idf) for n-grams
133 | vec[n][ngram] = float(term_freq)*(self.ref_len - df)
134 | # compute norm for the vector. the norm will be used for
135 | # computing similarity
136 | norm[n] += pow(vec[n][ngram], 2)
137 |
138 | if n == 1:
139 | length += term_freq
140 | norm = [np.sqrt(n) for n in norm]
141 | return vec, norm, length
142 |
143 | def sim(vec_hyp, vec_ref, norm_hyp, norm_ref, length_hyp, length_ref):
144 | '''
145 | Compute the cosine similarity of two vectors.
146 | :param vec_hyp: array of dictionary for vector corresponding to hypothesis
147 | :param vec_ref: array of dictionary for vector corresponding to reference
148 | :param norm_hyp: array of float for vector corresponding to hypothesis
149 | :param norm_ref: array of float for vector corresponding to reference
150 | :param length_hyp: int containing length of hypothesis
151 | :param length_ref: int containing length of reference
152 | :return: array of score for each n-grams cosine similarity
153 | '''
154 | delta = float(length_hyp - length_ref)
155 |             # measure cosine similarity
156 | val = np.array([0.0 for _ in range(self.n)])
157 | for n in range(self.n):
158 | # ngram
159 | for (ngram,count) in vec_hyp[n].iteritems():
160 | val[n] += vec_hyp[n][ngram] * vec_ref[n][ngram]
161 |
162 | if (norm_hyp[n] != 0) and (norm_ref[n] != 0):
163 | val[n] /= (norm_hyp[n]*norm_ref[n])
164 |
165 | assert(not math.isnan(val[n]))
166 | return val
167 |
168 | # compute log reference length
169 | if self.df_mode == "corpus":
170 | self.ref_len = np.log(float(len(self.crefs)))
171 | elif self.df_mode == "coco-val":
172 | # if coco option selected, use length of coco-val set
173 | self.ref_len = np.log(float(40504))
174 |
175 | scores = []
176 | for test, refs in zip(self.ctest, self.crefs):
177 | # compute vector for test captions
178 | vec, norm, length = counts2vec(test)
179 | # compute vector for ref captions
180 | score = np.array([0.0 for _ in range(self.n)])
181 | for ref in refs:
182 | vec_ref, norm_ref, length_ref = counts2vec(ref)
183 | score += sim(vec, vec_ref, norm, norm_ref, length, length_ref)
184 | # change by vrama91 - mean of ngram scores, instead of sum
185 | score_avg = np.mean(score)
186 | # divide by number of references
187 | score_avg /= len(refs)
188 | # multiply score by 10
189 | score_avg *= 10.0
190 | # append score of an image to the score list
191 | scores.append(score_avg)
192 | return scores
193 |
194 | def compute_score(self, option=None, verbose=0):
195 | # compute idf
196 | if self.df_mode == "corpus":
197 | self.document_frequency = defaultdict(float)
198 | self.compute_doc_freq()
199 | # assert to check document frequency
200 | assert(len(self.ctest) >= max(self.document_frequency.values()))
201 | # import json for now and write the corresponding files
202 | # compute cider score
203 | score = self.compute_cider()
204 | # debug
205 | # print score
206 | return np.mean(np.array(score)), np.array(score)
207 |
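In equations, the quantity computed by compute_cider above (in corpus idf mode, with N = 4 n-gram orders and m_i references for image i) reads as follows; g, DF and |I| are notation introduced here, not names from the code.

```latex
% tf-idf weight assigned to an n-gram w of a caption x by counts2vec:
g_w(x) = \mathrm{tf}_w(x)\,\log\!\frac{|I|}{\max\bigl(1,\ \mathrm{DF}(w)\bigr)}
% cosine similarity per order, averaged over the m_i references and the N orders,
% then scaled by 10 (compute_cider):
\mathrm{CIDEr}(c_i, S_i) = \frac{10}{N\,m_i}\sum_{n=1}^{N}\sum_{j=1}^{m_i}
\frac{g^{n}(c_i)\cdot g^{n}(s_{ij})}{\lVert g^{n}(c_i)\rVert\,\lVert g^{n}(s_{ij})\rVert}
```

Here |I| is the number of images in the reference set (ref_len = log |I|) and DF(w) the number of images whose references contain the n-gram w, as accumulated by compute_doc_freq.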
--------------------------------------------------------------------------------
/tools/pycider/pyciderevalcap/cider/cider_scorer.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/tools/pycider/pyciderevalcap/cider/cider_scorer.pyc
--------------------------------------------------------------------------------
/tools/pycider/pyciderevalcap/ciderD/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'tylin'
2 |
--------------------------------------------------------------------------------
/tools/pycider/pyciderevalcap/ciderD/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/tools/pycider/pyciderevalcap/ciderD/__init__.pyc
--------------------------------------------------------------------------------
/tools/pycider/pyciderevalcap/ciderD/ciderD.py:
--------------------------------------------------------------------------------
1 | # Filename: ciderD.py
2 | #
3 | # Description: Describes the class to compute the CIDEr-D (Consensus-Based Image Description Evaluation) Metric
4 | # by Vedantam, Zitnick, and Parikh (http://arxiv.org/abs/1411.5726)
5 | #
6 | # Creation Date: Sun Feb 8 14:16:54 2015
7 | #
8 | # Authors: Ramakrishna Vedantam and Tsung-Yi Lin
9 |
10 | from .ciderD_scorer import CiderScorer
11 | import torch.nn as nn
12 | import pdb
13 |
14 | class CiderD(nn.Module):
15 | """
16 | Main Class to compute the CIDEr metric
17 |
18 | """
19 | def __init__(self, n=4, sigma=6.0, df="corpus"):
20 | super(CiderD, self).__init__()
21 |
22 | # set cider to sum over 1 to 4-grams
23 | self._n = n
24 | # set the standard deviation parameter for gaussian penalty
25 | self._sigma = sigma
26 | # set which where to compute document frequencies from
27 | self._df = df
28 | self.cider_scorer = CiderScorer(n=self._n, df_mode=self._df)
29 |
30 | def compute_score(self, gts, res):
31 | """
32 | Main function to compute CIDEr score
33 | :param hypo_for_image (dict) : dictionary with key and value
34 | ref_for_image (dict) : dictionary with key and value
35 | :return: cider (float) : computed CIDEr score for the corpus
36 | """
37 |
38 | # clear all the previous hypos and refs
39 | self.cider_scorer.clear()
40 | for res_id in res:
41 |
42 | hypo = res_id['caption']
43 | ref = gts[res_id['image_id']]
44 |
45 | # Sanity check.
46 | assert(type(hypo) is list)
47 | assert(len(hypo) == 1)
48 | assert(type(ref) is list)
49 | assert(len(ref) > 0)
50 | self.cider_scorer += (hypo[0], ref)
51 |
52 | (score, scores) = self.cider_scorer.compute_score()
53 |
54 | return score, scores
55 |
56 | def method(self):
57 | return "CIDEr-D"
58 |
--------------------------------------------------------------------------------
/tools/pycider/pyciderevalcap/ciderD/ciderD.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/tools/pycider/pyciderevalcap/ciderD/ciderD.pyc
--------------------------------------------------------------------------------
/tools/pycider/pyciderevalcap/ciderD/ciderD_scorer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Tsung-Yi Lin
3 | # Ramakrishna Vedantam
4 |
5 | import copy
6 | import math
7 | import os
8 | import pickle
9 | from collections import defaultdict
10 |
11 | import numpy as np
12 | from six.moves import xrange
13 |
14 | import torch.nn as nn
15 |
16 |
17 | def precook(s, n=4, out=False):
18 | """
19 | Takes a string as input and returns an object that can be given to
20 | either cook_refs or cook_test. This is optional: cook_refs and cook_test
21 | can take string arguments as well.
22 | :param s: string : sentence to be converted into ngrams
23 | :param n: int : number of ngrams for which representation is calculated
24 |     :return: term frequency vector for occurring ngrams
25 | """
26 | words = s.split()
27 | counts = defaultdict(int)
28 | for k in xrange(1,n+1):
29 | for i in xrange(len(words)-k+1):
30 | ngram = tuple(words[i:i+k])
31 | counts[ngram] += 1
32 | return counts
33 |
34 | def cook_refs(refs, n=4): ## lhuang: oracle will call with "average"
35 | '''Takes a list of reference sentences for a single segment
36 | and returns an object that encapsulates everything that BLEU
37 | needs to know about them.
38 | :param refs: list of string : reference sentences for some image
39 | :param n: int : number of ngrams for which (ngram) representation is calculated
40 | :return: result (list of dict)
41 | '''
42 | return [precook(ref, n) for ref in refs]
43 |
44 | def cook_test(test, n=4):
45 | '''Takes a test sentence and returns an object that
46 | encapsulates everything that BLEU needs to know about it.
47 | :param test: list of string : hypothesis sentence for some image
48 | :param n: int : number of ngrams for which (ngram) representation is calculated
49 | :return: result (dict)
50 | '''
51 | return precook(test, n, True)
52 |
53 | class CiderScorer(nn.Module):
54 | """CIDEr scorer.
55 | """
56 | def __init__(self, df_mode="corpus", test=None, refs=None, n=4, sigma=6.0):
57 | ''' singular instance '''
58 | super(CiderScorer, self).__init__()
59 | self.n = n
60 | self.sigma = sigma
61 | self.crefs = []
62 | self.ctest = []
63 | self.df_mode = df_mode
64 | self.ref_len = None
65 | if self.df_mode != "corpus":
66 | pkl_file = pickle.load(open(os.path.join('data', df_mode + '.p'),'r'))
67 | self.ref_len = pkl_file['ref_len']
68 | self.document_frequency = pkl_file['document_frequency']
69 | self.cook_append(test, refs)
70 |
71 | def clear(self):
72 | self.crefs = []
73 | self.ctest = []
74 |
75 | def copy(self):
76 | ''' copy the refs.'''
77 | new = CiderScorer(n=self.n)
78 | new.ctest = copy.copy(self.ctest)
79 | new.crefs = copy.copy(self.crefs)
80 | return new
81 |
82 | def cook_append(self, test, refs):
83 | '''called by constructor and __iadd__ to avoid creating new instances.'''
84 |
85 | if refs is not None:
86 | self.crefs.append(cook_refs(refs))
87 | if test is not None:
88 | self.ctest.append(cook_test(test)) ## N.B.: -1
89 | else:
90 | self.ctest.append(None) # lens of crefs and ctest have to match
91 |
92 | def size(self):
93 | assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest))
94 | return len(self.crefs)
95 |
96 | def __iadd__(self, other):
97 | '''add an instance (e.g., from another sentence).'''
98 |
99 | if type(other) is tuple:
100 | ## avoid creating new CiderScorer instances
101 | self.cook_append(other[0], other[1])
102 | else:
103 | self.ctest.extend(other.ctest)
104 | self.crefs.extend(other.crefs)
105 |
106 | return self
107 | def compute_doc_freq(self):
108 | '''
109 |         Compute document frequency over the reference data.
110 |         This will be used to compute IDF (inverse document frequency) later.
111 |         The document frequency is stored in the object.
112 | :return: None
113 | '''
114 | for refs in self.crefs:
115 | # refs, k ref captions of one image
116 | for ngram in set([ngram for ref in refs for (ngram,count) in ref.iteritems()]):
117 | self.document_frequency[ngram] += 1
118 | # maxcounts[ngram] = max(maxcounts.get(ngram,0), count)
119 |
120 | def compute_cider(self):
121 | def counts2vec(cnts):
122 | """
123 |             Maps n-gram counts to a vector of tf-idf weights.
124 |             Returns vec, an array of dictionaries mapping each n-gram to its tf-idf weight.
125 |             The entry at index n holds the weights for n-grams of length n+1.
126 |             :param cnts: dict mapping an n-gram tuple to its term frequency
127 | :return: vec (array of dict), norm (array of float), length (int)
128 | """
129 | vec = [defaultdict(float) for _ in range(self.n)]
130 | length = 0
131 | norm = [0.0 for _ in range(self.n)]
132 | for (ngram,term_freq) in cnts.iteritems():
133 | # give word count 1 if it doesn't appear in reference corpus
134 | df = np.log(max(1.0, self.document_frequency[ngram]))
135 | # ngram index
136 | n = len(ngram)-1
137 | # tf (term_freq) * idf (precomputed idf) for n-grams
138 | vec[n][ngram] = float(term_freq)*(self.ref_len - df)
139 | # compute norm for the vector. the norm will be used for computing similarity
140 | norm[n] += pow(vec[n][ngram], 2)
141 |
142 | if n == 1:
143 | length += term_freq
144 | norm = [np.sqrt(n) for n in norm]
145 | return vec, norm, length
146 |
147 | def sim(vec_hyp, vec_ref, norm_hyp, norm_ref, length_hyp, length_ref):
148 | '''
149 | Compute the cosine similarity of two vectors.
150 | :param vec_hyp: array of dictionary for vector corresponding to hypothesis
151 | :param vec_ref: array of dictionary for vector corresponding to reference
152 | :param norm_hyp: array of float for vector corresponding to hypothesis
153 | :param norm_ref: array of float for vector corresponding to reference
154 | :param length_hyp: int containing length of hypothesis
155 | :param length_ref: int containing length of reference
156 | :return: array of score for each n-grams cosine similarity
157 | '''
158 | delta = float(length_hyp - length_ref)
159 |             # measure cosine similarity
160 | val = np.array([0.0 for _ in range(self.n)])
161 | for n in range(self.n):
162 | # ngram
163 | for (ngram,count) in vec_hyp[n].iteritems():
164 | # vrama91 : added clipping
165 | val[n] += min(vec_hyp[n][ngram], vec_ref[n][ngram]) * vec_ref[n][ngram]
166 |
167 | if (norm_hyp[n] != 0) and (norm_ref[n] != 0):
168 | val[n] /= (norm_hyp[n]*norm_ref[n])
169 |
170 | assert(not math.isnan(val[n]))
171 | # vrama91: added a length based gaussian penalty
172 | val[n] *= np.e**(-(delta**2)/(2*self.sigma**2))
173 | return val
174 |
175 | # compute log reference length
176 | if self.df_mode == "corpus":
177 | self.ref_len = np.log(float(len(self.crefs)))
178 | #elif self.df_mode == "coco-val":
179 | # if coco option selected, use length of coco-val set
180 | # self.ref_len = np.log(float(40504))
181 |
182 | scores = []
183 | for test, refs in zip(self.ctest, self.crefs):
184 | # compute vector for test captions
185 | vec, norm, length = counts2vec(test)
186 | # compute vector for ref captions
187 | score = np.array([0.0 for _ in range(self.n)])
188 | for ref in refs:
189 | vec_ref, norm_ref, length_ref = counts2vec(ref)
190 | score += sim(vec, vec_ref, norm, norm_ref, length, length_ref)
191 | # change by vrama91 - mean of ngram scores, instead of sum
192 | score_avg = np.mean(score)
193 | # divide by number of references
194 | score_avg /= len(refs)
195 | # multiply score by 10
196 | score_avg *= 10.0
197 | # append score of an image to the score list
198 | scores.append(score_avg)
199 | return scores
200 |
201 | def compute_score(self, option=None, verbose=0):
202 | # compute idf
203 | if self.df_mode == "corpus":
204 | self.document_frequency = defaultdict(float)
205 | self.compute_doc_freq()
206 | # assert to check document frequency
207 | assert(len(self.ctest) >= max(self.document_frequency.values()))
208 | # import json for now and write the corresponding files
209 | # compute cider score
210 | score = self.compute_cider()
211 | # debug
212 | # print score
213 | return np.mean(np.array(score)), np.array(score)
214 |
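The CIDEr-D scorer above differs from the plain CIDEr scorer in its per-order similarity: the hypothesis tf-idf weights are clipped against the reference's, and a Gaussian penalty on the length difference is applied (sim() in compute_cider above); l_c and l_s denote the caption lengths accumulated in counts2vec, and the aggregation over orders and references is the same averaging and scaling by 10 as for CIDEr.

```latex
\mathrm{sim}_n(c, s) =
e^{-\frac{(l_c - l_s)^2}{2\sigma^2}}\;
\frac{\sum_{w}\min\bigl(g^{n}_{w}(c),\, g^{n}_{w}(s)\bigr)\, g^{n}_{w}(s)}
     {\lVert g^{n}(c)\rVert\,\lVert g^{n}(s)\rVert},
\qquad \sigma = 6
```

When df_mode is not "corpus", the document frequencies and ref_len are read from a precomputed pickle in data/, e.g. the <output_pkl>-idxs.p file written by the n-gram preprocessing script earlier in this dump.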
--------------------------------------------------------------------------------
/tools/pycider/pyciderevalcap/ciderD/ciderD_scorer.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/tools/pycider/pyciderevalcap/ciderD/ciderD_scorer.pyc
--------------------------------------------------------------------------------
/tools/pycider/pyciderevalcap/eval.py:
--------------------------------------------------------------------------------
1 | __author__ = 'rama'
2 | from tokenizer.ptbtokenizer import PTBTokenizer
3 | from cider.cider import Cider
4 | from ciderD.ciderD import CiderD
5 |
6 |
7 | class CIDErEvalCap:
8 | def __init__(self, gts, res, df):
9 | print('tokenization...')
10 | tokenizer = PTBTokenizer('gts')
11 | _gts = tokenizer.tokenize(gts)
12 | print('tokenized refs')
13 | tokenizer = PTBTokenizer('res')
14 | _res = tokenizer.tokenize(res)
15 | print('tokenized cands')
16 |
17 | self.gts = _gts
18 | self.res = _res
19 | self.df = df
20 |
21 | def evaluate(self):
22 | # =================================================
23 | # Set up scorers
24 | # =================================================
25 |
26 | print('setting up scorers...')
27 | scorers = [
28 | (Cider(df=self.df), "CIDEr"), (CiderD(df=self.df), "CIDErD")
29 | ]
30 |
31 | # =================================================
32 | # Compute scores
33 | # =================================================
34 | metric_scores = {}
35 | for scorer, method in scorers:
36 | print('computing %s score...' % (scorer.method()))
37 | score, scores = scorer.compute_score(self.gts, self.res)
38 | print("Mean %s score: %0.3f" % (method, score))
39 | metric_scores[method] = list(scores)
40 | return metric_scores
41 |
--------------------------------------------------------------------------------
/tools/pycider/pyciderevalcap/eval.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/tools/pycider/pyciderevalcap/eval.pyc
--------------------------------------------------------------------------------
/tools/pycider/pyciderevalcap/tokenizer/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'hfang'
2 |
--------------------------------------------------------------------------------
/tools/pycider/pyciderevalcap/tokenizer/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/tools/pycider/pyciderevalcap/tokenizer/__init__.pyc
--------------------------------------------------------------------------------
/tools/pycider/pyciderevalcap/tokenizer/ptbtokenizer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # File Name : ptbtokenizer.py
4 | #
5 | # Description : Do the PTB Tokenization and remove punctuations.
6 | #
7 | # Creation Date : 29-12-2014
8 | # Last Modified : Thu Mar 19 09:53:35 2015
9 | # Authors : Hao Fang and Tsung-Yi Lin
10 |
11 | import os
12 | import pdb # python debugger
13 | import sys
14 | import subprocess
15 | import re
16 | import tempfile
17 | import itertools
18 |
19 | # path to the stanford corenlp jar
20 | STANFORD_CORENLP_3_4_1_JAR = 'stanford-corenlp-3.4.1.jar'
21 |
22 | # punctuations to be removed from the sentences
23 | PUNCTUATIONS = ["''", "'", "``", "`", "-LRB-", "-RRB-", "-LCB-", "-RCB-", \
24 | ".", "?", "!", ",", ":", "-", "--", "...", ";"]
25 |
26 | class PTBTokenizer:
27 | """Python wrapper of Stanford PTBTokenizer"""
28 | def __init__(self, _source='gts'):
29 | self.source = _source
30 |
31 | def tokenize(self, captions_for_image):
32 | cmd = ['java', '-cp', STANFORD_CORENLP_3_4_1_JAR, \
33 | 'edu.stanford.nlp.process.PTBTokenizer', \
34 | '-preserveLines', '-lowerCase']
35 |
36 | # ======================================================
37 | # prepare data for PTB Tokenizer
38 | # ======================================================
39 |
40 | if self.source == 'gts':
41 | image_id = [k for k, v in captions_for_image.items() for _ in range(len(v))]
42 | sentences = '\n'.join([c['caption'].replace('\n', ' ') for k, v in captions_for_image.items() for c in v])
43 | final_tokenized_captions_for_image = {}
44 |
45 | elif self.source == 'res':
46 | index = [i for i, v in enumerate(captions_for_image)]
47 | image_id = [v["image_id"] for v in captions_for_image]
48 | sentences = '\n'.join(v["caption"].replace('\n', ' ') for v in captions_for_image )
49 | final_tokenized_captions_for_index = []
50 |
51 | # ======================================================
52 | # save sentences to temporary file
53 | # ======================================================
54 | path_to_jar_dirname=os.path.dirname(os.path.abspath(__file__))
55 | tmp_file = tempfile.NamedTemporaryFile(delete=False, dir=path_to_jar_dirname)
56 | tmp_file.write(sentences)
57 | tmp_file.close()
58 |
59 | # ======================================================
60 | # tokenize sentence
61 | # ======================================================
62 | cmd.append(os.path.basename(tmp_file.name))
63 | p_tokenizer = subprocess.Popen(cmd, cwd=path_to_jar_dirname, \
64 | stdout=subprocess.PIPE)
65 | token_lines = p_tokenizer.communicate(input=sentences.rstrip())[0]
66 | lines = token_lines.split('\n')
67 | # remove temp file
68 | os.remove(tmp_file.name)
69 |
70 | # ======================================================
71 | # create dictionary for tokenized captions
72 | # ======================================================
73 | if self.source == 'gts':
74 | for k, line in zip(image_id, lines):
75 | if not k in final_tokenized_captions_for_image:
76 | final_tokenized_captions_for_image[k] = []
77 | tokenized_caption = ' '.join([w for w in line.rstrip().split(' ') \
78 | if w not in PUNCTUATIONS])
79 | final_tokenized_captions_for_image[k].append(tokenized_caption)
80 |
81 | return final_tokenized_captions_for_image
82 |
83 | elif self.source == 'res':
84 | for k, img, line in zip(index, image_id, lines):
85 | tokenized_caption = ' '.join([w for w in line.rstrip().split(' ') \
86 | if w not in PUNCTUATIONS])
87 | final_tokenized_captions_for_index.append({'image_id': img, 'caption': [tokenized_caption]})
88 |
89 | return final_tokenized_captions_for_index
90 |
--------------------------------------------------------------------------------
/tools/pycider/pyciderevalcap/tokenizer/ptbtokenizer.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/tools/pycider/pyciderevalcap/tokenizer/ptbtokenizer.pyc
--------------------------------------------------------------------------------
/tools/pycider/pyciderevalcap/tokenizer/stanford-corenlp-3.4.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/tools/pycider/pyciderevalcap/tokenizer/stanford-corenlp-3.4.1.jar
--------------------------------------------------------------------------------
/tools/pycider/pyciderevalcap/tokenizer/tmpBF49XX:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/tools/pycider/pyciderevalcap/tokenizer/tmpBF49XX
--------------------------------------------------------------------------------
/tools/pycider/pyciderevalcap/tokenizer/tmpql9uU7:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/tools/pycider/pyciderevalcap/tokenizer/tmpql9uU7
--------------------------------------------------------------------------------
/tools/pycider/pyciderevalcap/tokenizer/tmpuCp_T0:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/tools/pycider/pyciderevalcap/tokenizer/tmpuCp_T0
--------------------------------------------------------------------------------
/tools/pycider/pyciderevalcap/tokenizer/tmpxAmV_C:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/tools/pycider/pyciderevalcap/tokenizer/tmpxAmV_C
--------------------------------------------------------------------------------
/tools/pycider/pyciderevalcap/tokenizer/tmpzNW4I2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jiasenlu/NeuralBabyTalk/c351ae6ca59380eb4cf38384d3e66233b4d9e1b1/tools/pycider/pyciderevalcap/tokenizer/tmpzNW4I2
--------------------------------------------------------------------------------
/tools/sentence_gen_tools/__init__.py:
--------------------------------------------------------------------------------
1 | #init
2 |
--------------------------------------------------------------------------------
/tools/sentence_gen_tools/coco_eval.py:
--------------------------------------------------------------------------------
1 | import json
2 | import re
3 | import sys
4 |
5 | # from python_utils import *
6 | sys.path.append('tools/coco-caption/')
7 | COCO_EVAL_PATH = 'tools/coco-caption/pycocotools'
8 | sys.path.insert(0, COCO_EVAL_PATH)
9 | from pycocoevalcap.bleu.bleu import Bleu
10 | from pycocoevalcap.cider.cider import Cider
11 | from pycocoevalcap.eval import COCOEvalCap
12 | from pycocoevalcap.meteor.meteor import Meteor
13 | from pycocoevalcap.rouge.rouge import Rouge
14 | from pycocoevalcap.spice.spice import Spice
15 | from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
16 | from pycocotools.coco import COCO
17 |
18 | rm_word_dict = {'bus': ['bus', 'busses'],
19 | 'bottle': ['bottle', 'bottles'],
20 | 'couch': ['couch', 'couches', 'sofa', 'sofas'],
21 | 'microwave': ['microwave', 'microwaves'],
22 | 'pizza': ['pizza', 'pizzas'],
23 | 'racket': ['racket', 'rackets', 'racquet', 'racquets'],
24 | 'suitcase': ['luggage', 'luggages', 'suitcase', 'suitcases'],
25 | 'zebra': ['zebra', 'zebras']}
26 |
27 |
28 | def read_json(t_file):
29 | j_file = open(t_file).read()
30 | return json.loads(j_file)
31 |
32 |
33 | class DCCScorer(COCOEvalCap):
34 |
35 | def get_dcc_scores(self):
36 |
37 | imgIds = self.params['image_id']
38 | # imgIds = self.coco.getImgIds()
39 | gts = {}
40 | res = {}
41 | for imgId in imgIds:
42 | gts[imgId] = self.coco.imgToAnns[imgId]
43 | res[imgId] = self.cocoRes.imgToAnns[imgId]
44 |
45 | tokenizer = PTBTokenizer()
46 | gts = tokenizer.tokenize(gts)
47 | res = tokenizer.tokenize(res)
48 | scorers = [
49 | (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
50 | (Meteor(), "METEOR"),
51 | (Rouge(), "ROUGE_L"),
52 | (Cider(df='noc_test_freq'), "CIDEr"),
53 | (Spice(), "SPICE")
54 | ]
55 | score_dict = {}
56 | for scorer, method in scorers:
57 | print('computing %s score...' % (scorer.method()))
58 | score, scores = scorer.compute_score(gts, res)
59 | if type(method) == list:
60 | for sc, scs, m in zip(score, scores, method):
61 | score_dict[m] = sc
62 | print("%s: %0.3f" % (m, sc))
63 | else:
64 | score_dict[method] = score
65 | print("%s: %0.3f" % (method, score))
66 |
67 | return score_dict
68 |
69 |
70 | def split_sent(sent):
71 | sent = sent.lower()
72 | sent = re.sub('[^A-Za-z0-9\s]+', '', sent)
73 | return sent.split()
74 |
75 |
76 | def F1(generated_json, novel_ids, train_ids, word):
77 | set_rm_words = set(rm_word_dict[word])
78 | gen_dict = {}
79 | for c in generated_json:
80 | gen_dict[c['image_id']] = c['caption']
81 |
82 |     # true positives: captions for novel-object images that contain the target words, as they should
83 | tp = sum([1 for c in novel_ids if len(set_rm_words.intersection(set(split_sent(gen_dict[c])))) > 0])
84 |     # false positives: captions for training images that contain the target words but should not
85 | fp = sum([1 for c in train_ids if len(set_rm_words.intersection(set(split_sent(gen_dict[c])))) > 0])
86 |     # false negatives: captions for novel-object images that do not contain the target words but should
87 | fn = sum([1 for c in novel_ids if len(set_rm_words.intersection(set(split_sent(gen_dict[c])))) == 0])
88 |
89 | # precision = tp/(tp+fp)
90 | if tp > 0:
91 | precision = float(tp)/(tp+fp)
92 | # recall = tp/(tp+fn)
93 | recall = float(tp)/(tp+fn)
94 | # f1 = 2* (precision*recall)/(precision+recall)
95 | return 2*(precision*recall)/(precision+recall)
96 | else:
97 | return 0.
98 |
99 |
100 | def score_dcc(gt_template_novel, gt_template_train,
101 | generation_result, words, dset, cache_path):
102 |
103 | score_dict_dcc = {}
104 | generated_sentences = generation_result
105 | f1_scores = 0
106 |
107 | for word in words:
108 | gt_file = gt_template_novel % (word, dset)
109 | gt_json_novel = read_json(gt_template_novel % (word, dset))
110 | gt_json_train = read_json(gt_template_train % (word, dset))
111 | gt_ids_novel = [c['image_id'] for c in gt_json_novel['annotations']]
112 | gt_ids_train = [c['image_id'] for c in gt_json_train['annotations']]
113 | gen = []
114 | for c in generated_sentences:
115 | if c['image_id'] in gt_ids_novel:
116 | gen.append(c)
117 |
118 | json.dump(gen, open(cache_path, 'w'))
119 | # save_json(gen, 'tmp_gen.json')
120 | coco = COCO(gt_file)
121 | generation_coco = coco.loadRes(cache_path)
122 | dcc_evaluator = DCCScorer(coco, generation_coco, 'noc_test_freq')
123 | score_dict = dcc_evaluator.get_dcc_scores()
124 | # os.remove(cache_path)
125 |
126 | for key in score_dict.keys():
127 | if key not in score_dict_dcc.keys():
128 | score_dict_dcc[key] = 0
129 | score_dict_dcc[key] += score_dict[key]
130 |
131 | f1_score = F1(generated_sentences, gt_ids_novel, gt_ids_train, word)
132 | print("F1 score for %s: %f" % (word, f1_score))
133 | f1_scores += f1_score
134 |
135 | print("########################################################################")
136 | for key in sorted(score_dict_dcc.keys()):
137 | score_dict_dcc[key] = score_dict_dcc[key]/len(words)
138 | print("Average %s: %0.3f" % (key, score_dict_dcc[key]))
139 | print("Average F1 score: %f" % (f1_scores/len(words)))
140 | out = {}
141 | for key in sorted(score_dict_dcc.keys()):
142 | out[key] = score_dict_dcc[key]
143 | out['F1'] = f1_scores / len(words)
144 |
145 | return out
146 |
147 |
148 | def score_generation(gt_filename=None, generation_result=None):
149 |
150 | coco = COCO(gt_filename)
151 | generation_coco = coco.loadRes(generation_result)
152 | coco_evaluator = COCOEvalCap(coco, generation_coco, 'noc_test_freq')
153 | coco_evaluator.evaluate()
154 |
155 |
156 | def save_json_coco_format(caps, save_name):
157 |
158 | def get_coco_id(im_name):
159 | coco_id = int(im_name.split('/')[-1].split('_')[-1].split('.jpg')[0])
160 | return coco_id
161 |
162 | coco_format_caps = [{'caption': value, 'image_id': get_coco_id(key)}
163 | for value, key in zip(caps.values(), caps.keys())]
164 |
165 | json.dump(coco_format_caps, open(save_name, 'w'))
166 | # save_json(coco_format_caps, save_name)
167 |
168 |
169 | def save_json_other_format(caps, save_name):
170 |
171 | format_caps = [{'caption': value, 'image_id': key}
172 | for value, key in zip(caps.values(), caps.keys())]
173 |
174 | # save_json(format_caps, save_name)
175 | json.dump(format_caps, open(save_name, 'w'))
176 |
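For reference, the F1() helper earlier in this file implements the standard precision/recall trade-off over novel-object mentions; tp, fp and fn are counted as in the comments of that function (novel-object captions that mention the target word, training-image captions that mention it, and novel-object captions that miss it, respectively).

```latex
P = \frac{tp}{tp + fp},\qquad
R = \frac{tp}{tp + fn},\qquad
F_1 = \frac{2\,P\,R}{P + R}
```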
--------------------------------------------------------------------------------