├── .gitignore
├── LICENSE
├── Note.md
├── README.md
├── __init__.py
├── info
│   ├── __init__.py
│   ├── densecap_splits.json
│   ├── read_regions.py
│   ├── read_splits.py
│   ├── test.txt
│   ├── train.txt
│   └── val.txt
├── lib
│   ├── Makefile
│   ├── __init__.py
│   ├── config.py
│   ├── datasets
│   │   ├── __init__.py
│   │   ├── factory.py
│   │   ├── imdb.py
│   │   └── visual_genome.py
│   ├── dense_cap
│   │   ├── __init__.py
│   │   ├── beam_search.py
│   │   ├── caption_generator.py
│   │   ├── test.py
│   │   ├── train.py
│   │   └── vis_whtml.py
│   ├── download_data_vh.sh
│   ├── fast_rcnn
│   │   ├── __init__.py
│   │   ├── bbox_transform.py
│   │   ├── layer.py
│   │   ├── minibatch.py
│   │   ├── nms_wrapper.py
│   │   └── roidb.py
│   ├── layers
│   │   ├── __init__.py
│   │   ├── anchor_target_layer.py
│   │   ├── generate_anchors.py
│   │   ├── global_roi_layer.py
│   │   ├── proposal_layer.py
│   │   ├── proposal_target_layer.py
│   │   ├── proposal_target_single_class_layer.py
│   │   ├── proposal_top_layer.py
│   │   ├── rois_offset_layer.py
│   │   ├── sentence_data_layer.py
│   │   └── snippets.py
│   ├── limit_ram
│   │   ├── __init__.py
│   │   └── utils.py
│   ├── nets
│   │   ├── __init__.py
│   │   ├── mobilenet_v1.py
│   │   ├── network.py
│   │   ├── resnet_v1.py
│   │   └── vgg16.py
│   ├── nms
│   │   ├── __init__.py
│   │   ├── cpu_nms.c
│   │   ├── cpu_nms.pyx
│   │   ├── gpu_nms.cpp
│   │   ├── gpu_nms.hpp
│   │   ├── gpu_nms.pyx
│   │   ├── nms_kernel.cu
│   │   └── py_cpu_nms.py
│   ├── pre_glove.py
│   ├── preprocess.py
│   ├── preprocess.sh
│   ├── pycocoevalcap
│   │   ├── README
│   │   ├── __init__.py
│   │   ├── bleu
│   │   │   ├── LICENSE
│   │   │   ├── __init__.py
│   │   │   ├── bleu.py
│   │   │   └── bleu_scorer.py
│   │   ├── cider
│   │   │   ├── __init__.py
│   │   │   ├── cider.py
│   │   │   └── cider_scorer.py
│   │   ├── eval.py
│   │   ├── meteor
│   │   │   ├── __init__.py
│   │   │   ├── meteor-1.5.jar
│   │   │   └── meteor.py
│   │   ├── rouge
│   │   │   ├── __init__.py
│   │   │   └── rouge.py
│   │   ├── tokenizer
│   │   │   ├── __init__.py
│   │   │   ├── ptbtokenizer.py
│   │   │   ├── stanford-corenlp-3.4.1.jar
│   │   │   └── tmpGeypfw
│   │   └── vg_eval.py
│   ├── setup.py
│   └── utils
│       ├── __init__.py
│       ├── bbox.c
│       ├── bbox.pyx
│       ├── bbox_utils.py
│       ├── blob.py
│       ├── debug.py
│       ├── timer.py
│       └── visualization.py
├── logs
│   ├── densecap.png
│   └── funny.png
├── requirements.txt
├── scripts
│   ├── dense_cap_config.yml
│   ├── dense_cap_demo.sh
│   ├── dense_cap_test.sh
│   ├── dense_cap_train.sh
│   └── old_dense_cap_train.sh
├── tests
│   ├── README.md
│   ├── __init__.py
│   ├── architecture_test.py
│   ├── bash_log_test
│   │   ├── bash_log_test.sh
│   │   ├── logs
│   │   │   └── test.txt.2017-10-18_15-33-56
│   │   └── nonsense.py
│   ├── ckpt_restore_test.py
│   ├── dencap_oa_test.sh
│   ├── logs
│   │   ├── architecture_test.txt
│   │   ├── architecture_test_nodes.txt
│   │   ├── preprocessing.txt
│   │   └── sentence_data_layer_test.txt
│   ├── pickle_read_test.py
│   ├── read_regions_json
│   │   ├── ijson_example.txt
│   │   ├── read_regions_test.py
│   │   ├── test_region.json
│   │   ├── test_region_out.json
│   │   ├── true_id_1.json
│   │   └── true_id_1_out.json
│   ├── roidata_test.py
│   ├── sentence_data_layer_test.py
│   └── vh_train_command.sh
├── tools
│   ├── __init__.py
│   ├── _init_paths.py
│   ├── demo.py
│   ├── test_net.py
│   └── train_net.py
├── valohai.yaml
└── vis
    ├── README.md
    ├── d3.min.js
    ├── jquery-1.8.3.min.js
    ├── style.css
    ├── utils.js
    └── view_results.html
/.gitignore:
--------------------------------------------------------------------------------
1 | #sublime
2 | *.sublime-workspace
3 | *.sublime-project
4 | #pycharm
5 | .idea/
6 | data/
7 | demo/
8 | experiments/
9 |
10 | tensorboard/
11 | output/
12 |
13 | # Byte-compiled / optimized / DLL files
14 | __pycache__/
15 | *.py[cod]
16 | *$py.class
17 |
18 | # C extensions
19 | *.so
20 |
21 | # Distribution / packaging
22 | .Python
23 | env/
24 | build/
25 | develop-eggs/
26 | dist/
27 | downloads/
28 | eggs/
29 | .eggs/
30 | lib64/
31 | parts/
32 | sdist/
33 | var/
34 | wheels/
35 | *.egg-info/
36 | .installed.cfg
37 | *.egg
38 |
39 | # PyInstaller
40 | # Usually these files are written by a python script from a template
41 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
42 | *.manifest
43 | *.spec
44 |
45 | # Installer logs
46 | pip-log.txt
47 | pip-delete-this-directory.txt
48 |
49 | # Unit test / coverage reports
50 | htmlcov/
51 | .tox/
52 | .coverage
53 | .coverage.*
54 | .cache
55 | nosetests.xml
56 | coverage.xml
57 | *.cover
58 | .hypothesis/
59 |
60 | # Translations
61 | *.mo
62 | *.pot
63 |
64 | # Django stuff:
65 | *.log
66 | local_settings.py
67 |
68 | # Flask stuff:
69 | instance/
70 | .webassets-cache
71 |
72 | # Scrapy stuff:
73 | .scrapy
74 |
75 | # Sphinx documentation
76 | docs/_build/
77 |
78 | # PyBuilder
79 | target/
80 |
81 | # Jupyter Notebook
82 | .ipynb_checkpoints
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # celery beat schedule file
88 | celerybeat-schedule
89 |
90 | # SageMath parsed files
91 | *.sage.py
92 |
93 | # dotenv
94 | .env
95 |
96 | # virtualenv
97 | .venv
98 | venv/
99 | ENV/
100 |
101 | # Spyder project settings
102 | .spyderproject
103 | .spyproject
104 |
105 | # Rope project settings
106 | .ropeproject
107 |
108 | # mkdocs documentation
109 | /site
110 |
111 | # mypy
112 | .mypy_cache/
113 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2017 Innerpeace
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Densecap-tensorflow
2 |
3 | Implementation of CVPR2017 paper: [Dense captioning with joint inference and visual context](https://arxiv.org/abs/1611.06949) by **Linjie Yang, Kevin Tang, Jianchao Yang, Li-Jia Li**
4 |
5 | **WITH CHANGES:**
6 | 1. Borrow the idea of [Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling](https://arxiv.org/abs/1611.01462), and tie the word vectors and word classifiers during captioning.
7 | 2. Initialize the word vectors and word classifiers with pre-trained 300-dimensional [glove](https://nlp.stanford.edu/projects/glove/) word vectors.
8 | 3. Change the backbone of the framework to ResNet-50.
9 | 4. Add `Beam Search` and `Length Normalization` in test mode.
10 | 5. Add a "Limit_RAM" mode for preparing the training data, since my computer only has 8 GB of RAM.
11 |
12 |
13 |
14 |
15 |
16 |
17 | **Special thanks to [valohai](https://valohai.com/) for offering computing resources.**
18 |
19 | ## Note
20 |
21 | **Update 2017.12.31**
22 |
23 | * After 500k iterations of training with the configuration of the original paper (except for tying the word vectors and classifiers), it achieves **mAP 8.296**.
24 |
25 | **Update 2017.12.20**
26 |
27 | * After 1 epoch (80,000 iterations) of training with randomly initialized word vectors (512-d), it achieves **mAP 6.509**.
28 | * After 1 epoch (75,000 iterations) of training with pre-trained glove word vectors (300-d), it achieves nearly **mAP 5.5**.
29 | * The complete training process will take almost **10 days** with the computation I have access to, and for now I have only trained 1 epoch to verify the framework.
30 | * The scripts should be compatible with both python 2.X and 3.X, although I built them under python 2.7.
31 | * Tested on Ubuntu 16.04, tensorflow 1.4, CUDA 8.0 and cudnn 6.0, with an Nvidia GTX 1060 GPU (LOL...).
32 |
33 | ## Dependencies
34 |
35 | Install the required Python modules with:
36 |
37 | ```commandline
38 | pip install -r lib/requirements.txt
39 | ```
40 |
41 | **For evaluation, one also needs:**
42 | * java 1.8.0
43 | * python 2.7 (according to
44 | [coco-caption](https://github.com/tylin/coco-caption))
45 |
46 | Install the Java runtime with:
47 | ```commandline
48 | sudo apt-get install openjdk-8-jre
49 | ```
50 |
51 | ## Preparing data
52 |
53 | ### Download
54 |
55 | [Website of Visual Genome Dataset](http://visualgenome.org/api/v0/api_home.html)
56 |
57 | * Make a new directory `VG` wherever you like.
58 | * Download `images` Part1 and Part2, and extract both parts into the directory `VG/images`.
59 | * Download the `image meta data` and extract it into the directory `VG/1.2` or `VG/1.0`, according to the version you download.
60 | * Download the `region descriptions` and extract them into the directory `VG/1.2` or `VG/1.0` accordingly.
61 | * In the following steps, the **absolute path** of the directory `VG` is referred to as `raw_data_path`, e.g. `/home/user/git/VG`. The expected layout is sketched right below.
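For example, with version 1.2 the `VG` directory would then look roughly like this (a sketch only; the region file name matches what `info/read_regions.py` expects, and the meta data file name is the one typically distributed by Visual Genome):
```
VG
├── images/                        # all images from Part1 and Part2
└── 1.2/
    ├── image_data.json            # image meta data
    └── region_descriptions.json   # region descriptions
```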
62 |
63 | ### Unlimit RAM
64 |
65 | If you have more than 16 GB of RAM, you can preprocess the dataset with the following command.
66 | ```shell
67 | $ cd $ROOT/lib
68 | $ python preprocess.py --version [version] --path [raw_data_path] \
69 | --output_dir [dir] --max_words [max_len]
70 | ```
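For instance, with version 1.2 and the example path above (the output directory and `--max_words` value below are illustrative, not required defaults):
```shell
$ cd $ROOT/lib
$ python preprocess.py --version 1.2 --path /home/user/git/VG \
        --output_dir /home/user/git/VG/precomputed --max_words 10
```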
71 |
72 | ### Limit RAM (Less than 16G)
73 |
74 | If you have RAM `less than 16G`, follow the two steps below.
75 | * First, set up the data path in `info/read_regions.py` accordingly, then run the script with python. It will dump per-image `regions` files into the `REGION_JSON` directory. Processing the 100k+ images takes a while, so be patient.
76 | ```shell
77 | $ cd $ROOT/info
78 | $ python read_regions.py --version [version] --vg_path [raw_data_path]
79 | ```
80 | * Then, set up the data path in `lib/preprocess.py` accordingly. Running the script will dump the `gt_regions` of every image as separate files into the `OUTPUT_DIR` directory.
81 | ```shell
82 | $ cd $ROOT/lib
83 | $ python preprocess.py --version [version] --path [raw_data_path] \
84 | --output_dir [dir] --max_words [max_len] --limit_ram
85 | ```
86 |
87 | ## Compile local libs
88 |
89 | ```shell
90 | $ cd $ROOT/lib
91 | $ make
92 | ```
93 |
94 | ## Train
95 |
96 | Add or modify configurations in `$ROOT/scripts/dense_cap_config.yml`; refer to `lib/config.py` for more configuration details.
97 | ```shell
98 | $ cd $ROOT
99 | $ bash scripts/dense_cap_train.sh [dataset] [net] [ckpt_to_init] [data_dir] [step]
100 | ```
101 |
102 | Parameters:
103 | * dataset: `visual_genome_1.2` or `visual_genome_1.0`.
104 | * net: `res50` or `res101`.
105 | * ckpt_to_init: the pretrained model to initialize from. Refer to [tf_faster_rcnn](https://github.com/endernewton/tf-faster-rcnn) for more details on the init weights.
106 | * data_dir: the data directory where you saved the outputs of the `Preparing data` step.
107 | * step: the stage of the staged training schedule (see the example run below).
108 | - step 1: fix convnet weights
109 | - step 2: fine-tune convnet weights
110 | - step 3: add context fusion, but fix convnet weights
111 | - step 4: fine-tune the whole model.
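For example, a step-1 training run on Visual Genome 1.2 with a ResNet-50 backbone might look like this (the checkpoint and data paths below are placeholders for your own):
```shell
$ cd $ROOT
$ bash scripts/dense_cap_train.sh visual_genome_1.2 res50 \
       /path/to/res50_init.ckpt /path/to/preprocessed_vg 1
```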
112 |
113 | ## Demo
114 |
115 | Create a directory `data/demo`
116 | ```sh
117 | $ mkdir $ROOT/data/demo
118 | ```
119 | Then put the images to be tested in the directory.
120 |
121 | **Download the pretrained model (500k iters)** from [Google Drive](https://drive.google.com/file/d/1yoJGXXpeSpQbU-6WpLsMXFLIka7xpTAy/view?usp=sharing)
122 | or [Jbox](https://jbox.sjtu.edu.cn/l/j5EeUN). Then create an `output`
123 | directory under `$ROOT`
124 | ```sh
125 | $ mkdir $ROOT/output
126 | ```
127 | Extract the downloaded `ckpt.zip` into the `$ROOT/output` directory.
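One way to do this, assuming `ckpt.zip` sits in your current directory (the archive is expected to contain the `ckpt` folder used below):
```sh
$ unzip ckpt.zip -d $ROOT/output
```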
128 | And run
129 | ```sh
130 | $ cd $ROOT
131 | $ bash scripts/dense_cap_demo.sh ./output/ckpt ./output/ckpt/vocabulary.txt
132 | ```
133 | or run
134 | ```sh
135 | $ bash scripts/dense_cap_demo.sh [ckpt_path] [vocab_path]
136 | ```
137 | for your customized checkpoint directory.
138 |
139 | It will create HTML files in `$ROOT/demo`; just open them in a browser.
140 | Or you can use the web-based visualizer created by [karpathy](https://github.com/karpathy):
141 | ```sh
142 | $ cd $ROOT/vis
143 | $ python -m SimpleHTTPServer 8181
144 | ```
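If you run the visualizer under Python 3, the equivalent built-in module is `http.server`:
```sh
$ python3 -m http.server 8181
```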
145 | Then point your web browser to [http://localhost:8181/view_results.html](http://localhost:8181/view_results.html).
146 |
147 | ## TODO:
148 |
149 | - [x] preprocessing dataset.
150 | - [x] roi_data_layer & get data well prepared for feeding.
151 | - [x] proposal layer
152 | - [x] sentence data layer
153 | - [x] embedding layer
154 | - [x] get loc loss and caption loss
155 | - [x] overfit a mini-batch
156 | - [x] context fusion
157 | - [x] add experiment result.
158 |
159 | ## References
160 |
161 | * The Faster R-CNN framework is inherited from the repo [tf-faster-rcnn](https://github.com/endernewton/tf-faster-rcnn) by [endernewton](https://github.com/endernewton)
162 | * The official repo of [densecap](https://github.com/linjieyangsc/densecap)
163 | * [Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling](https://arxiv.org/abs/1611.01462)
164 | * Official tensorflow models - "im2txt".
165 | * Adapted web-based visualizer from [jcjohnson](https://github.com/jcjohnson)'s [densecap repo](https://github.com/jcjohnson/densecap)
166 |
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/InnerPeace-Wu/densecap-tensorflow/2c77998f73832d5bb7324e97e8a7419cbce8398c/__init__.py
--------------------------------------------------------------------------------
/info/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/InnerPeace-Wu/densecap-tensorflow/2c77998f73832d5bb7324e97e8a7419cbce8398c/info/__init__.py
--------------------------------------------------------------------------------
/info/read_regions.py:
--------------------------------------------------------------------------------
1 | # ----------------------------------------------
2 | # DenseCap
3 | # Written by InnerPeace
4 | # ----------------------------------------------
5 | from __future__ import absolute_import
6 | from __future__ import division
7 | from __future__ import print_function
8 |
9 | """read large region description json files"""
10 |
11 | import ijson
12 | import json
13 | import sys
14 | import os
15 | import argparse
16 |
17 | parser = argparse.ArgumentParser(description='Preprocessing visual genome')
18 | parser.add_argument('--version', dest='version', type=float, default=1.2, help='the version of visual genome dataset.')
19 | parser.add_argument('--vg_path', dest='vg_path', type=str, default='/home/joe/git/VG_raw_data', help='directory keeping the raw dataset of visual genome')
20 |
21 | args = parser.parse_args()
22 | VG_VERSION = args.version
23 | VG_PATH = args.vg_path
24 |
25 | VG_REGION_PATH = '%s/%s/region_descriptions.json' % (VG_PATH, VG_VERSION)
26 | REGION_JSON = '%s/%s/regions' % (VG_PATH, VG_VERSION)
27 |
28 |
29 | def read_regions():
30 | if not os.path.exists(REGION_JSON):
31 | os.makedirs(REGION_JSON)
32 | parser = ijson.parse(open(VG_REGION_PATH))
33 | last_value = None
34 | Dic = {}
35 | regions = []
36 | dic = {}
37 | count = 0
38 | for prefix, event, value in parser:
39 | sys.stdout.write('>>> %d \r' % count)
40 | sys.stdout.flush()
41 |         if value == 'regions':  # the 'regions' key of a new image entry: reset the buffers
42 | Dic = {}
43 | regions = []
44 | last_value = None
45 |         elif last_value == 'id' and value:  # the image-level 'id' value arrived: dump this image's regions
46 | count += 1
47 | Dic['regions'] = regions
48 | Dic['id'] = value
49 | with open(REGION_JSON + '/%s.json' % value, 'w') as f:
50 | json.dump(Dic, f)
51 | elif event == 'map_key':
52 | last_value = value
53 |         elif event == 'end_map':  # one region object ended: store it and reset
54 | regions.append(dic)
55 | dic = {}
56 | last_value = None
57 | elif last_value:
58 | dic[last_value] = value
59 |
60 |
61 | if __name__ == '__main__':
62 | read_regions()
63 |
--------------------------------------------------------------------------------
/info/read_splits.py:
--------------------------------------------------------------------------------
1 | # ----------------------------------------------
2 | # DenseCap
3 | # Written by InnerPeace
4 | # ----------------------------------------------
5 | from __future__ import absolute_import
6 | from __future__ import division
7 | from __future__ import print_function
8 |
9 | '''Read splits'''
10 |
11 | import json
12 |
13 | def read_splits():
14 | file = 'densecap_splits.json'
15 | with open(file, 'r') as f:
16 | data = json.load(f)
17 | splits = ['train', 'val', 'test']
18 | for split in splits:
19 | print("%s set has %s examples." % (split, len(data[split])))
20 | with open(split + '.txt', 'w') as f:
21 | for id in data[split]:
22 | f.write("%s\n" % id)
23 |
24 |
25 | if __name__ == '__main__':
26 | read_splits()
27 |
--------------------------------------------------------------------------------
/lib/Makefile:
--------------------------------------------------------------------------------
1 | all:
2 | python setup.py build_ext --inplace
3 | rm -rf build
4 |
--------------------------------------------------------------------------------
/lib/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/InnerPeace-Wu/densecap-tensorflow/2c77998f73832d5bb7324e97e8a7419cbce8398c/lib/__init__.py
--------------------------------------------------------------------------------
/lib/datasets/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/InnerPeace-Wu/densecap-tensorflow/2c77998f73832d5bb7324e97e8a7419cbce8398c/lib/datasets/__init__.py
--------------------------------------------------------------------------------
/lib/datasets/factory.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # DenseCap-Tensorflow
3 | # Written by InnerPeace
4 | # This file is adapted from Linjie's work
5 | # --------------------------------------------------------
6 |
7 | """Factory method for easily getting imdbs by name."""
8 |
9 | __sets = {}
10 |
11 | from visual_genome import visual_genome
12 |
13 |
14 | # Set up visual_genome_ using rpn mode
15 | # for version in ['1.0', '1.2']:
16 | for version in ['1.2']:
17 | for split in ['train', 'val', 'test']:
18 | name = 'vg_{}_{}'.format(version, split)
19 | __sets[name] = (lambda split=split, version=version:
20 | visual_genome(split, version))
21 |
22 |
23 | def get_imdb(name):
24 | """Get an imdb (image database) by name."""
25 |     if name not in __sets:
26 | raise KeyError('Unknown dataset: {}'.format(name))
27 | return __sets[name]()
28 |
29 |
30 | def list_imdbs():
31 | """List all registered imdbs."""
32 | return __sets.keys()
33 |
--------------------------------------------------------------------------------
/lib/dense_cap/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/InnerPeace-Wu/densecap-tensorflow/2c77998f73832d5bb7324e97e8a7419cbce8398c/lib/dense_cap/__init__.py
--------------------------------------------------------------------------------
/lib/dense_cap/beam_search.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # DenseCap-Tensorflow
3 | # Written by InnerPeace
4 | # This file is adapted from Linjie's work
5 | # and Google's im2txt project
6 | # --------------------------------------------------------
7 | from __future__ import absolute_import
8 | from __future__ import division
9 | from __future__ import print_function
10 |
11 | import pdb
12 | import math
13 | from lib.dense_cap.caption_generator import *
14 | import numpy as np
15 | from lib.config import cfg
16 | import tensorflow as tf
17 | from six.moves import xrange
18 |
19 |
20 | def beam_search(sess, net, blobs, im_scales):
21 | # (TODO wu) for now it only works with "concat" mode
22 | # get initial states and rois
23 | if cfg.CONTEXT_FUSION:
24 | cap_state, loc_state, scores, \
25 | rois, gfeat_state = net.feed_image(sess,
26 | blobs['data'],
27 | blobs['im_info'][0])
28 | all_states = np.concatenate((cap_state, loc_state, gfeat_state), axis=1)
29 | else:
30 | cap_state, loc_state, scores, rois = net.feed_image(sess, blobs['data'],
31 | blobs['im_info'][0])
32 | all_states = np.concatenate((cap_state, loc_state), axis=1)
33 |
34 | # proposal boxes
35 | boxes = rois[:, 1:5] / im_scales[0]
36 | proposal_n = rois.shape[0]
37 |
38 | all_partial_caps = []
39 | all_complete_caps = []
40 | beam_size = cfg.TEST.BEAM_SIZE
41 | for i in xrange(proposal_n):
42 | init_beam = Caption(sentence=[cfg.VOCAB_START_ID],
43 | state=all_states[i],
44 | box_pred=[],
45 | logprob=0.0,
46 | score=0.0,
47 | metadata=[""])
48 | partial_cap = TopN(beam_size)
49 | partial_cap.push(init_beam)
50 | complete_cap = TopN(beam_size)
51 | all_partial_caps.append(partial_cap)
52 | all_complete_caps.append(complete_cap)
53 |
54 | for j in xrange(cfg.TIME_STEPS - 1):
55 | all_candidates_len = []
56 | flag = False
57 | for i in xrange(proposal_n):
58 | partial_cap = all_partial_caps[i]
59 | size = partial_cap.size()
60 | all_candidates_len.append(size)
61 | if not size:
62 | continue
63 | partial_cap_list = partial_cap.get_data()
64 | input_feed_i = [c.sentence[-1] for c in partial_cap_list]
65 | state_feed_i = [c.state for c in partial_cap_list]
66 | if not flag:
67 | flag = True
68 | input_feed = np.array(input_feed_i)
69 | state_feed = np.array(state_feed_i)
70 | else:
71 | input_feed = np.concatenate((input_feed, np.array(input_feed_i)))
72 | state_feed = np.concatenate((state_feed, np.array(state_feed_i)))
73 |
74 | if cfg.CONTEXT_FUSION:
75 | cap_feed, loc_feed, gfeat_feed = np.split(state_feed, 3, axis=1)
76 | cap_probs, new_bbox_pred, new_cap_state, new_loc_state, \
77 | new_gfeat_state = net.inference_step(sess, input_feed,
78 | cap_feed, loc_feed, gfeat_feed)
79 | new_state = np.concatenate((new_cap_state, new_loc_state, new_gfeat_state),
80 | axis=1)
81 | else:
82 | cap_feed, loc_feed = np.split(state_feed, 2, axis=1)
83 | cap_probs, new_bbox_pred, new_cap_state, \
84 | new_loc_state = net.inference_step(sess, input_feed,
85 | cap_feed, loc_feed)
86 | new_state = np.concatenate((new_cap_state, new_loc_state), axis=1)
87 |
88 | count = 0
89 | for k in xrange(proposal_n):
90 | l = all_candidates_len[k]
91 | if l == 0:
92 | continue
93 | partial_cap = all_partial_caps[k]
94 | complete_cap = all_complete_caps[k]
95 | partial_cap_list = partial_cap.extract()
96 | partial_cap.reset()
97 | softmax_k = cap_probs[count: count + l]
98 | states_k = new_state[count: count + l]
99 | bbox_pred_k = new_bbox_pred[count: count + l]
100 | count += l
101 | for i, par_cap in enumerate(partial_cap_list):
102 | word_probs = softmax_k[i]
103 | state = states_k[i]
104 | bbox_pred = bbox_pred_k[i]
105 | # For this partial caption, get the beam_size most probable next words.
106 | words_and_probs = list(enumerate(word_probs))
107 | words_and_probs.sort(key=lambda x: -x[1])
108 | words_and_probs = words_and_probs[0: beam_size]
109 | # Each next word gives a new partial caption
110 | for w, p in words_and_probs:
111 | if p < 1e-12:
112 | continue # Avoid log(0)
113 | sentence = par_cap.sentence + [w]
114 | logprob = par_cap.logprob + math.log(p)
115 | sc = logprob
116 | box_pred = par_cap.box_pred
117 | box_pred.append(bbox_pred)
118 | if w == cfg.VOCAB_END_ID:
119 | if cfg.TEST.LN_FACTOR > 0:
120 | sc /= len(sentence) ** cfg.TEST.LN_FACTOR
121 | beam = Caption(sentence, state, box_pred, logprob, sc)
122 | complete_cap.push(beam)
123 | else:
124 | beam = Caption(sentence, state, box_pred, logprob, sc)
125 | partial_cap.push(beam)
126 | captions = []
127 | box_offsets = np.zeros((proposal_n, 4), dtype=np.float32)
128 | for i in xrange(proposal_n):
129 | complete_cap = all_complete_caps[i]
130 | if not complete_cap.size():
131 | complete_cap = all_partial_caps[i]
132 | caps_i = complete_cap.extract(sort=True)
133 | captions.append(caps_i[0].sentence)
134 | box_offsets[i] = caps_i[0].box_pred[-1]
135 |
136 | return scores, box_offsets, captions, boxes
137 |
--------------------------------------------------------------------------------
/lib/dense_cap/vis_whtml.py:
--------------------------------------------------------------------------------
1 | # ----------------------------------------------
2 | # DenseCap
3 | # Written by InnerPeace
4 | # ----------------------------------------------
5 | from __future__ import absolute_import
6 | from __future__ import division
7 | from __future__ import print_function
8 |
9 | import cv2
10 | import os
11 | import numpy as np
12 | from six.moves import xrange
13 |
14 |
15 | def vis_whtml(im_path, im, captions, dets, pre_results=dict(),
16 | thresh=0.5, save_path='./vis/data'):
17 | print("visualizing with pretty html...")
18 | if not os.path.exists(save_path):
19 |         os.makedirs(save_path)
20 |
21 | im_name = im_path.split('/')[-1][:-4]
22 | box_xywh = []
23 | box_caps = []
24 | scores = []
25 | for i in xrange(dets.shape[0]):
26 | if dets[i, -1] > thresh:
27 | box_xywh.append(box2xywh(dets[i, :4].tolist()))
28 | box_caps.append(captions[i])
29 | scores.append(float(dets[i, -1]))
30 |
31 | # save image
32 | im_new = np.copy(im)
33 | cv2.imwrite("%s/%s.jpg" % (save_path, im_name), im_new)
34 | result = {"img_name": "%s.jpg" % im_name,
35 | "scores": scores,
36 | "captions": box_caps,
37 | "boxes": box_xywh}
38 | pre_results["results"] = pre_results.get("results", []) + [result]
39 |
40 | return pre_results
41 |
42 |
43 | def box2xywh(box):
44 | xywh = []
45 | xywh.extend(box[:2])
46 | for i in xrange(2):
47 | xywh.append(box[i+2] - box[i])
48 |
49 | return xywh
50 |
--------------------------------------------------------------------------------
/lib/download_data_vh.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -e
3 | set -x
4 |
5 | cd /valohai/inputs
6 | mv image_1/images.zip image_2/images2.zip /valohai/outputs
7 |
--------------------------------------------------------------------------------
/lib/fast_rcnn/__init__.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Fast R-CNN
3 | # Copyright (c) 2015 Microsoft
4 | # Licensed under The MIT License [see LICENSE for details]
5 | # Written by Ross Girshick
6 | # --------------------------------------------------------
7 |
--------------------------------------------------------------------------------
/lib/fast_rcnn/bbox_transform.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Fast R-CNN
3 | # Copyright (c) 2015 Microsoft
4 | # Licensed under The MIT License [see LICENSE for details]
5 | # Written by Ross Girshick
6 | # --------------------------------------------------------
7 |
8 | import numpy as np
9 |
10 |
11 | def bbox_transform(ex_rois, gt_rois):
12 | ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + 1.0
13 | ex_heights = ex_rois[:, 3] - ex_rois[:, 1] + 1.0
14 | ex_ctr_x = ex_rois[:, 0] + 0.5 * ex_widths
15 | ex_ctr_y = ex_rois[:, 1] + 0.5 * ex_heights
16 |
17 | gt_widths = gt_rois[:, 2] - gt_rois[:, 0] + 1.0
18 | gt_heights = gt_rois[:, 3] - gt_rois[:, 1] + 1.0
19 | gt_ctr_x = gt_rois[:, 0] + 0.5 * gt_widths
20 | gt_ctr_y = gt_rois[:, 1] + 0.5 * gt_heights
21 |
22 | targets_dx = (gt_ctr_x - ex_ctr_x) / ex_widths
23 | targets_dy = (gt_ctr_y - ex_ctr_y) / ex_heights
24 | targets_dw = np.log(gt_widths / ex_widths)
25 | targets_dh = np.log(gt_heights / ex_heights)
26 |
27 | targets = np.vstack(
28 | (targets_dx, targets_dy, targets_dw, targets_dh)).transpose()
29 | return targets
30 |
31 |
32 | def bbox_transform_inv(boxes, deltas):
33 | if boxes.shape[0] == 0:
34 | return np.zeros((0, deltas.shape[1]), dtype=deltas.dtype)
35 |
36 | boxes = boxes.astype(deltas.dtype, copy=False)
37 |
38 | widths = boxes[:, 2] - boxes[:, 0] + 1.0
39 | heights = boxes[:, 3] - boxes[:, 1] + 1.0
40 | ctr_x = boxes[:, 0] + 0.5 * widths
41 | ctr_y = boxes[:, 1] + 0.5 * heights
42 |
43 | dx = deltas[:, 0::4]
44 | dy = deltas[:, 1::4]
45 | dw = deltas[:, 2::4]
46 | dh = deltas[:, 3::4]
47 |
48 | pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis]
49 | pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis]
50 | pred_w = np.exp(dw) * widths[:, np.newaxis]
51 | pred_h = np.exp(dh) * heights[:, np.newaxis]
52 |
53 | pred_boxes = np.zeros(deltas.shape, dtype=deltas.dtype)
54 | # x1
55 | pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w
56 | # y1
57 | pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h
58 | # x2
59 | pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w - 1 # to make it the perfect inversion of bbox_transform
60 | # y2
61 | pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h - 1 # to make it the perfect inversion of bbox_transform
62 |
63 | return pred_boxes
64 |
65 |
66 | def clip_boxes(boxes, im_shape):
67 | """
68 | Clip boxes to image boundaries.
69 | """
70 |
71 | # x1 >= 0
72 | boxes[:, 0::4] = np.maximum(np.minimum(boxes[:, 0::4], im_shape[1] - 1), 0)
73 | # y1 >= 0
74 | boxes[:, 1::4] = np.maximum(np.minimum(boxes[:, 1::4], im_shape[0] - 1), 0)
75 | # x2 < im_shape[1]
76 | boxes[:, 2::4] = np.maximum(np.minimum(boxes[:, 2::4], im_shape[1] - 1), 0)
77 | # y2 < im_shape[0]
78 | boxes[:, 3::4] = np.maximum(np.minimum(boxes[:, 3::4], im_shape[0] - 1), 0)
79 | return boxes
80 |
--------------------------------------------------------------------------------
/lib/fast_rcnn/layer.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # DenseCap-Tensorflow
3 | # Written by InnerPeace
4 | # This file is adapted from Linjie's work and Xinlei's work
5 | # --------------------------------------------------------
6 | from __future__ import absolute_import
7 | from __future__ import division
8 | from __future__ import print_function
9 |
10 | from os.path import join as pjoin
11 | from lib.config import cfg
12 | from lib.fast_rcnn.minibatch import get_minibatch
13 | import numpy as np
14 | import time
15 | import json
16 |
17 |
18 | class RoIDataLayer(object):
19 | """densecap data layer used for training."""
20 |
21 | def __init__(self, roidb, random=False):
22 | """set the roidb to be used by this layer during training."""
23 | self._roidb = roidb
24 | # set a random flag
25 | self._random = random
26 | self._shuffle_roidb_inds()
27 |
28 | def _shuffle_roidb_inds(self):
29 | """Randomly permute the training roidb."""
30 |
31 | # if the random flag is set,
32 | # then the database is shuffled according to system time
33 | # useful for the validation set.
34 | if self._random:
35 | st0 = np.random.get_state()
36 | millis = int(round(time.time() * 1000)) % 4294967259
37 | np.random.seed(millis)
38 |
39 | if not cfg.LIMIT_RAM:
40 | # with sending in the giant roidb list
41 | if cfg.TRAIN.ASPECT_GROUPING:
42 | widths = np.array([r['width'] for r in self._roidb])
43 | heights = np.array([r['height'] for r in self._roidb])
44 | horz = (widths >= heights)
45 | vert = np.logical_not(horz)
46 | horz_inds = np.where(horz)[0]
47 | vert_inds = np.where(vert)[0]
48 | inds = np.hstack((
49 | np.random.permutation(horz_inds),
50 | np.random.permutation(vert_inds)))
51 | inds = np.reshape(inds, (-1, 2))
52 | row_perm = np.random.permutation(np.arange(inds.shape[0]))
53 | inds = np.reshape(inds[row_perm, :], (-1,))
54 | self._perm = inds
55 | else:
56 | self._perm = np.random.permutation(np.arange(len(self._roidb)))
57 | else:
58 | # LIMIT_RAM and 'roidb' is the path to saved gt_roidbs.
59 | index_path = self._roidb + '/image_index.json'
60 | with open(index_path, 'r') as f:
61 | self._image_index = json.load(f)
62 | print("LIMIT_RAM version and load index from {}".format(index_path))
63 | self._perm = np.random.permutation(np.arange(len(self._image_index)))
64 |
65 | # restore the random state
66 | if self._random:
67 | np.random.set_state(st0)
68 |
69 | self._cur = 0
70 |
71 | def _get_next_minibatch_inds(self):
72 | """Return the roidb indices for the next minibatch."""
73 | if self._cur + cfg.TRAIN.IMS_PER_BATCH >= len(self._perm):
74 | self._shuffle_roidb_inds()
75 |
76 | db_inds = self._perm[self._cur:self._cur + cfg.TRAIN.IMS_PER_BATCH]
77 | self._cur += cfg.TRAIN.IMS_PER_BATCH
78 | return db_inds
79 |
80 | def _get_next_minibatch(self):
81 | """Return the blobs to be used for the next minibatch.
82 |
83 | If cfg.TRAIN.USE_PREFETCH is True, then blobs will be computed in a
84 | separate process and made available through self._blob_queue.
85 | """
86 | db_inds = self._get_next_minibatch_inds()
87 | if cfg.LIMIT_RAM:
88 | assert len(db_inds) == 1, "LIMIT_RAM version only support one " \
89 | "image per minibatch."
90 | # it is the exact file path in the 'roidb' directory.
91 | minibatch_db = self._image_index[db_inds[0]]
92 | minibatch_db = pjoin(self._roidb, "%s.pkl" % minibatch_db)
93 | else:
94 | minibatch_db = [self._roidb[i] for i in db_inds]
95 | return get_minibatch(minibatch_db)
96 |
97 | def forward(self):
98 | """Get blobs"""
99 | blobs = self._get_next_minibatch()
100 | return blobs
101 |
--------------------------------------------------------------------------------
/lib/fast_rcnn/minibatch.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # DenseCap-Tensorflow
3 | # Written by InnerPeace
4 | # This file is adapted from Linjie's work and Xinlei's work
5 | # --------------------------------------------------------
6 | # Fast R-CNN
7 | # Copyright (c) 2015 Microsoft
8 | # Licensed under The MIT License [see LICENSE for details]
9 | # Written by Ross Girshick
10 | # --------------------------------------------------------
11 | from __future__ import absolute_import
12 | from __future__ import division
13 | from __future__ import print_function
14 |
15 | """Compute minibatch blobs for training a DenseCap network."""
16 |
17 | import numpy as np
18 | import numpy.random as npr
19 | import cv2
20 | from six.moves import cPickle, xrange
21 | from lib.config import cfg
22 | from lib.utils.blob import prep_im_for_blob, im_list_to_blob
23 |
24 |
25 | def get_minibatch(roidb):
26 | """Given a roidb, construct a minibatch sampled from it."""
27 |
28 | if cfg.LIMIT_RAM:
29 | num_images = 1 # one image per minibatch
30 | else:
31 | num_images = len(roidb)
32 |
33 | # Sample random scales to use for each image in this batch
34 | random_scale_inds = npr.randint(0, high=len(cfg.TRAIN.SCALES),
35 | size=num_images)
36 | assert (cfg.TRAIN.BATCH_SIZE % num_images == 0), \
37 | 'num_images ({}) must divide BATCH_SIZE ({})'. \
38 | format(num_images, cfg.TRAIN.BATCH_SIZE)
39 |
40 | # Get the input image blob, formatted for caffe
41 | im_blob, im_scales, roidb = _get_image_blob(roidb, random_scale_inds)
42 |
43 | blobs = {'data': im_blob}
44 |
45 | if cfg.TRAIN.HAS_RPN:
46 | assert len(im_scales) == 1, "Single batch only"
47 | assert len(roidb) == 1, "Single batch only"
48 | # gt boxes: (x1, y1, x2, y2, cls)
49 | gt_inds = np.where(roidb[0]['gt_classes'] != 0)[0]
50 | gt_boxes = np.empty((len(gt_inds), 5), dtype=np.float32)
51 | gt_boxes[:, 0:4] = roidb[0]['boxes'][gt_inds, :] * im_scales[0]
52 | gt_boxes[:, 4] = roidb[0]['gt_classes'][gt_inds]
53 | # TODO: add "gt_phrases"
54 | blobs['gt_phrases'] = _process_gt_phrases(roidb[0]['gt_phrases'])
55 | blobs['gt_boxes'] = gt_boxes
56 | blobs['im_info'] = np.array(
57 | # TODO: for blob format stick to tf_faster_rcnn version
58 | # [[im_blob.shape[2], im_blob.shape[3], im_scales[0]]],
59 | # [[im_blob.shape[1], im_blob.shape[2], im_scales[0]]],
60 | # make it shape [3,]
61 | [im_blob.shape[1], im_blob.shape[2], im_scales[0]],
62 | dtype=np.float32)
63 | # if cfg.LIMIT_RAM:
64 | # blobs['gt_phrases'] = roidb[0]['gt_phrases']
65 | else: # not using RPN
66 | raise NotImplementedError
67 |
68 | return blobs
69 |
70 |
71 | def _process_gt_phrases(phrases):
72 | """processing gt phrases for blob"""
73 | num_regions = len(phrases)
74 | gt_phrases = np.zeros((num_regions, cfg.MAX_WORDS), dtype=np.int32)
75 | for ix, phra in enumerate(phrases):
76 | l = len(phra)
77 | gt_phrases[ix, :l] = phra
78 |
79 | return gt_phrases
80 |
81 |
82 | def _get_image_blob(roidb, scale_inds):
83 | """Builds an input blob from the images in the roidb at the specified
84 | scales.
85 | """
86 | num_images = len(scale_inds)
87 | processed_ims = []
88 | im_scales = []
89 | if cfg.LIMIT_RAM:
90 | # roidb is the pickle file path
91 | assert num_images == 1, "LIMIT_RAM version, it has to be one image."
92 | with open(roidb, 'rb') as f:
93 | roidb = [cPickle.load(f)]
94 |
95 | for i in xrange(num_images):
96 | im = cv2.imread(roidb[i]['image'])
97 | if roidb[i]['flipped']:
98 | im = im[:, ::-1, :]
99 | target_size = cfg.TRAIN.SCALES[scale_inds[i]]
100 | im, im_scale = prep_im_for_blob(im, cfg.PIXEL_MEANS, target_size,
101 | cfg.TRAIN.MAX_SIZE)
102 | im_scales.append(im_scale)
103 | processed_ims.append(im)
104 |
105 | # Create a blob to hold the input images
106 | blob = im_list_to_blob(processed_ims)
107 |
108 | return blob, im_scales, roidb
109 |
110 |
--------------------------------------------------------------------------------
/lib/fast_rcnn/nms_wrapper.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Fast R-CNN
3 | # Copyright (c) 2015 Microsoft
4 | # Licensed under The MIT License [see LICENSE for details]
5 | # Written by Ross Girshick
6 | # --------------------------------------------------------
7 | from __future__ import absolute_import
8 | from __future__ import division
9 | from __future__ import print_function
10 |
11 | from lib.config import cfg
12 | from lib.nms.gpu_nms import gpu_nms
13 | from lib.nms.cpu_nms import cpu_nms
14 |
15 | def nms(dets, thresh, force_cpu=False):
16 | """Dispatch to either CPU or GPU NMS implementations."""
17 |
18 | if dets.shape[0] == 0:
19 | return []
20 | # print "gpu_id used by nms is: %d" % cfg.GPU_ID
21 | if cfg.USE_GPU_NMS and not force_cpu:
22 | return gpu_nms(dets, thresh, device_id=cfg.GPU_ID)
23 | else:
24 | return cpu_nms(dets, thresh)
25 |
--------------------------------------------------------------------------------
/lib/fast_rcnn/roidb.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Fast R-CNN
3 | # Copyright (c) 2015 Microsoft
4 | # Licensed under The MIT License [see LICENSE for details]
5 | # Written by Ross Girshick
6 | # --------------------------------------------------------
7 | from __future__ import absolute_import
8 | from __future__ import division
9 | from __future__ import print_function
10 |
11 | """Transform a roidb into a trainable roidb by adding a bunch of metadata."""
12 |
13 | # import sys
14 | # sys.path.append("..")
15 | from six.moves import xrange
16 | import numpy as np
17 | from lib.config import cfg
18 | from lib.fast_rcnn.bbox_transform import bbox_transform
19 | from lib.utils.cython_bbox import bbox_overlaps
20 | from PIL import Image
21 |
22 |
23 | def prepare_roidb(imdb):
24 | """Enrich the imdb's roidb by adding some derived quantities that
25 | are useful for training. This function precomputes the maximum
26 | overlap, taken over ground-truth boxes, between each ROI and
27 | each ground-truth box. The class with maximum overlap is also
28 | recorded.
29 | """
30 | sizes = [Image.open(imdb.image_path_at(i)).size
31 | for i in xrange(imdb.num_images)]
32 | roidb = imdb.roidb
33 | for i in xrange(len(imdb.image_index)):
34 | roidb[i]['image'] = imdb.image_path_at(i)
35 | roidb[i]['width'] = sizes[i][0]
36 | roidb[i]['height'] = sizes[i][1]
37 | # need gt_overlaps as a dense array for argmax
38 | gt_overlaps = roidb[i]['gt_overlaps'].toarray()
39 | # max overlap with gt over classes (columns)
40 | max_overlaps = gt_overlaps.max(axis=1)
41 | # gt class that had the max overlap
42 | max_classes = gt_overlaps.argmax(axis=1)
43 | roidb[i]['max_classes'] = max_classes
44 | roidb[i]['max_overlaps'] = max_overlaps
45 | # sanity checks
46 | # max overlap of 0 => class should be zero (background)
47 | zero_inds = np.where(max_overlaps == 0)[0]
48 | assert all(max_classes[zero_inds] == 0)
49 | # max overlap > 0 => class should not be zero (must be a fg class)
50 | # nonzero_inds = np.where(max_overlaps > 0)[0]
51 | # assert all(max_classes[nonzero_inds] != 0)
52 |
53 |
54 | def add_bbox_regression_targets(roidb):
55 | """Add information needed to train bounding-box regressors."""
56 | assert len(roidb) > 0
57 | assert 'max_classes' in roidb[0], 'Did you call prepare_roidb first?'
58 |
59 | num_images = len(roidb)
60 | # Infer number of classes from the number of columns in gt_overlaps
61 | num_classes = roidb[0]['gt_overlaps'].shape[1]
62 | for im_i in xrange(num_images):
63 | rois = roidb[im_i]['boxes']
64 | max_overlaps = roidb[im_i]['max_overlaps']
65 | max_classes = roidb[im_i]['max_classes']
66 | roidb[im_i]['bbox_targets'] = \
67 | _compute_targets(rois, max_overlaps, max_classes)
68 |
69 | if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
70 | # Use fixed / precomputed "means" and "stds" instead of empirical values
71 | means = np.tile(
72 | np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS), (num_classes, 1))
73 | stds = np.tile(
74 | np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS), (num_classes, 1))
75 | else:
76 | # Compute values needed for means and stds
77 | # var(x) = E(x^2) - E(x)^2
78 | class_counts = np.zeros((num_classes, 1)) + cfg.EPS
79 | sums = np.zeros((num_classes, 4))
80 | squared_sums = np.zeros((num_classes, 4))
81 | for im_i in xrange(num_images):
82 | targets = roidb[im_i]['bbox_targets']
83 | for cls in xrange(1, num_classes):
84 | cls_inds = np.where(targets[:, 0] == cls)[0]
85 | if cls_inds.size > 0:
86 | class_counts[cls] += cls_inds.size
87 | sums[cls, :] += targets[cls_inds, 1:].sum(axis=0)
88 | squared_sums[cls, :] += \
89 | (targets[cls_inds, 1:] ** 2).sum(axis=0)
90 |
91 | means = sums / class_counts
92 | stds = np.sqrt(squared_sums / class_counts - means ** 2)
93 |
94 | print('bbox target means:')
95 | print(means)
96 | print(means[1:, :].mean(axis=0)) # ignore bg class)
97 | print('bbox target stdevs:')
98 | print(stds)
99 | print(stds[1:, :].mean(axis=0)) # ignore bg class)
100 |
101 | # Normalize targets
102 | if cfg.TRAIN.BBOX_NORMALIZE_TARGETS:
103 | print("Normalizing targets")
104 | for im_i in xrange(num_images):
105 | targets = roidb[im_i]['bbox_targets']
106 | for cls in xrange(1, num_classes):
107 | cls_inds = np.where(targets[:, 0] == cls)[0]
108 | roidb[im_i]['bbox_targets'][cls_inds, 1:] -= means[cls, :]
109 | roidb[im_i]['bbox_targets'][cls_inds, 1:] /= stds[cls, :]
110 | else:
111 | print("NOT normalizing targets")
112 |
113 | # These values will be needed for making predictions
114 | # (the predicts will need to be unnormalized and uncentered)
115 | return means.ravel(), stds.ravel()
116 |
117 |
118 | def _compute_targets(rois, overlaps, labels):
119 | """Compute bounding-box regression targets for an image."""
120 | # Indices of ground-truth ROIs
121 | gt_inds = np.where(overlaps == 1)[0]
122 | if len(gt_inds) == 0:
123 | # Bail if the image has no ground-truth ROIs
124 | return np.zeros((rois.shape[0], 5), dtype=np.float32)
125 | # Indices of examples for which we try to make predictions
126 | ex_inds = np.where(overlaps >= cfg.TRAIN.BBOX_THRESH)[0]
127 |
128 | # Get IoU overlap between each ex ROI and gt ROI
129 | ex_gt_overlaps = bbox_overlaps(
130 | np.ascontiguousarray(rois[ex_inds, :], dtype=np.float),
131 | np.ascontiguousarray(rois[gt_inds, :], dtype=np.float))
132 |
133 | # Find which gt ROI each ex ROI has max overlap with:
134 | # this will be the ex ROI's gt target
135 | gt_assignment = ex_gt_overlaps.argmax(axis=1)
136 | gt_rois = rois[gt_inds[gt_assignment], :]
137 | ex_rois = rois[ex_inds, :]
138 |
139 | targets = np.zeros((rois.shape[0], 5), dtype=np.float32)
140 | targets[ex_inds, 0] = labels[ex_inds]
141 | targets[ex_inds, 1:] = bbox_transform(ex_rois, gt_rois)
142 | return targets
143 |
--------------------------------------------------------------------------------
/lib/layers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/InnerPeace-Wu/densecap-tensorflow/2c77998f73832d5bb7324e97e8a7419cbce8398c/lib/layers/__init__.py
--------------------------------------------------------------------------------
/lib/layers/anchor_target_layer.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Faster R-CNN
3 | # Copyright (c) 2015 Microsoft
4 | # Licensed under The MIT License [see LICENSE for details]
5 | # Written by Ross Girshick and Xinlei Chen
6 | # --------------------------------------------------------
7 | from __future__ import absolute_import
8 | from __future__ import division
9 | from __future__ import print_function
10 |
11 | import os
12 | from lib.config import cfg
13 | import numpy as np
14 | import numpy.random as npr
15 | from lib.utils.cython_bbox import bbox_overlaps
16 | from lib.fast_rcnn.bbox_transform import bbox_transform
17 |
18 |
19 | def anchor_target_layer(rpn_cls_score, gt_boxes, im_info, _feat_stride, all_anchors, num_anchors):
20 | """Same as the anchor target layer in original Fast/er RCNN """
21 |
22 | A = num_anchors
23 | total_anchors = all_anchors.shape[0]
24 | K = total_anchors / num_anchors
25 |
26 | # allow boxes to sit over the edge by a small amount
27 | _allowed_border = 0
28 |
29 | # map of shape (..., H, W)
30 | height, width = rpn_cls_score.shape[1:3]
31 |
32 | # only keep anchors inside the image
33 | inds_inside = np.where(
34 | (all_anchors[:, 0] >= -_allowed_border) &
35 | (all_anchors[:, 1] >= -_allowed_border) &
36 | (all_anchors[:, 2] < im_info[1] + _allowed_border) & # width
37 | (all_anchors[:, 3] < im_info[0] + _allowed_border) # height
38 | )[0]
39 |
40 | # keep only inside anchors
41 | anchors = all_anchors[inds_inside, :]
42 |
43 | # label: 1 is positive, 0 is negative, -1 is dont care
44 | labels = np.empty((len(inds_inside),), dtype=np.float32)
45 | labels.fill(-1)
46 |
47 | # overlaps between the anchors and the gt boxes
48 | # overlaps (ex, gt)
49 | overlaps = bbox_overlaps(
50 | np.ascontiguousarray(anchors, dtype=np.float),
51 | np.ascontiguousarray(gt_boxes, dtype=np.float))
52 | argmax_overlaps = overlaps.argmax(axis=1)
53 | max_overlaps = overlaps[np.arange(len(inds_inside)), argmax_overlaps]
54 | gt_argmax_overlaps = overlaps.argmax(axis=0)
55 | gt_max_overlaps = overlaps[gt_argmax_overlaps,
56 | np.arange(overlaps.shape[1])]
57 | gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0]
58 |
59 | if not cfg.TRAIN.RPN_CLOBBER_POSITIVES:
60 | # assign bg labels first so that positive labels can clobber them
61 | # first set the negatives
62 | labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0
63 |
64 | # fg label: for each gt, anchor with highest overlap
65 | labels[gt_argmax_overlaps] = 1
66 |
67 | # fg label: above threshold IOU
68 | labels[max_overlaps >= cfg.TRAIN.RPN_POSITIVE_OVERLAP] = 1
69 |
70 | if cfg.TRAIN.RPN_CLOBBER_POSITIVES:
71 | # assign bg labels last so that negative labels can clobber positives
72 | labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0
73 |
74 | # subsample positive labels if we have too many
75 | num_fg = int(cfg.TRAIN.RPN_FG_FRACTION * cfg.TRAIN.RPN_BATCHSIZE)
76 | fg_inds = np.where(labels == 1)[0]
77 | if len(fg_inds) > num_fg:
78 | disable_inds = npr.choice(
79 | fg_inds, size=(len(fg_inds) - num_fg), replace=False)
80 | labels[disable_inds] = -1
81 |
82 | # subsample negative labels if we have too many
83 | num_bg = cfg.TRAIN.RPN_BATCHSIZE - np.sum(labels == 1)
84 | bg_inds = np.where(labels == 0)[0]
85 | if len(bg_inds) > num_bg:
86 | disable_inds = npr.choice(
87 | bg_inds, size=(len(bg_inds) - num_bg), replace=False)
88 | labels[disable_inds] = -1
89 |
90 | bbox_targets = np.zeros((len(inds_inside), 4), dtype=np.float32)
91 | bbox_targets = _compute_targets(anchors, gt_boxes[argmax_overlaps, :])
92 |
93 | bbox_inside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32)
94 | # only the positive ones have regression targets
95 | bbox_inside_weights[labels == 1, :] = np.array(cfg.TRAIN.RPN_BBOX_INSIDE_WEIGHTS)
96 |
97 | bbox_outside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32)
98 | if cfg.TRAIN.RPN_POSITIVE_WEIGHT < 0:
99 | # uniform weighting of examples (given non-uniform sampling)
100 | num_examples = np.sum(labels >= 0)
101 | positive_weights = np.ones((1, 4)) * 1.0 / num_examples
102 | negative_weights = np.ones((1, 4)) * 1.0 / num_examples
103 | else:
104 | assert ((cfg.TRAIN.RPN_POSITIVE_WEIGHT > 0) &
105 | (cfg.TRAIN.RPN_POSITIVE_WEIGHT < 1))
106 | positive_weights = (cfg.TRAIN.RPN_POSITIVE_WEIGHT /
107 | np.sum(labels == 1))
108 | negative_weights = ((1.0 - cfg.TRAIN.RPN_POSITIVE_WEIGHT) /
109 | np.sum(labels == 0))
110 | bbox_outside_weights[labels == 1, :] = positive_weights
111 | bbox_outside_weights[labels == 0, :] = negative_weights
112 |
113 | # map up to original set of anchors
114 | labels = _unmap(labels, total_anchors, inds_inside, fill=-1)
115 | bbox_targets = _unmap(bbox_targets, total_anchors, inds_inside, fill=0)
116 | bbox_inside_weights = _unmap(bbox_inside_weights, total_anchors, inds_inside, fill=0)
117 | bbox_outside_weights = _unmap(bbox_outside_weights, total_anchors, inds_inside, fill=0)
118 |
119 | # labels
120 | labels = labels.reshape((1, height, width, A)).transpose(0, 3, 1, 2)
121 | labels = labels.reshape((1, 1, A * height, width))
122 | rpn_labels = labels
123 |
124 | # bbox_targets
125 | bbox_targets = bbox_targets \
126 | .reshape((1, height, width, A * 4))
127 |
128 | rpn_bbox_targets = bbox_targets
129 | # bbox_inside_weights
130 | bbox_inside_weights = bbox_inside_weights \
131 | .reshape((1, height, width, A * 4))
132 |
133 | rpn_bbox_inside_weights = bbox_inside_weights
134 |
135 | # bbox_outside_weights
136 | bbox_outside_weights = bbox_outside_weights \
137 | .reshape((1, height, width, A * 4))
138 |
139 | rpn_bbox_outside_weights = bbox_outside_weights
140 | return rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights
141 |
142 |
143 | def _unmap(data, count, inds, fill=0):
144 | """ Unmap a subset of item (data) back to the original set of items (of
145 | size count) """
146 | if len(data.shape) == 1:
147 | ret = np.empty((count,), dtype=np.float32)
148 | ret.fill(fill)
149 | ret[inds] = data
150 | else:
151 | ret = np.empty((count,) + data.shape[1:], dtype=np.float32)
152 | ret.fill(fill)
153 | ret[inds, :] = data
154 | return ret
155 |
156 |
157 | def _compute_targets(ex_rois, gt_rois):
158 | """Compute bounding-box regression targets for an image."""
159 |
160 | assert ex_rois.shape[0] == gt_rois.shape[0]
161 | assert ex_rois.shape[1] == 4
162 | assert gt_rois.shape[1] == 5
163 |
164 | return bbox_transform(ex_rois, gt_rois[:, :4]).astype(np.float32, copy=False)
165 |
--------------------------------------------------------------------------------
/lib/layers/generate_anchors.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Faster R-CNN
3 | # Copyright (c) 2015 Microsoft
4 | # Licensed under The MIT License [see LICENSE for details]
5 | # Written by Ross Girshick and Sean Bell
6 | # --------------------------------------------------------
7 | from __future__ import absolute_import
8 | from __future__ import division
9 | from __future__ import print_function
10 |
11 | import numpy as np
12 |
13 |
14 | # Verify that we compute the same anchors as Shaoqing's matlab implementation:
15 | #
16 | # >> load output/rpn_cachedir/faster_rcnn_VOC2007_ZF_stage1_rpn/anchors.mat
17 | # >> anchors
18 | #
19 | # anchors =
20 | #
21 | # -83 -39 100 56
22 | # -175 -87 192 104
23 | # -359 -183 376 200
24 | # -55 -55 72 72
25 | # -119 -119 136 136
26 | # -247 -247 264 264
27 | # -35 -79 52 96
28 | # -79 -167 96 184
29 | # -167 -343 184 360
30 |
31 | # array([[ -83., -39., 100., 56.],
32 | # [-175., -87., 192., 104.],
33 | # [-359., -183., 376., 200.],
34 | # [ -55., -55., 72., 72.],
35 | # [-119., -119., 136., 136.],
36 | # [-247., -247., 264., 264.],
37 | # [ -35., -79., 52., 96.],
38 | # [ -79., -167., 96., 184.],
39 | # [-167., -343., 184., 360.]])
40 |
41 | def generate_anchors(base_size=16, ratios=[0.5, 1, 2],
42 | scales=2 ** np.arange(3, 6)):
43 | """
44 | Generate anchor (reference) windows by enumerating aspect ratios X
45 | scales wrt a reference (0, 0, 15, 15) window.
46 | """
47 |
48 | base_anchor = np.array([1, 1, base_size, base_size]) - 1
49 | ratio_anchors = _ratio_enum(base_anchor, ratios)
50 | anchors = np.vstack([_scale_enum(ratio_anchors[i, :], scales)
51 | for i in range(ratio_anchors.shape[0])])
52 | return anchors
53 |
54 |
55 | def _whctrs(anchor):
56 | """
57 | Return width, height, x center, and y center for an anchor (window).
58 | """
59 |
60 | w = anchor[2] - anchor[0] + 1
61 | h = anchor[3] - anchor[1] + 1
62 | x_ctr = anchor[0] + 0.5 * (w - 1)
63 | y_ctr = anchor[1] + 0.5 * (h - 1)
64 | return w, h, x_ctr, y_ctr
65 |
66 |
67 | def _mkanchors(ws, hs, x_ctr, y_ctr):
68 | """
69 | Given a vector of widths (ws) and heights (hs) around a center
70 | (x_ctr, y_ctr), output a set of anchors (windows).
71 | """
72 |
73 | ws = ws[:, np.newaxis]
74 | hs = hs[:, np.newaxis]
75 | anchors = np.hstack((x_ctr - 0.5 * (ws - 1),
76 | y_ctr - 0.5 * (hs - 1),
77 | x_ctr + 0.5 * (ws - 1),
78 | y_ctr + 0.5 * (hs - 1)))
79 | return anchors
80 |
81 |
82 | def _ratio_enum(anchor, ratios):
83 | """
84 | Enumerate a set of anchors for each aspect ratio wrt an anchor.
85 | """
86 |
87 | w, h, x_ctr, y_ctr = _whctrs(anchor)
88 | size = w * h
89 | size_ratios = size / ratios
90 | ws = np.round(np.sqrt(size_ratios))
91 | hs = np.round(ws * ratios)
92 | anchors = _mkanchors(ws, hs, x_ctr, y_ctr)
93 | return anchors
94 |
95 |
96 | def _scale_enum(anchor, scales):
97 | """
98 | Enumerate a set of anchors for each scale wrt an anchor.
99 | """
100 |
101 | w, h, x_ctr, y_ctr = _whctrs(anchor)
102 | ws = w * scales
103 | hs = h * scales
104 | anchors = _mkanchors(ws, hs, x_ctr, y_ctr)
105 | return anchors
106 |
107 |
108 | if __name__ == '__main__':
109 | import time
110 |
111 | t = time.time()
112 | a = generate_anchors()
113 | print(time.time() - t)
114 | print(a)
115 | from IPython import embed;
116 |
117 | embed()
118 |
--------------------------------------------------------------------------------
/lib/layers/global_roi_layer.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # DenseCap-Tensorflow
3 | # Written by InnerPeace
4 | # This file is adapted from Xinlei's work
5 | # --------------------------------------------------------
6 | from __future__ import absolute_import
7 | from __future__ import division
8 | from __future__ import print_function
9 |
10 | import numpy as np
11 |
12 |
13 | def GlobalRoILayer(im_info):
14 | """
15 | Set up the global RoI
16 | """
17 | return np.array([0., 0., 0., im_info[1] - 1, im_info[0] - 1])
18 |
--------------------------------------------------------------------------------
/lib/layers/proposal_layer.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Faster R-CNN
3 | # Licensed under The MIT License [see LICENSE for details]
4 | # Written by Ross Girshick and Xinlei Chen
5 | # --------------------------------------------------------
6 | from __future__ import absolute_import
7 | from __future__ import division
8 | from __future__ import print_function
9 |
10 | import numpy as np
11 | from lib.config import cfg
12 | from lib.fast_rcnn.bbox_transform import bbox_transform_inv, clip_boxes
13 | from lib.fast_rcnn.nms_wrapper import nms
14 |
15 |
16 | def proposal_layer(rpn_cls_prob, rpn_bbox_pred, im_info, cfg_key, _feat_stride, anchors, num_anchors):
17 | """A simplified version compared to fast/er RCNN
18 | For details please see the technical report
19 | """
20 | if type(cfg_key) == bytes:
21 | cfg_key = cfg_key.decode('utf-8')
22 | pre_nms_topN = cfg[cfg_key].RPN_PRE_NMS_TOP_N
23 | post_nms_topN = cfg[cfg_key].RPN_POST_NMS_TOP_N
24 | nms_thresh = cfg[cfg_key].RPN_NMS_THRESH
25 |
26 | # Get the scores and bounding boxes
27 | scores = rpn_cls_prob[:, :, :, num_anchors:]
28 | rpn_bbox_pred = rpn_bbox_pred.reshape((-1, 4))
29 | scores = scores.reshape((-1, 1))
30 | proposals = bbox_transform_inv(anchors, rpn_bbox_pred)
31 | if cfg.DEBUG_ALL:
32 |         print('number of proposals before clipping boxes to the image boundary: {}'.format(
33 | proposals.shape[0]
34 | ))
35 | proposals = clip_boxes(proposals, im_info[:2])
36 |
37 | # remove predicted boxes with either height or width < threshold
38 | # (NOTE: convert min_size to input image scale stored in im_info[2])
39 | if cfg.FILTER_SMALL_BOX:
40 | min_size = cfg[cfg_key].RPN_MIN_SIZE
41 | keep = _filter_boxes(proposals, min_size * im_info[2])
42 | proposals = proposals[keep, :]
43 | scores = scores[keep]
44 |
45 | # Pick the top region proposals
46 | order = scores.ravel().argsort()[::-1]
47 | if pre_nms_topN > 0:
48 | order = order[:pre_nms_topN]
49 | proposals = proposals[order, :]
50 | scores = scores[order]
51 |
52 | # Non-maximal suppression
53 | if cfg.DEBUG_ALL:
54 | print("number of proposals before nms: {}".format(proposals.shape[0]))
55 | keep = nms(np.hstack((proposals, scores)), nms_thresh)
56 | if cfg.DEBUG_ALL:
57 | print("number of proposals after nms: {}".format(len(keep)))
58 |
59 |     # Pick the top region proposals after NMS
60 | if post_nms_topN > 0:
61 | keep = keep[:post_nms_topN]
62 | proposals = proposals[keep, :]
63 | scores = scores[keep]
64 |
65 | # Only support single image as input
66 | batch_inds = np.zeros((proposals.shape[0], 1), dtype=np.float32)
67 | blob = np.hstack((batch_inds, proposals.astype(np.float32, copy=False)))
68 |
69 | return blob, scores
70 |
71 |
72 | def _filter_boxes(boxes, min_size):
73 | """Remove all boxes with any side smaller than min_size."""
74 |
75 | ws = boxes[:, 2] - boxes[:, 0] + 1
76 | hs = boxes[:, 3] - boxes[:, 1] + 1
77 | keep = np.where((ws >= min_size) & (hs >= min_size))[0]
78 | return keep
79 |
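A minimal sketch of the small-box filter above (illustrative only): with min_size=16, a 1x1 proposal is dropped while a 32x32 proposal is kept.

    import numpy as np

    boxes = np.array([[0., 0., 0., 0.],        # 1x1 box
                      [0., 0., 31., 31.]])     # 32x32 box
    print(_filter_boxes(boxes, 16))            # -> [1]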
--------------------------------------------------------------------------------
/lib/layers/proposal_target_layer.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Faster R-CNN
3 | # Copyright (c) 2015 Microsoft
4 | # Licensed under The MIT License [see LICENSE for details]
5 | # Written by Ross Girshick, Sean Bell and Xinlei Chen
6 | # --------------------------------------------------------
7 | from __future__ import absolute_import
8 | from __future__ import division
9 | from __future__ import print_function
10 |
11 | import numpy as np
12 | import numpy.random as npr
13 | from lib.config import cfg
14 | from lib.fast_rcnn.bbox_transform import bbox_transform
15 | from lib.utils.cython_bbox import bbox_overlaps
16 |
17 |
18 | def proposal_target_layer(rpn_rois, rpn_scores, gt_boxes, _num_classes):
19 | """
20 | Assign object detection proposals to ground-truth targets. Produces proposal
21 | classification labels and bounding-box regression targets.
22 | """
23 |
24 | # Proposal ROIs (0, x1, y1, x2, y2) coming from RPN
25 | # (i.e., layers.proposal_layer.ProposalLayer), or any other source
26 | all_rois = rpn_rois
27 | all_scores = rpn_scores
28 |
29 | # Include ground-truth boxes in the set of candidate rois
30 | if cfg.TRAIN.USE_GT:
31 | zeros = np.zeros((gt_boxes.shape[0], 1), dtype=gt_boxes.dtype)
32 | all_rois = np.vstack(
33 | (all_rois, np.hstack((zeros, gt_boxes[:, :-1])))
34 | )
35 |         # not sure if this is a wise append, but the scores are not used anyway
36 | all_scores = np.vstack((all_scores, zeros))
37 |
38 | num_images = 1
39 |     rois_per_image = cfg.TRAIN.BATCH_SIZE // num_images
40 | fg_rois_per_image = np.round(cfg.TRAIN.FG_FRACTION * rois_per_image)
41 |
42 | # Sample rois with classification labels and bounding box regression
43 | # targets
44 | labels, rois, roi_scores, bbox_targets, bbox_inside_weights = _sample_rois(
45 | all_rois, all_scores, gt_boxes, fg_rois_per_image,
46 | rois_per_image, _num_classes)
47 |
48 | rois = rois.reshape(-1, 5)
49 | roi_scores = roi_scores.reshape(-1)
50 | labels = labels.reshape(-1, 1)
51 | bbox_targets = bbox_targets.reshape(-1, _num_classes * 4)
52 | bbox_inside_weights = bbox_inside_weights.reshape(-1, _num_classes * 4)
53 | bbox_outside_weights = np.array(bbox_inside_weights > 0).astype(np.float32)
54 |
55 | return rois, roi_scores, labels, bbox_targets, bbox_inside_weights, bbox_outside_weights
56 |
57 |
58 | def _get_bbox_regression_labels(bbox_target_data, num_classes):
59 | """Bounding-box regression targets (bbox_target_data) are stored in a
60 | compact form N x (class, tx, ty, tw, th)
61 |
62 | This function expands those targets into the 4-of-4*K representation used
63 | by the network (i.e. only one class has non-zero targets).
64 |
65 | Returns:
66 | bbox_target (ndarray): N x 4K blob of regression targets
67 | bbox_inside_weights (ndarray): N x 4K blob of loss weights
68 | """
69 |
70 | clss = bbox_target_data[:, 0]
71 | bbox_targets = np.zeros((clss.size, 4 * num_classes), dtype=np.float32)
72 | bbox_inside_weights = np.zeros(bbox_targets.shape, dtype=np.float32)
73 | inds = np.where(clss > 0)[0]
74 | for ind in inds:
75 | cls = clss[ind]
76 | start = int(4 * cls)
77 | end = start + 4
78 | bbox_targets[ind, start:end] = bbox_target_data[ind, 1:]
79 | bbox_inside_weights[ind, start:end] = cfg.TRAIN.BBOX_INSIDE_WEIGHTS
80 | return bbox_targets, bbox_inside_weights
81 |
82 |
83 | def _compute_targets(ex_rois, gt_rois, labels):
84 | """Compute bounding-box regression targets for an image."""
85 |
86 | assert ex_rois.shape[0] == gt_rois.shape[0]
87 | assert ex_rois.shape[1] == 4
88 | assert gt_rois.shape[1] == 4
89 |
90 | targets = bbox_transform(ex_rois, gt_rois)
91 | if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
92 | # Optionally normalize targets by a precomputed mean and stdev
93 | targets = ((targets - np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS))
94 | / np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS))
95 | return np.hstack(
96 | (labels[:, np.newaxis], targets)).astype(np.float32, copy=False)
97 |
98 |
99 | def _sample_rois(all_rois, all_scores, gt_boxes, fg_rois_per_image, rois_per_image, num_classes):
100 | """Generate a random sample of RoIs comprising foreground and background
101 | examples.
102 | """
103 | # overlaps: (rois x gt_boxes)
104 | overlaps = bbox_overlaps(
105 | np.ascontiguousarray(all_rois[:, 1:5], dtype=np.float),
106 | np.ascontiguousarray(gt_boxes[:, :4], dtype=np.float))
107 | gt_assignment = overlaps.argmax(axis=1)
108 | max_overlaps = overlaps.max(axis=1)
109 | labels = gt_boxes[gt_assignment, 4]
110 |
111 | # Select foreground RoIs as those with >= FG_THRESH overlap
112 | fg_inds = np.where(max_overlaps >= cfg.TRAIN.FG_THRESH)[0]
113 | # Guard against the case when an image has fewer than fg_rois_per_image
114 | # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI)
115 | bg_inds = np.where((max_overlaps < cfg.TRAIN.BG_THRESH_HI) &
116 | (max_overlaps >= cfg.TRAIN.BG_THRESH_LO))[0]
117 |
118 | # Small modification to the original version where we ensure a fixed number of regions are sampled
119 | if fg_inds.size > 0 and bg_inds.size > 0:
120 | fg_rois_per_image = min(fg_rois_per_image, fg_inds.size)
121 | fg_inds = npr.choice(fg_inds, size=int(fg_rois_per_image), replace=False)
122 | bg_rois_per_image = rois_per_image - fg_rois_per_image
123 | to_replace = bg_inds.size < bg_rois_per_image
124 | bg_inds = npr.choice(bg_inds, size=int(bg_rois_per_image), replace=to_replace)
125 | elif fg_inds.size > 0:
126 | to_replace = fg_inds.size < rois_per_image
127 | fg_inds = npr.choice(fg_inds, size=int(rois_per_image), replace=to_replace)
128 | fg_rois_per_image = rois_per_image
129 | elif bg_inds.size > 0:
130 | to_replace = bg_inds.size < rois_per_image
131 | bg_inds = npr.choice(bg_inds, size=int(rois_per_image), replace=to_replace)
132 | fg_rois_per_image = 0
133 | else:
134 | import pdb
135 | pdb.set_trace()
136 |
137 | # The indices that we're selecting (both fg and bg)
138 | keep_inds = np.append(fg_inds, bg_inds)
139 | # Select sampled values from various arrays:
140 | labels = labels[keep_inds]
141 | # Clamp labels for the background RoIs to 0
142 | labels[int(fg_rois_per_image):] = 0
143 | rois = all_rois[keep_inds]
144 | roi_scores = all_scores[keep_inds]
145 |
146 | bbox_target_data = _compute_targets(
147 | rois[:, 1:5], gt_boxes[gt_assignment[keep_inds], :4], labels)
148 |
149 | bbox_targets, bbox_inside_weights = \
150 | _get_bbox_regression_labels(bbox_target_data, num_classes)
151 |
152 | return labels, rois, roi_scores, bbox_targets, bbox_inside_weights
153 |
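To illustrate _get_bbox_regression_labels (a sketch; assumes cfg.TRAIN.BBOX_INSIDE_WEIGHTS = (1.0, 1.0, 1.0, 1.0)): a single RoI labelled class 2 with num_classes=3 gets its four targets written into columns 8..11 of a 12-wide row, and every other column stays zero.

    import numpy as np

    bbox_target_data = np.array([[2., 0.1, 0.2, 0.3, 0.4]], dtype=np.float32)
    targets, inside_w = _get_bbox_regression_labels(bbox_target_data, num_classes=3)
    # targets[0, 8:12]  == [0.1, 0.2, 0.3, 0.4]; everything else is 0
    # inside_w[0, 8:12] == [1., 1., 1., 1.]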
--------------------------------------------------------------------------------
/lib/layers/proposal_target_single_class_layer.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # DenseCap-Tensorflow
3 | # Written by InnerPeace
4 | # This file is adapted from Xinlei's work
5 | # --------------------------------------------------------
6 | # Faster R-CNN
7 | # Copyright (c) 2015 Microsoft
8 | # Licensed under The MIT License [see LICENSE for details]
9 | # Written by Ross Girshick, Sean Bell and Xinlei Chen
10 | # --------------------------------------------------------
11 | from __future__ import absolute_import
12 | from __future__ import division
13 | from __future__ import print_function
14 |
15 | import numpy as np
16 | import numpy.random as npr
17 | from lib.config import cfg
18 | from lib.fast_rcnn.bbox_transform import bbox_transform
19 | from lib.utils.cython_bbox import bbox_overlaps
20 | from lib.layers.rois_offset_layer import compute_rois_offset
21 |
22 |
23 | def proposal_target_single_class_layer(rpn_rois, rpn_scores, gt_boxes, gt_phrases):
24 | """
25 | Assign object detection proposals to ground-truth targets. Produces proposal
26 | classification labels and bounding-box regression targets.
27 | """
28 |
29 | # Proposal ROIs (0, x1, y1, x2, y2) coming from RPN
30 | # (i.e., layers.proposal_layer.ProposalLayer), or any other source
31 | all_rois = rpn_rois
32 | all_scores = rpn_scores
33 |
34 | # Include ground-truth boxes in the set of candidate rois
35 | if cfg.TRAIN.USE_GT:
36 | zeros = np.zeros((gt_boxes.shape[0], 1), dtype=gt_boxes.dtype)
37 | all_rois = np.vstack(
38 | (all_rois, np.hstack((zeros, gt_boxes[:, :-1])))
39 | )
40 |         # not sure if this is a wise append, but the scores are not used anyway
41 | all_scores = np.vstack((all_scores, zeros))
42 |
43 | num_images = 1
44 | rois_per_image = cfg.TRAIN.BATCH_SIZE // num_images
45 | fg_rois_per_image = int(cfg.TRAIN.FG_FRACTION * rois_per_image)
46 |
47 | # Sample rois with classification labels and bounding box regression
48 | # targets
49 | labels, rois, roi_scores, bbox_targets, bbox_inside_weights, phrases = _sample_rois(
50 | all_rois, all_scores, gt_boxes, gt_phrases, fg_rois_per_image,
51 | rois_per_image)
52 |
53 | rois = rois.reshape(-1, 5)
54 | roi_scores = roi_scores.reshape(-1)
55 | labels = labels.reshape(-1, 1)
56 | phrases = phrases.reshape(-1, cfg.MAX_WORDS)
57 | bbox_targets = bbox_targets.reshape(-1, 4)
58 | bbox_inside_weights = bbox_inside_weights.reshape(-1, 4)
59 | bbox_outside_weights = np.array(bbox_inside_weights > 0).astype(np.float32)
60 | clss = np.array(labels > 0).astype(np.int32)
61 |
62 | return rois, roi_scores, labels, bbox_targets, \
63 | bbox_inside_weights, bbox_outside_weights, clss, phrases
64 |
65 |
66 | def _get_bbox_regression_labels(bbox_target_data):
67 | """Bounding-box regression targets (bbox_target_data) are stored in a
68 | compact form N x (class, tx, ty, tw, th)
69 |
70 | Returns:
71 | bbox_target (ndarray): N x 4 blob of regression targets
72 | bbox_inside_weights (ndarray): N x 4 blob of loss weights
73 | """
74 |
75 | clss = bbox_target_data[:, 0]
76 | bbox_targets = np.zeros((clss.size, 4), dtype=np.float32)
77 | bbox_inside_weights = np.zeros(bbox_targets.shape, dtype=np.float32)
78 | inds = np.where(clss > 0)[0]
79 | for ind in inds:
80 | bbox_targets[ind, :] = bbox_target_data[ind, 1:]
81 | bbox_inside_weights[ind, :] = cfg.TRAIN.BBOX_INSIDE_WEIGHTS
82 | return bbox_targets, bbox_inside_weights
83 |
84 |
85 | def _compute_targets(ex_rois, gt_rois, labels):
86 | """Compute bounding-box regression targets for an image."""
87 |
88 | assert ex_rois.shape[0] == gt_rois.shape[0]
89 | assert ex_rois.shape[1] == 4
90 | assert gt_rois.shape[1] == 4
91 |
92 | targets = bbox_transform(ex_rois, gt_rois)
93 | if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
94 | # Optionally normalize targets by a precomputed mean and stdev
95 | targets = ((targets - np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS))
96 | / np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS))
97 | return np.hstack(
98 | (labels[:, np.newaxis], targets)).astype(np.float32, copy=False)
99 |
100 |
101 | def _sample_rois(all_rois, all_scores, gt_boxes, gt_phrases, fg_rois_per_image, rois_per_image):
102 | """Generate a random sample of RoIs comprising foreground and background
103 | examples.
104 | """
105 | # overlaps: (rois x gt_boxes)
106 | overlaps = bbox_overlaps(
107 | np.ascontiguousarray(all_rois[:, 1:5], dtype=np.float),
108 | np.ascontiguousarray(gt_boxes[:, :4], dtype=np.float))
109 | gt_assignment = overlaps.argmax(axis=1)
110 | max_overlaps = overlaps.max(axis=1)
111 | labels = gt_boxes[gt_assignment, 4]
112 | phrases = gt_phrases[gt_assignment]
113 |
114 | # Select foreground RoIs as those with >= FG_THRESH overlap
115 | fg_inds = np.where(max_overlaps >= cfg.TRAIN.FG_THRESH)[0]
116 | # Guard against the case when an image has fewer than fg_rois_per_image
117 | # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI)
118 | bg_inds = np.where((max_overlaps < cfg.TRAIN.BG_THRESH_HI) &
119 | (max_overlaps >= cfg.TRAIN.BG_THRESH_LO))[0]
120 |
121 | # Small modification to the original version where we ensure a fixed number of regions are sampled
122 | if cfg.SAMPLE_NUM_FIXED_REGIONS:
123 | if fg_inds.size > 0 and bg_inds.size > 0:
124 | fg_rois_per_image = min(fg_rois_per_image, fg_inds.size)
125 | fg_inds = npr.choice(fg_inds, size=int(fg_rois_per_image), replace=False)
126 | bg_rois_per_image = rois_per_image - fg_rois_per_image
127 | to_replace = bg_inds.size < bg_rois_per_image
128 | bg_inds = npr.choice(bg_inds, size=int(bg_rois_per_image), replace=to_replace)
129 | elif fg_inds.size > 0:
130 | to_replace = fg_inds.size < rois_per_image
131 | fg_inds = npr.choice(fg_inds, size=int(rois_per_image), replace=to_replace)
132 | fg_rois_per_image = rois_per_image
133 | elif bg_inds.size > 0:
134 | to_replace = bg_inds.size < rois_per_image
135 | bg_inds = npr.choice(bg_inds, size=int(rois_per_image), replace=to_replace)
136 | fg_rois_per_image = 0
137 | else:
138 | import pdb
139 | pdb.set_trace()
140 | else:
141 | # foreground RoIs
142 | fg_rois_per_this_image = min(fg_rois_per_image, fg_inds.size)
143 | # Sample foreground regions without replacement
144 | if fg_inds.size > 0:
145 | fg_inds = npr.choice(fg_inds, size=fg_rois_per_this_image, replace=False)
146 |
147 | # Compute number of background RoIs to take from this image (guarding
148 | # against there being fewer than desired)
149 | bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image
150 | bg_rois_per_this_image = min(bg_rois_per_this_image, bg_inds.size)
151 | # Sample background regions without replacement
152 | if bg_inds.size > 0:
153 | bg_inds = npr.choice(bg_inds, size=bg_rois_per_this_image, replace=False)
154 |
155 | # The indices that we're selecting (both fg and bg)
156 | keep_inds = np.append(fg_inds, bg_inds)
157 | # Select sampled values from various arrays:
158 | labels = labels[keep_inds]
159 | phrases = phrases[keep_inds]
160 | # Clamp labels for the background RoIs to 0
161 | labels[int(fg_rois_per_image):] = 0
162 | phrases[int(fg_rois_per_image):, :] = 0
163 | rois = all_rois[keep_inds]
164 | roi_scores = all_scores[keep_inds]
165 |
166 | bbox_target_data = _compute_targets(
167 | rois[:, 1:5], gt_boxes[gt_assignment[keep_inds], :4], labels)
168 |
169 | if cfg.DEBUG_ALL:
170 | target_boxes = compute_rois_offset(rois[:, 1:5], bbox_target_data[:, 1:5])
171 | match_boxes = gt_boxes[gt_assignment[keep_inds], :4]
172 | print('boxes consistency check')
173 | print(target_boxes[:2,:])
174 | print(match_boxes[:2,:])
175 | assert np.linalg.norm(target_boxes - match_boxes) < 0.01
176 |
177 | bbox_targets, bbox_inside_weights = \
178 | _get_bbox_regression_labels(bbox_target_data)
179 |
180 | return labels, rois, roi_scores, bbox_targets, bbox_inside_weights, phrases
181 |
--------------------------------------------------------------------------------
/lib/layers/proposal_top_layer.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Faster R-CNN
3 | # Licensed under The MIT License [see LICENSE for details]
4 | # Written by Xinlei Chen
5 | # --------------------------------------------------------
6 | from __future__ import absolute_import
7 | from __future__ import division
8 | from __future__ import print_function
9 |
10 | import numpy as np
11 | from lib.config import cfg
12 | from lib.fast_rcnn.bbox_transform import bbox_transform_inv, clip_boxes
13 | import numpy.random as npr
14 |
15 |
16 | def proposal_top_layer(rpn_cls_prob, rpn_bbox_pred, im_info, _feat_stride, anchors, num_anchors):
17 | """A layer that just selects the top region proposals
18 |     without using non-maximal suppression.
19 | For details please see the technical report
20 | """
21 | rpn_top_n = cfg.TEST.RPN_TOP_N
22 |
23 | scores = rpn_cls_prob[:, :, :, num_anchors:]
24 |
25 | rpn_bbox_pred = rpn_bbox_pred.reshape((-1, 4))
26 | scores = scores.reshape((-1, 1))
27 |
28 | length = scores.shape[0]
29 | if length < rpn_top_n:
30 | # Random selection, maybe unnecessary and loses good proposals
31 | # But such case rarely happens
32 | top_inds = npr.choice(length, size=rpn_top_n, replace=True)
33 | else:
34 | top_inds = scores.argsort(0)[::-1]
35 | top_inds = top_inds[:rpn_top_n]
36 | top_inds = top_inds.reshape(rpn_top_n, )
37 |
38 | # Do the selection here
39 | anchors = anchors[top_inds, :]
40 | rpn_bbox_pred = rpn_bbox_pred[top_inds, :]
41 | scores = scores[top_inds]
42 |
43 | # Convert anchors into proposals via bbox transformations
44 | proposals = bbox_transform_inv(anchors, rpn_bbox_pred)
45 |
46 | # Clip predicted boxes to image
47 | proposals = clip_boxes(proposals, im_info[:2])
48 |
49 | # Output rois blob
50 | # Our RPN implementation only supports a single input image, so all
51 | # batch inds are 0
52 | batch_inds = np.zeros((proposals.shape[0], 1), dtype=np.float32)
53 | blob = np.hstack((batch_inds, proposals.astype(np.float32, copy=False)))
54 | return blob, scores
55 |
--------------------------------------------------------------------------------
/lib/layers/rois_offset_layer.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # DenseCap-Tensorflow
3 | # Written by InnerPeace
4 | # This file is adapted from Linjie's work
5 | # --------------------------------------------------------
6 | from __future__ import absolute_import
7 | from __future__ import division
8 | from __future__ import print_function
9 |
10 | import numpy as np
11 | from lib.config import cfg
12 | from lib.fast_rcnn.bbox_transform import bbox_transform_inv, clip_boxes
13 |
14 |
15 | # compute the new bboxes shifted by offset from rois
16 | def compute_rois_offset(rois, offset, im_info=None):
17 | """Compute bounding-box offset for region of interests"""
18 |
19 | assert rois.shape[1] == 4
20 | assert offset.shape[1] == 4
21 |
22 | if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED:
23 | # Optionally normalize targets by a precomputed mean and stdev -- reverse the transformation
24 | offset_unnorm = offset * np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS) + \
25 | np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS)
26 | else:
27 | offset_unnorm = offset.copy()
28 | rois_offset = bbox_transform_inv(rois, offset_unnorm)
29 |     if im_info is not None:
30 | rois_offset = clip_boxes(rois_offset, im_info[:2])
31 | return rois_offset
32 |
--------------------------------------------------------------------------------
/lib/layers/sentence_data_layer.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # DenseCap-Tensorflow
3 | # Written by InnerPeace
4 | # This file is adapted from Linjie's work
5 | # --------------------------------------------------------
6 | from __future__ import absolute_import
7 | from __future__ import division
8 | from __future__ import print_function
9 |
10 | """This python layer accepts region ids as input and
11 | retrieves the region sentences for them."""
12 |
13 | from six.moves import cPickle
14 | from lib.config import cfg
15 | from collections import Counter
16 | import numpy as np
17 | import six
18 | from six.moves import xrange
19 |
20 | # TODO: disable debug and clear stuff
21 | DEBUG = True
22 |
23 |
24 | def sentence_data_layer(labels, roi_phrases, time_steps=12, mode='concat'):
25 | all_modes = ('repeat', 'concat')
26 | assert (mode in all_modes), "Wrong type of mode which should be 'repeat' or 'concat'"
27 |
28 | if cfg.DEBUG_ALL:
29 | print('length of labels, i.e. number of regions: {}'.format(len(roi_phrases)))
30 |
31 | # all_regions is a dict from region id to caption stream
32 |     assert len(labels.shape) == 2, 'Please check the shape of "labels"'
33 |
34 | num_regions = labels.shape[0]
35 | if mode == 'repeat':
36 | input_sentence = np.zeros((num_regions, time_steps), dtype=np.float32)
37 | elif mode == 'concat':
38 | input_sentence = np.zeros((num_regions, time_steps - 1), dtype=np.float32)
39 |
40 | target_sentence = np.zeros((num_regions, time_steps), dtype=np.float32)
41 | cont_sentence = np.zeros((num_regions, time_steps), dtype=np.float32)
42 | cont_bbox = np.zeros((num_regions, time_steps), dtype=np.float32)
43 | for i in xrange(num_regions):
44 | stream = get_streams(roi_phrases[i], int(labels[i]), time_steps, mode)
45 | input_sentence[i, :] = stream['input_sentence']
46 | target_sentence[i, :] = stream['target_sentence']
47 | cont_sentence[i, :] = stream['cont_sentence']
48 | cont_bbox[i, :] = stream['cont_bbox']
49 |
50 | if cfg.DEBUG_ALL:
51 | print('sentence data layer input (first 3)')
52 | for ix, l in enumerate(labels[:3]):
53 | print(l[0], roi_phrases[ix])
54 | print('sentence data layer output (first 3)')
55 | print('input sentence')
56 | print(input_sentence[:3, :])
57 | print('target sentence')
58 | print(target_sentence[:3, :])
59 | print('cont sentence')
60 | print(cont_sentence[:3, :])
61 | print('cont bbox')
62 | print(cont_bbox[:3, :])
63 |
64 | return input_sentence, target_sentence, cont_sentence, cont_bbox
65 |
66 |
67 | def get_streams(phrases, region_id, time_steps=12, mode='concat'):
68 |
69 | if mode == 'repeat':
70 | # Image features repeated at each time step
71 | if region_id > 0:
72 | stream = phrases[:np.sum(phrases > 0)]
73 | stream = stream.tolist()
74 | pad = time_steps - (len(stream) + 1)
75 | out = {}
76 | out['cont_sentence'] = [0] + [1] * len(stream) + [0] * pad
77 | out['input_sentence'] = [1] + stream + [0] * pad
78 | out['target_sentence'] = stream + [2] + [0] * pad
79 | # only make prediction at the last time step for bbox
80 | out['cont_bbox'] = [0] * len(stream) + [1] + [0] * pad
81 |
82 | for key, val in six.iteritems(out):
83 | if len(val) > time_steps:
84 | out[key] = val[:time_steps]
85 | else:
86 | # negative sample, no phrase related
87 | out = {}
88 | out['cont_sentence'] = [0] * time_steps
89 | out['input_sentence'] = [0] * time_steps
90 | out['target_sentence'] = [0] * time_steps
91 | out['cont_bbox'] = [0] * time_steps
92 |
93 | elif mode == 'concat':
94 | # Image feature concatenated to the first time step
95 | if region_id > 0:
96 | # stream = phrases[region_id]
97 | stream = phrases[:np.sum(phrases > 0)]
98 | stream = stream.tolist()
99 | pad = time_steps - (len(stream) + 2)
100 | out = {}
101 | out['cont_sentence'] = [0] + [1] * (len(stream) + 1) + [0] * pad
102 | out['input_sentence'] = [1] + stream + [0] * pad
103 | out['target_sentence'] = [1] + stream + [2] + [0] * pad
104 | # only make prediction at the last time step for bbox
105 | out['cont_bbox'] = [0] * (len(stream) + 1) + [1] + [0] * pad
106 |
107 | for key, val in six.iteritems(out):
108 | if len(val) > time_steps:
109 | out[key] = val[:time_steps]
110 | else:
111 | # negative sample, no phrase related
112 | out = {}
113 | out['cont_sentence'] = [0] * time_steps
114 | out['input_sentence'] = [0] * (time_steps - 1)
115 | out['target_sentence'] = [0] * time_steps
116 | out['cont_bbox'] = [0] * time_steps
117 | else:
118 | # Global feature and region feature concatenated to the first time step
119 | if region_id > 0:
120 | stream = phrases[region_id]
121 | stream = stream.tolist()
122 |             pad = time_steps - (len(stream) + 3)
123 |             out = {}
124 |             out['cont_sentence'] = [0] + [1] * (len(stream) + 2) + [0] * pad
125 |             out['input_sentence'] = [1] + stream + [0] * pad
126 |             out['target_sentence'] = [1, 1] + stream + [2] + [0] * pad
127 |             # only make prediction at the last time step for bbox
128 |             out['cont_bbox'] = [0] * (len(stream) + 2) + [1] + [0] * pad
129 |
130 |             for key, val in six.iteritems(out):
131 |                 if len(val) > time_steps:
132 |                     out[key] = val[:time_steps]
133 | else:
134 | # negative sample, no phrase related
135 | out = {}
136 | out['cont_sentence'] = [0] * time_steps
137 | out['input_sentence'] = [0] * (time_steps - 2)
138 | out['target_sentence'] = [0] * time_steps
139 | out['cont_bbox'] = [0] * time_steps
140 |
141 | return out
142 |
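A worked example of get_streams in 'concat' mode (illustrative; word index 1 acts as the start token and 2 as the end token): a three-word caption [5, 8, 9] with time_steps=12 yields the streams below.

    import numpy as np

    phrases = np.array([5, 8, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0])
    out = get_streams(phrases, region_id=7, time_steps=12, mode='concat')
    # out['input_sentence']  == [1, 5, 8, 9, 0, 0, 0, 0, 0, 0, 0]      (time_steps - 1 entries)
    # out['target_sentence'] == [1, 5, 8, 9, 2, 0, 0, 0, 0, 0, 0, 0]
    # out['cont_sentence']   == [0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]
    # out['cont_bbox']       == [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]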
--------------------------------------------------------------------------------
/lib/layers/snippets.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Tensorflow Faster R-CNN
3 | # Licensed under The MIT License [see LICENSE for details]
4 | # Written by Xinlei Chen
5 | # --------------------------------------------------------
6 | from __future__ import absolute_import
7 | from __future__ import division
8 | from __future__ import print_function
9 |
10 | import numpy as np
11 | from lib.layers.generate_anchors import generate_anchors
12 |
13 |
14 | def generate_anchors_pre(height, width, feat_stride, anchor_scales=(8, 16, 32), anchor_ratios=(0.5, 1, 2)):
15 | """ A wrapper function to generate anchors given different scales
16 | Also return the number of anchors in variable 'length'
17 | """
18 | anchors = generate_anchors(ratios=np.array(anchor_ratios), scales=np.array(anchor_scales))
19 | A = anchors.shape[0]
20 | shift_x = np.arange(0, width) * feat_stride
21 | shift_y = np.arange(0, height) * feat_stride
22 | shift_x, shift_y = np.meshgrid(shift_x, shift_y)
23 | shifts = np.vstack((shift_x.ravel(), shift_y.ravel(), shift_x.ravel(), shift_y.ravel())).transpose()
24 | K = shifts.shape[0]
25 | # width changes faster, so here it is H, W, C
26 | anchors = anchors.reshape((1, A, 4)) + shifts.reshape((1, K, 4)).transpose((1, 0, 2))
27 | anchors = anchors.reshape((K * A, 4)).astype(np.float32, copy=False)
28 | length = np.int32(anchors.shape[0])
29 |
30 | return anchors, length
31 |
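As an illustration of the shapes involved (not part of the file): on a 38x50 feature map with a stride of 16, the default 3 scales x 3 ratios give 9 anchors per location, i.e. 38 * 50 * 9 = 17100 anchors in total.

    anchors, length = generate_anchors_pre(height=38, width=50, feat_stride=16)
    print(anchors.shape, length)   # -> (17100, 4) 17100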
--------------------------------------------------------------------------------
/lib/limit_ram/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/InnerPeace-Wu/densecap-tensorflow/2c77998f73832d5bb7324e97e8a7419cbce8398c/lib/limit_ram/__init__.py
--------------------------------------------------------------------------------
/lib/limit_ram/utils.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # DenseCap-Tensorflow
3 | # Written by InnerPeace
4 | # This file is adapted from Ross Girshick's work
5 | # --------------------------------------------------------
6 | from __future__ import absolute_import
7 | from __future__ import division
8 | from __future__ import print_function
9 |
10 | """functions for LIMIT_RAM version"""
11 |
12 | # import sys
13 | # sys.path.append("..")
14 |
15 | import numpy as np
16 | from lib.config import cfg
17 |
18 |
19 | def pre_roidb(roidb):
20 | """Enrich the imdb's roidb by adding some derived quantities that
21 | are useful for training. This function precomputes the maximum
22 | overlap, taken over ground-truth boxes, between each ROI and
23 | each ground-truth box. The class with maximum overlap is also
24 | recorded.
25 | """
26 | # need gt_overlaps as a dense array for argmax
27 | gt_overlaps = roidb['gt_overlaps'].toarray()
28 | # max overlap with gt over classes (columns)
29 | max_overlaps = gt_overlaps.max(axis=1)
30 | # gt class that had the max overlap
31 | max_classes = gt_overlaps.argmax(axis=1)
32 | roidb['max_classes'] = max_classes
33 | roidb['max_overlaps'] = max_overlaps
34 | # sanity checks
35 | # max overlap of 0 => class should be zero (background)
36 | zero_inds = np.where(max_overlaps == 0)[0]
37 | assert all(max_classes[zero_inds] == 0)
38 | # max overlap > 0 => class should not be zero (must be a fg class)
39 | # nonzero_inds = np.where(max_overlaps > 0)[0]
40 | # assert all(max_classes[nonzero_inds] != 0)
41 | return roidb
42 |
43 |
44 | def is_valid_limitRam(entry):
45 | # Valid images have:
46 | # (1) At least one foreground RoI OR
47 | # (2) At least one background RoI
48 | overlaps = entry['max_overlaps']
49 | # find boxes with sufficient overlap
50 | fg_inds = np.where(overlaps >= cfg.TRAIN.FG_THRESH)[0]
51 | # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI)
52 | bg_inds = np.where((overlaps < cfg.TRAIN.BG_THRESH_HI) &
53 | (overlaps >= cfg.TRAIN.BG_THRESH_LO))[0]
54 | # image is only valid if such boxes exist
55 | valid = len(fg_inds) > 0 or len(bg_inds) > 0
56 | return valid
57 |
58 |
59 | def flip_image(roidb):
60 | """flip image and change the name for reading later"""
61 |
62 | boxes = roidb['boxes'].copy()
63 | oldx1 = boxes[:, 0].copy()
64 | oldx2 = boxes[:, 2].copy()
65 | boxes[:, 0] = roidb['width'] - oldx2 - 1
66 | boxes[:, 2] = roidb['width'] - oldx1 - 1
67 | assert (boxes[:, 2] >= boxes[:, 0]).all()
68 | entry = {'boxes': boxes,
69 | 'gt_overlaps': roidb['gt_overlaps'],
70 | 'gt_classes': roidb['gt_classes'],
71 | 'flipped': True,
72 | 'gt_phrases': roidb['gt_phrases'],
73 | 'width': roidb['width'],
74 | 'height': roidb['height'],
75 | 'image': roidb['image'],
76 | 'image_id': '%s_flip' % roidb['image_id']}
77 |
78 | return entry
79 |
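A small sketch of flip_image (hypothetical roidb entry, with only the fields the function copies): for an image of width 100, a box with x1=10, x2=29 becomes x1=70, x2=89, and the image id gets the '_flip' suffix.

    import numpy as np

    roidb = {'boxes': np.array([[10, 5, 29, 40]]),
             'gt_overlaps': None, 'gt_classes': None, 'gt_phrases': None,
             'width': 100, 'height': 50, 'image': 'img.jpg', 'image_id': '1'}
    flipped = flip_image(roidb)
    print(flipped['boxes'])      # -> [[70  5 89 40]]
    print(flipped['image_id'])   # -> 1_flip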
--------------------------------------------------------------------------------
/lib/nets/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/InnerPeace-Wu/densecap-tensorflow/2c77998f73832d5bb7324e97e8a7419cbce8398c/lib/nets/__init__.py
--------------------------------------------------------------------------------
/lib/nets/vgg16.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Tensorflow Faster R-CNN
3 | # Licensed under The MIT License [see LICENSE for details]
4 | # Written by Xinlei Chen
5 | # --------------------------------------------------------
6 | from __future__ import absolute_import
7 | from __future__ import division
8 | from __future__ import print_function
9 |
10 | import tensorflow as tf
11 | import tensorflow.contrib.slim as slim
12 | from tensorflow.contrib.slim import losses
13 | from tensorflow.contrib.slim import arg_scope
14 | import numpy as np
15 |
16 | from lib.nets.network import Network
17 | from lib.config import cfg
18 |
19 |
20 | class vgg16(Network):
21 | def __init__(self):
22 | Network.__init__(self)
23 | self._feat_stride = [16, ]
24 | self._feat_compress = [1. / float(self._feat_stride[0]), ]
25 | self._scope = 'DenseCap_VGG16'
26 | self._vgg_scope = 'vgg_16'
27 |
28 | def _image_to_head(self, is_training, reuse=None):
29 | with tf.variable_scope(self._vgg_scope, self._vgg_scope, reuse=reuse):
30 | net = slim.repeat(self._image, 2, slim.conv2d, 64, [3, 3],
31 | trainable=False, scope='conv1')
32 | net = slim.max_pool2d(net, [2, 2], padding='SAME', scope='pool1')
33 | net = slim.repeat(net, 2, slim.conv2d, 128, [3, 3],
34 | trainable=False, scope='conv2')
35 | net = slim.max_pool2d(net, [2, 2], padding='SAME', scope='pool2')
36 | net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3],
37 | trainable=is_training, scope='conv3')
38 | net = slim.max_pool2d(net, [2, 2], padding='SAME', scope='pool3')
39 | net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3],
40 | trainable=is_training, scope='conv4')
41 | net = slim.max_pool2d(net, [2, 2], padding='SAME', scope='pool4')
42 | net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3],
43 | trainable=is_training, scope='conv5')
44 |
45 | self._act_summaries.append(net)
46 | self._layers['head'] = net
47 |
48 | return net
49 |
50 | def _head_to_tail(self, pool5, is_training, reuse=None):
51 | with tf.variable_scope(self._vgg_scope, self._vgg_scope, reuse=reuse):
52 | pool5_flat = slim.flatten(pool5, scope='flatten')
53 | fc6 = slim.fully_connected(pool5_flat, 4096, scope='fc6')
54 | if is_training:
55 | fc6 = slim.dropout(fc6, keep_prob=0.5, is_training=True,
56 | scope='dropout6')
57 | fc7 = slim.fully_connected(fc6, 4096, scope='fc7')
58 | if is_training:
59 | fc7 = slim.dropout(fc7, keep_prob=0.5, is_training=True,
60 | scope='dropout7')
61 |
62 | return fc7
63 |
64 | def get_variables_to_restore(self, variables, var_keep_dic):
65 | variables_to_restore = []
66 |
67 | for v in variables:
68 | # exclude the conv weights that are fc weights in vgg16
69 | if v.name == (self._vgg_scope + '/fc6/weights:0') or \
70 | v.name == (self._vgg_scope + '/fc7/weights:0'):
71 | self._variables_to_fix[v.name] = v
72 | continue
73 | # exclude the first conv layer to swap RGB to BGR
74 | if v.name == (self._vgg_scope + '/conv1/conv1_1/weights:0'):
75 | self._variables_to_fix[v.name] = v
76 | continue
77 | if v.name.split(':')[0] in var_keep_dic:
78 | print('Variables restored: %s' % v.name)
79 | variables_to_restore.append(v)
80 |
81 | return variables_to_restore
82 |
83 | def fix_variables(self, sess, pretrained_model):
84 | print('Fix VGG16 layers..')
85 | with tf.variable_scope('Fix_VGG16') as scope:
86 | with tf.device("/cpu:0"):
87 | # fix the vgg16 issue from conv weights to fc weights
88 | # fix RGB to BGR
89 | fc6_conv = tf.get_variable("fc6_conv", [7, 7, 512, 4096], trainable=False)
90 | fc7_conv = tf.get_variable("fc7_conv", [1, 1, 4096, 4096], trainable=False)
91 | conv1_rgb = tf.get_variable("conv1_rgb", [3, 3, 3, 64], trainable=False)
92 | restorer_fc = tf.train.Saver({self._vgg_scope + "/fc6/weights": fc6_conv,
93 | self._vgg_scope + "/fc7/weights": fc7_conv,
94 | self._vgg_scope + "/conv1/conv1_1/weights": conv1_rgb})
95 | restorer_fc.restore(sess, pretrained_model)
96 |
97 | sess.run(tf.assign(self._variables_to_fix[self._vgg_scope + '/fc6/weights:0'], tf.reshape(fc6_conv,
98 | self._variables_to_fix[
99 | self._vgg_scope + '/fc6/weights:0'].get_shape())))
100 | sess.run(tf.assign(self._variables_to_fix[self._vgg_scope + '/fc7/weights:0'], tf.reshape(fc7_conv,
101 | self._variables_to_fix[
102 | self._vgg_scope + '/fc7/weights:0'].get_shape())))
103 | sess.run(tf.assign(self._variables_to_fix[self._vgg_scope + '/conv1/conv1_1/weights:0'],
104 | tf.reverse(conv1_rgb, [2])))
105 |
--------------------------------------------------------------------------------
/lib/nms/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/InnerPeace-Wu/densecap-tensorflow/2c77998f73832d5bb7324e97e8a7419cbce8398c/lib/nms/__init__.py
--------------------------------------------------------------------------------
/lib/nms/cpu_nms.pyx:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Fast R-CNN
3 | # Copyright (c) 2015 Microsoft
4 | # Licensed under The MIT License [see LICENSE for details]
5 | # Written by Ross Girshick
6 | # --------------------------------------------------------
7 |
8 | import numpy as np
9 | cimport numpy as np
10 |
11 | cdef inline np.float32_t max(np.float32_t a, np.float32_t b):
12 | return a if a >= b else b
13 |
14 | cdef inline np.float32_t min(np.float32_t a, np.float32_t b):
15 | return a if a <= b else b
16 |
17 | def cpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh):
18 | cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0]
19 | cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1]
20 | cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2]
21 | cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3]
22 | cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4]
23 |
24 | cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1)
25 | cdef np.ndarray[np.int_t, ndim=1] order = scores.argsort()[::-1]
26 |
27 | cdef int ndets = dets.shape[0]
28 | cdef np.ndarray[np.int_t, ndim=1] suppressed = \
29 | np.zeros((ndets), dtype=np.int)
30 |
31 | # nominal indices
32 | cdef int _i, _j
33 | # sorted indices
34 | cdef int i, j
35 | # temp variables for box i's (the box currently under consideration)
36 | cdef np.float32_t ix1, iy1, ix2, iy2, iarea
37 | # variables for computing overlap with box j (lower scoring box)
38 | cdef np.float32_t xx1, yy1, xx2, yy2
39 | cdef np.float32_t w, h
40 | cdef np.float32_t inter, ovr
41 |
42 | keep = []
43 | for _i in range(ndets):
44 | i = order[_i]
45 | if suppressed[i] == 1:
46 | continue
47 | keep.append(i)
48 | ix1 = x1[i]
49 | iy1 = y1[i]
50 | ix2 = x2[i]
51 | iy2 = y2[i]
52 | iarea = areas[i]
53 | for _j in range(_i + 1, ndets):
54 | j = order[_j]
55 | if suppressed[j] == 1:
56 | continue
57 | xx1 = max(ix1, x1[j])
58 | yy1 = max(iy1, y1[j])
59 | xx2 = min(ix2, x2[j])
60 | yy2 = min(iy2, y2[j])
61 | w = max(0.0, xx2 - xx1 + 1)
62 | h = max(0.0, yy2 - yy1 + 1)
63 | inter = w * h
64 | ovr = inter / (iarea + areas[j] - inter)
65 | if ovr >= thresh:
66 | suppressed[j] = 1
67 |
68 | return keep
69 |
--------------------------------------------------------------------------------
/lib/nms/gpu_nms.hpp:
--------------------------------------------------------------------------------
1 | void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num,
2 | int boxes_dim, float nms_overlap_thresh, int device_id);
3 |
--------------------------------------------------------------------------------
/lib/nms/gpu_nms.pyx:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Faster R-CNN
3 | # Copyright (c) 2015 Microsoft
4 | # Licensed under The MIT License [see LICENSE for details]
5 | # Written by Ross Girshick
6 | # --------------------------------------------------------
7 |
8 | import numpy as np
9 | cimport numpy as np
10 |
11 | assert sizeof(int) == sizeof(np.int32_t)
12 |
13 | cdef extern from "gpu_nms.hpp":
14 | void _nms(np.int32_t*, int*, np.float32_t*, int, int, float, int)
15 |
16 | def gpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh,
17 | np.int32_t device_id=0):
18 | cdef int boxes_num = dets.shape[0]
19 | cdef int boxes_dim = dets.shape[1]
20 | cdef int num_out
21 | cdef np.ndarray[np.int32_t, ndim=1] \
22 | keep = np.zeros(boxes_num, dtype=np.int32)
23 | cdef np.ndarray[np.float32_t, ndim=1] \
24 | scores = dets[:, 4]
25 | cdef np.ndarray[np.int_t, ndim=1] \
26 | order = scores.argsort()[::-1]
27 | cdef np.ndarray[np.float32_t, ndim=2] \
28 | sorted_dets = dets[order, :]
29 | _nms(&keep[0], &num_out, &sorted_dets[0, 0], boxes_num, boxes_dim, thresh, device_id)
30 | keep = keep[:num_out]
31 | return list(order[keep])
32 |
--------------------------------------------------------------------------------
/lib/nms/nms_kernel.cu:
--------------------------------------------------------------------------------
1 | // ------------------------------------------------------------------
2 | // Faster R-CNN
3 | // Copyright (c) 2015 Microsoft
4 | // Licensed under The MIT License [see fast-rcnn/LICENSE for details]
5 | // Written by Shaoqing Ren
6 | // ------------------------------------------------------------------
7 |
8 | #include "gpu_nms.hpp"
9 | #include <vector>
10 | #include <iostream>
11 |
12 | #define CUDA_CHECK(condition) \
13 | /* Code block avoids redefinition of cudaError_t error */ \
14 | do { \
15 | cudaError_t error = condition; \
16 | if (error != cudaSuccess) { \
17 | std::cout << cudaGetErrorString(error) << std::endl; \
18 | } \
19 | } while (0)
20 |
21 | #define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
22 | int const threadsPerBlock = sizeof(unsigned long long) * 8;
23 |
24 | __device__ inline float devIoU(float const * const a, float const * const b) {
25 | float left = max(a[0], b[0]), right = min(a[2], b[2]);
26 | float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
27 | float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
28 | float interS = width * height;
29 | float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
30 | float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
31 | return interS / (Sa + Sb - interS);
32 | }
33 |
34 | __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
35 | const float *dev_boxes, unsigned long long *dev_mask) {
36 | const int row_start = blockIdx.y;
37 | const int col_start = blockIdx.x;
38 |
39 | // if (row_start > col_start) return;
40 |
41 | const int row_size =
42 | min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
43 | const int col_size =
44 | min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
45 |
46 | __shared__ float block_boxes[threadsPerBlock * 5];
47 | if (threadIdx.x < col_size) {
48 | block_boxes[threadIdx.x * 5 + 0] =
49 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0];
50 | block_boxes[threadIdx.x * 5 + 1] =
51 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1];
52 | block_boxes[threadIdx.x * 5 + 2] =
53 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2];
54 | block_boxes[threadIdx.x * 5 + 3] =
55 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3];
56 | block_boxes[threadIdx.x * 5 + 4] =
57 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4];
58 | }
59 | __syncthreads();
60 |
61 | if (threadIdx.x < row_size) {
62 | const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
63 | const float *cur_box = dev_boxes + cur_box_idx * 5;
64 | int i = 0;
65 | unsigned long long t = 0;
66 | int start = 0;
67 | if (row_start == col_start) {
68 | start = threadIdx.x + 1;
69 | }
70 | for (i = start; i < col_size; i++) {
71 | if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
72 | t |= 1ULL << i;
73 | }
74 | }
75 | const int col_blocks = DIVUP(n_boxes, threadsPerBlock);
76 | dev_mask[cur_box_idx * col_blocks + col_start] = t;
77 | }
78 | }
79 |
80 | void _set_device(int device_id) {
81 | int current_device;
82 |   CUDA_CHECK(cudaGetDevice(&current_device));
83 | if (current_device == device_id) {
84 | return;
85 | }
86 | // The call to cudaSetDevice must come before any calls to Get, which
87 | // may perform initialization using the GPU.
88 | CUDA_CHECK(cudaSetDevice(device_id));
89 | }
90 |
91 | void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num,
92 | int boxes_dim, float nms_overlap_thresh, int device_id) {
93 | _set_device(device_id);
94 |
95 | float* boxes_dev = NULL;
96 | unsigned long long* mask_dev = NULL;
97 |
98 | const int col_blocks = DIVUP(boxes_num, threadsPerBlock);
99 |
100 | CUDA_CHECK(cudaMalloc(&boxes_dev,
101 | boxes_num * boxes_dim * sizeof(float)));
102 | CUDA_CHECK(cudaMemcpy(boxes_dev,
103 | boxes_host,
104 | boxes_num * boxes_dim * sizeof(float),
105 | cudaMemcpyHostToDevice));
106 |
107 | CUDA_CHECK(cudaMalloc(&mask_dev,
108 | boxes_num * col_blocks * sizeof(unsigned long long)));
109 |
110 | dim3 blocks(DIVUP(boxes_num, threadsPerBlock),
111 | DIVUP(boxes_num, threadsPerBlock));
112 | dim3 threads(threadsPerBlock);
113 |   nms_kernel<<<blocks, threads>>>(boxes_num,
114 | nms_overlap_thresh,
115 | boxes_dev,
116 | mask_dev);
117 |
118 |   std::vector<unsigned long long> mask_host(boxes_num * col_blocks);
119 | CUDA_CHECK(cudaMemcpy(&mask_host[0],
120 | mask_dev,
121 | sizeof(unsigned long long) * boxes_num * col_blocks,
122 | cudaMemcpyDeviceToHost));
123 |
124 |   std::vector<unsigned long long> remv(col_blocks);
125 | memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
126 |
127 | int num_to_keep = 0;
128 | for (int i = 0; i < boxes_num; i++) {
129 | int nblock = i / threadsPerBlock;
130 | int inblock = i % threadsPerBlock;
131 |
132 | if (!(remv[nblock] & (1ULL << inblock))) {
133 | keep_out[num_to_keep++] = i;
134 | unsigned long long *p = &mask_host[0] + i * col_blocks;
135 | for (int j = nblock; j < col_blocks; j++) {
136 | remv[j] |= p[j];
137 | }
138 | }
139 | }
140 | *num_out = num_to_keep;
141 |
142 | CUDA_CHECK(cudaFree(boxes_dev));
143 | CUDA_CHECK(cudaFree(mask_dev));
144 | }
145 |
--------------------------------------------------------------------------------
/lib/nms/py_cpu_nms.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Fast R-CNN
3 | # Copyright (c) 2015 Microsoft
4 | # Licensed under The MIT License [see LICENSE for details]
5 | # Written by Ross Girshick
6 | # --------------------------------------------------------
7 |
8 | import numpy as np
9 |
10 | def py_cpu_nms(dets, thresh):
11 | """Pure Python NMS baseline."""
12 | x1 = dets[:, 0]
13 | y1 = dets[:, 1]
14 | x2 = dets[:, 2]
15 | y2 = dets[:, 3]
16 | scores = dets[:, 4]
17 |
18 | areas = (x2 - x1 + 1) * (y2 - y1 + 1)
19 | order = scores.argsort()[::-1]
20 |
21 | keep = []
22 | while order.size > 0:
23 | i = order[0]
24 | keep.append(i)
25 | xx1 = np.maximum(x1[i], x1[order[1:]])
26 | yy1 = np.maximum(y1[i], y1[order[1:]])
27 | xx2 = np.minimum(x2[i], x2[order[1:]])
28 | yy2 = np.minimum(y2[i], y2[order[1:]])
29 |
30 | w = np.maximum(0.0, xx2 - xx1 + 1)
31 | h = np.maximum(0.0, yy2 - yy1 + 1)
32 | inter = w * h
33 | ovr = inter / (areas[i] + areas[order[1:]] - inter)
34 |
35 | inds = np.where(ovr <= thresh)[0]
36 | order = order[inds + 1]
37 |
38 | return keep
39 |
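A tiny usage example (illustrative): two heavily overlapping boxes plus one disjoint box; with thresh=0.5 the lower-scoring duplicate is suppressed.

    import numpy as np

    dets = np.array([[ 0.,  0., 10., 10., 0.9],
                     [ 1.,  1., 11., 11., 0.8],    # IoU with the first box is ~0.70
                     [50., 50., 60., 60., 0.7]])
    print(py_cpu_nms(dets, thresh=0.5))   # -> [0, 2]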
--------------------------------------------------------------------------------
/lib/pre_glove.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # DenseCap-Tensorflow
3 | # Written by InnerPeace
4 | # This file is adapted from cs224-2017 stanford
5 | # --------------------------------------------------------
6 | from __future__ import absolute_import
7 | from __future__ import division
8 | from __future__ import print_function
9 |
10 |
11 | from tensorflow.python.platform import gfile
12 | from os.path import join as pjoin
13 | from tqdm import *
14 | import numpy as np
15 | import os
16 |
17 | from config import cfg
18 |
19 |
20 | _PAD = b""
21 | _SOS = b""
22 | _EOS = b""
23 |
24 |
25 | def initialize_vocabulary(vocabulary_path):
26 | # map vocab to word embeddings
27 | if gfile.Exists(vocabulary_path):
28 | rev_vocab = [_PAD, _SOS, _EOS]
29 | with gfile.GFile(vocabulary_path, mode="r") as f:
30 | rev_vocab.extend(f.readlines())
31 | rev_vocab = [line.strip('\n') for line in rev_vocab]
32 | vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)])
33 | return vocab, rev_vocab
34 | else:
35 | raise ValueError("Vocabulary file %s not found.", vocabulary_path)
36 |
37 |
38 | def process_glove(vocab_list, save_path, size=4e5, random_init=True):
39 | """
40 | :param vocab_list: [vocab]
41 | :return:
42 | """
43 | if not gfile.Exists(save_path + ".npz"):
44 | glove_path = os.path.join(cfg.DATA_DIR, "glove.6B.{}d.txt".format(cfg.GLOVE_DIM))
45 | if random_init:
46 | glove = np.random.randn(len(vocab_list), cfg.GLOVE_DIM)
47 | else:
48 | glove = np.zeros((len(vocab_list), cfg.GLOVE_DIM))
49 | found = 0
50 | with open(glove_path, 'r') as fh:
51 | for line in tqdm(fh, total=size):
52 | array = line.lstrip().rstrip().split(" ")
53 | word = array[0]
54 | vector = list(map(float, array[1:]))
55 | if word in vocab_list:
56 | idx = vocab_list.index(word)
57 | glove[idx, :] = vector
58 | found += 1
59 | if word.capitalize() in vocab_list:
60 | idx = vocab_list.index(word.capitalize())
61 | glove[idx, :] = vector
62 | found += 1
63 | if word.upper() in vocab_list:
64 | idx = vocab_list.index(word.upper())
65 | glove[idx, :] = vector
66 | found += 1
67 |
68 | print("{}/{} of word vocab have corresponding vectors in {}".format(found, len(vocab_list), glove_path))
69 | np.savez_compressed(save_path, glove=glove)
70 | print("saved trimmed glove matrix at: {}".format(save_path))
71 |
72 |
73 | if __name__ == "__main__":
74 | vocab_path = pjoin(cfg.CACHE_DIR, 'vocabulary.txt')
75 | vocab, rev_vocab = initialize_vocabulary(vocab_path)
76 | process_glove(rev_vocab, cfg.DATA_DIR + "/glove.trimmed.{}".format(cfg.GLOVE_DIM),
77 | random_init=True)
78 |
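The trimmed matrix is saved with np.savez_compressed under the key 'glove', so it can be loaded back as follows (the file name and GLOVE_DIM = 300 are assumptions for this example).

    import numpy as np

    trimmed = np.load('glove.trimmed.300.npz')
    print(trimmed['glove'].shape)   # -> (len(vocab_list), 300)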
--------------------------------------------------------------------------------
/lib/preprocess.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # Preprocess the data on the Valohai computing platform.
4 | # This script may be out of date. # 2017.12.20
5 | set -e
6 | set -x
7 |
8 | POSITIONAL=()
9 | while [[ $# -gt 0 ]]; do
10 | #statements
11 | key="$1"
12 |
13 | case $key in
14 | -vs|--version)
15 | VERSION=$2
16 | shift
17 | shift
18 | ;;
19 | -p|--path)
20 | IN_PATH=$2
21 | shift
22 | shift
23 | ;;
24 | -od|--output_dir)
25 | OUTPUT_DIR=$2
26 | shift
27 | shift
28 | ;;
29 | -mw|--max_words)
30 | MAX_WORDS=$2
31 | shift
32 | shift
33 | ;;
34 | *)
35 | POSITIONAL+=("$1")
36 | shift
37 | ;;
38 | esac
39 | done
40 |
41 |
42 | if [ -d "/valohai/inputs" ]; then
43 | # apt-get -y update
44 | # apt-get -y install python-pip
45 | pip install -r requirements.txt
46 | cd /valohai/inputs
47 | mkdir ${VERSION}
48 | unzip image_meta/image_data.json.zip -d ./${VERSION}
49 | unzip regions/region_descriptions.json.zip -d ./${VERSION}
50 | cd /valohai/repository/lib
51 | time python2 preprocess.py --version ${VERSION} \
52 | --path ${IN_PATH} \
53 | --output_dir ${OUTPUT_DIR} \
54 | --max_words ${MAX_WORDS}
55 |
56 | tar -czvf /valohai/outputs/visual_genome.tar.gz ${OUTPUT_DIR}
57 |     # comment this out if the data is already stored in S3
58 | mv regions/region_descriptions.json.zip /valohai/outputs
59 | fi
60 |
61 |
62 |
63 |
--------------------------------------------------------------------------------
/lib/pycocoevalcap/README:
--------------------------------------------------------------------------------
1 | =============================
2 | Linjie Yang
3 | 04/21/2016
4 | =============================
5 | This folder holds the functions for evaluating image captioning models, including the dense captioning models. It originates from the standard evaluation toolkit for MS COCO (https://github.com/tylin/coco-caption).
6 | The newly added functions and their usages are as follows.
7 | (1) dt_eval.py: evaluates a captioning model on web data, where each image has only one ground-truth caption.
8 | (2) vg_eval.py: evaluates the dense captioning model on Visual Genome. Computes the Meteor score and mean AP described in the DenseCap paper (http://arxiv.org/abs/1511.07571).
9 | (3) meteor/meteor2.py: a modified version of "meteor/meteor.py", adapted for multi-to-multi caption matching in DenseCap.
10 |
--------------------------------------------------------------------------------
/lib/pycocoevalcap/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'tylin'
2 |
--------------------------------------------------------------------------------
/lib/pycocoevalcap/bleu/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2015 Xinlei Chen, Hao Fang, Tsung-Yi Lin, and Ramakrishna Vedantam
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy
4 | of this software and associated documentation files (the "Software"), to deal
5 | in the Software without restriction, including without limitation the rights
6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | copies of the Software, and to permit persons to whom the Software is
8 | furnished to do so, subject to the following conditions:
9 |
10 | The above copyright notice and this permission notice shall be included in
11 | all copies or substantial portions of the Software.
12 |
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | THE SOFTWARE.
20 |
--------------------------------------------------------------------------------
/lib/pycocoevalcap/bleu/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'tylin'
2 |
--------------------------------------------------------------------------------
/lib/pycocoevalcap/bleu/bleu.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # File Name : bleu.py
4 | #
5 | # Description : Wrapper for BLEU scorer.
6 | #
7 | # Creation Date : 06-01-2015
8 | # Last Modified : Thu 19 Mar 2015 09:13:28 PM PDT
9 | # Authors : Hao Fang and Tsung-Yi Lin
10 |
11 | from bleu_scorer import BleuScorer
12 |
13 |
14 | class Bleu:
15 | def __init__(self, n=4):
16 |         # by default, compute BLEU score up to 4-grams
17 | self._n = n
18 | self._hypo_for_image = {}
19 | self.ref_for_image = {}
20 |
21 | def compute_score(self, gts, res):
22 |
23 | assert(gts.keys() == res.keys())
24 | imgIds = gts.keys()
25 |
26 | bleu_scorer = BleuScorer(n=self._n)
27 | for id in imgIds:
28 | hypo = res[id]
29 | ref = gts[id]
30 |
31 | # Sanity check.
32 | assert(type(hypo) is list)
33 | assert(len(hypo) == 1)
34 | assert(type(ref) is list)
35 | assert(len(ref) > 1)
36 |
37 | bleu_scorer += (hypo[0], ref)
38 |
39 | #score, scores = bleu_scorer.compute_score(option='shortest')
40 | score, scores = bleu_scorer.compute_score(option='closest', verbose=1)
41 | #score, scores = bleu_scorer.compute_score(option='average', verbose=1)
42 |
43 | # return (bleu, bleu_info)
44 | return score, scores
45 |
46 | def method(self):
47 | return "Bleu"
48 |
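Based on the assertions in compute_score (a sketch, not from the repo): gts and res map the same ids to lists of tokenized captions; res holds exactly one hypothesis per id, and gts holds more than one reference.

    gts = {1: ['a man riding a horse', 'a person on a horse', 'a rider on a brown horse']}
    res = {1: ['a man rides a horse']}
    score, scores = Bleu(4).compute_score(gts, res)   # score: [Bleu_1, Bleu_2, Bleu_3, Bleu_4]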
--------------------------------------------------------------------------------
/lib/pycocoevalcap/cider/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'tylin'
2 |
--------------------------------------------------------------------------------
/lib/pycocoevalcap/cider/cider.py:
--------------------------------------------------------------------------------
1 | # Filename: cider.py
2 | #
3 | # Description: Describes the class to compute the CIDEr (Consensus-Based Image Description Evaluation) Metric
4 | # by Vedantam, Zitnick, and Parikh (http://arxiv.org/abs/1411.5726)
5 | #
6 | # Creation Date: Sun Feb 8 14:16:54 2015
7 | #
8 | # Authors: Ramakrishna Vedantam and Tsung-Yi Lin
9 |
10 | from cider_scorer import CiderScorer
11 | import pdb
12 |
13 | class Cider:
14 | """
15 | Main Class to compute the CIDEr metric
16 |
17 | """
18 | def __init__(self, test=None, refs=None, n=4, sigma=6.0):
19 | # set cider to sum over 1 to 4-grams
20 | self._n = n
21 | # set the standard deviation parameter for gaussian penalty
22 | self._sigma = sigma
23 |
24 | def compute_score(self, gts, res):
25 | """
26 | Main function to compute CIDEr score
27 | :param hypo_for_image (dict) : dictionary with key and value
28 | ref_for_image (dict) : dictionary with key and value
29 | :return: cider (float) : computed CIDEr score for the corpus
30 | """
31 |
32 | assert(gts.keys() == res.keys())
33 | imgIds = gts.keys()
34 |
35 | cider_scorer = CiderScorer(n=self._n, sigma=self._sigma)
36 |
37 | for id in imgIds:
38 | hypo = res[id]
39 | ref = gts[id]
40 |
41 | # Sanity check.
42 | assert(type(hypo) is list)
43 | assert(len(hypo) == 1)
44 | assert(type(ref) is list)
45 | assert(len(ref) > 0)
46 |
47 | cider_scorer += (hypo[0], ref)
48 |
49 | (score, scores) = cider_scorer.compute_score()
50 |
51 | return score, scores
52 |
53 | def method(self):
54 | return "CIDEr"
--------------------------------------------------------------------------------
/lib/pycocoevalcap/eval.py:
--------------------------------------------------------------------------------
1 | __author__ = 'tylin'
2 | from tokenizer.ptbtokenizer import PTBTokenizer
3 | from bleu.bleu import Bleu
4 | from meteor.meteor import Meteor
5 | from rouge.rouge import Rouge
6 | from cider.cider import Cider
7 |
8 | class COCOEvalCap:
9 | def __init__(self, coco, cocoRes):
10 | self.evalImgs = []
11 | self.eval = {}
12 | self.imgToEval = {}
13 | self.coco = coco
14 | self.cocoRes = cocoRes
15 | self.params = {'image_id': coco.getImgIds()}
16 |
17 | def evaluate(self):
18 | imgIds = self.params['image_id']
19 | # imgIds = self.coco.getImgIds()
20 | gts = {}
21 | res = {}
22 | for imgId in imgIds:
23 | gts[imgId] = self.coco.imgToAnns[imgId]
24 | res[imgId] = self.cocoRes.imgToAnns[imgId]
25 |
26 | # =================================================
27 | # Tokenization
28 | # =================================================
29 | print 'tokenization...'
30 | tokenizer = PTBTokenizer()
31 | gts = tokenizer.tokenize(gts)
32 | res = tokenizer.tokenize(res)
33 |
34 | # =================================================
35 | # Set up scorers
36 | # =================================================
37 | print 'setting up scorers...'
38 | scorers = [
39 | (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
40 | (Meteor(),"METEOR"),
41 | (Rouge(), "ROUGE_L"),
42 | (Cider(), "CIDEr")
43 | ]
44 |
45 | # =================================================
46 | # Compute scores
47 | # =================================================
48 | eval = {}
49 | for scorer, method in scorers:
50 | print 'computing %s score...'%(scorer.method())
51 | score, scores = scorer.compute_score(gts, res)
52 | if type(method) == list:
53 | for sc, scs, m in zip(score, scores, method):
54 | self.setEval(sc, m)
55 | self.setImgToEvalImgs(scs, imgIds, m)
56 | print "%s: %0.3f"%(m, sc)
57 | else:
58 | self.setEval(score, method)
59 | self.setImgToEvalImgs(scores, imgIds, method)
60 | print "%s: %0.3f"%(method, score)
61 | self.setEvalImgs()
62 |
63 | def setEval(self, score, method):
64 | self.eval[method] = score
65 |
66 | def setImgToEvalImgs(self, scores, imgIds, method):
67 | for imgId, score in zip(imgIds, scores):
68 | if not imgId in self.imgToEval:
69 | self.imgToEval[imgId] = {}
70 | self.imgToEval[imgId]["image_id"] = imgId
71 | self.imgToEval[imgId][method] = score
72 |
73 | def setEvalImgs(self):
74 | self.evalImgs = [eval for imgId, eval in self.imgToEval.items()]
--------------------------------------------------------------------------------
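Usage sketch for COCOEvalCap: it gathers ground-truth and result captions per image id, tokenizes both with PTBTokenizer, and runs every scorer above. The coco and cocoRes objects come from the pycocotools COCO API, which is not part of this repo, so the snippet below is only the assumed call pattern:

    # coco: ground-truth COCO object; cocoRes = coco.loadRes('results.json')
    coco_eval = COCOEvalCap(coco, cocoRes)
    coco_eval.evaluate()

    for metric, value in coco_eval.eval.items():
        print('%s: %.3f' % (metric, value))
    # per-image numbers are in coco_eval.imgToEval / coco_eval.evalImgs
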
/lib/pycocoevalcap/meteor/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'tylin'
2 |
--------------------------------------------------------------------------------
/lib/pycocoevalcap/meteor/meteor-1.5.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/InnerPeace-Wu/densecap-tensorflow/2c77998f73832d5bb7324e97e8a7419cbce8398c/lib/pycocoevalcap/meteor/meteor-1.5.jar
--------------------------------------------------------------------------------
/lib/pycocoevalcap/meteor/meteor.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | # Python wrapper for METEOR implementation, by Xinlei Chen
4 | # Modified by Linjie Yang for evaluating dense captioning
5 | # Acknowledge Michael Denkowski for the generous discussion and help
6 |
7 | import os
8 | import sys
9 | import subprocess
10 | import threading
11 |
12 | # Assumes meteor-1.5.jar is in the same directory as meteor.py. Change as needed.
13 | METEOR_JAR = 'meteor-1.5.jar'
14 | # print METEOR_JAR
15 |
16 | class Meteor:
17 |
18 | def __init__(self):
19 | self.meteor_cmd = ['java', '-jar', '-Xmx2G', METEOR_JAR, \
20 | '-', '-', '-stdio', '-l', 'en', '-norm']
21 | self.meteor_p = subprocess.Popen(self.meteor_cmd, \
22 | cwd=os.path.dirname(os.path.abspath(__file__)), \
23 | stdin=subprocess.PIPE, \
24 | stdout=subprocess.PIPE, \
25 | stderr=subprocess.PIPE)
26 | # Used to guarantee thread safety
27 | self.lock = threading.Lock()
28 |
29 | def compute_score(self, gts, res, imgIds=None):
30 | assert(gts.keys() == res.keys())
31 | if imgIds is None:
32 | imgIds = gts.keys()
33 | scores = []
34 |
35 | eval_line = 'EVAL'
36 | self.lock.acquire()
37 | for i in imgIds:
38 | assert(len(res[i]) == 1)
39 |
40 | stat = self._stat(res[i][0], gts[i])
41 | eval_line += ' ||| {}'.format(stat)
42 |
43 | self.meteor_p.stdin.write('{}\n'.format(eval_line))
44 | for i in range(0,len(imgIds)):
45 | scores.append(float(self.meteor_p.stdout.readline().strip()))
46 | final_score = self.meteor_p.stdout.readline().strip()
47 | #print final_score
48 | score = float(final_score)
49 | self.lock.release()
50 |
51 | return score, scores
52 |
53 |
54 | def compute_score_m2m(self, gts, res, imgIds=None):
55 | assert(gts.keys() == res.keys())
56 | if imgIds is None:
57 | imgIds = gts.keys()
58 | scores = []
59 |
60 | eval_line = 'EVAL'
61 | self.lock.acquire()
62 | tot_line = 0
63 | for i in imgIds:
64 | #assert(len(res[i]) == 1)
65 | for res_sent in res[i]:
66 | stat = self._stat(res_sent, gts[i])
67 | eval_line += ' ||| {}'.format(stat)
68 | tot_line += 1
69 | self.meteor_p.stdin.write('{}\n'.format(eval_line))
70 | for i in range(0,len(imgIds)):
71 | scores_im = []
72 | for j in xrange(len(res[i])):
73 | scores_im.append(float(self.meteor_p.stdout.readline().strip()))
74 | scores.append(scores_im)
75 | score = float(self.meteor_p.stdout.readline().strip())
76 | self.lock.release()
77 |
78 | return score, scores
79 | def method(self):
80 | return "METEOR"
81 |
82 | def _stat(self, hypothesis_str, reference_list):
83 | # SCORE ||| reference 1 words ||| reference n words ||| hypothesis words
84 | hypothesis_str = hypothesis_str.replace('|||', '').replace('  ', ' ')
85 | score_line = ' ||| '.join(('SCORE', ' ||| '.join(reference_list), hypothesis_str))
86 | self.meteor_p.stdin.write('{}\n'.format(score_line))
87 | return self.meteor_p.stdout.readline().strip()
88 |
89 | def score(self, hypothesis_str, reference_list):
90 | self.lock.acquire()
91 | # SCORE ||| reference 1 words ||| reference n words ||| hypothesis words
92 | hypothesis_str = hypothesis_str.replace('|||', '').replace('  ', ' ')
93 | score_line = ' ||| '.join(('SCORE', ' ||| '.join(reference_list), hypothesis_str))
94 | self.meteor_p.stdin.write('{}\n'.format(score_line))
95 | stats = self.meteor_p.stdout.readline().strip()
96 | eval_line = 'EVAL ||| {}'.format(stats)
97 | # EVAL ||| stats
98 | self.meteor_p.stdin.write('{}\n'.format(eval_line))
99 | score = float(self.meteor_p.stdout.readline().strip())
100 | self.lock.release()
101 | return score
102 |
103 | def __exit__(self):
104 | self.lock.acquire()
105 | self.meteor_p.stdin.close()
106 | self.meteor_p.wait()
107 | self.lock.release()
108 |
--------------------------------------------------------------------------------
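The METEOR wrapper keeps a single Java process (meteor-1.5.jar) alive and talks to it over stdin/stdout: each hypothesis/reference pair is sent as a 'SCORE ||| ref1 ||| ... ||| hyp' line that returns sufficient statistics, and one 'EVAL ||| stats ||| stats ...' line then yields a score per segment followed by the aggregate score. A minimal sketch, assuming Java is installed and the jar sits next to meteor.py as noted above:

    m = Meteor()   # spawns the Java subprocess
    gts = {0: ['a man rides a bicycle', 'a person on a bike']}
    res = {0: ['a man riding a bike']}
    score, scores = m.compute_score(gts, res)
    print(score, scores)   # aggregate METEOR and the per-id list
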
/lib/pycocoevalcap/rouge/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'vrama91'
2 |
--------------------------------------------------------------------------------
/lib/pycocoevalcap/rouge/rouge.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # File Name : rouge.py
4 | #
5 | # Description : Computes ROUGE-L metric as described by Lin and Hovy (2004)
6 | #
7 | # Creation Date : 2015-01-07 06:03
8 | # Author : Ramakrishna Vedantam
9 |
10 | import numpy as np
11 | import pdb
12 |
13 | def my_lcs(string, sub):
14 | """
15 | Calculates longest common subsequence for a pair of tokenized strings
16 | :param string : list of str : tokens from a string split using whitespace
17 | :param sub : list of str : shorter string, also split using whitespace
18 | :returns: length (int): length of the longest common subsequence between the two strings
19 |
20 | Note: my_lcs only gives length of the longest common subsequence, not the actual LCS
21 | """
22 | if(len(string)< len(sub)):
23 | sub, string = string, sub
24 |
25 | lengths = [[0 for i in range(0,len(sub)+1)] for j in range(0,len(string)+1)]
26 |
27 | for j in range(1,len(sub)+1):
28 | for i in range(1,len(string)+1):
29 | if(string[i-1] == sub[j-1]):
30 | lengths[i][j] = lengths[i-1][j-1] + 1
31 | else:
32 | lengths[i][j] = max(lengths[i-1][j] , lengths[i][j-1])
33 |
34 | return lengths[len(string)][len(sub)]
35 |
36 | class Rouge():
37 | '''
38 | Class for computing ROUGE-L score for a set of candidate sentences for the MS COCO test set
39 |
40 | '''
41 | def __init__(self):
42 | # vrama91: updated the value below based on discussion with Hovy
43 | self.beta = 1.2
44 |
45 | def calc_score(self, candidate, refs):
46 | """
47 | Compute ROUGE-L score given one candidate and references for an image
48 | :param candidate: str : candidate sentence to be evaluated
49 | :param refs: list of str : COCO reference sentences for the particular image to be evaluated
50 | :returns score: float (ROUGE-L score for the candidate evaluated against references)
51 | """
52 | assert(len(candidate)==1)
53 | assert(len(refs)>0)
54 | prec = []
55 | rec = []
56 |
57 | # split into tokens
58 | token_c = candidate[0].split(" ")
59 |
60 | for reference in refs:
61 | # split into tokens
62 | token_r = reference.split(" ")
63 | # compute the longest common subsequence
64 | lcs = my_lcs(token_r, token_c)
65 | prec.append(lcs/float(len(token_c)))
66 | rec.append(lcs/float(len(token_r)))
67 |
68 | prec_max = max(prec)
69 | rec_max = max(rec)
70 |
71 | if(prec_max!=0 and rec_max !=0):
72 | score = ((1 + self.beta**2)*prec_max*rec_max)/float(rec_max + self.beta**2*prec_max)
73 | else:
74 | score = 0.0
75 | return score
76 |
77 | def compute_score(self, gts, res):
78 | """
79 | Computes Rouge-L score given a set of reference and candidate sentences for the dataset
80 | Invoked by evaluate_captions.py
81 | :param hypo_for_image: dict : candidate / test sentences with "image name" key and "tokenized sentences" as values
82 | :param ref_for_image: dict : reference MS-COCO sentences with "image name" key and "tokenized sentences" as values
83 | :returns: average_score: float (mean ROUGE-L score computed by averaging scores for all the images)
84 | """
85 | assert(gts.keys() == res.keys())
86 | imgIds = gts.keys()
87 |
88 | score = []
89 | for id in imgIds:
90 | hypo = res[id]
91 | ref = gts[id]
92 |
93 | score.append(self.calc_score(hypo, ref))
94 |
95 | # Sanity check.
96 | assert(type(hypo) is list)
97 | assert(len(hypo) == 1)
98 | assert(type(ref) is list)
99 | assert(len(ref) > 0)
100 |
101 | average_score = np.mean(np.array(score))
102 | return average_score, np.array(score)
103 |
104 | def method(self):
105 | return "Rouge"
106 |
--------------------------------------------------------------------------------
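ROUGE-L here is an LCS-based F-measure: with the best reference giving precision p = LCS / len(hypothesis tokens) and recall r = LCS / len(reference tokens), the score is (1 + beta^2) * p * r / (r + beta^2 * p) with beta = 1.2, and 0 when either term is 0. A small sketch with made-up sentences:

    r = Rouge()
    # candidate is a one-element list; refs is a list of reference strings
    score = r.calc_score(['a man riding a bike'],
                         ['a man rides a bicycle', 'a person riding a bike'])
    print(score)   # LCS-based F-measure in [0, 1]
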
/lib/pycocoevalcap/tokenizer/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'hfang'
2 |
--------------------------------------------------------------------------------
/lib/pycocoevalcap/tokenizer/ptbtokenizer.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # File Name : ptbtokenizer.py
4 | #
5 | # Description : Do the PTB Tokenization and remove punctuations.
6 | #
7 | # Creation Date : 29-12-2014
8 | # Last Modified : Thu Mar 19 09:53:35 2015
9 | # Authors : Hao Fang and Tsung-Yi Lin
10 |
11 | import os
12 | import sys
13 | import subprocess
14 | import tempfile
15 | import itertools
16 |
17 | # path to the stanford corenlp jar
18 | STANFORD_CORENLP_3_4_1_JAR = 'stanford-corenlp-3.4.1.jar'
19 |
20 | # punctuations to be removed from the sentences
21 | PUNCTUATIONS = ["''", "'", "``", "`", "-LRB-", "-RRB-", "-LCB-", "-RCB-",
22 | ".", "?", "!", ",", ":", "-", "--", "...", ";"]
23 |
24 |
25 | class PTBTokenizer:
26 | """Python wrapper of Stanford PTBTokenizer"""
27 |
28 | def tokenize(self, captions_for_image):
29 | cmd = ['java', '-cp', STANFORD_CORENLP_3_4_1_JAR,
30 | 'edu.stanford.nlp.process.PTBTokenizer',
31 | '-preserveLines', '-lowerCase']
32 |
33 | # ======================================================
34 | # prepare data for PTB Tokenizer
35 | # ======================================================
36 | final_tokenized_captions_for_image = {}
37 | image_id = [k for k, v in captions_for_image.items() for _ in range(len(v))]
38 | sentences = '\n'.join([c['caption'].replace('\n', ' ') for k, v in captions_for_image.items() for c in v])
39 |
40 | # ======================================================
41 | # save sentences to temporary file
42 | # ======================================================
43 | path_to_jar_dirname = os.path.dirname(os.path.abspath(__file__))
44 | tmp_file = tempfile.NamedTemporaryFile(delete=False, dir=path_to_jar_dirname)
45 | tmp_file.write(sentences)
46 | tmp_file.close()
47 |
48 | # ======================================================
49 | # tokenize sentence
50 | # ======================================================
51 | cmd.append(os.path.basename(tmp_file.name))
52 | p_tokenizer = subprocess.Popen(cmd, cwd=path_to_jar_dirname,
53 | stdout=subprocess.PIPE) # shell=True
54 | token_lines = p_tokenizer.communicate(input=sentences.rstrip())[0]
55 | lines = token_lines.split('\n')
56 | # remove temp file
57 | os.remove(tmp_file.name)
58 |
59 | # ======================================================
60 | # create dictionary for tokenized captions
61 | # ======================================================
62 | for k, line in zip(image_id, lines):
63 | if not k in final_tokenized_captions_for_image:
64 | final_tokenized_captions_for_image[k] = []
65 | tokenized_caption = ' '.join([w for w in line.rstrip().split(' ')
66 | if w not in PUNCTUATIONS])
67 | final_tokenized_captions_for_image[k].append(tokenized_caption)
68 |
69 | return final_tokenized_captions_for_image
70 |
--------------------------------------------------------------------------------
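PTBTokenizer consumes the raw format (id -> list of {'caption': ...} dicts), shells out to the bundled Stanford CoreNLP jar, and returns id -> list of lowercased, punctuation-stripped token strings, which is what the scorers above expect. A sketch, assuming Java is available:

    captions = {0: [{'caption': 'A man, riding a bike!'},
                    {'caption': 'A person on a bicycle.'}]}
    tokenized = PTBTokenizer().tokenize(captions)
    print(tokenized)   # {0: ['a man riding a bike', 'a person on a bicycle']}
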
/lib/pycocoevalcap/tokenizer/stanford-corenlp-3.4.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/InnerPeace-Wu/densecap-tensorflow/2c77998f73832d5bb7324e97e8a7419cbce8398c/lib/pycocoevalcap/tokenizer/stanford-corenlp-3.4.1.jar
--------------------------------------------------------------------------------
/lib/setup.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # DenseCap-Tensorflow
3 | # Written by InnerPeace
4 | # This file is adapted from Ross Girshick's work
5 | # --------------------------------------------------------
6 | # Fast R-CNN
7 | # Copyright (c) 2015 Microsoft
8 | # Licensed under The MIT License [see LICENSE for details]
9 | # Written by Ross Girshick
10 | # --------------------------------------------------------
11 |
12 |
13 | import os
14 | from os.path import join as pjoin
15 | from setuptools import setup
16 | from distutils.extension import Extension
17 | from Cython.Distutils import build_ext
18 | import subprocess
19 | import numpy as np
20 |
21 | def find_in_path(name, path):
22 | "Find a file in a search path"
23 | # Adapted from
24 | # http://code.activestate.com/recipes/52224-find-a-file-given-a-search-path/
25 | for dir in path.split(os.pathsep):
26 | binpath = pjoin(dir, name)
27 | if os.path.exists(binpath):
28 | return os.path.abspath(binpath)
29 | return None
30 |
31 |
32 | def locate_cuda():
33 | """Locate the CUDA environment on the system
34 |
35 | Returns a dict with keys 'home', 'nvcc', 'include', and 'lib64'
36 | and values giving the absolute path to each directory.
37 |
38 | Starts by looking for the CUDAHOME env variable. If not found, everything
39 | is based on finding 'nvcc' in the PATH.
40 | """
41 |
42 | # first check if the CUDAHOME env variable is in use
43 | if 'CUDAHOME' in os.environ:
44 | home = os.environ['CUDAHOME']
45 | nvcc = pjoin(home, 'bin', 'nvcc')
46 | else:
47 | # otherwise, search the PATH for NVCC
48 | default_path = pjoin(os.sep, 'usr', 'local', 'cuda', 'bin')
49 | nvcc = find_in_path('nvcc', os.environ['PATH'] + os.pathsep + default_path)
50 | if nvcc is None:
51 | raise EnvironmentError('The nvcc binary could not be '
52 | 'located in your $PATH. Either add it to your path, or set $CUDAHOME')
53 | home = os.path.dirname(os.path.dirname(nvcc))
54 |
55 | cudaconfig = {'home':home, 'nvcc':nvcc,
56 | 'include': pjoin(home, 'include'),
57 | 'lib64': pjoin(home, 'lib64')}
58 | for k, v in cudaconfig.iteritems():
59 | if not os.path.exists(v):
60 | raise EnvironmentError('The CUDA %s path could not be located in %s' % (k, v))
61 |
62 | return cudaconfig
63 | CUDA = locate_cuda()
64 |
65 |
66 | # Obtain the numpy include directory. This logic works across numpy versions.
67 | try:
68 | numpy_include = np.get_include()
69 | except AttributeError:
70 | numpy_include = np.get_numpy_include()
71 |
72 | def customize_compiler_for_nvcc(self):
73 | """inject deep into distutils to customize how the dispatch
74 | to gcc/nvcc works.
75 |
76 | If you subclass UnixCCompiler, it's not trivial to get your subclass
77 | injected in, and still have the right customizations (i.e.
78 | distutils.sysconfig.customize_compiler) run on it. So instead of going
79 | the OO route, I have this. Note, it's kind of like a weird functional
80 | subclassing going on."""
81 |
82 | # tell the compiler it can process .cu files
83 | self.src_extensions.append('.cu')
84 |
85 | # save references to the default compiler_so and _compile methods
86 | default_compiler_so = self.compiler_so
87 | super = self._compile
88 |
89 | # now redefine the _compile method. This gets executed for each
90 | # object but distutils doesn't have the ability to change compilers
91 | # based on source extension: we add it.
92 | def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts):
93 | if os.path.splitext(src)[1] == '.cu':
94 | # use the cuda for .cu files
95 | self.set_executable('compiler_so', CUDA['nvcc'])
96 | # use only a subset of the extra_postargs, which are 1-1 translated
97 | # from the extra_compile_args in the Extension class
98 | postargs = extra_postargs['nvcc']
99 | else:
100 | postargs = extra_postargs['gcc']
101 |
102 | super(obj, src, ext, cc_args, postargs, pp_opts)
103 | # reset the default compiler_so, which we might have changed for cuda
104 | self.compiler_so = default_compiler_so
105 |
106 | # inject our redefined _compile method into the class
107 | self._compile = _compile
108 |
109 |
110 | # run the customize_compiler
111 | class custom_build_ext(build_ext):
112 | def build_extensions(self):
113 | customize_compiler_for_nvcc(self.compiler)
114 | build_ext.build_extensions(self)
115 |
116 |
117 | ext_modules = [
118 | Extension(
119 | "utils.cython_bbox",
120 | ["utils/bbox.pyx"],
121 | extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]},
122 | include_dirs = [numpy_include]
123 | ),
124 | Extension(
125 | "nms.cpu_nms",
126 | ["nms/cpu_nms.pyx"],
127 | extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]},
128 | include_dirs = [numpy_include]
129 | ),
130 | Extension('nms.gpu_nms',
131 | ['nms/nms_kernel.cu', 'nms/gpu_nms.pyx'],
132 | library_dirs=[CUDA['lib64']],
133 | libraries=['cudart'],
134 | language='c++',
135 | runtime_library_dirs=[CUDA['lib64']],
136 | # this syntax is specific to this build system
137 | # we're only going to use certain compiler args with nvcc and not with
138 | # gcc; the implementation of this trick is in customize_compiler_for_nvcc() above
139 | extra_compile_args={'gcc': ["-Wno-unused-function"],
140 | 'nvcc': ['-arch=sm_35',
141 | '--ptxas-options=-v',
142 | '-c',
143 | '--compiler-options',
144 | "'-fPIC'"]},
145 | include_dirs = [numpy_include, CUDA['include']]
146 | ),
147 | ]
148 |
149 | setup(
150 | name='fast_rcnn',
151 | ext_modules=ext_modules,
152 | # inject our custom trigger
153 | cmdclass={'build_ext': custom_build_ext},
154 | )
155 |
156 |
--------------------------------------------------------------------------------
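This setup script builds the three extensions (utils.cython_bbox, nms.cpu_nms, nms.gpu_nms): locate_cuda honours the CUDAHOME environment variable and otherwise searches PATH for nvcc, and custom_build_ext routes .cu sources through nvcc using the 'nvcc' entry of extra_compile_args. The usual in-place build is "cd lib && python setup.py build_ext --inplace" (presumably what lib/Makefile wraps); the hard-coded -arch=sm_35 flag may need changing for other GPU architectures.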
/lib/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Fast R-CNN
3 | # Copyright (c) 2015 Microsoft
4 | # Licensed under The MIT License [see LICENSE for details]
5 | # Written by Ross Girshick
6 | # --------------------------------------------------------
7 |
--------------------------------------------------------------------------------
/lib/utils/bbox.pyx:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Fast R-CNN
3 | # Copyright (c) 2015 Microsoft
4 | # Licensed under The MIT License [see LICENSE for details]
5 | # Written by Sergey Karayev
6 | # --------------------------------------------------------
7 |
8 | cimport cython
9 | import numpy as np
10 | cimport numpy as np
11 |
12 | DTYPE = np.float
13 | ctypedef np.float_t DTYPE_t
14 |
15 | def bbox_overlaps(
16 | np.ndarray[DTYPE_t, ndim=2] boxes,
17 | np.ndarray[DTYPE_t, ndim=2] query_boxes):
18 | """
19 | Parameters
20 | ----------
21 | boxes: (N, 4) ndarray of float
22 | query_boxes: (K, 4) ndarray of float
23 | Returns
24 | -------
25 | overlaps: (N, K) ndarray of overlap between boxes and query_boxes
26 | """
27 | cdef unsigned int N = boxes.shape[0]
28 | cdef unsigned int K = query_boxes.shape[0]
29 | cdef np.ndarray[DTYPE_t, ndim=2] overlaps = np.zeros((N, K), dtype=DTYPE)
30 | cdef DTYPE_t iw, ih, box_area
31 | cdef DTYPE_t ua
32 | cdef unsigned int k, n
33 | for k in range(K):
34 | box_area = (
35 | (query_boxes[k, 2] - query_boxes[k, 0] + 1) *
36 | (query_boxes[k, 3] - query_boxes[k, 1] + 1)
37 | )
38 | for n in range(N):
39 | iw = (
40 | min(boxes[n, 2], query_boxes[k, 2]) -
41 | max(boxes[n, 0], query_boxes[k, 0]) + 1
42 | )
43 | if iw > 0:
44 | ih = (
45 | min(boxes[n, 3], query_boxes[k, 3]) -
46 | max(boxes[n, 1], query_boxes[k, 1]) + 1
47 | )
48 | if ih > 0:
49 | ua = float(
50 | (boxes[n, 2] - boxes[n, 0] + 1) *
51 | (boxes[n, 3] - boxes[n, 1] + 1) +
52 | box_area - iw * ih
53 | )
54 | overlaps[n, k] = iw * ih / ua
55 | return overlaps
56 |
--------------------------------------------------------------------------------
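bbox_overlaps returns the pairwise IoU matrix using the inclusive-pixel (+1) convention of Fast R-CNN, with boxes given as [x1, y1, x2, y2]. Once built by lib/setup.py it is imported as utils.cython_bbox; a quick shape/value sketch:

    import numpy as np
    from utils.cython_bbox import bbox_overlaps  # compiled from bbox.pyx

    boxes = np.array([[0., 0., 9., 9.]])            # (N, 4)
    query = np.array([[0., 0., 9., 9.],
                      [5., 5., 14., 14.]])          # (K, 4)
    print(bbox_overlaps(boxes, query))              # (N, K) IoU matrix
    # -> [[1.0, 0.1429]] since 25 / (100 + 100 - 25) = 25/175
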
/lib/utils/bbox_utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from collections import OrderedDict
4 | import json
5 | import numpy as np
6 | import pprint
7 | import cPickle as pickle
8 | import string
9 |
10 | def get_bbox_coord(norm_coord, do_clip=True):
11 | #input is a nx4 numpy array in normalized bbox coordinates
12 | #print norm_coord.shape
13 | #print norm_coord
14 | bboxes_coord = np.zeros(norm_coord.shape)
15 | #x,y,w,h
16 | bboxes_coord[:, :2] = norm_coord[:, :2]+0.5
17 | bboxes_coord[:, 2:] = np.exp(norm_coord[:, 2:])
18 |
19 | #x1,y1,x2,y2
20 | bboxes_coord2 = np.zeros(norm_coord.shape)
21 | bboxes_coord2[:, :2] = bboxes_coord[:, :2] - bboxes_coord[:, 2:] * 0.5
22 | bboxes_coord2[:, 2:] = bboxes_coord[:, :2] + bboxes_coord[:, 2:] * 0.5
23 | #clipping all coordinates to [0,1]
24 | if do_clip:
25 | bboxes_coord2 = np.minimum(np.maximum(bboxes_coord2, 0), 1)
26 | return bboxes_coord2
27 |
28 |
29 | def get_bbox_iou_matrix(bboxes):
30 | region_n = bboxes.shape[0]
31 | #area, intersection area, union area
32 | bbox_areas = (bboxes[:,2] - bboxes[:,0]) * \
33 | (bboxes[:, 3] - bboxes[:, 1])
34 |
35 | x_a1 = bboxes[:,0].reshape(region_n,1)
36 | x_a2 = bboxes[:,2].reshape(region_n,1)
37 | x_b1 = bboxes[:,0].reshape(1,region_n)
38 | x_b2 = bboxes[:,2].reshape(1,region_n)
39 | y_a1 = bboxes[:,1].reshape(region_n,1)
40 | y_a2 = bboxes[:,3].reshape(region_n,1)
41 | y_b1 = bboxes[:,1].reshape(1,region_n)
42 | y_b2 = bboxes[:,3].reshape(1,region_n)
43 | bbox_pair_x_diff = np.maximum(0, np.minimum(x_a2, x_b2) - np.maximum(x_a1, x_b1))
44 | bbox_pair_y_diff = np.maximum(0, np.minimum(y_a2, y_b2) - np.maximum(y_a1, y_b1))
45 | inter_areas = bbox_pair_x_diff * bbox_pair_y_diff
46 |
47 | #IoU
48 | union_areas = bbox_areas.reshape(region_n,1) + bbox_areas.reshape(1,region_n)
49 |
50 | bbox_iou = inter_areas / (union_areas - inter_areas)
51 | return bbox_iou
52 |
53 | def nms(region_info, bbox_th=0.3):
54 | # non-maximum suppression
55 | region_info.sort(key = lambda x: -x['log_prob'])
56 | #keep_index = []
57 | region_n = len(region_info)
58 | #fast computation of pairwise IoU
59 | #pick the bbox of last timestep of each sample
60 | #print 'region_info length %d' % len(region_info)
61 | all_bboxes = np.array([x['location'][-1,:] for x in region_info])# nx4 matrix
62 | bbox_iou = get_bbox_iou_matrix(all_bboxes)
63 | bbox_iou_th = bbox_iou < bbox_th
64 | keep_flag = np.ones((region_n),dtype=np.uint8)
65 |
66 | for i in xrange(region_n-1):
67 | if keep_flag[i]:
68 | keep_flag[i+1:] = np.logical_and(keep_flag[i+1:], bbox_iou_th[i,i+1:])
69 | print 'sum of keep flag'
70 | print keep_flag.sum()
71 | return [region_info[i] for i in xrange(region_n) if keep_flag[i]]
72 |
73 | def region_merge(region_info, bbox_th=0.7):
74 | #merging ground truth bboxes
75 |
76 | #keep_index = []
77 | region_n = len(region_info)
78 | region_merged = []
79 | #fast computation of pairwise IoU
80 | #pick the bbox of last timestep of each sample
81 | all_bboxes = np.array([x['location'] for x in region_info], dtype = np.float32)# nx4 matrix
82 | bbox_iou = get_bbox_iou_matrix(all_bboxes)
83 | bbox_iou_th = bbox_iou > bbox_th
84 | bbox_iou_overlap_n = bbox_iou_th.sum(axis = 0)
85 |
86 | merge_flag = np.ones((region_n),dtype=np.uint8)
87 | unmerged_region = region_n
88 | while unmerged_region > 0:
89 | max_overlap_id = np.argmax(bbox_iou_overlap_n)
90 | assert bbox_iou_overlap_n[max_overlap_id] > 0
91 | merge_group = np.nonzero(bbox_iou_th[max_overlap_id,:] & merge_flag)[0]
92 | unmerged_region -= len(merge_group)
93 | merge_flag[merge_group] = 0
94 | bbox_iou_overlap_n[merge_group] = 0
95 | bbox_group = all_bboxes[merge_group,:].reshape(len(merge_group),4)
96 | caption_group = [region_info[i]['caption'] for i in merge_group]
97 | bbox_mean = np.mean(bbox_group, axis = 0).tolist()
98 | region_merged.append({'image_id':region_info[max_overlap_id]['image_id'], \
99 | 'captions': caption_group, 'location': bbox_mean})
100 | return region_merged
101 |
102 |
--------------------------------------------------------------------------------
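get_bbox_iou_matrix is the pure-numpy counterpart (without the +1 convention) used by nms() and region_merge(): nms keeps regions in decreasing log_prob order and drops later ones whose IoU with a kept region reaches bbox_th, while region_merge greedily groups boxes whose IoU exceeds bbox_th and averages each group's coordinates. A small sketch of the IoU helper, assuming it is imported from lib/utils/bbox_utils.py:

    import numpy as np

    bboxes = np.array([[0., 0., 10., 10.],
                       [5., 5., 15., 15.]])
    iou = get_bbox_iou_matrix(bboxes)   # symmetric 2x2 matrix with ones on the diagonal
    print(iou[0, 1])                    # 25 / (100 + 100 - 25) = 0.1429
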
/lib/utils/blob.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Fast R-CNN
3 | # Copyright (c) 2015 Microsoft
4 | # Licensed under The MIT License [see LICENSE for details]
5 | # Written by Ross Girshick
6 | # --------------------------------------------------------
7 |
8 | """Blob helper functions."""
9 |
10 | import numpy as np
11 | import cv2
12 |
13 |
14 | def im_list_to_blob(ims):
15 | """Convert a list of images into a network input.
16 |
17 | Assumes images are already prepared (means subtracted, BGR order, ...).
18 | """
19 | max_shape = np.array([im.shape for im in ims]).max(axis=0)
20 | num_images = len(ims)
21 | blob = np.zeros((num_images, max_shape[0], max_shape[1], 3),
22 | dtype=np.float32)
23 | for i in xrange(num_images):
24 | im = ims[i]
25 | blob[i, 0:im.shape[0], 0:im.shape[1], :] = im
26 | # Move channels (axis 3) to axis 1
27 | # Axis order will become: (batch elem, channel, height, width)
28 | # TODO: check out if we need transpose here.
29 | # For now, we stick to the tf_faster_rcnn version
30 | # channel_swap = (0, 3, 1, 2)
31 | # blob = blob.transpose(channel_swap)
32 | return blob
33 |
34 |
35 | def prep_im_for_blob(im, pixel_means, target_size, max_size):
36 | """Mean subtract and scale an image for use in a blob."""
37 | im = im.astype(np.float32, copy=False)
38 | im -= pixel_means
39 | im_shape = im.shape
40 | im_size_min = np.min(im_shape[0:2])
41 | im_size_max = np.max(im_shape[0:2])
42 | im_scale = float(target_size) / float(im_size_min)
43 | # Prevent the biggest axis from being more than MAX_SIZE
44 | if np.round(im_scale * im_size_max) > max_size:
45 | im_scale = float(max_size) / float(im_size_max)
46 | im = cv2.resize(im, None, None, fx=im_scale, fy=im_scale,
47 | interpolation=cv2.INTER_LINEAR)
48 |
49 | return im, im_scale
50 |
--------------------------------------------------------------------------------
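prep_im_for_blob scales the image so its short side matches target_size unless that would push the long side past max_size, and im_list_to_blob zero-pads every image in the list to the largest height/width while keeping NHWC order (the Caffe-style channel transpose is deliberately left commented out). A sketch with a dummy image; the BGR pixel means are the usual Fast R-CNN values and are only illustrative here:

    import numpy as np

    im = (np.random.rand(480, 640, 3) * 255).astype(np.uint8)
    pixel_means = np.array([[[102.9801, 115.9465, 122.7717]]])
    im, im_scale = prep_im_for_blob(im, pixel_means, target_size=600, max_size=720)
    blob = im_list_to_blob([im])
    print(blob.shape, im_scale)   # (1, 540, 720, 3) and 720/640 = 1.125
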
/lib/utils/debug.py:
--------------------------------------------------------------------------------
1 | ### Functions in this file are for debugging purposes
2 | ### Linjie Yang
3 |
4 | import numpy as np
5 |
6 | def softmax(x):
7 | """Compute softmax values for each sets of scores in x."""
8 | # defalut: last dimension of x is the score dimension
9 | axis = len(x.shape) - 1
10 | x = x - x.max(axis = axis, keepdims=True)
11 | sf = np.exp(x)
12 | sf = sf / np.sum(sf, axis=axis, keepdims=True)
13 | return sf
--------------------------------------------------------------------------------
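The max-subtraction makes the softmax numerically stable without changing its value, so rows with very large scores do not overflow in exp(). A quick check:

    import numpy as np

    x = np.array([[1., 2., 3.],
                  [1001., 1002., 1003.]])
    print(softmax(x))
    # both rows give ~[0.090, 0.245, 0.665]; the second would overflow np.exp without the shift
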
/lib/utils/timer.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Fast R-CNN
3 | # Copyright (c) 2015 Microsoft
4 | # Licensed under The MIT License [see LICENSE for details]
5 | # Written by Ross Girshick
6 | # --------------------------------------------------------
7 |
8 | import time
9 |
10 | class Timer(object):
11 | """A simple timer."""
12 | def __init__(self):
13 | self.total_time = 0.
14 | self.calls = 0
15 | self.start_time = 0.
16 | self.diff = 0.
17 | self.average_time = 0.
18 |
19 | def tic(self):
20 | # using time.time instead of time.clock because time.clock
21 | # does not normalize for multithreading
22 | self.start_time = time.time()
23 |
24 | def toc(self, average=True):
25 | self.diff = time.time() - self.start_time
26 | self.total_time += self.diff
27 | self.calls += 1
28 | self.average_time = self.total_time / self.calls
29 | if average:
30 | return self.average_time
31 | else:
32 | return self.diff
33 |
--------------------------------------------------------------------------------
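Timer accumulates wall-clock time over tic()/toc() pairs: toc(average=True) returns the running mean and toc(average=False) the duration of the last call. Typical use around a training step (train_step below is just a placeholder):

    timer = Timer()
    for _ in range(10):
        timer.tic()
        train_step()                        # placeholder for one iteration of work
        last = timer.toc(average=False)     # seconds for this step
    print(timer.average_time, timer.calls)  # mean step time and number of steps
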
/lib/utils/visualization.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Tensorflow Faster R-CNN
3 | # Licensed under The MIT License [see LICENSE for details]
4 | # Written by Xinlei Chen
5 | # --------------------------------------------------------
6 | from __future__ import absolute_import
7 | from __future__ import division
8 | from __future__ import print_function
9 |
10 | import pdb
11 | import numpy as np
12 | import numpy.random as npr
13 | from six.moves import range
14 | from lib.config import cfg
15 | import PIL.Image as Image
16 | import PIL.ImageColor as ImageColor
17 | import PIL.ImageDraw as ImageDraw
18 | import PIL.ImageFont as ImageFont
19 | from lib.fast_rcnn.nms_wrapper import nms
20 | from lib.fast_rcnn.bbox_transform import clip_boxes, bbox_transform_inv
21 |
22 | STANDARD_COLORS = [
23 | 'AliceBlue', 'Chartreuse', 'Aqua', 'Aquamarine', 'Azure', 'Beige', 'Bisque',
24 | 'BlanchedAlmond', 'BlueViolet', 'BurlyWood', 'CadetBlue', 'AntiqueWhite',
25 | 'Chocolate', 'Coral', 'CornflowerBlue', 'Cornsilk', 'Crimson', 'Cyan',
26 | 'DarkCyan', 'DarkGoldenRod', 'DarkGrey', 'DarkKhaki', 'DarkOrange',
27 | 'DarkOrchid', 'DarkSalmon', 'DarkSeaGreen', 'DarkTurquoise', 'DarkViolet',
28 | 'DeepPink', 'DeepSkyBlue', 'DodgerBlue', 'FireBrick', 'FloralWhite',
29 | 'ForestGreen', 'Fuchsia', 'Gainsboro', 'GhostWhite', 'Gold', 'GoldenRod',
30 | 'Salmon', 'Tan', 'HoneyDew', 'HotPink', 'IndianRed', 'Ivory', 'Khaki',
31 | 'Lavender', 'LavenderBlush', 'LawnGreen', 'LemonChiffon', 'LightBlue',
32 | 'LightCoral', 'LightCyan', 'LightGoldenRodYellow', 'LightGray', 'LightGrey',
33 | 'LightGreen', 'LightPink', 'LightSalmon', 'LightSeaGreen', 'LightSkyBlue',
34 | 'LightSlateGray', 'LightSlateGrey', 'LightSteelBlue', 'LightYellow', 'Lime',
35 | 'LimeGreen', 'Linen', 'Magenta', 'MediumAquaMarine', 'MediumOrchid',
36 | 'MediumPurple', 'MediumSeaGreen', 'MediumSlateBlue', 'MediumSpringGreen',
37 | 'MediumTurquoise', 'MediumVioletRed', 'MintCream', 'MistyRose', 'Moccasin',
38 | 'NavajoWhite', 'OldLace', 'Olive', 'OliveDrab', 'Orange', 'OrangeRed',
39 | 'Orchid', 'PaleGoldenRod', 'PaleGreen', 'PaleTurquoise', 'PaleVioletRed',
40 | 'PapayaWhip', 'PeachPuff', 'Peru', 'Pink', 'Plum', 'PowderBlue', 'Purple',
41 | 'Red', 'RosyBrown', 'RoyalBlue', 'SaddleBrown', 'Green', 'SandyBrown',
42 | 'SeaGreen', 'SeaShell', 'Sienna', 'Silver', 'SkyBlue', 'SlateBlue',
43 | 'SlateGray', 'SlateGrey', 'Snow', 'SpringGreen', 'SteelBlue', 'GreenYellow',
44 | 'Teal', 'Thistle', 'Tomato', 'Turquoise', 'Violet', 'Wheat', 'White',
45 | 'WhiteSmoke', 'Yellow', 'YellowGreen'
46 | ]
47 |
48 | NUM_COLORS = len(STANDARD_COLORS)
49 |
50 | try:
51 | FONT = ImageFont.truetype('arial.ttf', 24)
52 | except IOError:
53 | FONT = ImageFont.load_default()
54 |
55 |
56 | def _draw_single_box(image, xmin, ymin, xmax, ymax, display_str, font, color='black', thickness=4):
57 | draw = ImageDraw.Draw(image)
58 | (left, right, top, bottom) = (xmin, xmax, ymin, ymax)
59 | draw.line([(left, top), (left, bottom), (right, bottom),
60 | (right, top), (left, top)], width=thickness, fill=color)
61 | text_bottom = bottom
62 | # Reverse list and print from bottom to top.
63 | text_width, text_height = font.getsize(display_str)
64 | margin = np.ceil(0.05 * text_height)
65 | draw.rectangle(
66 | [(left, text_bottom - text_height - 2 * margin), (left + text_width,
67 | text_bottom)],
68 | fill=color)
69 | draw.text(
70 | (left + margin, text_bottom - text_height - margin),
71 | display_str,
72 | fill='black',
73 | font=font)
74 |
75 | return image
76 |
77 |
78 | def draw_bounding_boxes(image, gt_boxes, im_info, phrases):
79 |
80 | num_boxes = gt_boxes.shape[0]
81 | gt_boxes_new = gt_boxes.copy()
82 | gt_boxes_new[:, :4] = np.round(gt_boxes_new[:, :4].copy() / im_info[2])
83 | disp_image = Image.fromarray(np.uint8(image[0]))
84 |
85 | # show several(10) boxes for debugging
86 | show_ids = npr.choice(np.arange(num_boxes), size=5, replace=False)
87 | vocab_path = '%s/vocabulary.txt' % cfg.CACHE_DIR
88 | with open(vocab_path, 'r') as f:
89 | vocab = [line.strip() for line in f]
90 | # vocab_extra = ['', '', '']
91 | # for ex in vocab_extra:
92 | # vocab.insert(0, ex)
93 | for idx, i in enumerate(show_ids):
94 | # this_class = int(gt_boxes_new[i, 4])
95 | # phrase = phrases[i] if len(phrases[i]) < cfg.TIME_STEPS else phrases[1:]
96 | # for adding gt bounding box
97 | if len(phrases[i]) < cfg.TIME_STEPS:
98 | phrase = phrases[i]
99 | # for adding predicted boxes
100 | else:
101 | phrase = []
102 | # phrases[i][1:] to remove the token
103 | for p in phrases[i]:
104 | if p == cfg.END_INDEX:
105 | break
106 | phrase.append(p)
107 |
108 | caption = ' '.join([vocab[j - 3] if j - 3 >= 0 else "" for j
109 | in phrase])
110 | # caption = " ".join([vocab[j] for j in phrase[i])
111 | disp_image = _draw_single_box(disp_image,
112 | gt_boxes_new[i, 0],
113 | gt_boxes_new[i, 1],
114 | gt_boxes_new[i, 2],
115 | gt_boxes_new[i, 3],
116 | '%s_%s' % (i, caption),
117 | FONT,
118 | color=STANDARD_COLORS[idx % NUM_COLORS])
119 |
120 | image[0, :] = np.array(disp_image)
121 | return image
122 |
123 |
124 | def draw_densecap(image, scores, rois, im_info, cap_probs, bbox_pred):
125 | """
126 | bbox_pred: [None, 4]
127 | rois: [None, 5]
128 |
129 | """
130 | # for bbox unnormalization
131 |
132 | bbox_mean = np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS).reshape((1, 4))
133 | bbox_stds = np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS).reshape((1, 4))
134 |
135 | boxes = rois[:, 1:5] / im_info[2]
136 | # [None, 12]
137 | cap_ids = np.argmax(cap_probs, axis=1).reshape((-1, cfg.TIME_STEPS))
138 |
139 | # bbox target unnormalization
140 | box_deltas = bbox_pred * bbox_stds + bbox_mean
141 |
142 | # do the transformation
143 | pred_boxes = bbox_transform_inv(boxes, box_deltas)
144 | pred_boxes = clip_boxes(pred_boxes, image.shape)
145 |
146 | pos_dets = np.hstack((pred_boxes, scores[:, 1][:, np.newaxis])).astype(np.float32, copy=False)
147 | keep = nms(pos_dets, cfg.TEST.NMS)
148 | pos_boxes = boxes[keep, :]
149 | cap_ids = cap_ids[keep, :]
150 | im_info[2] = 1.
151 | img_cap = draw_bounding_boxes(image, pos_boxes, im_info, cap_ids)
152 |
153 | return img_cap
154 |
--------------------------------------------------------------------------------
/logs/densecap.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/InnerPeace-Wu/densecap-tensorflow/2c77998f73832d5bb7324e97e8a7419cbce8398c/logs/densecap.png
--------------------------------------------------------------------------------
/logs/funny.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/InnerPeace-Wu/densecap-tensorflow/2c77998f73832d5bb7324e97e8a7419cbce8398c/logs/funny.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | Cython>=0.19.2
2 | opencv-python>=3.3.0
3 | numpy>=1.7.1
4 | scipy>=0.13.2
5 | scikit-image>=0.9.3
6 | matplotlib>=1.3.1
7 | ipython>=3.0.0
8 | pyyaml>=3.10
9 | Pillow>=2.3.0
10 | easydict>=1.6
11 | ijson>=2.3
12 | tqdm>=4.17.1
13 |
--------------------------------------------------------------------------------
/scripts/dense_cap_config.yml:
--------------------------------------------------------------------------------
1 | EXP_DIR: DenseCap
2 | DEBUG_ALL: False
3 | ALL_TEST: False
4 | ALL_TEST_NUM_TRAIN: 100
5 | ALL_TEST_NUM_VAL: 100
6 | ALL_TEST_NUM_TEST: 1000
7 | LIMIT_RAM: True
8 | EMBED_DIM: 512
9 | CONTEXT_FUSION: False
10 | INIT_BY_GLOVE: False
11 | KEEP_AS_GLOVE_DIM: False
12 | GLOVE_DIM: 300
13 | TRAIN:
14 | HAS_RPN: True
15 | IMS_PER_BATCH: 1
16 | BBOX_NORMALIZE_TARGETS_PRECOMPUTED: True
17 | RPN_POSITIVE_OVERLAP: 0.7
18 | SUMMARY_INTERVAL: 10
19 | RPN_BATCHSIZE: 256
20 | BATCH_SIZE: 256
21 | PROPOSAL_METHOD: gt
22 | BG_THRESH_LO: 0.0
23 | FG_FRACTION: 0.5
24 | RPN_NMS_THRESH: 0.7
25 | MAX_SIZE: 720
26 | USE_FLIPPED: True
27 | LR_DIY_DECAY: True
28 | STEPSIZE: [100000]
29 | WEIGHT_INITIALIZER: normal
30 | DISPLAY: 10
31 | # EXP_DECAY_RATE: 0.5
32 | # EXP_DECAY_STEPS: 500
33 | RESNET:
34 | FIXED_BLOCKS: 1
35 | TEST:
36 | HAS_RPN: True
37 | RPN_NMS_THRESH: 0.6
38 | NMS: 0.5
39 | RPN_POST_NMS_TOP_N: 300
40 | MAX_SIZE: 720
41 |
--------------------------------------------------------------------------------
/scripts/dense_cap_demo.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # Run with:
4 | # bash scripts/dense_cap_demo.sh [ckpt_path] [vocab_path]
5 |
6 | set -x
7 | set -e
8 |
9 | ckpt=$1
10 | vocab=$2
11 |
12 | # For my own experiment usage, just ignore it.
13 | if [ -d '/home/joe' ]; then
14 | ckpt='/home/joe/git/densecap/output/dc_context/vg_1.2_train'
15 | vocab='/home/joe/git/visual_genome/1.2/vocabulary.txt'
16 | fi
17 |
18 | time python ./tools/demo.py \
19 | --ckpt ${ckpt} \
20 | --cfg scripts/dense_cap_config.yml \
21 | --vocab ${vocab} \
22 | --set TEST.USE_BEAM_SEARCH False EMBED_DIM 512 TEST.LN_FACTOR 1. TEST.RPN_NMS_THRESH 0.7 TEST.NMS 0.3
23 |
--------------------------------------------------------------------------------
/scripts/dense_cap_test.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # --------------------------------------------------------
4 | # DenseCap-Tensorflow
5 | # Written by InnerPeace
6 | # This file is adapted from Linjie's work
7 | # --------------------------------------------------------
8 |
9 | # TODO: change the test procedure.
10 | set -x
11 | set -e
12 |
13 | GPU_ID=0
14 | CKPT=$1
15 | TEST_IMDB=$2
16 |
17 |
18 | # For the valohai platform, maybe out of date.
19 | if [ -d '/valohai/outputs' ]; then
20 | CKPT="./output/Densecap_res50_context_all/vg_1.2_train"
21 | fi
22 |
23 | # For my own experiment, just ignore it.
24 | if [ -d '/home/joe' ]; then
25 | CKPT="/home/joe/git/densecap/output/dc_tune_context/vg_1.2_train"
26 | TEST_IMDB="vg_1.2_test"
27 | fi
28 |
29 | LOG="logs/test_log.txt.`date +'%Y-%m-%d_%H-%M-%S'`"
30 | exec &> >(tee -a "$LOG")
31 | echo Logging output to "$LOG"
32 |
33 | time python ./tools/test_net.py \
34 | --ckpt ${CKPT} \
35 | --imdb ${TEST_IMDB} \
36 | --cfg scripts/dense_cap_config.yml \
37 | --set ALL_TEST True
38 |
--------------------------------------------------------------------------------
/scripts/dense_cap_train.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # Run with:
4 | # bash scripts/dense_cap_train.sh [dataset] [net] [ckpt_to_init] [data_dir] [step]
5 |
6 | set -x
7 | set -e
8 |
9 | export PYTHONUNBUFFERED='True'
10 |
11 | DATASET=$1
12 | NET=$2
13 | ckpt_path=$3
14 | data_dir=$4
15 | step=$5
16 |
17 | # For my own experiment usage, just ignore it.
18 | if [ -d '/home/joe' ]; then
19 | DATASET='visual_genome_1.2'
20 | NET='res50'
21 | ckpt_path="experiments/random_fixconv_i85k_171219/dc_fixed_1219/vg_1.2_train"
22 | # ckpt_path="experiments/rd_fixconv_i165k_171221/dc_conv_fixed/vg_1.2_train"
23 | # ckpt_path='/home/joe/git/slim_models/res50.ckpt'
24 | data_dir='/home/joe/git/visual_genome'
25 | fi
26 |
27 | case $DATASET in
28 | visual_genome)
29 | TRAIN_IMDB="vg_1.0_train"
30 | TEST_IMDB="vg_1.0_val"
31 | PT_DIR="dense_cap"
32 | FINETUNE_AFTER1=200000
33 | FINETUNE_AFTER2=100000
34 | ITERS1=400000
35 | ITERS2=300000
36 | ;;
37 | visual_genome_1.2)
38 | TRAIN_IMDB="vg_1.2_train"
39 | TEST_IMDB="vg_1.2_val"
40 | PT_DIR="dense_cap"
41 | FINETUNE_AFTER1=200000
42 | FINETUNE_AFTER2=100000
43 | ITERS1=400000
44 | ITERS2=300000
45 | ;;
46 | *)
47 | echo "No dataset given"
48 | exit
49 | ;;
50 | esac
51 |
52 | # This is for valohai computing platform, one can just ignore it.
53 | if [ -d '/valohai/outputs' ]; then
54 | ckpt_path='/valohai/inputs/resnet'
55 | data_dir='/valohai/inputs/visual_genome'
56 | LOG="/valohai/outputs/s${step}_${NET}_${TRAIN_IMDB}.txt.`date +'%Y-%m-%d_%H-%M-%S'`"
57 | else
58 | LOG="logs/s${step}_${NET}_${TRAIN_IMDB}.txt.`date +'%Y-%m-%d_%H-%M-%S'`"
59 | fi
60 |
61 | exec &> >(tee -a "$LOG")
62 | echo Logging output to "$LOG"
63 |
64 | # First step, freeze conv nets weights
65 | if [ ${step} -lt '2' ]
66 | then
67 | time python ./tools/train_net.py \
68 | --weights ${ckpt_path} \
69 | --imdb ${TRAIN_IMDB} \
70 | --imdbval ${TEST_IMDB} \
71 | --iters ${FINETUNE_AFTER1}\
72 | --cfg scripts/dense_cap_config.yml \
73 | --data_dir ${data_dir} \
74 | --net ${NET} \
75 | --set EXP_DIR dc_conv_fixed CONTEXT_FUSION False RESNET.FIXED_BLOCKS 3
76 | fi
77 |
78 | # Step2: Finetune convnets
79 | NEW_WIGHTS=output/dc_conv_fixed/${TRAIN_IMDB}
80 | if [ ${step} -lt '3' ]
81 | then
82 | time python ./tools/train_net.py \
83 | --weights ${NEW_WIGHTS} \
84 | --imdb ${TRAIN_IMDB} \
85 | --iters `expr ${ITERS1} - ${FINETUNE_AFTER1}` \
86 | --imdbval ${TEST_IMDB} \
87 | --cfg scripts/dense_cap_config.yml \
88 | --data_dir ${data_dir} \
89 | --net ${NET} \
90 | --set EXP_DIR dc_tune_conv CONTEXT_FUSION False RESNET.FIXED_BLOCKS 1 TRAIN.LEARNING_RATE 0.00025
91 | fi
92 |
93 | # Step3: train with context fusion
94 | NEW_WIGHTS=output/dc_tune_conv/${TRAIN_IMDB}
95 | if [ ${step} -lt '4' ]
96 | then
97 | time python ./tools/train_net.py \
98 | --weights ${NEW_WIGHTS} \
99 | --imdb ${TRAIN_IMDB} \
100 | --imdbval ${TEST_IMDB} \
101 | --iters ${FINETUNE_AFTER2} \
102 | --cfg scripts/dense_cap_config.yml \
103 | --data_dir ${data_dir} \
104 | --net ${NET} \
105 | --set EXP_DIR dc_context CONTEXT_FUSION True RESNET.FIXED_BLOCKS 3 TRAIN.LEARNING_RATE 0.000125
106 | fi
107 |
108 | # Step4: finetune context fusion
109 | NEW_WIGHTS=output/dc_context/${TRAIN_IMDB}
110 | if [ ${step} -lt '5' ]
111 | then
112 | time python ./tools/train_net.py \
113 | --weights ${NEW_WIGHTS} \
114 | --imdb ${TRAIN_IMDB} \
115 | --imdbval ${TEST_IMDB} \
116 | --iters `expr ${ITERS2} - ${FINETUNE_AFTER2}` \
117 | --cfg scripts/dense_cap_config.yml \
118 | --data_dir ${data_dir} \
119 | --net ${NET} \
120 | --set EXP_DIR dc_tune_context CONTEXT_FUSION True RESNET.FIXED_BLOCKS 1 TRAIN.LEARNING_RATE 0.0000625
121 | fi
122 |
--------------------------------------------------------------------------------
/scripts/old_dense_cap_train.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # --------------------------------------------------------
4 | # DenseCap-Tensorflow
5 | # Written by InnerPeace
6 | # This file is adapted from Linjie's work
7 | # --------------------------------------------------------
8 | # Script for training dense captioning model with joint inference and visual context
9 | # Do freeze-convnet training first, then finetuning
10 | # Usage:
11 | # ./models/dense_cap/dense_cap_train.sh [GPU_ID] [DATASET] [MODEL_TYPE] [INITIAL_WEIGHTS] [EXTRA_ARGS]
12 | # Example:
13 | # To train a model with joint inference and visual context (late fusion, feature summation) on visual genome 1.0
14 | # TODO: change the example.
15 | # ./models/dense_cap/dense_cap_train.sh 1 visual_genome late_fusion_sum models/vggnet/vgg16.caffemodel
16 | set -x
17 | set -e
18 |
19 | export PYTHONUNBUFFERED="True"
20 |
21 | GPU_ID=$1
22 | DATASET=$2
23 | MODEL_TYPE=$3
24 | WEIGHTS=$4
25 | array=( $@ )
26 | len=${#array[@]}
27 | EXTRA_ARGS=${array[@]:4:$len}
28 | EXTRA_ARGS_SLUG=${EXTRA_ARGS// /_}
29 | case $DATASET in
30 | visual_genome)
31 | TRAIN_IMDB="vg_1.0_train"
32 | TEST_IMDB="vg_1.0_val"
33 | PT_DIR="dense_cap"
34 | FINETUNE_AFTER1=200000
35 | FINETUNE_AFTER2=100000
36 | ITERS1=400000
37 | ITERS2=300000
38 | ;;
39 | visual_genome_1.2)
40 | TRAIN_IMDB="vg_1.2_train"
41 | TEST_IMDB="vg_1.2_val"
42 | PT_DIR="dense_cap"
43 | FINETUNE_AFTER1=200000
44 | FINETUNE_AFTER2=100000
45 | ITERS1=400000
46 | ITERS2=300000
47 | ;;
48 | *)
49 | echo "No dataset given"
50 | exit
51 | ;;
52 | esac
53 | GLOG_logtostderr=1
54 | # If training visual context model, need to start with the context-free counterpart
55 | if [ ${MODEL_TYPE} != "joint_inference" ]
56 | then
57 | # TODO: change the options for training
58 | ./tools/train_net.py --gpu ${GPU_ID} \
59 | --solver models/${PT_DIR}/solver_joint_inference.prototxt \
60 | --weights ${WEIGHTS} \
61 | --imdb ${TRAIN_IMDB} \
62 | --iters ${FINETUNE_AFTER1} \
63 | --cfg models/${PT_DIR}/dense_cap.yml \
64 | ${EXTRA_ARGS}
65 | NEW_WEIGHTS=output/dense_cap/${TRAIN_IMDB}/dense_cap_joint_inference_iter_${FINETUNE_AFTER1}.caffemodel
66 | # Finetuning all weights
67 | ./lib/tools/train_net.py --gpu ${GPU_ID} \
68 | --solver models/${PT_DIR}/solver_joint_inference_finetune.prototxt \
69 | --weights ${NEW_WEIGHTS} \
70 | --imdb ${TRAIN_IMDB} \
71 | --iters `expr ${ITERS1} - ${FINETUNE_AFTER1}` \
72 | --cfg models/${PT_DIR}/dense_cap.yml \
73 | ${EXTRA_ARGS}
74 | NEW_WEIGHTS=output/dense_cap/${TRAIN_IMDB}/dense_cap_joint_inference_finetune_iter_`expr ${ITERS1} - ${FINETUNE_AFTER1}`.caffemodel
75 | # Training with convnet weights fixed
76 | ./lib/tools/train_net.py --gpu ${GPU_ID} \
77 | --solver models/${PT_DIR}/solver_${MODEL_TYPE}.prototxt \
78 | --weights ${NEW_WEIGHTS} \
79 | --imdb ${TRAIN_IMDB} \
80 | --iters ${FINETUNE_AFTER2} \
81 | --cfg models/${PT_DIR}/dense_cap.yml \
82 | ${EXTRA_ARGS}
83 | NEW_WEIGHTS=output/dense_cap/${TRAIN_IMDB}/dense_cap_${MODEL_TYPE}_iter_${FINETUNE_AFTER2}.caffemodel
84 | # Finetuning all weights
85 | ./lib/tools/train_net.py --gpu ${GPU_ID} \
86 | --solver models/${PT_DIR}/solver_${MODEL_TYPE}_finetune.prototxt \
87 | --weights ${NEW_WEIGHTS} \
88 | --imdb ${TRAIN_IMDB} \
89 | --iters `expr ${ITERS2} - ${FINETUNE_AFTER2}` \
90 | --cfg models/${PT_DIR}/dense_cap.yml \
91 | ${EXTRA_ARGS}
92 |
93 | else
94 | # Training with convnet weights fixed
95 | ./lib/tools/train_net.py --gpu ${GPU_ID} \
96 | --solver models/${PT_DIR}/solver_${MODEL_TYPE}.prototxt \
97 | --weights ${WEIGHTS} \
98 | --imdb ${TRAIN_IMDB} \
99 | --iters ${FINETUNE_AFTER1} \
100 | --cfg models/${PT_DIR}/dense_cap.yml \
101 | ${EXTRA_ARGS}
102 | NEW_WEIGHTS=output/dense_cap/${TRAIN_IMDB}/dense_cap_${MODEL_TYPE}_iter_${FINETUNE_AFTER1}.caffemodel
103 | # Finetuning all weights
104 | ./lib/tools/train_net.py --gpu ${GPU_ID} \
105 | --solver models/${PT_DIR}/solver_${MODEL_TYPE}_finetune.prototxt \
106 | --weights ${NEW_WEIGHTS} \
107 | --imdb ${TRAIN_IMDB} \
108 | --iters `expr ${ITERS1} - ${FINETUNE_AFTER1}` \
109 | --cfg models/${PT_DIR}/dense_cap.yml \
110 | ${EXTRA_ARGS}
111 | fi
112 |
--------------------------------------------------------------------------------
/tests/README.md:
--------------------------------------------------------------------------------
1 | ## TEST
2 | Some test files used during development; just ignore them.
3 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/InnerPeace-Wu/densecap-tensorflow/2c77998f73832d5bb7324e97e8a7419cbce8398c/tests/__init__.py
--------------------------------------------------------------------------------
/tests/architecture_test.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # DenseCap-Tensorflow
3 | # Written by InnerPeace
4 | # This file is adapted from Linjie's work
5 | # --------------------------------------------------------
6 | from __future__ import absolute_import
7 | from __future__ import division
8 | from __future__ import print_function
9 |
10 | from lib.config import cfg
11 | import tensorflow as tf
12 | from lib.nets.resnet_v1 import resnetv1
13 | from tests.roidata_test import get_data_test
14 | import six
15 | import numpy as np
16 |
17 |
18 | def architecture_test():
19 | blob = get_data_test()
20 | tf.reset_default_graph()
21 | net = resnetv1(50)
22 | # net._build_network()
23 | net.create_architecture(mode='TEST', tag='pre')
24 |
25 | for n in tf.get_default_graph().as_graph_def().node:
26 | print(n.name)
27 |
28 | tfconfig = tf.ConfigProto(allow_soft_placement=True)
29 | tfconfig.gpu_options.allow_growth = True
30 |
31 | feed_dict = {net._image: blob['data'],
32 | net._im_info: blob['im_info'],
33 | net._gt_boxes: blob['gt_boxes'],
34 | net._gt_phrases: blob['gt_phrases']}
35 | output = net._for_debug
36 | output.update({
37 | "image": net._image,
38 | "im_info": net._im_info,
39 | "gt_boxes": net._gt_boxes,
40 | "gt_phrases": net._gt_phrases
41 | })
42 |
43 | with tf.Session(config=tfconfig) as sess:
44 | init = tf.global_variables_initializer()
45 | sess.run(init)
46 | out = sess.run('DenseCap_ResNet50/Prediction/lstm/cap_init_state:0', feed_dict=feed_dict)
47 | print(out.shape)
48 | # out = sess.run(output, feed_dict=feed_dict)
49 |
50 | # for k, v in six.iteritems(out):
51 | # print("name: {} ==> {}".format(k, v.shape))
52 | # # print("shape: {}".format(v.shape))
53 | # if k == 'labels':
54 | # # print(v)
55 | # # print("first 5 example:")
56 | # print(v[:5])
57 | # if k == 'loss' or k == 'total_loss':
58 | # print(k, v)
59 |
60 |
61 | if __name__ == '__main__':
62 | architecture_test()
63 |
--------------------------------------------------------------------------------
/tests/bash_log_test/bash_log_test.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | set -x
4 | set -e
5 |
6 | export PYTHONUNBUFFERED="True"
7 |
8 | TAG=$1
9 |
10 | LOG="logs/${TAG}.txt.`date +'%Y-%m-%d_%H-%M-%S'`"
11 | exec &> >(tee -a "$LOG")
12 | echo Logging output to "$LOG"
13 |
14 | time python ./nonsense.py
15 |
--------------------------------------------------------------------------------
/tests/bash_log_test/logs/test.txt.2017-10-18_15-33-56:
--------------------------------------------------------------------------------
1 | + echo Logging output to logs/test.txt.2017-10-18_15-33-56
2 | Logging output to logs/test.txt.2017-10-18_15-33-56
3 | + python ./nonsense.py
4 | hello world
5 |
6 | real 0m0.011s
7 | user 0m0.012s
8 | sys 0m0.000s
9 |
--------------------------------------------------------------------------------
/tests/bash_log_test/nonsense.py:
--------------------------------------------------------------------------------
1 | """test file"""
2 |
3 | from __future__ import absolute_import
4 | from __future__ import division
5 | from __future__ import print_function
6 |
7 |
8 | def main():
9 | print("hello world")
10 |
11 |
12 | if __name__ == '__main__':
13 | main()
14 |
--------------------------------------------------------------------------------
/tests/ckpt_restore_test.py:
--------------------------------------------------------------------------------
1 | # ----------------------------------------------
2 | # DenseCap
3 | # Written by InnerPeace
4 | # This file is adapted from Xinlei's work
5 | # ----------------------------------------------
6 | from __future__ import absolute_import
7 | from __future__ import division
8 | from __future__ import print_function
9 |
10 | import tensorflow as tf
11 | from tensorflow.python import pywrap_tensorflow
12 | import tensorflow.contrib.slim as slim
13 |
14 | from tensorflow.contrib.slim import arg_scope
15 | from tensorflow.contrib.slim.python.slim.nets import resnet_utils
16 | from tensorflow.contrib.slim.python.slim.nets import resnet_v1
17 | from tensorflow.contrib.slim.python.slim.nets.resnet_v1 import resnet_v1_block
18 | import numpy as np
19 |
20 | from lib.config import cfg
21 |
22 |
23 | def resnet_arg_scope(is_training=True,
24 | batch_norm_decay=0.997,
25 | batch_norm_epsilon=1e-5,
26 | batch_norm_scale=True):
27 | batch_norm_params = {
28 | 'is_training': False,
29 | 'decay': batch_norm_decay,
30 | 'epsilon': batch_norm_epsilon,
31 | 'scale': batch_norm_scale,
32 | 'trainable': False,
33 | 'updates_collections': tf.GraphKeys.UPDATE_OPS
34 | }
35 |
36 | with arg_scope(
37 | [slim.conv2d],
38 | # weights_regularizer=slim.l2_regularizer(cfg.TRAIN.WEIGHT_DECAY),
39 | weights_regularizer=None,
40 | weights_initializer=slim.variance_scaling_initializer(),
41 | trainable=is_training,
42 | activation_fn=tf.nn.relu,
43 | normalizer_fn=slim.batch_norm,
44 | normalizer_params=batch_norm_params):
45 | with arg_scope([slim.batch_norm], **batch_norm_params) as arg_sc:
46 | return arg_sc
47 |
48 |
49 | class resnetv1():
50 | def __init__(self, num_layers=50):
51 | # Network.__init__(self)
52 | self._feat_stride = [16, ]
53 | self._feat_compress = [1. / float(self._feat_stride[0]), ]
54 | self._num_layers = num_layers
55 | self._scope = 'resnet_v1_%d' % num_layers
56 | self._decide_blocks()
57 |
58 | # Do the first few layers manually, because 'SAME' padding can behave inconsistently
59 | # for images of different sizes: sometimes 0, sometimes 1
60 | def _build_base(self):
61 | with tf.variable_scope(self._scope, self._scope):
62 | net = resnet_utils.conv2d_same(self._image, 64, 7, stride=2, scope='conv1')
63 | net = tf.pad(net, [[0, 0], [1, 1], [1, 1], [0, 0]])
64 | net = slim.max_pool2d(net, [3, 3], stride=2, padding='VALID', scope='pool1')
65 |
66 | return net
67 |
68 | def _image_to_head(self, is_training, reuse=None):
69 | assert (0 <= cfg.RESNET.FIXED_BLOCKS <= 3)
70 | # Now the base is always fixed during training
71 | with slim.arg_scope(resnet_arg_scope(is_training=False)):
72 | net_conv = self._build_base()
73 | if cfg.RESNET.FIXED_BLOCKS > 0:
74 | with slim.arg_scope(resnet_arg_scope(is_training=False)):
75 | net_conv, _ = resnet_v1.resnet_v1(net_conv,
76 | self._blocks[0:cfg.RESNET.FIXED_BLOCKS],
77 | global_pool=False,
78 | include_root_block=False,
79 | reuse=reuse,
80 | scope=self._scope)
81 | if cfg.RESNET.FIXED_BLOCKS < 3:
82 | with slim.arg_scope(resnet_arg_scope(is_training=is_training)):
83 | net_conv, _ = resnet_v1.resnet_v1(net_conv,
84 | self._blocks[cfg.RESNET.FIXED_BLOCKS:-1],
85 | global_pool=False,
86 | include_root_block=False,
87 | reuse=reuse,
88 | scope=self._scope)
89 |
90 | self._act_summaries.append(net_conv)
91 | self._layers['head'] = net_conv
92 |
93 | return net_conv
94 |
95 | def _decide_blocks(self):
96 | # choose different blocks for different number of layers
97 | if self._num_layers == 50:
98 | self._blocks = [resnet_v1_block('block1', base_depth=64, num_units=3, stride=2),
99 | resnet_v1_block('block2', base_depth=128, num_units=4, stride=2),
100 | # use stride 1 for the last conv4 layer
101 | resnet_v1_block('block3', base_depth=256, num_units=6, stride=1),
102 | resnet_v1_block('block4', base_depth=512, num_units=3, stride=1)]
103 |
104 | elif self._num_layers == 101:
105 | self._blocks = [resnet_v1_block('block1', base_depth=64, num_units=3, stride=2),
106 | resnet_v1_block('block2', base_depth=128, num_units=4, stride=2),
107 | # use stride 1 for the last conv4 layer
108 | resnet_v1_block('block3', base_depth=256, num_units=23, stride=1),
109 | resnet_v1_block('block4', base_depth=512, num_units=3, stride=1)]
110 |
111 | elif self._num_layers == 152:
112 | self._blocks = [resnet_v1_block('block1', base_depth=64, num_units=3, stride=2),
113 | resnet_v1_block('block2', base_depth=128, num_units=8, stride=2),
114 | # use stride 1 for the last conv4 layer
115 | resnet_v1_block('block3', base_depth=256, num_units=36, stride=1),
116 | resnet_v1_block('block4', base_depth=512, num_units=3, stride=1)]
117 |
118 | else:
119 | # other numbers are not supported
120 | raise NotImplementedError
121 |
122 | def get_variables_to_restore(self, variables, var_keep_dic):
123 | variables_to_restore = []
124 |
125 | for v in variables:
126 | # exclude the first conv layer to swap RGB to BGR
127 | if v.name == (self._scope + '/conv1/weights:0'):
128 | self._variables_to_fix[v.name] = v
129 | continue
130 | if v.name.split(':')[0] in var_keep_dic:
131 | print('Variables restored: %s' % v.name)
132 | variables_to_restore.append(v)
133 |
134 | return variables_to_restore
135 |
136 | def fix_variables(self, sess, pretrained_model):
137 | print('Fix Resnet V1 layers..')
138 | with tf.variable_scope('Fix_Resnet_V1') as scope:
139 | with tf.device("/cpu:0"):
140 | # fix RGB to BGR
141 | conv1_rgb = tf.get_variable("conv1_rgb", [7, 7, 3, 64], trainable=False)
142 | restorer_fc = tf.train.Saver({self._scope + "/conv1/weights": conv1_rgb})
143 | restorer_fc.restore(sess, pretrained_model)
144 |
145 | sess.run(tf.assign(self._variables_to_fix[self._scope + '/conv1/weights:0'],
146 | tf.reverse(conv1_rgb, [2])))
147 |
148 |
149 | def get_variables_in_checkpoint_file(file_name):
150 | try:
151 | reader = pywrap_tensorflow.NewCheckpointReader(file_name)
152 | var_to_shape_map = reader.get_variable_to_shape_map()
153 | return var_to_shape_map
154 | except Exception as e: # pylint: disable=broad-except
155 | print(str(e))
156 | if "corrupted compressed block contents" in str(e):
157 | print("It's likely that your checkpoint file has been compressed "
158 | "with SNAPPY.")
159 |
160 |
161 | def main():
162 | ckpt_path = '/home/joe/git/slim_models/resnet_v1_50.ckpt'
163 | var_keep_dic = get_variables_in_checkpoint_file(ckpt_path)
164 | for key in var_keep_dic:
165 | print("tensor_name: ", key)
166 |
167 |
168 | if __name__ == '__main__':
169 | main()
170 |
171 |
--------------------------------------------------------------------------------
/tests/dencap_oa_test.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # This script is used for my own experiments, just ignore it.
4 | # Run with:
5 | # bash tests/dencap_oa_test.sh [step]
6 |
7 | set -x
8 | set -e
9 |
10 | export PYTHONUNBUFFERED='True'
11 |
12 | DATASET='visual_genome_1.2'
13 | NET='res50'
14 | ckpt_path='/home/joe/git/slim_models'
15 | data_dir='/home/joe/git/visual_genome'
16 | step=$1
17 |
18 | case $DATASET in
19 | visual_genome)
20 | TRAIN_IMDB="vg_1.0_train"
21 | TEST_IMDB="vg_1.0_val"
22 | PT_DIR="dense_cap"
23 | FINETUNE_AFTER1=200000
24 | FINETUNE_AFTER2=100000
25 | ITERS1=400000
26 | ITERS2=300000
27 | ;;
28 | visual_genome_1.2)
29 | TRAIN_IMDB="vg_1.2_train"
30 | TEST_IMDB="vg_1.2_val"
31 | PT_DIR="dense_cap"
32 | FINETUNE_AFTER1=200000
33 | FINETUNE_AFTER2=100000
34 | ITERS1=400000
35 | ITERS2=300000
36 | ;;
37 | *)
38 | echo "No dataset given"
39 | exit
40 | ;;
41 | esac
42 |
43 | if [ -d '/valohai/outputs' ]; then
44 | ckpt_path='/valohai/inputs/resnet'
45 | data_dir='/valohai/inputs/visual_genome'
46 | LOG="/valohai/outputs/s${step}_${NET}_${TRAIN_IMDB}.txt.`date +'%Y-%m-%d_%H-%M-%S'`"
47 | else
48 | LOG="logs/s${step}_${NET}_${TRAIN_IMDB}.txt.`date +'%Y-%m-%d_%H-%M-%S'`"
49 | fi
50 |
51 | exec &> >(tee -a "$LOG")
52 | echo Logging output to "$LOG"
53 |
54 | FIRST_ITERS=80000
55 | if [ ${step} -lt '2' ]
56 | then
57 | time python ./tools/train_net.py \
58 | --weights ${ckpt_path}/${NET}.ckpt \
59 | --imdb ${TRAIN_IMDB} \
60 | --imdbval ${TEST_IMDB} \
61 | --iters 50000 \
62 | --cfg scripts/dense_cap_config.yml \
63 | --data_dir ${data_dir} \
64 | --net ${NET} \
65 | --set TRAIN_GLOVE False EXP_DIR dc_fixed CONTEXT_FUSION False RESNET.FIXED_BLOCKS 3 KEEP_AS_GLOVE_DIM False LOSS.CLS_W 1. LOSS.BBOX_W 0.2 LOSS.RPN_BBOX_W 1. LOSS.RPN_CLS_W 0.5
66 | # --set EXP_DIR dc_fixed CONTEXT_FUSION False RESNET.FIXED_BLOCKS 3
67 |
68 | # mkdir output/dc_fixed
69 | # cp -r output/Densecap/ output/dc_dc_fixed
70 | fi
71 |
72 | NEW_WEIGHTS=output/dc_fixed/${TRAIN_IMDB}
73 | if [ ${step} -lt '3' ]
74 | then
75 | time python ./tools/train_net.py \
76 |     --weights ${NEW_WEIGHTS} \
77 | --imdb ${TRAIN_IMDB} \
78 | --iters 30000 \
79 | --imdbval ${TEST_IMDB} \
80 | --cfg scripts/dense_cap_config.yml \
81 | --data_dir ${data_dir} \
82 | --net ${NET} \
83 | --set TRAIN_GLOVE True EXP_DIR dc_tune_vec CONTEXT_FUSION False RESNET.FIXED_BLOCKS 3 KEEP_AS_GLOVE_DIM False
84 | # TRAIN.LEARNING_RATE 0.0005
85 | # --iters `expr ${FINETUNE_AFTER1} - ${FIRST_ITERS}` \
86 |
87 | # mkdir output/dc_tune_vec
88 | # cp -r output/Densecap/ output/dc_tune_vec
89 | fi
90 |
91 | #NEW_WEIGHTS=output/dc_tune_vec/${TRAIN_IMDB}
92 | if [ ${step} -lt '4' ]
93 | then
94 | time python ./tools/train_net.py \
95 |     --weights ${NEW_WEIGHTS} \
96 | --imdb ${TRAIN_IMDB} \
97 | --imdbval ${TEST_IMDB} \
98 | --iters `expr ${ITERS1} - ${FINETUNE_AFTER1}` \
99 | --cfg scripts/dense_cap_config.yml \
100 | --data_dir ${data_dir} \
101 | --net ${NET} \
102 | --set EXP_DIR dc_tune_conv CONTEXT_FUSION False RESNET.FIXED_BLOCKS 1
103 |
104 | # mkdir output/dc_tune_conv
105 | # cp -r output/Densecap/ output/dc_tune_conv
106 | fi
107 |
108 | NEW_WEIGHTS=output/dc_tune_conv/${TRAIN_IMDB}
109 | if [ ${step} -lt '5' ]
110 | then
111 | time python ./tools/train_net.py \
112 |     --weights ${NEW_WEIGHTS} \
113 | --imdb ${TRAIN_IMDB} \
114 | --imdbval ${TEST_IMDB} \
115 | --iters ${FINETUNE_AFTER2} \
116 | --cfg scripts/dense_cap_config.yml \
117 | --data_dir ${data_dir} \
118 | --net ${NET} \
119 | --set TRAIN_GLOVE True EXP_DIR dc_context CONTEXT_FUSION True RESNET.FIXED_BLOCKS 3
120 | # mkdir output/dc_context
121 | # cp -r output/Densecap/ output/dc_context
122 | # --iters `expr ${FINETUNE_AFTER1} - ${FIRST_ITERS}`
123 | fi
124 |
125 | NEW_WEIGHTS=output/dc_context/${TRAIN_IMDB}
126 | if [ ${step} -lt '6' ]
127 | then
128 | time python ./tools/train_net.py \
129 |     --weights ${NEW_WEIGHTS} \
130 | --imdb ${TRAIN_IMDB} \
131 | --imdbval ${TEST_IMDB} \
132 | --iters `expr ${ITERS2} - ${FINETUNE_AFTER2}` \
133 | --cfg scripts/dense_cap_config.yml \
134 | --data_dir ${data_dir} \
135 | --net ${NET} \
136 | --set TRAIN_GLOVE True EXP_DIR dc_tune_context CONTEXT_FUSION True RESNET.FIXED_BLOCKS 1
137 | fi
138 |
--------------------------------------------------------------------------------
/tests/logs/architecture_test.txt:
--------------------------------------------------------------------------------
1 | /home/joe/.tf_env2/bin/python /home/joe/git/densecap/tests/architecture_test.py
2 | data_path: /home/joe/git/visual_genome_test/1.2
3 | pre gt roidb could be loaded from /home/joe/git/visual_genome_test/1.2_cache/pre_gt_roidb
4 | LIMIT_RAM version and load index from /home/joe/git/visual_genome_test/1.2_cache/pre_gt_roidb/image_index.json
5 |
6 | /home/joe/git/densecap/lib/fast_rcnn/bbox_transform.py:50: RuntimeWarning: overflow encountered in exp
7 | pred_w = np.exp(dw) * widths[:, np.newaxis]
8 | /home/joe/git/densecap/lib/fast_rcnn/bbox_transform.py:50: RuntimeWarning: overflow encountered in multiply
9 | pred_w = np.exp(dw) * widths[:, np.newaxis]
10 | /home/joe/git/densecap/lib/fast_rcnn/bbox_transform.py:51: RuntimeWarning: overflow encountered in exp
11 | pred_h = np.exp(dh) * heights[:, np.newaxis]
12 | /home/joe/git/densecap/lib/fast_rcnn/bbox_transform.py:51: RuntimeWarning: overflow encountered in multiply
13 | pred_h = np.exp(dh) * heights[:, np.newaxis]
14 |
15 | length of labels, i.e. number of regions: 256
16 | sentence data layer input (first 3)
17 | 2239.0 [ 4 87 6 5 85 87 0 0 0 0]
18 | 2239.0 [ 4 87 6 5 85 87 0 0 0 0]
19 | 2239.0 [ 4 87 6 5 85 87 0 0 0 0]
20 | sentence data layer output (first 3)
21 | input sentence
22 | [[ 1. 4. 87. 6. 5. 85. 87. 0. 0. 0. 0.]
23 | [ 1. 4. 87. 6. 5. 85. 87. 0. 0. 0. 0.]
24 | [ 1. 4. 87. 6. 5. 85. 87. 0. 0. 0. 0.]]
25 | target sentence
26 | [[ 1. 4. 87. 6. 5. 85. 87. 2. 0. 0. 0. 0.]
27 | [ 1. 4. 87. 6. 5. 85. 87. 2. 0. 0. 0. 0.]
28 | [ 1. 4. 87. 6. 5. 85. 87. 2. 0. 0. 0. 0.]]
29 | cont sentence
30 | [[ 0. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0.]
31 | [ 0. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0.]
32 | [ 0. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0.]]
33 | cont bbox
34 | [[ 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
35 | [ 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
36 | [ 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]]
37 |
38 | name: fc7 ==> (256, 2048)
39 | name: image ==> (1, 540, 720, 3)
40 | name: labels ==> (256,)
41 | [3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682
42 | 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682
43 | 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682
44 | 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682
45 | 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682
46 | 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682
47 | 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682
48 | 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682
49 | 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682
50 | 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682
51 | 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682
52 | 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682
53 | 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682
54 | 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682
55 | 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682
56 | 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682
57 | 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682
58 | 3682]
59 | name: bbox_inside_weights ==> (256, 4)
60 | name: bbox_targets ==> (256, 4)
61 | name: input_sentence ==> (256, 11)
62 | name: rpn ==> (1, 34, 45, 512)
63 | name: rpn_labels ==> (1, 1, 408, 45)
64 | name: cont_bbox ==> (256, 12)
65 | name: bbox_outside_weights ==> (256, 4)
66 | name: target_sentence ==> (256, 12)
67 | name: rpn_bbox_outside_weights ==> (1, 34, 45, 48)
68 | name: pool5 ==> (256, 7, 7, 1024)
69 | name: rpn_bbox_inside_weights ==> (1, 34, 45, 48)
70 | name: proposal_rois ==> (9, 5)
71 | name: head ==> (1, 34, 45, 1024)
72 | name: clss ==> (256,)
73 | name: rpn_cls_score_reshape ==> (1, 408, 45, 2)
74 | name: anchors ==> (18360, 4)
75 | name: cont_sentence ==> (256, 12)
76 | name: cls_prob ==> (256, 2)
77 | name: gt_boxes ==> (262, 5)
78 | name: rpn_bbox_pred ==> (1, 34, 45, 48)
79 | name: rpn_cls_score ==> (1, 34, 45, 24)
80 | name: im_info ==> (3,)
81 | name: phrases ==> (256, 10)
82 | name: rpn_cls_prob ==> (1, 34, 45, 24)
83 | name: gt_phrases ==> (262, 10)
84 | name: rois ==> (256, 5)
85 | name: proposal_rpn_scores ==> (9, 1)
86 | name: rpn_cls_prob_reshape ==> (1, 408, 45, 2)
87 | name: rpn_bbox_targets ==> (1, 34, 45, 48)
88 |
89 |
90 |
--------------------------------------------------------------------------------
/tests/logs/preprocessing.txt:
--------------------------------------------------------------------------------
1 | split image number: 77398 for split name: train
2 | start loading image meta data json files...
3 | 0.316329 seconds for loading
4 | train: 100%|███████████████████████████| 108077/108077 [03:05<00:00, 581.84it/s]
5 | processing train set with time: 185.75 seconds
6 | there are 272 invalid bboxes out of 3684063
7 | there are 3 empty phrases after triming
8 | Found 56945 unique word tokens.
9 | Using vocabulary size 10000.
10 | The least frequent word in our vocabulary is 'ruff' and appeared 14 times.
11 | Dumping vocabulary to file: /home/joe/git/visual_genome/1.2/vocabulary.txt
12 | Done.
13 | split image number: 5000 for split name: val
14 | start loading image meta data json files...
15 | 0.273385 seconds for loading
16 | val: 100%|████████████████████████████| 108077/108077 [00:20<00:00, 5401.88it/s]
17 | processing val set with time: 20.01 seconds
18 | there are 14 invalid bboxes out of 237362
19 | there are 0 empty phrases after triming
20 | split image number: 5000 for split name: test
21 | start loading image meta data json files...
22 | 0.273840 seconds for loading
23 | test: 100%|███████████████████████████| 108077/108077 [00:20<00:00, 5225.84it/s]
24 | processing test set with time: 20.68 seconds
25 | there are 17 invalid bboxes out of 238069
26 | there are 0 empty phrases after triming
--------------------------------------------------------------------------------
/tests/logs/sentence_data_layer_test.txt:
--------------------------------------------------------------------------------
1 | data_path: /home/joe/git/visual_genome_test/1.2
2 | Appending horizontally-flipped training examples...
3 | pre gt roidb loaded from /home/joe/git/visual_genome_test/1.2/pre_gt_roidb.pkl
4 | done
5 | Preparing training data...
6 | done
7 | Filtered 0 roidb entries: 4 -> 4
8 | length of labels, i.e. number of regions: 262
9 | sentence data layer input (first 3)
10 | 1382.0 [ 4 33 6 25 20 144 0 0 0 0]
11 | 1383.0 [167 6 30 4 11 0 0 0 0 0]
12 | 1384.0 [ 7 6 21 72 0 0 0 0 0 0]
13 | sentence data layer output (first 3)
14 | input sentence
15 | [[ 1. 4. 33. 6. 25. 20. 144. 0. 0. 0. 0.]
16 | [ 1. 167. 6. 30. 4. 11. 0. 0. 0. 0. 0.]
17 | [ 1. 7. 6. 21. 72. 0. 0. 0. 0. 0. 0.]]
18 | target sentence
19 | [[ 1. 4. 33. 6. 25. 20. 144. 2. 0. 0. 0. 0.]
20 | [ 1. 167. 6. 30. 4. 11. 2. 0. 0. 0. 0. 0.]
21 | [ 1. 7. 6. 21. 72. 2. 0. 0. 0. 0. 0. 0.]]
22 | cont sentence
23 | [[ 0. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0.]
24 | [ 0. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0.]
25 | [ 0. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0.]]
26 | cont bbox
27 | [[ 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
28 | [ 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
29 | [ 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]]
--------------------------------------------------------------------------------
/tests/pickle_read_test.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # DenseCap-Tensorflow
3 | # Written by InnerPeace
4 | # This file is adapted from Ross Girshick's work
5 | # --------------------------------------------------------
6 | from __future__ import absolute_import
7 | from __future__ import division
8 | from __future__ import print_function
9 |
10 | from os.path import join as pjoin
11 | from six.moves import cPickle
12 |
13 | def pickle_test():
14 | DEFAULT_PATH = '/home/joe/git/visual_genome_test'
15 | cache = pjoin(DEFAULT_PATH, '1.2_cache/pre_gt_roidb', '1.pkl')
16 | cache_flip = pjoin(DEFAULT_PATH, '1.2_cache/pre_gt_roidb', '1_flip.pkl')
17 | ori = pjoin(DEFAULT_PATH, '1.2', 'pre_gt_roidb.pkl')
18 | phra = pjoin(DEFAULT_PATH, '1.2', 'pre_gt_phrases.pkl')
19 | with open(cache, 'rb') as fc:
20 | data_cache = cPickle.load(fc)
21 | with open(cache_flip, 'rb') as f:
22 | data_flip = cPickle.load(f)
23 | with open(ori, 'rb') as fo:
24 | data_ori = cPickle.load(fo)
25 | with open(phra, 'rb') as fp:
26 | data_phra = cPickle.load(fp)
27 | # from IPython import embed;
28 | # embed()
29 |
30 |     print(data_cache)
31 |     print('flip------------------')
32 |     print(data_flip)
33 |     print('ori------------------')
34 |     print(data_ori)
35 |     print("data ori length:", len(data_ori))
36 |     print('phrase------------------')
37 |     print(data_phra)
38 |     # print(data_phra[2239])
39 |
40 |
41 | if __name__ == '__main__':
42 | pickle_test()
43 |
--------------------------------------------------------------------------------
/tests/read_regions_json/ijson_example.txt:
--------------------------------------------------------------------------------
1 | ('', u'start_array', None)
2 | ('item', u'start_map', None)
3 | ('item', u'map_key', u'regions')
4 | (u'item.regions', u'start_array', None)
5 | (u'item.regions.item', u'start_map', None)
6 | (u'item.regions.item', u'map_key', u'region_id')
7 | (u'item.regions.item.region_id', u'number', 1382)
8 | (u'item.regions.item', u'map_key', u'width')
9 | (u'item.regions.item.width', u'number', 82)
10 | (u'item.regions.item', u'map_key', u'height')
11 | (u'item.regions.item.height', u'number', 139)
12 | (u'item.regions.item', u'map_key', u'image_id')
13 | (u'item.regions.item.image_id', u'number', 1)
14 | (u'item.regions.item', u'map_key', u'phrase')
15 | (u'item.regions.item.phrase', u'string', u'the clock is green in colour')
16 | (u'item.regions.item', u'map_key', u'y')
17 | (u'item.regions.item.y', u'number', 57)
18 | (u'item.regions.item', u'map_key', u'x')
19 | (u'item.regions.item.x', u'number', 421)
20 | (u'item.regions.item', u'end_map', None)
21 |
--------------------------------------------------------------------------------
/tests/read_regions_json/read_regions_test.py:
--------------------------------------------------------------------------------
1 | # ----------------------------------------------
2 | # DenseCap
3 | # Written by InnerPeace
4 | # ----------------------------------------------
5 |
6 | """read large region description json files"""
7 |
8 | import ijson
9 | import json
10 | # import tqdm
11 |
12 | def read_regions():
13 | VG_VERSION = '1.2'
14 | VG_PATH = '/home/joe/git/VG_raw_data'
15 | VG_REGION_PATH = '%s/%s/region_descriptions.json' % (VG_PATH, VG_VERSION)
16 | # parser = ijson.parse(open('test_region.json'))
17 | parser = ijson.parse(open(VG_REGION_PATH))
18 |
19 | last_value = None
20 | Dic = {}
21 | regions = []
22 | dic = {}
23 | for prefix, event, value in parser:
24 | if value == 'regions':
25 | Dic = {}
26 | regions = []
27 | last_value = None
28 | elif last_value == 'id':
29 | Dic['regions'] = regions
30 | Dic['id'] = value
31 | with open('test_id_%s.json' % value, 'w') as f:
32 | json.dump(Dic, f)
33 | break
34 | elif event == 'map_key':
35 | last_value = value
36 | elif event == 'end_map':
37 | regions.append(dic)
38 | dic = {}
39 | last_value = None
40 | elif last_value:
41 | dic[last_value] = value
42 |
43 |
44 | def equal_test():
45 | new = json.load(open('true_id_1_out.json'))
46 | old = json.load(open('true_id_1.json'))
47 | if old == new:
48 | print('success!')
49 | else:
50 | print('ERROR!')
51 |
52 | '''OUT: success!'''
53 |
54 |
55 | def json_line_read():
56 |     '''This does not work.'''
57 |
58 | with open('true_id_1.json', 'r') as f:
59 | for line in f:
60 | print(line)
61 |
62 |
63 | def read_time_test():
64 |     path = '/home/joe/git/visual_genome_test/1.2/pre_gt_regions/1.json'
65 |     import time
66 |     tic = time.time()
67 |     with open(path, 'r') as f:
68 |         data = json.load(f)
69 |     toc = time.time()
70 |     print('read time: %s seconds' % (toc - tic))
71 |
72 | def read_all_regions_test():
73 |     '''Warning: loading every region file at once can exhaust memory.'''
74 | from tqdm import tqdm
75 | path = '/home/joe/git/visual_genome/1.2/train_gt_regions/'
76 | split_path = '/home/joe/git/densecap/info/densecap_splits.json'
77 | with open(split_path, 'r') as fid:
78 | img_index = json.load(fid)['train']
79 | all_regions = {}
80 | for i in tqdm(xrange(len(img_index)), desc='train set'):
81 | idx = img_index[i]
82 | with open(path+'%s.json'%idx, 'r') as f:
83 | all_regions["%s"%idx] = json.load(f)
84 |
85 | if __name__ == '__main__':
86 | # read_regions()
87 | # equal_test()
88 | # json_line_read()
89 | # read_time_test()
90 | read_all_regions_test()
91 |
--------------------------------------------------------------------------------
/tests/read_regions_json/test_region.json:
--------------------------------------------------------------------------------
1 | {"regions":[{"region_id": 4091, "width": 396, "height": 293, "image_id": 1, "phrase": "tall buildings with many windows", "y": 6, "x": 396}, {"region_id": 4090, "width": 709, "height": 281, "image_id": 1, "phrase": "brick sidewalk", "y": 315, "x": 81}], "id": 1}
--------------------------------------------------------------------------------
/tests/read_regions_json/test_region_out.json:
--------------------------------------------------------------------------------
1 | {"regions": [{"region_id": 4091, "image_id": 1, "height": 293, "width": 396, "x": 396, "y": 6, "phrase": "tall buildings with many windows"}, {"region_id": 4090, "image_id": 1, "height": 281, "width": 709, "x": 81, "y": 315, "phrase": "brick sidewalk"}], "id": 1}
--------------------------------------------------------------------------------
/tests/roidata_test.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # DenseCap-Tensorflow
3 | # Written by InnerPeace
4 | # --------------------------------------------------------
5 | from __future__ import absolute_import
6 | from __future__ import division
7 | from __future__ import print_function
8 |
9 | from lib.fast_rcnn.layer import RoIDataLayer
10 | from lib.config import cfg
11 | from lib.datasets.visual_genome import visual_genome
12 | import lib.fast_rcnn.roidb as rdl_roidb
13 | import cv2
14 | import numpy as np
15 | from six.moves import xrange
16 |
17 | # cfg.LIMIT_RAM = False
18 | DEFAULT_PATH = '/home/joe/git/visual_genome_test/1.2'
19 |
20 |
21 | # def roidata_test(roidb, num_classes=2):
22 | # data = RoIDataLayer(roidb, num_classes=num_classes)
23 |
24 | def get_training_roidb(imdb):
25 | """Returns a roidb (Region of Interest database) for use in training."""
26 | if cfg.TRAIN.USE_FLIPPED and not cfg.LIMIT_RAM:
27 | print('Appending horizontally-flipped training examples...')
28 | imdb.append_flipped_images()
29 | print('done')
30 |
31 | print('Preparing training data...')
32 | rdl_roidb.prepare_roidb(imdb)
33 | print('done')
34 |
35 | return imdb.roidb
36 |
37 |
38 | def filter_roidb(roidb):
39 | """Remove roidb entries that have no usable RoIs."""
40 |
41 | def is_valid(entry):
42 | # Valid images have:
43 | # (1) At least one foreground RoI OR
44 | # (2) At least one background RoI
45 | overlaps = entry['max_overlaps']
46 | # find boxes with sufficient overlap
47 | fg_inds = np.where(overlaps >= cfg.TRAIN.FG_THRESH)[0]
48 | # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI)
49 | bg_inds = np.where((overlaps < cfg.TRAIN.BG_THRESH_HI) &
50 | (overlaps >= cfg.TRAIN.BG_THRESH_LO))[0]
51 | # image is only valid if such boxes exist
52 | valid = len(fg_inds) > 0 or len(bg_inds) > 0
53 | return valid
54 |
55 | num = len(roidb)
56 | filtered_roidb = [entry for entry in roidb if is_valid(entry)]
57 | num_after = len(filtered_roidb)
58 | print('Filtered {} roidb entries: {} -> {}'.format(num - num_after,
59 | num, num_after))
60 | return filtered_roidb
61 |
62 |
63 | def vis_regions(im, regions, phrases=None, path='/home/joe/git/VG_raw_data/images_test'):
64 | vocab_path = '%s/vocabulary.txt' % DEFAULT_PATH
65 | with open(vocab_path, 'r') as f:
66 | vocab = [line.strip() for line in f]
67 |
68 | mean_values = np.array([[[102.9801, 115.9465, 122.7717]]])
69 | im = im + mean_values # offset to original values
70 |
71 | for i in xrange(len(regions)):
72 | if i > 9:
73 |             print('save 10 examples and break out.')
74 | break
75 | bbox = regions[i, :4]
76 | region_id = regions[i, 4]
77 |         # vocabulary indices 0, 1 and 2 are reserved for special tokens
78 |         caption = ' '.join([vocab[j - 3] if j - 3 >= 0 else "" for j in phrases[i]])
79 | im_new = np.copy(im)
80 | cv2.rectangle(im_new, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (0, 0, 255), 2)
81 | cv2.imwrite('%s/%s.jpg' % (path, caption), im_new)
82 |
83 | def get_data_test():
84 | imdb = visual_genome('pre', '1.2')
85 | if cfg.LIMIT_RAM:
86 | roidb = imdb.roidb
87 | else:
88 | roidb = get_training_roidb(imdb)
89 | roidb = filter_roidb(roidb)
90 | rdata = RoIDataLayer(roidb)
91 | data = rdata.forward()
92 |
93 | return data
94 |
95 |
96 | if __name__ == '__main__':
97 | imdb = visual_genome('pre', '1.2')
98 | if cfg.LIMIT_RAM:
99 | roidb = imdb.roidb
100 | else:
101 | roidb = get_training_roidb(imdb)
102 | roidb = filter_roidb(roidb)
103 | rdata = RoIDataLayer(roidb)
104 | data = rdata.forward()
105 | # data = rdata.forward()
106 | print(data)
107 | regions = data['gt_boxes']
108 | im = data['data'][0]
109 | phrases = data['gt_phrases']
110 | vis_regions(im, regions, phrases=phrases)
111 |
112 | # from IPython import embed;
113 | #
114 | # embed()
115 |
--------------------------------------------------------------------------------
/tests/sentence_data_layer_test.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # DenseCap-Tensorflow
3 | # Written by InnerPeace
4 | # This file is adapted from Linjie's work
5 | # --------------------------------------------------------
6 | from __future__ import absolute_import
7 | from __future__ import division
8 | from __future__ import print_function
9 |
10 | from lib.config import cfg
11 | from lib.layers.sentence_data_layer import sentence_data_layer
12 | from tests.roidata_test import get_data_test
13 | import numpy as np
14 |
15 |
16 | def sentence_data_layer_test():
17 | data = get_data_test()
18 | phrases = data['gt_phrases']
19 |
20 | labels = data['gt_boxes'][:3, 4]
21 | sentence_data_layer(labels, phrases)
22 |
23 |
24 | if __name__ == '__main__':
25 |
26 | sentence_data_layer_test()
27 |
--------------------------------------------------------------------------------
/tests/vh_train_command.sh:
--------------------------------------------------------------------------------
1 | # prepare data
2 | pip install opencv-python
3 | apt-get -y update && apt-get install -y libsm6 libxext6
4 | pip install --upgrade pip
5 | pip install -r requirements.txt
6 | cd /valohai/inputs
7 | tar -xvzf ./vg_data/visual_genome.tar.gz
8 | mv ./valohai/inputs/visual_genome/ ./
9 | mkdir ./images
10 | unzip image_1/images.zip -d ./images
11 | unzip image_2/images2.zip -d ./images
12 | ls
13 | cd /valohai/repository
14 | cd lib
15 | make
16 | cd ..
17 | bash ./tests/dencap_oa_test.sh {parameters}
18 | tar -czvf /valohai/outputs/output.tar.gz ./output
19 |
--------------------------------------------------------------------------------
/tools/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/InnerPeace-Wu/densecap-tensorflow/2c77998f73832d5bb7324e97e8a7419cbce8398c/tools/__init__.py
--------------------------------------------------------------------------------
/tools/_init_paths.py:
--------------------------------------------------------------------------------
1 | import os.path as osp
2 | import sys
3 |
4 |
5 | def add_path(path):
6 | if path not in sys.path:
7 | sys.path.insert(0, path)
8 |
9 |
10 | this_dir = osp.dirname(__file__)
11 | lib_path = osp.join(this_dir, '..')
12 | add_path(lib_path)
13 |
--------------------------------------------------------------------------------
/tools/demo.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # DenseCap-Tensorflow
3 | # Written by InnerPeace
4 | # This file is adapted from Linjie's work
5 | # --------------------------------------------------------
6 | # Demo a dense captioning model
7 | # Code adapted from faster R-CNN project
8 | # --------------------------------------------------------
9 | # Fast R-CNN
10 | # Copyright (c) 2015 Microsoft
11 | # Licensed under The MIT License [see LICENSE for details]
12 | # Written by Ross Girshick
13 | # --------------------------------------------------------
14 | from __future__ import absolute_import
15 | from __future__ import division
16 | from __future__ import print_function
17 |
18 | """Demo a dense captioning model"""
19 |
20 | import _init_paths
21 | from os.path import join as pjoin
22 | import sys
23 | import six
24 | import glob
25 | import argparse
26 | import json
27 | import numpy as np
28 | import tensorflow as tf
29 |
30 | from lib.config import cfg, cfg_from_file, cfg_from_list, get_output_dir, get_output_tb_dir
31 | from lib.datasets.factory import get_imdb
32 | import lib.datasets.imdb
33 | from lib.dense_cap.train import get_training_roidb, train_net
34 | from lib.dense_cap.test import test_im
35 | from lib.nets.vgg16 import vgg16
36 | from lib.nets.resnet_v1 import resnetv1
37 | import pprint
38 |
39 |
40 | def parse_args():
41 | """
42 | Parse input arguments
43 | """
44 | parser = argparse.ArgumentParser(description='Test a Dense Caption network')
45 |
46 | parser.add_argument('--ckpt', dest='ckpt',
47 | help='initialize with pretrained model weights',
48 | default=None, type=str)
49 | parser.add_argument('--cfg', dest='cfg_file',
50 | help='optional config file',
51 | default=None, type=str)
52 | # TODO: add inception
53 | parser.add_argument('--net', dest='net',
54 | help='vgg16, res50, res101, res152',
55 | default='res50', type=str)
56 | parser.add_argument('--vocab', dest='vocabulary',
57 | help='vocabulary file',
58 | default=None, type=str)
59 |
60 | parser.add_argument('--set', dest='set_cfgs',
61 | help='set config keys', default=None,
62 | nargs=argparse.REMAINDER)
63 |
64 | if len(sys.argv) == 1:
65 | parser.print_help()
66 | sys.exit(1)
67 |
68 | args = parser.parse_args()
69 | return args
70 |
71 |
72 | if __name__ == '__main__':
73 | args = parse_args()
74 | print('------- called with args: --------')
75 | pprint.pprint(args)
76 |
77 | if args.cfg_file is not None:
78 | cfg_from_file(args.cfg_file)
79 | if args.set_cfgs is not None:
80 | cfg_from_list(args.set_cfgs)
81 |
82 | # load network
83 | if args.net == 'vgg16':
84 | net = vgg16()
85 | elif args.net == 'res50':
86 | net = resnetv1(num_layers=50)
87 | elif args.net == 'res101':
88 | net = resnetv1(num_layers=101)
89 | elif args.net == 'res152':
90 | net = resnetv1(num_layers=152)
91 | else:
92 | raise NotImplementedError
93 |
94 | net.create_architecture("TEST", num_classes=1, tag='pre')
95 |     vocab = ['', '', '']  # the first three indices are reserved (special tokens)
96 | with open(args.vocabulary, 'r') as f:
97 | for line in f:
98 | vocab.append(line.strip())
99 |
100 | # get the image paths
101 | im_paths = glob.glob('./data/demo/*.jpg')
102 | print(im_paths)
103 |
104 | # read checkpoint file
105 | if args.ckpt:
106 | ckpt = tf.train.get_checkpoint_state(args.ckpt)
107 | else:
108 |         raise ValueError("No checkpoint directory given, use --ckpt")
109 |
110 | # set config
111 | tfconfig = tf.ConfigProto(allow_soft_placement=True)
112 | tfconfig.gpu_options.allow_growth = True
113 |
114 | # init session
115 | saver = tf.train.Saver()
116 | with tf.Session(config=tfconfig) as sess:
117 | print('Restored from {}'.format(ckpt.model_checkpoint_path))
118 | saver.restore(sess, ckpt.model_checkpoint_path)
119 |
120 | # for n in tf.get_default_graph().as_graph_def().node:
121 | # if 'input_feed' in n.name:
122 | # print(n.name)
123 | # for html visualization
124 | pre_results = {}
125 | save_path = './vis/data'
126 | for path in im_paths:
127 | pre_results = test_im(sess, net, path, vocab, pre_results)
128 |
129 | with open(save_path + '/results.json', 'w') as f:
130 | json.dump(pre_results, f)
131 |
--------------------------------------------------------------------------------
/tools/test_net.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # DenseCap-Tensorflow
3 | # Written by InnerPeace
4 | # This file is adapted from Linjie's work
5 | # --------------------------------------------------------
6 | # Test a dense captioning model
7 | # Code adapted from faster R-CNN project
8 | # --------------------------------------------------------
9 | # Fast R-CNN
10 | # Copyright (c) 2015 Microsoft
11 | # Licensed under The MIT License [see LICENSE for details]
12 | # Written by Ross Girshick
13 | # --------------------------------------------------------
14 | from __future__ import absolute_import
15 | from __future__ import division
16 | from __future__ import print_function
17 |
18 | """Test a dense caption model"""
19 | import _init_paths
20 | from lib.dense_cap.test import test_net
21 | from lib.config import cfg, cfg_from_file, cfg_from_list
22 | from lib.datasets.factory import get_imdb
23 | import argparse
24 | import pprint
25 | import time
26 | import os
27 | import sys
28 | import tensorflow as tf
29 | from lib.nets.vgg16 import vgg16
30 | from lib.nets.resnet_v1 import resnetv1
31 |
32 |
33 | def parse_args():
34 | """
35 | Parse input arguments
36 | """
37 | parser = argparse.ArgumentParser(description='Test a Fast R-CNN network')
38 | parser.add_argument('--device', dest='device', help='device to use',
39 | default='gpu', type=str)
40 | parser.add_argument('--device_id', dest='device_id', help='device id to use',
41 | default=0, type=int)
42 | parser.add_argument('--tag', dest='tag',
43 | help='tag of the model',
44 | default=None, type=str)
45 | parser.add_argument('--ckpt', dest='ckpt',
46 | help='initialize with pretrained model weights',
47 | default=None, type=str)
48 | parser.add_argument('--cfg', dest='cfg_file',
49 | help='optional config file',
50 | default=None, type=str)
51 | parser.add_argument('--imdb', dest='imdb_name',
52 | help='dataset to test on',
53 | default='vg_1.2_test', type=str)
54 | # TODO: delete extra options
55 | # parser.add_argument('--iters', dest='max_iters',
56 | # help='number of iterations to train',
57 | # default=40000, type=int)
58 | # parser.add_argument('--imdbval', dest='imdbval_name',
59 | # help='dataset to validation on',
60 | # default='vg_1.2_val', type=str)
61 | # parser.add_argument('--rand', dest='randomize',
62 | # help='randomize (do not use a fixed seed)',
63 | # action='store_true')
64 | # TODO: add inception
65 | parser.add_argument('--net', dest='net',
66 | help='vgg16, res50, res101, res152',
67 | default='res50', type=str)
68 | parser.add_argument('--vis', dest='vis', help='visualize detections',
69 | action='store_true')
70 | parser.add_argument('--use_box_at', dest='use_box_at',
71 | help='use predicted box at this time step, default to the last',
72 | default=-1, type=int)
73 | parser.add_argument('--set', dest='set_cfgs',
74 | help='set config keys', default=None,
75 | nargs=argparse.REMAINDER)
76 |
77 | if len(sys.argv) == 1:
78 | parser.print_help()
79 | sys.exit(1)
80 |
81 | args = parser.parse_args()
82 | return args
83 |
84 |
85 | if __name__ == '__main__':
86 | args = parse_args()
87 |
88 | print('Called with args:')
89 | print(args)
90 |
91 | if args.cfg_file is not None:
92 | cfg_from_file(args.cfg_file)
93 | if args.set_cfgs is not None:
94 | cfg_from_list(args.set_cfgs)
95 |
96 | cfg.GPU_ID = args.device_id
97 |
98 | print('Using config:')
99 | pprint.pprint(cfg)
100 |
101 | imdb = get_imdb(args.imdb_name)
102 | # load network
103 | if args.net == 'vgg16':
104 | net = vgg16()
105 | elif args.net == 'res50':
106 | net = resnetv1(num_layers=50)
107 | elif args.net == 'res101':
108 | net = resnetv1(num_layers=101)
109 | elif args.net == 'res152':
110 | net = resnetv1(num_layers=152)
111 | else:
112 | raise NotImplementedError
113 |
114 | net.create_architecture("TEST", num_classes=1, tag='pre')
115 | # read checkpoint file
116 | if args.ckpt:
117 | ckpt = tf.train.get_checkpoint_state(args.ckpt)
118 | else:
119 |         raise ValueError("No checkpoint found in {}".format(args.ckpt))
120 |
121 | # set config
122 | tfconfig = tf.ConfigProto(allow_soft_placement=True)
123 | tfconfig.gpu_options.allow_growth = True
124 |
125 | # init session
126 | saver = tf.train.Saver()
127 | with tf.Session(config=tfconfig) as sess:
128 | print('Restored from {}'.format(ckpt.model_checkpoint_path))
129 | saver.restore(sess, ckpt.model_checkpoint_path)
130 |
131 | test_net(sess, net, imdb,
132 | vis=args.vis, use_box_at=args.use_box_at)
133 |
--------------------------------------------------------------------------------
/valohai.yaml:
--------------------------------------------------------------------------------
1 | ---
2 |
3 | - step:
4 | name: preprocess data
5 | image: gcr.io/tensorflow/tensorflow:1.3.0-devel-gpu
6 | command: bash ./lib/preprocess.sh {parameters}
7 | inputs:
8 | - name: image_meta
9 | default: http://visualgenome.org/static/data/dataset/image_data.json.zip
10 | - name: regions
11 | default: http://visualgenome.org/static/data/dataset/region_descriptions.json.zip
12 | parameters:
13 | - name: vs
14 | type: float
15 | pass-as: -vs {v}
16 | default: 1.2
17 | - name: path
18 | type: string
19 | pass-as: -p {v}
20 | default: "/valohai/inputs"
21 | - name: output_dir
22 | type: string
23 | pass-as: -od {v}
24 | default: "/valohai/inputs/visual_genome"
25 | - name: max_words
26 | type: integer
27 | pass-as: -mw {v}
28 | default: 10
29 |
30 | - step:
31 | name: download image data
32 | image: gcr.io/tensorflow/tensorflow:1.3.0-devel-gpu
33 | command: bash ./lib/download_data_vh.sh
34 | inputs:
35 | - name: image_1
36 | default: https://cs.stanford.edu/people/rak248/VG_100K_2/images.zip
37 | - name: image_2
38 | default: https://cs.stanford.edu/people/rak248/VG_100K_2/images2.zip
39 |
40 | - step:
41 | name: train model
42 | image: gcr.io/tensorflow/tensorflow:1.3.0-devel-gpu
43 | command: bash ./tests/dencap_oa_test.sh {parameters}
44 | inputs:
45 | - name: vg_data
46 | default: ""
47 | - name: resnet
48 | default: https://drive.google.com/uc?export=download&confirm=aZtH&id=15PxiEp7HP-ZSBG9xHMamZr-zh8iBDeA4
49 | - name: image_1
50 | default: https://cs.stanford.edu/people/rak248/VG_100K_2/images.zip
51 | - name: image_2
52 | default: https://cs.stanford.edu/people/rak248/VG_100K_2/images2.zip
53 | parameters:
54 | - name: iters
55 | type: integer
56 | pass-as: -iters {v}
57 | default: 80000
58 |
--------------------------------------------------------------------------------
/vis/README.md:
--------------------------------------------------------------------------------
1 |
2 | ### Visualization interface
3 |
4 | When you run `run_model.lua` with `-output_vis 1` (the default), it will write the images and a JSON struct to this folder's `data/` directory. These can then be viewed with this nice HTML interface.
5 |
6 | For example, to evaluate a checkpoint on some VG test data:
7 |
8 | ```
9 | th run_model.lua -checkpoint data/checkpoint.t7 -input_split test -vg_img_root_dir /path/to/visual-genome/images -max_images 10
10 | ```
11 |
12 | and then start a local web server, e.g. `python -m SimpleHTTPServer`, and open the `view_results.html` file in your browser!
13 |
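14 | For the TensorFlow port in this repository, the analogous script appears to be `tools/demo.py`: it runs the model over the images in `./data/demo/` and dumps a `results.json` into this folder's `data/` directory. A minimal sketch of that workflow (the checkpoint directory and vocabulary paths are placeholders, and Python 3 is assumed for the server command):
15 | 
16 | ```
17 | # from the repository root: writes vis/data/results.json
18 | python ./tools/demo.py --ckpt /path/to/checkpoint_dir \
19 |                        --cfg scripts/dense_cap_config.yml \
20 |                        --net res50 \
21 |                        --vocab /path/to/vocabulary.txt
22 | 
23 | # then serve this folder and open view_results.html in a browser
24 | cd vis && python3 -m http.server 8000
25 | ```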
--------------------------------------------------------------------------------
/vis/style.css:
--------------------------------------------------------------------------------
1 | body {
2 | color: #333;
3 | margin: 0;
4 | padding: 0;
5 | font-family: "HelveticaNeue-Light", "Helvetica Neue Light", "Helvetica Neue", Helvetica, Arial, "Lucida Grande", sans-serif;
6 | font-weight: 300;
7 | }
8 | svg {
9 | border: 1px solid black;
10 | background-color: #FFF;
11 | }
12 | hr {
13 | border: 1px solid black;
14 | }
15 | #wrap {
16 | width:800px;
17 | margin-left: auto;
18 | margin-right: auto;
19 | }
20 | #header {
21 | text-align: center;
22 | }
23 | #image_vis {
24 | background-color: #FFF;
25 | padding: 20px 0px;
26 | }
27 | #image_vis_controls {
28 | text-align: center;
29 | padding: 10px;
30 | background-color: #DDD;
31 | border: 1px solid #999;
32 | margin-bottom: 20px;
33 | }
34 | .bb {
35 | height: 50px;
36 | width: 175px;
37 | margin: 5px;
38 | }
39 | .ddesc {
40 | font-size: 32px;
41 | }
42 | .dcent {
43 | margin-left: auto;
44 | margin-right: auto;
45 | width: 720px;
46 | margin-bottom: 20px;
47 | }
48 | .djust {
49 | text-align: justify;
50 | }
--------------------------------------------------------------------------------
/vis/utils.js:
--------------------------------------------------------------------------------
1 |
2 | // helper function to create HSL string from a vector of colors
3 | var renderHSL = function(hsl) { // omg
4 | var ht = Math.min(360, Math.max(0, hsl[0]));
5 | var st = Math.min(100, Math.max(0, hsl[1]));
6 | var lt = Math.min(100, Math.max(0, hsl[2]));
7 | return 'hsl(' + ht + ',' + st + '%,' + lt + '%)';
8 | }
9 |
10 | // randomly shuffle an array
11 | function shuffle(array) {
12 | var currentIndex = array.length, temporaryValue, randomIndex ;
13 | // While there remain elements to shuffle...
14 | while (0 !== currentIndex) {
15 | // Pick a remaining element...
16 | randomIndex = Math.floor(Math.random() * currentIndex);
17 | currentIndex -= 1;
18 | // And swap it with the current element.
19 | temporaryValue = array[currentIndex];
20 | array[currentIndex] = array[randomIndex];
21 | array[randomIndex] = temporaryValue;
22 | }
23 | return array;
24 | }
25 |
26 | // html escaping util
27 | var entityMap = {
28 |   "&": "&amp;",
29 |   "<": "&lt;",
30 |   ">": "&gt;",
31 |   '"': '&quot;',
32 |   "'": '&#39;',
33 |   "/": '&#x2F;'
34 | };
35 | function escapeHtml(string) {
36 | return String(string).replace(/[&<>"'\/]/g, function (s) {
37 | return entityMap[s];
38 | });
39 | }
40 |
41 |
42 | // store colors in a global var because why not
43 | var WAD_COLORS = [
44 | "rgb(173, 35, 35)", // Red
45 | "rgb(42, 75, 215)", // Blue
46 | "rgb(87, 87, 87)", // Dark Gray
47 | "rgb(29, 105, 20)", // Green
48 | "rgb(129, 74, 25)", // Brown
49 | "rgb(129, 38, 192)", // Purple
50 | "rgb(160, 160, 160)", // Lt Gray
51 | "rgb(129, 197, 122)", // Lt green
52 | "rgb(157, 175, 255)", // Lt blue
53 | "rgb(41, 208, 208)", // Cyan
54 | "rgb(255, 146, 51)", // Orange
55 | "rgb(255, 238, 51)", // Yellow
56 | "rgb(233, 222, 187)", // Tan
57 | "rgb(255, 205, 243)", // Pink
58 | // "rgb(255, 255, 255)", // White
59 | //"rgb(0, 0, 0)", // Black
60 | ];
61 |
62 | // ----------------------------------------------------------------------------
63 | // visualization utils
64 | // ----------------------------------------------------------------------------
65 |
66 | // renders a bounding box and text annotation in svg element elt. assumes d3js
67 | function renderBox(elt, box, color, width, text) {
68 | if (typeof(width) === 'undefined') width = 1;
69 | elt.append('rect')
70 | .attr('x', box[0])
71 | .attr('y', box[1])
72 | .attr('width', box[2])
73 | .attr('height', box[3])
74 | .attr('stroke', color)
75 | .attr('fill', 'none')
76 | .attr('stroke-width', width);
77 | if (typeof(text) !== 'undefined' && text != '') {
78 | var t = elt.append('text').text(text)
79 | .attr('x', box[0]).attr('y', box[1])
80 | .attr('dominant-baseline', 'hanging')
81 | .attr('text-anchor', 'start');
82 | t = t[0][0];
83 | var tbox = t.getBBox();
84 | elt.insert('rect', 'text').attr('fill', color)
85 | .attr('x', tbox.x).attr('y', tbox.y)
86 | .attr('width', tbox.width)
87 | .attr('height', tbox.height);
88 | }
89 | }
90 |
--------------------------------------------------------------------------------
/vis/view_results.html:
--------------------------------------------------------------------------------
6 | DenseCap results browser
149 | Browse the results using the WSAD hotkeys (A,D: prev/next image, W/S: more/less detections)
--------------------------------------------------------------------------------