├── .gitignore ├── LICENSE ├── Note.md ├── README.md ├── __init__.py ├── info ├── __init__.py ├── densecap_splits.json ├── read_regions.py ├── read_splits.py ├── test.txt ├── train.txt └── val.txt ├── lib ├── Makefile ├── __init__.py ├── config.py ├── datasets │ ├── __init__.py │ ├── factory.py │ ├── imdb.py │ └── visual_genome.py ├── dense_cap │ ├── __init__.py │ ├── beam_search.py │ ├── caption_generator.py │ ├── test.py │ ├── train.py │ └── vis_whtml.py ├── download_data_vh.sh ├── fast_rcnn │ ├── __init__.py │ ├── bbox_transform.py │ ├── layer.py │ ├── minibatch.py │ ├── nms_wrapper.py │ └── roidb.py ├── layers │ ├── __init__.py │ ├── anchor_target_layer.py │ ├── generate_anchors.py │ ├── global_roi_layer.py │ ├── proposal_layer.py │ ├── proposal_target_layer.py │ ├── proposal_target_single_class_layer.py │ ├── proposal_top_layer.py │ ├── rois_offset_layer.py │ ├── sentence_data_layer.py │ └── snippets.py ├── limit_ram │ ├── __init__.py │ └── utils.py ├── nets │ ├── __init__.py │ ├── mobilenet_v1.py │ ├── network.py │ ├── resnet_v1.py │ └── vgg16.py ├── nms │ ├── __init__.py │ ├── cpu_nms.c │ ├── cpu_nms.pyx │ ├── gpu_nms.cpp │ ├── gpu_nms.hpp │ ├── gpu_nms.pyx │ ├── nms_kernel.cu │ └── py_cpu_nms.py ├── pre_glove.py ├── preprocess.py ├── preprocess.sh ├── pycocoevalcap │ ├── README │ ├── __init__.py │ ├── bleu │ │ ├── LICENSE │ │ ├── __init__.py │ │ ├── bleu.py │ │ └── bleu_scorer.py │ ├── cider │ │ ├── __init__.py │ │ ├── cider.py │ │ └── cider_scorer.py │ ├── eval.py │ ├── meteor │ │ ├── __init__.py │ │ ├── meteor-1.5.jar │ │ └── meteor.py │ ├── rouge │ │ ├── __init__.py │ │ └── rouge.py │ ├── tokenizer │ │ ├── __init__.py │ │ ├── ptbtokenizer.py │ │ ├── stanford-corenlp-3.4.1.jar │ │ └── tmpGeypfw │ └── vg_eval.py ├── setup.py └── utils │ ├── __init__.py │ ├── bbox.c │ ├── bbox.pyx │ ├── bbox_utils.py │ ├── blob.py │ ├── debug.py │ ├── timer.py │ └── visualization.py ├── logs ├── densecap.png └── funny.png ├── requirements.txt ├── scripts ├── dense_cap_config.yml ├── dense_cap_demo.sh ├── dense_cap_test.sh ├── dense_cap_train.sh └── old_dense_cap_train.sh ├── tests ├── README.md ├── __init__.py ├── architecture_test.py ├── bash_log_test │ ├── bash_log_test.sh │ ├── logs │ │ └── test.txt.2017-10-18_15-33-56 │ └── nonsense.py ├── ckpt_restore_test.py ├── dencap_oa_test.sh ├── logs │ ├── architecture_test.txt │ ├── architecture_test_nodes.txt │ ├── preprocessing.txt │ └── sentence_data_layer_test.txt ├── pickle_read_test.py ├── read_regions_json │ ├── ijson_example.txt │ ├── read_regions_test.py │ ├── test_region.json │ ├── test_region_out.json │ ├── true_id_1.json │ └── true_id_1_out.json ├── roidata_test.py ├── sentence_data_layer_test.py └── vh_train_command.sh ├── tools ├── __init__.py ├── _init_paths.py ├── demo.py ├── test_net.py └── train_net.py ├── valohai.yaml └── vis ├── README.md ├── d3.min.js ├── jquery-1.8.3.min.js ├── style.css ├── utils.js └── view_results.html /.gitignore: -------------------------------------------------------------------------------- 1 | #sublime 2 | *.sublime-workspace 3 | *.sublime-project 4 | #pycharm 5 | .idea/ 6 | data/ 7 | demo/ 8 | experiments/ 9 | 10 | tensorboard/ 11 | output/ 12 | 13 | # Byte-compiled / optimized / DLL files 14 | __pycache__/ 15 | *.py[cod] 16 | *$py.class 17 | 18 | # C extensions 19 | *.so 20 | 21 | # Distribution / packaging 22 | .Python 23 | env/ 24 | build/ 25 | develop-eggs/ 26 | dist/ 27 | downloads/ 28 | eggs/ 29 | .eggs/ 30 | lib64/ 31 | parts/ 32 | sdist/ 33 | var/ 34 | wheels/ 35 | *.egg-info/ 36 | .installed.cfg 
37 | *.egg 38 | 39 | # PyInstaller 40 | # Usually these files are written by a python script from a template 41 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 42 | *.manifest 43 | *.spec 44 | 45 | # Installer logs 46 | pip-log.txt 47 | pip-delete-this-directory.txt 48 | 49 | # Unit test / coverage reports 50 | htmlcov/ 51 | .tox/ 52 | .coverage 53 | .coverage.* 54 | .cache 55 | nosetests.xml 56 | coverage.xml 57 | *.cover 58 | .hypothesis/ 59 | 60 | # Translations 61 | *.mo 62 | *.pot 63 | 64 | # Django stuff: 65 | *.log 66 | local_settings.py 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # celery beat schedule file 88 | celerybeat-schedule 89 | 90 | # SageMath parsed files 91 | *.sage.py 92 | 93 | # dotenv 94 | .env 95 | 96 | # virtualenv 97 | .venv 98 | venv/ 99 | ENV/ 100 | 101 | # Spyder project settings 102 | .spyderproject 103 | .spyproject 104 | 105 | # Rope project settings 106 | .ropeproject 107 | 108 | # mkdocs documentation 109 | /site 110 | 111 | # mypy 112 | .mypy_cache/ 113 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Innerpeace 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Densecap-tensorflow 2 | 3 | Implementation of the CVPR 2017 paper [Dense captioning with joint inference and visual context](https://arxiv.org/abs/1611.06949) by **Linjie Yang, Kevin Tang, Jianchao Yang, Li-Jia Li** 4 | 5 | **WITH CHANGES:** 6 | 1. Borrowed the idea of [Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling](https://arxiv.org/abs/1611.01462) and tied the word vectors and word classifiers during captioning (see the sketch below this list). 7 | 2. Initialized the word vectors and word classifiers with pre-trained 300-dimensional [GloVe](https://nlp.stanford.edu/projects/glove/) word vectors. 8 | 3. Changed the backbone of the framework to ResNet-50. 9 | 4. Added `Beam Search` and `Length Normalization` in test mode (see the note below this list). 10 | 5. 
Add "Limit_RAM" mode when praparing training date since my computer only has RAM with 8G. 11 | 12 |
13 | 14 | 15 |
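A short note on change 4: in `lib/dense_cap/beam_search.py`, each candidate caption is scored by its accumulated log-probability, and when a caption emits the end token and `cfg.TEST.LN_FACTOR > 0` the score is length-normalized as `score = logprob / len(sentence) ** LN_FACTOR`, so that beam search does not systematically prefer shorter captions.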
16 | 17 | **Special thanks to [valohai](https://valohai.com/) for offering computing resources.** 18 | 19 | ## Note 20 | 21 | **Update 2017.12.31** 22 | 23 | * After 500k iterations of training with the configuration of the original paper (except for the tying of word vectors and classifiers), it achieves **mAP 8.296**. 24 | 25 | **Update 2017.12.20** 26 | 27 | * After 1 epoch (80,000 iterations) of training with randomly initialized word vectors (512-d), it achieves **mAP 6.509**. 28 | * After 1 epoch (75,000 iterations) of training with pre-trained GloVe word vectors (300-d), it achieves roughly **mAP 5.5**. 29 | * The complete training process will take almost **10 days** with the computation I have access to, and I have only trained 1 epoch to verify the framework for now. 30 | * The scripts should be compatible with both Python 2.x and 3.x, although I built them under Python 2.7. 31 | * Tested on Ubuntu 16.04 with TensorFlow 1.4, CUDA 8.0 and cuDNN 6.0, on an Nvidia GTX 1060 GPU (LOL...). 32 | 33 | ## Dependencies 34 | 35 | Install the required Python modules with: 36 | 37 | ```commandline 38 | pip install -r lib/requirements.txt 39 | ``` 40 | 41 | **For evaluation, one also needs:** 42 | * Java 1.8.0 43 | * Python 2.7 (according to 44 | [coco-caption](https://github.com/tylin/coco-caption)) 45 | 46 | Install the Java runtime with: 47 | ```commandline 48 | sudo apt-get install openjdk-8-jre 49 | ``` 50 | 51 | ## Preparing data 52 | 53 | ### Download 54 | 55 | [Website of Visual Genome Dataset](http://visualgenome.org/api/v0/api_home.html) 56 | 57 | * Make a new directory `VG` wherever you like. 58 | * Download `images` Part 1 and Part 2, and extract `all (two parts)` to the directory `VG/images`. 59 | * Download `image meta data`, and extract it to the directory `VG/1.2` or `VG/1.0` according to the version you downloaded. 60 | * Download `region descriptions`, and extract them to the directory `VG/1.2` or `VG/1.0` accordingly. 61 | * In the following steps, we refer to **the absolute path** of the directory `VG` as `raw_data_path`, e.g. `/home/user/git/VG`. 62 | 63 | ### Unlimited RAM (more than 16G) 64 | 65 | If you have more than 16G of RAM, you can preprocess the dataset with the following commands. 66 | ```shell 67 | $ cd $ROOT/lib 68 | $ python preprocess.py --version [version] --path [raw_data_path] \ 69 | --output_dir [dir] --max_words [max_len] 70 | ``` 71 | 72 | ### Limit RAM (less than 16G) 73 | 74 | If you have `less than 16G` of RAM: 75 | * First, set up the data path in `info/read_regions.py` accordingly and run the script. It will dump the `regions` of every image into the `REGION_JSON` directory. Processing more than 100k images takes a while, so be patient. 76 | ```shell 77 | $ cd $ROOT/info 78 | $ python read_regions.py --version [version] --vg_path [raw_data_path] 79 | ``` 80 | * In `lib/preprocess.py`, set up the data path accordingly. Running the script will dump the `gt_regions` of every image separately into the `OUTPUT_DIR` directory. 81 | ```shell 82 | $ cd $ROOT/lib 83 | $ python preprocess.py --version [version] --path [raw_data_path] \ 84 | --output_dir [dir] --max_words [max_len] --limit_ram 85 | ``` 86 | 87 | ## Compile local libs 88 | 89 | ```shell 90 | $ cd $ROOT/lib 91 | $ make 92 | ``` 93 | 94 | ## Train 95 | 96 | Add or modify configurations in `$ROOT/scripts/dense_cap_config.yml`; refer to `lib/config.py` for more configuration details. 97 | ```shell 98 | $ cd $ROOT 99 | $ bash scripts/dense_cap_train.sh [dataset] [net] [ckpt_to_init] [data_dir] [step] 100 | ``` 101 | 102 | Parameters: 103 | * dataset: `visual_genome_1.2` or `visual_genome_1.0`. 
104 | * net: res50, res101 105 | * ckpt_to_init: the pretrained model to initialize the network with. Refer to [tf_faster_rcnn](https://github.com/endernewton/tf-faster-rcnn) for more details on the initial weights. 106 | * data_dir: the data directory where you saved the outputs of the `Preparing data` step. 107 | * step: for staged training. 108 | - step 1: fix the convnet weights 109 | - step 2: finetune the convnet weights 110 | - step 3: add context fusion, but fix the convnet weights 111 | - step 4: finetune the whole model. 112 | 113 | ## Demo 114 | 115 | Create a directory `data/demo`: 116 | ```sh 117 | $ mkdir $ROOT/data/demo 118 | ``` 119 | Then put the images to be tested in that directory. 120 | 121 | **Download the pretrained model (500k iterations)** from [Google Drive](https://drive.google.com/file/d/1yoJGXXpeSpQbU-6WpLsMXFLIka7xpTAy/view?usp=sharing) 122 | or [Jbox](https://jbox.sjtu.edu.cn/l/j5EeUN). Then create an `output` 123 | directory under `$ROOT`: 124 | ```sh 125 | $ mkdir $ROOT/output 126 | ``` 127 | Extract the downloaded `ckpt.zip` to the directory `$ROOT/output`, 128 | and run 129 | ```sh 130 | $ cd $ROOT 131 | $ bash scripts/dense_cap_demo.sh ./output/ckpt ./output/ckpt/vocabulary.txt 132 | ``` 133 | or run 134 | ```sh 135 | $ bash scripts/dense_cap_demo.sh [ckpt_path] [vocab_path] 136 | ``` 137 | for your customized checkpoint directory. 138 | 139 | It will create html files in `$ROOT/demo`; just open them in a browser. 140 | Or you can use the web-based visualizer created by [karpathy](https://github.com/karpathy) by running 141 | ```sh 142 | $ cd $ROOT/vis 143 | $ python -m SimpleHTTPServer 8181 144 | ``` 145 | Then point your web browser to [http://localhost:8181/view_results.html](http://localhost:8181/view_results.html). 146 | 147 | ## TODO: 148 | 149 | - [x] preprocess the dataset. 150 | - [x] roi_data_layer & get data well prepared for feeding. 151 | - [x] proposal layer 152 | - [x] sentence data layer 153 | - [x] embedding layer 154 | - [x] get loc loss and caption loss 155 | - [x] overfit a mini-batch 156 | - [x] context fusion 157 | - [x] add experiment results. 158 | 159 | ## References 160 | 161 | * The Faster-RCNN framework is inherited from the repo [tf-faster-rcnn](https://github.com/endernewton/tf-faster-rcnn) by [endernewton](https://github.com/endernewton) 162 | * The official repo of [densecap](https://github.com/linjieyangsc/densecap) 163 | * [Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling](https://arxiv.org/abs/1611.01462) 164 | * Official TensorFlow models - "im2txt". 
165 | * Adapted web-based visualizer from [jcjohnson](https://github.com/jcjohnson)'s [densecap repo](https://github.com/jcjohnson/densecap) 166 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/InnerPeace-Wu/densecap-tensorflow/2c77998f73832d5bb7324e97e8a7419cbce8398c/__init__.py -------------------------------------------------------------------------------- /info/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/InnerPeace-Wu/densecap-tensorflow/2c77998f73832d5bb7324e97e8a7419cbce8398c/info/__init__.py -------------------------------------------------------------------------------- /info/read_regions.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------- 2 | # DenseCap 3 | # Written by InnerPeace 4 | # ---------------------------------------------- 5 | from __future__ import absolute_import 6 | from __future__ import division 7 | from __future__ import print_function 8 | 9 | """read large region description json files""" 10 | 11 | import ijson 12 | import json 13 | import sys 14 | import os 15 | import argparse 16 | 17 | parser = argparse.ArgumentParser(description='Preprocessing visual genome') 18 | parser.add_argument('--version', dest='version', type=float, default=1.2, help='the version of visual genome dataset.') 19 | parser.add_argument('--vg_path', dest='vg_path', type=str, default='/home/joe/git/VG_raw_data', help='directory keeping the raw dataset of visual genome') 20 | 21 | args = parser.parse_args() 22 | VG_VERSION = args.version 23 | VG_PATH = args.vg_path 24 | 25 | VG_REGION_PATH = '%s/%s/region_descriptions.json' % (VG_PATH, VG_VERSION) 26 | REGION_JSON = '%s/%s/regions' % (VG_PATH, VG_VERSION) 27 | 28 | 29 | def read_regions(): 30 | if not os.path.exists(REGION_JSON): 31 | os.makedirs(REGION_JSON) 32 | parser = ijson.parse(open(VG_REGION_PATH)) 33 | last_value = None 34 | Dic = {} 35 | regions = [] 36 | dic = {} 37 | count = 0 38 | for prefix, event, value in parser: 39 | sys.stdout.write('>>> %d \r' % count) 40 | sys.stdout.flush() 41 | if value == 'regions': 42 | Dic = {} 43 | regions = [] 44 | last_value = None 45 | elif last_value == 'id' and value: 46 | count += 1 47 | Dic['regions'] = regions 48 | Dic['id'] = value 49 | with open(REGION_JSON + '/%s.json' % value, 'w') as f: 50 | json.dump(Dic, f) 51 | elif event == 'map_key': 52 | last_value = value 53 | elif event == 'end_map': 54 | regions.append(dic) 55 | dic = {} 56 | last_value = None 57 | elif last_value: 58 | dic[last_value] = value 59 | 60 | 61 | if __name__ == '__main__': 62 | read_regions() 63 | -------------------------------------------------------------------------------- /info/read_splits.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------- 2 | # DenseCap 3 | # Written by InnerPeace 4 | # ---------------------------------------------- 5 | from __future__ import absolute_import 6 | from __future__ import division 7 | from __future__ import print_function 8 | 9 | '''Read splits''' 10 | 11 | import json 12 | 13 | def read_splits(): 14 | file = 'densecap_splits.json' 15 | with open(file, 'r') as f: 16 | data = json.load(f) 17 | splits = ['train', 'val', 'test'] 18 | for split in splits: 19 | print("%s set 
has %s examples." % (split, len(data[split]))) 20 | with open(split + '.txt', 'w') as f: 21 | for id in data[split]: 22 | f.write("%s\n" % id) 23 | 24 | 25 | if __name__ == '__main__': 26 | read_splits() 27 | -------------------------------------------------------------------------------- /lib/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | python setup.py build_ext --inplace 3 | rm -rf build 4 | -------------------------------------------------------------------------------- /lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/InnerPeace-Wu/densecap-tensorflow/2c77998f73832d5bb7324e97e8a7419cbce8398c/lib/__init__.py -------------------------------------------------------------------------------- /lib/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/InnerPeace-Wu/densecap-tensorflow/2c77998f73832d5bb7324e97e8a7419cbce8398c/lib/datasets/__init__.py -------------------------------------------------------------------------------- /lib/datasets/factory.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DenseCap-Tensorflow 3 | # Written by InnerPeace 4 | # This file is adapted from Linjie's work 5 | # -------------------------------------------------------- 6 | 7 | """Factory method for easily getting imdbs by name.""" 8 | 9 | __sets = {} 10 | 11 | from visual_genome import visual_genome 12 | 13 | 14 | # Set up visual_genome_ using rpn mode 15 | # for version in ['1.0', '1.2']: 16 | for version in ['1.2']: 17 | for split in ['train', 'val', 'test']: 18 | name = 'vg_{}_{}'.format(version, split) 19 | __sets[name] = (lambda split=split, version=version: 20 | visual_genome(split, version)) 21 | 22 | 23 | def get_imdb(name): 24 | """Get an imdb (image database) by name.""" 25 | if not __sets.has_key(name): 26 | raise KeyError('Unknown dataset: {}'.format(name)) 27 | return __sets[name]() 28 | 29 | 30 | def list_imdbs(): 31 | """List all registered imdbs.""" 32 | return __sets.keys() 33 | -------------------------------------------------------------------------------- /lib/dense_cap/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/InnerPeace-Wu/densecap-tensorflow/2c77998f73832d5bb7324e97e8a7419cbce8398c/lib/dense_cap/__init__.py -------------------------------------------------------------------------------- /lib/dense_cap/beam_search.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DenseCap-Tensorflow 3 | # Written by InnerPeace 4 | # This file is adapted from Linjie's work 5 | # and Google's im2txt project 6 | # -------------------------------------------------------- 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import pdb 12 | import math 13 | from lib.dense_cap.caption_generator import * 14 | import numpy as np 15 | from lib.config import cfg 16 | import tensorflow as tf 17 | from six.moves import xrange 18 | 19 | 20 | def beam_search(sess, net, blobs, im_scales): 21 | # (TODO wu) for now it only works with "concat" mode 22 | # get initial states and rois 23 | if cfg.CONTEXT_FUSION: 24 | cap_state, loc_state, 
scores, \ 25 | rois, gfeat_state = net.feed_image(sess, 26 | blobs['data'], 27 | blobs['im_info'][0]) 28 | all_states = np.concatenate((cap_state, loc_state, gfeat_state), axis=1) 29 | else: 30 | cap_state, loc_state, scores, rois = net.feed_image(sess, blobs['data'], 31 | blobs['im_info'][0]) 32 | all_states = np.concatenate((cap_state, loc_state), axis=1) 33 | 34 | # proposal boxes 35 | boxes = rois[:, 1:5] / im_scales[0] 36 | proposal_n = rois.shape[0] 37 | 38 | all_partial_caps = [] 39 | all_complete_caps = [] 40 | beam_size = cfg.TEST.BEAM_SIZE 41 | for i in xrange(proposal_n): 42 | init_beam = Caption(sentence=[cfg.VOCAB_START_ID], 43 | state=all_states[i], 44 | box_pred=[], 45 | logprob=0.0, 46 | score=0.0, 47 | metadata=[""]) 48 | partial_cap = TopN(beam_size) 49 | partial_cap.push(init_beam) 50 | complete_cap = TopN(beam_size) 51 | all_partial_caps.append(partial_cap) 52 | all_complete_caps.append(complete_cap) 53 | 54 | for j in xrange(cfg.TIME_STEPS - 1): 55 | all_candidates_len = [] 56 | flag = False 57 | for i in xrange(proposal_n): 58 | partial_cap = all_partial_caps[i] 59 | size = partial_cap.size() 60 | all_candidates_len.append(size) 61 | if not size: 62 | continue 63 | partial_cap_list = partial_cap.get_data() 64 | input_feed_i = [c.sentence[-1] for c in partial_cap_list] 65 | state_feed_i = [c.state for c in partial_cap_list] 66 | if not flag: 67 | flag = True 68 | input_feed = np.array(input_feed_i) 69 | state_feed = np.array(state_feed_i) 70 | else: 71 | input_feed = np.concatenate((input_feed, np.array(input_feed_i))) 72 | state_feed = np.concatenate((state_feed, np.array(state_feed_i))) 73 | 74 | if cfg.CONTEXT_FUSION: 75 | cap_feed, loc_feed, gfeat_feed = np.split(state_feed, 3, axis=1) 76 | cap_probs, new_bbox_pred, new_cap_state, new_loc_state, \ 77 | new_gfeat_state = net.inference_step(sess, input_feed, 78 | cap_feed, loc_feed, gfeat_feed) 79 | new_state = np.concatenate((new_cap_state, new_loc_state, new_gfeat_state), 80 | axis=1) 81 | else: 82 | cap_feed, loc_feed = np.split(state_feed, 2, axis=1) 83 | cap_probs, new_bbox_pred, new_cap_state, \ 84 | new_loc_state = net.inference_step(sess, input_feed, 85 | cap_feed, loc_feed) 86 | new_state = np.concatenate((new_cap_state, new_loc_state), axis=1) 87 | 88 | count = 0 89 | for k in xrange(proposal_n): 90 | l = all_candidates_len[k] 91 | if l == 0: 92 | continue 93 | partial_cap = all_partial_caps[k] 94 | complete_cap = all_complete_caps[k] 95 | partial_cap_list = partial_cap.extract() 96 | partial_cap.reset() 97 | softmax_k = cap_probs[count: count + l] 98 | states_k = new_state[count: count + l] 99 | bbox_pred_k = new_bbox_pred[count: count + l] 100 | count += l 101 | for i, par_cap in enumerate(partial_cap_list): 102 | word_probs = softmax_k[i] 103 | state = states_k[i] 104 | bbox_pred = bbox_pred_k[i] 105 | # For this partial caption, get the beam_size most probable next words. 
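# (The loop below expands each surviving partial caption with its `beam_size` most probable next words; candidates that emit the END token are moved to `complete_cap`, with optional length normalization of the score.)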
106 | words_and_probs = list(enumerate(word_probs)) 107 | words_and_probs.sort(key=lambda x: -x[1]) 108 | words_and_probs = words_and_probs[0: beam_size] 109 | # Each next word gives a new partial caption 110 | for w, p in words_and_probs: 111 | if p < 1e-12: 112 | continue # Avoid log(0) 113 | sentence = par_cap.sentence + [w] 114 | logprob = par_cap.logprob + math.log(p) 115 | sc = logprob 116 | box_pred = par_cap.box_pred 117 | box_pred.append(bbox_pred) 118 | if w == cfg.VOCAB_END_ID: 119 | if cfg.TEST.LN_FACTOR > 0: 120 | sc /= len(sentence) ** cfg.TEST.LN_FACTOR 121 | beam = Caption(sentence, state, box_pred, logprob, sc) 122 | complete_cap.push(beam) 123 | else: 124 | beam = Caption(sentence, state, box_pred, logprob, sc) 125 | partial_cap.push(beam) 126 | captions = [] 127 | box_offsets = np.zeros((proposal_n, 4), dtype=np.float32) 128 | for i in xrange(proposal_n): 129 | complete_cap = all_complete_caps[i] 130 | if not complete_cap.size(): 131 | complete_cap = all_partial_caps[i] 132 | caps_i = complete_cap.extract(sort=True) 133 | captions.append(caps_i[0].sentence) 134 | box_offsets[i] = caps_i[0].box_pred[-1] 135 | 136 | return scores, box_offsets, captions, boxes 137 | -------------------------------------------------------------------------------- /lib/dense_cap/vis_whtml.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------- 2 | # DenseCap 3 | # Written by InnerPeace 4 | # ---------------------------------------------- 5 | from __future__ import absolute_import 6 | from __future__ import division 7 | from __future__ import print_function 8 | 9 | import cv2 10 | import os 11 | import numpy as np 12 | from six.moves import xrange 13 | 14 | 15 | def vis_whtml(im_path, im, captions, dets, pre_results=dict(), 16 | thresh=0.5, save_path='./vis/data'): 17 | print("visualizing with pretty html...") 18 | if not os.path.exists(save_path): 19 | os.makedirs(save_path) 20 | 21 | im_name = im_path.split('/')[-1][:-4] 22 | box_xywh = [] 23 | box_caps = [] 24 | scores = [] 25 | for i in xrange(dets.shape[0]): 26 | if dets[i, -1] > thresh: 27 | box_xywh.append(box2xywh(dets[i, :4].tolist())) 28 | box_caps.append(captions[i]) 29 | scores.append(float(dets[i, -1])) 30 | 31 | # save image 32 | im_new = np.copy(im) 33 | cv2.imwrite("%s/%s.jpg" % (save_path, im_name), im_new) 34 | result = {"img_name": "%s.jpg" % im_name, 35 | "scores": scores, 36 | "captions": box_caps, 37 | "boxes": box_xywh} 38 | pre_results["results"] = pre_results.get("results", []) + [result] 39 | 40 | return pre_results 41 | 42 | 43 | def box2xywh(box): 44 | xywh = [] 45 | xywh.extend(box[:2]) 46 | for i in xrange(2): 47 | xywh.append(box[i+2] - box[i]) 48 | 49 | return xywh 50 | -------------------------------------------------------------------------------- /lib/download_data_vh.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | set -x 4 | 5 | cd /valohai/inputs 6 | mv image_1/images.zip image_2/images2.zip /valohai/outputs 7 | -------------------------------------------------------------------------------- /lib/fast_rcnn/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # 
-------------------------------------------------------- 7 | -------------------------------------------------------------------------------- /lib/fast_rcnn/bbox_transform.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | 10 | 11 | def bbox_transform(ex_rois, gt_rois): 12 | ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + 1.0 13 | ex_heights = ex_rois[:, 3] - ex_rois[:, 1] + 1.0 14 | ex_ctr_x = ex_rois[:, 0] + 0.5 * ex_widths 15 | ex_ctr_y = ex_rois[:, 1] + 0.5 * ex_heights 16 | 17 | gt_widths = gt_rois[:, 2] - gt_rois[:, 0] + 1.0 18 | gt_heights = gt_rois[:, 3] - gt_rois[:, 1] + 1.0 19 | gt_ctr_x = gt_rois[:, 0] + 0.5 * gt_widths 20 | gt_ctr_y = gt_rois[:, 1] + 0.5 * gt_heights 21 | 22 | targets_dx = (gt_ctr_x - ex_ctr_x) / ex_widths 23 | targets_dy = (gt_ctr_y - ex_ctr_y) / ex_heights 24 | targets_dw = np.log(gt_widths / ex_widths) 25 | targets_dh = np.log(gt_heights / ex_heights) 26 | 27 | targets = np.vstack( 28 | (targets_dx, targets_dy, targets_dw, targets_dh)).transpose() 29 | return targets 30 | 31 | 32 | def bbox_transform_inv(boxes, deltas): 33 | if boxes.shape[0] == 0: 34 | return np.zeros((0, deltas.shape[1]), dtype=deltas.dtype) 35 | 36 | boxes = boxes.astype(deltas.dtype, copy=False) 37 | 38 | widths = boxes[:, 2] - boxes[:, 0] + 1.0 39 | heights = boxes[:, 3] - boxes[:, 1] + 1.0 40 | ctr_x = boxes[:, 0] + 0.5 * widths 41 | ctr_y = boxes[:, 1] + 0.5 * heights 42 | 43 | dx = deltas[:, 0::4] 44 | dy = deltas[:, 1::4] 45 | dw = deltas[:, 2::4] 46 | dh = deltas[:, 3::4] 47 | 48 | pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis] 49 | pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis] 50 | pred_w = np.exp(dw) * widths[:, np.newaxis] 51 | pred_h = np.exp(dh) * heights[:, np.newaxis] 52 | 53 | pred_boxes = np.zeros(deltas.shape, dtype=deltas.dtype) 54 | # x1 55 | pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w 56 | # y1 57 | pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h 58 | # x2 59 | pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w - 1 # to make it the perfect inversion of bbox_transform 60 | # y2 61 | pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h - 1 # to make it the perfect inversion of bbox_transform 62 | 63 | return pred_boxes 64 | 65 | 66 | def clip_boxes(boxes, im_shape): 67 | """ 68 | Clip boxes to image boundaries. 
69 | """ 70 | 71 | # x1 >= 0 72 | boxes[:, 0::4] = np.maximum(np.minimum(boxes[:, 0::4], im_shape[1] - 1), 0) 73 | # y1 >= 0 74 | boxes[:, 1::4] = np.maximum(np.minimum(boxes[:, 1::4], im_shape[0] - 1), 0) 75 | # x2 < im_shape[1] 76 | boxes[:, 2::4] = np.maximum(np.minimum(boxes[:, 2::4], im_shape[1] - 1), 0) 77 | # y2 < im_shape[0] 78 | boxes[:, 3::4] = np.maximum(np.minimum(boxes[:, 3::4], im_shape[0] - 1), 0) 79 | return boxes 80 | -------------------------------------------------------------------------------- /lib/fast_rcnn/layer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DenseCap-Tensorflow 3 | # Written by InnerPeace 4 | # This file is adapted from Linjie's work and Xinlei's work 5 | # -------------------------------------------------------- 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | from os.path import join as pjoin 11 | from lib.config import cfg 12 | from lib.fast_rcnn.minibatch import get_minibatch 13 | import numpy as np 14 | import time 15 | import json 16 | 17 | 18 | class RoIDataLayer(object): 19 | """densecap data layer used for training.""" 20 | 21 | def __init__(self, roidb, random=False): 22 | """set the roidb to be used by this layer during training.""" 23 | self._roidb = roidb 24 | # set a random flag 25 | self._random = random 26 | self._shuffle_roidb_inds() 27 | 28 | def _shuffle_roidb_inds(self): 29 | """Randomly permute the training roidb.""" 30 | 31 | # if the random flag is set, 32 | # then the database is shuffled according to system time 33 | # useful for the validation set. 34 | if self._random: 35 | st0 = np.random.get_state() 36 | millis = int(round(time.time() * 1000)) % 4294967259 37 | np.random.seed(millis) 38 | 39 | if not cfg.LIMIT_RAM: 40 | # with sending in the giant roidb list 41 | if cfg.TRAIN.ASPECT_GROUPING: 42 | widths = np.array([r['width'] for r in self._roidb]) 43 | heights = np.array([r['height'] for r in self._roidb]) 44 | horz = (widths >= heights) 45 | vert = np.logical_not(horz) 46 | horz_inds = np.where(horz)[0] 47 | vert_inds = np.where(vert)[0] 48 | inds = np.hstack(( 49 | np.random.permutation(horz_inds), 50 | np.random.permutation(vert_inds))) 51 | inds = np.reshape(inds, (-1, 2)) 52 | row_perm = np.random.permutation(np.arange(inds.shape[0])) 53 | inds = np.reshape(inds[row_perm, :], (-1,)) 54 | self._perm = inds 55 | else: 56 | self._perm = np.random.permutation(np.arange(len(self._roidb))) 57 | else: 58 | # LIMIT_RAM and 'roidb' is the path to saved gt_roidbs. 59 | index_path = self._roidb + '/image_index.json' 60 | with open(index_path, 'r') as f: 61 | self._image_index = json.load(f) 62 | print("LIMIT_RAM version and load index from {}".format(index_path)) 63 | self._perm = np.random.permutation(np.arange(len(self._image_index))) 64 | 65 | # restore the random state 66 | if self._random: 67 | np.random.set_state(st0) 68 | 69 | self._cur = 0 70 | 71 | def _get_next_minibatch_inds(self): 72 | """Return the roidb indices for the next minibatch.""" 73 | if self._cur + cfg.TRAIN.IMS_PER_BATCH >= len(self._perm): 74 | self._shuffle_roidb_inds() 75 | 76 | db_inds = self._perm[self._cur:self._cur + cfg.TRAIN.IMS_PER_BATCH] 77 | self._cur += cfg.TRAIN.IMS_PER_BATCH 78 | return db_inds 79 | 80 | def _get_next_minibatch(self): 81 | """Return the blobs to be used for the next minibatch. 
82 | 83 | If cfg.TRAIN.USE_PREFETCH is True, then blobs will be computed in a 84 | separate process and made available through self._blob_queue. 85 | """ 86 | db_inds = self._get_next_minibatch_inds() 87 | if cfg.LIMIT_RAM: 88 | assert len(db_inds) == 1, "LIMIT_RAM version only support one " \ 89 | "image per minibatch." 90 | # it is the exact file path in the 'roidb' directory. 91 | minibatch_db = self._image_index[db_inds[0]] 92 | minibatch_db = pjoin(self._roidb, "%s.pkl" % minibatch_db) 93 | else: 94 | minibatch_db = [self._roidb[i] for i in db_inds] 95 | return get_minibatch(minibatch_db) 96 | 97 | def forward(self): 98 | """Get blobs""" 99 | blobs = self._get_next_minibatch() 100 | return blobs 101 | -------------------------------------------------------------------------------- /lib/fast_rcnn/minibatch.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DenseCap-Tensorflow 3 | # Written by InnerPeace 4 | # This file is adapted from Linjie's work and Xinlei's work 5 | # -------------------------------------------------------- 6 | # Fast R-CNN 7 | # Copyright (c) 2015 Microsoft 8 | # Licensed under The MIT License [see LICENSE for details] 9 | # Written by Ross Girshick 10 | # -------------------------------------------------------- 11 | from __future__ import absolute_import 12 | from __future__ import division 13 | from __future__ import print_function 14 | 15 | """Compute minibatch blobs for training a DenseCap network.""" 16 | 17 | import numpy as np 18 | import numpy.random as npr 19 | import cv2 20 | from six.moves import cPickle, xrange 21 | from lib.config import cfg 22 | from lib.utils.blob import prep_im_for_blob, im_list_to_blob 23 | 24 | 25 | def get_minibatch(roidb): 26 | """Given a roidb, construct a minibatch sampled from it.""" 27 | 28 | if cfg.LIMIT_RAM: 29 | num_images = 1 # one image per minibatch 30 | else: 31 | num_images = len(roidb) 32 | 33 | # Sample random scales to use for each image in this batch 34 | random_scale_inds = npr.randint(0, high=len(cfg.TRAIN.SCALES), 35 | size=num_images) 36 | assert (cfg.TRAIN.BATCH_SIZE % num_images == 0), \ 37 | 'num_images ({}) must divide BATCH_SIZE ({})'. 
\ 38 | format(num_images, cfg.TRAIN.BATCH_SIZE) 39 | 40 | # Get the input image blob, formatted for caffe 41 | im_blob, im_scales, roidb = _get_image_blob(roidb, random_scale_inds) 42 | 43 | blobs = {'data': im_blob} 44 | 45 | if cfg.TRAIN.HAS_RPN: 46 | assert len(im_scales) == 1, "Single batch only" 47 | assert len(roidb) == 1, "Single batch only" 48 | # gt boxes: (x1, y1, x2, y2, cls) 49 | gt_inds = np.where(roidb[0]['gt_classes'] != 0)[0] 50 | gt_boxes = np.empty((len(gt_inds), 5), dtype=np.float32) 51 | gt_boxes[:, 0:4] = roidb[0]['boxes'][gt_inds, :] * im_scales[0] 52 | gt_boxes[:, 4] = roidb[0]['gt_classes'][gt_inds] 53 | # TODO: add "gt_phrases" 54 | blobs['gt_phrases'] = _process_gt_phrases(roidb[0]['gt_phrases']) 55 | blobs['gt_boxes'] = gt_boxes 56 | blobs['im_info'] = np.array( 57 | # TODO: for blob format stick to tf_faster_rcnn version 58 | # [[im_blob.shape[2], im_blob.shape[3], im_scales[0]]], 59 | # [[im_blob.shape[1], im_blob.shape[2], im_scales[0]]], 60 | # make it shape [3,] 61 | [im_blob.shape[1], im_blob.shape[2], im_scales[0]], 62 | dtype=np.float32) 63 | # if cfg.LIMIT_RAM: 64 | # blobs['gt_phrases'] = roidb[0]['gt_phrases'] 65 | else: # not using RPN 66 | raise NotImplementedError 67 | 68 | return blobs 69 | 70 | 71 | def _process_gt_phrases(phrases): 72 | """processing gt phrases for blob""" 73 | num_regions = len(phrases) 74 | gt_phrases = np.zeros((num_regions, cfg.MAX_WORDS), dtype=np.int32) 75 | for ix, phra in enumerate(phrases): 76 | l = len(phra) 77 | gt_phrases[ix, :l] = phra 78 | 79 | return gt_phrases 80 | 81 | 82 | def _get_image_blob(roidb, scale_inds): 83 | """Builds an input blob from the images in the roidb at the specified 84 | scales. 85 | """ 86 | num_images = len(scale_inds) 87 | processed_ims = [] 88 | im_scales = [] 89 | if cfg.LIMIT_RAM: 90 | # roidb is the pickle file path 91 | assert num_images == 1, "LIMIT_RAM version, it has to be one image." 
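# In LIMIT_RAM mode the incoming `roidb` argument is the path to a single per-image pickle (RoIDataLayer in lib/fast_rcnn/layer.py joins the roidb directory with '<image_index>.pkl'), so only one image's ground-truth regions are loaded into memory at a time.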
92 | with open(roidb, 'rb') as f: 93 | roidb = [cPickle.load(f)] 94 | 95 | for i in xrange(num_images): 96 | im = cv2.imread(roidb[i]['image']) 97 | if roidb[i]['flipped']: 98 | im = im[:, ::-1, :] 99 | target_size = cfg.TRAIN.SCALES[scale_inds[i]] 100 | im, im_scale = prep_im_for_blob(im, cfg.PIXEL_MEANS, target_size, 101 | cfg.TRAIN.MAX_SIZE) 102 | im_scales.append(im_scale) 103 | processed_ims.append(im) 104 | 105 | # Create a blob to hold the input images 106 | blob = im_list_to_blob(processed_ims) 107 | 108 | return blob, im_scales, roidb 109 | 110 | -------------------------------------------------------------------------------- /lib/fast_rcnn/nms_wrapper.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | from lib.config import cfg 12 | from lib.nms.gpu_nms import gpu_nms 13 | from lib.nms.cpu_nms import cpu_nms 14 | 15 | def nms(dets, thresh, force_cpu=False): 16 | """Dispatch to either CPU or GPU NMS implementations.""" 17 | 18 | if dets.shape[0] == 0: 19 | return [] 20 | # print "gpu_id used by nms is: %d" % cfg.GPU_ID 21 | if cfg.USE_GPU_NMS and not force_cpu: 22 | return gpu_nms(dets, thresh, device_id=cfg.GPU_ID) 23 | else: 24 | return cpu_nms(dets, thresh) 25 | -------------------------------------------------------------------------------- /lib/fast_rcnn/roidb.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | """Transform a roidb into a trainable roidb by adding a bunch of metadata.""" 12 | 13 | # import sys 14 | # sys.path.append("..") 15 | 16 | import numpy as np 17 | from lib.config import cfg 18 | from lib.fast_rcnn.bbox_transform import bbox_transform 19 | from lib.utils.cython_bbox import bbox_overlaps 20 | from PIL import Image 21 | 22 | 23 | def prepare_roidb(imdb): 24 | """Enrich the imdb's roidb by adding some derived quantities that 25 | are useful for training. This function precomputes the maximum 26 | overlap, taken over ground-truth boxes, between each ROI and 27 | each ground-truth box. The class with maximum overlap is also 28 | recorded. 
29 | """ 30 | sizes = [Image.open(imdb.image_path_at(i)).size 31 | for i in xrange(imdb.num_images)] 32 | roidb = imdb.roidb 33 | for i in xrange(len(imdb.image_index)): 34 | roidb[i]['image'] = imdb.image_path_at(i) 35 | roidb[i]['width'] = sizes[i][0] 36 | roidb[i]['height'] = sizes[i][1] 37 | # need gt_overlaps as a dense array for argmax 38 | gt_overlaps = roidb[i]['gt_overlaps'].toarray() 39 | # max overlap with gt over classes (columns) 40 | max_overlaps = gt_overlaps.max(axis=1) 41 | # gt class that had the max overlap 42 | max_classes = gt_overlaps.argmax(axis=1) 43 | roidb[i]['max_classes'] = max_classes 44 | roidb[i]['max_overlaps'] = max_overlaps 45 | # sanity checks 46 | # max overlap of 0 => class should be zero (background) 47 | zero_inds = np.where(max_overlaps == 0)[0] 48 | assert all(max_classes[zero_inds] == 0) 49 | # max overlap > 0 => class should not be zero (must be a fg class) 50 | # nonzero_inds = np.where(max_overlaps > 0)[0] 51 | # assert all(max_classes[nonzero_inds] != 0) 52 | 53 | 54 | def add_bbox_regression_targets(roidb): 55 | """Add information needed to train bounding-box regressors.""" 56 | assert len(roidb) > 0 57 | assert 'max_classes' in roidb[0], 'Did you call prepare_roidb first?' 58 | 59 | num_images = len(roidb) 60 | # Infer number of classes from the number of columns in gt_overlaps 61 | num_classes = roidb[0]['gt_overlaps'].shape[1] 62 | for im_i in xrange(num_images): 63 | rois = roidb[im_i]['boxes'] 64 | max_overlaps = roidb[im_i]['max_overlaps'] 65 | max_classes = roidb[im_i]['max_classes'] 66 | roidb[im_i]['bbox_targets'] = \ 67 | _compute_targets(rois, max_overlaps, max_classes) 68 | 69 | if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED: 70 | # Use fixed / precomputed "means" and "stds" instead of empirical values 71 | means = np.tile( 72 | np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS), (num_classes, 1)) 73 | stds = np.tile( 74 | np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS), (num_classes, 1)) 75 | else: 76 | # Compute values needed for means and stds 77 | # var(x) = E(x^2) - E(x)^2 78 | class_counts = np.zeros((num_classes, 1)) + cfg.EPS 79 | sums = np.zeros((num_classes, 4)) 80 | squared_sums = np.zeros((num_classes, 4)) 81 | for im_i in xrange(num_images): 82 | targets = roidb[im_i]['bbox_targets'] 83 | for cls in xrange(1, num_classes): 84 | cls_inds = np.where(targets[:, 0] == cls)[0] 85 | if cls_inds.size > 0: 86 | class_counts[cls] += cls_inds.size 87 | sums[cls, :] += targets[cls_inds, 1:].sum(axis=0) 88 | squared_sums[cls, :] += \ 89 | (targets[cls_inds, 1:] ** 2).sum(axis=0) 90 | 91 | means = sums / class_counts 92 | stds = np.sqrt(squared_sums / class_counts - means ** 2) 93 | 94 | print('bbox target means:') 95 | print(means) 96 | print(means[1:, :].mean(axis=0)) # ignore bg class) 97 | print('bbox target stdevs:') 98 | print(stds) 99 | print(stds[1:, :].mean(axis=0)) # ignore bg class) 100 | 101 | # Normalize targets 102 | if cfg.TRAIN.BBOX_NORMALIZE_TARGETS: 103 | print("Normalizing targets") 104 | for im_i in xrange(num_images): 105 | targets = roidb[im_i]['bbox_targets'] 106 | for cls in xrange(1, num_classes): 107 | cls_inds = np.where(targets[:, 0] == cls)[0] 108 | roidb[im_i]['bbox_targets'][cls_inds, 1:] -= means[cls, :] 109 | roidb[im_i]['bbox_targets'][cls_inds, 1:] /= stds[cls, :] 110 | else: 111 | print("NOT normalizing targets") 112 | 113 | # These values will be needed for making predictions 114 | # (the predicts will need to be unnormalized and uncentered) 115 | return means.ravel(), stds.ravel() 116 | 117 | 118 | def 
_compute_targets(rois, overlaps, labels): 119 | """Compute bounding-box regression targets for an image.""" 120 | # Indices of ground-truth ROIs 121 | gt_inds = np.where(overlaps == 1)[0] 122 | if len(gt_inds) == 0: 123 | # Bail if the image has no ground-truth ROIs 124 | return np.zeros((rois.shape[0], 5), dtype=np.float32) 125 | # Indices of examples for which we try to make predictions 126 | ex_inds = np.where(overlaps >= cfg.TRAIN.BBOX_THRESH)[0] 127 | 128 | # Get IoU overlap between each ex ROI and gt ROI 129 | ex_gt_overlaps = bbox_overlaps( 130 | np.ascontiguousarray(rois[ex_inds, :], dtype=np.float), 131 | np.ascontiguousarray(rois[gt_inds, :], dtype=np.float)) 132 | 133 | # Find which gt ROI each ex ROI has max overlap with: 134 | # this will be the ex ROI's gt target 135 | gt_assignment = ex_gt_overlaps.argmax(axis=1) 136 | gt_rois = rois[gt_inds[gt_assignment], :] 137 | ex_rois = rois[ex_inds, :] 138 | 139 | targets = np.zeros((rois.shape[0], 5), dtype=np.float32) 140 | targets[ex_inds, 0] = labels[ex_inds] 141 | targets[ex_inds, 1:] = bbox_transform(ex_rois, gt_rois) 142 | return targets 143 | -------------------------------------------------------------------------------- /lib/layers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/InnerPeace-Wu/densecap-tensorflow/2c77998f73832d5bb7324e97e8a7419cbce8398c/lib/layers/__init__.py -------------------------------------------------------------------------------- /lib/layers/anchor_target_layer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick and Xinlei Chen 6 | # -------------------------------------------------------- 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import os 12 | from lib.config import cfg 13 | import numpy as np 14 | import numpy.random as npr 15 | from lib.utils.cython_bbox import bbox_overlaps 16 | from lib.fast_rcnn.bbox_transform import bbox_transform 17 | 18 | 19 | def anchor_target_layer(rpn_cls_score, gt_boxes, im_info, _feat_stride, all_anchors, num_anchors): 20 | """Same as the anchor target layer in original Fast/er RCNN """ 21 | 22 | A = num_anchors 23 | total_anchors = all_anchors.shape[0] 24 | K = total_anchors / num_anchors 25 | 26 | # allow boxes to sit over the edge by a small amount 27 | _allowed_border = 0 28 | 29 | # map of shape (..., H, W) 30 | height, width = rpn_cls_score.shape[1:3] 31 | 32 | # only keep anchors inside the image 33 | inds_inside = np.where( 34 | (all_anchors[:, 0] >= -_allowed_border) & 35 | (all_anchors[:, 1] >= -_allowed_border) & 36 | (all_anchors[:, 2] < im_info[1] + _allowed_border) & # width 37 | (all_anchors[:, 3] < im_info[0] + _allowed_border) # height 38 | )[0] 39 | 40 | # keep only inside anchors 41 | anchors = all_anchors[inds_inside, :] 42 | 43 | # label: 1 is positive, 0 is negative, -1 is dont care 44 | labels = np.empty((len(inds_inside),), dtype=np.float32) 45 | labels.fill(-1) 46 | 47 | # overlaps between the anchors and the gt boxes 48 | # overlaps (ex, gt) 49 | overlaps = bbox_overlaps( 50 | np.ascontiguousarray(anchors, dtype=np.float), 51 | np.ascontiguousarray(gt_boxes, dtype=np.float)) 52 | argmax_overlaps = overlaps.argmax(axis=1) 53 | max_overlaps 
= overlaps[np.arange(len(inds_inside)), argmax_overlaps] 54 | gt_argmax_overlaps = overlaps.argmax(axis=0) 55 | gt_max_overlaps = overlaps[gt_argmax_overlaps, 56 | np.arange(overlaps.shape[1])] 57 | gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0] 58 | 59 | if not cfg.TRAIN.RPN_CLOBBER_POSITIVES: 60 | # assign bg labels first so that positive labels can clobber them 61 | # first set the negatives 62 | labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0 63 | 64 | # fg label: for each gt, anchor with highest overlap 65 | labels[gt_argmax_overlaps] = 1 66 | 67 | # fg label: above threshold IOU 68 | labels[max_overlaps >= cfg.TRAIN.RPN_POSITIVE_OVERLAP] = 1 69 | 70 | if cfg.TRAIN.RPN_CLOBBER_POSITIVES: 71 | # assign bg labels last so that negative labels can clobber positives 72 | labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0 73 | 74 | # subsample positive labels if we have too many 75 | num_fg = int(cfg.TRAIN.RPN_FG_FRACTION * cfg.TRAIN.RPN_BATCHSIZE) 76 | fg_inds = np.where(labels == 1)[0] 77 | if len(fg_inds) > num_fg: 78 | disable_inds = npr.choice( 79 | fg_inds, size=(len(fg_inds) - num_fg), replace=False) 80 | labels[disable_inds] = -1 81 | 82 | # subsample negative labels if we have too many 83 | num_bg = cfg.TRAIN.RPN_BATCHSIZE - np.sum(labels == 1) 84 | bg_inds = np.where(labels == 0)[0] 85 | if len(bg_inds) > num_bg: 86 | disable_inds = npr.choice( 87 | bg_inds, size=(len(bg_inds) - num_bg), replace=False) 88 | labels[disable_inds] = -1 89 | 90 | bbox_targets = np.zeros((len(inds_inside), 4), dtype=np.float32) 91 | bbox_targets = _compute_targets(anchors, gt_boxes[argmax_overlaps, :]) 92 | 93 | bbox_inside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32) 94 | # only the positive ones have regression targets 95 | bbox_inside_weights[labels == 1, :] = np.array(cfg.TRAIN.RPN_BBOX_INSIDE_WEIGHTS) 96 | 97 | bbox_outside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32) 98 | if cfg.TRAIN.RPN_POSITIVE_WEIGHT < 0: 99 | # uniform weighting of examples (given non-uniform sampling) 100 | num_examples = np.sum(labels >= 0) 101 | positive_weights = np.ones((1, 4)) * 1.0 / num_examples 102 | negative_weights = np.ones((1, 4)) * 1.0 / num_examples 103 | else: 104 | assert ((cfg.TRAIN.RPN_POSITIVE_WEIGHT > 0) & 105 | (cfg.TRAIN.RPN_POSITIVE_WEIGHT < 1)) 106 | positive_weights = (cfg.TRAIN.RPN_POSITIVE_WEIGHT / 107 | np.sum(labels == 1)) 108 | negative_weights = ((1.0 - cfg.TRAIN.RPN_POSITIVE_WEIGHT) / 109 | np.sum(labels == 0)) 110 | bbox_outside_weights[labels == 1, :] = positive_weights 111 | bbox_outside_weights[labels == 0, :] = negative_weights 112 | 113 | # map up to original set of anchors 114 | labels = _unmap(labels, total_anchors, inds_inside, fill=-1) 115 | bbox_targets = _unmap(bbox_targets, total_anchors, inds_inside, fill=0) 116 | bbox_inside_weights = _unmap(bbox_inside_weights, total_anchors, inds_inside, fill=0) 117 | bbox_outside_weights = _unmap(bbox_outside_weights, total_anchors, inds_inside, fill=0) 118 | 119 | # labels 120 | labels = labels.reshape((1, height, width, A)).transpose(0, 3, 1, 2) 121 | labels = labels.reshape((1, 1, A * height, width)) 122 | rpn_labels = labels 123 | 124 | # bbox_targets 125 | bbox_targets = bbox_targets \ 126 | .reshape((1, height, width, A * 4)) 127 | 128 | rpn_bbox_targets = bbox_targets 129 | # bbox_inside_weights 130 | bbox_inside_weights = bbox_inside_weights \ 131 | .reshape((1, height, width, A * 4)) 132 | 133 | rpn_bbox_inside_weights = bbox_inside_weights 134 | 135 | # 
bbox_outside_weights 136 | bbox_outside_weights = bbox_outside_weights \ 137 | .reshape((1, height, width, A * 4)) 138 | 139 | rpn_bbox_outside_weights = bbox_outside_weights 140 | return rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights 141 | 142 | 143 | def _unmap(data, count, inds, fill=0): 144 | """ Unmap a subset of item (data) back to the original set of items (of 145 | size count) """ 146 | if len(data.shape) == 1: 147 | ret = np.empty((count,), dtype=np.float32) 148 | ret.fill(fill) 149 | ret[inds] = data 150 | else: 151 | ret = np.empty((count,) + data.shape[1:], dtype=np.float32) 152 | ret.fill(fill) 153 | ret[inds, :] = data 154 | return ret 155 | 156 | 157 | def _compute_targets(ex_rois, gt_rois): 158 | """Compute bounding-box regression targets for an image.""" 159 | 160 | assert ex_rois.shape[0] == gt_rois.shape[0] 161 | assert ex_rois.shape[1] == 4 162 | assert gt_rois.shape[1] == 5 163 | 164 | return bbox_transform(ex_rois, gt_rois[:, :4]).astype(np.float32, copy=False) 165 | -------------------------------------------------------------------------------- /lib/layers/generate_anchors.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick and Sean Bell 6 | # -------------------------------------------------------- 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import numpy as np 12 | 13 | 14 | # Verify that we compute the same anchors as Shaoqing's matlab implementation: 15 | # 16 | # >> load output/rpn_cachedir/faster_rcnn_VOC2007_ZF_stage1_rpn/anchors.mat 17 | # >> anchors 18 | # 19 | # anchors = 20 | # 21 | # -83 -39 100 56 22 | # -175 -87 192 104 23 | # -359 -183 376 200 24 | # -55 -55 72 72 25 | # -119 -119 136 136 26 | # -247 -247 264 264 27 | # -35 -79 52 96 28 | # -79 -167 96 184 29 | # -167 -343 184 360 30 | 31 | # array([[ -83., -39., 100., 56.], 32 | # [-175., -87., 192., 104.], 33 | # [-359., -183., 376., 200.], 34 | # [ -55., -55., 72., 72.], 35 | # [-119., -119., 136., 136.], 36 | # [-247., -247., 264., 264.], 37 | # [ -35., -79., 52., 96.], 38 | # [ -79., -167., 96., 184.], 39 | # [-167., -343., 184., 360.]]) 40 | 41 | def generate_anchors(base_size=16, ratios=[0.5, 1, 2], 42 | scales=2 ** np.arange(3, 6)): 43 | """ 44 | Generate anchor (reference) windows by enumerating aspect ratios X 45 | scales wrt a reference (0, 0, 15, 15) window. 46 | """ 47 | 48 | base_anchor = np.array([1, 1, base_size, base_size]) - 1 49 | ratio_anchors = _ratio_enum(base_anchor, ratios) 50 | anchors = np.vstack([_scale_enum(ratio_anchors[i, :], scales) 51 | for i in range(ratio_anchors.shape[0])]) 52 | return anchors 53 | 54 | 55 | def _whctrs(anchor): 56 | """ 57 | Return width, height, x center, and y center for an anchor (window). 58 | """ 59 | 60 | w = anchor[2] - anchor[0] + 1 61 | h = anchor[3] - anchor[1] + 1 62 | x_ctr = anchor[0] + 0.5 * (w - 1) 63 | y_ctr = anchor[1] + 0.5 * (h - 1) 64 | return w, h, x_ctr, y_ctr 65 | 66 | 67 | def _mkanchors(ws, hs, x_ctr, y_ctr): 68 | """ 69 | Given a vector of widths (ws) and heights (hs) around a center 70 | (x_ctr, y_ctr), output a set of anchors (windows). 
71 | """ 72 | 73 | ws = ws[:, np.newaxis] 74 | hs = hs[:, np.newaxis] 75 | anchors = np.hstack((x_ctr - 0.5 * (ws - 1), 76 | y_ctr - 0.5 * (hs - 1), 77 | x_ctr + 0.5 * (ws - 1), 78 | y_ctr + 0.5 * (hs - 1))) 79 | return anchors 80 | 81 | 82 | def _ratio_enum(anchor, ratios): 83 | """ 84 | Enumerate a set of anchors for each aspect ratio wrt an anchor. 85 | """ 86 | 87 | w, h, x_ctr, y_ctr = _whctrs(anchor) 88 | size = w * h 89 | size_ratios = size / ratios 90 | ws = np.round(np.sqrt(size_ratios)) 91 | hs = np.round(ws * ratios) 92 | anchors = _mkanchors(ws, hs, x_ctr, y_ctr) 93 | return anchors 94 | 95 | 96 | def _scale_enum(anchor, scales): 97 | """ 98 | Enumerate a set of anchors for each scale wrt an anchor. 99 | """ 100 | 101 | w, h, x_ctr, y_ctr = _whctrs(anchor) 102 | ws = w * scales 103 | hs = h * scales 104 | anchors = _mkanchors(ws, hs, x_ctr, y_ctr) 105 | return anchors 106 | 107 | 108 | if __name__ == '__main__': 109 | import time 110 | 111 | t = time.time() 112 | a = generate_anchors() 113 | print(time.time() - t) 114 | print(a) 115 | from IPython import embed; 116 | 117 | embed() 118 | -------------------------------------------------------------------------------- /lib/layers/global_roi_layer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DenseCap-Tensorflow 3 | # Written by InnerPeace 4 | # This file is adapted from Xinlei's work 5 | # -------------------------------------------------------- 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import numpy as np 11 | 12 | 13 | def GlobalRoILayer(im_info): 14 | """ 15 | Set up the global RoI 16 | """ 17 | return np.array([0., 0., 0., im_info[1] - 1, im_info[0] - 1]) 18 | -------------------------------------------------------------------------------- /lib/layers/proposal_layer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Ross Girshick and Xinlei Chen 5 | # -------------------------------------------------------- 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import numpy as np 11 | from lib.config import cfg 12 | from lib.fast_rcnn.bbox_transform import bbox_transform_inv, clip_boxes 13 | from lib.fast_rcnn.nms_wrapper import nms 14 | 15 | 16 | def proposal_layer(rpn_cls_prob, rpn_bbox_pred, im_info, cfg_key, _feat_stride, anchors, num_anchors): 17 | """A simplified version compared to fast/er RCNN 18 | For details please see the technical report 19 | """ 20 | if type(cfg_key) == bytes: 21 | cfg_key = cfg_key.decode('utf-8') 22 | pre_nms_topN = cfg[cfg_key].RPN_PRE_NMS_TOP_N 23 | post_nms_topN = cfg[cfg_key].RPN_POST_NMS_TOP_N 24 | nms_thresh = cfg[cfg_key].RPN_NMS_THRESH 25 | 26 | # Get the scores and bounding boxes 27 | scores = rpn_cls_prob[:, :, :, num_anchors:] 28 | rpn_bbox_pred = rpn_bbox_pred.reshape((-1, 4)) 29 | scores = scores.reshape((-1, 1)) 30 | proposals = bbox_transform_inv(anchors, rpn_bbox_pred) 31 | if cfg.DEBUG_ALL: 32 | print ('number of proposals before clip boxes to image board: {}'.format( 33 | proposals.shape[0] 34 | )) 35 | proposals = clip_boxes(proposals, im_info[:2]) 36 | 37 | # remove predicted boxes with either height or width < 
threshold 38 | # (NOTE: convert min_size to input image scale stored in im_info[2]) 39 | if cfg.FILTER_SMALL_BOX: 40 | min_size = cfg[cfg_key].RPN_MIN_SIZE 41 | keep = _filter_boxes(proposals, min_size * im_info[2]) 42 | proposals = proposals[keep, :] 43 | scores = scores[keep] 44 | 45 | # Pick the top region proposals 46 | order = scores.ravel().argsort()[::-1] 47 | if pre_nms_topN > 0: 48 | order = order[:pre_nms_topN] 49 | proposals = proposals[order, :] 50 | scores = scores[order] 51 | 52 | # Non-maximal suppression 53 | if cfg.DEBUG_ALL: 54 | print("number of proposals before nms: {}".format(proposals.shape[0])) 55 | keep = nms(np.hstack((proposals, scores)), nms_thresh) 56 | if cfg.DEBUG_ALL: 57 | print("number of proposals after nms: {}".format(len(keep))) 58 | 59 | # Pick th top region proposals after NMS 60 | if post_nms_topN > 0: 61 | keep = keep[:post_nms_topN] 62 | proposals = proposals[keep, :] 63 | scores = scores[keep] 64 | 65 | # Only support single image as input 66 | batch_inds = np.zeros((proposals.shape[0], 1), dtype=np.float32) 67 | blob = np.hstack((batch_inds, proposals.astype(np.float32, copy=False))) 68 | 69 | return blob, scores 70 | 71 | 72 | def _filter_boxes(boxes, min_size): 73 | """Remove all boxes with any side smaller than min_size.""" 74 | 75 | ws = boxes[:, 2] - boxes[:, 0] + 1 76 | hs = boxes[:, 3] - boxes[:, 1] + 1 77 | keep = np.where((ws >= min_size) & (hs >= min_size))[0] 78 | return keep 79 | -------------------------------------------------------------------------------- /lib/layers/proposal_target_layer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick, Sean Bell and Xinlei Chen 6 | # -------------------------------------------------------- 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import numpy as np 12 | import numpy.random as npr 13 | from lib.config import cfg 14 | from lib.fast_rcnn.bbox_transform import bbox_transform 15 | from lib.utils.cython_bbox import bbox_overlaps 16 | 17 | 18 | def proposal_target_layer(rpn_rois, rpn_scores, gt_boxes, _num_classes): 19 | """ 20 | Assign object detection proposals to ground-truth targets. Produces proposal 21 | classification labels and bounding-box regression targets. 
22 | """ 23 | 24 | # Proposal ROIs (0, x1, y1, x2, y2) coming from RPN 25 | # (i.e., layers.proposal_layer.ProposalLayer), or any other source 26 | all_rois = rpn_rois 27 | all_scores = rpn_scores 28 | 29 | # Include ground-truth boxes in the set of candidate rois 30 | if cfg.TRAIN.USE_GT: 31 | zeros = np.zeros((gt_boxes.shape[0], 1), dtype=gt_boxes.dtype) 32 | all_rois = np.vstack( 33 | (all_rois, np.hstack((zeros, gt_boxes[:, :-1]))) 34 | ) 35 | # not sure if it a wise appending, but anyway i am not using it 36 | all_scores = np.vstack((all_scores, zeros)) 37 | 38 | num_images = 1 39 | rois_per_image = cfg.TRAIN.BATCH_SIZE / num_images 40 | fg_rois_per_image = np.round(cfg.TRAIN.FG_FRACTION * rois_per_image) 41 | 42 | # Sample rois with classification labels and bounding box regression 43 | # targets 44 | labels, rois, roi_scores, bbox_targets, bbox_inside_weights = _sample_rois( 45 | all_rois, all_scores, gt_boxes, fg_rois_per_image, 46 | rois_per_image, _num_classes) 47 | 48 | rois = rois.reshape(-1, 5) 49 | roi_scores = roi_scores.reshape(-1) 50 | labels = labels.reshape(-1, 1) 51 | bbox_targets = bbox_targets.reshape(-1, _num_classes * 4) 52 | bbox_inside_weights = bbox_inside_weights.reshape(-1, _num_classes * 4) 53 | bbox_outside_weights = np.array(bbox_inside_weights > 0).astype(np.float32) 54 | 55 | return rois, roi_scores, labels, bbox_targets, bbox_inside_weights, bbox_outside_weights 56 | 57 | 58 | def _get_bbox_regression_labels(bbox_target_data, num_classes): 59 | """Bounding-box regression targets (bbox_target_data) are stored in a 60 | compact form N x (class, tx, ty, tw, th) 61 | 62 | This function expands those targets into the 4-of-4*K representation used 63 | by the network (i.e. only one class has non-zero targets). 64 | 65 | Returns: 66 | bbox_target (ndarray): N x 4K blob of regression targets 67 | bbox_inside_weights (ndarray): N x 4K blob of loss weights 68 | """ 69 | 70 | clss = bbox_target_data[:, 0] 71 | bbox_targets = np.zeros((clss.size, 4 * num_classes), dtype=np.float32) 72 | bbox_inside_weights = np.zeros(bbox_targets.shape, dtype=np.float32) 73 | inds = np.where(clss > 0)[0] 74 | for ind in inds: 75 | cls = clss[ind] 76 | start = int(4 * cls) 77 | end = start + 4 78 | bbox_targets[ind, start:end] = bbox_target_data[ind, 1:] 79 | bbox_inside_weights[ind, start:end] = cfg.TRAIN.BBOX_INSIDE_WEIGHTS 80 | return bbox_targets, bbox_inside_weights 81 | 82 | 83 | def _compute_targets(ex_rois, gt_rois, labels): 84 | """Compute bounding-box regression targets for an image.""" 85 | 86 | assert ex_rois.shape[0] == gt_rois.shape[0] 87 | assert ex_rois.shape[1] == 4 88 | assert gt_rois.shape[1] == 4 89 | 90 | targets = bbox_transform(ex_rois, gt_rois) 91 | if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED: 92 | # Optionally normalize targets by a precomputed mean and stdev 93 | targets = ((targets - np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS)) 94 | / np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS)) 95 | return np.hstack( 96 | (labels[:, np.newaxis], targets)).astype(np.float32, copy=False) 97 | 98 | 99 | def _sample_rois(all_rois, all_scores, gt_boxes, fg_rois_per_image, rois_per_image, num_classes): 100 | """Generate a random sample of RoIs comprising foreground and background 101 | examples. 
102 | """ 103 | # overlaps: (rois x gt_boxes) 104 | overlaps = bbox_overlaps( 105 | np.ascontiguousarray(all_rois[:, 1:5], dtype=np.float), 106 | np.ascontiguousarray(gt_boxes[:, :4], dtype=np.float)) 107 | gt_assignment = overlaps.argmax(axis=1) 108 | max_overlaps = overlaps.max(axis=1) 109 | labels = gt_boxes[gt_assignment, 4] 110 | 111 | # Select foreground RoIs as those with >= FG_THRESH overlap 112 | fg_inds = np.where(max_overlaps >= cfg.TRAIN.FG_THRESH)[0] 113 | # Guard against the case when an image has fewer than fg_rois_per_image 114 | # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI) 115 | bg_inds = np.where((max_overlaps < cfg.TRAIN.BG_THRESH_HI) & 116 | (max_overlaps >= cfg.TRAIN.BG_THRESH_LO))[0] 117 | 118 | # Small modification to the original version where we ensure a fixed number of regions are sampled 119 | if fg_inds.size > 0 and bg_inds.size > 0: 120 | fg_rois_per_image = min(fg_rois_per_image, fg_inds.size) 121 | fg_inds = npr.choice(fg_inds, size=int(fg_rois_per_image), replace=False) 122 | bg_rois_per_image = rois_per_image - fg_rois_per_image 123 | to_replace = bg_inds.size < bg_rois_per_image 124 | bg_inds = npr.choice(bg_inds, size=int(bg_rois_per_image), replace=to_replace) 125 | elif fg_inds.size > 0: 126 | to_replace = fg_inds.size < rois_per_image 127 | fg_inds = npr.choice(fg_inds, size=int(rois_per_image), replace=to_replace) 128 | fg_rois_per_image = rois_per_image 129 | elif bg_inds.size > 0: 130 | to_replace = bg_inds.size < rois_per_image 131 | bg_inds = npr.choice(bg_inds, size=int(rois_per_image), replace=to_replace) 132 | fg_rois_per_image = 0 133 | else: 134 | import pdb 135 | pdb.set_trace() 136 | 137 | # The indices that we're selecting (both fg and bg) 138 | keep_inds = np.append(fg_inds, bg_inds) 139 | # Select sampled values from various arrays: 140 | labels = labels[keep_inds] 141 | # Clamp labels for the background RoIs to 0 142 | labels[int(fg_rois_per_image):] = 0 143 | rois = all_rois[keep_inds] 144 | roi_scores = all_scores[keep_inds] 145 | 146 | bbox_target_data = _compute_targets( 147 | rois[:, 1:5], gt_boxes[gt_assignment[keep_inds], :4], labels) 148 | 149 | bbox_targets, bbox_inside_weights = \ 150 | _get_bbox_regression_labels(bbox_target_data, num_classes) 151 | 152 | return labels, rois, roi_scores, bbox_targets, bbox_inside_weights 153 | -------------------------------------------------------------------------------- /lib/layers/proposal_target_single_class_layer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DenseCap-Tensorflow 3 | # Written by InnerPeace 4 | # This file is adapted from Xinlei's work 5 | # -------------------------------------------------------- 6 | # Faster R-CNN 7 | # Copyright (c) 2015 Microsoft 8 | # Licensed under The MIT License [see LICENSE for details] 9 | # Written by Ross Girshick, Sean Bell and Xinlei Chen 10 | # -------------------------------------------------------- 11 | from __future__ import absolute_import 12 | from __future__ import division 13 | from __future__ import print_function 14 | 15 | import numpy as np 16 | import numpy.random as npr 17 | from lib.config import cfg 18 | from lib.fast_rcnn.bbox_transform import bbox_transform 19 | from lib.utils.cython_bbox import bbox_overlaps 20 | from lib.layers.rois_offset_layer import compute_rois_offset 21 | 22 | 23 | def proposal_target_single_class_layer(rpn_rois, rpn_scores, gt_boxes, gt_phrases): 24 | """ 25 | Assign 
object detection proposals to ground-truth targets. Produces proposal 26 | classification labels and bounding-box regression targets. 27 | """ 28 | 29 | # Proposal ROIs (0, x1, y1, x2, y2) coming from RPN 30 | # (i.e., layers.proposal_layer.ProposalLayer), or any other source 31 | all_rois = rpn_rois 32 | all_scores = rpn_scores 33 | 34 | # Include ground-truth boxes in the set of candidate rois 35 | if cfg.TRAIN.USE_GT: 36 | zeros = np.zeros((gt_boxes.shape[0], 1), dtype=gt_boxes.dtype) 37 | all_rois = np.vstack( 38 | (all_rois, np.hstack((zeros, gt_boxes[:, :-1]))) 39 | ) 40 | # not sure if it a wise appending, but anyway i am not using it 41 | all_scores = np.vstack((all_scores, zeros)) 42 | 43 | num_images = 1 44 | rois_per_image = cfg.TRAIN.BATCH_SIZE // num_images 45 | fg_rois_per_image = int(cfg.TRAIN.FG_FRACTION * rois_per_image) 46 | 47 | # Sample rois with classification labels and bounding box regression 48 | # targets 49 | labels, rois, roi_scores, bbox_targets, bbox_inside_weights, phrases = _sample_rois( 50 | all_rois, all_scores, gt_boxes, gt_phrases, fg_rois_per_image, 51 | rois_per_image) 52 | 53 | rois = rois.reshape(-1, 5) 54 | roi_scores = roi_scores.reshape(-1) 55 | labels = labels.reshape(-1, 1) 56 | phrases = phrases.reshape(-1, cfg.MAX_WORDS) 57 | bbox_targets = bbox_targets.reshape(-1, 4) 58 | bbox_inside_weights = bbox_inside_weights.reshape(-1, 4) 59 | bbox_outside_weights = np.array(bbox_inside_weights > 0).astype(np.float32) 60 | clss = np.array(labels > 0).astype(np.int32) 61 | 62 | return rois, roi_scores, labels, bbox_targets, \ 63 | bbox_inside_weights, bbox_outside_weights, clss, phrases 64 | 65 | 66 | def _get_bbox_regression_labels(bbox_target_data): 67 | """Bounding-box regression targets (bbox_target_data) are stored in a 68 | compact form N x (class, tx, ty, tw, th) 69 | 70 | Returns: 71 | bbox_target (ndarray): N x 4 blob of regression targets 72 | bbox_inside_weights (ndarray): N x 4 blob of loss weights 73 | """ 74 | 75 | clss = bbox_target_data[:, 0] 76 | bbox_targets = np.zeros((clss.size, 4), dtype=np.float32) 77 | bbox_inside_weights = np.zeros(bbox_targets.shape, dtype=np.float32) 78 | inds = np.where(clss > 0)[0] 79 | for ind in inds: 80 | bbox_targets[ind, :] = bbox_target_data[ind, 1:] 81 | bbox_inside_weights[ind, :] = cfg.TRAIN.BBOX_INSIDE_WEIGHTS 82 | return bbox_targets, bbox_inside_weights 83 | 84 | 85 | def _compute_targets(ex_rois, gt_rois, labels): 86 | """Compute bounding-box regression targets for an image.""" 87 | 88 | assert ex_rois.shape[0] == gt_rois.shape[0] 89 | assert ex_rois.shape[1] == 4 90 | assert gt_rois.shape[1] == 4 91 | 92 | targets = bbox_transform(ex_rois, gt_rois) 93 | if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED: 94 | # Optionally normalize targets by a precomputed mean and stdev 95 | targets = ((targets - np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS)) 96 | / np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS)) 97 | return np.hstack( 98 | (labels[:, np.newaxis], targets)).astype(np.float32, copy=False) 99 | 100 | 101 | def _sample_rois(all_rois, all_scores, gt_boxes, gt_phrases, fg_rois_per_image, rois_per_image): 102 | """Generate a random sample of RoIs comprising foreground and background 103 | examples. 
104 | """ 105 | # overlaps: (rois x gt_boxes) 106 | overlaps = bbox_overlaps( 107 | np.ascontiguousarray(all_rois[:, 1:5], dtype=np.float), 108 | np.ascontiguousarray(gt_boxes[:, :4], dtype=np.float)) 109 | gt_assignment = overlaps.argmax(axis=1) 110 | max_overlaps = overlaps.max(axis=1) 111 | labels = gt_boxes[gt_assignment, 4] 112 | phrases = gt_phrases[gt_assignment] 113 | 114 | # Select foreground RoIs as those with >= FG_THRESH overlap 115 | fg_inds = np.where(max_overlaps >= cfg.TRAIN.FG_THRESH)[0] 116 | # Guard against the case when an image has fewer than fg_rois_per_image 117 | # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI) 118 | bg_inds = np.where((max_overlaps < cfg.TRAIN.BG_THRESH_HI) & 119 | (max_overlaps >= cfg.TRAIN.BG_THRESH_LO))[0] 120 | 121 | # Small modification to the original version where we ensure a fixed number of regions are sampled 122 | if cfg.SAMPLE_NUM_FIXED_REGIONS: 123 | if fg_inds.size > 0 and bg_inds.size > 0: 124 | fg_rois_per_image = min(fg_rois_per_image, fg_inds.size) 125 | fg_inds = npr.choice(fg_inds, size=int(fg_rois_per_image), replace=False) 126 | bg_rois_per_image = rois_per_image - fg_rois_per_image 127 | to_replace = bg_inds.size < bg_rois_per_image 128 | bg_inds = npr.choice(bg_inds, size=int(bg_rois_per_image), replace=to_replace) 129 | elif fg_inds.size > 0: 130 | to_replace = fg_inds.size < rois_per_image 131 | fg_inds = npr.choice(fg_inds, size=int(rois_per_image), replace=to_replace) 132 | fg_rois_per_image = rois_per_image 133 | elif bg_inds.size > 0: 134 | to_replace = bg_inds.size < rois_per_image 135 | bg_inds = npr.choice(bg_inds, size=int(rois_per_image), replace=to_replace) 136 | fg_rois_per_image = 0 137 | else: 138 | import pdb 139 | pdb.set_trace() 140 | else: 141 | # foreground RoIs 142 | fg_rois_per_this_image = min(fg_rois_per_image, fg_inds.size) 143 | # Sample foreground regions without replacement 144 | if fg_inds.size > 0: 145 | fg_inds = npr.choice(fg_inds, size=fg_rois_per_this_image, replace=False) 146 | 147 | # Compute number of background RoIs to take from this image (guarding 148 | # against there being fewer than desired) 149 | bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image 150 | bg_rois_per_this_image = min(bg_rois_per_this_image, bg_inds.size) 151 | # Sample background regions without replacement 152 | if bg_inds.size > 0: 153 | bg_inds = npr.choice(bg_inds, size=bg_rois_per_this_image, replace=False) 154 | 155 | # The indices that we're selecting (both fg and bg) 156 | keep_inds = np.append(fg_inds, bg_inds) 157 | # Select sampled values from various arrays: 158 | labels = labels[keep_inds] 159 | phrases = phrases[keep_inds] 160 | # Clamp labels for the background RoIs to 0 161 | labels[int(fg_rois_per_image):] = 0 162 | phrases[int(fg_rois_per_image):, :] = 0 163 | rois = all_rois[keep_inds] 164 | roi_scores = all_scores[keep_inds] 165 | 166 | bbox_target_data = _compute_targets( 167 | rois[:, 1:5], gt_boxes[gt_assignment[keep_inds], :4], labels) 168 | 169 | if cfg.DEBUG_ALL: 170 | target_boxes = compute_rois_offset(rois[:, 1:5], bbox_target_data[:, 1:5]) 171 | match_boxes = gt_boxes[gt_assignment[keep_inds], :4] 172 | print('boxes consistency check') 173 | print(target_boxes[:2,:]) 174 | print(match_boxes[:2,:]) 175 | assert np.linalg.norm(target_boxes - match_boxes) < 0.01 176 | 177 | bbox_targets, bbox_inside_weights = \ 178 | _get_bbox_regression_labels(bbox_target_data) 179 | 180 | return labels, rois, roi_scores, bbox_targets, bbox_inside_weights, phrases 181 | 
-------------------------------------------------------------------------------- /lib/layers/proposal_top_layer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Xinlei Chen 5 | # -------------------------------------------------------- 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import numpy as np 11 | from lib.config import cfg 12 | from lib.fast_rcnn.bbox_transform import bbox_transform_inv, clip_boxes 13 | import numpy.random as npr 14 | 15 | 16 | def proposal_top_layer(rpn_cls_prob, rpn_bbox_pred, im_info, _feat_stride, anchors, num_anchors): 17 | """A layer that just selects the top region proposals 18 | without using non-maximal suppression, 19 | For details please see the technical report 20 | """ 21 | rpn_top_n = cfg.TEST.RPN_TOP_N 22 | 23 | scores = rpn_cls_prob[:, :, :, num_anchors:] 24 | 25 | rpn_bbox_pred = rpn_bbox_pred.reshape((-1, 4)) 26 | scores = scores.reshape((-1, 1)) 27 | 28 | length = scores.shape[0] 29 | if length < rpn_top_n: 30 | # Random selection, maybe unnecessary and loses good proposals 31 | # But such case rarely happens 32 | top_inds = npr.choice(length, size=rpn_top_n, replace=True) 33 | else: 34 | top_inds = scores.argsort(0)[::-1] 35 | top_inds = top_inds[:rpn_top_n] 36 | top_inds = top_inds.reshape(rpn_top_n, ) 37 | 38 | # Do the selection here 39 | anchors = anchors[top_inds, :] 40 | rpn_bbox_pred = rpn_bbox_pred[top_inds, :] 41 | scores = scores[top_inds] 42 | 43 | # Convert anchors into proposals via bbox transformations 44 | proposals = bbox_transform_inv(anchors, rpn_bbox_pred) 45 | 46 | # Clip predicted boxes to image 47 | proposals = clip_boxes(proposals, im_info[:2]) 48 | 49 | # Output rois blob 50 | # Our RPN implementation only supports a single input image, so all 51 | # batch inds are 0 52 | batch_inds = np.zeros((proposals.shape[0], 1), dtype=np.float32) 53 | blob = np.hstack((batch_inds, proposals.astype(np.float32, copy=False))) 54 | return blob, scores 55 | -------------------------------------------------------------------------------- /lib/layers/rois_offset_layer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DenseCap-Tensorflow 3 | # Written by InnerPeace 4 | # This file is adapted from Linjie's work 5 | # -------------------------------------------------------- 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import numpy as np 11 | from lib.config import cfg 12 | from lib.fast_rcnn.bbox_transform import bbox_transform_inv, clip_boxes 13 | 14 | 15 | # compute the new bboxes shifted by offset from rois 16 | def compute_rois_offset(rois, offset, im_info=None): 17 | """Compute bounding-box offset for region of interests""" 18 | 19 | assert rois.shape[1] == 4 20 | assert offset.shape[1] == 4 21 | 22 | if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED: 23 | # Optionally normalize targets by a precomputed mean and stdev -- reverse the transformation 24 | offset_unnorm = offset * np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS) + \ 25 | np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS) 26 | else: 27 | offset_unnorm = offset.copy() 28 | rois_offset = bbox_transform_inv(rois, offset_unnorm) 29 | if 
not im_info is None: 30 | rois_offset = clip_boxes(rois_offset, im_info[:2]) 31 | return rois_offset 32 | -------------------------------------------------------------------------------- /lib/layers/sentence_data_layer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DenseCap-Tensorflow 3 | # Written by InnerPeace 4 | # This file is adapted from Linjie's work 5 | # -------------------------------------------------------- 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | """This python layer accepts region ids as input and 11 | retrieves region sentense for them.""" 12 | 13 | from six.moves import cPickle 14 | from lib.config import cfg 15 | from collections import Counter 16 | import numpy as np 17 | import six 18 | from six.moves import xrange 19 | 20 | # TODO: disable debug and clear stuff 21 | DEBUG = True 22 | 23 | 24 | def sentence_data_layer(labels, roi_phrases, time_steps=12, mode='concat'): 25 | all_modes = ('repeat', 'concat') 26 | assert (mode in all_modes), "Wrong type of mode which should be 'repeat' or 'concat'" 27 | 28 | if cfg.DEBUG_ALL: 29 | print('length of labels, i.e. number of regions: {}'.format(len(roi_phrases))) 30 | 31 | # all_regions is a dict from region id to caption stream 32 | assert len(labels.shape) == 2, 'Pleace check the shape of "label"' 33 | 34 | num_regions = labels.shape[0] 35 | if mode == 'repeat': 36 | input_sentence = np.zeros((num_regions, time_steps), dtype=np.float32) 37 | elif mode == 'concat': 38 | input_sentence = np.zeros((num_regions, time_steps - 1), dtype=np.float32) 39 | 40 | target_sentence = np.zeros((num_regions, time_steps), dtype=np.float32) 41 | cont_sentence = np.zeros((num_regions, time_steps), dtype=np.float32) 42 | cont_bbox = np.zeros((num_regions, time_steps), dtype=np.float32) 43 | for i in xrange(num_regions): 44 | stream = get_streams(roi_phrases[i], int(labels[i]), time_steps, mode) 45 | input_sentence[i, :] = stream['input_sentence'] 46 | target_sentence[i, :] = stream['target_sentence'] 47 | cont_sentence[i, :] = stream['cont_sentence'] 48 | cont_bbox[i, :] = stream['cont_bbox'] 49 | 50 | if cfg.DEBUG_ALL: 51 | print('sentence data layer input (first 3)') 52 | for ix, l in enumerate(labels[:3]): 53 | print(l[0], roi_phrases[ix]) 54 | print('sentence data layer output (first 3)') 55 | print('input sentence') 56 | print(input_sentence[:3, :]) 57 | print('target sentence') 58 | print(target_sentence[:3, :]) 59 | print('cont sentence') 60 | print(cont_sentence[:3, :]) 61 | print('cont bbox') 62 | print(cont_bbox[:3, :]) 63 | 64 | return input_sentence, target_sentence, cont_sentence, cont_bbox 65 | 66 | 67 | def get_streams(phrases, region_id, time_steps=12, mode='concat'): 68 | 69 | if mode == 'repeat': 70 | # Image features repeated at each time step 71 | if region_id > 0: 72 | stream = phrases[:np.sum(phrases > 0)] 73 | stream = stream.tolist() 74 | pad = time_steps - (len(stream) + 1) 75 | out = {} 76 | out['cont_sentence'] = [0] + [1] * len(stream) + [0] * pad 77 | out['input_sentence'] = [1] + stream + [0] * pad 78 | out['target_sentence'] = stream + [2] + [0] * pad 79 | # only make prediction at the last time step for bbox 80 | out['cont_bbox'] = [0] * len(stream) + [1] + [0] * pad 81 | 82 | for key, val in six.iteritems(out): 83 | if len(val) > time_steps: 84 | out[key] = val[:time_steps] 85 | else: 86 | # negative sample, no phrase 
related 87 | out = {} 88 | out['cont_sentence'] = [0] * time_steps 89 | out['input_sentence'] = [0] * time_steps 90 | out['target_sentence'] = [0] * time_steps 91 | out['cont_bbox'] = [0] * time_steps 92 | 93 | elif mode == 'concat': 94 | # Image feature concatenated to the first time step 95 | if region_id > 0: 96 | # stream = phrases[region_id] 97 | stream = phrases[:np.sum(phrases > 0)] 98 | stream = stream.tolist() 99 | pad = time_steps - (len(stream) + 2) 100 | out = {} 101 | out['cont_sentence'] = [0] + [1] * (len(stream) + 1) + [0] * pad 102 | out['input_sentence'] = [1] + stream + [0] * pad 103 | out['target_sentence'] = [1] + stream + [2] + [0] * pad 104 | # only make prediction at the last time step for bbox 105 | out['cont_bbox'] = [0] * (len(stream) + 1) + [1] + [0] * pad 106 | 107 | for key, val in six.iteritems(out): 108 | if len(val) > time_steps: 109 | out[key] = val[:time_steps] 110 | else: 111 | # negative sample, no phrase related 112 | out = {} 113 | out['cont_sentence'] = [0] * time_steps 114 | out['input_sentence'] = [0] * (time_steps - 1) 115 | out['target_sentence'] = [0] * time_steps 116 | out['cont_bbox'] = [0] * time_steps 117 | else: 118 | # Global feature and region feature concatenated to the first time step 119 | if region_id > 0: 120 | stream = phrases[region_id] 121 | stream = stream.tolist() 122 | pad = time_steps - (len(stream) + 3) 123 | out = {} 124 | out['cont_sentence'] = [0] + [1] * (len(stream) + 2) + [0] * pad 125 | out['input_sentence'] = [1] + stream + [0] * pad 126 | out['target_sentence'] = [1, 1] + stream + [2] + [0] * pad 127 | # only make prediction at the last time step for bbox 128 | out['cont_bbox'] = [0] * (len(stream) + 2) + [1] + [0] * pad 129 | 130 | for key, val in out.iteritems(): 131 | if len(val) > time_steps: 132 | out[key] = val[:time_steps] 133 | else: 134 | # negative sample, no phrase related 135 | out = {} 136 | out['cont_sentence'] = [0] * time_steps 137 | out['input_sentence'] = [0] * (time_steps - 2) 138 | out['target_sentence'] = [0] * time_steps 139 | out['cont_bbox'] = [0] * time_steps 140 | 141 | return out 142 | -------------------------------------------------------------------------------- /lib/layers/snippets.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Tensorflow Faster R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Xinlei Chen 5 | # -------------------------------------------------------- 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import numpy as np 11 | from lib.layers.generate_anchors import generate_anchors 12 | 13 | 14 | def generate_anchors_pre(height, width, feat_stride, anchor_scales=(8, 16, 32), anchor_ratios=(0.5, 1, 2)): 15 | """ A wrapper function to generate anchors given different scales 16 | Also return the number of anchors in variable 'length' 17 | """ 18 | anchors = generate_anchors(ratios=np.array(anchor_ratios), scales=np.array(anchor_scales)) 19 | A = anchors.shape[0] 20 | shift_x = np.arange(0, width) * feat_stride 21 | shift_y = np.arange(0, height) * feat_stride 22 | shift_x, shift_y = np.meshgrid(shift_x, shift_y) 23 | shifts = np.vstack((shift_x.ravel(), shift_y.ravel(), shift_x.ravel(), shift_y.ravel())).transpose() 24 | K = shifts.shape[0] 25 | # width changes faster, so here it is H, W, C 26 | anchors = anchors.reshape((1, A, 4)) + shifts.reshape((1, K, 
4)).transpose((1, 0, 2)) 27 | anchors = anchors.reshape((K * A, 4)).astype(np.float32, copy=False) 28 | length = np.int32(anchors.shape[0]) 29 | 30 | return anchors, length 31 | -------------------------------------------------------------------------------- /lib/limit_ram/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/InnerPeace-Wu/densecap-tensorflow/2c77998f73832d5bb7324e97e8a7419cbce8398c/lib/limit_ram/__init__.py -------------------------------------------------------------------------------- /lib/limit_ram/utils.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DenseCap-Tensorflow 3 | # Written by InnerPeace 4 | # This file is adapted from Ross Girshick's work 5 | # -------------------------------------------------------- 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | """functions for LIMIT_RAM version""" 11 | 12 | # import sys 13 | # sys.path.append("..") 14 | 15 | import numpy as np 16 | from lib.config import cfg 17 | 18 | 19 | def pre_roidb(roidb): 20 | """Enrich the imdb's roidb by adding some derived quantities that 21 | are useful for training. This function precomputes the maximum 22 | overlap, taken over ground-truth boxes, between each ROI and 23 | each ground-truth box. The class with maximum overlap is also 24 | recorded. 25 | """ 26 | # need gt_overlaps as a dense array for argmax 27 | gt_overlaps = roidb['gt_overlaps'].toarray() 28 | # max overlap with gt over classes (columns) 29 | max_overlaps = gt_overlaps.max(axis=1) 30 | # gt class that had the max overlap 31 | max_classes = gt_overlaps.argmax(axis=1) 32 | roidb['max_classes'] = max_classes 33 | roidb['max_overlaps'] = max_overlaps 34 | # sanity checks 35 | # max overlap of 0 => class should be zero (background) 36 | zero_inds = np.where(max_overlaps == 0)[0] 37 | assert all(max_classes[zero_inds] == 0) 38 | # max overlap > 0 => class should not be zero (must be a fg class) 39 | # nonzero_inds = np.where(max_overlaps > 0)[0] 40 | # assert all(max_classes[nonzero_inds] != 0) 41 | return roidb 42 | 43 | 44 | def is_valid_limitRam(entry): 45 | # Valid images have: 46 | # (1) At least one foreground RoI OR 47 | # (2) At least one background RoI 48 | overlaps = entry['max_overlaps'] 49 | # find boxes with sufficient overlap 50 | fg_inds = np.where(overlaps >= cfg.TRAIN.FG_THRESH)[0] 51 | # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI) 52 | bg_inds = np.where((overlaps < cfg.TRAIN.BG_THRESH_HI) & 53 | (overlaps >= cfg.TRAIN.BG_THRESH_LO))[0] 54 | # image is only valid if such boxes exist 55 | valid = len(fg_inds) > 0 or len(bg_inds) > 0 56 | return valid 57 | 58 | 59 | def flip_image(roidb): 60 | """flip image and change the name for reading later""" 61 | 62 | boxes = roidb['boxes'].copy() 63 | oldx1 = boxes[:, 0].copy() 64 | oldx2 = boxes[:, 2].copy() 65 | boxes[:, 0] = roidb['width'] - oldx2 - 1 66 | boxes[:, 2] = roidb['width'] - oldx1 - 1 67 | assert (boxes[:, 2] >= boxes[:, 0]).all() 68 | entry = {'boxes': boxes, 69 | 'gt_overlaps': roidb['gt_overlaps'], 70 | 'gt_classes': roidb['gt_classes'], 71 | 'flipped': True, 72 | 'gt_phrases': roidb['gt_phrases'], 73 | 'width': roidb['width'], 74 | 'height': roidb['height'], 75 | 'image': roidb['image'], 76 | 'image_id': '%s_flip' % roidb['image_id']} 77 | 78 | return entry 79 | 
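Note: flip_image() above augments training data by mirroring each image horizontally; only the x coordinates of the boxes change, y is untouched. A minimal stand-alone sketch of that coordinate transform (the toy width and boxes below are made up for illustration, not taken from the dataset):

# Horizontal flip of [x1, y1, x2, y2] boxes, as done in flip_image() above.
import numpy as np

def flip_boxes(boxes, width):
    """Mirror boxes about the vertical image axis: x1, x2 -> width-1-x2, width-1-x1."""
    flipped = boxes.copy()
    oldx1 = boxes[:, 0].copy()
    oldx2 = boxes[:, 2].copy()
    flipped[:, 0] = width - oldx2 - 1
    flipped[:, 2] = width - oldx1 - 1
    assert (flipped[:, 2] >= flipped[:, 0]).all()
    return flipped

boxes = np.array([[10., 20., 59., 80.],
                  [ 0.,  0., 99., 49.]])
print(flip_boxes(boxes, width=100))
# [[40. 20. 89. 80.]
#  [ 0.  0. 99. 49.]]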
-------------------------------------------------------------------------------- /lib/nets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/InnerPeace-Wu/densecap-tensorflow/2c77998f73832d5bb7324e97e8a7419cbce8398c/lib/nets/__init__.py -------------------------------------------------------------------------------- /lib/nets/vgg16.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Tensorflow Faster R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Xinlei Chen 5 | # -------------------------------------------------------- 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import tensorflow as tf 11 | import tensorflow.contrib.slim as slim 12 | from tensorflow.contrib.slim import losses 13 | from tensorflow.contrib.slim import arg_scope 14 | import numpy as np 15 | 16 | from lib.nets.network import Network 17 | from lib.config import cfg 18 | 19 | 20 | class vgg16(Network): 21 | def __init__(self): 22 | Network.__init__(self) 23 | self._feat_stride = [16, ] 24 | self._feat_compress = [1. / float(self._feat_stride[0]), ] 25 | self._scope = 'DenseCap_VGG16' 26 | self._vgg_scope = 'vgg_16' 27 | 28 | def _image_to_head(self, is_training, reuse=None): 29 | with tf.variable_scope(self._vgg_scope, self._vgg_scope, reuse=reuse): 30 | net = slim.repeat(self._image, 2, slim.conv2d, 64, [3, 3], 31 | trainable=False, scope='conv1') 32 | net = slim.max_pool2d(net, [2, 2], padding='SAME', scope='pool1') 33 | net = slim.repeat(net, 2, slim.conv2d, 128, [3, 3], 34 | trainable=False, scope='conv2') 35 | net = slim.max_pool2d(net, [2, 2], padding='SAME', scope='pool2') 36 | net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3], 37 | trainable=is_training, scope='conv3') 38 | net = slim.max_pool2d(net, [2, 2], padding='SAME', scope='pool3') 39 | net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], 40 | trainable=is_training, scope='conv4') 41 | net = slim.max_pool2d(net, [2, 2], padding='SAME', scope='pool4') 42 | net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], 43 | trainable=is_training, scope='conv5') 44 | 45 | self._act_summaries.append(net) 46 | self._layers['head'] = net 47 | 48 | return net 49 | 50 | def _head_to_tail(self, pool5, is_training, reuse=None): 51 | with tf.variable_scope(self._vgg_scope, self._vgg_scope, reuse=reuse): 52 | pool5_flat = slim.flatten(pool5, scope='flatten') 53 | fc6 = slim.fully_connected(pool5_flat, 4096, scope='fc6') 54 | if is_training: 55 | fc6 = slim.dropout(fc6, keep_prob=0.5, is_training=True, 56 | scope='dropout6') 57 | fc7 = slim.fully_connected(fc6, 4096, scope='fc7') 58 | if is_training: 59 | fc7 = slim.dropout(fc7, keep_prob=0.5, is_training=True, 60 | scope='dropout7') 61 | 62 | return fc7 63 | 64 | def get_variables_to_restore(self, variables, var_keep_dic): 65 | variables_to_restore = [] 66 | 67 | for v in variables: 68 | # exclude the conv weights that are fc weights in vgg16 69 | if v.name == (self._vgg_scope + '/fc6/weights:0') or \ 70 | v.name == (self._vgg_scope + '/fc7/weights:0'): 71 | self._variables_to_fix[v.name] = v 72 | continue 73 | # exclude the first conv layer to swap RGB to BGR 74 | if v.name == (self._vgg_scope + '/conv1/conv1_1/weights:0'): 75 | self._variables_to_fix[v.name] = v 76 | continue 77 | if v.name.split(':')[0] in var_keep_dic: 78 | 
print('Variables restored: %s' % v.name) 79 | variables_to_restore.append(v) 80 | 81 | return variables_to_restore 82 | 83 | def fix_variables(self, sess, pretrained_model): 84 | print('Fix VGG16 layers..') 85 | with tf.variable_scope('Fix_VGG16') as scope: 86 | with tf.device("/cpu:0"): 87 | # fix the vgg16 issue from conv weights to fc weights 88 | # fix RGB to BGR 89 | fc6_conv = tf.get_variable("fc6_conv", [7, 7, 512, 4096], trainable=False) 90 | fc7_conv = tf.get_variable("fc7_conv", [1, 1, 4096, 4096], trainable=False) 91 | conv1_rgb = tf.get_variable("conv1_rgb", [3, 3, 3, 64], trainable=False) 92 | restorer_fc = tf.train.Saver({self._vgg_scope + "/fc6/weights": fc6_conv, 93 | self._vgg_scope + "/fc7/weights": fc7_conv, 94 | self._vgg_scope + "/conv1/conv1_1/weights": conv1_rgb}) 95 | restorer_fc.restore(sess, pretrained_model) 96 | 97 | sess.run(tf.assign(self._variables_to_fix[self._vgg_scope + '/fc6/weights:0'], tf.reshape(fc6_conv, 98 | self._variables_to_fix[ 99 | self._vgg_scope + '/fc6/weights:0'].get_shape()))) 100 | sess.run(tf.assign(self._variables_to_fix[self._vgg_scope + '/fc7/weights:0'], tf.reshape(fc7_conv, 101 | self._variables_to_fix[ 102 | self._vgg_scope + '/fc7/weights:0'].get_shape()))) 103 | sess.run(tf.assign(self._variables_to_fix[self._vgg_scope + '/conv1/conv1_1/weights:0'], 104 | tf.reverse(conv1_rgb, [2]))) 105 | -------------------------------------------------------------------------------- /lib/nms/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/InnerPeace-Wu/densecap-tensorflow/2c77998f73832d5bb7324e97e8a7419cbce8398c/lib/nms/__init__.py -------------------------------------------------------------------------------- /lib/nms/cpu_nms.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | cimport numpy as np 10 | 11 | cdef inline np.float32_t max(np.float32_t a, np.float32_t b): 12 | return a if a >= b else b 13 | 14 | cdef inline np.float32_t min(np.float32_t a, np.float32_t b): 15 | return a if a <= b else b 16 | 17 | def cpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh): 18 | cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0] 19 | cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1] 20 | cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2] 21 | cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3] 22 | cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4] 23 | 24 | cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1) 25 | cdef np.ndarray[np.int_t, ndim=1] order = scores.argsort()[::-1] 26 | 27 | cdef int ndets = dets.shape[0] 28 | cdef np.ndarray[np.int_t, ndim=1] suppressed = \ 29 | np.zeros((ndets), dtype=np.int) 30 | 31 | # nominal indices 32 | cdef int _i, _j 33 | # sorted indices 34 | cdef int i, j 35 | # temp variables for box i's (the box currently under consideration) 36 | cdef np.float32_t ix1, iy1, ix2, iy2, iarea 37 | # variables for computing overlap with box j (lower scoring box) 38 | cdef np.float32_t xx1, yy1, xx2, yy2 39 | cdef np.float32_t w, h 40 | cdef np.float32_t inter, ovr 41 | 42 | keep = [] 43 | for _i in range(ndets): 44 | i = order[_i] 45 | if suppressed[i] == 1: 46 | 
continue 47 | keep.append(i) 48 | ix1 = x1[i] 49 | iy1 = y1[i] 50 | ix2 = x2[i] 51 | iy2 = y2[i] 52 | iarea = areas[i] 53 | for _j in range(_i + 1, ndets): 54 | j = order[_j] 55 | if suppressed[j] == 1: 56 | continue 57 | xx1 = max(ix1, x1[j]) 58 | yy1 = max(iy1, y1[j]) 59 | xx2 = min(ix2, x2[j]) 60 | yy2 = min(iy2, y2[j]) 61 | w = max(0.0, xx2 - xx1 + 1) 62 | h = max(0.0, yy2 - yy1 + 1) 63 | inter = w * h 64 | ovr = inter / (iarea + areas[j] - inter) 65 | if ovr >= thresh: 66 | suppressed[j] = 1 67 | 68 | return keep 69 | -------------------------------------------------------------------------------- /lib/nms/gpu_nms.hpp: -------------------------------------------------------------------------------- 1 | void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, 2 | int boxes_dim, float nms_overlap_thresh, int device_id); 3 | -------------------------------------------------------------------------------- /lib/nms/gpu_nms.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | cimport numpy as np 10 | 11 | assert sizeof(int) == sizeof(np.int32_t) 12 | 13 | cdef extern from "gpu_nms.hpp": 14 | void _nms(np.int32_t*, int*, np.float32_t*, int, int, float, int) 15 | 16 | def gpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh, 17 | np.int32_t device_id=0): 18 | cdef int boxes_num = dets.shape[0] 19 | cdef int boxes_dim = dets.shape[1] 20 | cdef int num_out 21 | cdef np.ndarray[np.int32_t, ndim=1] \ 22 | keep = np.zeros(boxes_num, dtype=np.int32) 23 | cdef np.ndarray[np.float32_t, ndim=1] \ 24 | scores = dets[:, 4] 25 | cdef np.ndarray[np.int_t, ndim=1] \ 26 | order = scores.argsort()[::-1] 27 | cdef np.ndarray[np.float32_t, ndim=2] \ 28 | sorted_dets = dets[order, :] 29 | _nms(&keep[0], &num_out, &sorted_dets[0, 0], boxes_num, boxes_dim, thresh, device_id) 30 | keep = keep[:num_out] 31 | return list(order[keep]) 32 | -------------------------------------------------------------------------------- /lib/nms/nms_kernel.cu: -------------------------------------------------------------------------------- 1 | // ------------------------------------------------------------------ 2 | // Faster R-CNN 3 | // Copyright (c) 2015 Microsoft 4 | // Licensed under The MIT License [see fast-rcnn/LICENSE for details] 5 | // Written by Shaoqing Ren 6 | // ------------------------------------------------------------------ 7 | 8 | #include "gpu_nms.hpp" 9 | #include 10 | #include 11 | 12 | #define CUDA_CHECK(condition) \ 13 | /* Code block avoids redefinition of cudaError_t error */ \ 14 | do { \ 15 | cudaError_t error = condition; \ 16 | if (error != cudaSuccess) { \ 17 | std::cout << cudaGetErrorString(error) << std::endl; \ 18 | } \ 19 | } while (0) 20 | 21 | #define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0)) 22 | int const threadsPerBlock = sizeof(unsigned long long) * 8; 23 | 24 | __device__ inline float devIoU(float const * const a, float const * const b) { 25 | float left = max(a[0], b[0]), right = min(a[2], b[2]); 26 | float top = max(a[1], b[1]), bottom = min(a[3], b[3]); 27 | float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); 28 | float interS = width * height; 29 | float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); 
30 | float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1); 31 | return interS / (Sa + Sb - interS); 32 | } 33 | 34 | __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh, 35 | const float *dev_boxes, unsigned long long *dev_mask) { 36 | const int row_start = blockIdx.y; 37 | const int col_start = blockIdx.x; 38 | 39 | // if (row_start > col_start) return; 40 | 41 | const int row_size = 42 | min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); 43 | const int col_size = 44 | min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); 45 | 46 | __shared__ float block_boxes[threadsPerBlock * 5]; 47 | if (threadIdx.x < col_size) { 48 | block_boxes[threadIdx.x * 5 + 0] = 49 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0]; 50 | block_boxes[threadIdx.x * 5 + 1] = 51 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1]; 52 | block_boxes[threadIdx.x * 5 + 2] = 53 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2]; 54 | block_boxes[threadIdx.x * 5 + 3] = 55 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3]; 56 | block_boxes[threadIdx.x * 5 + 4] = 57 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4]; 58 | } 59 | __syncthreads(); 60 | 61 | if (threadIdx.x < row_size) { 62 | const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; 63 | const float *cur_box = dev_boxes + cur_box_idx * 5; 64 | int i = 0; 65 | unsigned long long t = 0; 66 | int start = 0; 67 | if (row_start == col_start) { 68 | start = threadIdx.x + 1; 69 | } 70 | for (i = start; i < col_size; i++) { 71 | if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { 72 | t |= 1ULL << i; 73 | } 74 | } 75 | const int col_blocks = DIVUP(n_boxes, threadsPerBlock); 76 | dev_mask[cur_box_idx * col_blocks + col_start] = t; 77 | } 78 | } 79 | 80 | void _set_device(int device_id) { 81 | int current_device; 82 | CUDA_CHECK(cudaGetDevice(¤t_device)); 83 | if (current_device == device_id) { 84 | return; 85 | } 86 | // The call to cudaSetDevice must come before any calls to Get, which 87 | // may perform initialization using the GPU. 
88 | CUDA_CHECK(cudaSetDevice(device_id)); 89 | } 90 | 91 | void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, 92 | int boxes_dim, float nms_overlap_thresh, int device_id) { 93 | _set_device(device_id); 94 | 95 | float* boxes_dev = NULL; 96 | unsigned long long* mask_dev = NULL; 97 | 98 | const int col_blocks = DIVUP(boxes_num, threadsPerBlock); 99 | 100 | CUDA_CHECK(cudaMalloc(&boxes_dev, 101 | boxes_num * boxes_dim * sizeof(float))); 102 | CUDA_CHECK(cudaMemcpy(boxes_dev, 103 | boxes_host, 104 | boxes_num * boxes_dim * sizeof(float), 105 | cudaMemcpyHostToDevice)); 106 | 107 | CUDA_CHECK(cudaMalloc(&mask_dev, 108 | boxes_num * col_blocks * sizeof(unsigned long long))); 109 | 110 | dim3 blocks(DIVUP(boxes_num, threadsPerBlock), 111 | DIVUP(boxes_num, threadsPerBlock)); 112 | dim3 threads(threadsPerBlock); 113 | nms_kernel<<>>(boxes_num, 114 | nms_overlap_thresh, 115 | boxes_dev, 116 | mask_dev); 117 | 118 | std::vector mask_host(boxes_num * col_blocks); 119 | CUDA_CHECK(cudaMemcpy(&mask_host[0], 120 | mask_dev, 121 | sizeof(unsigned long long) * boxes_num * col_blocks, 122 | cudaMemcpyDeviceToHost)); 123 | 124 | std::vector remv(col_blocks); 125 | memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); 126 | 127 | int num_to_keep = 0; 128 | for (int i = 0; i < boxes_num; i++) { 129 | int nblock = i / threadsPerBlock; 130 | int inblock = i % threadsPerBlock; 131 | 132 | if (!(remv[nblock] & (1ULL << inblock))) { 133 | keep_out[num_to_keep++] = i; 134 | unsigned long long *p = &mask_host[0] + i * col_blocks; 135 | for (int j = nblock; j < col_blocks; j++) { 136 | remv[j] |= p[j]; 137 | } 138 | } 139 | } 140 | *num_out = num_to_keep; 141 | 142 | CUDA_CHECK(cudaFree(boxes_dev)); 143 | CUDA_CHECK(cudaFree(mask_dev)); 144 | } 145 | -------------------------------------------------------------------------------- /lib/nms/py_cpu_nms.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | 10 | def py_cpu_nms(dets, thresh): 11 | """Pure Python NMS baseline.""" 12 | x1 = dets[:, 0] 13 | y1 = dets[:, 1] 14 | x2 = dets[:, 2] 15 | y2 = dets[:, 3] 16 | scores = dets[:, 4] 17 | 18 | areas = (x2 - x1 + 1) * (y2 - y1 + 1) 19 | order = scores.argsort()[::-1] 20 | 21 | keep = [] 22 | while order.size > 0: 23 | i = order[0] 24 | keep.append(i) 25 | xx1 = np.maximum(x1[i], x1[order[1:]]) 26 | yy1 = np.maximum(y1[i], y1[order[1:]]) 27 | xx2 = np.minimum(x2[i], x2[order[1:]]) 28 | yy2 = np.minimum(y2[i], y2[order[1:]]) 29 | 30 | w = np.maximum(0.0, xx2 - xx1 + 1) 31 | h = np.maximum(0.0, yy2 - yy1 + 1) 32 | inter = w * h 33 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 34 | 35 | inds = np.where(ovr <= thresh)[0] 36 | order = order[inds + 1] 37 | 38 | return keep 39 | -------------------------------------------------------------------------------- /lib/pre_glove.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DenseCap-Tensorflow 3 | # Written by InnerPeace 4 | # This file is adapted from cs224-2017 stanford 5 | # -------------------------------------------------------- 6 | from __future__ import absolute_import 7 | from __future__ import 
division 8 | from __future__ import print_function 9 | 10 | 11 | from tensorflow.python.platform import gfile 12 | from os.path import join as pjoin 13 | from tqdm import * 14 | import numpy as np 15 | import os 16 | 17 | from config import cfg 18 | 19 | 20 | _PAD = b"" 21 | _SOS = b"" 22 | _EOS = b"" 23 | 24 | 25 | def initialize_vocabulary(vocabulary_path): 26 | # map vocab to word embeddings 27 | if gfile.Exists(vocabulary_path): 28 | rev_vocab = [_PAD, _SOS, _EOS] 29 | with gfile.GFile(vocabulary_path, mode="r") as f: 30 | rev_vocab.extend(f.readlines()) 31 | rev_vocab = [line.strip('\n') for line in rev_vocab] 32 | vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)]) 33 | return vocab, rev_vocab 34 | else: 35 | raise ValueError("Vocabulary file %s not found.", vocabulary_path) 36 | 37 | 38 | def process_glove(vocab_list, save_path, size=4e5, random_init=True): 39 | """ 40 | :param vocab_list: [vocab] 41 | :return: 42 | """ 43 | if not gfile.Exists(save_path + ".npz"): 44 | glove_path = os.path.join(cfg.DATA_DIR, "glove.6B.{}d.txt".format(cfg.GLOVE_DIM)) 45 | if random_init: 46 | glove = np.random.randn(len(vocab_list), cfg.GLOVE_DIM) 47 | else: 48 | glove = np.zeros((len(vocab_list), cfg.GLOVE_DIM)) 49 | found = 0 50 | with open(glove_path, 'r') as fh: 51 | for line in tqdm(fh, total=size): 52 | array = line.lstrip().rstrip().split(" ") 53 | word = array[0] 54 | vector = list(map(float, array[1:])) 55 | if word in vocab_list: 56 | idx = vocab_list.index(word) 57 | glove[idx, :] = vector 58 | found += 1 59 | if word.capitalize() in vocab_list: 60 | idx = vocab_list.index(word.capitalize()) 61 | glove[idx, :] = vector 62 | found += 1 63 | if word.upper() in vocab_list: 64 | idx = vocab_list.index(word.upper()) 65 | glove[idx, :] = vector 66 | found += 1 67 | 68 | print("{}/{} of word vocab have corresponding vectors in {}".format(found, len(vocab_list), glove_path)) 69 | np.savez_compressed(save_path, glove=glove) 70 | print("saved trimmed glove matrix at: {}".format(save_path)) 71 | 72 | 73 | if __name__ == "__main__": 74 | vocab_path = pjoin(cfg.CACHE_DIR, 'vocabulary.txt') 75 | vocab, rev_vocab = initialize_vocabulary(vocab_path) 76 | process_glove(rev_vocab, cfg.DATA_DIR + "/glove.trimmed.{}".format(cfg.GLOVE_DIM), 77 | random_init=True) 78 | -------------------------------------------------------------------------------- /lib/preprocess.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Preprocessing data in valohai computing platform. 4 | # This script may out of date. 
#2017.12.20 5 | set -e 6 | set -x 7 | 8 | POSITIONAL=() 9 | while [[ $# -gt 0 ]]; do 10 | #statements 11 | key="$1" 12 | 13 | case $key in 14 | -vs|--version) 15 | VERSION=$2 16 | shift 17 | shift 18 | ;; 19 | -p|--path) 20 | IN_PATH=$2 21 | shift 22 | shift 23 | ;; 24 | -od|--output_dir) 25 | OUTPUT_DIR=$2 26 | shift 27 | shift 28 | ;; 29 | -mw|--max_words) 30 | MAX_WORDS=$2 31 | shift 32 | shift 33 | ;; 34 | *) 35 | POSITIONAL+=("$1") 36 | shift 37 | ;; 38 | esac 39 | done 40 | 41 | 42 | if [ -d "/valohai/inputs" ]; then 43 | # apt-get -y update 44 | # apt-get -y install python-pip 45 | pip install -r requirements.txt 46 | cd /valohai/inputs 47 | mkdir ${VERSION} 48 | unzip image_meta/image_data.json.zip -d ./${VERSION} 49 | unzip regions/region_descriptions.json.zip -d ./${VERSION} 50 | cd /valohai/repository/lib 51 | time python2 preprocess.py --version ${VERSION} \ 52 | --path ${IN_PATH} \ 53 | --output_dir ${OUTPUT_DIR} \ 54 | --max_words ${MAX_WORDS} 55 | 56 | tar -czvf /valohai/outputs/visual_genome.tar.gz ${OUTPUT_DIR} 57 | # comment it if one already have data stored in S3 58 | mv regions/region_descriptions.json.zip /valohai/outputs 59 | fi 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /lib/pycocoevalcap/README: -------------------------------------------------------------------------------- 1 | ============================= 2 | Linjie Yang 3 | 04/21/2016 4 | ============================= 5 | This folder holds the functions for evaluating image captioning models, including the dense captioning models. This folder is originally from a standard evaluation toolkit for MS COCO (https://github.com/tylin/coco-caption). 6 | The newly added functions and usages are as follows. 7 | (1) dt_eval.py: function to evaluate captioning model on web data. One image only has one ground truth caption. 8 | (2) vg_eval.py: function to evaluate the dense captioning model on visual genome. Calculate Meteor score and mean AP which are described in the DenseCap paper (http://arxiv.org/abs/1511.07571). 9 | (3) meteor/meteor2.py: modified version of "meteor/meteor.py". Adapted to be usedfor multi-to-multi caption matching in DenseCap. 10 | -------------------------------------------------------------------------------- /lib/pycocoevalcap/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /lib/pycocoevalcap/bleu/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015 Xinlei Chen, Hao Fang, Tsung-Yi Lin, and Ramakrishna Vedantam 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /lib/pycocoevalcap/bleu/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /lib/pycocoevalcap/bleu/bleu.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # File Name : bleu.py 4 | # 5 | # Description : Wrapper for BLEU scorer. 6 | # 7 | # Creation Date : 06-01-2015 8 | # Last Modified : Thu 19 Mar 2015 09:13:28 PM PDT 9 | # Authors : Hao Fang and Tsung-Yi Lin 10 | 11 | from bleu_scorer import BleuScorer 12 | 13 | 14 | class Bleu: 15 | def __init__(self, n=4): 16 | # default compute Blue score up to 4 17 | self._n = n 18 | self._hypo_for_image = {} 19 | self.ref_for_image = {} 20 | 21 | def compute_score(self, gts, res): 22 | 23 | assert(gts.keys() == res.keys()) 24 | imgIds = gts.keys() 25 | 26 | bleu_scorer = BleuScorer(n=self._n) 27 | for id in imgIds: 28 | hypo = res[id] 29 | ref = gts[id] 30 | 31 | # Sanity check. 32 | assert(type(hypo) is list) 33 | assert(len(hypo) == 1) 34 | assert(type(ref) is list) 35 | assert(len(ref) > 1) 36 | 37 | bleu_scorer += (hypo[0], ref) 38 | 39 | #score, scores = bleu_scorer.compute_score(option='shortest') 40 | score, scores = bleu_scorer.compute_score(option='closest', verbose=1) 41 | #score, scores = bleu_scorer.compute_score(option='average', verbose=1) 42 | 43 | # return (bleu, bleu_info) 44 | return score, scores 45 | 46 | def method(self): 47 | return "Bleu" 48 | -------------------------------------------------------------------------------- /lib/pycocoevalcap/cider/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /lib/pycocoevalcap/cider/cider.py: -------------------------------------------------------------------------------- 1 | # Filename: cider.py 2 | # 3 | # Description: Describes the class to compute the CIDEr (Consensus-Based Image Description Evaluation) Metric 4 | # by Vedantam, Zitnick, and Parikh (http://arxiv.org/abs/1411.5726) 5 | # 6 | # Creation Date: Sun Feb 8 14:16:54 2015 7 | # 8 | # Authors: Ramakrishna Vedantam and Tsung-Yi Lin 9 | 10 | from cider_scorer import CiderScorer 11 | import pdb 12 | 13 | class Cider: 14 | """ 15 | Main Class to compute the CIDEr metric 16 | 17 | """ 18 | def __init__(self, test=None, refs=None, n=4, sigma=6.0): 19 | # set cider to sum over 1 to 4-grams 20 | self._n = n 21 | # set the standard deviation parameter for gaussian penalty 22 | self._sigma = sigma 23 | 24 | def compute_score(self, gts, res): 25 | """ 26 | Main function to compute CIDEr score 27 | :param hypo_for_image (dict) : dictionary with key and value 28 | ref_for_image (dict) : dictionary with key and value 29 | :return: cider (float) : computed CIDEr score for the corpus 30 | """ 31 | 32 | assert(gts.keys() == res.keys()) 33 | imgIds = gts.keys() 34 | 35 | cider_scorer = CiderScorer(n=self._n, sigma=self._sigma) 36 | 37 | for id in imgIds: 38 | hypo = res[id] 39 | ref = gts[id] 40 | 41 | # Sanity 
check. 42 | assert(type(hypo) is list) 43 | assert(len(hypo) == 1) 44 | assert(type(ref) is list) 45 | assert(len(ref) > 0) 46 | 47 | cider_scorer += (hypo[0], ref) 48 | 49 | (score, scores) = cider_scorer.compute_score() 50 | 51 | return score, scores 52 | 53 | def method(self): 54 | return "CIDEr" -------------------------------------------------------------------------------- /lib/pycocoevalcap/eval.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | from tokenizer.ptbtokenizer import PTBTokenizer 3 | from bleu.bleu import Bleu 4 | from meteor.meteor import Meteor 5 | from rouge.rouge import Rouge 6 | from cider.cider import Cider 7 | 8 | class COCOEvalCap: 9 | def __init__(self, coco, cocoRes): 10 | self.evalImgs = [] 11 | self.eval = {} 12 | self.imgToEval = {} 13 | self.coco = coco 14 | self.cocoRes = cocoRes 15 | self.params = {'image_id': coco.getImgIds()} 16 | 17 | def evaluate(self): 18 | imgIds = self.params['image_id'] 19 | # imgIds = self.coco.getImgIds() 20 | gts = {} 21 | res = {} 22 | for imgId in imgIds: 23 | gts[imgId] = self.coco.imgToAnns[imgId] 24 | res[imgId] = self.cocoRes.imgToAnns[imgId] 25 | 26 | # ================================================= 27 | # Set up scorers 28 | # ================================================= 29 | print 'tokenization...' 30 | tokenizer = PTBTokenizer() 31 | gts = tokenizer.tokenize(gts) 32 | res = tokenizer.tokenize(res) 33 | 34 | # ================================================= 35 | # Set up scorers 36 | # ================================================= 37 | print 'setting up scorers...' 38 | scorers = [ 39 | (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), 40 | (Meteor(),"METEOR"), 41 | (Rouge(), "ROUGE_L"), 42 | (Cider(), "CIDEr") 43 | ] 44 | 45 | # ================================================= 46 | # Compute scores 47 | # ================================================= 48 | eval = {} 49 | for scorer, method in scorers: 50 | print 'computing %s score...'%(scorer.method()) 51 | score, scores = scorer.compute_score(gts, res) 52 | if type(method) == list: 53 | for sc, scs, m in zip(score, scores, method): 54 | self.setEval(sc, m) 55 | self.setImgToEvalImgs(scs, imgIds, m) 56 | print "%s: %0.3f"%(m, sc) 57 | else: 58 | self.setEval(score, method) 59 | self.setImgToEvalImgs(scores, imgIds, method) 60 | print "%s: %0.3f"%(method, score) 61 | self.setEvalImgs() 62 | 63 | def setEval(self, score, method): 64 | self.eval[method] = score 65 | 66 | def setImgToEvalImgs(self, scores, imgIds, method): 67 | for imgId, score in zip(imgIds, scores): 68 | if not imgId in self.imgToEval: 69 | self.imgToEval[imgId] = {} 70 | self.imgToEval[imgId]["image_id"] = imgId 71 | self.imgToEval[imgId][method] = score 72 | 73 | def setEvalImgs(self): 74 | self.evalImgs = [eval for imgId, eval in self.imgToEval.items()] -------------------------------------------------------------------------------- /lib/pycocoevalcap/meteor/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /lib/pycocoevalcap/meteor/meteor-1.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/InnerPeace-Wu/densecap-tensorflow/2c77998f73832d5bb7324e97e8a7419cbce8398c/lib/pycocoevalcap/meteor/meteor-1.5.jar -------------------------------------------------------------------------------- 
/lib/pycocoevalcap/meteor/meteor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Python wrapper for METEOR implementation, by Xinlei Chen 4 | # Modified by Linjie Yang for evaluating dense captioning 5 | # Acknowledge Michael Denkowski for the generous discussion and help 6 | 7 | import os 8 | import sys 9 | import subprocess 10 | import threading 11 | 12 | # Assumes meteor-1.5.jar is in the same directory as meteor.py. Change as needed. 13 | METEOR_JAR = 'meteor-1.5.jar' 14 | # print METEOR_JAR 15 | 16 | class Meteor: 17 | 18 | def __init__(self): 19 | self.meteor_cmd = ['java', '-jar', '-Xmx2G', METEOR_JAR, \ 20 | '-', '-', '-stdio', '-l', 'en', '-norm'] 21 | self.meteor_p = subprocess.Popen(self.meteor_cmd, \ 22 | cwd=os.path.dirname(os.path.abspath(__file__)), \ 23 | stdin=subprocess.PIPE, \ 24 | stdout=subprocess.PIPE, \ 25 | stderr=subprocess.PIPE) 26 | # Used to guarantee thread safety 27 | self.lock = threading.Lock() 28 | 29 | def compute_score(self, gts, res, imgIds=None): 30 | assert(gts.keys() == res.keys()) 31 | if imgIds is None: 32 | imgIds = gts.keys() 33 | scores = [] 34 | 35 | eval_line = 'EVAL' 36 | self.lock.acquire() 37 | for i in imgIds: 38 | assert(len(res[i]) == 1) 39 | 40 | stat = self._stat(res[i][0], gts[i]) 41 | eval_line += ' ||| {}'.format(stat) 42 | 43 | self.meteor_p.stdin.write('{}\n'.format(eval_line)) 44 | for i in range(0,len(imgIds)): 45 | scores.append(float(self.meteor_p.stdout.readline().strip())) 46 | final_score = self.meteor_p.stdout.readline().strip() 47 | #print final_score 48 | score = float(final_score) 49 | self.lock.release() 50 | 51 | return score, scores 52 | 53 | 54 | def compute_score_m2m(self, gts, res, imgIds=None): 55 | assert(gts.keys() == res.keys()) 56 | if imgIds is None: 57 | imgIds = gts.keys() 58 | scores = [] 59 | 60 | eval_line = 'EVAL' 61 | self.lock.acquire() 62 | tot_line = 0 63 | for i in imgIds: 64 | #assert(len(res[i]) == 1) 65 | for res_sent in res[i]: 66 | stat = self._stat(res_sent, gts[i]) 67 | eval_line += ' ||| {}'.format(stat) 68 | tot_line += 1 69 | self.meteor_p.stdin.write('{}\n'.format(eval_line)) 70 | for i in range(0,len(imgIds)): 71 | scores_im = [] 72 | for j in xrange(len(res[i])): 73 | scores_im.append(float(self.meteor_p.stdout.readline().strip())) 74 | scores.append(scores_im) 75 | score = float(self.meteor_p.stdout.readline().strip()) 76 | self.lock.release() 77 | 78 | return score, scores 79 | def method(self): 80 | return "METEOR" 81 | 82 | def _stat(self, hypothesis_str, reference_list): 83 | # SCORE ||| reference 1 words ||| reference n words ||| hypothesis words 84 | hypothesis_str = hypothesis_str.replace('|||','').replace(' ',' ') 85 | score_line = ' ||| '.join(('SCORE', ' ||| '.join(reference_list), hypothesis_str)) 86 | self.meteor_p.stdin.write('{}\n'.format(score_line)) 87 | return self.meteor_p.stdout.readline().strip() 88 | 89 | def score(self, hypothesis_str, reference_list): 90 | self.lock.acquire() 91 | # SCORE ||| reference 1 words ||| reference n words ||| hypothesis words 92 | hypothesis_str = hypothesis_str.replace('|||','').replace(' ',' ') 93 | score_line = ' ||| '.join(('SCORE', ' ||| '.join(reference_list), hypothesis_str)) 94 | self.meteor_p.stdin.write('{}\n'.format(score_line)) 95 | stats = self.meteor_p.stdout.readline().strip() 96 | eval_line = 'EVAL ||| {}'.format(stats) 97 | # EVAL ||| stats 98 | self.meteor_p.stdin.write('{}\n'.format(eval_line)) 99 | score = 
float(self.meteor_p.stdout.readline().strip()) 100 | self.lock.release() 101 | return score 102 | 103 | def __exit__(self): 104 | self.lock.acquire() 105 | self.meteor_p.stdin.close() 106 | self.meteor_p.wait() 107 | self.lock.release() 108 | -------------------------------------------------------------------------------- /lib/pycocoevalcap/rouge/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'vrama91' 2 | -------------------------------------------------------------------------------- /lib/pycocoevalcap/rouge/rouge.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # File Name : rouge.py 4 | # 5 | # Description : Computes ROUGE-L metric as described by Lin and Hovey (2004) 6 | # 7 | # Creation Date : 2015-01-07 06:03 8 | # Author : Ramakrishna Vedantam 9 | 10 | import numpy as np 11 | import pdb 12 | 13 | def my_lcs(string, sub): 14 | """ 15 | Calculates longest common subsequence for a pair of tokenized strings 16 | :param string : list of str : tokens from a string split using whitespace 17 | :param sub : list of str : shorter string, also split using whitespace 18 | :returns: length (list of int): length of the longest common subsequence between the two strings 19 | 20 | Note: my_lcs only gives length of the longest common subsequence, not the actual LCS 21 | """ 22 | if(len(string)< len(sub)): 23 | sub, string = string, sub 24 | 25 | lengths = [[0 for i in range(0,len(sub)+1)] for j in range(0,len(string)+1)] 26 | 27 | for j in range(1,len(sub)+1): 28 | for i in range(1,len(string)+1): 29 | if(string[i-1] == sub[j-1]): 30 | lengths[i][j] = lengths[i-1][j-1] + 1 31 | else: 32 | lengths[i][j] = max(lengths[i-1][j] , lengths[i][j-1]) 33 | 34 | return lengths[len(string)][len(sub)] 35 | 36 | class Rouge(): 37 | ''' 38 | Class for computing ROUGE-L score for a set of candidate sentences for the MS COCO test set 39 | 40 | ''' 41 | def __init__(self): 42 | # vrama91: updated the value below based on discussion with Hovey 43 | self.beta = 1.2 44 | 45 | def calc_score(self, candidate, refs): 46 | """ 47 | Compute ROUGE-L score given one candidate and references for an image 48 | :param candidate: str : candidate sentence to be evaluated 49 | :param refs: list of str : COCO reference sentences for the particular image to be evaluated 50 | :returns score: int (ROUGE-L score for the candidate evaluated against references) 51 | """ 52 | assert(len(candidate)==1) 53 | assert(len(refs)>0) 54 | prec = [] 55 | rec = [] 56 | 57 | # split into tokens 58 | token_c = candidate[0].split(" ") 59 | 60 | for reference in refs: 61 | # split into tokens 62 | token_r = reference.split(" ") 63 | # compute the longest common subsequence 64 | lcs = my_lcs(token_r, token_c) 65 | prec.append(lcs/float(len(token_c))) 66 | rec.append(lcs/float(len(token_r))) 67 | 68 | prec_max = max(prec) 69 | rec_max = max(rec) 70 | 71 | if(prec_max!=0 and rec_max !=0): 72 | score = ((1 + self.beta**2)*prec_max*rec_max)/float(rec_max + self.beta**2*prec_max) 73 | else: 74 | score = 0.0 75 | return score 76 | 77 | def compute_score(self, gts, res): 78 | """ 79 | Computes Rouge-L score given a set of reference and candidate sentences for the dataset 80 | Invoked by evaluate_captions.py 81 | :param hypo_for_image: dict : candidate / test sentences with "image name" key and "tokenized sentences" as values 82 | :param ref_for_image: dict : reference MS-COCO sentences with "image name" key and "tokenized 
sentences" as values 83 | :returns: average_score: float (mean ROUGE-L score computed by averaging scores for all the images) 84 | """ 85 | assert(gts.keys() == res.keys()) 86 | imgIds = gts.keys() 87 | 88 | score = [] 89 | for id in imgIds: 90 | hypo = res[id] 91 | ref = gts[id] 92 | 93 | score.append(self.calc_score(hypo, ref)) 94 | 95 | # Sanity check. 96 | assert(type(hypo) is list) 97 | assert(len(hypo) == 1) 98 | assert(type(ref) is list) 99 | assert(len(ref) > 0) 100 | 101 | average_score = np.mean(np.array(score)) 102 | return average_score, np.array(score) 103 | 104 | def method(self): 105 | return "Rouge" 106 | -------------------------------------------------------------------------------- /lib/pycocoevalcap/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'hfang' 2 | -------------------------------------------------------------------------------- /lib/pycocoevalcap/tokenizer/ptbtokenizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # File Name : ptbtokenizer.py 4 | # 5 | # Description : Do the PTB Tokenization and remove punctuations. 6 | # 7 | # Creation Date : 29-12-2014 8 | # Last Modified : Thu Mar 19 09:53:35 2015 9 | # Authors : Hao Fang and Tsung-Yi Lin 10 | 11 | import os 12 | import sys 13 | import subprocess 14 | import tempfile 15 | import itertools 16 | 17 | # path to the stanford corenlp jar 18 | STANFORD_CORENLP_3_4_1_JAR = 'stanford-corenlp-3.4.1.jar' 19 | 20 | # punctuations to be removed from the sentences 21 | PUNCTUATIONS = ["''", "'", "``", "`", "-LRB-", "-RRB-", "-LCB-", "-RCB-", 22 | ".", "?", "!", ",", ":", "-", "--", "...", ";"] 23 | 24 | 25 | class PTBTokenizer: 26 | """Python wrapper of Stanford PTBTokenizer""" 27 | 28 | def tokenize(self, captions_for_image): 29 | cmd = ['java', '-cp', STANFORD_CORENLP_3_4_1_JAR, 30 | 'edu.stanford.nlp.process.PTBTokenizer', 31 | '-preserveLines', '-lowerCase'] 32 | 33 | # ====================================================== 34 | # prepare data for PTB Tokenizer 35 | # ====================================================== 36 | final_tokenized_captions_for_image = {} 37 | image_id = [k for k, v in captions_for_image.items() for _ in range(len(v))] 38 | sentences = '\n'.join([c['caption'].replace('\n', ' ') for k, v in captions_for_image.items() for c in v]) 39 | 40 | # ====================================================== 41 | # save sentences to temporary file 42 | # ====================================================== 43 | path_to_jar_dirname = os.path.dirname(os.path.abspath(__file__)) 44 | tmp_file = tempfile.NamedTemporaryFile(delete=False, dir=path_to_jar_dirname) 45 | tmp_file.write(sentences) 46 | tmp_file.close() 47 | 48 | # ====================================================== 49 | # tokenize sentence 50 | # ====================================================== 51 | cmd.append(os.path.basename(tmp_file.name)) 52 | p_tokenizer = subprocess.Popen(cmd, cwd=path_to_jar_dirname, 53 | stdout=subprocess.PIPE) # shell=True 54 | token_lines = p_tokenizer.communicate(input=sentences.rstrip())[0] 55 | lines = token_lines.split('\n') 56 | # remove temp file 57 | os.remove(tmp_file.name) 58 | 59 | # ====================================================== 60 | # create dictionary for tokenized captions 61 | # ====================================================== 62 | for k, line in zip(image_id, lines): 63 | if not k in final_tokenized_captions_for_image: 64 | 
final_tokenized_captions_for_image[k] = [] 65 | tokenized_caption = ' '.join([w for w in line.rstrip().split(' ') 66 | if w not in PUNCTUATIONS]) 67 | final_tokenized_captions_for_image[k].append(tokenized_caption) 68 | 69 | return final_tokenized_captions_for_image 70 | -------------------------------------------------------------------------------- /lib/pycocoevalcap/tokenizer/stanford-corenlp-3.4.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/InnerPeace-Wu/densecap-tensorflow/2c77998f73832d5bb7324e97e8a7419cbce8398c/lib/pycocoevalcap/tokenizer/stanford-corenlp-3.4.1.jar -------------------------------------------------------------------------------- /lib/setup.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DenseCap-Tensorflow 3 | # Written by InnerPeace 4 | # This file is adapted from Ross Girshick's work 5 | # -------------------------------------------------------- 6 | # Fast R-CNN 7 | # Copyright (c) 2015 Microsoft 8 | # Licensed under The MIT License [see LICENSE for details] 9 | # Written by Ross Girshick 10 | # -------------------------------------------------------- 11 | 12 | 13 | import os 14 | from os.path import join as pjoin 15 | from setuptools import setup 16 | from distutils.extension import Extension 17 | from Cython.Distutils import build_ext 18 | import subprocess 19 | import numpy as np 20 | 21 | def find_in_path(name, path): 22 | "Find a file in a search path" 23 | # Adapted fom 24 | # http://code.activestate.com/recipes/52224-find-a-file-given-a-search-path/ 25 | for dir in path.split(os.pathsep): 26 | binpath = pjoin(dir, name) 27 | if os.path.exists(binpath): 28 | return os.path.abspath(binpath) 29 | return None 30 | 31 | 32 | def locate_cuda(): 33 | """Locate the CUDA environment on the system 34 | 35 | Returns a dict with keys 'home', 'nvcc', 'include', and 'lib64' 36 | and values giving the absolute path to each directory. 37 | 38 | Starts by looking for the CUDAHOME env variable. If not found, everything 39 | is based on finding 'nvcc' in the PATH. 40 | """ 41 | 42 | # first check if the CUDAHOME env variable is in use 43 | if 'CUDAHOME' in os.environ: 44 | home = os.environ['CUDAHOME'] 45 | nvcc = pjoin(home, 'bin', 'nvcc') 46 | else: 47 | # otherwise, search the PATH for NVCC 48 | default_path = pjoin(os.sep, 'usr', 'local', 'cuda', 'bin') 49 | nvcc = find_in_path('nvcc', os.environ['PATH'] + os.pathsep + default_path) 50 | if nvcc is None: 51 | raise EnvironmentError('The nvcc binary could not be ' 52 | 'located in your $PATH. Either add it to your path, or set $CUDAHOME') 53 | home = os.path.dirname(os.path.dirname(nvcc)) 54 | 55 | cudaconfig = {'home':home, 'nvcc':nvcc, 56 | 'include': pjoin(home, 'include'), 57 | 'lib64': pjoin(home, 'lib64')} 58 | for k, v in cudaconfig.iteritems(): 59 | if not os.path.exists(v): 60 | raise EnvironmentError('The CUDA %s path could not be located in %s' % (k, v)) 61 | 62 | return cudaconfig 63 | CUDA = locate_cuda() 64 | 65 | 66 | # Obtain the numpy include directory. This logic works across numpy versions. 67 | try: 68 | numpy_include = np.get_include() 69 | except AttributeError: 70 | numpy_include = np.get_numpy_include() 71 | 72 | def customize_compiler_for_nvcc(self): 73 | """inject deep into distutils to customize how the dispatch 74 | to gcc/nvcc works. 
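    In practice the hook below appends '.cu' to self.src_extensions and, for
    any source file ending in .cu, swaps the compiler executable to nvcc and
    uses the 'nvcc' entry of extra_postargs instead of the 'gcc' one,
    restoring the default compiler_so after each file.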
75 | 76 | If you subclass UnixCCompiler, it's not trivial to get your subclass 77 | injected in, and still have the right customizations (i.e. 78 | distutils.sysconfig.customize_compiler) run on it. So instead of going 79 | the OO route, I have this. Note, it's kindof like a wierd functional 80 | subclassing going on.""" 81 | 82 | # tell the compiler it can processes .cu 83 | self.src_extensions.append('.cu') 84 | 85 | # save references to the default compiler_so and _comple methods 86 | default_compiler_so = self.compiler_so 87 | super = self._compile 88 | 89 | # now redefine the _compile method. This gets executed for each 90 | # object but distutils doesn't have the ability to change compilers 91 | # based on source extension: we add it. 92 | def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts): 93 | if os.path.splitext(src)[1] == '.cu': 94 | # use the cuda for .cu files 95 | self.set_executable('compiler_so', CUDA['nvcc']) 96 | # use only a subset of the extra_postargs, which are 1-1 translated 97 | # from the extra_compile_args in the Extension class 98 | postargs = extra_postargs['nvcc'] 99 | else: 100 | postargs = extra_postargs['gcc'] 101 | 102 | super(obj, src, ext, cc_args, postargs, pp_opts) 103 | # reset the default compiler_so, which we might have changed for cuda 104 | self.compiler_so = default_compiler_so 105 | 106 | # inject our redefined _compile method into the class 107 | self._compile = _compile 108 | 109 | 110 | # run the customize_compiler 111 | class custom_build_ext(build_ext): 112 | def build_extensions(self): 113 | customize_compiler_for_nvcc(self.compiler) 114 | build_ext.build_extensions(self) 115 | 116 | 117 | ext_modules = [ 118 | Extension( 119 | "utils.cython_bbox", 120 | ["utils/bbox.pyx"], 121 | extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]}, 122 | include_dirs = [numpy_include] 123 | ), 124 | Extension( 125 | "nms.cpu_nms", 126 | ["nms/cpu_nms.pyx"], 127 | extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]}, 128 | include_dirs = [numpy_include] 129 | ), 130 | Extension('nms.gpu_nms', 131 | ['nms/nms_kernel.cu', 'nms/gpu_nms.pyx'], 132 | library_dirs=[CUDA['lib64']], 133 | libraries=['cudart'], 134 | language='c++', 135 | runtime_library_dirs=[CUDA['lib64']], 136 | # this syntax is specific to this build system 137 | # we're only going to use certain compiler args with nvcc and not with 138 | # gcc the implementation of this trick is in customize_compiler() below 139 | extra_compile_args={'gcc': ["-Wno-unused-function"], 140 | 'nvcc': ['-arch=sm_35', 141 | '--ptxas-options=-v', 142 | '-c', 143 | '--compiler-options', 144 | "'-fPIC'"]}, 145 | include_dirs = [numpy_include, CUDA['include']] 146 | ), 147 | ] 148 | 149 | setup( 150 | name='fast_rcnn', 151 | ext_modules=ext_modules, 152 | # inject our custom trigger 153 | cmdclass={'build_ext': custom_build_ext}, 154 | ) 155 | 156 | -------------------------------------------------------------------------------- /lib/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | -------------------------------------------------------------------------------- /lib/utils/bbox.pyx: -------------------------------------------------------------------------------- 1 | # 
-------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Sergey Karayev 6 | # -------------------------------------------------------- 7 | 8 | cimport cython 9 | import numpy as np 10 | cimport numpy as np 11 | 12 | DTYPE = np.float 13 | ctypedef np.float_t DTYPE_t 14 | 15 | def bbox_overlaps( 16 | np.ndarray[DTYPE_t, ndim=2] boxes, 17 | np.ndarray[DTYPE_t, ndim=2] query_boxes): 18 | """ 19 | Parameters 20 | ---------- 21 | boxes: (N, 4) ndarray of float 22 | query_boxes: (K, 4) ndarray of float 23 | Returns 24 | ------- 25 | overlaps: (N, K) ndarray of overlap between boxes and query_boxes 26 | """ 27 | cdef unsigned int N = boxes.shape[0] 28 | cdef unsigned int K = query_boxes.shape[0] 29 | cdef np.ndarray[DTYPE_t, ndim=2] overlaps = np.zeros((N, K), dtype=DTYPE) 30 | cdef DTYPE_t iw, ih, box_area 31 | cdef DTYPE_t ua 32 | cdef unsigned int k, n 33 | for k in range(K): 34 | box_area = ( 35 | (query_boxes[k, 2] - query_boxes[k, 0] + 1) * 36 | (query_boxes[k, 3] - query_boxes[k, 1] + 1) 37 | ) 38 | for n in range(N): 39 | iw = ( 40 | min(boxes[n, 2], query_boxes[k, 2]) - 41 | max(boxes[n, 0], query_boxes[k, 0]) + 1 42 | ) 43 | if iw > 0: 44 | ih = ( 45 | min(boxes[n, 3], query_boxes[k, 3]) - 46 | max(boxes[n, 1], query_boxes[k, 1]) + 1 47 | ) 48 | if ih > 0: 49 | ua = float( 50 | (boxes[n, 2] - boxes[n, 0] + 1) * 51 | (boxes[n, 3] - boxes[n, 1] + 1) + 52 | box_area - iw * ih 53 | ) 54 | overlaps[n, k] = iw * ih / ua 55 | return overlaps 56 | -------------------------------------------------------------------------------- /lib/utils/bbox_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from collections import OrderedDict 4 | import json 5 | import numpy as np 6 | import pprint 7 | import cPickle as pickle 8 | import string 9 | 10 | def get_bbox_coord(norm_coord, do_clip=True): 11 | #input is a nx4 numpy array in normalized bbox coordinates 12 | #print norm_coord.shape 13 | #print norm_coord 14 | bboxes_coord = np.zeros(norm_coord.shape) 15 | #x,y,w,h 16 | bboxes_coord[:, :2] = norm_coord[:, :2]+0.5 17 | bboxes_coord[:, 2:] = np.exp(norm_coord[:, 2:]) 18 | 19 | #x1,y1,x2,y2 20 | bboxes_coord2 = np.zeros(norm_coord.shape) 21 | bboxes_coord2[:, :2] = bboxes_coord[:, :2] - bboxes_coord[:, 2:] * 0.5 22 | bboxes_coord2[:, 2:] = bboxes_coord[:, :2] + bboxes_coord[:, 2:] * 0.5 23 | #clipping all coordinates to [0,1] 24 | if do_clip: 25 | bboxes_coord2 = np.minimum(np.maximum(bboxes_coord2, 0), 1) 26 | return bboxes_coord2 27 | 28 | 29 | def get_bbox_iou_matrix(bboxes): 30 | region_n = bboxes.shape[0] 31 | #area, intersection area, union area 32 | bbox_areas = (bboxes[:,2] - bboxes[:,0]) * \ 33 | (bboxes[:, 3] - bboxes[:, 1]) 34 | 35 | x_a1 = bboxes[:,0].reshape(region_n,1) 36 | x_a2 = bboxes[:,2].reshape(region_n,1) 37 | x_b1 = bboxes[:,0].reshape(1,region_n) 38 | x_b2 = bboxes[:,2].reshape(1,region_n) 39 | y_a1 = bboxes[:,1].reshape(region_n,1) 40 | y_a2 = bboxes[:,3].reshape(region_n,1) 41 | y_b1 = bboxes[:,1].reshape(1,region_n) 42 | y_b2 = bboxes[:,3].reshape(1,region_n) 43 | bbox_pair_x_diff = np.maximum(0, np.minimum(x_a2, x_b2) - np.maximum(x_a1, x_b1)) 44 | bbox_pair_y_diff = np.maximum(0, np.minimum(y_a2, y_b2) - np.maximum(y_a1, y_b1)) 45 | inter_areas = bbox_pair_x_diff * bbox_pair_y_diff 46 | 47 | #IoU 48 | union_areas = bbox_areas.reshape(region_n,1) + 
bbox_areas.reshape(1,region_n) 49 | 50 | bbox_iou = inter_areas / (union_areas - inter_areas) 51 | return bbox_iou 52 | 53 | def nms(region_info, bbox_th=0.3): 54 | #non-maximum surpression 55 | region_info.sort(key = lambda x: -x['log_prob']) 56 | #keep_index = [] 57 | region_n = len(region_info) 58 | #fast computation of pairwise IoU 59 | #pick the bbox of last timestep of each sample 60 | #print 'region_info length %d' % len(region_info) 61 | all_bboxes = np.array([x['location'][-1,:] for x in region_info])# nx4 matrix 62 | bbox_iou = get_bbox_iou_matrix(all_bboxes) 63 | bbox_iou_th = bbox_iou < bbox_th 64 | keep_flag = np.ones((region_n),dtype=np.uint8) 65 | 66 | for i in xrange(region_n-1): 67 | if keep_flag[i]: 68 | keep_flag[i+1:] = np.logical_and(keep_flag[i+1:], bbox_iou_th[i,i+1:]) 69 | print 'sum of keep flag' 70 | print keep_flag.sum() 71 | return [region_info[i] for i in xrange(region_n) if keep_flag[i]] 72 | 73 | def region_merge(region_info, bbox_th=0.7): 74 | #merging ground truth bboxes 75 | 76 | #keep_index = [] 77 | region_n = len(region_info) 78 | region_merged = [] 79 | #fast computation of pairwise IoU 80 | #pick the bbox of last timestep of each sample 81 | all_bboxes = np.array([x['location'] for x in region_info], dtype = np.float32)# nx4 matrix 82 | bbox_iou = get_bbox_iou_matrix(all_bboxes) 83 | bbox_iou_th = bbox_iou > bbox_th 84 | bbox_iou_overlap_n = bbox_iou_th.sum(axis = 0) 85 | 86 | merge_flag = np.ones((region_n),dtype=np.uint8) 87 | unmerged_region = region_n 88 | while unmerged_region > 0: 89 | max_overlap_id = np.argmax(bbox_iou_overlap_n) 90 | assert bbox_iou_overlap_n[max_overlap_id] > 0 91 | merge_group = np.nonzero(bbox_iou_th[max_overlap_id,:] & merge_flag)[0] 92 | unmerged_region -= len(merge_group) 93 | merge_flag[merge_group] = 0 94 | bbox_iou_overlap_n[merge_group] = 0 95 | bbox_group = all_bboxes[merge_group,:].reshape(len(merge_group),4) 96 | caption_group = [region_info[i]['caption'] for i in merge_group] 97 | bbox_mean = np.mean(bbox_group, axis = 0).tolist() 98 | region_merged.append({'image_id':region_info[max_overlap_id]['image_id'], \ 99 | 'captions': caption_group, 'location': bbox_mean}) 100 | return region_merged 101 | 102 | -------------------------------------------------------------------------------- /lib/utils/blob.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | """Blob helper functions.""" 9 | 10 | import numpy as np 11 | import cv2 12 | 13 | 14 | def im_list_to_blob(ims): 15 | """Convert a list of images into a network input. 16 | 17 | Assumes images are already prepared (means subtracted, BGR order, ...). 18 | """ 19 | max_shape = np.array([im.shape for im in ims]).max(axis=0) 20 | num_images = len(ims) 21 | blob = np.zeros((num_images, max_shape[0], max_shape[1], 3), 22 | dtype=np.float32) 23 | for i in xrange(num_images): 24 | im = ims[i] 25 | blob[i, 0:im.shape[0], 0:im.shape[1], :] = im 26 | # Move channels (axis 3) to axis 1 27 | # Axis order will become: (batch elem, channel, height, width) 28 | # TODO: check out if we need transpose here. 
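    # Note: TensorFlow's conv layers expect NHWC by default, which is exactly
    # the (batch, height, width, channel) layout built above, so no transpose
    # is actually required here.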
29 | # For now, we stick to the tf_faster_rcnn version 30 | # channel_swap = (0, 3, 1, 2) 31 | # blob = blob.transpose(channel_swap) 32 | return blob 33 | 34 | 35 | def prep_im_for_blob(im, pixel_means, target_size, max_size): 36 | """Mean subtract and scale an image for use in a blob.""" 37 | im = im.astype(np.float32, copy=False) 38 | im -= pixel_means 39 | im_shape = im.shape 40 | im_size_min = np.min(im_shape[0:2]) 41 | im_size_max = np.max(im_shape[0:2]) 42 | im_scale = float(target_size) / float(im_size_min) 43 | # Prevent the biggest axis from being more than MAX_SIZE 44 | if np.round(im_scale * im_size_max) > max_size: 45 | im_scale = float(max_size) / float(im_size_max) 46 | im = cv2.resize(im, None, None, fx=im_scale, fy=im_scale, 47 | interpolation=cv2.INTER_LINEAR) 48 | 49 | return im, im_scale 50 | -------------------------------------------------------------------------------- /lib/utils/debug.py: -------------------------------------------------------------------------------- 1 | ### Functions in this file are for debugging purpose 2 | ### Linjie Yang 3 | 4 | import numpy as np 5 | 6 | def softmax(x): 7 | """Compute softmax values for each sets of scores in x.""" 8 | # defalut: last dimension of x is the score dimension 9 | axis = len(x.shape) - 1 10 | x = x - x.max(axis = axis, keepdims=True) 11 | sf = np.exp(x) 12 | sf = sf / np.sum(sf, axis=axis, keepdims=True) 13 | return sf -------------------------------------------------------------------------------- /lib/utils/timer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import time 9 | 10 | class Timer(object): 11 | """A simple timer.""" 12 | def __init__(self): 13 | self.total_time = 0. 14 | self.calls = 0 15 | self.start_time = 0. 16 | self.diff = 0. 17 | self.average_time = 0. 
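        # total_time and calls feed average_time, the running mean that
        # toc(average=True) reports; toc(average=False) returns only the
        # last measured interval.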
18 | 19 | def tic(self): 20 | # using time.time instead of time.clock because time time.clock 21 | # does not normalize for multithreading 22 | self.start_time = time.time() 23 | 24 | def toc(self, average=True): 25 | self.diff = time.time() - self.start_time 26 | self.total_time += self.diff 27 | self.calls += 1 28 | self.average_time = self.total_time / self.calls 29 | if average: 30 | return self.average_time 31 | else: 32 | return self.diff 33 | -------------------------------------------------------------------------------- /lib/utils/visualization.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Tensorflow Faster R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Xinlei Chen 5 | # -------------------------------------------------------- 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import pdb 11 | import numpy as np 12 | import numpy.random as npr 13 | from six.moves import range 14 | from lib.config import cfg 15 | import PIL.Image as Image 16 | import PIL.ImageColor as ImageColor 17 | import PIL.ImageDraw as ImageDraw 18 | import PIL.ImageFont as ImageFont 19 | from lib.fast_rcnn.nms_wrapper import nms 20 | from lib.fast_rcnn.bbox_transform import clip_boxes, bbox_transform_inv 21 | 22 | STANDARD_COLORS = [ 23 | 'AliceBlue', 'Chartreuse', 'Aqua', 'Aquamarine', 'Azure', 'Beige', 'Bisque', 24 | 'BlanchedAlmond', 'BlueViolet', 'BurlyWood', 'CadetBlue', 'AntiqueWhite', 25 | 'Chocolate', 'Coral', 'CornflowerBlue', 'Cornsilk', 'Crimson', 'Cyan', 26 | 'DarkCyan', 'DarkGoldenRod', 'DarkGrey', 'DarkKhaki', 'DarkOrange', 27 | 'DarkOrchid', 'DarkSalmon', 'DarkSeaGreen', 'DarkTurquoise', 'DarkViolet', 28 | 'DeepPink', 'DeepSkyBlue', 'DodgerBlue', 'FireBrick', 'FloralWhite', 29 | 'ForestGreen', 'Fuchsia', 'Gainsboro', 'GhostWhite', 'Gold', 'GoldenRod', 30 | 'Salmon', 'Tan', 'HoneyDew', 'HotPink', 'IndianRed', 'Ivory', 'Khaki', 31 | 'Lavender', 'LavenderBlush', 'LawnGreen', 'LemonChiffon', 'LightBlue', 32 | 'LightCoral', 'LightCyan', 'LightGoldenRodYellow', 'LightGray', 'LightGrey', 33 | 'LightGreen', 'LightPink', 'LightSalmon', 'LightSeaGreen', 'LightSkyBlue', 34 | 'LightSlateGray', 'LightSlateGrey', 'LightSteelBlue', 'LightYellow', 'Lime', 35 | 'LimeGreen', 'Linen', 'Magenta', 'MediumAquaMarine', 'MediumOrchid', 36 | 'MediumPurple', 'MediumSeaGreen', 'MediumSlateBlue', 'MediumSpringGreen', 37 | 'MediumTurquoise', 'MediumVioletRed', 'MintCream', 'MistyRose', 'Moccasin', 38 | 'NavajoWhite', 'OldLace', 'Olive', 'OliveDrab', 'Orange', 'OrangeRed', 39 | 'Orchid', 'PaleGoldenRod', 'PaleGreen', 'PaleTurquoise', 'PaleVioletRed', 40 | 'PapayaWhip', 'PeachPuff', 'Peru', 'Pink', 'Plum', 'PowderBlue', 'Purple', 41 | 'Red', 'RosyBrown', 'RoyalBlue', 'SaddleBrown', 'Green', 'SandyBrown', 42 | 'SeaGreen', 'SeaShell', 'Sienna', 'Silver', 'SkyBlue', 'SlateBlue', 43 | 'SlateGray', 'SlateGrey', 'Snow', 'SpringGreen', 'SteelBlue', 'GreenYellow', 44 | 'Teal', 'Thistle', 'Tomato', 'Turquoise', 'Violet', 'Wheat', 'White', 45 | 'WhiteSmoke', 'Yellow', 'YellowGreen' 46 | ] 47 | 48 | NUM_COLORS = len(STANDARD_COLORS) 49 | 50 | try: 51 | FONT = ImageFont.truetype('arial.ttf', 24) 52 | except IOError: 53 | FONT = ImageFont.load_default() 54 | 55 | 56 | def _draw_single_box(image, xmin, ymin, xmax, ymax, display_str, font, color='black', thickness=4): 57 | draw = ImageDraw.Draw(image) 58 | (left, 
right, top, bottom) = (xmin, xmax, ymin, ymax) 59 | draw.line([(left, top), (left, bottom), (right, bottom), 60 | (right, top), (left, top)], width=thickness, fill=color) 61 | text_bottom = bottom 62 | # Reverse list and print from bottom to top. 63 | text_width, text_height = font.getsize(display_str) 64 | margin = np.ceil(0.05 * text_height) 65 | draw.rectangle( 66 | [(left, text_bottom - text_height - 2 * margin), (left + text_width, 67 | text_bottom)], 68 | fill=color) 69 | draw.text( 70 | (left + margin, text_bottom - text_height - margin), 71 | display_str, 72 | fill='black', 73 | font=font) 74 | 75 | return image 76 | 77 | 78 | def draw_bounding_boxes(image, gt_boxes, im_info, phrases): 79 | 80 | num_boxes = gt_boxes.shape[0] 81 | gt_boxes_new = gt_boxes.copy() 82 | gt_boxes_new[:, :4] = np.round(gt_boxes_new[:, :4].copy() / im_info[2]) 83 | disp_image = Image.fromarray(np.uint8(image[0])) 84 | 85 | # show several(10) boxes for debugging 86 | show_ids = npr.choice(np.arange(num_boxes), size=5, replace=False) 87 | vocab_path = '%s/vocabulary.txt' % cfg.CACHE_DIR 88 | with open(vocab_path, 'r') as f: 89 | vocab = [line.strip() for line in f] 90 | # vocab_extra = ['', '', ''] 91 | # for ex in vocab_extra: 92 | # vocab.insert(0, ex) 93 | for idx, i in enumerate(show_ids): 94 | # this_class = int(gt_boxes_new[i, 4]) 95 | # phrase = phrases[i] if len(phrases[i]) < cfg.TIME_STEPS else phrases[1:] 96 | # for adding gt bounding box 97 | if len(phrases[i]) < cfg.TIME_STEPS: 98 | phrase = phrases[i] 99 | # for adding predicted boxes 100 | else: 101 | phrase = [] 102 | # phrases[i][1:] to remove the token 103 | for p in phrases[i]: 104 | if p == cfg.END_INDEX: 105 | break 106 | phrase.append(p) 107 | 108 | caption = ' '.join([vocab[j - 3] if j - 3 >= 0 else "" for j 109 | in phrase]) 110 | # caption = " ".join([vocab[j] for j in phrase[i]) 111 | disp_image = _draw_single_box(disp_image, 112 | gt_boxes_new[i, 0], 113 | gt_boxes_new[i, 1], 114 | gt_boxes_new[i, 2], 115 | gt_boxes_new[i, 3], 116 | '%s_%s' % (i, caption), 117 | FONT, 118 | color=STANDARD_COLORS[idx % NUM_COLORS]) 119 | 120 | image[0, :] = np.array(disp_image) 121 | return image 122 | 123 | 124 | def draw_densecap(image, scores, rois, im_info, cap_probs, bbox_pred): 125 | """ 126 | bbox_pred: [None, 4] 127 | rois: [None, 5] 128 | 129 | """ 130 | # for bbox unnormalization 131 | 132 | bbox_mean = np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS).reshape((1, 4)) 133 | bbox_stds = np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS).reshape((1, 4)) 134 | 135 | boxes = rois[:, 1:5] / im_info[2] 136 | # [None, 12] 137 | cap_ids = np.argmax(cap_probs, axis=1).reshape((-1, cfg.TIME_STEPS)) 138 | 139 | # bbox target unnormalization 140 | box_deltas = bbox_pred * bbox_stds + bbox_mean 141 | 142 | # do the transformation 143 | pred_boxes = bbox_transform_inv(boxes, box_deltas) 144 | pred_boxes = clip_boxes(pred_boxes, image.shape) 145 | 146 | pos_dets = np.hstack((pred_boxes, scores[:, 1][:, np.newaxis])).astype(np.float32, copy=False) 147 | keep = nms(pos_dets, cfg.TEST.NMS) 148 | pos_boxes = boxes[keep, :] 149 | cap_ids = cap_ids[keep, :] 150 | im_info[2] = 1. 
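    # The scale factor is reset to 1 because `boxes` were already mapped back
    # to the original image resolution above (rois[:, 1:5] / im_info[2]);
    # draw_bounding_boxes divides by im_info[2] again, so keeping the real
    # scale would shrink the boxes a second time.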
151 | img_cap = draw_bounding_boxes(image, pos_boxes, im_info, cap_ids) 152 | 153 | return img_cap 154 | -------------------------------------------------------------------------------- /logs/densecap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/InnerPeace-Wu/densecap-tensorflow/2c77998f73832d5bb7324e97e8a7419cbce8398c/logs/densecap.png -------------------------------------------------------------------------------- /logs/funny.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/InnerPeace-Wu/densecap-tensorflow/2c77998f73832d5bb7324e97e8a7419cbce8398c/logs/funny.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Cython>=0.19.2 2 | opencv-python>=3.3.0 3 | numpy>=1.7.1 4 | scipy>=0.13.2 5 | scikit-image>=0.9.3 6 | matplotlib>=1.3.1 7 | ipython>=3.0.0 8 | pyyaml>=3.10 9 | Pillow>=2.3.0 10 | easydict>=1.6 11 | ijson>=2.3 12 | tqdm>=4.17.1 13 | -------------------------------------------------------------------------------- /scripts/dense_cap_config.yml: -------------------------------------------------------------------------------- 1 | EXP_DIR: DenseCap 2 | DEBUG_ALL: False 3 | ALL_TEST: False 4 | ALL_TEST_NUM_TRAIN: 100 5 | ALL_TEST_NUM_VAL: 100 6 | ALL_TEST_NUM_TEST: 1000 7 | LIMIT_RAM: True 8 | EMBED_DIM: 512 9 | CONTEXT_FUSION: False 10 | INIT_BY_GLOVE: False 11 | KEEP_AS_GLOVE_DIM: False 12 | GLOVE_DIM: 300 13 | TRAIN: 14 | HAS_RPN: True 15 | IMS_PER_BATCH: 1 16 | BBOX_NORMALIZE_TARGETS_PRECOMPUTED: True 17 | RPN_POSITIVE_OVERLAP: 0.7 18 | SUMMARY_INTERVAL: 10 19 | RPN_BATCHSIZE: 256 20 | BATCH_SIZE: 256 21 | PROPOSAL_METHOD: gt 22 | BG_THRESH_LO: 0.0 23 | FG_FRACTION: 0.5 24 | RPN_NMS_THRESH: 0.7 25 | MAX_SIZE: 720 26 | USE_FLIPPED: True 27 | LR_DIY_DECAY: True 28 | STEPSIZE: [100000] 29 | WEIGHT_INITIALIZER: normal 30 | DISPLAY: 10 31 | # EXP_DECAY_RATE: 0.5 32 | # EXP_DECAY_STEPS: 500 33 | RESNET: 34 | FIXED_BLOCKS: 1 35 | TEST: 36 | HAS_RPN: True 37 | RPN_NMS_THRESH: 0.6 38 | NMS: 0.5 39 | RPN_POST_NMS_TOP_N: 300 40 | MAX_SIZE: 720 41 | -------------------------------------------------------------------------------- /scripts/dense_cap_demo.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Run with: 4 | # bash scripts/dense_cap_demo.sh [ckpt_path] [vocab_path] 5 | 6 | set -x 7 | set -e 8 | 9 | ckpt=$1 10 | vocab=$2 11 | 12 | # For my own experiment usage, just ignore it. 13 | if [ -d '/home/joe' ]; then 14 | ckpt='/home/joe/git/densecap/output/dc_context/vg_1.2_train' 15 | vocab='/home/joe/git/visual_genome/1.2/vocabulary.txt' 16 | fi 17 | 18 | time python ./tools/demo.py \ 19 | --ckpt ${ckpt} \ 20 | --cfg scripts/dense_cap_config.yml \ 21 | --vocab ${vocab} \ 22 | --set TEST.USE_BEAM_SEARCH False EMBED_DIM 512 TEST.LN_FACTOR 1. 
TEST.RPN_NMS_THRESH 0.7 TEST.NMS 0.3 23 | -------------------------------------------------------------------------------- /scripts/dense_cap_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # -------------------------------------------------------- 4 | # DenseCap-Tensorflow 5 | # Written by InnerPeace 6 | # This file is adapted from Ross Linjie's work 7 | # -------------------------------------------------------- 8 | 9 | # TODO: change the test procedure. 10 | set -x 11 | set -e 12 | 13 | GPU_ID=0 14 | CKPT=$1 15 | TEST_IMDB=$2 16 | 17 | 18 | # Fro valohai platform, maybe out of date. 19 | if [ -d '/valohai/outputs' ]; then 20 | CKPT="./output/Densecap_res50_context_all/vg_1.2_train" 21 | fi 22 | 23 | # For my own experiment, just ignore it. 24 | if [ -d '/home/joe' ]; then 25 | CKPT="/home/joe/git/densecap/output/dc_tune_context/vg_1.2_train" 26 | TEST_IMDB="vg_1.2_test" 27 | fi 28 | 29 | LOG="logs/test_log.txt.`date +'%Y-%m-%d_%H-%M-%S'`" 30 | exec &> >(tee -a "$LOG") 31 | echo Logging output to "$LOG" 32 | 33 | time python ./tools/test_net.py \ 34 | --ckpt ${CKPT} \ 35 | --imdb ${TEST_IMDB} \ 36 | --cfg scripts/dense_cap_config.yml \ 37 | --set ALL_TEST True 38 | -------------------------------------------------------------------------------- /scripts/dense_cap_train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Run with: 4 | # bash scripts/dense_cap_train.sh [dataset] [net] [ckpt_to_init] [data_dir] [step] 5 | 6 | set -x 7 | set -e 8 | 9 | export PYTHONUNBUFFERED='True' 10 | 11 | DATASET=$1 12 | NET=$2 13 | ckpt_path=$3 14 | data_dir=$4 15 | step=$5 16 | 17 | # For my own experiment usage, just ignore it. 18 | if [ -d '/home/joe' ]; then 19 | DATASET='visual_genome_1.2' 20 | NET='res50' 21 | ckpt_path="experiments/random_fixconv_i85k_171219/dc_fixed_1219/vg_1.2_train" 22 | # ckpt_path="experiments/rd_fixconv_i165k_171221/dc_conv_fixed/vg_1.2_train" 23 | # ckpt_path='/home/joe/git/slim_models/res50.ckpt' 24 | data_dir='/home/joe/git/visual_genome' 25 | fi 26 | 27 | case $DATASET in 28 | visual_genome) 29 | TRAIN_IMDB="vg_1.0_train" 30 | TEST_IMDB="vg_1.0_val" 31 | PT_DIR="dense_cap" 32 | FINETUNE_AFTER1=200000 33 | FINETUNE_AFTER2=100000 34 | ITERS1=400000 35 | ITERS2=300000 36 | ;; 37 | visual_genome_1.2) 38 | TRAIN_IMDB="vg_1.2_train" 39 | TEST_IMDB="vg_1.2_val" 40 | PT_DIR="dense_cap" 41 | FINETUNE_AFTER1=200000 42 | FINETUNE_AFTER2=100000 43 | ITERS1=400000 44 | ITERS2=300000 45 | ;; 46 | *) 47 | echo "No dataset given" 48 | exit 49 | ;; 50 | esac 51 | 52 | # This is for valohai computing platform, one can just ignore it. 
53 | if [ -d '/valohai/outputs' ]; then 54 | ckpt_path='/valohai/inputs/resnet' 55 | data_dir='/valohai/inputs/visual_genome' 56 | LOG="/valohai/outputs/s${step}_${NET}_${TRAIN_IMDB}.txt.`date +'%Y-%m-%d_%H-%M-%S'`" 57 | else 58 | LOG="logs/s${step}_${NET}_${TRAIN_IMDB}.txt.`date +'%Y-%m-%d_%H-%M-%S'`" 59 | fi 60 | 61 | exec &> >(tee -a "$LOG") 62 | echo Logging output to "$LOG" 63 | 64 | # First step, freeze conv nets weights 65 | if [ ${step} -lt '2' ] 66 | then 67 | time python ./tools/train_net.py \ 68 | --weights ${ckpt_path} \ 69 | --imdb ${TRAIN_IMDB} \ 70 | --imdbval ${TEST_IMDB} \ 71 | --iters ${FINETUNE_AFTER1}\ 72 | --cfg scripts/dense_cap_config.yml \ 73 | --data_dir ${data_dir} \ 74 | --net ${NET} \ 75 | --set EXP_DIR dc_conv_fixed CONTEXT_FUSION False RESNET.FIXED_BLOCKS 3 76 | fi 77 | 78 | # Step2: Finetune convnets 79 | NEW_WIGHTS=output/dc_conv_fixed/${TRAIN_IMDB} 80 | if [ ${step} -lt '3' ] 81 | then 82 | time python ./tools/train_net.py \ 83 | --weights ${NEW_WIGHTS} \ 84 | --imdb ${TRAIN_IMDB} \ 85 | --iters `expr ${ITERS1} - ${FINETUNE_AFTER1}` \ 86 | --imdbval ${TEST_IMDB} \ 87 | --cfg scripts/dense_cap_config.yml \ 88 | --data_dir ${data_dir} \ 89 | --net ${NET} \ 90 | --set EXP_DIR dc_tune_conv CONTEXT_FUSION False RESNET.FIXED_BLOCKS 1 TRAIN.LEARNING_RATE 0.00025 91 | fi 92 | 93 | # Step3: train with contex fusion 94 | NEW_WIGHTS=output/dc_tune_conv/${TRAIN_IMDB} 95 | if [ ${step} -lt '4' ] 96 | then 97 | time python ./tools/train_net.py \ 98 | --weights ${NEW_WIGHTS} \ 99 | --imdb ${TRAIN_IMDB} \ 100 | --imdbval ${TEST_IMDB} \ 101 | --iters ${FINETUNE_AFTER2} \ 102 | --cfg scripts/dense_cap_config.yml \ 103 | --data_dir ${data_dir} \ 104 | --net ${NET} \ 105 | --set EXP_DIR dc_context CONTEXT_FUSION True RESNET.FIXED_BLOCKS 3 TRAIN.LEARNING_RATE 0.000125 106 | fi 107 | 108 | # Step4: finetune context fusion 109 | NEW_WIGHTS=output/dc_context/${TRAIN_IMDB} 110 | if [ ${step} -lt '5' ] 111 | then 112 | time python ./tools/train_net.py \ 113 | --weights ${NEW_WIGHTS} \ 114 | --imdb ${TRAIN_IMDB} \ 115 | --imdbval ${TEST_IMDB} \ 116 | --iters `expr ${ITERS2} - ${FINETUNE_AFTER2}` \ 117 | --cfg scripts/dense_cap_config.yml \ 118 | --data_dir ${data_dir} \ 119 | --net ${NET} \ 120 | --set EXP_DIR dc_tune_context CONTEXT_FUSION True RESNET.FIXED_BLOCKS 1 TRAIN.LEARNING_RATE 0.0000625 121 | fi 122 | -------------------------------------------------------------------------------- /scripts/old_dense_cap_train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # -------------------------------------------------------- 4 | # DenseCap-Tensorflow 5 | # Written by InnerPeace 6 | # This file is adapted from Ross Linjie's work 7 | # -------------------------------------------------------- 8 | # Script for training dense captioning model with joint inference and visual context 9 | # Do freeze-convnet training first, then finetuning 10 | # Usage: 11 | # ./models/dense_cap/dense_cap_train.sh [GPU_ID] [DATASET] [MODEL_TYPE] [INITIAL_WEIGHTS] [EXTRA_ARGS] 12 | # Example: 13 | # To train a model with joint inference and visual context (late fusion, feature summation) on visual genome 1.0 14 | # TODO: change the example. 
15 | # ./models/dense_cap/dense_cap_train.sh 1 visual_genome late_fusion_sum models/vggnet/vgg16.caffemodel 16 | set -x 17 | set -e 18 | 19 | export PYTHONUNBUFFERED="True" 20 | 21 | GPU_ID=$1 22 | DATASET=$2 23 | MODEL_TYPE=$3 24 | WEIGHTS=$4 25 | array=( $@ ) 26 | len=${#array[@]} 27 | EXTRA_ARGS=${array[@]:4:$len} 28 | EXTRA_ARGS_SLUG=${EXTRA_ARGS// /_} 29 | case $DATASET in 30 | visual_genome) 31 | TRAIN_IMDB="vg_1.0_train" 32 | TEST_IMDB="vg_1.0_val" 33 | PT_DIR="dense_cap" 34 | FINETUNE_AFTER1=200000 35 | FINETUNE_AFTER2=100000 36 | ITERS1=400000 37 | ITERS2=300000 38 | ;; 39 | visual_genome_1.2) 40 | TRAIN_IMDB="vg_1.2_train" 41 | TEST_IMDB="vg_1.2_val" 42 | PT_DIR="dense_cap" 43 | FINETUNE_AFTER1=200000 44 | FINETUNE_AFTER2=100000 45 | ITERS1=400000 46 | ITERS2=300000 47 | ;; 48 | *) 49 | echo "No dataset given" 50 | exit 51 | ;; 52 | esac 53 | GLOG_logtostderr=1 54 | # If training visual context model, need to start with the context-free counterpart 55 | if [ ${MODEL_TYPE} != "joint_inference" ] 56 | then 57 | # TODO: change the options for training 58 | ./tools/train_net.py --gpu ${GPU_ID} \ 59 | --solver models/${PT_DIR}/solver_joint_inference.prototxt \ 60 | --weights ${WEIGHTS} \ 61 | --imdb ${TRAIN_IMDB} \ 62 | --iters ${FINETUNE_AFTER1} \ 63 | --cfg models/${PT_DIR}/dense_cap.yml \ 64 | ${EXTRA_ARGS} 65 | NEW_WEIGHTS=output/dense_cap/${TRAIN_IMDB}/dense_cap_joint_inference_iter_${FINETUNE_AFTER1}.caffemodel 66 | # Finetuning all weights 67 | ./lib/tools/train_net.py --gpu ${GPU_ID} \ 68 | --solver models/${PT_DIR}/solver_joint_inference_finetune.prototxt \ 69 | --weights ${NEW_WEIGHTS} \ 70 | --imdb ${TRAIN_IMDB} \ 71 | --iters `expr ${ITERS1} - ${FINETUNE_AFTER1}` \ 72 | --cfg models/${PT_DIR}/dense_cap.yml \ 73 | ${EXTRA_ARGS} 74 | NEW_WEIGHTS=output/dense_cap/${TRAIN_IMDB}/dense_cap_joint_inference_finetune_iter_`expr ${ITERS1} - ${FINETUNE_AFTER1}`.caffemodel 75 | # Training with convnet weights fixed 76 | ./lib/tools/train_net.py --gpu ${GPU_ID} \ 77 | --solver models/${PT_DIR}/solver_${MODEL_TYPE}.prototxt \ 78 | --weights ${NEW_WEIGHTS} \ 79 | --imdb ${TRAIN_IMDB} \ 80 | --iters ${FINETUNE_AFTER2} \ 81 | --cfg models/${PT_DIR}/dense_cap.yml \ 82 | ${EXTRA_ARGS} 83 | NEW_WEIGHTS=output/dense_cap/${TRAIN_IMDB}/dense_cap_${MODEL_TYPE}_iter_${FINETUNE_AFTER2}.caffemodel 84 | # Finetuning all weights 85 | ./lib/tools/train_net.py --gpu ${GPU_ID} \ 86 | --solver models/${PT_DIR}/solver_${MODEL_TYPE}_finetune.prototxt \ 87 | --weights ${NEW_WEIGHTS} \ 88 | --imdb ${TRAIN_IMDB} \ 89 | --iters `expr ${ITERS2} - ${FINETUNE_AFTER2}` \ 90 | --cfg models/${PT_DIR}/dense_cap.yml \ 91 | ${EXTRA_ARGS} 92 | 93 | else 94 | # Training with convnet weights fixed 95 | ./lib/tools/train_net.py --gpu ${GPU_ID} \ 96 | --solver models/${PT_DIR}/solver_${MODEL_TYPE}.prototxt \ 97 | --weights ${WEIGHTS} \ 98 | --imdb ${TRAIN_IMDB} \ 99 | --iters ${FINETUNE_AFTER1} \ 100 | --cfg models/${PT_DIR}/dense_cap.yml \ 101 | ${EXTRA_ARGS} 102 | NEW_WEIGHTS=output/dense_cap/${TRAIN_IMDB}/dense_cap_${MODEL_TYPE}_iter_${FINETUNE_AFTER1}.caffemodel 103 | # Finetuning all weights 104 | ./lib/tools/train_net.py --gpu ${GPU_ID} \ 105 | --solver models/${PT_DIR}/solver_${MODEL_TYPE}_finetune.prototxt \ 106 | --weights ${NEW_WEIGHTS} \ 107 | --imdb ${TRAIN_IMDB} \ 108 | --iters `expr ${ITERS1} - ${FINETUNE_AFTER1}` \ 109 | --cfg models/${PT_DIR}/dense_cap.yml \ 110 | ${EXTRA_ARGS} 111 | fi 112 | -------------------------------------------------------------------------------- /tests/README.md: 
-------------------------------------------------------------------------------- 1 | ## TEST 2 | Some of the test files during developing, just ignore it. 3 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/InnerPeace-Wu/densecap-tensorflow/2c77998f73832d5bb7324e97e8a7419cbce8398c/tests/__init__.py -------------------------------------------------------------------------------- /tests/architecture_test.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DenseCap-Tensorflow 3 | # Written by InnerPeace 4 | # This file is adapted from Linjie's work 5 | # -------------------------------------------------------- 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | from lib.config import cfg 11 | import tensorflow as tf 12 | from lib.nets.resnet_v1 import resnetv1 13 | from tests.roidata_test import get_data_test 14 | import six 15 | import numpy as np 16 | 17 | 18 | def architecture_test(): 19 | blob = get_data_test() 20 | tf.reset_default_graph() 21 | net = resnetv1(50) 22 | # net._build_network() 23 | net.create_architecture(mode='TEST', tag='pre') 24 | 25 | for n in tf.get_default_graph().as_graph_def().node: 26 | print(n.name) 27 | 28 | tfconfig = tf.ConfigProto(allow_soft_placement=True) 29 | tfconfig.gpu_options.allow_growth = True 30 | 31 | feed_dict = {net._image: blob['data'], 32 | net._im_info: blob['im_info'], 33 | net._gt_boxes: blob['gt_boxes'], 34 | net._gt_phrases: blob['gt_phrases']} 35 | output = net._for_debug 36 | output.update({ 37 | "image": net._image, 38 | "im_info": net._im_info, 39 | "gt_boxes": net._gt_boxes, 40 | "gt_phrases": net._gt_phrases 41 | }) 42 | 43 | with tf.Session(config=tfconfig) as sess: 44 | init = tf.global_variables_initializer() 45 | sess.run(init) 46 | out = sess.run('DenseCap_ResNet50/Prediction/lstm/cap_init_state:0', feed_dict=feed_dict) 47 | print(out.shape) 48 | # out = sess.run(output, feed_dict=feed_dict) 49 | 50 | # for k, v in six.iteritems(out): 51 | # print("name: {} ==> {}".format(k, v.shape)) 52 | # # print("shape: {}".format(v.shape)) 53 | # if k == 'labels': 54 | # # print(v) 55 | # # print("first 5 example:") 56 | # print(v[:5]) 57 | # if k == 'loss' or k == 'total_loss': 58 | # print(k, v) 59 | 60 | 61 | if __name__ == '__main__': 62 | architecture_test() 63 | -------------------------------------------------------------------------------- /tests/bash_log_test/bash_log_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | set -e 5 | 6 | export PYTHONUNBUFFERED="True" 7 | 8 | TAG=$1 9 | 10 | LOG="logs/${TAG}.txt.`date +'%Y-%m-%d_%H-%M-%S'`" 11 | exec &> >(tee -a "$LOG") 12 | echo Logging output to "$LOG" 13 | 14 | time python ./nonsense.py 15 | -------------------------------------------------------------------------------- /tests/bash_log_test/logs/test.txt.2017-10-18_15-33-56: -------------------------------------------------------------------------------- 1 | + echo Logging output to logs/test.txt.2017-10-18_15-33-56 2 | Logging output to logs/test.txt.2017-10-18_15-33-56 3 | + python ./nonsense.py 4 | hello world 5 | 6 | real 0m0.011s 7 | user 0m0.012s 8 | sys 0m0.000s 9 | 
-------------------------------------------------------------------------------- /tests/bash_log_test/nonsense.py: -------------------------------------------------------------------------------- 1 | """test file""" 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | 8 | def main(): 9 | print("hello world") 10 | 11 | 12 | if __name__ == '__main__': 13 | main() 14 | -------------------------------------------------------------------------------- /tests/ckpt_restore_test.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------- 2 | # DenseCap 3 | # Written by InnerPeace 4 | # This file is adapted from Xinlei's work 5 | # ---------------------------------------------- 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import tensorflow as tf 11 | from tensorflow.python import pywrap_tensorflow 12 | import tensorflow.contrib.slim as slim 13 | 14 | from tensorflow.contrib.slim import arg_scope 15 | from tensorflow.contrib.slim.python.slim.nets import resnet_utils 16 | from tensorflow.contrib.slim.python.slim.nets import resnet_v1 17 | from tensorflow.contrib.slim.python.slim.nets.resnet_v1 import resnet_v1_block 18 | import numpy as np 19 | 20 | from lib.config import cfg 21 | 22 | 23 | def resnet_arg_scope(is_training=True, 24 | batch_norm_decay=0.997, 25 | batch_norm_epsilon=1e-5, 26 | batch_norm_scale=True): 27 | batch_norm_params = { 28 | 'is_training': False, 29 | 'decay': batch_norm_decay, 30 | 'epsilon': batch_norm_epsilon, 31 | 'scale': batch_norm_scale, 32 | 'trainable': False, 33 | 'updates_collections': tf.GraphKeys.UPDATE_OPS 34 | } 35 | 36 | with arg_scope( 37 | [slim.conv2d], 38 | # weights_regularizer=slim.l2_regularizer(cfg.TRAIN.WEIGHT_DECAY), 39 | weights_regularizer=None, 40 | weights_initializer=slim.variance_scaling_initializer(), 41 | trainable=is_training, 42 | activation_fn=tf.nn.relu, 43 | normalizer_fn=slim.batch_norm, 44 | normalizer_params=batch_norm_params): 45 | with arg_scope([slim.batch_norm], **batch_norm_params) as arg_sc: 46 | return arg_sc 47 | 48 | 49 | class resnetv1(): 50 | def __init__(self, num_layers=50): 51 | # Network.__init__(self) 52 | self._feat_stride = [16, ] 53 | self._feat_compress = [1. 
/ float(self._feat_stride[0]), ] 54 | self._num_layers = num_layers 55 | self._scope = 'resnet_v1_%d' % num_layers 56 | self._decide_blocks() 57 | 58 | # Do the first few layers manually, because 'SAME' padding can behave inconsistently 59 | # for images of different sizes: sometimes 0, sometimes 1 60 | def _build_base(self): 61 | with tf.variable_scope(self._scope, self._scope): 62 | net = resnet_utils.conv2d_same(self._image, 64, 7, stride=2, scope='conv1') 63 | net = tf.pad(net, [[0, 0], [1, 1], [1, 1], [0, 0]]) 64 | net = slim.max_pool2d(net, [3, 3], stride=2, padding='VALID', scope='pool1') 65 | 66 | return net 67 | 68 | def _image_to_head(self, is_training, reuse=None): 69 | assert (0 <= cfg.RESNET.FIXED_BLOCKS <= 3) 70 | # Now the base is always fixed during training 71 | with slim.arg_scope(resnet_arg_scope(is_training=False)): 72 | net_conv = self._build_base() 73 | if cfg.RESNET.FIXED_BLOCKS > 0: 74 | with slim.arg_scope(resnet_arg_scope(is_training=False)): 75 | net_conv, _ = resnet_v1.resnet_v1(net_conv, 76 | self._blocks[0:cfg.RESNET.FIXED_BLOCKS], 77 | global_pool=False, 78 | include_root_block=False, 79 | reuse=reuse, 80 | scope=self._scope) 81 | if cfg.RESNET.FIXED_BLOCKS < 3: 82 | with slim.arg_scope(resnet_arg_scope(is_training=is_training)): 83 | net_conv, _ = resnet_v1.resnet_v1(net_conv, 84 | self._blocks[cfg.RESNET.FIXED_BLOCKS:-1], 85 | global_pool=False, 86 | include_root_block=False, 87 | reuse=reuse, 88 | scope=self._scope) 89 | 90 | self._act_summaries.append(net_conv) 91 | self._layers['head'] = net_conv 92 | 93 | return net_conv 94 | 95 | def _decide_blocks(self): 96 | # choose different blocks for different number of layers 97 | if self._num_layers == 50: 98 | self._blocks = [resnet_v1_block('block1', base_depth=64, num_units=3, stride=2), 99 | resnet_v1_block('block2', base_depth=128, num_units=4, stride=2), 100 | # use stride 1 for the last conv4 layer 101 | resnet_v1_block('block3', base_depth=256, num_units=6, stride=1), 102 | resnet_v1_block('block4', base_depth=512, num_units=3, stride=1)] 103 | 104 | elif self._num_layers == 101: 105 | self._blocks = [resnet_v1_block('block1', base_depth=64, num_units=3, stride=2), 106 | resnet_v1_block('block2', base_depth=128, num_units=4, stride=2), 107 | # use stride 1 for the last conv4 layer 108 | resnet_v1_block('block3', base_depth=256, num_units=23, stride=1), 109 | resnet_v1_block('block4', base_depth=512, num_units=3, stride=1)] 110 | 111 | elif self._num_layers == 152: 112 | self._blocks = [resnet_v1_block('block1', base_depth=64, num_units=3, stride=2), 113 | resnet_v1_block('block2', base_depth=128, num_units=8, stride=2), 114 | # use stride 1 for the last conv4 layer 115 | resnet_v1_block('block3', base_depth=256, num_units=36, stride=1), 116 | resnet_v1_block('block4', base_depth=512, num_units=3, stride=1)] 117 | 118 | else: 119 | # other numbers are not supported 120 | raise NotImplementedError 121 | 122 | def get_variables_to_restore(self, variables, var_keep_dic): 123 | variables_to_restore = [] 124 | 125 | for v in variables: 126 | # exclude the first conv layer to swap RGB to BGR 127 | if v.name == (self._scope + '/conv1/weights:0'): 128 | self._variables_to_fix[v.name] = v 129 | continue 130 | if v.name.split(':')[0] in var_keep_dic: 131 | print('Variables restored: %s' % v.name) 132 | variables_to_restore.append(v) 133 | 134 | return variables_to_restore 135 | 136 | def fix_variables(self, sess, pretrained_model): 137 | print('Fix Resnet V1 layers..') 138 | with tf.variable_scope('Fix_Resnet_V1') 
as scope: 139 | with tf.device("/cpu:0"): 140 | # fix RGB to BGR 141 | conv1_rgb = tf.get_variable("conv1_rgb", [7, 7, 3, 64], trainable=False) 142 | restorer_fc = tf.train.Saver({self._scope + "/conv1/weights": conv1_rgb}) 143 | restorer_fc.restore(sess, pretrained_model) 144 | 145 | sess.run(tf.assign(self._variables_to_fix[self._scope + '/conv1/weights:0'], 146 | tf.reverse(conv1_rgb, [2]))) 147 | 148 | 149 | def get_variables_in_checkpoint_file(file_name): 150 | try: 151 | reader = pywrap_tensorflow.NewCheckpointReader(file_name) 152 | var_to_shape_map = reader.get_variable_to_shape_map() 153 | return var_to_shape_map 154 | except Exception as e: # pylint: disable=broad-except 155 | print(str(e)) 156 | if "corrupted compressed block contents" in str(e): 157 | print("It's likely that your checkpoint file has been compressed " 158 | "with SNAPPY.") 159 | 160 | 161 | def main(): 162 | ckpt_path = '/home/joe/git/slim_models/resnet_v1_50.ckpt' 163 | var_keep_dic = get_variables_in_checkpoint_file(ckpt_path) 164 | for key in var_keep_dic: 165 | print("tensor_name: ", key) 166 | 167 | 168 | if __name__ == '__main__': 169 | main() 170 | 171 | -------------------------------------------------------------------------------- /tests/dencap_oa_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This script is used for my own experiments, just ignore it. 4 | # Run with: 5 | # bash scripts/dense_cap_train.sh [dataset] [net] [ckpt_to_init] [data_dir] [step] 6 | 7 | set -x 8 | set -e 9 | 10 | export PYTHONUNBUFFERED='True' 11 | 12 | DATASET='visual_genome_1.2' 13 | NET='res50' 14 | ckpt_path='/home/joe/git/slim_models' 15 | data_dir='/home/joe/git/visual_genome' 16 | step=$1 17 | 18 | case $DATASET in 19 | visual_genome) 20 | TRAIN_IMDB="vg_1.0_train" 21 | TEST_IMDB="vg_1.0_val" 22 | PT_DIR="dense_cap" 23 | FINETUNE_AFTER1=200000 24 | FINETUNE_AFTER2=100000 25 | ITERS1=400000 26 | ITERS2=300000 27 | ;; 28 | visual_genome_1.2) 29 | TRAIN_IMDB="vg_1.2_train" 30 | TEST_IMDB="vg_1.2_val" 31 | PT_DIR="dense_cap" 32 | FINETUNE_AFTER1=200000 33 | FINETUNE_AFTER2=100000 34 | ITERS1=400000 35 | ITERS2=300000 36 | ;; 37 | *) 38 | echo "No dataset given" 39 | exit 40 | ;; 41 | esac 42 | 43 | if [ -d '/valohai/outputs' ]; then 44 | ckpt_path='/valohai/inputs/resnet' 45 | data_dir='/valohai/inputs/visual_genome' 46 | LOG="/valohai/outputs/s${step}_${NET}_${TRAIN_IMDB}.txt.`date +'%Y-%m-%d_%H-%M-%S'`" 47 | else 48 | LOG="logs/s${step}_${NET}_${TRAIN_IMDB}.txt.`date +'%Y-%m-%d_%H-%M-%S'`" 49 | fi 50 | 51 | exec &> >(tee -a "$LOG") 52 | echo Logging output to "$LOG" 53 | 54 | FIRST_ITERS=80000 55 | if [ ${step} -lt '2' ] 56 | then 57 | time python ./tools/train_net.py \ 58 | --weights ${ckpt_path}/${NET}.ckpt \ 59 | --imdb ${TRAIN_IMDB} \ 60 | --imdbval ${TEST_IMDB} \ 61 | --iters 50000 \ 62 | --cfg scripts/dense_cap_config.yml \ 63 | --data_dir ${data_dir} \ 64 | --net ${NET} \ 65 | --set TRAIN_GLOVE False EXP_DIR dc_fixed CONTEXT_FUSION False RESNET.FIXED_BLOCKS 3 KEEP_AS_GLOVE_DIM False LOSS.CLS_W 1. LOSS.BBOX_W 0.2 LOSS.RPN_BBOX_W 1. 
LOSS.RPN_CLS_W 0.5 66 | # --set EXP_DIR dc_fixed CONTEXT_FUSION False RESNET.FIXED_BLOCKS 3 67 | 68 | # mkdir output/dc_fixed 69 | # cp -r output/Densecap/ output/dc_dc_fixed 70 | fi 71 | 72 | NEW_WIGHTS=output/dc_fixed/${TRAIN_IMDB} 73 | if [ ${step} -lt '3' ] 74 | then 75 | time python ./tools/train_net.py \ 76 | --weights ${NEW_WIGHTS} \ 77 | --imdb ${TRAIN_IMDB} \ 78 | --iters 30000 \ 79 | --imdbval ${TEST_IMDB} \ 80 | --cfg scripts/dense_cap_config.yml \ 81 | --data_dir ${data_dir} \ 82 | --net ${NET} \ 83 | --set TRAIN_GLOVE True EXP_DIR dc_tune_vec CONTEXT_FUSION False RESNET.FIXED_BLOCKS 3 KEEP_AS_GLOVE_DIM False 84 | # TRAIN.LEARNING_RATE 0.0005 85 | # --iters `expr ${FINETUNE_AFTER1} - ${FIRST_ITERS}` \ 86 | 87 | # mkdir output/dc_tune_vec 88 | # cp -r output/Densecap/ output/dc_tune_vec 89 | fi 90 | 91 | #NEW_WIGHTS=output/dc_tune_vec/${TRAIN_IMDB} 92 | if [ ${step} -lt '4' ] 93 | then 94 | time python ./tools/train_net.py \ 95 | --weights ${NEW_WIGHTS} \ 96 | --imdb ${TRAIN_IMDB} \ 97 | --imdbval ${TEST_IMDB} \ 98 | --iters `expr ${ITERS1} - ${FINETUNE_AFTER1}` \ 99 | --cfg scripts/dense_cap_config.yml \ 100 | --data_dir ${data_dir} \ 101 | --net ${NET} \ 102 | --set EXP_DIR dc_tune_conv CONTEXT_FUSION False RESNET.FIXED_BLOCKS 1 103 | 104 | # mkdir output/dc_tune_conv 105 | # cp -r output/Densecap/ output/dc_tune_conv 106 | fi 107 | 108 | NEW_WIGHTS=output/dc_tune_conv/${TRAIN_IMDB} 109 | if [ ${step} -lt '5' ] 110 | then 111 | time python ./tools/train_net.py \ 112 | --weights ${NEW_WIGHTS} \ 113 | --imdb ${TRAIN_IMDB} \ 114 | --imdbval ${TEST_IMDB} \ 115 | --iters ${FINETUNE_AFTER2} \ 116 | --cfg scripts/dense_cap_config.yml \ 117 | --data_dir ${data_dir} \ 118 | --net ${NET} \ 119 | --set TRAIN_GLOVE True EXP_DIR dc_context CONTEXT_FUSION True RESNET.FIXED_BLOCKS 3 120 | # mkdir output/dc_context 121 | # cp -r output/Densecap/ output/dc_context 122 | # --iters `expr ${FINETUNE_AFTER1} - ${FIRST_ITERS}` 123 | fi 124 | 125 | NEW_WIGHTS=output/dc_context/${TRAIN_IMDB} 126 | if [ ${step} -lt '6' ] 127 | then 128 | time python ./tools/train_net.py \ 129 | --weights ${NEW_WIGHTS} \ 130 | --imdb ${TRAIN_IMDB} \ 131 | --imdbval ${TEST_IMDB} \ 132 | --iters `expr ${ITERS2} - ${FINETUNE_AFTER2}` \ 133 | --cfg scripts/dense_cap_config.yml \ 134 | --data_dir ${data_dir} \ 135 | --net ${NET} \ 136 | --set TRAIN_GLOVE True EXP_DIR dc_tune_context CONTEXT_FUSION True RESNET.FIXED_BLOCKS 1 137 | fi 138 | -------------------------------------------------------------------------------- /tests/logs/architecture_test.txt: -------------------------------------------------------------------------------- 1 | /home/joe/.tf_env2/bin/python /home/joe/git/densecap/tests/architecture_test.py 2 | data_path: /home/joe/git/visual_genome_test/1.2 3 | pre gt roidb could be loaded from /home/joe/git/visual_genome_test/1.2_cache/pre_gt_roidb 4 | LIMIT_RAM version and load index from /home/joe/git/visual_genome_test/1.2_cache/pre_gt_roidb/image_index.json 5 | 6 | /home/joe/git/densecap/lib/fast_rcnn/bbox_transform.py:50: RuntimeWarning: overflow encountered in exp 7 | pred_w = np.exp(dw) * widths[:, np.newaxis] 8 | /home/joe/git/densecap/lib/fast_rcnn/bbox_transform.py:50: RuntimeWarning: overflow encountered in multiply 9 | pred_w = np.exp(dw) * widths[:, np.newaxis] 10 | /home/joe/git/densecap/lib/fast_rcnn/bbox_transform.py:51: RuntimeWarning: overflow encountered in exp 11 | pred_h = np.exp(dh) * heights[:, np.newaxis] 12 | /home/joe/git/densecap/lib/fast_rcnn/bbox_transform.py:51: RuntimeWarning: 
overflow encountered in multiply 13 | pred_h = np.exp(dh) * heights[:, np.newaxis] 14 | 15 | length of labels, i.e. number of regions: 256 16 | sentence data layer input (first 3) 17 | 2239.0 [ 4 87 6 5 85 87 0 0 0 0] 18 | 2239.0 [ 4 87 6 5 85 87 0 0 0 0] 19 | 2239.0 [ 4 87 6 5 85 87 0 0 0 0] 20 | sentence data layer output (first 3) 21 | input sentence 22 | [[ 1. 4. 87. 6. 5. 85. 87. 0. 0. 0. 0.] 23 | [ 1. 4. 87. 6. 5. 85. 87. 0. 0. 0. 0.] 24 | [ 1. 4. 87. 6. 5. 85. 87. 0. 0. 0. 0.]] 25 | target sentence 26 | [[ 1. 4. 87. 6. 5. 85. 87. 2. 0. 0. 0. 0.] 27 | [ 1. 4. 87. 6. 5. 85. 87. 2. 0. 0. 0. 0.] 28 | [ 1. 4. 87. 6. 5. 85. 87. 2. 0. 0. 0. 0.]] 29 | cont sentence 30 | [[ 0. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0.] 31 | [ 0. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0.] 32 | [ 0. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0.]] 33 | cont bbox 34 | [[ 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.] 35 | [ 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.] 36 | [ 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]] 37 | 38 | name: fc7 ==> (256, 2048) 39 | name: image ==> (1, 540, 720, 3) 40 | name: labels ==> (256,) 41 | [3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 42 | 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 43 | 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 44 | 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 45 | 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 46 | 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 47 | 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 48 | 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 49 | 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 50 | 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 51 | 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 52 | 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 53 | 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 54 | 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 55 | 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 56 | 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 57 | 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 58 | 3682] 59 | name: bbox_inside_weights ==> (256, 4) 60 | name: bbox_targets ==> (256, 4) 61 | name: input_sentence ==> (256, 11) 62 | name: rpn ==> (1, 34, 45, 512) 63 | name: rpn_labels ==> (1, 1, 408, 45) 64 | name: cont_bbox ==> (256, 12) 65 | name: bbox_outside_weights ==> (256, 4) 66 | name: target_sentence ==> (256, 12) 67 | name: rpn_bbox_outside_weights ==> (1, 34, 45, 48) 68 | name: pool5 ==> (256, 7, 7, 1024) 69 | name: rpn_bbox_inside_weights ==> (1, 34, 45, 48) 70 | name: proposal_rois ==> (9, 5) 71 | name: head ==> (1, 34, 45, 1024) 72 | name: clss ==> (256,) 73 | name: rpn_cls_score_reshape ==> (1, 408, 45, 2) 74 | name: anchors ==> (18360, 4) 75 | name: cont_sentence ==> (256, 12) 76 | name: cls_prob ==> (256, 2) 77 | name: gt_boxes ==> (262, 5) 78 | name: rpn_bbox_pred ==> (1, 34, 45, 48) 79 | name: rpn_cls_score ==> (1, 34, 45, 24) 80 | name: im_info ==> (3,) 81 | name: phrases ==> (256, 10) 82 | name: rpn_cls_prob ==> (1, 34, 45, 24) 83 | name: gt_phrases ==> (262, 10) 84 | name: rois ==> (256, 5) 85 | name: proposal_rpn_scores ==> (9, 1) 86 | name: rpn_cls_prob_reshape ==> (1, 408, 45, 2) 87 | 
name: rpn_bbox_targets ==> (1, 34, 45, 48) 88 | 89 | 90 | -------------------------------------------------------------------------------- /tests/logs/preprocessing.txt: -------------------------------------------------------------------------------- 1 | split image number: 77398 for split name: train 2 | start loading image meta data json files... 3 | 0.316329 seconds for loading 4 | train: 100%|███████████████████████████| 108077/108077 [03:05<00:00, 581.84it/s] 5 | processing train set with time: 185.75 seconds 6 | there are 272 invalid bboxes out of 3684063 7 | there are 3 empty phrases after triming 8 | Found 56945 unique word tokens. 9 | Using vocabulary size 10000. 10 | The least frequent word in our vocabulary is 'ruff' and appeared 14 times. 11 | Dumping vocabulary to file: /home/joe/git/visual_genome/1.2/vocabulary.txt 12 | Done. 13 | split image number: 5000 for split name: val 14 | start loading image meta data json files... 15 | 0.273385 seconds for loading 16 | val: 100%|████████████████████████████| 108077/108077 [00:20<00:00, 5401.88it/s] 17 | processing val set with time: 20.01 seconds 18 | there are 14 invalid bboxes out of 237362 19 | there are 0 empty phrases after triming 20 | split image number: 5000 for split name: test 21 | start loading image meta data json files... 22 | 0.273840 seconds for loading 23 | test: 100%|███████████████████████████| 108077/108077 [00:20<00:00, 5225.84it/s] 24 | processing test set with time: 20.68 seconds 25 | there are 17 invalid bboxes out of 238069 26 | there are 0 empty phrases after triming -------------------------------------------------------------------------------- /tests/logs/sentence_data_layer_test.txt: -------------------------------------------------------------------------------- 1 | data_path: /home/joe/git/visual_genome_test/1.2 2 | Appending horizontally-flipped training examples... 3 | pre gt roidb loaded from /home/joe/git/visual_genome_test/1.2/pre_gt_roidb.pkl 4 | done 5 | Preparing training data... 6 | done 7 | Filtered 0 roidb entries: 4 -> 4 8 | length of labels, i.e. number of regions: 262 9 | sentence data layer input (first 3) 10 | 1382.0 [ 4 33 6 25 20 144 0 0 0 0] 11 | 1383.0 [167 6 30 4 11 0 0 0 0 0] 12 | 1384.0 [ 7 6 21 72 0 0 0 0 0 0] 13 | sentence data layer output (first 3) 14 | input sentence 15 | [[ 1. 4. 33. 6. 25. 20. 144. 0. 0. 0. 0.] 16 | [ 1. 167. 6. 30. 4. 11. 0. 0. 0. 0. 0.] 17 | [ 1. 7. 6. 21. 72. 0. 0. 0. 0. 0. 0.]] 18 | target sentence 19 | [[ 1. 4. 33. 6. 25. 20. 144. 2. 0. 0. 0. 0.] 20 | [ 1. 167. 6. 30. 4. 11. 2. 0. 0. 0. 0. 0.] 21 | [ 1. 7. 6. 21. 72. 2. 0. 0. 0. 0. 0. 0.]] 22 | cont sentence 23 | [[ 0. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0.] 24 | [ 0. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0.] 25 | [ 0. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0.]] 26 | cont bbox 27 | [[ 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.] 28 | [ 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.] 29 | [ 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 
0.]] -------------------------------------------------------------------------------- /tests/pickle_read_test.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DenseCap-Tensorflow 3 | # Written by InnerPeace 4 | # This file is adapted from Ross Girshick's work 5 | # -------------------------------------------------------- 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | from os.path import join as pjoin 11 | from six.moves import cPickle 12 | 13 | def pickle_test(): 14 | DEFAULT_PATH = '/home/joe/git/visual_genome_test' 15 | cache = pjoin(DEFAULT_PATH, '1.2_cache/pre_gt_roidb', '1.pkl') 16 | cache_flip = pjoin(DEFAULT_PATH, '1.2_cache/pre_gt_roidb', '1_flip.pkl') 17 | ori = pjoin(DEFAULT_PATH, '1.2', 'pre_gt_roidb.pkl') 18 | phra = pjoin(DEFAULT_PATH, '1.2', 'pre_gt_phrases.pkl') 19 | with open(cache, 'rb') as fc: 20 | data_cache = cPickle.load(fc) 21 | with open(cache_flip, 'rb') as f: 22 | data_flip = cPickle.load(f) 23 | with open(ori, 'rb') as fo: 24 | data_ori = cPickle.load(fo) 25 | with open(phra, 'rb') as fp: 26 | data_phra = cPickle.load(fp) 27 | # from IPython import embed; 28 | # embed() 29 | 30 | print(data_cache) 31 | print ('flip------------------') 32 | print(data_flip) 33 | print ('ori------------------') 34 | print(data_ori) 35 | print("data ori length:", len(data_ori)) 36 | print ('phrase------------------') 37 | print (data_phra) 38 | # print (data_phra[2239]) 39 | 40 | 41 | if __name__ == '__main__': 42 | pickle_test() 43 | -------------------------------------------------------------------------------- /tests/read_regions_json/ijson_example.txt: -------------------------------------------------------------------------------- 1 | ('', u'start_array', None) 2 | ('item', u'start_map', None) 3 | ('item', u'map_key', u'regions') 4 | (u'item.regions', u'start_array', None) 5 | (u'item.regions.item', u'start_map', None) 6 | (u'item.regions.item', u'map_key', u'region_id') 7 | (u'item.regions.item.region_id', u'number', 1382) 8 | (u'item.regions.item', u'map_key', u'width') 9 | (u'item.regions.item.width', u'number', 82) 10 | (u'item.regions.item', u'map_key', u'height') 11 | (u'item.regions.item.height', u'number', 139) 12 | (u'item.regions.item', u'map_key', u'image_id') 13 | (u'item.regions.item.image_id', u'number', 1) 14 | (u'item.regions.item', u'map_key', u'phrase') 15 | (u'item.regions.item.phrase', u'string', u'the clock is green in colour') 16 | (u'item.regions.item', u'map_key', u'y') 17 | (u'item.regions.item.y', u'number', 57) 18 | (u'item.regions.item', u'map_key', u'x') 19 | (u'item.regions.item.x', u'number', 421) 20 | (u'item.regions.item', u'end_map', None) 21 | -------------------------------------------------------------------------------- /tests/read_regions_json/read_regions_test.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------- 2 | # DenseCap 3 | # Written by InnerPeace 4 | # ---------------------------------------------- 5 | 6 | """read large region description json files""" 7 | 8 | import ijson 9 | import json 10 | # import tqdm 11 | 12 | def read_regions( ): 13 | VG_VERSION = '1.2' 14 | VG_PATH = '/home/joe/git/VG_raw_data' 15 | VG_REGION_PATH = '%s/%s/region_descriptions.json' % (VG_PATH, VG_VERSION) 16 | # parser = ijson.parse(open('test_region.json')) 17 | parser = 
ijson.parse(open(VG_REGION_PATH)) 18 | 19 | last_value = None 20 | Dic = {} 21 | regions = [] 22 | dic = {} 23 | for prefix, event, value in parser: 24 | if value == 'regions': 25 | Dic = {} 26 | regions = [] 27 | last_value = None 28 | elif last_value == 'id': 29 | Dic['regions'] = regions 30 | Dic['id'] = value 31 | with open('test_id_%s.json' % value, 'w') as f: 32 | json.dump(Dic, f) 33 | break 34 | elif event == 'map_key': 35 | last_value = value 36 | elif event == 'end_map': 37 | regions.append(dic) 38 | dic = {} 39 | last_value = None 40 | elif last_value: 41 | dic[last_value] = value 42 | 43 | 44 | def equal_test( ): 45 | new = json.load(open('true_id_1_out.json')) 46 | old = json.load(open('true_id_1.json')) 47 | if old == new: 48 | print('success!') 49 | else: 50 | print('ERROR!') 51 | 52 | '''OUT: success!''' 53 | 54 | 55 | def json_line_read( ): 56 | '''This is not working''' 57 | 58 | with open('true_id_1.json', 'r') as f: 59 | for line in f: 60 | print(line) 61 | 62 | 63 | def read_time_test( ): 64 | path = '/home/joe/git/visual_genome_test/1.2/pre_gt_regions/1.json' 65 | import time 66 | tic = time.time() 67 | with open(path, 'r') as f: 68 | data = json.load(f) 69 | toc = time.time() 70 | print ('read time: %s seconds' % (toc - tic)) 71 | 72 | def read_all_regions_test(): 73 | '''it gonna kill my computer''' 74 | from tqdm import tqdm 75 | path = '/home/joe/git/visual_genome/1.2/train_gt_regions/' 76 | split_path = '/home/joe/git/densecap/info/densecap_splits.json' 77 | with open(split_path, 'r') as fid: 78 | img_index = json.load(fid)['train'] 79 | all_regions = {} 80 | for i in tqdm(xrange(len(img_index)), desc='train set'): 81 | idx = img_index[i] 82 | with open(path+'%s.json'%idx, 'r') as f: 83 | all_regions["%s"%idx] = json.load(f) 84 | 85 | if __name__ == '__main__': 86 | # read_regions() 87 | # equal_test() 88 | # json_line_read() 89 | # read_time_test() 90 | read_all_regions_test() 91 | -------------------------------------------------------------------------------- /tests/read_regions_json/test_region.json: -------------------------------------------------------------------------------- 1 | {"regions":[{"region_id": 4091, "width": 396, "height": 293, "image_id": 1, "phrase": "tall buildings with many windows", "y": 6, "x": 396}, {"region_id": 4090, "width": 709, "height": 281, "image_id": 1, "phrase": "brick sidewalk", "y": 315, "x": 81}], "id": 1} -------------------------------------------------------------------------------- /tests/read_regions_json/test_region_out.json: -------------------------------------------------------------------------------- 1 | {"regions": [{"region_id": 4091, "image_id": 1, "height": 293, "width": 396, "x": 396, "y": 6, "phrase": "tall buildings with many windows"}, {"region_id": 4090, "image_id": 1, "height": 281, "width": 709, "x": 81, "y": 315, "phrase": "brick sidewalk"}], "id": 1} -------------------------------------------------------------------------------- /tests/roidata_test.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DenseCap-Tensorflow 3 | # Written by InnerPeace 4 | # -------------------------------------------------------- 5 | from __future__ import absolute_import 6 | from __future__ import division 7 | from __future__ import print_function 8 | 9 | from lib.fast_rcnn.layer import RoIDataLayer 10 | from lib.config import cfg 11 | from lib.datasets.visual_genome import visual_genome 12 | import lib.fast_rcnn.roidb 
as rdl_roidb 13 | import cv2 14 | import numpy as np 15 | from six.moves import xrange 16 | 17 | # cfg.LIMIT_RAM = False 18 | DEFAULT_PATH = '/home/joe/git/visual_genome_test/1.2' 19 | 20 | 21 | # def roidata_test(roidb, num_classes=2): 22 | # data = RoIDataLayer(roidb, num_classes=num_classes) 23 | 24 | def get_training_roidb(imdb): 25 | """Returns a roidb (Region of Interest database) for use in training.""" 26 | if cfg.TRAIN.USE_FLIPPED and not cfg.LIMIT_RAM: 27 | print('Appending horizontally-flipped training examples...') 28 | imdb.append_flipped_images() 29 | print('done') 30 | 31 | print('Preparing training data...') 32 | rdl_roidb.prepare_roidb(imdb) 33 | print('done') 34 | 35 | return imdb.roidb 36 | 37 | 38 | def filter_roidb(roidb): 39 | """Remove roidb entries that have no usable RoIs.""" 40 | 41 | def is_valid(entry): 42 | # Valid images have: 43 | # (1) At least one foreground RoI OR 44 | # (2) At least one background RoI 45 | overlaps = entry['max_overlaps'] 46 | # find boxes with sufficient overlap 47 | fg_inds = np.where(overlaps >= cfg.TRAIN.FG_THRESH)[0] 48 | # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI) 49 | bg_inds = np.where((overlaps < cfg.TRAIN.BG_THRESH_HI) & 50 | (overlaps >= cfg.TRAIN.BG_THRESH_LO))[0] 51 | # image is only valid if such boxes exist 52 | valid = len(fg_inds) > 0 or len(bg_inds) > 0 53 | return valid 54 | 55 | num = len(roidb) 56 | filtered_roidb = [entry for entry in roidb if is_valid(entry)] 57 | num_after = len(filtered_roidb) 58 | print('Filtered {} roidb entries: {} -> {}'.format(num - num_after, 59 | num, num_after)) 60 | return filtered_roidb 61 | 62 | 63 | def vis_regions(im, regions, phrases=None, path='/home/joe/git/VG_raw_data/images_test'): 64 | vocab_path = '%s/vocabulary.txt' % DEFAULT_PATH 65 | with open(vocab_path, 'r') as f: 66 | vocab = [line.strip() for line in f] 67 | 68 | mean_values = np.array([[[102.9801, 115.9465, 122.7717]]]) 69 | im = im + mean_values # offset to original values 70 | 71 | for i in xrange(len(regions)): 72 | if i > 9: 73 | print ('save 10 examples and break out.') 74 | break 75 | bbox = regions[i, :4] 76 | region_id = regions[i, 4] 77 | # position 0,1,2 have been taken 78 | caption = ' '.join([vocab[j - 3] if j-3>=0 else "" for j in phrases[i]]) 79 | im_new = np.copy(im) 80 | cv2.rectangle(im_new, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (0, 0, 255), 2) 81 | cv2.imwrite('%s/%s.jpg' % (path, caption), im_new) 82 | 83 | def get_data_test(): 84 | imdb = visual_genome('pre', '1.2') 85 | if cfg.LIMIT_RAM: 86 | roidb = imdb.roidb 87 | else: 88 | roidb = get_training_roidb(imdb) 89 | roidb = filter_roidb(roidb) 90 | rdata = RoIDataLayer(roidb) 91 | data = rdata.forward() 92 | 93 | return data 94 | 95 | 96 | if __name__ == '__main__': 97 | imdb = visual_genome('pre', '1.2') 98 | if cfg.LIMIT_RAM: 99 | roidb = imdb.roidb 100 | else: 101 | roidb = get_training_roidb(imdb) 102 | roidb = filter_roidb(roidb) 103 | rdata = RoIDataLayer(roidb) 104 | data = rdata.forward() 105 | # data = rdata.forward() 106 | print(data) 107 | regions = data['gt_boxes'] 108 | im = data['data'][0] 109 | phrases = data['gt_phrases'] 110 | vis_regions(im, regions, phrases=phrases) 111 | 112 | # from IPython import embed; 113 | # 114 | # embed() 115 | -------------------------------------------------------------------------------- /tests/sentence_data_layer_test.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # 
DenseCap-Tensorflow 3 | # Written by InnerPeace 4 | # This file is adapted from Linjie's work 5 | # -------------------------------------------------------- 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | from lib.config import cfg 11 | from lib.layers.sentence_data_layer import sentence_data_layer 12 | from tests.roidata_test import get_data_test 13 | import numpy as np 14 | 15 | 16 | def sentence_data_layer_test(): 17 | data = get_data_test() 18 | phrases = data['gt_phrases'] 19 | 20 | labels = data['gt_boxes'][:3, 4] 21 | sentence_data_layer(labels, phrases) 22 | 23 | 24 | if __name__ == '__main__': 25 | 26 | sentence_data_layer_test() 27 | -------------------------------------------------------------------------------- /tests/vh_train_command.sh: -------------------------------------------------------------------------------- 1 | # prepare data 2 | pip install opencv-python 3 | apt-get -y update && apt-get install -y libsm6 libxext6 4 | pip install --upgrade pip 5 | pip install -r requirements.txt 6 | cd /valohai/inputs 7 | tar -xvzf ./vg_data/visual_genome.tar.gz 8 | mv ./valohai/inputs/visual_genome/ ./ 9 | mkdir ./images 10 | unzip image_1/images.zip -d ./images 11 | unzip image_2/images2.zip -d ./images 12 | ls 13 | cd /valohai/repository 14 | cd lib 15 | make 16 | cd .. 17 | bash ./tests/dencap_oa_test.sh {parameters} 18 | tar -czvf /valohai/outputs/output.tar.gz ./output 19 | -------------------------------------------------------------------------------- /tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/InnerPeace-Wu/densecap-tensorflow/2c77998f73832d5bb7324e97e8a7419cbce8398c/tools/__init__.py -------------------------------------------------------------------------------- /tools/_init_paths.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | import sys 3 | 4 | 5 | def add_path(path): 6 | if path not in sys.path: 7 | sys.path.insert(0, path) 8 | 9 | 10 | this_dir = osp.dirname(__file__) 11 | lib_path = osp.join(this_dir, '..') 12 | add_path(lib_path) 13 | -------------------------------------------------------------------------------- /tools/demo.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DenseCap-Tensorflow 3 | # Written by InnerPeace 4 | # This file is adapted from Linjie's work 5 | # -------------------------------------------------------- 6 | # Demo a dense captioning model 7 | # Code adapted from faster R-CNN project 8 | # -------------------------------------------------------- 9 | # Fast R-CNN 10 | # Copyright (c) 2015 Microsoft 11 | # Licensed under The MIT License [see LICENSE for details] 12 | # Written by Ross Girshick 13 | # -------------------------------------------------------- 14 | from __future__ import absolute_import 15 | from __future__ import division 16 | from __future__ import print_function 17 | 18 | """Demo a dense caption model""" 19 | 20 | import _init_paths 21 | from os.path import join as pjoin 22 | import sys 23 | import six 24 | import glob 25 | import argparse 26 | import json 27 | import numpy as np 28 | import tensorflow as tf 29 | 30 | from lib.config import cfg, cfg_from_file, cfg_from_list, get_output_dir, get_output_tb_dir 31 | from lib.datasets.factory import get_imdb 32 | import
lib.datasets.imdb 33 | from lib.dense_cap.train import get_training_roidb, train_net 34 | from lib.dense_cap.test import test_im 35 | from lib.nets.vgg16 import vgg16 36 | from lib.nets.resnet_v1 import resnetv1 37 | import pprint 38 | 39 | 40 | def parse_args(): 41 | """ 42 | Parse input arguments 43 | """ 44 | parser = argparse.ArgumentParser(description='Test a Dense Caption network') 45 | 46 | parser.add_argument('--ckpt', dest='ckpt', 47 | help='initialize with pretrained model weights', 48 | default=None, type=str) 49 | parser.add_argument('--cfg', dest='cfg_file', 50 | help='optional config file', 51 | default=None, type=str) 52 | # TODO: add inception 53 | parser.add_argument('--net', dest='net', 54 | help='vgg16, res50, res101, res152', 55 | default='res50', type=str) 56 | parser.add_argument('--vocab', dest='vocabulary', 57 | help='vocabulary file', 58 | default=None, type=str) 59 | 60 | parser.add_argument('--set', dest='set_cfgs', 61 | help='set config keys', default=None, 62 | nargs=argparse.REMAINDER) 63 | 64 | if len(sys.argv) == 1: 65 | parser.print_help() 66 | sys.exit(1) 67 | 68 | args = parser.parse_args() 69 | return args 70 | 71 | 72 | if __name__ == '__main__': 73 | args = parse_args() 74 | print('------- called with args: --------') 75 | pprint.pprint(args) 76 | 77 | if args.cfg_file is not None: 78 | cfg_from_file(args.cfg_file) 79 | if args.set_cfgs is not None: 80 | cfg_from_list(args.set_cfgs) 81 | 82 | # load network 83 | if args.net == 'vgg16': 84 | net = vgg16() 85 | elif args.net == 'res50': 86 | net = resnetv1(num_layers=50) 87 | elif args.net == 'res101': 88 | net = resnetv1(num_layers=101) 89 | elif args.net == 'res152': 90 | net = resnetv1(num_layers=152) 91 | else: 92 | raise NotImplementedError 93 | 94 | net.create_architecture("TEST", num_classes=1, tag='pre') 95 | vocab = ['', '', ''] 96 | with open(args.vocabulary, 'r') as f: 97 | for line in f: 98 | vocab.append(line.strip()) 99 | 100 | # get the image paths 101 | im_paths = glob.glob('./data/demo/*.jpg') 102 | print(im_paths) 103 | 104 | # read checkpoint file 105 | if args.ckpt: 106 | ckpt = tf.train.get_checkpoint_state(args.ckpt) 107 | else: 108 | raise ValueError 109 | 110 | # set config 111 | tfconfig = tf.ConfigProto(allow_soft_placement=True) 112 | tfconfig.gpu_options.allow_growth = True 113 | 114 | # init session 115 | saver = tf.train.Saver() 116 | with tf.Session(config=tfconfig) as sess: 117 | print('Restored from {}'.format(ckpt.model_checkpoint_path)) 118 | saver.restore(sess, ckpt.model_checkpoint_path) 119 | 120 | # for n in tf.get_default_graph().as_graph_def().node: 121 | # if 'input_feed' in n.name: 122 | # print(n.name) 123 | # for html visualization 124 | pre_results = {} 125 | save_path = './vis/data' 126 | for path in im_paths: 127 | pre_results = test_im(sess, net, path, vocab, pre_results) 128 | 129 | with open(save_path + '/results.json', 'w') as f: 130 | json.dump(pre_results, f) 131 | -------------------------------------------------------------------------------- /tools/test_net.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DenseCap-Tensorflow 3 | # Written by InnerPeace 4 | # This file is adapted from Linjie's work 5 | # -------------------------------------------------------- 6 | # Train a dense captioning model 7 | # Code adapted from faster R-CNN project 8 | # -------------------------------------------------------- 9 | # Fast R-CNN 10 | # Copyright (c) 2015 
Microsoft 11 | # Licensed under The MIT License [see LICENSE for details] 12 | # Written by Ross Girshick 13 | # -------------------------------------------------------- 14 | from __future__ import absolute_import 15 | from __future__ import division 16 | from __future__ import print_function 17 | 18 | """Test a dense caption model""" 19 | import _init_paths 20 | from lib.dense_cap.test import test_net 21 | from lib.config import cfg, cfg_from_file, cfg_from_list 22 | from lib.datasets.factory import get_imdb 23 | import argparse 24 | import pprint 25 | import time 26 | import os 27 | import sys 28 | import tensorflow as tf 29 | from lib.nets.vgg16 import vgg16 30 | from lib.nets.resnet_v1 import resnetv1 31 | 32 | 33 | def parse_args(): 34 | """ 35 | Parse input arguments 36 | """ 37 | parser = argparse.ArgumentParser(description='Test a Fast R-CNN network') 38 | parser.add_argument('--device', dest='device', help='device to use', 39 | default='gpu', type=str) 40 | parser.add_argument('--device_id', dest='device_id', help='device id to use', 41 | default=0, type=int) 42 | parser.add_argument('--tag', dest='tag', 43 | help='tag of the model', 44 | default=None, type=str) 45 | parser.add_argument('--ckpt', dest='ckpt', 46 | help='initialize with pretrained model weights', 47 | default=None, type=str) 48 | parser.add_argument('--cfg', dest='cfg_file', 49 | help='optional config file', 50 | default=None, type=str) 51 | parser.add_argument('--imdb', dest='imdb_name', 52 | help='dataset to test on', 53 | default='vg_1.2_test', type=str) 54 | # TODO: delete extra options 55 | # parser.add_argument('--iters', dest='max_iters', 56 | # help='number of iterations to train', 57 | # default=40000, type=int) 58 | # parser.add_argument('--imdbval', dest='imdbval_name', 59 | # help='dataset to validation on', 60 | # default='vg_1.2_val', type=str) 61 | # parser.add_argument('--rand', dest='randomize', 62 | # help='randomize (do not use a fixed seed)', 63 | # action='store_true') 64 | # TODO: add inception 65 | parser.add_argument('--net', dest='net', 66 | help='vgg16, res50, res101, res152', 67 | default='res50', type=str) 68 | parser.add_argument('--vis', dest='vis', help='visualize detections', 69 | action='store_true') 70 | parser.add_argument('--use_box_at', dest='use_box_at', 71 | help='use predicted box at this time step, default to the last', 72 | default=-1, type=int) 73 | parser.add_argument('--set', dest='set_cfgs', 74 | help='set config keys', default=None, 75 | nargs=argparse.REMAINDER) 76 | 77 | if len(sys.argv) == 1: 78 | parser.print_help() 79 | sys.exit(1) 80 | 81 | args = parser.parse_args() 82 | return args 83 | 84 | 85 | if __name__ == '__main__': 86 | args = parse_args() 87 | 88 | print('Called with args:') 89 | print(args) 90 | 91 | if args.cfg_file is not None: 92 | cfg_from_file(args.cfg_file) 93 | if args.set_cfgs is not None: 94 | cfg_from_list(args.set_cfgs) 95 | 96 | cfg.GPU_ID = args.device_id 97 | 98 | print('Using config:') 99 | pprint.pprint(cfg) 100 | 101 | imdb = get_imdb(args.imdb_name) 102 | # load network 103 | if args.net == 'vgg16': 104 | net = vgg16() 105 | elif args.net == 'res50': 106 | net = resnetv1(num_layers=50) 107 | elif args.net == 'res101': 108 | net = resnetv1(num_layers=101) 109 | elif args.net == 'res152': 110 | net = resnetv1(num_layers=152) 111 | else: 112 | raise NotImplementedError 113 | 114 | net.create_architecture("TEST", num_classes=1, tag='pre') 115 | # read checkpoint file 116 | if args.ckpt: 117 | ckpt = 
tf.train.get_checkpoint_state(args.ckpt) 118 | else: 119 | raise ValueError("NO checkpoint found in {}".format(args.ckpt)) 120 | 121 | # set config 122 | tfconfig = tf.ConfigProto(allow_soft_placement=True) 123 | tfconfig.gpu_options.allow_growth = True 124 | 125 | # init session 126 | saver = tf.train.Saver() 127 | with tf.Session(config=tfconfig) as sess: 128 | print('Restored from {}'.format(ckpt.model_checkpoint_path)) 129 | saver.restore(sess, ckpt.model_checkpoint_path) 130 | 131 | test_net(sess, net, imdb, 132 | vis=args.vis, use_box_at=args.use_box_at) 133 | -------------------------------------------------------------------------------- /valohai.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - step: 4 | name: preprocess data 5 | image: gcr.io/tensorflow/tensorflow:1.3.0-devel-gpu 6 | command: bash ./lib/preprocess.sh {parameters} 7 | inputs: 8 | - name: image_meta 9 | default: http://visualgenome.org/static/data/dataset/image_data.json.zip 10 | - name: regions 11 | default: http://visualgenome.org/static/data/dataset/region_descriptions.json.zip 12 | parameters: 13 | - name: vs 14 | type: float 15 | pass-as: -vs {v} 16 | default: 1.2 17 | - name: path 18 | type: string 19 | pass-as: -p {v} 20 | default: "/valohai/inputs" 21 | - name: output_dir 22 | type: string 23 | pass-as: -od {v} 24 | default: "/valohai/inputs/visual_genome" 25 | - name: max_words 26 | type: integer 27 | pass-as: -mw {v} 28 | default: 10 29 | 30 | - step: 31 | name: download image data 32 | image: gcr.io/tensorflow/tensorflow:1.3.0-devel-gpu 33 | command: bash ./lib/download_data_vh.sh 34 | inputs: 35 | - name: image_1 36 | default: https://cs.stanford.edu/people/rak248/VG_100K_2/images.zip 37 | - name: image_2 38 | default: https://cs.stanford.edu/people/rak248/VG_100K_2/images2.zip 39 | 40 | - step: 41 | name: train model 42 | image: gcr.io/tensorflow/tensorflow:1.3.0-devel-gpu 43 | command: bash ./tests/dencap_oa_test.sh {parameters} 44 | inputs: 45 | - name: vg_data 46 | default: "" 47 | - name: resnet 48 | default: https://drive.google.com/uc?export=download&confirm=aZtH&id=15PxiEp7HP-ZSBG9xHMamZr-zh8iBDeA4 49 | - name: image_1 50 | default: https://cs.stanford.edu/people/rak248/VG_100K_2/images.zip 51 | - name: image_2 52 | default: https://cs.stanford.edu/people/rak248/VG_100K_2/images2.zip 53 | parameters: 54 | - name: iters 55 | type: integer 56 | pass-as: -iters {v} 57 | default: 80000 58 | -------------------------------------------------------------------------------- /vis/README.md: -------------------------------------------------------------------------------- 1 | 2 | ### Visualization interface 3 | 4 | When you run `run_model.lua` with `-output_vis 1` (default) it will write the images and a json struct to this folder's `data/` directory. These can then be viewed with this nice html interface. 5 | 6 | For example, to evaluate a checkpoint on some VG test data: 7 | 8 | ``` 9 | th run_model.lua -checkpoint data/checkpoint.t7 -input_split test -vg_img_root_dir /path/to/visual-genome/images -max_images 10 10 | ``` 11 | 12 | and then start a webbrowser, e.g. `python -m SimpleHTTPServer` and open the `view_results.html` file! 
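For this TensorFlow port, `tools/demo.py` plays the corresponding role: it runs the trained model over the `*.jpg` images in `./data/demo/` and writes `data/results.json` into this folder for the viewer. A minimal invocation sketch (the checkpoint directory and vocabulary path are placeholders for your own setup):

```
python ./tools/demo.py --ckpt /path/to/checkpoint_dir \
                       --cfg scripts/dense_cap_config.yml \
                       --net res50 \
                       --vocab /path/to/vocabulary.txt
```

Then serve this folder, e.g. `python -m SimpleHTTPServer` (Python 2) or `python -m http.server` (Python 3), and open `view_results.html`.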
13 | -------------------------------------------------------------------------------- /vis/style.css: -------------------------------------------------------------------------------- 1 | body { 2 | color: #333; 3 | margin: 0; 4 | padding: 0; 5 | font-family: "HelveticaNeue-Light", "Helvetica Neue Light", "Helvetica Neue", Helvetica, Arial, "Lucida Grande", sans-serif; 6 | font-weight: 300; 7 | } 8 | svg { 9 | border: 1px solid black; 10 | background-color: #FFF; 11 | } 12 | hr { 13 | border: 1px solid black; 14 | } 15 | #wrap { 16 | width:800px; 17 | margin-left: auto; 18 | margin-right: auto; 19 | } 20 | #header { 21 | text-align: center; 22 | } 23 | #image_vis { 24 | background-color: #FFF; 25 | padding: 20px 0px; 26 | } 27 | #image_vis_controls { 28 | text-align: center; 29 | padding: 10px; 30 | background-color: #DDD; 31 | border: 1px solid #999; 32 | margin-bottom: 20px; 33 | } 34 | .bb { 35 | height: 50px; 36 | width: 175px; 37 | margin: 5px; 38 | } 39 | .ddesc { 40 | font-size: 32px; 41 | } 42 | .dcent { 43 | margin-left: auto; 44 | margin-right: auto; 45 | width: 720px; 46 | margin-bottom: 20px; 47 | } 48 | .djust { 49 | text-align: justify; 50 | } -------------------------------------------------------------------------------- /vis/utils.js: -------------------------------------------------------------------------------- 1 | 2 | // helper function to create HSL string from a vector of colors 3 | var renderHSL = function(hsl) { // omg 4 | var ht = Math.min(360, Math.max(0, hsl[0])); 5 | var st = Math.min(100, Math.max(0, hsl[1])); 6 | var lt = Math.min(100, Math.max(0, hsl[2])); 7 | return 'hsl(' + ht + ',' + st + '%,' + lt + '%)'; 8 | } 9 | 10 | // randomly shuffle an array 11 | function shuffle(array) { 12 | var currentIndex = array.length, temporaryValue, randomIndex ; 13 | // While there remain elements to shuffle... 14 | while (0 !== currentIndex) { 15 | // Pick a remaining element... 16 | randomIndex = Math.floor(Math.random() * currentIndex); 17 | currentIndex -= 1; 18 | // And swap it with the current element. 19 | temporaryValue = array[currentIndex]; 20 | array[currentIndex] = array[randomIndex]; 21 | array[randomIndex] = temporaryValue; 22 | } 23 | return array; 24 | } 25 | 26 | // html escaping util 27 | var entityMap = { 28 | "&": "&amp;", 29 | "<": "&lt;", 30 | ">": "&gt;", 31 | '"': '&quot;', 32 | "'": '&#39;', 33 | "/": '&#x2F;' 34 | }; 35 | function escapeHtml(string) { 36 | return String(string).replace(/[&<>"'\/]/g, function (s) { 37 | return entityMap[s]; 38 | }); 39 | } 40 | 41 | 42 | // store colors in a global var because why not 43 | var WAD_COLORS = [ 44 | "rgb(173, 35, 35)", // Red 45 | "rgb(42, 75, 215)", // Blue 46 | "rgb(87, 87, 87)", // Dark Gray 47 | "rgb(29, 105, 20)", // Green 48 | "rgb(129, 74, 25)", // Brown 49 | "rgb(129, 38, 192)", // Purple 50 | "rgb(160, 160, 160)", // Lt Gray 51 | "rgb(129, 197, 122)", // Lt green 52 | "rgb(157, 175, 255)", // Lt blue 53 | "rgb(41, 208, 208)", // Cyan 54 | "rgb(255, 146, 51)", // Orange 55 | "rgb(255, 238, 51)", // Yellow 56 | "rgb(233, 222, 187)", // Tan 57 | "rgb(255, 205, 243)", // Pink 58 | // "rgb(255, 255, 255)", // White 59 | //"rgb(0, 0, 0)", // Black 60 | ]; 61 | 62 | // ---------------------------------------------------------------------------- 63 | // visualization utils 64 | // ---------------------------------------------------------------------------- 65 | 66 | // renders a bounding box and text annotation in svg element elt.
assumes d3js 67 | function renderBox(elt, box, color, width, text) { 68 | if (typeof(width) === 'undefined') width = 1; 69 | elt.append('rect') 70 | .attr('x', box[0]) 71 | .attr('y', box[1]) 72 | .attr('width', box[2]) 73 | .attr('height', box[3]) 74 | .attr('stroke', color) 75 | .attr('fill', 'none') 76 | .attr('stroke-width', width); 77 | if (typeof(text) !== 'undefined' && text != '') { 78 | var t = elt.append('text').text(text) 79 | .attr('x', box[0]).attr('y', box[1]) 80 | .attr('dominant-baseline', 'hanging') 81 | .attr('text-anchor', 'start'); 82 | t = t[0][0]; 83 | var tbox = t.getBBox(); 84 | elt.insert('rect', 'text').attr('fill', color) 85 | .attr('x', tbox.x).attr('y', tbox.y) 86 | .attr('width', tbox.width) 87 | .attr('height', tbox.height); 88 | } 89 | } 90 |
-------------------------------------------------------------------------------- /vis/view_results.html: -------------------------------------------------------------------------------- [view_results.html's markup did not survive text extraction; the recoverable text is the page title "DenseCap results browser" and the on-page hint "Browse the results using the WSAD hotkeys (A,D: prev/next image, W/S: more/less detections)".]
--------------------------------------------------------------------------------
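Usage note (a sketch only; the checkpoint directory below is a placeholder): a model trained with the staged script above can be evaluated on the Visual Genome test split with `tools/test_net.py`, e.g.

```
python ./tools/test_net.py --ckpt /path/to/checkpoint_dir \
                           --cfg scripts/dense_cap_config.yml \
                           --imdb vg_1.2_test \
                           --net res50
```

`--ckpt` is resolved with `tf.train.get_checkpoint_state`, so it should point at a directory containing a TensorFlow checkpoint; `--vis` additionally visualizes detections, and `--use_box_at` selects which time step's predicted box is used (default: the last).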