├── .gitignore ├── LICENSE ├── Note.md ├── README.md ├── __init__.py ├── info ├── __init__.py ├── densecap_splits.json ├── read_regions.py ├── read_splits.py ├── test.txt ├── train.txt └── val.txt ├── lib ├── Makefile ├── __init__.py ├── config.py ├── datasets │ ├── __init__.py │ ├── factory.py │ ├── imdb.py │ └── visual_genome.py ├── dense_cap │ ├── __init__.py │ ├── beam_search.py │ ├── caption_generator.py │ ├── test.py │ ├── train.py │ └── vis_whtml.py ├── download_data_vh.sh ├── fast_rcnn │ ├── __init__.py │ ├── bbox_transform.py │ ├── layer.py │ ├── minibatch.py │ ├── nms_wrapper.py │ └── roidb.py ├── layers │ ├── __init__.py │ ├── anchor_target_layer.py │ ├── generate_anchors.py │ ├── global_roi_layer.py │ ├── proposal_layer.py │ ├── proposal_target_layer.py │ ├── proposal_target_single_class_layer.py │ ├── proposal_top_layer.py │ ├── rois_offset_layer.py │ ├── sentence_data_layer.py │ └── snippets.py ├── limit_ram │ ├── __init__.py │ └── utils.py ├── nets │ ├── __init__.py │ ├── mobilenet_v1.py │ ├── network.py │ ├── resnet_v1.py │ └── vgg16.py ├── nms │ ├── __init__.py │ ├── cpu_nms.c │ ├── cpu_nms.pyx │ ├── gpu_nms.cpp │ ├── gpu_nms.hpp │ ├── gpu_nms.pyx │ ├── nms_kernel.cu │ └── py_cpu_nms.py ├── pre_glove.py ├── preprocess.py ├── preprocess.sh ├── pycocoevalcap │ ├── README │ ├── __init__.py │ ├── bleu │ │ ├── LICENSE │ │ ├── __init__.py │ │ ├── bleu.py │ │ └── bleu_scorer.py │ ├── cider │ │ ├── __init__.py │ │ ├── cider.py │ │ └── cider_scorer.py │ ├── eval.py │ ├── meteor │ │ ├── __init__.py │ │ ├── meteor-1.5.jar │ │ └── meteor.py │ ├── rouge │ │ ├── __init__.py │ │ └── rouge.py │ ├── tokenizer │ │ ├── __init__.py │ │ ├── ptbtokenizer.py │ │ ├── stanford-corenlp-3.4.1.jar │ │ └── tmpGeypfw │ └── vg_eval.py ├── setup.py └── utils │ ├── __init__.py │ ├── bbox.c │ ├── bbox.pyx │ ├── bbox_utils.py │ ├── blob.py │ ├── debug.py │ ├── timer.py │ └── visualization.py ├── logs ├── densecap.png └── funny.png ├── requirements.txt ├── scripts ├── dense_cap_config.yml ├── dense_cap_demo.sh ├── dense_cap_test.sh ├── dense_cap_train.sh └── old_dense_cap_train.sh ├── tests ├── README.md ├── __init__.py ├── architecture_test.py ├── bash_log_test │ ├── bash_log_test.sh │ ├── logs │ │ └── test.txt.2017-10-18_15-33-56 │ └── nonsense.py ├── ckpt_restore_test.py ├── dencap_oa_test.sh ├── logs │ ├── architecture_test.txt │ ├── architecture_test_nodes.txt │ ├── preprocessing.txt │ └── sentence_data_layer_test.txt ├── pickle_read_test.py ├── read_regions_json │ ├── ijson_example.txt │ ├── read_regions_test.py │ ├── test_region.json │ ├── test_region_out.json │ ├── true_id_1.json │ └── true_id_1_out.json ├── roidata_test.py ├── sentence_data_layer_test.py └── vh_train_command.sh ├── tools ├── __init__.py ├── _init_paths.py ├── demo.py ├── test_net.py └── train_net.py ├── valohai.yaml └── vis ├── README.md ├── d3.min.js ├── jquery-1.8.3.min.js ├── style.css ├── utils.js └── view_results.html /.gitignore: -------------------------------------------------------------------------------- 1 | #sublime 2 | *.sublime-workspace 3 | *.sublime-project 4 | #pycharm 5 | .idea/ 6 | data/ 7 | demo/ 8 | experiments/ 9 | 10 | tensorboard/ 11 | output/ 12 | 13 | # Byte-compiled / optimized / DLL files 14 | __pycache__/ 15 | *.py[cod] 16 | *$py.class 17 | 18 | # C extensions 19 | *.so 20 | 21 | # Distribution / packaging 22 | .Python 23 | env/ 24 | build/ 25 | develop-eggs/ 26 | dist/ 27 | downloads/ 28 | eggs/ 29 | .eggs/ 30 | lib64/ 31 | parts/ 32 | sdist/ 33 | var/ 34 | wheels/ 35 | *.egg-info/ 36 | .installed.cfg 
37 | *.egg 38 | 39 | # PyInstaller 40 | # Usually these files are written by a python script from a template 41 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 42 | *.manifest 43 | *.spec 44 | 45 | # Installer logs 46 | pip-log.txt 47 | pip-delete-this-directory.txt 48 | 49 | # Unit test / coverage reports 50 | htmlcov/ 51 | .tox/ 52 | .coverage 53 | .coverage.* 54 | .cache 55 | nosetests.xml 56 | coverage.xml 57 | *.cover 58 | .hypothesis/ 59 | 60 | # Translations 61 | *.mo 62 | *.pot 63 | 64 | # Django stuff: 65 | *.log 66 | local_settings.py 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # celery beat schedule file 88 | celerybeat-schedule 89 | 90 | # SageMath parsed files 91 | *.sage.py 92 | 93 | # dotenv 94 | .env 95 | 96 | # virtualenv 97 | .venv 98 | venv/ 99 | ENV/ 100 | 101 | # Spyder project settings 102 | .spyderproject 103 | .spyproject 104 | 105 | # Rope project settings 106 | .ropeproject 107 | 108 | # mkdocs documentation 109 | /site 110 | 111 | # mypy 112 | .mypy_cache/ 113 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Innerpeace 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Densecap-tensorflow 2 | 3 | Implementation of the CVPR 2017 paper [Dense captioning with joint inference and visual context](https://arxiv.org/abs/1611.06949) by **Linjie Yang, Kevin Tang, Jianchao Yang, Li-Jia Li** 4 | 5 | **WITH CHANGES:** 6 | 1. Borrowed the idea of [Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling](https://arxiv.org/abs/1611.01462) and tied the word vectors and word classifiers during captioning (see the sketch below this list). 7 | 2. Initialized the word vectors and word classifiers with pre-trained 300-dimensional [GloVe](https://nlp.stanford.edu/projects/glove/) word vectors. 8 | 3. Changed the backbone of the framework to ResNet-50. 9 | 4. Added `Beam Search` and `Length Normalization` in test mode (see the note below this list). 10 | 5. 
Add "Limit_RAM" mode when praparing training date since my computer only has RAM with 8G. 11 | 12 |
13 | 14 | 15 |
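A short note on change 4: in `lib/dense_cap/beam_search.py`, each candidate caption is scored by its accumulated log-probability, and when a caption emits the end token and `cfg.TEST.LN_FACTOR > 0` the score is length-normalized as `score = logprob / len(sentence) ** LN_FACTOR`, so that beam search does not systematically prefer shorter captions.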
16 | 17 | **Special thanks to [valohai](https://valohai.com/) for offering computing resources.** 18 | 19 | ## Note 20 | 21 | **Update 2017.12.31** 22 | 23 | * After 500k iterations of training with the configuration of the original paper (except for the tying of word vectors and classifiers), it achieves **mAP 8.296**. 24 | 25 | **Update 2017.12.20** 26 | 27 | * After 1 epoch (80,000 iterations) of training with randomly initialized word vectors (512-d), it achieves **mAP 6.509**. 28 | * After 1 epoch (75,000 iterations) of training with pre-trained GloVe word vectors (300-d), it achieves roughly **mAP 5.5**. 29 | * The complete training process will take almost **10 days** with the computation I have access to, and I have only trained 1 epoch to verify the framework for now. 30 | * The scripts should be compatible with both Python 2.x and 3.x, although I built them under Python 2.7. 31 | * Tested on Ubuntu 16.04 with TensorFlow 1.4, CUDA 8.0 and cuDNN 6.0, on an Nvidia GTX 1060 GPU (LOL...). 32 | 33 | ## Dependencies 34 | 35 | Install the required Python modules with: 36 | 37 | ```commandline 38 | pip install -r lib/requirements.txt 39 | ``` 40 | 41 | **For evaluation, one also needs:** 42 | * Java 1.8.0 43 | * Python 2.7 (according to 44 | [coco-caption](https://github.com/tylin/coco-caption)) 45 | 46 | Install the Java runtime with: 47 | ```commandline 48 | sudo apt-get install openjdk-8-jre 49 | ``` 50 | 51 | ## Preparing data 52 | 53 | ### Download 54 | 55 | [Website of Visual Genome Dataset](http://visualgenome.org/api/v0/api_home.html) 56 | 57 | * Make a new directory `VG` wherever you like. 58 | * Download `images` Part 1 and Part 2, and extract `all (two parts)` to the directory `VG/images`. 59 | * Download `image meta data`, and extract it to the directory `VG/1.2` or `VG/1.0` according to the version you downloaded. 60 | * Download `region descriptions`, and extract them to the directory `VG/1.2` or `VG/1.0` accordingly. 61 | * In the following steps, we refer to **the absolute path** of the directory `VG` as `raw_data_path`, e.g. `/home/user/git/VG`. 62 | 63 | ### Unlimited RAM (more than 16G) 64 | 65 | If you have more than 16G of RAM, you can preprocess the dataset with the following commands. 66 | ```shell 67 | $ cd $ROOT/lib 68 | $ python preprocess.py --version [version] --path [raw_data_path] \ 69 | --output_dir [dir] --max_words [max_len] 70 | ``` 71 | 72 | ### Limit RAM (less than 16G) 73 | 74 | If you have `less than 16G` of RAM: 75 | * First, set up the data path in `info/read_regions.py` accordingly and run the script. It will dump the `regions` of every image into the `REGION_JSON` directory. Processing more than 100k images takes a while, so be patient. 76 | ```shell 77 | $ cd $ROOT/info 78 | $ python read_regions.py --version [version] --vg_path [raw_data_path] 79 | ``` 80 | * In `lib/preprocess.py`, set up the data path accordingly. Running the script will dump the `gt_regions` of every image separately into the `OUTPUT_DIR` directory. 81 | ```shell 82 | $ cd $ROOT/lib 83 | $ python preprocess.py --version [version] --path [raw_data_path] \ 84 | --output_dir [dir] --max_words [max_len] --limit_ram 85 | ``` 86 | 87 | ## Compile local libs 88 | 89 | ```shell 90 | $ cd $ROOT/lib 91 | $ make 92 | ``` 93 | 94 | ## Train 95 | 96 | Add or modify configurations in `$ROOT/scripts/dense_cap_config.yml`; refer to `lib/config.py` for more configuration details. 97 | ```shell 98 | $ cd $ROOT 99 | $ bash scripts/dense_cap_train.sh [dataset] [net] [ckpt_to_init] [data_dir] [step] 100 | ``` 101 | 102 | Parameters: 103 | * dataset: `visual_genome_1.2` or `visual_genome_1.0`. 
104 | * net: res50, res101 105 | * ckpt_to_init: the pretrained model to initialize the network with. Refer to [tf_faster_rcnn](https://github.com/endernewton/tf-faster-rcnn) for more details on the initial weights. 106 | * data_dir: the data directory where you saved the outputs of the `Preparing data` step. 107 | * step: for staged training. 108 | - step 1: fix the convnet weights 109 | - step 2: finetune the convnet weights 110 | - step 3: add context fusion, but fix the convnet weights 111 | - step 4: finetune the whole model. 112 | 113 | ## Demo 114 | 115 | Create a directory `data/demo`: 116 | ```sh 117 | $ mkdir $ROOT/data/demo 118 | ``` 119 | Then put the images to be tested in that directory. 120 | 121 | **Download the pretrained model (500k iterations)** from [Google Drive](https://drive.google.com/file/d/1yoJGXXpeSpQbU-6WpLsMXFLIka7xpTAy/view?usp=sharing) 122 | or [Jbox](https://jbox.sjtu.edu.cn/l/j5EeUN). Then create an `output` 123 | directory under `$ROOT`: 124 | ```sh 125 | $ mkdir $ROOT/output 126 | ``` 127 | Extract the downloaded `ckpt.zip` to the directory `$ROOT/output`, 128 | and run 129 | ```sh 130 | $ cd $ROOT 131 | $ bash scripts/dense_cap_demo.sh ./output/ckpt ./output/ckpt/vocabulary.txt 132 | ``` 133 | or run 134 | ```sh 135 | $ bash scripts/dense_cap_demo.sh [ckpt_path] [vocab_path] 136 | ``` 137 | for your customized checkpoint directory. 138 | 139 | It will create html files in `$ROOT/demo`; just open them in a browser. 140 | Or you can use the web-based visualizer created by [karpathy](https://github.com/karpathy) by running 141 | ```sh 142 | $ cd $ROOT/vis 143 | $ python -m SimpleHTTPServer 8181 144 | ``` 145 | Then point your web browser to [http://localhost:8181/view_results.html](http://localhost:8181/view_results.html). 146 | 147 | ## TODO: 148 | 149 | - [x] preprocess the dataset. 150 | - [x] roi_data_layer & get data well prepared for feeding. 151 | - [x] proposal layer 152 | - [x] sentence data layer 153 | - [x] embedding layer 154 | - [x] get loc loss and caption loss 155 | - [x] overfit a mini-batch 156 | - [x] context fusion 157 | - [x] add experiment results. 158 | 159 | ## References 160 | 161 | * The Faster-RCNN framework is inherited from the repo [tf-faster-rcnn](https://github.com/endernewton/tf-faster-rcnn) by [endernewton](https://github.com/endernewton) 162 | * The official repo of [densecap](https://github.com/linjieyangsc/densecap) 163 | * [Tying Word Vectors and Word Classifiers: A Loss Framework for Language Modeling](https://arxiv.org/abs/1611.01462) 164 | * Official TensorFlow models - "im2txt". 
165 | * Adapted web-based visualizer from [jcjohnson](https://github.com/jcjohnson)'s [densecap repo](https://github.com/jcjohnson/densecap) 166 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/InnerPeace-Wu/densecap-tensorflow/2c77998f73832d5bb7324e97e8a7419cbce8398c/__init__.py -------------------------------------------------------------------------------- /info/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/InnerPeace-Wu/densecap-tensorflow/2c77998f73832d5bb7324e97e8a7419cbce8398c/info/__init__.py -------------------------------------------------------------------------------- /info/read_regions.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------- 2 | # DenseCap 3 | # Written by InnerPeace 4 | # ---------------------------------------------- 5 | from __future__ import absolute_import 6 | from __future__ import division 7 | from __future__ import print_function 8 | 9 | """read large region description json files""" 10 | 11 | import ijson 12 | import json 13 | import sys 14 | import os 15 | import argparse 16 | 17 | parser = argparse.ArgumentParser(description='Preprocessing visual genome') 18 | parser.add_argument('--version', dest='version', type=float, default=1.2, help='the version of visual genome dataset.') 19 | parser.add_argument('--vg_path', dest='vg_path', type=str, default='/home/joe/git/VG_raw_data', help='directory keeping the raw dataset of visual genome') 20 | 21 | args = parser.parse_args() 22 | VG_VERSION = args.version 23 | VG_PATH = args.vg_path 24 | 25 | VG_REGION_PATH = '%s/%s/region_descriptions.json' % (VG_PATH, VG_VERSION) 26 | REGION_JSON = '%s/%s/regions' % (VG_PATH, VG_VERSION) 27 | 28 | 29 | def read_regions(): 30 | if not os.path.exists(REGION_JSON): 31 | os.makedirs(REGION_JSON) 32 | parser = ijson.parse(open(VG_REGION_PATH)) 33 | last_value = None 34 | Dic = {} 35 | regions = [] 36 | dic = {} 37 | count = 0 38 | for prefix, event, value in parser: 39 | sys.stdout.write('>>> %d \r' % count) 40 | sys.stdout.flush() 41 | if value == 'regions': 42 | Dic = {} 43 | regions = [] 44 | last_value = None 45 | elif last_value == 'id' and value: 46 | count += 1 47 | Dic['regions'] = regions 48 | Dic['id'] = value 49 | with open(REGION_JSON + '/%s.json' % value, 'w') as f: 50 | json.dump(Dic, f) 51 | elif event == 'map_key': 52 | last_value = value 53 | elif event == 'end_map': 54 | regions.append(dic) 55 | dic = {} 56 | last_value = None 57 | elif last_value: 58 | dic[last_value] = value 59 | 60 | 61 | if __name__ == '__main__': 62 | read_regions() 63 | -------------------------------------------------------------------------------- /info/read_splits.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------- 2 | # DenseCap 3 | # Written by InnerPeace 4 | # ---------------------------------------------- 5 | from __future__ import absolute_import 6 | from __future__ import division 7 | from __future__ import print_function 8 | 9 | '''Read splits''' 10 | 11 | import json 12 | 13 | def read_splits(): 14 | file = 'densecap_splits.json' 15 | with open(file, 'r') as f: 16 | data = json.load(f) 17 | splits = ['train', 'val', 'test'] 18 | for split in splits: 19 | print("%s set 
has %s examples." % (split, len(data[split]))) 20 | with open(split + '.txt', 'w') as f: 21 | for id in data[split]: 22 | f.write("%s\n" % id) 23 | 24 | 25 | if __name__ == '__main__': 26 | read_splits() 27 | -------------------------------------------------------------------------------- /lib/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | python setup.py build_ext --inplace 3 | rm -rf build 4 | -------------------------------------------------------------------------------- /lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/InnerPeace-Wu/densecap-tensorflow/2c77998f73832d5bb7324e97e8a7419cbce8398c/lib/__init__.py -------------------------------------------------------------------------------- /lib/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/InnerPeace-Wu/densecap-tensorflow/2c77998f73832d5bb7324e97e8a7419cbce8398c/lib/datasets/__init__.py -------------------------------------------------------------------------------- /lib/datasets/factory.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DenseCap-Tensorflow 3 | # Written by InnerPeace 4 | # This file is adapted from Linjie's work 5 | # -------------------------------------------------------- 6 | 7 | """Factory method for easily getting imdbs by name.""" 8 | 9 | __sets = {} 10 | 11 | from visual_genome import visual_genome 12 | 13 | 14 | # Set up visual_genome_ using rpn mode 15 | # for version in ['1.0', '1.2']: 16 | for version in ['1.2']: 17 | for split in ['train', 'val', 'test']: 18 | name = 'vg_{}_{}'.format(version, split) 19 | __sets[name] = (lambda split=split, version=version: 20 | visual_genome(split, version)) 21 | 22 | 23 | def get_imdb(name): 24 | """Get an imdb (image database) by name.""" 25 | if not __sets.has_key(name): 26 | raise KeyError('Unknown dataset: {}'.format(name)) 27 | return __sets[name]() 28 | 29 | 30 | def list_imdbs(): 31 | """List all registered imdbs.""" 32 | return __sets.keys() 33 | -------------------------------------------------------------------------------- /lib/dense_cap/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/InnerPeace-Wu/densecap-tensorflow/2c77998f73832d5bb7324e97e8a7419cbce8398c/lib/dense_cap/__init__.py -------------------------------------------------------------------------------- /lib/dense_cap/beam_search.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DenseCap-Tensorflow 3 | # Written by InnerPeace 4 | # This file is adapted from Linjie's work 5 | # and Google's im2txt project 6 | # -------------------------------------------------------- 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import pdb 12 | import math 13 | from lib.dense_cap.caption_generator import * 14 | import numpy as np 15 | from lib.config import cfg 16 | import tensorflow as tf 17 | from six.moves import xrange 18 | 19 | 20 | def beam_search(sess, net, blobs, im_scales): 21 | # (TODO wu) for now it only works with "concat" mode 22 | # get initial states and rois 23 | if cfg.CONTEXT_FUSION: 24 | cap_state, loc_state, 
scores, \ 25 | rois, gfeat_state = net.feed_image(sess, 26 | blobs['data'], 27 | blobs['im_info'][0]) 28 | all_states = np.concatenate((cap_state, loc_state, gfeat_state), axis=1) 29 | else: 30 | cap_state, loc_state, scores, rois = net.feed_image(sess, blobs['data'], 31 | blobs['im_info'][0]) 32 | all_states = np.concatenate((cap_state, loc_state), axis=1) 33 | 34 | # proposal boxes 35 | boxes = rois[:, 1:5] / im_scales[0] 36 | proposal_n = rois.shape[0] 37 | 38 | all_partial_caps = [] 39 | all_complete_caps = [] 40 | beam_size = cfg.TEST.BEAM_SIZE 41 | for i in xrange(proposal_n): 42 | init_beam = Caption(sentence=[cfg.VOCAB_START_ID], 43 | state=all_states[i], 44 | box_pred=[], 45 | logprob=0.0, 46 | score=0.0, 47 | metadata=[""]) 48 | partial_cap = TopN(beam_size) 49 | partial_cap.push(init_beam) 50 | complete_cap = TopN(beam_size) 51 | all_partial_caps.append(partial_cap) 52 | all_complete_caps.append(complete_cap) 53 | 54 | for j in xrange(cfg.TIME_STEPS - 1): 55 | all_candidates_len = [] 56 | flag = False 57 | for i in xrange(proposal_n): 58 | partial_cap = all_partial_caps[i] 59 | size = partial_cap.size() 60 | all_candidates_len.append(size) 61 | if not size: 62 | continue 63 | partial_cap_list = partial_cap.get_data() 64 | input_feed_i = [c.sentence[-1] for c in partial_cap_list] 65 | state_feed_i = [c.state for c in partial_cap_list] 66 | if not flag: 67 | flag = True 68 | input_feed = np.array(input_feed_i) 69 | state_feed = np.array(state_feed_i) 70 | else: 71 | input_feed = np.concatenate((input_feed, np.array(input_feed_i))) 72 | state_feed = np.concatenate((state_feed, np.array(state_feed_i))) 73 | 74 | if cfg.CONTEXT_FUSION: 75 | cap_feed, loc_feed, gfeat_feed = np.split(state_feed, 3, axis=1) 76 | cap_probs, new_bbox_pred, new_cap_state, new_loc_state, \ 77 | new_gfeat_state = net.inference_step(sess, input_feed, 78 | cap_feed, loc_feed, gfeat_feed) 79 | new_state = np.concatenate((new_cap_state, new_loc_state, new_gfeat_state), 80 | axis=1) 81 | else: 82 | cap_feed, loc_feed = np.split(state_feed, 2, axis=1) 83 | cap_probs, new_bbox_pred, new_cap_state, \ 84 | new_loc_state = net.inference_step(sess, input_feed, 85 | cap_feed, loc_feed) 86 | new_state = np.concatenate((new_cap_state, new_loc_state), axis=1) 87 | 88 | count = 0 89 | for k in xrange(proposal_n): 90 | l = all_candidates_len[k] 91 | if l == 0: 92 | continue 93 | partial_cap = all_partial_caps[k] 94 | complete_cap = all_complete_caps[k] 95 | partial_cap_list = partial_cap.extract() 96 | partial_cap.reset() 97 | softmax_k = cap_probs[count: count + l] 98 | states_k = new_state[count: count + l] 99 | bbox_pred_k = new_bbox_pred[count: count + l] 100 | count += l 101 | for i, par_cap in enumerate(partial_cap_list): 102 | word_probs = softmax_k[i] 103 | state = states_k[i] 104 | bbox_pred = bbox_pred_k[i] 105 | # For this partial caption, get the beam_size most probable next words. 
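# (The loop below expands each surviving partial caption with its `beam_size` most probable next words; candidates that emit the END token are moved to `complete_cap`, with optional length normalization of the score.)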
106 | words_and_probs = list(enumerate(word_probs)) 107 | words_and_probs.sort(key=lambda x: -x[1]) 108 | words_and_probs = words_and_probs[0: beam_size] 109 | # Each next word gives a new partial caption 110 | for w, p in words_and_probs: 111 | if p < 1e-12: 112 | continue # Avoid log(0) 113 | sentence = par_cap.sentence + [w] 114 | logprob = par_cap.logprob + math.log(p) 115 | sc = logprob 116 | box_pred = par_cap.box_pred 117 | box_pred.append(bbox_pred) 118 | if w == cfg.VOCAB_END_ID: 119 | if cfg.TEST.LN_FACTOR > 0: 120 | sc /= len(sentence) ** cfg.TEST.LN_FACTOR 121 | beam = Caption(sentence, state, box_pred, logprob, sc) 122 | complete_cap.push(beam) 123 | else: 124 | beam = Caption(sentence, state, box_pred, logprob, sc) 125 | partial_cap.push(beam) 126 | captions = [] 127 | box_offsets = np.zeros((proposal_n, 4), dtype=np.float32) 128 | for i in xrange(proposal_n): 129 | complete_cap = all_complete_caps[i] 130 | if not complete_cap.size(): 131 | complete_cap = all_partial_caps[i] 132 | caps_i = complete_cap.extract(sort=True) 133 | captions.append(caps_i[0].sentence) 134 | box_offsets[i] = caps_i[0].box_pred[-1] 135 | 136 | return scores, box_offsets, captions, boxes 137 | -------------------------------------------------------------------------------- /lib/dense_cap/vis_whtml.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------- 2 | # DenseCap 3 | # Written by InnerPeace 4 | # ---------------------------------------------- 5 | from __future__ import absolute_import 6 | from __future__ import division 7 | from __future__ import print_function 8 | 9 | import cv2 10 | import os 11 | import numpy as np 12 | from six.moves import xrange 13 | 14 | 15 | def vis_whtml(im_path, im, captions, dets, pre_results=dict(), 16 | thresh=0.5, save_path='./vis/data'): 17 | print("visualizing with pretty html...") 18 | if not os.path.exists(save_path): 19 | os.makedirs(save_path) 20 | 21 | im_name = im_path.split('/')[-1][:-4] 22 | box_xywh = [] 23 | box_caps = [] 24 | scores = [] 25 | for i in xrange(dets.shape[0]): 26 | if dets[i, -1] > thresh: 27 | box_xywh.append(box2xywh(dets[i, :4].tolist())) 28 | box_caps.append(captions[i]) 29 | scores.append(float(dets[i, -1])) 30 | 31 | # save image 32 | im_new = np.copy(im) 33 | cv2.imwrite("%s/%s.jpg" % (save_path, im_name), im_new) 34 | result = {"img_name": "%s.jpg" % im_name, 35 | "scores": scores, 36 | "captions": box_caps, 37 | "boxes": box_xywh} 38 | pre_results["results"] = pre_results.get("results", []) + [result] 39 | 40 | return pre_results 41 | 42 | 43 | def box2xywh(box): 44 | xywh = [] 45 | xywh.extend(box[:2]) 46 | for i in xrange(2): 47 | xywh.append(box[i+2] - box[i]) 48 | 49 | return xywh 50 | -------------------------------------------------------------------------------- /lib/download_data_vh.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | set -x 4 | 5 | cd /valohai/inputs 6 | mv image_1/images.zip image_2/images2.zip /valohai/outputs 7 | -------------------------------------------------------------------------------- /lib/fast_rcnn/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # 
-------------------------------------------------------- 7 | -------------------------------------------------------------------------------- /lib/fast_rcnn/bbox_transform.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | 10 | 11 | def bbox_transform(ex_rois, gt_rois): 12 | ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + 1.0 13 | ex_heights = ex_rois[:, 3] - ex_rois[:, 1] + 1.0 14 | ex_ctr_x = ex_rois[:, 0] + 0.5 * ex_widths 15 | ex_ctr_y = ex_rois[:, 1] + 0.5 * ex_heights 16 | 17 | gt_widths = gt_rois[:, 2] - gt_rois[:, 0] + 1.0 18 | gt_heights = gt_rois[:, 3] - gt_rois[:, 1] + 1.0 19 | gt_ctr_x = gt_rois[:, 0] + 0.5 * gt_widths 20 | gt_ctr_y = gt_rois[:, 1] + 0.5 * gt_heights 21 | 22 | targets_dx = (gt_ctr_x - ex_ctr_x) / ex_widths 23 | targets_dy = (gt_ctr_y - ex_ctr_y) / ex_heights 24 | targets_dw = np.log(gt_widths / ex_widths) 25 | targets_dh = np.log(gt_heights / ex_heights) 26 | 27 | targets = np.vstack( 28 | (targets_dx, targets_dy, targets_dw, targets_dh)).transpose() 29 | return targets 30 | 31 | 32 | def bbox_transform_inv(boxes, deltas): 33 | if boxes.shape[0] == 0: 34 | return np.zeros((0, deltas.shape[1]), dtype=deltas.dtype) 35 | 36 | boxes = boxes.astype(deltas.dtype, copy=False) 37 | 38 | widths = boxes[:, 2] - boxes[:, 0] + 1.0 39 | heights = boxes[:, 3] - boxes[:, 1] + 1.0 40 | ctr_x = boxes[:, 0] + 0.5 * widths 41 | ctr_y = boxes[:, 1] + 0.5 * heights 42 | 43 | dx = deltas[:, 0::4] 44 | dy = deltas[:, 1::4] 45 | dw = deltas[:, 2::4] 46 | dh = deltas[:, 3::4] 47 | 48 | pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis] 49 | pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis] 50 | pred_w = np.exp(dw) * widths[:, np.newaxis] 51 | pred_h = np.exp(dh) * heights[:, np.newaxis] 52 | 53 | pred_boxes = np.zeros(deltas.shape, dtype=deltas.dtype) 54 | # x1 55 | pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w 56 | # y1 57 | pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h 58 | # x2 59 | pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w - 1 # to make it the perfect inversion of bbox_transform 60 | # y2 61 | pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h - 1 # to make it the perfect inversion of bbox_transform 62 | 63 | return pred_boxes 64 | 65 | 66 | def clip_boxes(boxes, im_shape): 67 | """ 68 | Clip boxes to image boundaries. 
69 | """ 70 | 71 | # x1 >= 0 72 | boxes[:, 0::4] = np.maximum(np.minimum(boxes[:, 0::4], im_shape[1] - 1), 0) 73 | # y1 >= 0 74 | boxes[:, 1::4] = np.maximum(np.minimum(boxes[:, 1::4], im_shape[0] - 1), 0) 75 | # x2 < im_shape[1] 76 | boxes[:, 2::4] = np.maximum(np.minimum(boxes[:, 2::4], im_shape[1] - 1), 0) 77 | # y2 < im_shape[0] 78 | boxes[:, 3::4] = np.maximum(np.minimum(boxes[:, 3::4], im_shape[0] - 1), 0) 79 | return boxes 80 | -------------------------------------------------------------------------------- /lib/fast_rcnn/layer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DenseCap-Tensorflow 3 | # Written by InnerPeace 4 | # This file is adapted from Linjie's work and Xinlei's work 5 | # -------------------------------------------------------- 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | from os.path import join as pjoin 11 | from lib.config import cfg 12 | from lib.fast_rcnn.minibatch import get_minibatch 13 | import numpy as np 14 | import time 15 | import json 16 | 17 | 18 | class RoIDataLayer(object): 19 | """densecap data layer used for training.""" 20 | 21 | def __init__(self, roidb, random=False): 22 | """set the roidb to be used by this layer during training.""" 23 | self._roidb = roidb 24 | # set a random flag 25 | self._random = random 26 | self._shuffle_roidb_inds() 27 | 28 | def _shuffle_roidb_inds(self): 29 | """Randomly permute the training roidb.""" 30 | 31 | # if the random flag is set, 32 | # then the database is shuffled according to system time 33 | # useful for the validation set. 34 | if self._random: 35 | st0 = np.random.get_state() 36 | millis = int(round(time.time() * 1000)) % 4294967259 37 | np.random.seed(millis) 38 | 39 | if not cfg.LIMIT_RAM: 40 | # with sending in the giant roidb list 41 | if cfg.TRAIN.ASPECT_GROUPING: 42 | widths = np.array([r['width'] for r in self._roidb]) 43 | heights = np.array([r['height'] for r in self._roidb]) 44 | horz = (widths >= heights) 45 | vert = np.logical_not(horz) 46 | horz_inds = np.where(horz)[0] 47 | vert_inds = np.where(vert)[0] 48 | inds = np.hstack(( 49 | np.random.permutation(horz_inds), 50 | np.random.permutation(vert_inds))) 51 | inds = np.reshape(inds, (-1, 2)) 52 | row_perm = np.random.permutation(np.arange(inds.shape[0])) 53 | inds = np.reshape(inds[row_perm, :], (-1,)) 54 | self._perm = inds 55 | else: 56 | self._perm = np.random.permutation(np.arange(len(self._roidb))) 57 | else: 58 | # LIMIT_RAM and 'roidb' is the path to saved gt_roidbs. 59 | index_path = self._roidb + '/image_index.json' 60 | with open(index_path, 'r') as f: 61 | self._image_index = json.load(f) 62 | print("LIMIT_RAM version and load index from {}".format(index_path)) 63 | self._perm = np.random.permutation(np.arange(len(self._image_index))) 64 | 65 | # restore the random state 66 | if self._random: 67 | np.random.set_state(st0) 68 | 69 | self._cur = 0 70 | 71 | def _get_next_minibatch_inds(self): 72 | """Return the roidb indices for the next minibatch.""" 73 | if self._cur + cfg.TRAIN.IMS_PER_BATCH >= len(self._perm): 74 | self._shuffle_roidb_inds() 75 | 76 | db_inds = self._perm[self._cur:self._cur + cfg.TRAIN.IMS_PER_BATCH] 77 | self._cur += cfg.TRAIN.IMS_PER_BATCH 78 | return db_inds 79 | 80 | def _get_next_minibatch(self): 81 | """Return the blobs to be used for the next minibatch. 
82 | 83 | If cfg.TRAIN.USE_PREFETCH is True, then blobs will be computed in a 84 | separate process and made available through self._blob_queue. 85 | """ 86 | db_inds = self._get_next_minibatch_inds() 87 | if cfg.LIMIT_RAM: 88 | assert len(db_inds) == 1, "LIMIT_RAM version only support one " \ 89 | "image per minibatch." 90 | # it is the exact file path in the 'roidb' directory. 91 | minibatch_db = self._image_index[db_inds[0]] 92 | minibatch_db = pjoin(self._roidb, "%s.pkl" % minibatch_db) 93 | else: 94 | minibatch_db = [self._roidb[i] for i in db_inds] 95 | return get_minibatch(minibatch_db) 96 | 97 | def forward(self): 98 | """Get blobs""" 99 | blobs = self._get_next_minibatch() 100 | return blobs 101 | -------------------------------------------------------------------------------- /lib/fast_rcnn/minibatch.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DenseCap-Tensorflow 3 | # Written by InnerPeace 4 | # This file is adapted from Linjie's work and Xinlei's work 5 | # -------------------------------------------------------- 6 | # Fast R-CNN 7 | # Copyright (c) 2015 Microsoft 8 | # Licensed under The MIT License [see LICENSE for details] 9 | # Written by Ross Girshick 10 | # -------------------------------------------------------- 11 | from __future__ import absolute_import 12 | from __future__ import division 13 | from __future__ import print_function 14 | 15 | """Compute minibatch blobs for training a DenseCap network.""" 16 | 17 | import numpy as np 18 | import numpy.random as npr 19 | import cv2 20 | from six.moves import cPickle, xrange 21 | from lib.config import cfg 22 | from lib.utils.blob import prep_im_for_blob, im_list_to_blob 23 | 24 | 25 | def get_minibatch(roidb): 26 | """Given a roidb, construct a minibatch sampled from it.""" 27 | 28 | if cfg.LIMIT_RAM: 29 | num_images = 1 # one image per minibatch 30 | else: 31 | num_images = len(roidb) 32 | 33 | # Sample random scales to use for each image in this batch 34 | random_scale_inds = npr.randint(0, high=len(cfg.TRAIN.SCALES), 35 | size=num_images) 36 | assert (cfg.TRAIN.BATCH_SIZE % num_images == 0), \ 37 | 'num_images ({}) must divide BATCH_SIZE ({})'. 
\ 38 | format(num_images, cfg.TRAIN.BATCH_SIZE) 39 | 40 | # Get the input image blob, formatted for caffe 41 | im_blob, im_scales, roidb = _get_image_blob(roidb, random_scale_inds) 42 | 43 | blobs = {'data': im_blob} 44 | 45 | if cfg.TRAIN.HAS_RPN: 46 | assert len(im_scales) == 1, "Single batch only" 47 | assert len(roidb) == 1, "Single batch only" 48 | # gt boxes: (x1, y1, x2, y2, cls) 49 | gt_inds = np.where(roidb[0]['gt_classes'] != 0)[0] 50 | gt_boxes = np.empty((len(gt_inds), 5), dtype=np.float32) 51 | gt_boxes[:, 0:4] = roidb[0]['boxes'][gt_inds, :] * im_scales[0] 52 | gt_boxes[:, 4] = roidb[0]['gt_classes'][gt_inds] 53 | # TODO: add "gt_phrases" 54 | blobs['gt_phrases'] = _process_gt_phrases(roidb[0]['gt_phrases']) 55 | blobs['gt_boxes'] = gt_boxes 56 | blobs['im_info'] = np.array( 57 | # TODO: for blob format stick to tf_faster_rcnn version 58 | # [[im_blob.shape[2], im_blob.shape[3], im_scales[0]]], 59 | # [[im_blob.shape[1], im_blob.shape[2], im_scales[0]]], 60 | # make it shape [3,] 61 | [im_blob.shape[1], im_blob.shape[2], im_scales[0]], 62 | dtype=np.float32) 63 | # if cfg.LIMIT_RAM: 64 | # blobs['gt_phrases'] = roidb[0]['gt_phrases'] 65 | else: # not using RPN 66 | raise NotImplementedError 67 | 68 | return blobs 69 | 70 | 71 | def _process_gt_phrases(phrases): 72 | """processing gt phrases for blob""" 73 | num_regions = len(phrases) 74 | gt_phrases = np.zeros((num_regions, cfg.MAX_WORDS), dtype=np.int32) 75 | for ix, phra in enumerate(phrases): 76 | l = len(phra) 77 | gt_phrases[ix, :l] = phra 78 | 79 | return gt_phrases 80 | 81 | 82 | def _get_image_blob(roidb, scale_inds): 83 | """Builds an input blob from the images in the roidb at the specified 84 | scales. 85 | """ 86 | num_images = len(scale_inds) 87 | processed_ims = [] 88 | im_scales = [] 89 | if cfg.LIMIT_RAM: 90 | # roidb is the pickle file path 91 | assert num_images == 1, "LIMIT_RAM version, it has to be one image." 
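# In LIMIT_RAM mode the incoming `roidb` argument is the path to a single per-image pickle (RoIDataLayer in lib/fast_rcnn/layer.py joins the roidb directory with '<image_index>.pkl'), so only one image's ground-truth regions are loaded into memory at a time.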
92 | with open(roidb, 'rb') as f: 93 | roidb = [cPickle.load(f)] 94 | 95 | for i in xrange(num_images): 96 | im = cv2.imread(roidb[i]['image']) 97 | if roidb[i]['flipped']: 98 | im = im[:, ::-1, :] 99 | target_size = cfg.TRAIN.SCALES[scale_inds[i]] 100 | im, im_scale = prep_im_for_blob(im, cfg.PIXEL_MEANS, target_size, 101 | cfg.TRAIN.MAX_SIZE) 102 | im_scales.append(im_scale) 103 | processed_ims.append(im) 104 | 105 | # Create a blob to hold the input images 106 | blob = im_list_to_blob(processed_ims) 107 | 108 | return blob, im_scales, roidb 109 | 110 | -------------------------------------------------------------------------------- /lib/fast_rcnn/nms_wrapper.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | from lib.config import cfg 12 | from lib.nms.gpu_nms import gpu_nms 13 | from lib.nms.cpu_nms import cpu_nms 14 | 15 | def nms(dets, thresh, force_cpu=False): 16 | """Dispatch to either CPU or GPU NMS implementations.""" 17 | 18 | if dets.shape[0] == 0: 19 | return [] 20 | # print "gpu_id used by nms is: %d" % cfg.GPU_ID 21 | if cfg.USE_GPU_NMS and not force_cpu: 22 | return gpu_nms(dets, thresh, device_id=cfg.GPU_ID) 23 | else: 24 | return cpu_nms(dets, thresh) 25 | -------------------------------------------------------------------------------- /lib/fast_rcnn/roidb.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | """Transform a roidb into a trainable roidb by adding a bunch of metadata.""" 12 | 13 | # import sys 14 | # sys.path.append("..") 15 | 16 | import numpy as np 17 | from lib.config import cfg 18 | from lib.fast_rcnn.bbox_transform import bbox_transform 19 | from lib.utils.cython_bbox import bbox_overlaps 20 | from PIL import Image 21 | 22 | 23 | def prepare_roidb(imdb): 24 | """Enrich the imdb's roidb by adding some derived quantities that 25 | are useful for training. This function precomputes the maximum 26 | overlap, taken over ground-truth boxes, between each ROI and 27 | each ground-truth box. The class with maximum overlap is also 28 | recorded. 
29 | """ 30 | sizes = [Image.open(imdb.image_path_at(i)).size 31 | for i in xrange(imdb.num_images)] 32 | roidb = imdb.roidb 33 | for i in xrange(len(imdb.image_index)): 34 | roidb[i]['image'] = imdb.image_path_at(i) 35 | roidb[i]['width'] = sizes[i][0] 36 | roidb[i]['height'] = sizes[i][1] 37 | # need gt_overlaps as a dense array for argmax 38 | gt_overlaps = roidb[i]['gt_overlaps'].toarray() 39 | # max overlap with gt over classes (columns) 40 | max_overlaps = gt_overlaps.max(axis=1) 41 | # gt class that had the max overlap 42 | max_classes = gt_overlaps.argmax(axis=1) 43 | roidb[i]['max_classes'] = max_classes 44 | roidb[i]['max_overlaps'] = max_overlaps 45 | # sanity checks 46 | # max overlap of 0 => class should be zero (background) 47 | zero_inds = np.where(max_overlaps == 0)[0] 48 | assert all(max_classes[zero_inds] == 0) 49 | # max overlap > 0 => class should not be zero (must be a fg class) 50 | # nonzero_inds = np.where(max_overlaps > 0)[0] 51 | # assert all(max_classes[nonzero_inds] != 0) 52 | 53 | 54 | def add_bbox_regression_targets(roidb): 55 | """Add information needed to train bounding-box regressors.""" 56 | assert len(roidb) > 0 57 | assert 'max_classes' in roidb[0], 'Did you call prepare_roidb first?' 58 | 59 | num_images = len(roidb) 60 | # Infer number of classes from the number of columns in gt_overlaps 61 | num_classes = roidb[0]['gt_overlaps'].shape[1] 62 | for im_i in xrange(num_images): 63 | rois = roidb[im_i]['boxes'] 64 | max_overlaps = roidb[im_i]['max_overlaps'] 65 | max_classes = roidb[im_i]['max_classes'] 66 | roidb[im_i]['bbox_targets'] = \ 67 | _compute_targets(rois, max_overlaps, max_classes) 68 | 69 | if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED: 70 | # Use fixed / precomputed "means" and "stds" instead of empirical values 71 | means = np.tile( 72 | np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS), (num_classes, 1)) 73 | stds = np.tile( 74 | np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS), (num_classes, 1)) 75 | else: 76 | # Compute values needed for means and stds 77 | # var(x) = E(x^2) - E(x)^2 78 | class_counts = np.zeros((num_classes, 1)) + cfg.EPS 79 | sums = np.zeros((num_classes, 4)) 80 | squared_sums = np.zeros((num_classes, 4)) 81 | for im_i in xrange(num_images): 82 | targets = roidb[im_i]['bbox_targets'] 83 | for cls in xrange(1, num_classes): 84 | cls_inds = np.where(targets[:, 0] == cls)[0] 85 | if cls_inds.size > 0: 86 | class_counts[cls] += cls_inds.size 87 | sums[cls, :] += targets[cls_inds, 1:].sum(axis=0) 88 | squared_sums[cls, :] += \ 89 | (targets[cls_inds, 1:] ** 2).sum(axis=0) 90 | 91 | means = sums / class_counts 92 | stds = np.sqrt(squared_sums / class_counts - means ** 2) 93 | 94 | print('bbox target means:') 95 | print(means) 96 | print(means[1:, :].mean(axis=0)) # ignore bg class) 97 | print('bbox target stdevs:') 98 | print(stds) 99 | print(stds[1:, :].mean(axis=0)) # ignore bg class) 100 | 101 | # Normalize targets 102 | if cfg.TRAIN.BBOX_NORMALIZE_TARGETS: 103 | print("Normalizing targets") 104 | for im_i in xrange(num_images): 105 | targets = roidb[im_i]['bbox_targets'] 106 | for cls in xrange(1, num_classes): 107 | cls_inds = np.where(targets[:, 0] == cls)[0] 108 | roidb[im_i]['bbox_targets'][cls_inds, 1:] -= means[cls, :] 109 | roidb[im_i]['bbox_targets'][cls_inds, 1:] /= stds[cls, :] 110 | else: 111 | print("NOT normalizing targets") 112 | 113 | # These values will be needed for making predictions 114 | # (the predicts will need to be unnormalized and uncentered) 115 | return means.ravel(), stds.ravel() 116 | 117 | 118 | def 
_compute_targets(rois, overlaps, labels): 119 | """Compute bounding-box regression targets for an image.""" 120 | # Indices of ground-truth ROIs 121 | gt_inds = np.where(overlaps == 1)[0] 122 | if len(gt_inds) == 0: 123 | # Bail if the image has no ground-truth ROIs 124 | return np.zeros((rois.shape[0], 5), dtype=np.float32) 125 | # Indices of examples for which we try to make predictions 126 | ex_inds = np.where(overlaps >= cfg.TRAIN.BBOX_THRESH)[0] 127 | 128 | # Get IoU overlap between each ex ROI and gt ROI 129 | ex_gt_overlaps = bbox_overlaps( 130 | np.ascontiguousarray(rois[ex_inds, :], dtype=np.float), 131 | np.ascontiguousarray(rois[gt_inds, :], dtype=np.float)) 132 | 133 | # Find which gt ROI each ex ROI has max overlap with: 134 | # this will be the ex ROI's gt target 135 | gt_assignment = ex_gt_overlaps.argmax(axis=1) 136 | gt_rois = rois[gt_inds[gt_assignment], :] 137 | ex_rois = rois[ex_inds, :] 138 | 139 | targets = np.zeros((rois.shape[0], 5), dtype=np.float32) 140 | targets[ex_inds, 0] = labels[ex_inds] 141 | targets[ex_inds, 1:] = bbox_transform(ex_rois, gt_rois) 142 | return targets 143 | -------------------------------------------------------------------------------- /lib/layers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/InnerPeace-Wu/densecap-tensorflow/2c77998f73832d5bb7324e97e8a7419cbce8398c/lib/layers/__init__.py -------------------------------------------------------------------------------- /lib/layers/anchor_target_layer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick and Xinlei Chen 6 | # -------------------------------------------------------- 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import os 12 | from lib.config import cfg 13 | import numpy as np 14 | import numpy.random as npr 15 | from lib.utils.cython_bbox import bbox_overlaps 16 | from lib.fast_rcnn.bbox_transform import bbox_transform 17 | 18 | 19 | def anchor_target_layer(rpn_cls_score, gt_boxes, im_info, _feat_stride, all_anchors, num_anchors): 20 | """Same as the anchor target layer in original Fast/er RCNN """ 21 | 22 | A = num_anchors 23 | total_anchors = all_anchors.shape[0] 24 | K = total_anchors / num_anchors 25 | 26 | # allow boxes to sit over the edge by a small amount 27 | _allowed_border = 0 28 | 29 | # map of shape (..., H, W) 30 | height, width = rpn_cls_score.shape[1:3] 31 | 32 | # only keep anchors inside the image 33 | inds_inside = np.where( 34 | (all_anchors[:, 0] >= -_allowed_border) & 35 | (all_anchors[:, 1] >= -_allowed_border) & 36 | (all_anchors[:, 2] < im_info[1] + _allowed_border) & # width 37 | (all_anchors[:, 3] < im_info[0] + _allowed_border) # height 38 | )[0] 39 | 40 | # keep only inside anchors 41 | anchors = all_anchors[inds_inside, :] 42 | 43 | # label: 1 is positive, 0 is negative, -1 is dont care 44 | labels = np.empty((len(inds_inside),), dtype=np.float32) 45 | labels.fill(-1) 46 | 47 | # overlaps between the anchors and the gt boxes 48 | # overlaps (ex, gt) 49 | overlaps = bbox_overlaps( 50 | np.ascontiguousarray(anchors, dtype=np.float), 51 | np.ascontiguousarray(gt_boxes, dtype=np.float)) 52 | argmax_overlaps = overlaps.argmax(axis=1) 53 | max_overlaps 
= overlaps[np.arange(len(inds_inside)), argmax_overlaps] 54 | gt_argmax_overlaps = overlaps.argmax(axis=0) 55 | gt_max_overlaps = overlaps[gt_argmax_overlaps, 56 | np.arange(overlaps.shape[1])] 57 | gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0] 58 | 59 | if not cfg.TRAIN.RPN_CLOBBER_POSITIVES: 60 | # assign bg labels first so that positive labels can clobber them 61 | # first set the negatives 62 | labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0 63 | 64 | # fg label: for each gt, anchor with highest overlap 65 | labels[gt_argmax_overlaps] = 1 66 | 67 | # fg label: above threshold IOU 68 | labels[max_overlaps >= cfg.TRAIN.RPN_POSITIVE_OVERLAP] = 1 69 | 70 | if cfg.TRAIN.RPN_CLOBBER_POSITIVES: 71 | # assign bg labels last so that negative labels can clobber positives 72 | labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0 73 | 74 | # subsample positive labels if we have too many 75 | num_fg = int(cfg.TRAIN.RPN_FG_FRACTION * cfg.TRAIN.RPN_BATCHSIZE) 76 | fg_inds = np.where(labels == 1)[0] 77 | if len(fg_inds) > num_fg: 78 | disable_inds = npr.choice( 79 | fg_inds, size=(len(fg_inds) - num_fg), replace=False) 80 | labels[disable_inds] = -1 81 | 82 | # subsample negative labels if we have too many 83 | num_bg = cfg.TRAIN.RPN_BATCHSIZE - np.sum(labels == 1) 84 | bg_inds = np.where(labels == 0)[0] 85 | if len(bg_inds) > num_bg: 86 | disable_inds = npr.choice( 87 | bg_inds, size=(len(bg_inds) - num_bg), replace=False) 88 | labels[disable_inds] = -1 89 | 90 | bbox_targets = np.zeros((len(inds_inside), 4), dtype=np.float32) 91 | bbox_targets = _compute_targets(anchors, gt_boxes[argmax_overlaps, :]) 92 | 93 | bbox_inside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32) 94 | # only the positive ones have regression targets 95 | bbox_inside_weights[labels == 1, :] = np.array(cfg.TRAIN.RPN_BBOX_INSIDE_WEIGHTS) 96 | 97 | bbox_outside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32) 98 | if cfg.TRAIN.RPN_POSITIVE_WEIGHT < 0: 99 | # uniform weighting of examples (given non-uniform sampling) 100 | num_examples = np.sum(labels >= 0) 101 | positive_weights = np.ones((1, 4)) * 1.0 / num_examples 102 | negative_weights = np.ones((1, 4)) * 1.0 / num_examples 103 | else: 104 | assert ((cfg.TRAIN.RPN_POSITIVE_WEIGHT > 0) & 105 | (cfg.TRAIN.RPN_POSITIVE_WEIGHT < 1)) 106 | positive_weights = (cfg.TRAIN.RPN_POSITIVE_WEIGHT / 107 | np.sum(labels == 1)) 108 | negative_weights = ((1.0 - cfg.TRAIN.RPN_POSITIVE_WEIGHT) / 109 | np.sum(labels == 0)) 110 | bbox_outside_weights[labels == 1, :] = positive_weights 111 | bbox_outside_weights[labels == 0, :] = negative_weights 112 | 113 | # map up to original set of anchors 114 | labels = _unmap(labels, total_anchors, inds_inside, fill=-1) 115 | bbox_targets = _unmap(bbox_targets, total_anchors, inds_inside, fill=0) 116 | bbox_inside_weights = _unmap(bbox_inside_weights, total_anchors, inds_inside, fill=0) 117 | bbox_outside_weights = _unmap(bbox_outside_weights, total_anchors, inds_inside, fill=0) 118 | 119 | # labels 120 | labels = labels.reshape((1, height, width, A)).transpose(0, 3, 1, 2) 121 | labels = labels.reshape((1, 1, A * height, width)) 122 | rpn_labels = labels 123 | 124 | # bbox_targets 125 | bbox_targets = bbox_targets \ 126 | .reshape((1, height, width, A * 4)) 127 | 128 | rpn_bbox_targets = bbox_targets 129 | # bbox_inside_weights 130 | bbox_inside_weights = bbox_inside_weights \ 131 | .reshape((1, height, width, A * 4)) 132 | 133 | rpn_bbox_inside_weights = bbox_inside_weights 134 | 135 | # 
bbox_outside_weights 136 | bbox_outside_weights = bbox_outside_weights \ 137 | .reshape((1, height, width, A * 4)) 138 | 139 | rpn_bbox_outside_weights = bbox_outside_weights 140 | return rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights 141 | 142 | 143 | def _unmap(data, count, inds, fill=0): 144 | """ Unmap a subset of item (data) back to the original set of items (of 145 | size count) """ 146 | if len(data.shape) == 1: 147 | ret = np.empty((count,), dtype=np.float32) 148 | ret.fill(fill) 149 | ret[inds] = data 150 | else: 151 | ret = np.empty((count,) + data.shape[1:], dtype=np.float32) 152 | ret.fill(fill) 153 | ret[inds, :] = data 154 | return ret 155 | 156 | 157 | def _compute_targets(ex_rois, gt_rois): 158 | """Compute bounding-box regression targets for an image.""" 159 | 160 | assert ex_rois.shape[0] == gt_rois.shape[0] 161 | assert ex_rois.shape[1] == 4 162 | assert gt_rois.shape[1] == 5 163 | 164 | return bbox_transform(ex_rois, gt_rois[:, :4]).astype(np.float32, copy=False) 165 | -------------------------------------------------------------------------------- /lib/layers/generate_anchors.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick and Sean Bell 6 | # -------------------------------------------------------- 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import numpy as np 12 | 13 | 14 | # Verify that we compute the same anchors as Shaoqing's matlab implementation: 15 | # 16 | # >> load output/rpn_cachedir/faster_rcnn_VOC2007_ZF_stage1_rpn/anchors.mat 17 | # >> anchors 18 | # 19 | # anchors = 20 | # 21 | # -83 -39 100 56 22 | # -175 -87 192 104 23 | # -359 -183 376 200 24 | # -55 -55 72 72 25 | # -119 -119 136 136 26 | # -247 -247 264 264 27 | # -35 -79 52 96 28 | # -79 -167 96 184 29 | # -167 -343 184 360 30 | 31 | # array([[ -83., -39., 100., 56.], 32 | # [-175., -87., 192., 104.], 33 | # [-359., -183., 376., 200.], 34 | # [ -55., -55., 72., 72.], 35 | # [-119., -119., 136., 136.], 36 | # [-247., -247., 264., 264.], 37 | # [ -35., -79., 52., 96.], 38 | # [ -79., -167., 96., 184.], 39 | # [-167., -343., 184., 360.]]) 40 | 41 | def generate_anchors(base_size=16, ratios=[0.5, 1, 2], 42 | scales=2 ** np.arange(3, 6)): 43 | """ 44 | Generate anchor (reference) windows by enumerating aspect ratios X 45 | scales wrt a reference (0, 0, 15, 15) window. 46 | """ 47 | 48 | base_anchor = np.array([1, 1, base_size, base_size]) - 1 49 | ratio_anchors = _ratio_enum(base_anchor, ratios) 50 | anchors = np.vstack([_scale_enum(ratio_anchors[i, :], scales) 51 | for i in range(ratio_anchors.shape[0])]) 52 | return anchors 53 | 54 | 55 | def _whctrs(anchor): 56 | """ 57 | Return width, height, x center, and y center for an anchor (window). 58 | """ 59 | 60 | w = anchor[2] - anchor[0] + 1 61 | h = anchor[3] - anchor[1] + 1 62 | x_ctr = anchor[0] + 0.5 * (w - 1) 63 | y_ctr = anchor[1] + 0.5 * (h - 1) 64 | return w, h, x_ctr, y_ctr 65 | 66 | 67 | def _mkanchors(ws, hs, x_ctr, y_ctr): 68 | """ 69 | Given a vector of widths (ws) and heights (hs) around a center 70 | (x_ctr, y_ctr), output a set of anchors (windows). 
71 | """ 72 | 73 | ws = ws[:, np.newaxis] 74 | hs = hs[:, np.newaxis] 75 | anchors = np.hstack((x_ctr - 0.5 * (ws - 1), 76 | y_ctr - 0.5 * (hs - 1), 77 | x_ctr + 0.5 * (ws - 1), 78 | y_ctr + 0.5 * (hs - 1))) 79 | return anchors 80 | 81 | 82 | def _ratio_enum(anchor, ratios): 83 | """ 84 | Enumerate a set of anchors for each aspect ratio wrt an anchor. 85 | """ 86 | 87 | w, h, x_ctr, y_ctr = _whctrs(anchor) 88 | size = w * h 89 | size_ratios = size / ratios 90 | ws = np.round(np.sqrt(size_ratios)) 91 | hs = np.round(ws * ratios) 92 | anchors = _mkanchors(ws, hs, x_ctr, y_ctr) 93 | return anchors 94 | 95 | 96 | def _scale_enum(anchor, scales): 97 | """ 98 | Enumerate a set of anchors for each scale wrt an anchor. 99 | """ 100 | 101 | w, h, x_ctr, y_ctr = _whctrs(anchor) 102 | ws = w * scales 103 | hs = h * scales 104 | anchors = _mkanchors(ws, hs, x_ctr, y_ctr) 105 | return anchors 106 | 107 | 108 | if __name__ == '__main__': 109 | import time 110 | 111 | t = time.time() 112 | a = generate_anchors() 113 | print(time.time() - t) 114 | print(a) 115 | from IPython import embed; 116 | 117 | embed() 118 | -------------------------------------------------------------------------------- /lib/layers/global_roi_layer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DenseCap-Tensorflow 3 | # Written by InnerPeace 4 | # This file is adapted from Xinlei's work 5 | # -------------------------------------------------------- 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import numpy as np 11 | 12 | 13 | def GlobalRoILayer(im_info): 14 | """ 15 | Set up the global RoI 16 | """ 17 | return np.array([0., 0., 0., im_info[1] - 1, im_info[0] - 1]) 18 | -------------------------------------------------------------------------------- /lib/layers/proposal_layer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Ross Girshick and Xinlei Chen 5 | # -------------------------------------------------------- 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import numpy as np 11 | from lib.config import cfg 12 | from lib.fast_rcnn.bbox_transform import bbox_transform_inv, clip_boxes 13 | from lib.fast_rcnn.nms_wrapper import nms 14 | 15 | 16 | def proposal_layer(rpn_cls_prob, rpn_bbox_pred, im_info, cfg_key, _feat_stride, anchors, num_anchors): 17 | """A simplified version compared to fast/er RCNN 18 | For details please see the technical report 19 | """ 20 | if type(cfg_key) == bytes: 21 | cfg_key = cfg_key.decode('utf-8') 22 | pre_nms_topN = cfg[cfg_key].RPN_PRE_NMS_TOP_N 23 | post_nms_topN = cfg[cfg_key].RPN_POST_NMS_TOP_N 24 | nms_thresh = cfg[cfg_key].RPN_NMS_THRESH 25 | 26 | # Get the scores and bounding boxes 27 | scores = rpn_cls_prob[:, :, :, num_anchors:] 28 | rpn_bbox_pred = rpn_bbox_pred.reshape((-1, 4)) 29 | scores = scores.reshape((-1, 1)) 30 | proposals = bbox_transform_inv(anchors, rpn_bbox_pred) 31 | if cfg.DEBUG_ALL: 32 | print ('number of proposals before clip boxes to image board: {}'.format( 33 | proposals.shape[0] 34 | )) 35 | proposals = clip_boxes(proposals, im_info[:2]) 36 | 37 | # remove predicted boxes with either height or width < 
threshold 38 | # (NOTE: convert min_size to input image scale stored in im_info[2]) 39 | if cfg.FILTER_SMALL_BOX: 40 | min_size = cfg[cfg_key].RPN_MIN_SIZE 41 | keep = _filter_boxes(proposals, min_size * im_info[2]) 42 | proposals = proposals[keep, :] 43 | scores = scores[keep] 44 | 45 | # Pick the top region proposals 46 | order = scores.ravel().argsort()[::-1] 47 | if pre_nms_topN > 0: 48 | order = order[:pre_nms_topN] 49 | proposals = proposals[order, :] 50 | scores = scores[order] 51 | 52 | # Non-maximal suppression 53 | if cfg.DEBUG_ALL: 54 | print("number of proposals before nms: {}".format(proposals.shape[0])) 55 | keep = nms(np.hstack((proposals, scores)), nms_thresh) 56 | if cfg.DEBUG_ALL: 57 | print("number of proposals after nms: {}".format(len(keep))) 58 | 59 | # Pick th top region proposals after NMS 60 | if post_nms_topN > 0: 61 | keep = keep[:post_nms_topN] 62 | proposals = proposals[keep, :] 63 | scores = scores[keep] 64 | 65 | # Only support single image as input 66 | batch_inds = np.zeros((proposals.shape[0], 1), dtype=np.float32) 67 | blob = np.hstack((batch_inds, proposals.astype(np.float32, copy=False))) 68 | 69 | return blob, scores 70 | 71 | 72 | def _filter_boxes(boxes, min_size): 73 | """Remove all boxes with any side smaller than min_size.""" 74 | 75 | ws = boxes[:, 2] - boxes[:, 0] + 1 76 | hs = boxes[:, 3] - boxes[:, 1] + 1 77 | keep = np.where((ws >= min_size) & (hs >= min_size))[0] 78 | return keep 79 | -------------------------------------------------------------------------------- /lib/layers/proposal_target_layer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick, Sean Bell and Xinlei Chen 6 | # -------------------------------------------------------- 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import numpy as np 12 | import numpy.random as npr 13 | from lib.config import cfg 14 | from lib.fast_rcnn.bbox_transform import bbox_transform 15 | from lib.utils.cython_bbox import bbox_overlaps 16 | 17 | 18 | def proposal_target_layer(rpn_rois, rpn_scores, gt_boxes, _num_classes): 19 | """ 20 | Assign object detection proposals to ground-truth targets. Produces proposal 21 | classification labels and bounding-box regression targets. 
22 | """ 23 | 24 | # Proposal ROIs (0, x1, y1, x2, y2) coming from RPN 25 | # (i.e., layers.proposal_layer.ProposalLayer), or any other source 26 | all_rois = rpn_rois 27 | all_scores = rpn_scores 28 | 29 | # Include ground-truth boxes in the set of candidate rois 30 | if cfg.TRAIN.USE_GT: 31 | zeros = np.zeros((gt_boxes.shape[0], 1), dtype=gt_boxes.dtype) 32 | all_rois = np.vstack( 33 | (all_rois, np.hstack((zeros, gt_boxes[:, :-1]))) 34 | ) 35 | # not sure if it a wise appending, but anyway i am not using it 36 | all_scores = np.vstack((all_scores, zeros)) 37 | 38 | num_images = 1 39 | rois_per_image = cfg.TRAIN.BATCH_SIZE / num_images 40 | fg_rois_per_image = np.round(cfg.TRAIN.FG_FRACTION * rois_per_image) 41 | 42 | # Sample rois with classification labels and bounding box regression 43 | # targets 44 | labels, rois, roi_scores, bbox_targets, bbox_inside_weights = _sample_rois( 45 | all_rois, all_scores, gt_boxes, fg_rois_per_image, 46 | rois_per_image, _num_classes) 47 | 48 | rois = rois.reshape(-1, 5) 49 | roi_scores = roi_scores.reshape(-1) 50 | labels = labels.reshape(-1, 1) 51 | bbox_targets = bbox_targets.reshape(-1, _num_classes * 4) 52 | bbox_inside_weights = bbox_inside_weights.reshape(-1, _num_classes * 4) 53 | bbox_outside_weights = np.array(bbox_inside_weights > 0).astype(np.float32) 54 | 55 | return rois, roi_scores, labels, bbox_targets, bbox_inside_weights, bbox_outside_weights 56 | 57 | 58 | def _get_bbox_regression_labels(bbox_target_data, num_classes): 59 | """Bounding-box regression targets (bbox_target_data) are stored in a 60 | compact form N x (class, tx, ty, tw, th) 61 | 62 | This function expands those targets into the 4-of-4*K representation used 63 | by the network (i.e. only one class has non-zero targets). 64 | 65 | Returns: 66 | bbox_target (ndarray): N x 4K blob of regression targets 67 | bbox_inside_weights (ndarray): N x 4K blob of loss weights 68 | """ 69 | 70 | clss = bbox_target_data[:, 0] 71 | bbox_targets = np.zeros((clss.size, 4 * num_classes), dtype=np.float32) 72 | bbox_inside_weights = np.zeros(bbox_targets.shape, dtype=np.float32) 73 | inds = np.where(clss > 0)[0] 74 | for ind in inds: 75 | cls = clss[ind] 76 | start = int(4 * cls) 77 | end = start + 4 78 | bbox_targets[ind, start:end] = bbox_target_data[ind, 1:] 79 | bbox_inside_weights[ind, start:end] = cfg.TRAIN.BBOX_INSIDE_WEIGHTS 80 | return bbox_targets, bbox_inside_weights 81 | 82 | 83 | def _compute_targets(ex_rois, gt_rois, labels): 84 | """Compute bounding-box regression targets for an image.""" 85 | 86 | assert ex_rois.shape[0] == gt_rois.shape[0] 87 | assert ex_rois.shape[1] == 4 88 | assert gt_rois.shape[1] == 4 89 | 90 | targets = bbox_transform(ex_rois, gt_rois) 91 | if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED: 92 | # Optionally normalize targets by a precomputed mean and stdev 93 | targets = ((targets - np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS)) 94 | / np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS)) 95 | return np.hstack( 96 | (labels[:, np.newaxis], targets)).astype(np.float32, copy=False) 97 | 98 | 99 | def _sample_rois(all_rois, all_scores, gt_boxes, fg_rois_per_image, rois_per_image, num_classes): 100 | """Generate a random sample of RoIs comprising foreground and background 101 | examples. 
102 | """ 103 | # overlaps: (rois x gt_boxes) 104 | overlaps = bbox_overlaps( 105 | np.ascontiguousarray(all_rois[:, 1:5], dtype=np.float), 106 | np.ascontiguousarray(gt_boxes[:, :4], dtype=np.float)) 107 | gt_assignment = overlaps.argmax(axis=1) 108 | max_overlaps = overlaps.max(axis=1) 109 | labels = gt_boxes[gt_assignment, 4] 110 | 111 | # Select foreground RoIs as those with >= FG_THRESH overlap 112 | fg_inds = np.where(max_overlaps >= cfg.TRAIN.FG_THRESH)[0] 113 | # Guard against the case when an image has fewer than fg_rois_per_image 114 | # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI) 115 | bg_inds = np.where((max_overlaps < cfg.TRAIN.BG_THRESH_HI) & 116 | (max_overlaps >= cfg.TRAIN.BG_THRESH_LO))[0] 117 | 118 | # Small modification to the original version where we ensure a fixed number of regions are sampled 119 | if fg_inds.size > 0 and bg_inds.size > 0: 120 | fg_rois_per_image = min(fg_rois_per_image, fg_inds.size) 121 | fg_inds = npr.choice(fg_inds, size=int(fg_rois_per_image), replace=False) 122 | bg_rois_per_image = rois_per_image - fg_rois_per_image 123 | to_replace = bg_inds.size < bg_rois_per_image 124 | bg_inds = npr.choice(bg_inds, size=int(bg_rois_per_image), replace=to_replace) 125 | elif fg_inds.size > 0: 126 | to_replace = fg_inds.size < rois_per_image 127 | fg_inds = npr.choice(fg_inds, size=int(rois_per_image), replace=to_replace) 128 | fg_rois_per_image = rois_per_image 129 | elif bg_inds.size > 0: 130 | to_replace = bg_inds.size < rois_per_image 131 | bg_inds = npr.choice(bg_inds, size=int(rois_per_image), replace=to_replace) 132 | fg_rois_per_image = 0 133 | else: 134 | import pdb 135 | pdb.set_trace() 136 | 137 | # The indices that we're selecting (both fg and bg) 138 | keep_inds = np.append(fg_inds, bg_inds) 139 | # Select sampled values from various arrays: 140 | labels = labels[keep_inds] 141 | # Clamp labels for the background RoIs to 0 142 | labels[int(fg_rois_per_image):] = 0 143 | rois = all_rois[keep_inds] 144 | roi_scores = all_scores[keep_inds] 145 | 146 | bbox_target_data = _compute_targets( 147 | rois[:, 1:5], gt_boxes[gt_assignment[keep_inds], :4], labels) 148 | 149 | bbox_targets, bbox_inside_weights = \ 150 | _get_bbox_regression_labels(bbox_target_data, num_classes) 151 | 152 | return labels, rois, roi_scores, bbox_targets, bbox_inside_weights 153 | -------------------------------------------------------------------------------- /lib/layers/proposal_target_single_class_layer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DenseCap-Tensorflow 3 | # Written by InnerPeace 4 | # This file is adapted from Xinlei's work 5 | # -------------------------------------------------------- 6 | # Faster R-CNN 7 | # Copyright (c) 2015 Microsoft 8 | # Licensed under The MIT License [see LICENSE for details] 9 | # Written by Ross Girshick, Sean Bell and Xinlei Chen 10 | # -------------------------------------------------------- 11 | from __future__ import absolute_import 12 | from __future__ import division 13 | from __future__ import print_function 14 | 15 | import numpy as np 16 | import numpy.random as npr 17 | from lib.config import cfg 18 | from lib.fast_rcnn.bbox_transform import bbox_transform 19 | from lib.utils.cython_bbox import bbox_overlaps 20 | from lib.layers.rois_offset_layer import compute_rois_offset 21 | 22 | 23 | def proposal_target_single_class_layer(rpn_rois, rpn_scores, gt_boxes, gt_phrases): 24 | """ 25 | Assign 
object detection proposals to ground-truth targets. Produces proposal 26 | classification labels and bounding-box regression targets. 27 | """ 28 | 29 | # Proposal ROIs (0, x1, y1, x2, y2) coming from RPN 30 | # (i.e., layers.proposal_layer.ProposalLayer), or any other source 31 | all_rois = rpn_rois 32 | all_scores = rpn_scores 33 | 34 | # Include ground-truth boxes in the set of candidate rois 35 | if cfg.TRAIN.USE_GT: 36 | zeros = np.zeros((gt_boxes.shape[0], 1), dtype=gt_boxes.dtype) 37 | all_rois = np.vstack( 38 | (all_rois, np.hstack((zeros, gt_boxes[:, :-1]))) 39 | ) 40 | # not sure if it a wise appending, but anyway i am not using it 41 | all_scores = np.vstack((all_scores, zeros)) 42 | 43 | num_images = 1 44 | rois_per_image = cfg.TRAIN.BATCH_SIZE // num_images 45 | fg_rois_per_image = int(cfg.TRAIN.FG_FRACTION * rois_per_image) 46 | 47 | # Sample rois with classification labels and bounding box regression 48 | # targets 49 | labels, rois, roi_scores, bbox_targets, bbox_inside_weights, phrases = _sample_rois( 50 | all_rois, all_scores, gt_boxes, gt_phrases, fg_rois_per_image, 51 | rois_per_image) 52 | 53 | rois = rois.reshape(-1, 5) 54 | roi_scores = roi_scores.reshape(-1) 55 | labels = labels.reshape(-1, 1) 56 | phrases = phrases.reshape(-1, cfg.MAX_WORDS) 57 | bbox_targets = bbox_targets.reshape(-1, 4) 58 | bbox_inside_weights = bbox_inside_weights.reshape(-1, 4) 59 | bbox_outside_weights = np.array(bbox_inside_weights > 0).astype(np.float32) 60 | clss = np.array(labels > 0).astype(np.int32) 61 | 62 | return rois, roi_scores, labels, bbox_targets, \ 63 | bbox_inside_weights, bbox_outside_weights, clss, phrases 64 | 65 | 66 | def _get_bbox_regression_labels(bbox_target_data): 67 | """Bounding-box regression targets (bbox_target_data) are stored in a 68 | compact form N x (class, tx, ty, tw, th) 69 | 70 | Returns: 71 | bbox_target (ndarray): N x 4 blob of regression targets 72 | bbox_inside_weights (ndarray): N x 4 blob of loss weights 73 | """ 74 | 75 | clss = bbox_target_data[:, 0] 76 | bbox_targets = np.zeros((clss.size, 4), dtype=np.float32) 77 | bbox_inside_weights = np.zeros(bbox_targets.shape, dtype=np.float32) 78 | inds = np.where(clss > 0)[0] 79 | for ind in inds: 80 | bbox_targets[ind, :] = bbox_target_data[ind, 1:] 81 | bbox_inside_weights[ind, :] = cfg.TRAIN.BBOX_INSIDE_WEIGHTS 82 | return bbox_targets, bbox_inside_weights 83 | 84 | 85 | def _compute_targets(ex_rois, gt_rois, labels): 86 | """Compute bounding-box regression targets for an image.""" 87 | 88 | assert ex_rois.shape[0] == gt_rois.shape[0] 89 | assert ex_rois.shape[1] == 4 90 | assert gt_rois.shape[1] == 4 91 | 92 | targets = bbox_transform(ex_rois, gt_rois) 93 | if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED: 94 | # Optionally normalize targets by a precomputed mean and stdev 95 | targets = ((targets - np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS)) 96 | / np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS)) 97 | return np.hstack( 98 | (labels[:, np.newaxis], targets)).astype(np.float32, copy=False) 99 | 100 | 101 | def _sample_rois(all_rois, all_scores, gt_boxes, gt_phrases, fg_rois_per_image, rois_per_image): 102 | """Generate a random sample of RoIs comprising foreground and background 103 | examples. 
104 | """ 105 | # overlaps: (rois x gt_boxes) 106 | overlaps = bbox_overlaps( 107 | np.ascontiguousarray(all_rois[:, 1:5], dtype=np.float), 108 | np.ascontiguousarray(gt_boxes[:, :4], dtype=np.float)) 109 | gt_assignment = overlaps.argmax(axis=1) 110 | max_overlaps = overlaps.max(axis=1) 111 | labels = gt_boxes[gt_assignment, 4] 112 | phrases = gt_phrases[gt_assignment] 113 | 114 | # Select foreground RoIs as those with >= FG_THRESH overlap 115 | fg_inds = np.where(max_overlaps >= cfg.TRAIN.FG_THRESH)[0] 116 | # Guard against the case when an image has fewer than fg_rois_per_image 117 | # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI) 118 | bg_inds = np.where((max_overlaps < cfg.TRAIN.BG_THRESH_HI) & 119 | (max_overlaps >= cfg.TRAIN.BG_THRESH_LO))[0] 120 | 121 | # Small modification to the original version where we ensure a fixed number of regions are sampled 122 | if cfg.SAMPLE_NUM_FIXED_REGIONS: 123 | if fg_inds.size > 0 and bg_inds.size > 0: 124 | fg_rois_per_image = min(fg_rois_per_image, fg_inds.size) 125 | fg_inds = npr.choice(fg_inds, size=int(fg_rois_per_image), replace=False) 126 | bg_rois_per_image = rois_per_image - fg_rois_per_image 127 | to_replace = bg_inds.size < bg_rois_per_image 128 | bg_inds = npr.choice(bg_inds, size=int(bg_rois_per_image), replace=to_replace) 129 | elif fg_inds.size > 0: 130 | to_replace = fg_inds.size < rois_per_image 131 | fg_inds = npr.choice(fg_inds, size=int(rois_per_image), replace=to_replace) 132 | fg_rois_per_image = rois_per_image 133 | elif bg_inds.size > 0: 134 | to_replace = bg_inds.size < rois_per_image 135 | bg_inds = npr.choice(bg_inds, size=int(rois_per_image), replace=to_replace) 136 | fg_rois_per_image = 0 137 | else: 138 | import pdb 139 | pdb.set_trace() 140 | else: 141 | # foreground RoIs 142 | fg_rois_per_this_image = min(fg_rois_per_image, fg_inds.size) 143 | # Sample foreground regions without replacement 144 | if fg_inds.size > 0: 145 | fg_inds = npr.choice(fg_inds, size=fg_rois_per_this_image, replace=False) 146 | 147 | # Compute number of background RoIs to take from this image (guarding 148 | # against there being fewer than desired) 149 | bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image 150 | bg_rois_per_this_image = min(bg_rois_per_this_image, bg_inds.size) 151 | # Sample background regions without replacement 152 | if bg_inds.size > 0: 153 | bg_inds = npr.choice(bg_inds, size=bg_rois_per_this_image, replace=False) 154 | 155 | # The indices that we're selecting (both fg and bg) 156 | keep_inds = np.append(fg_inds, bg_inds) 157 | # Select sampled values from various arrays: 158 | labels = labels[keep_inds] 159 | phrases = phrases[keep_inds] 160 | # Clamp labels for the background RoIs to 0 161 | labels[int(fg_rois_per_image):] = 0 162 | phrases[int(fg_rois_per_image):, :] = 0 163 | rois = all_rois[keep_inds] 164 | roi_scores = all_scores[keep_inds] 165 | 166 | bbox_target_data = _compute_targets( 167 | rois[:, 1:5], gt_boxes[gt_assignment[keep_inds], :4], labels) 168 | 169 | if cfg.DEBUG_ALL: 170 | target_boxes = compute_rois_offset(rois[:, 1:5], bbox_target_data[:, 1:5]) 171 | match_boxes = gt_boxes[gt_assignment[keep_inds], :4] 172 | print('boxes consistency check') 173 | print(target_boxes[:2,:]) 174 | print(match_boxes[:2,:]) 175 | assert np.linalg.norm(target_boxes - match_boxes) < 0.01 176 | 177 | bbox_targets, bbox_inside_weights = \ 178 | _get_bbox_regression_labels(bbox_target_data) 179 | 180 | return labels, rois, roi_scores, bbox_targets, bbox_inside_weights, phrases 181 | 
-------------------------------------------------------------------------------- /lib/layers/proposal_top_layer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Xinlei Chen 5 | # -------------------------------------------------------- 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import numpy as np 11 | from lib.config import cfg 12 | from lib.fast_rcnn.bbox_transform import bbox_transform_inv, clip_boxes 13 | import numpy.random as npr 14 | 15 | 16 | def proposal_top_layer(rpn_cls_prob, rpn_bbox_pred, im_info, _feat_stride, anchors, num_anchors): 17 | """A layer that just selects the top region proposals 18 | without using non-maximal suppression, 19 | For details please see the technical report 20 | """ 21 | rpn_top_n = cfg.TEST.RPN_TOP_N 22 | 23 | scores = rpn_cls_prob[:, :, :, num_anchors:] 24 | 25 | rpn_bbox_pred = rpn_bbox_pred.reshape((-1, 4)) 26 | scores = scores.reshape((-1, 1)) 27 | 28 | length = scores.shape[0] 29 | if length < rpn_top_n: 30 | # Random selection, maybe unnecessary and loses good proposals 31 | # But such case rarely happens 32 | top_inds = npr.choice(length, size=rpn_top_n, replace=True) 33 | else: 34 | top_inds = scores.argsort(0)[::-1] 35 | top_inds = top_inds[:rpn_top_n] 36 | top_inds = top_inds.reshape(rpn_top_n, ) 37 | 38 | # Do the selection here 39 | anchors = anchors[top_inds, :] 40 | rpn_bbox_pred = rpn_bbox_pred[top_inds, :] 41 | scores = scores[top_inds] 42 | 43 | # Convert anchors into proposals via bbox transformations 44 | proposals = bbox_transform_inv(anchors, rpn_bbox_pred) 45 | 46 | # Clip predicted boxes to image 47 | proposals = clip_boxes(proposals, im_info[:2]) 48 | 49 | # Output rois blob 50 | # Our RPN implementation only supports a single input image, so all 51 | # batch inds are 0 52 | batch_inds = np.zeros((proposals.shape[0], 1), dtype=np.float32) 53 | blob = np.hstack((batch_inds, proposals.astype(np.float32, copy=False))) 54 | return blob, scores 55 | -------------------------------------------------------------------------------- /lib/layers/rois_offset_layer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DenseCap-Tensorflow 3 | # Written by InnerPeace 4 | # This file is adapted from Linjie's work 5 | # -------------------------------------------------------- 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import numpy as np 11 | from lib.config import cfg 12 | from lib.fast_rcnn.bbox_transform import bbox_transform_inv, clip_boxes 13 | 14 | 15 | # compute the new bboxes shifted by offset from rois 16 | def compute_rois_offset(rois, offset, im_info=None): 17 | """Compute bounding-box offset for region of interests""" 18 | 19 | assert rois.shape[1] == 4 20 | assert offset.shape[1] == 4 21 | 22 | if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED: 23 | # Optionally normalize targets by a precomputed mean and stdev -- reverse the transformation 24 | offset_unnorm = offset * np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS) + \ 25 | np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS) 26 | else: 27 | offset_unnorm = offset.copy() 28 | rois_offset = bbox_transform_inv(rois, offset_unnorm) 29 | if 
not im_info is None: 30 | rois_offset = clip_boxes(rois_offset, im_info[:2]) 31 | return rois_offset 32 | -------------------------------------------------------------------------------- /lib/layers/sentence_data_layer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DenseCap-Tensorflow 3 | # Written by InnerPeace 4 | # This file is adapted from Linjie's work 5 | # -------------------------------------------------------- 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | """This python layer accepts region ids as input and 11 | retrieves region sentense for them.""" 12 | 13 | from six.moves import cPickle 14 | from lib.config import cfg 15 | from collections import Counter 16 | import numpy as np 17 | import six 18 | from six.moves import xrange 19 | 20 | # TODO: disable debug and clear stuff 21 | DEBUG = True 22 | 23 | 24 | def sentence_data_layer(labels, roi_phrases, time_steps=12, mode='concat'): 25 | all_modes = ('repeat', 'concat') 26 | assert (mode in all_modes), "Wrong type of mode which should be 'repeat' or 'concat'" 27 | 28 | if cfg.DEBUG_ALL: 29 | print('length of labels, i.e. number of regions: {}'.format(len(roi_phrases))) 30 | 31 | # all_regions is a dict from region id to caption stream 32 | assert len(labels.shape) == 2, 'Pleace check the shape of "label"' 33 | 34 | num_regions = labels.shape[0] 35 | if mode == 'repeat': 36 | input_sentence = np.zeros((num_regions, time_steps), dtype=np.float32) 37 | elif mode == 'concat': 38 | input_sentence = np.zeros((num_regions, time_steps - 1), dtype=np.float32) 39 | 40 | target_sentence = np.zeros((num_regions, time_steps), dtype=np.float32) 41 | cont_sentence = np.zeros((num_regions, time_steps), dtype=np.float32) 42 | cont_bbox = np.zeros((num_regions, time_steps), dtype=np.float32) 43 | for i in xrange(num_regions): 44 | stream = get_streams(roi_phrases[i], int(labels[i]), time_steps, mode) 45 | input_sentence[i, :] = stream['input_sentence'] 46 | target_sentence[i, :] = stream['target_sentence'] 47 | cont_sentence[i, :] = stream['cont_sentence'] 48 | cont_bbox[i, :] = stream['cont_bbox'] 49 | 50 | if cfg.DEBUG_ALL: 51 | print('sentence data layer input (first 3)') 52 | for ix, l in enumerate(labels[:3]): 53 | print(l[0], roi_phrases[ix]) 54 | print('sentence data layer output (first 3)') 55 | print('input sentence') 56 | print(input_sentence[:3, :]) 57 | print('target sentence') 58 | print(target_sentence[:3, :]) 59 | print('cont sentence') 60 | print(cont_sentence[:3, :]) 61 | print('cont bbox') 62 | print(cont_bbox[:3, :]) 63 | 64 | return input_sentence, target_sentence, cont_sentence, cont_bbox 65 | 66 | 67 | def get_streams(phrases, region_id, time_steps=12, mode='concat'): 68 | 69 | if mode == 'repeat': 70 | # Image features repeated at each time step 71 | if region_id > 0: 72 | stream = phrases[:np.sum(phrases > 0)] 73 | stream = stream.tolist() 74 | pad = time_steps - (len(stream) + 1) 75 | out = {} 76 | out['cont_sentence'] = [0] + [1] * len(stream) + [0] * pad 77 | out['input_sentence'] = [1] + stream + [0] * pad 78 | out['target_sentence'] = stream + [2] + [0] * pad 79 | # only make prediction at the last time step for bbox 80 | out['cont_bbox'] = [0] * len(stream) + [1] + [0] * pad 81 | 82 | for key, val in six.iteritems(out): 83 | if len(val) > time_steps: 84 | out[key] = val[:time_steps] 85 | else: 86 | # negative sample, no phrase 
related 87 | out = {} 88 | out['cont_sentence'] = [0] * time_steps 89 | out['input_sentence'] = [0] * time_steps 90 | out['target_sentence'] = [0] * time_steps 91 | out['cont_bbox'] = [0] * time_steps 92 | 93 | elif mode == 'concat': 94 | # Image feature concatenated to the first time step 95 | if region_id > 0: 96 | # stream = phrases[region_id] 97 | stream = phrases[:np.sum(phrases > 0)] 98 | stream = stream.tolist() 99 | pad = time_steps - (len(stream) + 2) 100 | out = {} 101 | out['cont_sentence'] = [0] + [1] * (len(stream) + 1) + [0] * pad 102 | out['input_sentence'] = [1] + stream + [0] * pad 103 | out['target_sentence'] = [1] + stream + [2] + [0] * pad 104 | # only make prediction at the last time step for bbox 105 | out['cont_bbox'] = [0] * (len(stream) + 1) + [1] + [0] * pad 106 | 107 | for key, val in six.iteritems(out): 108 | if len(val) > time_steps: 109 | out[key] = val[:time_steps] 110 | else: 111 | # negative sample, no phrase related 112 | out = {} 113 | out['cont_sentence'] = [0] * time_steps 114 | out['input_sentence'] = [0] * (time_steps - 1) 115 | out['target_sentence'] = [0] * time_steps 116 | out['cont_bbox'] = [0] * time_steps 117 | else: 118 | # Global feature and region feature concatenated to the first time step 119 | if region_id > 0: 120 | stream = phrases[region_id] 121 | stream = stream.tolist() 122 | pad = time_steps - (len(stream) + 3) 123 | out = {} 124 | out['cont_sentence'] = [0] + [1] * (len(stream) + 2) + [0] * pad 125 | out['input_sentence'] = [1] + stream + [0] * pad 126 | out['target_sentence'] = [1, 1] + stream + [2] + [0] * pad 127 | # only make prediction at the last time step for bbox 128 | out['cont_bbox'] = [0] * (len(stream) + 2) + [1] + [0] * pad 129 | 130 | for key, val in out.iteritems(): 131 | if len(val) > time_steps: 132 | out[key] = val[:time_steps] 133 | else: 134 | # negative sample, no phrase related 135 | out = {} 136 | out['cont_sentence'] = [0] * time_steps 137 | out['input_sentence'] = [0] * (time_steps - 2) 138 | out['target_sentence'] = [0] * time_steps 139 | out['cont_bbox'] = [0] * time_steps 140 | 141 | return out 142 | -------------------------------------------------------------------------------- /lib/layers/snippets.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Tensorflow Faster R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Xinlei Chen 5 | # -------------------------------------------------------- 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import numpy as np 11 | from lib.layers.generate_anchors import generate_anchors 12 | 13 | 14 | def generate_anchors_pre(height, width, feat_stride, anchor_scales=(8, 16, 32), anchor_ratios=(0.5, 1, 2)): 15 | """ A wrapper function to generate anchors given different scales 16 | Also return the number of anchors in variable 'length' 17 | """ 18 | anchors = generate_anchors(ratios=np.array(anchor_ratios), scales=np.array(anchor_scales)) 19 | A = anchors.shape[0] 20 | shift_x = np.arange(0, width) * feat_stride 21 | shift_y = np.arange(0, height) * feat_stride 22 | shift_x, shift_y = np.meshgrid(shift_x, shift_y) 23 | shifts = np.vstack((shift_x.ravel(), shift_y.ravel(), shift_x.ravel(), shift_y.ravel())).transpose() 24 | K = shifts.shape[0] 25 | # width changes faster, so here it is H, W, C 26 | anchors = anchors.reshape((1, A, 4)) + shifts.reshape((1, K, 
4)).transpose((1, 0, 2)) 27 | anchors = anchors.reshape((K * A, 4)).astype(np.float32, copy=False) 28 | length = np.int32(anchors.shape[0]) 29 | 30 | return anchors, length 31 | -------------------------------------------------------------------------------- /lib/limit_ram/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/InnerPeace-Wu/densecap-tensorflow/2c77998f73832d5bb7324e97e8a7419cbce8398c/lib/limit_ram/__init__.py -------------------------------------------------------------------------------- /lib/limit_ram/utils.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DenseCap-Tensorflow 3 | # Written by InnerPeace 4 | # This file is adapted from Ross Girshick's work 5 | # -------------------------------------------------------- 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | """functions for LIMIT_RAM version""" 11 | 12 | # import sys 13 | # sys.path.append("..") 14 | 15 | import numpy as np 16 | from lib.config import cfg 17 | 18 | 19 | def pre_roidb(roidb): 20 | """Enrich the imdb's roidb by adding some derived quantities that 21 | are useful for training. This function precomputes the maximum 22 | overlap, taken over ground-truth boxes, between each ROI and 23 | each ground-truth box. The class with maximum overlap is also 24 | recorded. 25 | """ 26 | # need gt_overlaps as a dense array for argmax 27 | gt_overlaps = roidb['gt_overlaps'].toarray() 28 | # max overlap with gt over classes (columns) 29 | max_overlaps = gt_overlaps.max(axis=1) 30 | # gt class that had the max overlap 31 | max_classes = gt_overlaps.argmax(axis=1) 32 | roidb['max_classes'] = max_classes 33 | roidb['max_overlaps'] = max_overlaps 34 | # sanity checks 35 | # max overlap of 0 => class should be zero (background) 36 | zero_inds = np.where(max_overlaps == 0)[0] 37 | assert all(max_classes[zero_inds] == 0) 38 | # max overlap > 0 => class should not be zero (must be a fg class) 39 | # nonzero_inds = np.where(max_overlaps > 0)[0] 40 | # assert all(max_classes[nonzero_inds] != 0) 41 | return roidb 42 | 43 | 44 | def is_valid_limitRam(entry): 45 | # Valid images have: 46 | # (1) At least one foreground RoI OR 47 | # (2) At least one background RoI 48 | overlaps = entry['max_overlaps'] 49 | # find boxes with sufficient overlap 50 | fg_inds = np.where(overlaps >= cfg.TRAIN.FG_THRESH)[0] 51 | # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI) 52 | bg_inds = np.where((overlaps < cfg.TRAIN.BG_THRESH_HI) & 53 | (overlaps >= cfg.TRAIN.BG_THRESH_LO))[0] 54 | # image is only valid if such boxes exist 55 | valid = len(fg_inds) > 0 or len(bg_inds) > 0 56 | return valid 57 | 58 | 59 | def flip_image(roidb): 60 | """flip image and change the name for reading later""" 61 | 62 | boxes = roidb['boxes'].copy() 63 | oldx1 = boxes[:, 0].copy() 64 | oldx2 = boxes[:, 2].copy() 65 | boxes[:, 0] = roidb['width'] - oldx2 - 1 66 | boxes[:, 2] = roidb['width'] - oldx1 - 1 67 | assert (boxes[:, 2] >= boxes[:, 0]).all() 68 | entry = {'boxes': boxes, 69 | 'gt_overlaps': roidb['gt_overlaps'], 70 | 'gt_classes': roidb['gt_classes'], 71 | 'flipped': True, 72 | 'gt_phrases': roidb['gt_phrases'], 73 | 'width': roidb['width'], 74 | 'height': roidb['height'], 75 | 'image': roidb['image'], 76 | 'image_id': '%s_flip' % roidb['image_id']} 77 | 78 | return entry 79 | 
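Note: flip_image() above augments training data by mirroring each image horizontally; only the x coordinates of the boxes change, y is untouched. A minimal stand-alone sketch of that coordinate transform (the toy width and boxes below are made up for illustration, not taken from the dataset):

# Horizontal flip of [x1, y1, x2, y2] boxes, as done in flip_image() above.
import numpy as np

def flip_boxes(boxes, width):
    """Mirror boxes about the vertical image axis: x1, x2 -> width-1-x2, width-1-x1."""
    flipped = boxes.copy()
    oldx1 = boxes[:, 0].copy()
    oldx2 = boxes[:, 2].copy()
    flipped[:, 0] = width - oldx2 - 1
    flipped[:, 2] = width - oldx1 - 1
    assert (flipped[:, 2] >= flipped[:, 0]).all()
    return flipped

boxes = np.array([[10., 20., 59., 80.],
                  [ 0.,  0., 99., 49.]])
print(flip_boxes(boxes, width=100))
# [[40. 20. 89. 80.]
#  [ 0.  0. 99. 49.]]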
-------------------------------------------------------------------------------- /lib/nets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/InnerPeace-Wu/densecap-tensorflow/2c77998f73832d5bb7324e97e8a7419cbce8398c/lib/nets/__init__.py -------------------------------------------------------------------------------- /lib/nets/vgg16.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Tensorflow Faster R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Xinlei Chen 5 | # -------------------------------------------------------- 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import tensorflow as tf 11 | import tensorflow.contrib.slim as slim 12 | from tensorflow.contrib.slim import losses 13 | from tensorflow.contrib.slim import arg_scope 14 | import numpy as np 15 | 16 | from lib.nets.network import Network 17 | from lib.config import cfg 18 | 19 | 20 | class vgg16(Network): 21 | def __init__(self): 22 | Network.__init__(self) 23 | self._feat_stride = [16, ] 24 | self._feat_compress = [1. / float(self._feat_stride[0]), ] 25 | self._scope = 'DenseCap_VGG16' 26 | self._vgg_scope = 'vgg_16' 27 | 28 | def _image_to_head(self, is_training, reuse=None): 29 | with tf.variable_scope(self._vgg_scope, self._vgg_scope, reuse=reuse): 30 | net = slim.repeat(self._image, 2, slim.conv2d, 64, [3, 3], 31 | trainable=False, scope='conv1') 32 | net = slim.max_pool2d(net, [2, 2], padding='SAME', scope='pool1') 33 | net = slim.repeat(net, 2, slim.conv2d, 128, [3, 3], 34 | trainable=False, scope='conv2') 35 | net = slim.max_pool2d(net, [2, 2], padding='SAME', scope='pool2') 36 | net = slim.repeat(net, 3, slim.conv2d, 256, [3, 3], 37 | trainable=is_training, scope='conv3') 38 | net = slim.max_pool2d(net, [2, 2], padding='SAME', scope='pool3') 39 | net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], 40 | trainable=is_training, scope='conv4') 41 | net = slim.max_pool2d(net, [2, 2], padding='SAME', scope='pool4') 42 | net = slim.repeat(net, 3, slim.conv2d, 512, [3, 3], 43 | trainable=is_training, scope='conv5') 44 | 45 | self._act_summaries.append(net) 46 | self._layers['head'] = net 47 | 48 | return net 49 | 50 | def _head_to_tail(self, pool5, is_training, reuse=None): 51 | with tf.variable_scope(self._vgg_scope, self._vgg_scope, reuse=reuse): 52 | pool5_flat = slim.flatten(pool5, scope='flatten') 53 | fc6 = slim.fully_connected(pool5_flat, 4096, scope='fc6') 54 | if is_training: 55 | fc6 = slim.dropout(fc6, keep_prob=0.5, is_training=True, 56 | scope='dropout6') 57 | fc7 = slim.fully_connected(fc6, 4096, scope='fc7') 58 | if is_training: 59 | fc7 = slim.dropout(fc7, keep_prob=0.5, is_training=True, 60 | scope='dropout7') 61 | 62 | return fc7 63 | 64 | def get_variables_to_restore(self, variables, var_keep_dic): 65 | variables_to_restore = [] 66 | 67 | for v in variables: 68 | # exclude the conv weights that are fc weights in vgg16 69 | if v.name == (self._vgg_scope + '/fc6/weights:0') or \ 70 | v.name == (self._vgg_scope + '/fc7/weights:0'): 71 | self._variables_to_fix[v.name] = v 72 | continue 73 | # exclude the first conv layer to swap RGB to BGR 74 | if v.name == (self._vgg_scope + '/conv1/conv1_1/weights:0'): 75 | self._variables_to_fix[v.name] = v 76 | continue 77 | if v.name.split(':')[0] in var_keep_dic: 78 | 
print('Variables restored: %s' % v.name) 79 | variables_to_restore.append(v) 80 | 81 | return variables_to_restore 82 | 83 | def fix_variables(self, sess, pretrained_model): 84 | print('Fix VGG16 layers..') 85 | with tf.variable_scope('Fix_VGG16') as scope: 86 | with tf.device("/cpu:0"): 87 | # fix the vgg16 issue from conv weights to fc weights 88 | # fix RGB to BGR 89 | fc6_conv = tf.get_variable("fc6_conv", [7, 7, 512, 4096], trainable=False) 90 | fc7_conv = tf.get_variable("fc7_conv", [1, 1, 4096, 4096], trainable=False) 91 | conv1_rgb = tf.get_variable("conv1_rgb", [3, 3, 3, 64], trainable=False) 92 | restorer_fc = tf.train.Saver({self._vgg_scope + "/fc6/weights": fc6_conv, 93 | self._vgg_scope + "/fc7/weights": fc7_conv, 94 | self._vgg_scope + "/conv1/conv1_1/weights": conv1_rgb}) 95 | restorer_fc.restore(sess, pretrained_model) 96 | 97 | sess.run(tf.assign(self._variables_to_fix[self._vgg_scope + '/fc6/weights:0'], tf.reshape(fc6_conv, 98 | self._variables_to_fix[ 99 | self._vgg_scope + '/fc6/weights:0'].get_shape()))) 100 | sess.run(tf.assign(self._variables_to_fix[self._vgg_scope + '/fc7/weights:0'], tf.reshape(fc7_conv, 101 | self._variables_to_fix[ 102 | self._vgg_scope + '/fc7/weights:0'].get_shape()))) 103 | sess.run(tf.assign(self._variables_to_fix[self._vgg_scope + '/conv1/conv1_1/weights:0'], 104 | tf.reverse(conv1_rgb, [2]))) 105 | -------------------------------------------------------------------------------- /lib/nms/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/InnerPeace-Wu/densecap-tensorflow/2c77998f73832d5bb7324e97e8a7419cbce8398c/lib/nms/__init__.py -------------------------------------------------------------------------------- /lib/nms/cpu_nms.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | cimport numpy as np 10 | 11 | cdef inline np.float32_t max(np.float32_t a, np.float32_t b): 12 | return a if a >= b else b 13 | 14 | cdef inline np.float32_t min(np.float32_t a, np.float32_t b): 15 | return a if a <= b else b 16 | 17 | def cpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh): 18 | cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0] 19 | cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1] 20 | cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2] 21 | cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3] 22 | cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4] 23 | 24 | cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1) 25 | cdef np.ndarray[np.int_t, ndim=1] order = scores.argsort()[::-1] 26 | 27 | cdef int ndets = dets.shape[0] 28 | cdef np.ndarray[np.int_t, ndim=1] suppressed = \ 29 | np.zeros((ndets), dtype=np.int) 30 | 31 | # nominal indices 32 | cdef int _i, _j 33 | # sorted indices 34 | cdef int i, j 35 | # temp variables for box i's (the box currently under consideration) 36 | cdef np.float32_t ix1, iy1, ix2, iy2, iarea 37 | # variables for computing overlap with box j (lower scoring box) 38 | cdef np.float32_t xx1, yy1, xx2, yy2 39 | cdef np.float32_t w, h 40 | cdef np.float32_t inter, ovr 41 | 42 | keep = [] 43 | for _i in range(ndets): 44 | i = order[_i] 45 | if suppressed[i] == 1: 46 | 
continue 47 | keep.append(i) 48 | ix1 = x1[i] 49 | iy1 = y1[i] 50 | ix2 = x2[i] 51 | iy2 = y2[i] 52 | iarea = areas[i] 53 | for _j in range(_i + 1, ndets): 54 | j = order[_j] 55 | if suppressed[j] == 1: 56 | continue 57 | xx1 = max(ix1, x1[j]) 58 | yy1 = max(iy1, y1[j]) 59 | xx2 = min(ix2, x2[j]) 60 | yy2 = min(iy2, y2[j]) 61 | w = max(0.0, xx2 - xx1 + 1) 62 | h = max(0.0, yy2 - yy1 + 1) 63 | inter = w * h 64 | ovr = inter / (iarea + areas[j] - inter) 65 | if ovr >= thresh: 66 | suppressed[j] = 1 67 | 68 | return keep 69 | -------------------------------------------------------------------------------- /lib/nms/gpu_nms.hpp: -------------------------------------------------------------------------------- 1 | void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, 2 | int boxes_dim, float nms_overlap_thresh, int device_id); 3 | -------------------------------------------------------------------------------- /lib/nms/gpu_nms.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | cimport numpy as np 10 | 11 | assert sizeof(int) == sizeof(np.int32_t) 12 | 13 | cdef extern from "gpu_nms.hpp": 14 | void _nms(np.int32_t*, int*, np.float32_t*, int, int, float, int) 15 | 16 | def gpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh, 17 | np.int32_t device_id=0): 18 | cdef int boxes_num = dets.shape[0] 19 | cdef int boxes_dim = dets.shape[1] 20 | cdef int num_out 21 | cdef np.ndarray[np.int32_t, ndim=1] \ 22 | keep = np.zeros(boxes_num, dtype=np.int32) 23 | cdef np.ndarray[np.float32_t, ndim=1] \ 24 | scores = dets[:, 4] 25 | cdef np.ndarray[np.int_t, ndim=1] \ 26 | order = scores.argsort()[::-1] 27 | cdef np.ndarray[np.float32_t, ndim=2] \ 28 | sorted_dets = dets[order, :] 29 | _nms(&keep[0], &num_out, &sorted_dets[0, 0], boxes_num, boxes_dim, thresh, device_id) 30 | keep = keep[:num_out] 31 | return list(order[keep]) 32 | -------------------------------------------------------------------------------- /lib/nms/nms_kernel.cu: -------------------------------------------------------------------------------- 1 | // ------------------------------------------------------------------ 2 | // Faster R-CNN 3 | // Copyright (c) 2015 Microsoft 4 | // Licensed under The MIT License [see fast-rcnn/LICENSE for details] 5 | // Written by Shaoqing Ren 6 | // ------------------------------------------------------------------ 7 | 8 | #include "gpu_nms.hpp" 9 | #include 10 | #include 11 | 12 | #define CUDA_CHECK(condition) \ 13 | /* Code block avoids redefinition of cudaError_t error */ \ 14 | do { \ 15 | cudaError_t error = condition; \ 16 | if (error != cudaSuccess) { \ 17 | std::cout << cudaGetErrorString(error) << std::endl; \ 18 | } \ 19 | } while (0) 20 | 21 | #define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0)) 22 | int const threadsPerBlock = sizeof(unsigned long long) * 8; 23 | 24 | __device__ inline float devIoU(float const * const a, float const * const b) { 25 | float left = max(a[0], b[0]), right = min(a[2], b[2]); 26 | float top = max(a[1], b[1]), bottom = min(a[3], b[3]); 27 | float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); 28 | float interS = width * height; 29 | float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); 
30 | float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1); 31 | return interS / (Sa + Sb - interS); 32 | } 33 | 34 | __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh, 35 | const float *dev_boxes, unsigned long long *dev_mask) { 36 | const int row_start = blockIdx.y; 37 | const int col_start = blockIdx.x; 38 | 39 | // if (row_start > col_start) return; 40 | 41 | const int row_size = 42 | min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); 43 | const int col_size = 44 | min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); 45 | 46 | __shared__ float block_boxes[threadsPerBlock * 5]; 47 | if (threadIdx.x < col_size) { 48 | block_boxes[threadIdx.x * 5 + 0] = 49 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0]; 50 | block_boxes[threadIdx.x * 5 + 1] = 51 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1]; 52 | block_boxes[threadIdx.x * 5 + 2] = 53 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2]; 54 | block_boxes[threadIdx.x * 5 + 3] = 55 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3]; 56 | block_boxes[threadIdx.x * 5 + 4] = 57 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4]; 58 | } 59 | __syncthreads(); 60 | 61 | if (threadIdx.x < row_size) { 62 | const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; 63 | const float *cur_box = dev_boxes + cur_box_idx * 5; 64 | int i = 0; 65 | unsigned long long t = 0; 66 | int start = 0; 67 | if (row_start == col_start) { 68 | start = threadIdx.x + 1; 69 | } 70 | for (i = start; i < col_size; i++) { 71 | if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { 72 | t |= 1ULL << i; 73 | } 74 | } 75 | const int col_blocks = DIVUP(n_boxes, threadsPerBlock); 76 | dev_mask[cur_box_idx * col_blocks + col_start] = t; 77 | } 78 | } 79 | 80 | void _set_device(int device_id) { 81 | int current_device; 82 | CUDA_CHECK(cudaGetDevice(¤t_device)); 83 | if (current_device == device_id) { 84 | return; 85 | } 86 | // The call to cudaSetDevice must come before any calls to Get, which 87 | // may perform initialization using the GPU. 
88 | CUDA_CHECK(cudaSetDevice(device_id)); 89 | } 90 | 91 | void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, 92 | int boxes_dim, float nms_overlap_thresh, int device_id) { 93 | _set_device(device_id); 94 | 95 | float* boxes_dev = NULL; 96 | unsigned long long* mask_dev = NULL; 97 | 98 | const int col_blocks = DIVUP(boxes_num, threadsPerBlock); 99 | 100 | CUDA_CHECK(cudaMalloc(&boxes_dev, 101 | boxes_num * boxes_dim * sizeof(float))); 102 | CUDA_CHECK(cudaMemcpy(boxes_dev, 103 | boxes_host, 104 | boxes_num * boxes_dim * sizeof(float), 105 | cudaMemcpyHostToDevice)); 106 | 107 | CUDA_CHECK(cudaMalloc(&mask_dev, 108 | boxes_num * col_blocks * sizeof(unsigned long long))); 109 | 110 | dim3 blocks(DIVUP(boxes_num, threadsPerBlock), 111 | DIVUP(boxes_num, threadsPerBlock)); 112 | dim3 threads(threadsPerBlock); 113 | nms_kernel<<>>(boxes_num, 114 | nms_overlap_thresh, 115 | boxes_dev, 116 | mask_dev); 117 | 118 | std::vector mask_host(boxes_num * col_blocks); 119 | CUDA_CHECK(cudaMemcpy(&mask_host[0], 120 | mask_dev, 121 | sizeof(unsigned long long) * boxes_num * col_blocks, 122 | cudaMemcpyDeviceToHost)); 123 | 124 | std::vector remv(col_blocks); 125 | memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); 126 | 127 | int num_to_keep = 0; 128 | for (int i = 0; i < boxes_num; i++) { 129 | int nblock = i / threadsPerBlock; 130 | int inblock = i % threadsPerBlock; 131 | 132 | if (!(remv[nblock] & (1ULL << inblock))) { 133 | keep_out[num_to_keep++] = i; 134 | unsigned long long *p = &mask_host[0] + i * col_blocks; 135 | for (int j = nblock; j < col_blocks; j++) { 136 | remv[j] |= p[j]; 137 | } 138 | } 139 | } 140 | *num_out = num_to_keep; 141 | 142 | CUDA_CHECK(cudaFree(boxes_dev)); 143 | CUDA_CHECK(cudaFree(mask_dev)); 144 | } 145 | -------------------------------------------------------------------------------- /lib/nms/py_cpu_nms.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | 10 | def py_cpu_nms(dets, thresh): 11 | """Pure Python NMS baseline.""" 12 | x1 = dets[:, 0] 13 | y1 = dets[:, 1] 14 | x2 = dets[:, 2] 15 | y2 = dets[:, 3] 16 | scores = dets[:, 4] 17 | 18 | areas = (x2 - x1 + 1) * (y2 - y1 + 1) 19 | order = scores.argsort()[::-1] 20 | 21 | keep = [] 22 | while order.size > 0: 23 | i = order[0] 24 | keep.append(i) 25 | xx1 = np.maximum(x1[i], x1[order[1:]]) 26 | yy1 = np.maximum(y1[i], y1[order[1:]]) 27 | xx2 = np.minimum(x2[i], x2[order[1:]]) 28 | yy2 = np.minimum(y2[i], y2[order[1:]]) 29 | 30 | w = np.maximum(0.0, xx2 - xx1 + 1) 31 | h = np.maximum(0.0, yy2 - yy1 + 1) 32 | inter = w * h 33 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 34 | 35 | inds = np.where(ovr <= thresh)[0] 36 | order = order[inds + 1] 37 | 38 | return keep 39 | -------------------------------------------------------------------------------- /lib/pre_glove.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DenseCap-Tensorflow 3 | # Written by InnerPeace 4 | # This file is adapted from cs224-2017 stanford 5 | # -------------------------------------------------------- 6 | from __future__ import absolute_import 7 | from __future__ import 
division 8 | from __future__ import print_function 9 | 10 | 11 | from tensorflow.python.platform import gfile 12 | from os.path import join as pjoin 13 | from tqdm import * 14 | import numpy as np 15 | import os 16 | 17 | from config import cfg 18 | 19 | 20 | _PAD = b"" 21 | _SOS = b"" 22 | _EOS = b"" 23 | 24 | 25 | def initialize_vocabulary(vocabulary_path): 26 | # map vocab to word embeddings 27 | if gfile.Exists(vocabulary_path): 28 | rev_vocab = [_PAD, _SOS, _EOS] 29 | with gfile.GFile(vocabulary_path, mode="r") as f: 30 | rev_vocab.extend(f.readlines()) 31 | rev_vocab = [line.strip('\n') for line in rev_vocab] 32 | vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)]) 33 | return vocab, rev_vocab 34 | else: 35 | raise ValueError("Vocabulary file %s not found.", vocabulary_path) 36 | 37 | 38 | def process_glove(vocab_list, save_path, size=4e5, random_init=True): 39 | """ 40 | :param vocab_list: [vocab] 41 | :return: 42 | """ 43 | if not gfile.Exists(save_path + ".npz"): 44 | glove_path = os.path.join(cfg.DATA_DIR, "glove.6B.{}d.txt".format(cfg.GLOVE_DIM)) 45 | if random_init: 46 | glove = np.random.randn(len(vocab_list), cfg.GLOVE_DIM) 47 | else: 48 | glove = np.zeros((len(vocab_list), cfg.GLOVE_DIM)) 49 | found = 0 50 | with open(glove_path, 'r') as fh: 51 | for line in tqdm(fh, total=size): 52 | array = line.lstrip().rstrip().split(" ") 53 | word = array[0] 54 | vector = list(map(float, array[1:])) 55 | if word in vocab_list: 56 | idx = vocab_list.index(word) 57 | glove[idx, :] = vector 58 | found += 1 59 | if word.capitalize() in vocab_list: 60 | idx = vocab_list.index(word.capitalize()) 61 | glove[idx, :] = vector 62 | found += 1 63 | if word.upper() in vocab_list: 64 | idx = vocab_list.index(word.upper()) 65 | glove[idx, :] = vector 66 | found += 1 67 | 68 | print("{}/{} of word vocab have corresponding vectors in {}".format(found, len(vocab_list), glove_path)) 69 | np.savez_compressed(save_path, glove=glove) 70 | print("saved trimmed glove matrix at: {}".format(save_path)) 71 | 72 | 73 | if __name__ == "__main__": 74 | vocab_path = pjoin(cfg.CACHE_DIR, 'vocabulary.txt') 75 | vocab, rev_vocab = initialize_vocabulary(vocab_path) 76 | process_glove(rev_vocab, cfg.DATA_DIR + "/glove.trimmed.{}".format(cfg.GLOVE_DIM), 77 | random_init=True) 78 | -------------------------------------------------------------------------------- /lib/preprocess.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Preprocessing data in valohai computing platform. 4 | # This script may out of date. 
#2017.12.20 5 | set -e 6 | set -x 7 | 8 | POSITIONAL=() 9 | while [[ $# -gt 0 ]]; do 10 | #statements 11 | key="$1" 12 | 13 | case $key in 14 | -vs|--version) 15 | VERSION=$2 16 | shift 17 | shift 18 | ;; 19 | -p|--path) 20 | IN_PATH=$2 21 | shift 22 | shift 23 | ;; 24 | -od|--output_dir) 25 | OUTPUT_DIR=$2 26 | shift 27 | shift 28 | ;; 29 | -mw|--max_words) 30 | MAX_WORDS=$2 31 | shift 32 | shift 33 | ;; 34 | *) 35 | POSITIONAL+=("$1") 36 | shift 37 | ;; 38 | esac 39 | done 40 | 41 | 42 | if [ -d "/valohai/inputs" ]; then 43 | # apt-get -y update 44 | # apt-get -y install python-pip 45 | pip install -r requirements.txt 46 | cd /valohai/inputs 47 | mkdir ${VERSION} 48 | unzip image_meta/image_data.json.zip -d ./${VERSION} 49 | unzip regions/region_descriptions.json.zip -d ./${VERSION} 50 | cd /valohai/repository/lib 51 | time python2 preprocess.py --version ${VERSION} \ 52 | --path ${IN_PATH} \ 53 | --output_dir ${OUTPUT_DIR} \ 54 | --max_words ${MAX_WORDS} 55 | 56 | tar -czvf /valohai/outputs/visual_genome.tar.gz ${OUTPUT_DIR} 57 | # comment it if one already have data stored in S3 58 | mv regions/region_descriptions.json.zip /valohai/outputs 59 | fi 60 | 61 | 62 | 63 | -------------------------------------------------------------------------------- /lib/pycocoevalcap/README: -------------------------------------------------------------------------------- 1 | ============================= 2 | Linjie Yang 3 | 04/21/2016 4 | ============================= 5 | This folder holds the functions for evaluating image captioning models, including the dense captioning models. This folder is originally from a standard evaluation toolkit for MS COCO (https://github.com/tylin/coco-caption). 6 | The newly added functions and usages are as follows. 7 | (1) dt_eval.py: function to evaluate captioning model on web data. One image only has one ground truth caption. 8 | (2) vg_eval.py: function to evaluate the dense captioning model on visual genome. Calculate Meteor score and mean AP which are described in the DenseCap paper (http://arxiv.org/abs/1511.07571). 9 | (3) meteor/meteor2.py: modified version of "meteor/meteor.py". Adapted to be usedfor multi-to-multi caption matching in DenseCap. 10 | -------------------------------------------------------------------------------- /lib/pycocoevalcap/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /lib/pycocoevalcap/bleu/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015 Xinlei Chen, Hao Fang, Tsung-Yi Lin, and Ramakrishna Vedantam 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /lib/pycocoevalcap/bleu/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /lib/pycocoevalcap/bleu/bleu.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # File Name : bleu.py 4 | # 5 | # Description : Wrapper for BLEU scorer. 6 | # 7 | # Creation Date : 06-01-2015 8 | # Last Modified : Thu 19 Mar 2015 09:13:28 PM PDT 9 | # Authors : Hao Fang and Tsung-Yi Lin 10 | 11 | from bleu_scorer import BleuScorer 12 | 13 | 14 | class Bleu: 15 | def __init__(self, n=4): 16 | # default compute Blue score up to 4 17 | self._n = n 18 | self._hypo_for_image = {} 19 | self.ref_for_image = {} 20 | 21 | def compute_score(self, gts, res): 22 | 23 | assert(gts.keys() == res.keys()) 24 | imgIds = gts.keys() 25 | 26 | bleu_scorer = BleuScorer(n=self._n) 27 | for id in imgIds: 28 | hypo = res[id] 29 | ref = gts[id] 30 | 31 | # Sanity check. 32 | assert(type(hypo) is list) 33 | assert(len(hypo) == 1) 34 | assert(type(ref) is list) 35 | assert(len(ref) > 1) 36 | 37 | bleu_scorer += (hypo[0], ref) 38 | 39 | #score, scores = bleu_scorer.compute_score(option='shortest') 40 | score, scores = bleu_scorer.compute_score(option='closest', verbose=1) 41 | #score, scores = bleu_scorer.compute_score(option='average', verbose=1) 42 | 43 | # return (bleu, bleu_info) 44 | return score, scores 45 | 46 | def method(self): 47 | return "Bleu" 48 | -------------------------------------------------------------------------------- /lib/pycocoevalcap/cider/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /lib/pycocoevalcap/cider/cider.py: -------------------------------------------------------------------------------- 1 | # Filename: cider.py 2 | # 3 | # Description: Describes the class to compute the CIDEr (Consensus-Based Image Description Evaluation) Metric 4 | # by Vedantam, Zitnick, and Parikh (http://arxiv.org/abs/1411.5726) 5 | # 6 | # Creation Date: Sun Feb 8 14:16:54 2015 7 | # 8 | # Authors: Ramakrishna Vedantam and Tsung-Yi Lin 9 | 10 | from cider_scorer import CiderScorer 11 | import pdb 12 | 13 | class Cider: 14 | """ 15 | Main Class to compute the CIDEr metric 16 | 17 | """ 18 | def __init__(self, test=None, refs=None, n=4, sigma=6.0): 19 | # set cider to sum over 1 to 4-grams 20 | self._n = n 21 | # set the standard deviation parameter for gaussian penalty 22 | self._sigma = sigma 23 | 24 | def compute_score(self, gts, res): 25 | """ 26 | Main function to compute CIDEr score 27 | :param hypo_for_image (dict) : dictionary with key and value 28 | ref_for_image (dict) : dictionary with key and value 29 | :return: cider (float) : computed CIDEr score for the corpus 30 | """ 31 | 32 | assert(gts.keys() == res.keys()) 33 | imgIds = gts.keys() 34 | 35 | cider_scorer = CiderScorer(n=self._n, sigma=self._sigma) 36 | 37 | for id in imgIds: 38 | hypo = res[id] 39 | ref = gts[id] 40 | 41 | # Sanity 
check. 42 | assert(type(hypo) is list) 43 | assert(len(hypo) == 1) 44 | assert(type(ref) is list) 45 | assert(len(ref) > 0) 46 | 47 | cider_scorer += (hypo[0], ref) 48 | 49 | (score, scores) = cider_scorer.compute_score() 50 | 51 | return score, scores 52 | 53 | def method(self): 54 | return "CIDEr" -------------------------------------------------------------------------------- /lib/pycocoevalcap/eval.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | from tokenizer.ptbtokenizer import PTBTokenizer 3 | from bleu.bleu import Bleu 4 | from meteor.meteor import Meteor 5 | from rouge.rouge import Rouge 6 | from cider.cider import Cider 7 | 8 | class COCOEvalCap: 9 | def __init__(self, coco, cocoRes): 10 | self.evalImgs = [] 11 | self.eval = {} 12 | self.imgToEval = {} 13 | self.coco = coco 14 | self.cocoRes = cocoRes 15 | self.params = {'image_id': coco.getImgIds()} 16 | 17 | def evaluate(self): 18 | imgIds = self.params['image_id'] 19 | # imgIds = self.coco.getImgIds() 20 | gts = {} 21 | res = {} 22 | for imgId in imgIds: 23 | gts[imgId] = self.coco.imgToAnns[imgId] 24 | res[imgId] = self.cocoRes.imgToAnns[imgId] 25 | 26 | # ================================================= 27 | # Set up scorers 28 | # ================================================= 29 | print 'tokenization...' 30 | tokenizer = PTBTokenizer() 31 | gts = tokenizer.tokenize(gts) 32 | res = tokenizer.tokenize(res) 33 | 34 | # ================================================= 35 | # Set up scorers 36 | # ================================================= 37 | print 'setting up scorers...' 38 | scorers = [ 39 | (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), 40 | (Meteor(),"METEOR"), 41 | (Rouge(), "ROUGE_L"), 42 | (Cider(), "CIDEr") 43 | ] 44 | 45 | # ================================================= 46 | # Compute scores 47 | # ================================================= 48 | eval = {} 49 | for scorer, method in scorers: 50 | print 'computing %s score...'%(scorer.method()) 51 | score, scores = scorer.compute_score(gts, res) 52 | if type(method) == list: 53 | for sc, scs, m in zip(score, scores, method): 54 | self.setEval(sc, m) 55 | self.setImgToEvalImgs(scs, imgIds, m) 56 | print "%s: %0.3f"%(m, sc) 57 | else: 58 | self.setEval(score, method) 59 | self.setImgToEvalImgs(scores, imgIds, method) 60 | print "%s: %0.3f"%(method, score) 61 | self.setEvalImgs() 62 | 63 | def setEval(self, score, method): 64 | self.eval[method] = score 65 | 66 | def setImgToEvalImgs(self, scores, imgIds, method): 67 | for imgId, score in zip(imgIds, scores): 68 | if not imgId in self.imgToEval: 69 | self.imgToEval[imgId] = {} 70 | self.imgToEval[imgId]["image_id"] = imgId 71 | self.imgToEval[imgId][method] = score 72 | 73 | def setEvalImgs(self): 74 | self.evalImgs = [eval for imgId, eval in self.imgToEval.items()] -------------------------------------------------------------------------------- /lib/pycocoevalcap/meteor/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /lib/pycocoevalcap/meteor/meteor-1.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/InnerPeace-Wu/densecap-tensorflow/2c77998f73832d5bb7324e97e8a7419cbce8398c/lib/pycocoevalcap/meteor/meteor-1.5.jar -------------------------------------------------------------------------------- 
/lib/pycocoevalcap/meteor/meteor.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Python wrapper for METEOR implementation, by Xinlei Chen 4 | # Modified by Linjie Yang for evaluating dense captioning 5 | # Acknowledge Michael Denkowski for the generous discussion and help 6 | 7 | import os 8 | import sys 9 | import subprocess 10 | import threading 11 | 12 | # Assumes meteor-1.5.jar is in the same directory as meteor.py. Change as needed. 13 | METEOR_JAR = 'meteor-1.5.jar' 14 | # print METEOR_JAR 15 | 16 | class Meteor: 17 | 18 | def __init__(self): 19 | self.meteor_cmd = ['java', '-jar', '-Xmx2G', METEOR_JAR, \ 20 | '-', '-', '-stdio', '-l', 'en', '-norm'] 21 | self.meteor_p = subprocess.Popen(self.meteor_cmd, \ 22 | cwd=os.path.dirname(os.path.abspath(__file__)), \ 23 | stdin=subprocess.PIPE, \ 24 | stdout=subprocess.PIPE, \ 25 | stderr=subprocess.PIPE) 26 | # Used to guarantee thread safety 27 | self.lock = threading.Lock() 28 | 29 | def compute_score(self, gts, res, imgIds=None): 30 | assert(gts.keys() == res.keys()) 31 | if imgIds is None: 32 | imgIds = gts.keys() 33 | scores = [] 34 | 35 | eval_line = 'EVAL' 36 | self.lock.acquire() 37 | for i in imgIds: 38 | assert(len(res[i]) == 1) 39 | 40 | stat = self._stat(res[i][0], gts[i]) 41 | eval_line += ' ||| {}'.format(stat) 42 | 43 | self.meteor_p.stdin.write('{}\n'.format(eval_line)) 44 | for i in range(0,len(imgIds)): 45 | scores.append(float(self.meteor_p.stdout.readline().strip())) 46 | final_score = self.meteor_p.stdout.readline().strip() 47 | #print final_score 48 | score = float(final_score) 49 | self.lock.release() 50 | 51 | return score, scores 52 | 53 | 54 | def compute_score_m2m(self, gts, res, imgIds=None): 55 | assert(gts.keys() == res.keys()) 56 | if imgIds is None: 57 | imgIds = gts.keys() 58 | scores = [] 59 | 60 | eval_line = 'EVAL' 61 | self.lock.acquire() 62 | tot_line = 0 63 | for i in imgIds: 64 | #assert(len(res[i]) == 1) 65 | for res_sent in res[i]: 66 | stat = self._stat(res_sent, gts[i]) 67 | eval_line += ' ||| {}'.format(stat) 68 | tot_line += 1 69 | self.meteor_p.stdin.write('{}\n'.format(eval_line)) 70 | for i in range(0,len(imgIds)): 71 | scores_im = [] 72 | for j in xrange(len(res[i])): 73 | scores_im.append(float(self.meteor_p.stdout.readline().strip())) 74 | scores.append(scores_im) 75 | score = float(self.meteor_p.stdout.readline().strip()) 76 | self.lock.release() 77 | 78 | return score, scores 79 | def method(self): 80 | return "METEOR" 81 | 82 | def _stat(self, hypothesis_str, reference_list): 83 | # SCORE ||| reference 1 words ||| reference n words ||| hypothesis words 84 | hypothesis_str = hypothesis_str.replace('|||','').replace(' ',' ') 85 | score_line = ' ||| '.join(('SCORE', ' ||| '.join(reference_list), hypothesis_str)) 86 | self.meteor_p.stdin.write('{}\n'.format(score_line)) 87 | return self.meteor_p.stdout.readline().strip() 88 | 89 | def score(self, hypothesis_str, reference_list): 90 | self.lock.acquire() 91 | # SCORE ||| reference 1 words ||| reference n words ||| hypothesis words 92 | hypothesis_str = hypothesis_str.replace('|||','').replace(' ',' ') 93 | score_line = ' ||| '.join(('SCORE', ' ||| '.join(reference_list), hypothesis_str)) 94 | self.meteor_p.stdin.write('{}\n'.format(score_line)) 95 | stats = self.meteor_p.stdout.readline().strip() 96 | eval_line = 'EVAL ||| {}'.format(stats) 97 | # EVAL ||| stats 98 | self.meteor_p.stdin.write('{}\n'.format(eval_line)) 99 | score = 
float(self.meteor_p.stdout.readline().strip()) 100 | self.lock.release() 101 | return score 102 | 103 | def __exit__(self): 104 | self.lock.acquire() 105 | self.meteor_p.stdin.close() 106 | self.meteor_p.wait() 107 | self.lock.release() 108 | -------------------------------------------------------------------------------- /lib/pycocoevalcap/rouge/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'vrama91' 2 | -------------------------------------------------------------------------------- /lib/pycocoevalcap/rouge/rouge.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # File Name : rouge.py 4 | # 5 | # Description : Computes ROUGE-L metric as described by Lin and Hovey (2004) 6 | # 7 | # Creation Date : 2015-01-07 06:03 8 | # Author : Ramakrishna Vedantam 9 | 10 | import numpy as np 11 | import pdb 12 | 13 | def my_lcs(string, sub): 14 | """ 15 | Calculates longest common subsequence for a pair of tokenized strings 16 | :param string : list of str : tokens from a string split using whitespace 17 | :param sub : list of str : shorter string, also split using whitespace 18 | :returns: length (list of int): length of the longest common subsequence between the two strings 19 | 20 | Note: my_lcs only gives length of the longest common subsequence, not the actual LCS 21 | """ 22 | if(len(string)< len(sub)): 23 | sub, string = string, sub 24 | 25 | lengths = [[0 for i in range(0,len(sub)+1)] for j in range(0,len(string)+1)] 26 | 27 | for j in range(1,len(sub)+1): 28 | for i in range(1,len(string)+1): 29 | if(string[i-1] == sub[j-1]): 30 | lengths[i][j] = lengths[i-1][j-1] + 1 31 | else: 32 | lengths[i][j] = max(lengths[i-1][j] , lengths[i][j-1]) 33 | 34 | return lengths[len(string)][len(sub)] 35 | 36 | class Rouge(): 37 | ''' 38 | Class for computing ROUGE-L score for a set of candidate sentences for the MS COCO test set 39 | 40 | ''' 41 | def __init__(self): 42 | # vrama91: updated the value below based on discussion with Hovey 43 | self.beta = 1.2 44 | 45 | def calc_score(self, candidate, refs): 46 | """ 47 | Compute ROUGE-L score given one candidate and references for an image 48 | :param candidate: str : candidate sentence to be evaluated 49 | :param refs: list of str : COCO reference sentences for the particular image to be evaluated 50 | :returns score: int (ROUGE-L score for the candidate evaluated against references) 51 | """ 52 | assert(len(candidate)==1) 53 | assert(len(refs)>0) 54 | prec = [] 55 | rec = [] 56 | 57 | # split into tokens 58 | token_c = candidate[0].split(" ") 59 | 60 | for reference in refs: 61 | # split into tokens 62 | token_r = reference.split(" ") 63 | # compute the longest common subsequence 64 | lcs = my_lcs(token_r, token_c) 65 | prec.append(lcs/float(len(token_c))) 66 | rec.append(lcs/float(len(token_r))) 67 | 68 | prec_max = max(prec) 69 | rec_max = max(rec) 70 | 71 | if(prec_max!=0 and rec_max !=0): 72 | score = ((1 + self.beta**2)*prec_max*rec_max)/float(rec_max + self.beta**2*prec_max) 73 | else: 74 | score = 0.0 75 | return score 76 | 77 | def compute_score(self, gts, res): 78 | """ 79 | Computes Rouge-L score given a set of reference and candidate sentences for the dataset 80 | Invoked by evaluate_captions.py 81 | :param hypo_for_image: dict : candidate / test sentences with "image name" key and "tokenized sentences" as values 82 | :param ref_for_image: dict : reference MS-COCO sentences with "image name" key and "tokenized 
sentences" as values 83 | :returns: average_score: float (mean ROUGE-L score computed by averaging scores for all the images) 84 | """ 85 | assert(gts.keys() == res.keys()) 86 | imgIds = gts.keys() 87 | 88 | score = [] 89 | for id in imgIds: 90 | hypo = res[id] 91 | ref = gts[id] 92 | 93 | score.append(self.calc_score(hypo, ref)) 94 | 95 | # Sanity check. 96 | assert(type(hypo) is list) 97 | assert(len(hypo) == 1) 98 | assert(type(ref) is list) 99 | assert(len(ref) > 0) 100 | 101 | average_score = np.mean(np.array(score)) 102 | return average_score, np.array(score) 103 | 104 | def method(self): 105 | return "Rouge" 106 | -------------------------------------------------------------------------------- /lib/pycocoevalcap/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'hfang' 2 | -------------------------------------------------------------------------------- /lib/pycocoevalcap/tokenizer/ptbtokenizer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # File Name : ptbtokenizer.py 4 | # 5 | # Description : Do the PTB Tokenization and remove punctuations. 6 | # 7 | # Creation Date : 29-12-2014 8 | # Last Modified : Thu Mar 19 09:53:35 2015 9 | # Authors : Hao Fang and Tsung-Yi Lin 10 | 11 | import os 12 | import sys 13 | import subprocess 14 | import tempfile 15 | import itertools 16 | 17 | # path to the stanford corenlp jar 18 | STANFORD_CORENLP_3_4_1_JAR = 'stanford-corenlp-3.4.1.jar' 19 | 20 | # punctuations to be removed from the sentences 21 | PUNCTUATIONS = ["''", "'", "``", "`", "-LRB-", "-RRB-", "-LCB-", "-RCB-", 22 | ".", "?", "!", ",", ":", "-", "--", "...", ";"] 23 | 24 | 25 | class PTBTokenizer: 26 | """Python wrapper of Stanford PTBTokenizer""" 27 | 28 | def tokenize(self, captions_for_image): 29 | cmd = ['java', '-cp', STANFORD_CORENLP_3_4_1_JAR, 30 | 'edu.stanford.nlp.process.PTBTokenizer', 31 | '-preserveLines', '-lowerCase'] 32 | 33 | # ====================================================== 34 | # prepare data for PTB Tokenizer 35 | # ====================================================== 36 | final_tokenized_captions_for_image = {} 37 | image_id = [k for k, v in captions_for_image.items() for _ in range(len(v))] 38 | sentences = '\n'.join([c['caption'].replace('\n', ' ') for k, v in captions_for_image.items() for c in v]) 39 | 40 | # ====================================================== 41 | # save sentences to temporary file 42 | # ====================================================== 43 | path_to_jar_dirname = os.path.dirname(os.path.abspath(__file__)) 44 | tmp_file = tempfile.NamedTemporaryFile(delete=False, dir=path_to_jar_dirname) 45 | tmp_file.write(sentences) 46 | tmp_file.close() 47 | 48 | # ====================================================== 49 | # tokenize sentence 50 | # ====================================================== 51 | cmd.append(os.path.basename(tmp_file.name)) 52 | p_tokenizer = subprocess.Popen(cmd, cwd=path_to_jar_dirname, 53 | stdout=subprocess.PIPE) # shell=True 54 | token_lines = p_tokenizer.communicate(input=sentences.rstrip())[0] 55 | lines = token_lines.split('\n') 56 | # remove temp file 57 | os.remove(tmp_file.name) 58 | 59 | # ====================================================== 60 | # create dictionary for tokenized captions 61 | # ====================================================== 62 | for k, line in zip(image_id, lines): 63 | if not k in final_tokenized_captions_for_image: 64 | 
final_tokenized_captions_for_image[k] = [] 65 | tokenized_caption = ' '.join([w for w in line.rstrip().split(' ') 66 | if w not in PUNCTUATIONS]) 67 | final_tokenized_captions_for_image[k].append(tokenized_caption) 68 | 69 | return final_tokenized_captions_for_image 70 | -------------------------------------------------------------------------------- /lib/pycocoevalcap/tokenizer/stanford-corenlp-3.4.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/InnerPeace-Wu/densecap-tensorflow/2c77998f73832d5bb7324e97e8a7419cbce8398c/lib/pycocoevalcap/tokenizer/stanford-corenlp-3.4.1.jar -------------------------------------------------------------------------------- /lib/setup.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DenseCap-Tensorflow 3 | # Written by InnerPeace 4 | # This file is adapted from Ross Girshick's work 5 | # -------------------------------------------------------- 6 | # Fast R-CNN 7 | # Copyright (c) 2015 Microsoft 8 | # Licensed under The MIT License [see LICENSE for details] 9 | # Written by Ross Girshick 10 | # -------------------------------------------------------- 11 | 12 | 13 | import os 14 | from os.path import join as pjoin 15 | from setuptools import setup 16 | from distutils.extension import Extension 17 | from Cython.Distutils import build_ext 18 | import subprocess 19 | import numpy as np 20 | 21 | def find_in_path(name, path): 22 | "Find a file in a search path" 23 | # Adapted fom 24 | # http://code.activestate.com/recipes/52224-find-a-file-given-a-search-path/ 25 | for dir in path.split(os.pathsep): 26 | binpath = pjoin(dir, name) 27 | if os.path.exists(binpath): 28 | return os.path.abspath(binpath) 29 | return None 30 | 31 | 32 | def locate_cuda(): 33 | """Locate the CUDA environment on the system 34 | 35 | Returns a dict with keys 'home', 'nvcc', 'include', and 'lib64' 36 | and values giving the absolute path to each directory. 37 | 38 | Starts by looking for the CUDAHOME env variable. If not found, everything 39 | is based on finding 'nvcc' in the PATH. 40 | """ 41 | 42 | # first check if the CUDAHOME env variable is in use 43 | if 'CUDAHOME' in os.environ: 44 | home = os.environ['CUDAHOME'] 45 | nvcc = pjoin(home, 'bin', 'nvcc') 46 | else: 47 | # otherwise, search the PATH for NVCC 48 | default_path = pjoin(os.sep, 'usr', 'local', 'cuda', 'bin') 49 | nvcc = find_in_path('nvcc', os.environ['PATH'] + os.pathsep + default_path) 50 | if nvcc is None: 51 | raise EnvironmentError('The nvcc binary could not be ' 52 | 'located in your $PATH. Either add it to your path, or set $CUDAHOME') 53 | home = os.path.dirname(os.path.dirname(nvcc)) 54 | 55 | cudaconfig = {'home':home, 'nvcc':nvcc, 56 | 'include': pjoin(home, 'include'), 57 | 'lib64': pjoin(home, 'lib64')} 58 | for k, v in cudaconfig.iteritems(): 59 | if not os.path.exists(v): 60 | raise EnvironmentError('The CUDA %s path could not be located in %s' % (k, v)) 61 | 62 | return cudaconfig 63 | CUDA = locate_cuda() 64 | 65 | 66 | # Obtain the numpy include directory. This logic works across numpy versions. 67 | try: 68 | numpy_include = np.get_include() 69 | except AttributeError: 70 | numpy_include = np.get_numpy_include() 71 | 72 | def customize_compiler_for_nvcc(self): 73 | """inject deep into distutils to customize how the dispatch 74 | to gcc/nvcc works. 
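    In practice the hook below appends '.cu' to self.src_extensions and, for
    any source file ending in .cu, swaps the compiler executable to nvcc and
    uses the 'nvcc' entry of extra_postargs instead of the 'gcc' one,
    restoring the default compiler_so after each file.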
75 | 76 | If you subclass UnixCCompiler, it's not trivial to get your subclass 77 | injected in, and still have the right customizations (i.e. 78 | distutils.sysconfig.customize_compiler) run on it. So instead of going 79 | the OO route, I have this. Note, it's kindof like a wierd functional 80 | subclassing going on.""" 81 | 82 | # tell the compiler it can processes .cu 83 | self.src_extensions.append('.cu') 84 | 85 | # save references to the default compiler_so and _comple methods 86 | default_compiler_so = self.compiler_so 87 | super = self._compile 88 | 89 | # now redefine the _compile method. This gets executed for each 90 | # object but distutils doesn't have the ability to change compilers 91 | # based on source extension: we add it. 92 | def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts): 93 | if os.path.splitext(src)[1] == '.cu': 94 | # use the cuda for .cu files 95 | self.set_executable('compiler_so', CUDA['nvcc']) 96 | # use only a subset of the extra_postargs, which are 1-1 translated 97 | # from the extra_compile_args in the Extension class 98 | postargs = extra_postargs['nvcc'] 99 | else: 100 | postargs = extra_postargs['gcc'] 101 | 102 | super(obj, src, ext, cc_args, postargs, pp_opts) 103 | # reset the default compiler_so, which we might have changed for cuda 104 | self.compiler_so = default_compiler_so 105 | 106 | # inject our redefined _compile method into the class 107 | self._compile = _compile 108 | 109 | 110 | # run the customize_compiler 111 | class custom_build_ext(build_ext): 112 | def build_extensions(self): 113 | customize_compiler_for_nvcc(self.compiler) 114 | build_ext.build_extensions(self) 115 | 116 | 117 | ext_modules = [ 118 | Extension( 119 | "utils.cython_bbox", 120 | ["utils/bbox.pyx"], 121 | extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]}, 122 | include_dirs = [numpy_include] 123 | ), 124 | Extension( 125 | "nms.cpu_nms", 126 | ["nms/cpu_nms.pyx"], 127 | extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]}, 128 | include_dirs = [numpy_include] 129 | ), 130 | Extension('nms.gpu_nms', 131 | ['nms/nms_kernel.cu', 'nms/gpu_nms.pyx'], 132 | library_dirs=[CUDA['lib64']], 133 | libraries=['cudart'], 134 | language='c++', 135 | runtime_library_dirs=[CUDA['lib64']], 136 | # this syntax is specific to this build system 137 | # we're only going to use certain compiler args with nvcc and not with 138 | # gcc the implementation of this trick is in customize_compiler() below 139 | extra_compile_args={'gcc': ["-Wno-unused-function"], 140 | 'nvcc': ['-arch=sm_35', 141 | '--ptxas-options=-v', 142 | '-c', 143 | '--compiler-options', 144 | "'-fPIC'"]}, 145 | include_dirs = [numpy_include, CUDA['include']] 146 | ), 147 | ] 148 | 149 | setup( 150 | name='fast_rcnn', 151 | ext_modules=ext_modules, 152 | # inject our custom trigger 153 | cmdclass={'build_ext': custom_build_ext}, 154 | ) 155 | 156 | -------------------------------------------------------------------------------- /lib/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | -------------------------------------------------------------------------------- /lib/utils/bbox.pyx: -------------------------------------------------------------------------------- 1 | # 
-------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Sergey Karayev 6 | # -------------------------------------------------------- 7 | 8 | cimport cython 9 | import numpy as np 10 | cimport numpy as np 11 | 12 | DTYPE = np.float 13 | ctypedef np.float_t DTYPE_t 14 | 15 | def bbox_overlaps( 16 | np.ndarray[DTYPE_t, ndim=2] boxes, 17 | np.ndarray[DTYPE_t, ndim=2] query_boxes): 18 | """ 19 | Parameters 20 | ---------- 21 | boxes: (N, 4) ndarray of float 22 | query_boxes: (K, 4) ndarray of float 23 | Returns 24 | ------- 25 | overlaps: (N, K) ndarray of overlap between boxes and query_boxes 26 | """ 27 | cdef unsigned int N = boxes.shape[0] 28 | cdef unsigned int K = query_boxes.shape[0] 29 | cdef np.ndarray[DTYPE_t, ndim=2] overlaps = np.zeros((N, K), dtype=DTYPE) 30 | cdef DTYPE_t iw, ih, box_area 31 | cdef DTYPE_t ua 32 | cdef unsigned int k, n 33 | for k in range(K): 34 | box_area = ( 35 | (query_boxes[k, 2] - query_boxes[k, 0] + 1) * 36 | (query_boxes[k, 3] - query_boxes[k, 1] + 1) 37 | ) 38 | for n in range(N): 39 | iw = ( 40 | min(boxes[n, 2], query_boxes[k, 2]) - 41 | max(boxes[n, 0], query_boxes[k, 0]) + 1 42 | ) 43 | if iw > 0: 44 | ih = ( 45 | min(boxes[n, 3], query_boxes[k, 3]) - 46 | max(boxes[n, 1], query_boxes[k, 1]) + 1 47 | ) 48 | if ih > 0: 49 | ua = float( 50 | (boxes[n, 2] - boxes[n, 0] + 1) * 51 | (boxes[n, 3] - boxes[n, 1] + 1) + 52 | box_area - iw * ih 53 | ) 54 | overlaps[n, k] = iw * ih / ua 55 | return overlaps 56 | -------------------------------------------------------------------------------- /lib/utils/bbox_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from collections import OrderedDict 4 | import json 5 | import numpy as np 6 | import pprint 7 | import cPickle as pickle 8 | import string 9 | 10 | def get_bbox_coord(norm_coord, do_clip=True): 11 | #input is a nx4 numpy array in normalized bbox coordinates 12 | #print norm_coord.shape 13 | #print norm_coord 14 | bboxes_coord = np.zeros(norm_coord.shape) 15 | #x,y,w,h 16 | bboxes_coord[:, :2] = norm_coord[:, :2]+0.5 17 | bboxes_coord[:, 2:] = np.exp(norm_coord[:, 2:]) 18 | 19 | #x1,y1,x2,y2 20 | bboxes_coord2 = np.zeros(norm_coord.shape) 21 | bboxes_coord2[:, :2] = bboxes_coord[:, :2] - bboxes_coord[:, 2:] * 0.5 22 | bboxes_coord2[:, 2:] = bboxes_coord[:, :2] + bboxes_coord[:, 2:] * 0.5 23 | #clipping all coordinates to [0,1] 24 | if do_clip: 25 | bboxes_coord2 = np.minimum(np.maximum(bboxes_coord2, 0), 1) 26 | return bboxes_coord2 27 | 28 | 29 | def get_bbox_iou_matrix(bboxes): 30 | region_n = bboxes.shape[0] 31 | #area, intersection area, union area 32 | bbox_areas = (bboxes[:,2] - bboxes[:,0]) * \ 33 | (bboxes[:, 3] - bboxes[:, 1]) 34 | 35 | x_a1 = bboxes[:,0].reshape(region_n,1) 36 | x_a2 = bboxes[:,2].reshape(region_n,1) 37 | x_b1 = bboxes[:,0].reshape(1,region_n) 38 | x_b2 = bboxes[:,2].reshape(1,region_n) 39 | y_a1 = bboxes[:,1].reshape(region_n,1) 40 | y_a2 = bboxes[:,3].reshape(region_n,1) 41 | y_b1 = bboxes[:,1].reshape(1,region_n) 42 | y_b2 = bboxes[:,3].reshape(1,region_n) 43 | bbox_pair_x_diff = np.maximum(0, np.minimum(x_a2, x_b2) - np.maximum(x_a1, x_b1)) 44 | bbox_pair_y_diff = np.maximum(0, np.minimum(y_a2, y_b2) - np.maximum(y_a1, y_b1)) 45 | inter_areas = bbox_pair_x_diff * bbox_pair_y_diff 46 | 47 | #IoU 48 | union_areas = bbox_areas.reshape(region_n,1) + 
bbox_areas.reshape(1,region_n) 49 | 50 | bbox_iou = inter_areas / (union_areas - inter_areas) 51 | return bbox_iou 52 | 53 | def nms(region_info, bbox_th=0.3): 54 | #non-maximum surpression 55 | region_info.sort(key = lambda x: -x['log_prob']) 56 | #keep_index = [] 57 | region_n = len(region_info) 58 | #fast computation of pairwise IoU 59 | #pick the bbox of last timestep of each sample 60 | #print 'region_info length %d' % len(region_info) 61 | all_bboxes = np.array([x['location'][-1,:] for x in region_info])# nx4 matrix 62 | bbox_iou = get_bbox_iou_matrix(all_bboxes) 63 | bbox_iou_th = bbox_iou < bbox_th 64 | keep_flag = np.ones((region_n),dtype=np.uint8) 65 | 66 | for i in xrange(region_n-1): 67 | if keep_flag[i]: 68 | keep_flag[i+1:] = np.logical_and(keep_flag[i+1:], bbox_iou_th[i,i+1:]) 69 | print 'sum of keep flag' 70 | print keep_flag.sum() 71 | return [region_info[i] for i in xrange(region_n) if keep_flag[i]] 72 | 73 | def region_merge(region_info, bbox_th=0.7): 74 | #merging ground truth bboxes 75 | 76 | #keep_index = [] 77 | region_n = len(region_info) 78 | region_merged = [] 79 | #fast computation of pairwise IoU 80 | #pick the bbox of last timestep of each sample 81 | all_bboxes = np.array([x['location'] for x in region_info], dtype = np.float32)# nx4 matrix 82 | bbox_iou = get_bbox_iou_matrix(all_bboxes) 83 | bbox_iou_th = bbox_iou > bbox_th 84 | bbox_iou_overlap_n = bbox_iou_th.sum(axis = 0) 85 | 86 | merge_flag = np.ones((region_n),dtype=np.uint8) 87 | unmerged_region = region_n 88 | while unmerged_region > 0: 89 | max_overlap_id = np.argmax(bbox_iou_overlap_n) 90 | assert bbox_iou_overlap_n[max_overlap_id] > 0 91 | merge_group = np.nonzero(bbox_iou_th[max_overlap_id,:] & merge_flag)[0] 92 | unmerged_region -= len(merge_group) 93 | merge_flag[merge_group] = 0 94 | bbox_iou_overlap_n[merge_group] = 0 95 | bbox_group = all_bboxes[merge_group,:].reshape(len(merge_group),4) 96 | caption_group = [region_info[i]['caption'] for i in merge_group] 97 | bbox_mean = np.mean(bbox_group, axis = 0).tolist() 98 | region_merged.append({'image_id':region_info[max_overlap_id]['image_id'], \ 99 | 'captions': caption_group, 'location': bbox_mean}) 100 | return region_merged 101 | 102 | -------------------------------------------------------------------------------- /lib/utils/blob.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | """Blob helper functions.""" 9 | 10 | import numpy as np 11 | import cv2 12 | 13 | 14 | def im_list_to_blob(ims): 15 | """Convert a list of images into a network input. 16 | 17 | Assumes images are already prepared (means subtracted, BGR order, ...). 18 | """ 19 | max_shape = np.array([im.shape for im in ims]).max(axis=0) 20 | num_images = len(ims) 21 | blob = np.zeros((num_images, max_shape[0], max_shape[1], 3), 22 | dtype=np.float32) 23 | for i in xrange(num_images): 24 | im = ims[i] 25 | blob[i, 0:im.shape[0], 0:im.shape[1], :] = im 26 | # Move channels (axis 3) to axis 1 27 | # Axis order will become: (batch elem, channel, height, width) 28 | # TODO: check out if we need transpose here. 
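    # Note: TensorFlow's conv layers expect NHWC by default, which is exactly
    # the (batch, height, width, channel) layout built above, so no transpose
    # is actually required here.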
29 | # For now, we stick to the tf_faster_rcnn version 30 | # channel_swap = (0, 3, 1, 2) 31 | # blob = blob.transpose(channel_swap) 32 | return blob 33 | 34 | 35 | def prep_im_for_blob(im, pixel_means, target_size, max_size): 36 | """Mean subtract and scale an image for use in a blob.""" 37 | im = im.astype(np.float32, copy=False) 38 | im -= pixel_means 39 | im_shape = im.shape 40 | im_size_min = np.min(im_shape[0:2]) 41 | im_size_max = np.max(im_shape[0:2]) 42 | im_scale = float(target_size) / float(im_size_min) 43 | # Prevent the biggest axis from being more than MAX_SIZE 44 | if np.round(im_scale * im_size_max) > max_size: 45 | im_scale = float(max_size) / float(im_size_max) 46 | im = cv2.resize(im, None, None, fx=im_scale, fy=im_scale, 47 | interpolation=cv2.INTER_LINEAR) 48 | 49 | return im, im_scale 50 | -------------------------------------------------------------------------------- /lib/utils/debug.py: -------------------------------------------------------------------------------- 1 | ### Functions in this file are for debugging purpose 2 | ### Linjie Yang 3 | 4 | import numpy as np 5 | 6 | def softmax(x): 7 | """Compute softmax values for each sets of scores in x.""" 8 | # defalut: last dimension of x is the score dimension 9 | axis = len(x.shape) - 1 10 | x = x - x.max(axis = axis, keepdims=True) 11 | sf = np.exp(x) 12 | sf = sf / np.sum(sf, axis=axis, keepdims=True) 13 | return sf -------------------------------------------------------------------------------- /lib/utils/timer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import time 9 | 10 | class Timer(object): 11 | """A simple timer.""" 12 | def __init__(self): 13 | self.total_time = 0. 14 | self.calls = 0 15 | self.start_time = 0. 16 | self.diff = 0. 17 | self.average_time = 0. 
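        # total_time and calls feed average_time, the running mean that
        # toc(average=True) reports; toc(average=False) returns only the
        # last measured interval.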
18 | 19 | def tic(self): 20 | # using time.time instead of time.clock because time time.clock 21 | # does not normalize for multithreading 22 | self.start_time = time.time() 23 | 24 | def toc(self, average=True): 25 | self.diff = time.time() - self.start_time 26 | self.total_time += self.diff 27 | self.calls += 1 28 | self.average_time = self.total_time / self.calls 29 | if average: 30 | return self.average_time 31 | else: 32 | return self.diff 33 | -------------------------------------------------------------------------------- /lib/utils/visualization.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Tensorflow Faster R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Xinlei Chen 5 | # -------------------------------------------------------- 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import pdb 11 | import numpy as np 12 | import numpy.random as npr 13 | from six.moves import range 14 | from lib.config import cfg 15 | import PIL.Image as Image 16 | import PIL.ImageColor as ImageColor 17 | import PIL.ImageDraw as ImageDraw 18 | import PIL.ImageFont as ImageFont 19 | from lib.fast_rcnn.nms_wrapper import nms 20 | from lib.fast_rcnn.bbox_transform import clip_boxes, bbox_transform_inv 21 | 22 | STANDARD_COLORS = [ 23 | 'AliceBlue', 'Chartreuse', 'Aqua', 'Aquamarine', 'Azure', 'Beige', 'Bisque', 24 | 'BlanchedAlmond', 'BlueViolet', 'BurlyWood', 'CadetBlue', 'AntiqueWhite', 25 | 'Chocolate', 'Coral', 'CornflowerBlue', 'Cornsilk', 'Crimson', 'Cyan', 26 | 'DarkCyan', 'DarkGoldenRod', 'DarkGrey', 'DarkKhaki', 'DarkOrange', 27 | 'DarkOrchid', 'DarkSalmon', 'DarkSeaGreen', 'DarkTurquoise', 'DarkViolet', 28 | 'DeepPink', 'DeepSkyBlue', 'DodgerBlue', 'FireBrick', 'FloralWhite', 29 | 'ForestGreen', 'Fuchsia', 'Gainsboro', 'GhostWhite', 'Gold', 'GoldenRod', 30 | 'Salmon', 'Tan', 'HoneyDew', 'HotPink', 'IndianRed', 'Ivory', 'Khaki', 31 | 'Lavender', 'LavenderBlush', 'LawnGreen', 'LemonChiffon', 'LightBlue', 32 | 'LightCoral', 'LightCyan', 'LightGoldenRodYellow', 'LightGray', 'LightGrey', 33 | 'LightGreen', 'LightPink', 'LightSalmon', 'LightSeaGreen', 'LightSkyBlue', 34 | 'LightSlateGray', 'LightSlateGrey', 'LightSteelBlue', 'LightYellow', 'Lime', 35 | 'LimeGreen', 'Linen', 'Magenta', 'MediumAquaMarine', 'MediumOrchid', 36 | 'MediumPurple', 'MediumSeaGreen', 'MediumSlateBlue', 'MediumSpringGreen', 37 | 'MediumTurquoise', 'MediumVioletRed', 'MintCream', 'MistyRose', 'Moccasin', 38 | 'NavajoWhite', 'OldLace', 'Olive', 'OliveDrab', 'Orange', 'OrangeRed', 39 | 'Orchid', 'PaleGoldenRod', 'PaleGreen', 'PaleTurquoise', 'PaleVioletRed', 40 | 'PapayaWhip', 'PeachPuff', 'Peru', 'Pink', 'Plum', 'PowderBlue', 'Purple', 41 | 'Red', 'RosyBrown', 'RoyalBlue', 'SaddleBrown', 'Green', 'SandyBrown', 42 | 'SeaGreen', 'SeaShell', 'Sienna', 'Silver', 'SkyBlue', 'SlateBlue', 43 | 'SlateGray', 'SlateGrey', 'Snow', 'SpringGreen', 'SteelBlue', 'GreenYellow', 44 | 'Teal', 'Thistle', 'Tomato', 'Turquoise', 'Violet', 'Wheat', 'White', 45 | 'WhiteSmoke', 'Yellow', 'YellowGreen' 46 | ] 47 | 48 | NUM_COLORS = len(STANDARD_COLORS) 49 | 50 | try: 51 | FONT = ImageFont.truetype('arial.ttf', 24) 52 | except IOError: 53 | FONT = ImageFont.load_default() 54 | 55 | 56 | def _draw_single_box(image, xmin, ymin, xmax, ymax, display_str, font, color='black', thickness=4): 57 | draw = ImageDraw.Draw(image) 58 | (left, 
right, top, bottom) = (xmin, xmax, ymin, ymax) 59 | draw.line([(left, top), (left, bottom), (right, bottom), 60 | (right, top), (left, top)], width=thickness, fill=color) 61 | text_bottom = bottom 62 | # Reverse list and print from bottom to top. 63 | text_width, text_height = font.getsize(display_str) 64 | margin = np.ceil(0.05 * text_height) 65 | draw.rectangle( 66 | [(left, text_bottom - text_height - 2 * margin), (left + text_width, 67 | text_bottom)], 68 | fill=color) 69 | draw.text( 70 | (left + margin, text_bottom - text_height - margin), 71 | display_str, 72 | fill='black', 73 | font=font) 74 | 75 | return image 76 | 77 | 78 | def draw_bounding_boxes(image, gt_boxes, im_info, phrases): 79 | 80 | num_boxes = gt_boxes.shape[0] 81 | gt_boxes_new = gt_boxes.copy() 82 | gt_boxes_new[:, :4] = np.round(gt_boxes_new[:, :4].copy() / im_info[2]) 83 | disp_image = Image.fromarray(np.uint8(image[0])) 84 | 85 | # show several(10) boxes for debugging 86 | show_ids = npr.choice(np.arange(num_boxes), size=5, replace=False) 87 | vocab_path = '%s/vocabulary.txt' % cfg.CACHE_DIR 88 | with open(vocab_path, 'r') as f: 89 | vocab = [line.strip() for line in f] 90 | # vocab_extra = ['', '', ''] 91 | # for ex in vocab_extra: 92 | # vocab.insert(0, ex) 93 | for idx, i in enumerate(show_ids): 94 | # this_class = int(gt_boxes_new[i, 4]) 95 | # phrase = phrases[i] if len(phrases[i]) < cfg.TIME_STEPS else phrases[1:] 96 | # for adding gt bounding box 97 | if len(phrases[i]) < cfg.TIME_STEPS: 98 | phrase = phrases[i] 99 | # for adding predicted boxes 100 | else: 101 | phrase = [] 102 | # phrases[i][1:] to remove the token 103 | for p in phrases[i]: 104 | if p == cfg.END_INDEX: 105 | break 106 | phrase.append(p) 107 | 108 | caption = ' '.join([vocab[j - 3] if j - 3 >= 0 else "" for j 109 | in phrase]) 110 | # caption = " ".join([vocab[j] for j in phrase[i]) 111 | disp_image = _draw_single_box(disp_image, 112 | gt_boxes_new[i, 0], 113 | gt_boxes_new[i, 1], 114 | gt_boxes_new[i, 2], 115 | gt_boxes_new[i, 3], 116 | '%s_%s' % (i, caption), 117 | FONT, 118 | color=STANDARD_COLORS[idx % NUM_COLORS]) 119 | 120 | image[0, :] = np.array(disp_image) 121 | return image 122 | 123 | 124 | def draw_densecap(image, scores, rois, im_info, cap_probs, bbox_pred): 125 | """ 126 | bbox_pred: [None, 4] 127 | rois: [None, 5] 128 | 129 | """ 130 | # for bbox unnormalization 131 | 132 | bbox_mean = np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS).reshape((1, 4)) 133 | bbox_stds = np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS).reshape((1, 4)) 134 | 135 | boxes = rois[:, 1:5] / im_info[2] 136 | # [None, 12] 137 | cap_ids = np.argmax(cap_probs, axis=1).reshape((-1, cfg.TIME_STEPS)) 138 | 139 | # bbox target unnormalization 140 | box_deltas = bbox_pred * bbox_stds + bbox_mean 141 | 142 | # do the transformation 143 | pred_boxes = bbox_transform_inv(boxes, box_deltas) 144 | pred_boxes = clip_boxes(pred_boxes, image.shape) 145 | 146 | pos_dets = np.hstack((pred_boxes, scores[:, 1][:, np.newaxis])).astype(np.float32, copy=False) 147 | keep = nms(pos_dets, cfg.TEST.NMS) 148 | pos_boxes = boxes[keep, :] 149 | cap_ids = cap_ids[keep, :] 150 | im_info[2] = 1. 
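    # The scale factor is reset to 1 because `boxes` were already mapped back
    # to the original image resolution above (rois[:, 1:5] / im_info[2]);
    # draw_bounding_boxes divides by im_info[2] again, so keeping the real
    # scale would shrink the boxes a second time.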
151 | img_cap = draw_bounding_boxes(image, pos_boxes, im_info, cap_ids) 152 | 153 | return img_cap 154 | -------------------------------------------------------------------------------- /logs/densecap.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/InnerPeace-Wu/densecap-tensorflow/2c77998f73832d5bb7324e97e8a7419cbce8398c/logs/densecap.png -------------------------------------------------------------------------------- /logs/funny.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/InnerPeace-Wu/densecap-tensorflow/2c77998f73832d5bb7324e97e8a7419cbce8398c/logs/funny.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Cython>=0.19.2 2 | opencv-python>=3.3.0 3 | numpy>=1.7.1 4 | scipy>=0.13.2 5 | scikit-image>=0.9.3 6 | matplotlib>=1.3.1 7 | ipython>=3.0.0 8 | pyyaml>=3.10 9 | Pillow>=2.3.0 10 | easydict>=1.6 11 | ijson>=2.3 12 | tqdm>=4.17.1 13 | -------------------------------------------------------------------------------- /scripts/dense_cap_config.yml: -------------------------------------------------------------------------------- 1 | EXP_DIR: DenseCap 2 | DEBUG_ALL: False 3 | ALL_TEST: False 4 | ALL_TEST_NUM_TRAIN: 100 5 | ALL_TEST_NUM_VAL: 100 6 | ALL_TEST_NUM_TEST: 1000 7 | LIMIT_RAM: True 8 | EMBED_DIM: 512 9 | CONTEXT_FUSION: False 10 | INIT_BY_GLOVE: False 11 | KEEP_AS_GLOVE_DIM: False 12 | GLOVE_DIM: 300 13 | TRAIN: 14 | HAS_RPN: True 15 | IMS_PER_BATCH: 1 16 | BBOX_NORMALIZE_TARGETS_PRECOMPUTED: True 17 | RPN_POSITIVE_OVERLAP: 0.7 18 | SUMMARY_INTERVAL: 10 19 | RPN_BATCHSIZE: 256 20 | BATCH_SIZE: 256 21 | PROPOSAL_METHOD: gt 22 | BG_THRESH_LO: 0.0 23 | FG_FRACTION: 0.5 24 | RPN_NMS_THRESH: 0.7 25 | MAX_SIZE: 720 26 | USE_FLIPPED: True 27 | LR_DIY_DECAY: True 28 | STEPSIZE: [100000] 29 | WEIGHT_INITIALIZER: normal 30 | DISPLAY: 10 31 | # EXP_DECAY_RATE: 0.5 32 | # EXP_DECAY_STEPS: 500 33 | RESNET: 34 | FIXED_BLOCKS: 1 35 | TEST: 36 | HAS_RPN: True 37 | RPN_NMS_THRESH: 0.6 38 | NMS: 0.5 39 | RPN_POST_NMS_TOP_N: 300 40 | MAX_SIZE: 720 41 | -------------------------------------------------------------------------------- /scripts/dense_cap_demo.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Run with: 4 | # bash scripts/dense_cap_demo.sh [ckpt_path] [vocab_path] 5 | 6 | set -x 7 | set -e 8 | 9 | ckpt=$1 10 | vocab=$2 11 | 12 | # For my own experiment usage, just ignore it. 13 | if [ -d '/home/joe' ]; then 14 | ckpt='/home/joe/git/densecap/output/dc_context/vg_1.2_train' 15 | vocab='/home/joe/git/visual_genome/1.2/vocabulary.txt' 16 | fi 17 | 18 | time python ./tools/demo.py \ 19 | --ckpt ${ckpt} \ 20 | --cfg scripts/dense_cap_config.yml \ 21 | --vocab ${vocab} \ 22 | --set TEST.USE_BEAM_SEARCH False EMBED_DIM 512 TEST.LN_FACTOR 1. 
TEST.RPN_NMS_THRESH 0.7 TEST.NMS 0.3 23 | -------------------------------------------------------------------------------- /scripts/dense_cap_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # -------------------------------------------------------- 4 | # DenseCap-Tensorflow 5 | # Written by InnerPeace 6 | # This file is adapted from Ross Linjie's work 7 | # -------------------------------------------------------- 8 | 9 | # TODO: change the test procedure. 10 | set -x 11 | set -e 12 | 13 | GPU_ID=0 14 | CKPT=$1 15 | TEST_IMDB=$2 16 | 17 | 18 | # Fro valohai platform, maybe out of date. 19 | if [ -d '/valohai/outputs' ]; then 20 | CKPT="./output/Densecap_res50_context_all/vg_1.2_train" 21 | fi 22 | 23 | # For my own experiment, just ignore it. 24 | if [ -d '/home/joe' ]; then 25 | CKPT="/home/joe/git/densecap/output/dc_tune_context/vg_1.2_train" 26 | TEST_IMDB="vg_1.2_test" 27 | fi 28 | 29 | LOG="logs/test_log.txt.`date +'%Y-%m-%d_%H-%M-%S'`" 30 | exec &> >(tee -a "$LOG") 31 | echo Logging output to "$LOG" 32 | 33 | time python ./tools/test_net.py \ 34 | --ckpt ${CKPT} \ 35 | --imdb ${TEST_IMDB} \ 36 | --cfg scripts/dense_cap_config.yml \ 37 | --set ALL_TEST True 38 | -------------------------------------------------------------------------------- /scripts/dense_cap_train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Run with: 4 | # bash scripts/dense_cap_train.sh [dataset] [net] [ckpt_to_init] [data_dir] [step] 5 | 6 | set -x 7 | set -e 8 | 9 | export PYTHONUNBUFFERED='True' 10 | 11 | DATASET=$1 12 | NET=$2 13 | ckpt_path=$3 14 | data_dir=$4 15 | step=$5 16 | 17 | # For my own experiment usage, just ignore it. 18 | if [ -d '/home/joe' ]; then 19 | DATASET='visual_genome_1.2' 20 | NET='res50' 21 | ckpt_path="experiments/random_fixconv_i85k_171219/dc_fixed_1219/vg_1.2_train" 22 | # ckpt_path="experiments/rd_fixconv_i165k_171221/dc_conv_fixed/vg_1.2_train" 23 | # ckpt_path='/home/joe/git/slim_models/res50.ckpt' 24 | data_dir='/home/joe/git/visual_genome' 25 | fi 26 | 27 | case $DATASET in 28 | visual_genome) 29 | TRAIN_IMDB="vg_1.0_train" 30 | TEST_IMDB="vg_1.0_val" 31 | PT_DIR="dense_cap" 32 | FINETUNE_AFTER1=200000 33 | FINETUNE_AFTER2=100000 34 | ITERS1=400000 35 | ITERS2=300000 36 | ;; 37 | visual_genome_1.2) 38 | TRAIN_IMDB="vg_1.2_train" 39 | TEST_IMDB="vg_1.2_val" 40 | PT_DIR="dense_cap" 41 | FINETUNE_AFTER1=200000 42 | FINETUNE_AFTER2=100000 43 | ITERS1=400000 44 | ITERS2=300000 45 | ;; 46 | *) 47 | echo "No dataset given" 48 | exit 49 | ;; 50 | esac 51 | 52 | # This is for valohai computing platform, one can just ignore it. 
53 | if [ -d '/valohai/outputs' ]; then 54 | ckpt_path='/valohai/inputs/resnet' 55 | data_dir='/valohai/inputs/visual_genome' 56 | LOG="/valohai/outputs/s${step}_${NET}_${TRAIN_IMDB}.txt.`date +'%Y-%m-%d_%H-%M-%S'`" 57 | else 58 | LOG="logs/s${step}_${NET}_${TRAIN_IMDB}.txt.`date +'%Y-%m-%d_%H-%M-%S'`" 59 | fi 60 | 61 | exec &> >(tee -a "$LOG") 62 | echo Logging output to "$LOG" 63 | 64 | # First step, freeze conv nets weights 65 | if [ ${step} -lt '2' ] 66 | then 67 | time python ./tools/train_net.py \ 68 | --weights ${ckpt_path} \ 69 | --imdb ${TRAIN_IMDB} \ 70 | --imdbval ${TEST_IMDB} \ 71 | --iters ${FINETUNE_AFTER1}\ 72 | --cfg scripts/dense_cap_config.yml \ 73 | --data_dir ${data_dir} \ 74 | --net ${NET} \ 75 | --set EXP_DIR dc_conv_fixed CONTEXT_FUSION False RESNET.FIXED_BLOCKS 3 76 | fi 77 | 78 | # Step2: Finetune convnets 79 | NEW_WIGHTS=output/dc_conv_fixed/${TRAIN_IMDB} 80 | if [ ${step} -lt '3' ] 81 | then 82 | time python ./tools/train_net.py \ 83 | --weights ${NEW_WIGHTS} \ 84 | --imdb ${TRAIN_IMDB} \ 85 | --iters `expr ${ITERS1} - ${FINETUNE_AFTER1}` \ 86 | --imdbval ${TEST_IMDB} \ 87 | --cfg scripts/dense_cap_config.yml \ 88 | --data_dir ${data_dir} \ 89 | --net ${NET} \ 90 | --set EXP_DIR dc_tune_conv CONTEXT_FUSION False RESNET.FIXED_BLOCKS 1 TRAIN.LEARNING_RATE 0.00025 91 | fi 92 | 93 | # Step3: train with contex fusion 94 | NEW_WIGHTS=output/dc_tune_conv/${TRAIN_IMDB} 95 | if [ ${step} -lt '4' ] 96 | then 97 | time python ./tools/train_net.py \ 98 | --weights ${NEW_WIGHTS} \ 99 | --imdb ${TRAIN_IMDB} \ 100 | --imdbval ${TEST_IMDB} \ 101 | --iters ${FINETUNE_AFTER2} \ 102 | --cfg scripts/dense_cap_config.yml \ 103 | --data_dir ${data_dir} \ 104 | --net ${NET} \ 105 | --set EXP_DIR dc_context CONTEXT_FUSION True RESNET.FIXED_BLOCKS 3 TRAIN.LEARNING_RATE 0.000125 106 | fi 107 | 108 | # Step4: finetune context fusion 109 | NEW_WIGHTS=output/dc_context/${TRAIN_IMDB} 110 | if [ ${step} -lt '5' ] 111 | then 112 | time python ./tools/train_net.py \ 113 | --weights ${NEW_WIGHTS} \ 114 | --imdb ${TRAIN_IMDB} \ 115 | --imdbval ${TEST_IMDB} \ 116 | --iters `expr ${ITERS2} - ${FINETUNE_AFTER2}` \ 117 | --cfg scripts/dense_cap_config.yml \ 118 | --data_dir ${data_dir} \ 119 | --net ${NET} \ 120 | --set EXP_DIR dc_tune_context CONTEXT_FUSION True RESNET.FIXED_BLOCKS 1 TRAIN.LEARNING_RATE 0.0000625 121 | fi 122 | -------------------------------------------------------------------------------- /scripts/old_dense_cap_train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # -------------------------------------------------------- 4 | # DenseCap-Tensorflow 5 | # Written by InnerPeace 6 | # This file is adapted from Ross Linjie's work 7 | # -------------------------------------------------------- 8 | # Script for training dense captioning model with joint inference and visual context 9 | # Do freeze-convnet training first, then finetuning 10 | # Usage: 11 | # ./models/dense_cap/dense_cap_train.sh [GPU_ID] [DATASET] [MODEL_TYPE] [INITIAL_WEIGHTS] [EXTRA_ARGS] 12 | # Example: 13 | # To train a model with joint inference and visual context (late fusion, feature summation) on visual genome 1.0 14 | # TODO: change the example. 
15 | # ./models/dense_cap/dense_cap_train.sh 1 visual_genome late_fusion_sum models/vggnet/vgg16.caffemodel 16 | set -x 17 | set -e 18 | 19 | export PYTHONUNBUFFERED="True" 20 | 21 | GPU_ID=$1 22 | DATASET=$2 23 | MODEL_TYPE=$3 24 | WEIGHTS=$4 25 | array=( $@ ) 26 | len=${#array[@]} 27 | EXTRA_ARGS=${array[@]:4:$len} 28 | EXTRA_ARGS_SLUG=${EXTRA_ARGS// /_} 29 | case $DATASET in 30 | visual_genome) 31 | TRAIN_IMDB="vg_1.0_train" 32 | TEST_IMDB="vg_1.0_val" 33 | PT_DIR="dense_cap" 34 | FINETUNE_AFTER1=200000 35 | FINETUNE_AFTER2=100000 36 | ITERS1=400000 37 | ITERS2=300000 38 | ;; 39 | visual_genome_1.2) 40 | TRAIN_IMDB="vg_1.2_train" 41 | TEST_IMDB="vg_1.2_val" 42 | PT_DIR="dense_cap" 43 | FINETUNE_AFTER1=200000 44 | FINETUNE_AFTER2=100000 45 | ITERS1=400000 46 | ITERS2=300000 47 | ;; 48 | *) 49 | echo "No dataset given" 50 | exit 51 | ;; 52 | esac 53 | GLOG_logtostderr=1 54 | # If training visual context model, need to start with the context-free counterpart 55 | if [ ${MODEL_TYPE} != "joint_inference" ] 56 | then 57 | # TODO: change the options for training 58 | ./tools/train_net.py --gpu ${GPU_ID} \ 59 | --solver models/${PT_DIR}/solver_joint_inference.prototxt \ 60 | --weights ${WEIGHTS} \ 61 | --imdb ${TRAIN_IMDB} \ 62 | --iters ${FINETUNE_AFTER1} \ 63 | --cfg models/${PT_DIR}/dense_cap.yml \ 64 | ${EXTRA_ARGS} 65 | NEW_WEIGHTS=output/dense_cap/${TRAIN_IMDB}/dense_cap_joint_inference_iter_${FINETUNE_AFTER1}.caffemodel 66 | # Finetuning all weights 67 | ./lib/tools/train_net.py --gpu ${GPU_ID} \ 68 | --solver models/${PT_DIR}/solver_joint_inference_finetune.prototxt \ 69 | --weights ${NEW_WEIGHTS} \ 70 | --imdb ${TRAIN_IMDB} \ 71 | --iters `expr ${ITERS1} - ${FINETUNE_AFTER1}` \ 72 | --cfg models/${PT_DIR}/dense_cap.yml \ 73 | ${EXTRA_ARGS} 74 | NEW_WEIGHTS=output/dense_cap/${TRAIN_IMDB}/dense_cap_joint_inference_finetune_iter_`expr ${ITERS1} - ${FINETUNE_AFTER1}`.caffemodel 75 | # Training with convnet weights fixed 76 | ./lib/tools/train_net.py --gpu ${GPU_ID} \ 77 | --solver models/${PT_DIR}/solver_${MODEL_TYPE}.prototxt \ 78 | --weights ${NEW_WEIGHTS} \ 79 | --imdb ${TRAIN_IMDB} \ 80 | --iters ${FINETUNE_AFTER2} \ 81 | --cfg models/${PT_DIR}/dense_cap.yml \ 82 | ${EXTRA_ARGS} 83 | NEW_WEIGHTS=output/dense_cap/${TRAIN_IMDB}/dense_cap_${MODEL_TYPE}_iter_${FINETUNE_AFTER2}.caffemodel 84 | # Finetuning all weights 85 | ./lib/tools/train_net.py --gpu ${GPU_ID} \ 86 | --solver models/${PT_DIR}/solver_${MODEL_TYPE}_finetune.prototxt \ 87 | --weights ${NEW_WEIGHTS} \ 88 | --imdb ${TRAIN_IMDB} \ 89 | --iters `expr ${ITERS2} - ${FINETUNE_AFTER2}` \ 90 | --cfg models/${PT_DIR}/dense_cap.yml \ 91 | ${EXTRA_ARGS} 92 | 93 | else 94 | # Training with convnet weights fixed 95 | ./lib/tools/train_net.py --gpu ${GPU_ID} \ 96 | --solver models/${PT_DIR}/solver_${MODEL_TYPE}.prototxt \ 97 | --weights ${WEIGHTS} \ 98 | --imdb ${TRAIN_IMDB} \ 99 | --iters ${FINETUNE_AFTER1} \ 100 | --cfg models/${PT_DIR}/dense_cap.yml \ 101 | ${EXTRA_ARGS} 102 | NEW_WEIGHTS=output/dense_cap/${TRAIN_IMDB}/dense_cap_${MODEL_TYPE}_iter_${FINETUNE_AFTER1}.caffemodel 103 | # Finetuning all weights 104 | ./lib/tools/train_net.py --gpu ${GPU_ID} \ 105 | --solver models/${PT_DIR}/solver_${MODEL_TYPE}_finetune.prototxt \ 106 | --weights ${NEW_WEIGHTS} \ 107 | --imdb ${TRAIN_IMDB} \ 108 | --iters `expr ${ITERS1} - ${FINETUNE_AFTER1}` \ 109 | --cfg models/${PT_DIR}/dense_cap.yml \ 110 | ${EXTRA_ARGS} 111 | fi 112 | -------------------------------------------------------------------------------- /tests/README.md: 
-------------------------------------------------------------------------------- 1 | ## TEST 2 | Some of the test files during developing, just ignore it. 3 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/InnerPeace-Wu/densecap-tensorflow/2c77998f73832d5bb7324e97e8a7419cbce8398c/tests/__init__.py -------------------------------------------------------------------------------- /tests/architecture_test.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DenseCap-Tensorflow 3 | # Written by InnerPeace 4 | # This file is adapted from Linjie's work 5 | # -------------------------------------------------------- 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | from lib.config import cfg 11 | import tensorflow as tf 12 | from lib.nets.resnet_v1 import resnetv1 13 | from tests.roidata_test import get_data_test 14 | import six 15 | import numpy as np 16 | 17 | 18 | def architecture_test(): 19 | blob = get_data_test() 20 | tf.reset_default_graph() 21 | net = resnetv1(50) 22 | # net._build_network() 23 | net.create_architecture(mode='TEST', tag='pre') 24 | 25 | for n in tf.get_default_graph().as_graph_def().node: 26 | print(n.name) 27 | 28 | tfconfig = tf.ConfigProto(allow_soft_placement=True) 29 | tfconfig.gpu_options.allow_growth = True 30 | 31 | feed_dict = {net._image: blob['data'], 32 | net._im_info: blob['im_info'], 33 | net._gt_boxes: blob['gt_boxes'], 34 | net._gt_phrases: blob['gt_phrases']} 35 | output = net._for_debug 36 | output.update({ 37 | "image": net._image, 38 | "im_info": net._im_info, 39 | "gt_boxes": net._gt_boxes, 40 | "gt_phrases": net._gt_phrases 41 | }) 42 | 43 | with tf.Session(config=tfconfig) as sess: 44 | init = tf.global_variables_initializer() 45 | sess.run(init) 46 | out = sess.run('DenseCap_ResNet50/Prediction/lstm/cap_init_state:0', feed_dict=feed_dict) 47 | print(out.shape) 48 | # out = sess.run(output, feed_dict=feed_dict) 49 | 50 | # for k, v in six.iteritems(out): 51 | # print("name: {} ==> {}".format(k, v.shape)) 52 | # # print("shape: {}".format(v.shape)) 53 | # if k == 'labels': 54 | # # print(v) 55 | # # print("first 5 example:") 56 | # print(v[:5]) 57 | # if k == 'loss' or k == 'total_loss': 58 | # print(k, v) 59 | 60 | 61 | if __name__ == '__main__': 62 | architecture_test() 63 | -------------------------------------------------------------------------------- /tests/bash_log_test/bash_log_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | set -e 5 | 6 | export PYTHONUNBUFFERED="True" 7 | 8 | TAG=$1 9 | 10 | LOG="logs/${TAG}.txt.`date +'%Y-%m-%d_%H-%M-%S'`" 11 | exec &> >(tee -a "$LOG") 12 | echo Logging output to "$LOG" 13 | 14 | time python ./nonsense.py 15 | -------------------------------------------------------------------------------- /tests/bash_log_test/logs/test.txt.2017-10-18_15-33-56: -------------------------------------------------------------------------------- 1 | + echo Logging output to logs/test.txt.2017-10-18_15-33-56 2 | Logging output to logs/test.txt.2017-10-18_15-33-56 3 | + python ./nonsense.py 4 | hello world 5 | 6 | real 0m0.011s 7 | user 0m0.012s 8 | sys 0m0.000s 9 | 
-------------------------------------------------------------------------------- /tests/bash_log_test/nonsense.py: -------------------------------------------------------------------------------- 1 | """test file""" 2 | 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | 8 | def main(): 9 | print("hello world") 10 | 11 | 12 | if __name__ == '__main__': 13 | main() 14 | -------------------------------------------------------------------------------- /tests/ckpt_restore_test.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------- 2 | # DenseCap 3 | # Written by InnerPeace 4 | # This file is adapted from Xinlei's work 5 | # ---------------------------------------------- 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import tensorflow as tf 11 | from tensorflow.python import pywrap_tensorflow 12 | import tensorflow.contrib.slim as slim 13 | 14 | from tensorflow.contrib.slim import arg_scope 15 | from tensorflow.contrib.slim.python.slim.nets import resnet_utils 16 | from tensorflow.contrib.slim.python.slim.nets import resnet_v1 17 | from tensorflow.contrib.slim.python.slim.nets.resnet_v1 import resnet_v1_block 18 | import numpy as np 19 | 20 | from lib.config import cfg 21 | 22 | 23 | def resnet_arg_scope(is_training=True, 24 | batch_norm_decay=0.997, 25 | batch_norm_epsilon=1e-5, 26 | batch_norm_scale=True): 27 | batch_norm_params = { 28 | 'is_training': False, 29 | 'decay': batch_norm_decay, 30 | 'epsilon': batch_norm_epsilon, 31 | 'scale': batch_norm_scale, 32 | 'trainable': False, 33 | 'updates_collections': tf.GraphKeys.UPDATE_OPS 34 | } 35 | 36 | with arg_scope( 37 | [slim.conv2d], 38 | # weights_regularizer=slim.l2_regularizer(cfg.TRAIN.WEIGHT_DECAY), 39 | weights_regularizer=None, 40 | weights_initializer=slim.variance_scaling_initializer(), 41 | trainable=is_training, 42 | activation_fn=tf.nn.relu, 43 | normalizer_fn=slim.batch_norm, 44 | normalizer_params=batch_norm_params): 45 | with arg_scope([slim.batch_norm], **batch_norm_params) as arg_sc: 46 | return arg_sc 47 | 48 | 49 | class resnetv1(): 50 | def __init__(self, num_layers=50): 51 | # Network.__init__(self) 52 | self._feat_stride = [16, ] 53 | self._feat_compress = [1. 
/ float(self._feat_stride[0]), ] 54 | self._num_layers = num_layers 55 | self._scope = 'resnet_v1_%d' % num_layers 56 | self._decide_blocks() 57 | 58 | # Do the first few layers manually, because 'SAME' padding can behave inconsistently 59 | # for images of different sizes: sometimes 0, sometimes 1 60 | def _build_base(self): 61 | with tf.variable_scope(self._scope, self._scope): 62 | net = resnet_utils.conv2d_same(self._image, 64, 7, stride=2, scope='conv1') 63 | net = tf.pad(net, [[0, 0], [1, 1], [1, 1], [0, 0]]) 64 | net = slim.max_pool2d(net, [3, 3], stride=2, padding='VALID', scope='pool1') 65 | 66 | return net 67 | 68 | def _image_to_head(self, is_training, reuse=None): 69 | assert (0 <= cfg.RESNET.FIXED_BLOCKS <= 3) 70 | # Now the base is always fixed during training 71 | with slim.arg_scope(resnet_arg_scope(is_training=False)): 72 | net_conv = self._build_base() 73 | if cfg.RESNET.FIXED_BLOCKS > 0: 74 | with slim.arg_scope(resnet_arg_scope(is_training=False)): 75 | net_conv, _ = resnet_v1.resnet_v1(net_conv, 76 | self._blocks[0:cfg.RESNET.FIXED_BLOCKS], 77 | global_pool=False, 78 | include_root_block=False, 79 | reuse=reuse, 80 | scope=self._scope) 81 | if cfg.RESNET.FIXED_BLOCKS < 3: 82 | with slim.arg_scope(resnet_arg_scope(is_training=is_training)): 83 | net_conv, _ = resnet_v1.resnet_v1(net_conv, 84 | self._blocks[cfg.RESNET.FIXED_BLOCKS:-1], 85 | global_pool=False, 86 | include_root_block=False, 87 | reuse=reuse, 88 | scope=self._scope) 89 | 90 | self._act_summaries.append(net_conv) 91 | self._layers['head'] = net_conv 92 | 93 | return net_conv 94 | 95 | def _decide_blocks(self): 96 | # choose different blocks for different number of layers 97 | if self._num_layers == 50: 98 | self._blocks = [resnet_v1_block('block1', base_depth=64, num_units=3, stride=2), 99 | resnet_v1_block('block2', base_depth=128, num_units=4, stride=2), 100 | # use stride 1 for the last conv4 layer 101 | resnet_v1_block('block3', base_depth=256, num_units=6, stride=1), 102 | resnet_v1_block('block4', base_depth=512, num_units=3, stride=1)] 103 | 104 | elif self._num_layers == 101: 105 | self._blocks = [resnet_v1_block('block1', base_depth=64, num_units=3, stride=2), 106 | resnet_v1_block('block2', base_depth=128, num_units=4, stride=2), 107 | # use stride 1 for the last conv4 layer 108 | resnet_v1_block('block3', base_depth=256, num_units=23, stride=1), 109 | resnet_v1_block('block4', base_depth=512, num_units=3, stride=1)] 110 | 111 | elif self._num_layers == 152: 112 | self._blocks = [resnet_v1_block('block1', base_depth=64, num_units=3, stride=2), 113 | resnet_v1_block('block2', base_depth=128, num_units=8, stride=2), 114 | # use stride 1 for the last conv4 layer 115 | resnet_v1_block('block3', base_depth=256, num_units=36, stride=1), 116 | resnet_v1_block('block4', base_depth=512, num_units=3, stride=1)] 117 | 118 | else: 119 | # other numbers are not supported 120 | raise NotImplementedError 121 | 122 | def get_variables_to_restore(self, variables, var_keep_dic): 123 | variables_to_restore = [] 124 | 125 | for v in variables: 126 | # exclude the first conv layer to swap RGB to BGR 127 | if v.name == (self._scope + '/conv1/weights:0'): 128 | self._variables_to_fix[v.name] = v 129 | continue 130 | if v.name.split(':')[0] in var_keep_dic: 131 | print('Variables restored: %s' % v.name) 132 | variables_to_restore.append(v) 133 | 134 | return variables_to_restore 135 | 136 | def fix_variables(self, sess, pretrained_model): 137 | print('Fix Resnet V1 layers..') 138 | with tf.variable_scope('Fix_Resnet_V1') 
as scope: 139 | with tf.device("/cpu:0"): 140 | # fix RGB to BGR 141 | conv1_rgb = tf.get_variable("conv1_rgb", [7, 7, 3, 64], trainable=False) 142 | restorer_fc = tf.train.Saver({self._scope + "/conv1/weights": conv1_rgb}) 143 | restorer_fc.restore(sess, pretrained_model) 144 | 145 | sess.run(tf.assign(self._variables_to_fix[self._scope + '/conv1/weights:0'], 146 | tf.reverse(conv1_rgb, [2]))) 147 | 148 | 149 | def get_variables_in_checkpoint_file(file_name): 150 | try: 151 | reader = pywrap_tensorflow.NewCheckpointReader(file_name) 152 | var_to_shape_map = reader.get_variable_to_shape_map() 153 | return var_to_shape_map 154 | except Exception as e: # pylint: disable=broad-except 155 | print(str(e)) 156 | if "corrupted compressed block contents" in str(e): 157 | print("It's likely that your checkpoint file has been compressed " 158 | "with SNAPPY.") 159 | 160 | 161 | def main(): 162 | ckpt_path = '/home/joe/git/slim_models/resnet_v1_50.ckpt' 163 | var_keep_dic = get_variables_in_checkpoint_file(ckpt_path) 164 | for key in var_keep_dic: 165 | print("tensor_name: ", key) 166 | 167 | 168 | if __name__ == '__main__': 169 | main() 170 | 171 | -------------------------------------------------------------------------------- /tests/dencap_oa_test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This script is used for my own experiments, just ignore it. 4 | # Run with: 5 | # bash scripts/dense_cap_train.sh [dataset] [net] [ckpt_to_init] [data_dir] [step] 6 | 7 | set -x 8 | set -e 9 | 10 | export PYTHONUNBUFFERED='True' 11 | 12 | DATASET='visual_genome_1.2' 13 | NET='res50' 14 | ckpt_path='/home/joe/git/slim_models' 15 | data_dir='/home/joe/git/visual_genome' 16 | step=$1 17 | 18 | case $DATASET in 19 | visual_genome) 20 | TRAIN_IMDB="vg_1.0_train" 21 | TEST_IMDB="vg_1.0_val" 22 | PT_DIR="dense_cap" 23 | FINETUNE_AFTER1=200000 24 | FINETUNE_AFTER2=100000 25 | ITERS1=400000 26 | ITERS2=300000 27 | ;; 28 | visual_genome_1.2) 29 | TRAIN_IMDB="vg_1.2_train" 30 | TEST_IMDB="vg_1.2_val" 31 | PT_DIR="dense_cap" 32 | FINETUNE_AFTER1=200000 33 | FINETUNE_AFTER2=100000 34 | ITERS1=400000 35 | ITERS2=300000 36 | ;; 37 | *) 38 | echo "No dataset given" 39 | exit 40 | ;; 41 | esac 42 | 43 | if [ -d '/valohai/outputs' ]; then 44 | ckpt_path='/valohai/inputs/resnet' 45 | data_dir='/valohai/inputs/visual_genome' 46 | LOG="/valohai/outputs/s${step}_${NET}_${TRAIN_IMDB}.txt.`date +'%Y-%m-%d_%H-%M-%S'`" 47 | else 48 | LOG="logs/s${step}_${NET}_${TRAIN_IMDB}.txt.`date +'%Y-%m-%d_%H-%M-%S'`" 49 | fi 50 | 51 | exec &> >(tee -a "$LOG") 52 | echo Logging output to "$LOG" 53 | 54 | FIRST_ITERS=80000 55 | if [ ${step} -lt '2' ] 56 | then 57 | time python ./tools/train_net.py \ 58 | --weights ${ckpt_path}/${NET}.ckpt \ 59 | --imdb ${TRAIN_IMDB} \ 60 | --imdbval ${TEST_IMDB} \ 61 | --iters 50000 \ 62 | --cfg scripts/dense_cap_config.yml \ 63 | --data_dir ${data_dir} \ 64 | --net ${NET} \ 65 | --set TRAIN_GLOVE False EXP_DIR dc_fixed CONTEXT_FUSION False RESNET.FIXED_BLOCKS 3 KEEP_AS_GLOVE_DIM False LOSS.CLS_W 1. LOSS.BBOX_W 0.2 LOSS.RPN_BBOX_W 1. 
LOSS.RPN_CLS_W 0.5 66 | # --set EXP_DIR dc_fixed CONTEXT_FUSION False RESNET.FIXED_BLOCKS 3 67 | 68 | # mkdir output/dc_fixed 69 | # cp -r output/Densecap/ output/dc_dc_fixed 70 | fi 71 | 72 | NEW_WIGHTS=output/dc_fixed/${TRAIN_IMDB} 73 | if [ ${step} -lt '3' ] 74 | then 75 | time python ./tools/train_net.py \ 76 | --weights ${NEW_WIGHTS} \ 77 | --imdb ${TRAIN_IMDB} \ 78 | --iters 30000 \ 79 | --imdbval ${TEST_IMDB} \ 80 | --cfg scripts/dense_cap_config.yml \ 81 | --data_dir ${data_dir} \ 82 | --net ${NET} \ 83 | --set TRAIN_GLOVE True EXP_DIR dc_tune_vec CONTEXT_FUSION False RESNET.FIXED_BLOCKS 3 KEEP_AS_GLOVE_DIM False 84 | # TRAIN.LEARNING_RATE 0.0005 85 | # --iters `expr ${FINETUNE_AFTER1} - ${FIRST_ITERS}` \ 86 | 87 | # mkdir output/dc_tune_vec 88 | # cp -r output/Densecap/ output/dc_tune_vec 89 | fi 90 | 91 | #NEW_WIGHTS=output/dc_tune_vec/${TRAIN_IMDB} 92 | if [ ${step} -lt '4' ] 93 | then 94 | time python ./tools/train_net.py \ 95 | --weights ${NEW_WIGHTS} \ 96 | --imdb ${TRAIN_IMDB} \ 97 | --imdbval ${TEST_IMDB} \ 98 | --iters `expr ${ITERS1} - ${FINETUNE_AFTER1}` \ 99 | --cfg scripts/dense_cap_config.yml \ 100 | --data_dir ${data_dir} \ 101 | --net ${NET} \ 102 | --set EXP_DIR dc_tune_conv CONTEXT_FUSION False RESNET.FIXED_BLOCKS 1 103 | 104 | # mkdir output/dc_tune_conv 105 | # cp -r output/Densecap/ output/dc_tune_conv 106 | fi 107 | 108 | NEW_WIGHTS=output/dc_tune_conv/${TRAIN_IMDB} 109 | if [ ${step} -lt '5' ] 110 | then 111 | time python ./tools/train_net.py \ 112 | --weights ${NEW_WIGHTS} \ 113 | --imdb ${TRAIN_IMDB} \ 114 | --imdbval ${TEST_IMDB} \ 115 | --iters ${FINETUNE_AFTER2} \ 116 | --cfg scripts/dense_cap_config.yml \ 117 | --data_dir ${data_dir} \ 118 | --net ${NET} \ 119 | --set TRAIN_GLOVE True EXP_DIR dc_context CONTEXT_FUSION True RESNET.FIXED_BLOCKS 3 120 | # mkdir output/dc_context 121 | # cp -r output/Densecap/ output/dc_context 122 | # --iters `expr ${FINETUNE_AFTER1} - ${FIRST_ITERS}` 123 | fi 124 | 125 | NEW_WIGHTS=output/dc_context/${TRAIN_IMDB} 126 | if [ ${step} -lt '6' ] 127 | then 128 | time python ./tools/train_net.py \ 129 | --weights ${NEW_WIGHTS} \ 130 | --imdb ${TRAIN_IMDB} \ 131 | --imdbval ${TEST_IMDB} \ 132 | --iters `expr ${ITERS2} - ${FINETUNE_AFTER2}` \ 133 | --cfg scripts/dense_cap_config.yml \ 134 | --data_dir ${data_dir} \ 135 | --net ${NET} \ 136 | --set TRAIN_GLOVE True EXP_DIR dc_tune_context CONTEXT_FUSION True RESNET.FIXED_BLOCKS 1 137 | fi 138 | -------------------------------------------------------------------------------- /tests/logs/architecture_test.txt: -------------------------------------------------------------------------------- 1 | /home/joe/.tf_env2/bin/python /home/joe/git/densecap/tests/architecture_test.py 2 | data_path: /home/joe/git/visual_genome_test/1.2 3 | pre gt roidb could be loaded from /home/joe/git/visual_genome_test/1.2_cache/pre_gt_roidb 4 | LIMIT_RAM version and load index from /home/joe/git/visual_genome_test/1.2_cache/pre_gt_roidb/image_index.json 5 | 6 | /home/joe/git/densecap/lib/fast_rcnn/bbox_transform.py:50: RuntimeWarning: overflow encountered in exp 7 | pred_w = np.exp(dw) * widths[:, np.newaxis] 8 | /home/joe/git/densecap/lib/fast_rcnn/bbox_transform.py:50: RuntimeWarning: overflow encountered in multiply 9 | pred_w = np.exp(dw) * widths[:, np.newaxis] 10 | /home/joe/git/densecap/lib/fast_rcnn/bbox_transform.py:51: RuntimeWarning: overflow encountered in exp 11 | pred_h = np.exp(dh) * heights[:, np.newaxis] 12 | /home/joe/git/densecap/lib/fast_rcnn/bbox_transform.py:51: RuntimeWarning: 
overflow encountered in multiply 13 | pred_h = np.exp(dh) * heights[:, np.newaxis] 14 | 15 | length of labels, i.e. number of regions: 256 16 | sentence data layer input (first 3) 17 | 2239.0 [ 4 87 6 5 85 87 0 0 0 0] 18 | 2239.0 [ 4 87 6 5 85 87 0 0 0 0] 19 | 2239.0 [ 4 87 6 5 85 87 0 0 0 0] 20 | sentence data layer output (first 3) 21 | input sentence 22 | [[ 1. 4. 87. 6. 5. 85. 87. 0. 0. 0. 0.] 23 | [ 1. 4. 87. 6. 5. 85. 87. 0. 0. 0. 0.] 24 | [ 1. 4. 87. 6. 5. 85. 87. 0. 0. 0. 0.]] 25 | target sentence 26 | [[ 1. 4. 87. 6. 5. 85. 87. 2. 0. 0. 0. 0.] 27 | [ 1. 4. 87. 6. 5. 85. 87. 2. 0. 0. 0. 0.] 28 | [ 1. 4. 87. 6. 5. 85. 87. 2. 0. 0. 0. 0.]] 29 | cont sentence 30 | [[ 0. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0.] 31 | [ 0. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0.] 32 | [ 0. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0.]] 33 | cont bbox 34 | [[ 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.] 35 | [ 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.] 36 | [ 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]] 37 | 38 | name: fc7 ==> (256, 2048) 39 | name: image ==> (1, 540, 720, 3) 40 | name: labels ==> (256,) 41 | [3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 42 | 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 43 | 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 44 | 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 45 | 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 46 | 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 47 | 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 48 | 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 49 | 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 50 | 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 51 | 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 52 | 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 53 | 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 54 | 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 55 | 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 56 | 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 57 | 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 3682 58 | 3682] 59 | name: bbox_inside_weights ==> (256, 4) 60 | name: bbox_targets ==> (256, 4) 61 | name: input_sentence ==> (256, 11) 62 | name: rpn ==> (1, 34, 45, 512) 63 | name: rpn_labels ==> (1, 1, 408, 45) 64 | name: cont_bbox ==> (256, 12) 65 | name: bbox_outside_weights ==> (256, 4) 66 | name: target_sentence ==> (256, 12) 67 | name: rpn_bbox_outside_weights ==> (1, 34, 45, 48) 68 | name: pool5 ==> (256, 7, 7, 1024) 69 | name: rpn_bbox_inside_weights ==> (1, 34, 45, 48) 70 | name: proposal_rois ==> (9, 5) 71 | name: head ==> (1, 34, 45, 1024) 72 | name: clss ==> (256,) 73 | name: rpn_cls_score_reshape ==> (1, 408, 45, 2) 74 | name: anchors ==> (18360, 4) 75 | name: cont_sentence ==> (256, 12) 76 | name: cls_prob ==> (256, 2) 77 | name: gt_boxes ==> (262, 5) 78 | name: rpn_bbox_pred ==> (1, 34, 45, 48) 79 | name: rpn_cls_score ==> (1, 34, 45, 24) 80 | name: im_info ==> (3,) 81 | name: phrases ==> (256, 10) 82 | name: rpn_cls_prob ==> (1, 34, 45, 24) 83 | name: gt_phrases ==> (262, 10) 84 | name: rois ==> (256, 5) 85 | name: proposal_rpn_scores ==> (9, 1) 86 | name: rpn_cls_prob_reshape ==> (1, 408, 45, 2) 87 | 
name: rpn_bbox_targets ==> (1, 34, 45, 48) 88 | 89 | 90 | -------------------------------------------------------------------------------- /tests/logs/preprocessing.txt: -------------------------------------------------------------------------------- 1 | split image number: 77398 for split name: train 2 | start loading image meta data json files... 3 | 0.316329 seconds for loading 4 | train: 100%|███████████████████████████| 108077/108077 [03:05<00:00, 581.84it/s] 5 | processing train set with time: 185.75 seconds 6 | there are 272 invalid bboxes out of 3684063 7 | there are 3 empty phrases after triming 8 | Found 56945 unique word tokens. 9 | Using vocabulary size 10000. 10 | The least frequent word in our vocabulary is 'ruff' and appeared 14 times. 11 | Dumping vocabulary to file: /home/joe/git/visual_genome/1.2/vocabulary.txt 12 | Done. 13 | split image number: 5000 for split name: val 14 | start loading image meta data json files... 15 | 0.273385 seconds for loading 16 | val: 100%|████████████████████████████| 108077/108077 [00:20<00:00, 5401.88it/s] 17 | processing val set with time: 20.01 seconds 18 | there are 14 invalid bboxes out of 237362 19 | there are 0 empty phrases after triming 20 | split image number: 5000 for split name: test 21 | start loading image meta data json files... 22 | 0.273840 seconds for loading 23 | test: 100%|███████████████████████████| 108077/108077 [00:20<00:00, 5225.84it/s] 24 | processing test set with time: 20.68 seconds 25 | there are 17 invalid bboxes out of 238069 26 | there are 0 empty phrases after triming -------------------------------------------------------------------------------- /tests/logs/sentence_data_layer_test.txt: -------------------------------------------------------------------------------- 1 | data_path: /home/joe/git/visual_genome_test/1.2 2 | Appending horizontally-flipped training examples... 3 | pre gt roidb loaded from /home/joe/git/visual_genome_test/1.2/pre_gt_roidb.pkl 4 | done 5 | Preparing training data... 6 | done 7 | Filtered 0 roidb entries: 4 -> 4 8 | length of labels, i.e. number of regions: 262 9 | sentence data layer input (first 3) 10 | 1382.0 [ 4 33 6 25 20 144 0 0 0 0] 11 | 1383.0 [167 6 30 4 11 0 0 0 0 0] 12 | 1384.0 [ 7 6 21 72 0 0 0 0 0 0] 13 | sentence data layer output (first 3) 14 | input sentence 15 | [[ 1. 4. 33. 6. 25. 20. 144. 0. 0. 0. 0.] 16 | [ 1. 167. 6. 30. 4. 11. 0. 0. 0. 0. 0.] 17 | [ 1. 7. 6. 21. 72. 0. 0. 0. 0. 0. 0.]] 18 | target sentence 19 | [[ 1. 4. 33. 6. 25. 20. 144. 2. 0. 0. 0. 0.] 20 | [ 1. 167. 6. 30. 4. 11. 2. 0. 0. 0. 0. 0.] 21 | [ 1. 7. 6. 21. 72. 2. 0. 0. 0. 0. 0. 0.]] 22 | cont sentence 23 | [[ 0. 1. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0.] 24 | [ 0. 1. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0.] 25 | [ 0. 1. 1. 1. 1. 1. 0. 0. 0. 0. 0. 0.]] 26 | cont bbox 27 | [[ 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.] 28 | [ 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.] 29 | [ 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 
0.]] -------------------------------------------------------------------------------- /tests/pickle_read_test.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DenseCap-Tensorflow 3 | # Written by InnerPeace 4 | # This file is adapted from Ross Girshick's work 5 | # -------------------------------------------------------- 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | from os.path import join as pjoin 11 | from six.moves import cPickle 12 | 13 | def pickle_test(): 14 | DEFAULT_PATH = '/home/joe/git/visual_genome_test' 15 | cache = pjoin(DEFAULT_PATH, '1.2_cache/pre_gt_roidb', '1.pkl') 16 | cache_flip = pjoin(DEFAULT_PATH, '1.2_cache/pre_gt_roidb', '1_flip.pkl') 17 | ori = pjoin(DEFAULT_PATH, '1.2', 'pre_gt_roidb.pkl') 18 | phra = pjoin(DEFAULT_PATH, '1.2', 'pre_gt_phrases.pkl') 19 | with open(cache, 'rb') as fc: 20 | data_cache = cPickle.load(fc) 21 | with open(cache_flip, 'rb') as f: 22 | data_flip = cPickle.load(f) 23 | with open(ori, 'rb') as fo: 24 | data_ori = cPickle.load(fo) 25 | with open(phra, 'rb') as fp: 26 | data_phra = cPickle.load(fp) 27 | # from IPython import embed; 28 | # embed() 29 | 30 | print(data_cache) 31 | print ('flip------------------') 32 | print(data_flip) 33 | print ('ori------------------') 34 | print(data_ori) 35 | print("data ori length:", len(data_ori)) 36 | print ('phrase------------------') 37 | print (data_phra) 38 | # print (data_phra[2239]) 39 | 40 | 41 | if __name__ == '__main__': 42 | pickle_test() 43 | -------------------------------------------------------------------------------- /tests/read_regions_json/ijson_example.txt: -------------------------------------------------------------------------------- 1 | ('', u'start_array', None) 2 | ('item', u'start_map', None) 3 | ('item', u'map_key', u'regions') 4 | (u'item.regions', u'start_array', None) 5 | (u'item.regions.item', u'start_map', None) 6 | (u'item.regions.item', u'map_key', u'region_id') 7 | (u'item.regions.item.region_id', u'number', 1382) 8 | (u'item.regions.item', u'map_key', u'width') 9 | (u'item.regions.item.width', u'number', 82) 10 | (u'item.regions.item', u'map_key', u'height') 11 | (u'item.regions.item.height', u'number', 139) 12 | (u'item.regions.item', u'map_key', u'image_id') 13 | (u'item.regions.item.image_id', u'number', 1) 14 | (u'item.regions.item', u'map_key', u'phrase') 15 | (u'item.regions.item.phrase', u'string', u'the clock is green in colour') 16 | (u'item.regions.item', u'map_key', u'y') 17 | (u'item.regions.item.y', u'number', 57) 18 | (u'item.regions.item', u'map_key', u'x') 19 | (u'item.regions.item.x', u'number', 421) 20 | (u'item.regions.item', u'end_map', None) 21 | -------------------------------------------------------------------------------- /tests/read_regions_json/read_regions_test.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------------------- 2 | # DenseCap 3 | # Written by InnerPeace 4 | # ---------------------------------------------- 5 | 6 | """read large region description json files""" 7 | 8 | import ijson 9 | import json 10 | # import tqdm 11 | 12 | def read_regions( ): 13 | VG_VERSION = '1.2' 14 | VG_PATH = '/home/joe/git/VG_raw_data' 15 | VG_REGION_PATH = '%s/%s/region_descriptions.json' % (VG_PATH, VG_VERSION) 16 | # parser = ijson.parse(open('test_region.json')) 17 | parser = 
ijson.parse(open(VG_REGION_PATH)) 18 | 19 | last_value = None 20 | Dic = {} 21 | regions = [] 22 | dic = {} 23 | for prefix, event, value in parser: 24 | if value == 'regions': 25 | Dic = {} 26 | regions = [] 27 | last_value = None 28 | elif last_value == 'id': 29 | Dic['regions'] = regions 30 | Dic['id'] = value 31 | with open('test_id_%s.json' % value, 'w') as f: 32 | json.dump(Dic, f) 33 | break 34 | elif event == 'map_key': 35 | last_value = value 36 | elif event == 'end_map': 37 | regions.append(dic) 38 | dic = {} 39 | last_value = None 40 | elif last_value: 41 | dic[last_value] = value 42 | 43 | 44 | def equal_test( ): 45 | new = json.load(open('true_id_1_out.json')) 46 | old = json.load(open('true_id_1.json')) 47 | if old == new: 48 | print('success!') 49 | else: 50 | print('ERROR!') 51 | 52 | '''OUT: success!''' 53 | 54 | 55 | def json_line_read( ): 56 | '''This is not working''' 57 | 58 | with open('true_id_1.json', 'r') as f: 59 | for line in f: 60 | print(line) 61 | 62 | 63 | def read_time_test( ): 64 | path = '/home/joe/git/visual_genome_test/1.2/pre_gt_regions/1.json' 65 | import time 66 | tic = time.time() 67 | with open(path, 'r') as f: 68 | data = json.load(f) 69 | toc = time.time() 70 | print ('read time: %s seconds' % (toc - tic)) 71 | 72 | def read_all_regions_test(): 73 | '''it gonna kill my computer''' 74 | from tqdm import tqdm 75 | path = '/home/joe/git/visual_genome/1.2/train_gt_regions/' 76 | split_path = '/home/joe/git/densecap/info/densecap_splits.json' 77 | with open(split_path, 'r') as fid: 78 | img_index = json.load(fid)['train'] 79 | all_regions = {} 80 | for i in tqdm(xrange(len(img_index)), desc='train set'): 81 | idx = img_index[i] 82 | with open(path+'%s.json'%idx, 'r') as f: 83 | all_regions["%s"%idx] = json.load(f) 84 | 85 | if __name__ == '__main__': 86 | # read_regions() 87 | # equal_test() 88 | # json_line_read() 89 | # read_time_test() 90 | read_all_regions_test() 91 | -------------------------------------------------------------------------------- /tests/read_regions_json/test_region.json: -------------------------------------------------------------------------------- 1 | {"regions":[{"region_id": 4091, "width": 396, "height": 293, "image_id": 1, "phrase": "tall buildings with many windows", "y": 6, "x": 396}, {"region_id": 4090, "width": 709, "height": 281, "image_id": 1, "phrase": "brick sidewalk", "y": 315, "x": 81}], "id": 1} -------------------------------------------------------------------------------- /tests/read_regions_json/test_region_out.json: -------------------------------------------------------------------------------- 1 | {"regions": [{"region_id": 4091, "image_id": 1, "height": 293, "width": 396, "x": 396, "y": 6, "phrase": "tall buildings with many windows"}, {"region_id": 4090, "image_id": 1, "height": 281, "width": 709, "x": 81, "y": 315, "phrase": "brick sidewalk"}], "id": 1} -------------------------------------------------------------------------------- /tests/roidata_test.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DenseCap-Tensorflow 3 | # Written by InnerPeace 4 | # -------------------------------------------------------- 5 | from __future__ import absolute_import 6 | from __future__ import division 7 | from __future__ import print_function 8 | 9 | from lib.fast_rcnn.layer import RoIDataLayer 10 | from lib.config import cfg 11 | from lib.datasets.visual_genome import visual_genome 12 | import lib.fast_rcnn.roidb 
as rdl_roidb 13 | import cv2 14 | import numpy as np 15 | from six.moves import xrange 16 | 17 | # cfg.LIMIT_RAM = False 18 | DEFAULT_PATH = '/home/joe/git/visual_genome_test/1.2' 19 | 20 | 21 | # def roidata_test(roidb, num_classes=2): 22 | # data = RoIDataLayer(roidb, num_classes=num_classes) 23 | 24 | def get_training_roidb(imdb): 25 | """Returns a roidb (Region of Interest database) for use in training.""" 26 | if cfg.TRAIN.USE_FLIPPED and not cfg.LIMIT_RAM: 27 | print('Appending horizontally-flipped training examples...') 28 | imdb.append_flipped_images() 29 | print('done') 30 | 31 | print('Preparing training data...') 32 | rdl_roidb.prepare_roidb(imdb) 33 | print('done') 34 | 35 | return imdb.roidb 36 | 37 | 38 | def filter_roidb(roidb): 39 | """Remove roidb entries that have no usable RoIs.""" 40 | 41 | def is_valid(entry): 42 | # Valid images have: 43 | # (1) At least one foreground RoI OR 44 | # (2) At least one background RoI 45 | overlaps = entry['max_overlaps'] 46 | # find boxes with sufficient overlap 47 | fg_inds = np.where(overlaps >= cfg.TRAIN.FG_THRESH)[0] 48 | # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI) 49 | bg_inds = np.where((overlaps < cfg.TRAIN.BG_THRESH_HI) & 50 | (overlaps >= cfg.TRAIN.BG_THRESH_LO))[0] 51 | # image is only valid if such boxes exist 52 | valid = len(fg_inds) > 0 or len(bg_inds) > 0 53 | return valid 54 | 55 | num = len(roidb) 56 | filtered_roidb = [entry for entry in roidb if is_valid(entry)] 57 | num_after = len(filtered_roidb) 58 | print('Filtered {} roidb entries: {} -> {}'.format(num - num_after, 59 | num, num_after)) 60 | return filtered_roidb 61 | 62 | 63 | def vis_regions(im, regions, phrases=None, path='/home/joe/git/VG_raw_data/images_test'): 64 | vocab_path = '%s/vocabulary.txt' % DEFAULT_PATH 65 | with open(vocab_path, 'r') as f: 66 | vocab = [line.strip() for line in f] 67 | 68 | mean_values = np.array([[[102.9801, 115.9465, 122.7717]]]) 69 | im = im + mean_values # offset to original values 70 | 71 | for i in xrange(len(regions)): 72 | if i > 9: 73 | print ('save 10 examples and break out.') 74 | break 75 | bbox = regions[i, :4] 76 | region_id = regions[i, 4] 77 | # position 0,1,2 have been taken 78 | caption = ' '.join([vocab[j - 3] if j-3>=0 else "" for j in phrases[i]]) 79 | im_new = np.copy(im) 80 | cv2.rectangle(im_new, (bbox[0], bbox[1]), (bbox[2], bbox[3]), (0, 0, 255), 2) 81 | cv2.imwrite('%s/%s.jpg' % (path, caption), im_new) 82 | 83 | def get_data_test(): 84 | imdb = visual_genome('pre', '1.2') 85 | if cfg.LIMIT_RAM: 86 | roidb = imdb.roidb 87 | else: 88 | roidb = get_training_roidb(imdb) 89 | roidb = filter_roidb(roidb) 90 | rdata = RoIDataLayer(roidb) 91 | data = rdata.forward() 92 | 93 | return data 94 | 95 | 96 | if __name__ == '__main__': 97 | imdb = visual_genome('pre', '1.2') 98 | if cfg.LIMIT_RAM: 99 | roidb = imdb.roidb 100 | else: 101 | roidb = get_training_roidb(imdb) 102 | roidb = filter_roidb(roidb) 103 | rdata = RoIDataLayer(roidb) 104 | data = rdata.forward() 105 | # data = rdata.forward() 106 | print(data) 107 | regions = data['gt_boxes'] 108 | im = data['data'][0] 109 | phrases = data['gt_phrases'] 110 | vis_regions(im, regions, phrases=phrases) 111 | 112 | # from IPython import embed; 113 | # 114 | # embed() 115 | -------------------------------------------------------------------------------- /tests/sentence_data_layer_test.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # 
DenseCap-Tensorflow 3 | # Written by InnerPeace 4 | # This file is adapted from Linjie's work 5 | # -------------------------------------------------------- 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | from lib.config import cfg 11 | from lib.layers.sentence_data_layer import sentence_data_layer 12 | from tests.roidata_test import get_data_test 13 | import numpy as np 14 | 15 | 16 | def sentence_data_layer_test(): 17 | data = get_data_test() 18 | phrases = data['gt_phrases'] 19 | 20 | labels = data['gt_boxes'][:3, 4] 21 | sentence_data_layer(labels, phrases) 22 | 23 | 24 | if __name__ == '__main__': 25 | 26 | sentence_data_layer_test() 27 | -------------------------------------------------------------------------------- /tests/vh_train_command.sh: -------------------------------------------------------------------------------- 1 | # prepare data 2 | pip install opencv-python 3 | apt-get -y update && apt-get install -y libsm6 libxext6 4 | pip install --upgrade pip 5 | pip install -r requirements.txt 6 | cd /valohai/inputs 7 | tar -xvzf ./vg_data/visual_genome.tar.gz 8 | mv ./valohai/inputs/visual_genome/ ./ 9 | mkdir ./images 10 | unzip image_1/images.zip -d ./images 11 | unzip image_2/images2.zip -d ./images 12 | ls 13 | cd /valohai/repository 14 | cd lib 15 | make 16 | cd .. 17 | bash ./tests/dencap_oa_test.sh {parameters} 18 | tar -czvf /valohai/outputs/output.tar.gz ./output 19 | -------------------------------------------------------------------------------- /tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/InnerPeace-Wu/densecap-tensorflow/2c77998f73832d5bb7324e97e8a7419cbce8398c/tools/__init__.py -------------------------------------------------------------------------------- /tools/_init_paths.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | import sys 3 | 4 | 5 | def add_path(path): 6 | if path not in sys.path: 7 | sys.path.insert(0, path) 8 | 9 | 10 | this_dir = osp.dirname(__file__) 11 | lib_path = osp.join(this_dir, '..') 12 | add_path(lib_path) 13 | -------------------------------------------------------------------------------- /tools/demo.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DenseCap-Tensorflow 3 | # Written by InnerPeace 4 | # This file is adapted from Linjie's work 5 | # -------------------------------------------------------- 6 | # Demo a dense captioning model 7 | # Code adapted from faster R-CNN project 8 | # -------------------------------------------------------- 9 | # Fast R-CNN 10 | # Copyright (c) 2015 Microsoft 11 | # Licensed under The MIT License [see LICENSE for details] 12 | # Written by Ross Girshick 13 | # -------------------------------------------------------- 14 | from __future__ import absolute_import 15 | from __future__ import division 16 | from __future__ import print_function 17 | 18 | """Demo a dense caption model""" 19 | 20 | import _init_paths 21 | from os.path import join as pjoin 22 | import sys 23 | import six 24 | import glob 25 | import argparse 26 | import json 27 | import numpy as np 28 | import tensorflow as tf 29 | 30 | from lib.config import cfg, cfg_from_file, cfg_from_list, get_output_dir, get_output_tb_dir 31 | from lib.datasets.factory import get_imdb 32 | import
lib.datasets.imdb 33 | from lib.dense_cap.train import get_training_roidb, train_net 34 | from lib.dense_cap.test import test_im 35 | from lib.nets.vgg16 import vgg16 36 | from lib.nets.resnet_v1 import resnetv1 37 | import pprint 38 | 39 | 40 | def parse_args(): 41 | """ 42 | Parse input arguments 43 | """ 44 | parser = argparse.ArgumentParser(description='Test a Dense Caption network') 45 | 46 | parser.add_argument('--ckpt', dest='ckpt', 47 | help='initialize with pretrained model weights', 48 | default=None, type=str) 49 | parser.add_argument('--cfg', dest='cfg_file', 50 | help='optional config file', 51 | default=None, type=str) 52 | # TODO: add inception 53 | parser.add_argument('--net', dest='net', 54 | help='vgg16, res50, res101, res152', 55 | default='res50', type=str) 56 | parser.add_argument('--vocab', dest='vocabulary', 57 | help='vocabulary file', 58 | default=None, type=str) 59 | 60 | parser.add_argument('--set', dest='set_cfgs', 61 | help='set config keys', default=None, 62 | nargs=argparse.REMAINDER) 63 | 64 | if len(sys.argv) == 1: 65 | parser.print_help() 66 | sys.exit(1) 67 | 68 | args = parser.parse_args() 69 | return args 70 | 71 | 72 | if __name__ == '__main__': 73 | args = parse_args() 74 | print('------- called with args: --------') 75 | pprint.pprint(args) 76 | 77 | if args.cfg_file is not None: 78 | cfg_from_file(args.cfg_file) 79 | if args.set_cfgs is not None: 80 | cfg_from_list(args.set_cfgs) 81 | 82 | # load network 83 | if args.net == 'vgg16': 84 | net = vgg16() 85 | elif args.net == 'res50': 86 | net = resnetv1(num_layers=50) 87 | elif args.net == 'res101': 88 | net = resnetv1(num_layers=101) 89 | elif args.net == 'res152': 90 | net = resnetv1(num_layers=152) 91 | else: 92 | raise NotImplementedError 93 | 94 | net.create_architecture("TEST", num_classes=1, tag='pre') 95 | vocab = ['', '', ''] 96 | with open(args.vocabulary, 'r') as f: 97 | for line in f: 98 | vocab.append(line.strip()) 99 | 100 | # get the image paths 101 | im_paths = glob.glob('./data/demo/*.jpg') 102 | print(im_paths) 103 | 104 | # read checkpoint file 105 | if args.ckpt: 106 | ckpt = tf.train.get_checkpoint_state(args.ckpt) 107 | else: 108 | raise ValueError 109 | 110 | # set config 111 | tfconfig = tf.ConfigProto(allow_soft_placement=True) 112 | tfconfig.gpu_options.allow_growth = True 113 | 114 | # init session 115 | saver = tf.train.Saver() 116 | with tf.Session(config=tfconfig) as sess: 117 | print('Restored from {}'.format(ckpt.model_checkpoint_path)) 118 | saver.restore(sess, ckpt.model_checkpoint_path) 119 | 120 | # for n in tf.get_default_graph().as_graph_def().node: 121 | # if 'input_feed' in n.name: 122 | # print(n.name) 123 | # for html visualization 124 | pre_results = {} 125 | save_path = './vis/data' 126 | for path in im_paths: 127 | pre_results = test_im(sess, net, path, vocab, pre_results) 128 | 129 | with open(save_path + '/results.json', 'w') as f: 130 | json.dump(pre_results, f) 131 | -------------------------------------------------------------------------------- /tools/test_net.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # DenseCap-Tensorflow 3 | # Written by InnerPeace 4 | # This file is adapted from Linjie's work 5 | # -------------------------------------------------------- 6 | # Train a dense captioning model 7 | # Code adapted from faster R-CNN project 8 | # -------------------------------------------------------- 9 | # Fast R-CNN 10 | # Copyright (c) 2015 
Microsoft 11 | # Licensed under The MIT License [see LICENSE for details] 12 | # Written by Ross Girshick 13 | # -------------------------------------------------------- 14 | from __future__ import absolute_import 15 | from __future__ import division 16 | from __future__ import print_function 17 | 18 | """Test a dense caption model""" 19 | import _init_paths 20 | from lib.dense_cap.test import test_net 21 | from lib.config import cfg, cfg_from_file, cfg_from_list 22 | from lib.datasets.factory import get_imdb 23 | import argparse 24 | import pprint 25 | import time 26 | import os 27 | import sys 28 | import tensorflow as tf 29 | from lib.nets.vgg16 import vgg16 30 | from lib.nets.resnet_v1 import resnetv1 31 | 32 | 33 | def parse_args(): 34 | """ 35 | Parse input arguments 36 | """ 37 | parser = argparse.ArgumentParser(description='Test a Fast R-CNN network') 38 | parser.add_argument('--device', dest='device', help='device to use', 39 | default='gpu', type=str) 40 | parser.add_argument('--device_id', dest='device_id', help='device id to use', 41 | default=0, type=int) 42 | parser.add_argument('--tag', dest='tag', 43 | help='tag of the model', 44 | default=None, type=str) 45 | parser.add_argument('--ckpt', dest='ckpt', 46 | help='initialize with pretrained model weights', 47 | default=None, type=str) 48 | parser.add_argument('--cfg', dest='cfg_file', 49 | help='optional config file', 50 | default=None, type=str) 51 | parser.add_argument('--imdb', dest='imdb_name', 52 | help='dataset to test on', 53 | default='vg_1.2_test', type=str) 54 | # TODO: delete extra options 55 | # parser.add_argument('--iters', dest='max_iters', 56 | # help='number of iterations to train', 57 | # default=40000, type=int) 58 | # parser.add_argument('--imdbval', dest='imdbval_name', 59 | # help='dataset to validation on', 60 | # default='vg_1.2_val', type=str) 61 | # parser.add_argument('--rand', dest='randomize', 62 | # help='randomize (do not use a fixed seed)', 63 | # action='store_true') 64 | # TODO: add inception 65 | parser.add_argument('--net', dest='net', 66 | help='vgg16, res50, res101, res152', 67 | default='res50', type=str) 68 | parser.add_argument('--vis', dest='vis', help='visualize detections', 69 | action='store_true') 70 | parser.add_argument('--use_box_at', dest='use_box_at', 71 | help='use predicted box at this time step, default to the last', 72 | default=-1, type=int) 73 | parser.add_argument('--set', dest='set_cfgs', 74 | help='set config keys', default=None, 75 | nargs=argparse.REMAINDER) 76 | 77 | if len(sys.argv) == 1: 78 | parser.print_help() 79 | sys.exit(1) 80 | 81 | args = parser.parse_args() 82 | return args 83 | 84 | 85 | if __name__ == '__main__': 86 | args = parse_args() 87 | 88 | print('Called with args:') 89 | print(args) 90 | 91 | if args.cfg_file is not None: 92 | cfg_from_file(args.cfg_file) 93 | if args.set_cfgs is not None: 94 | cfg_from_list(args.set_cfgs) 95 | 96 | cfg.GPU_ID = args.device_id 97 | 98 | print('Using config:') 99 | pprint.pprint(cfg) 100 | 101 | imdb = get_imdb(args.imdb_name) 102 | # load network 103 | if args.net == 'vgg16': 104 | net = vgg16() 105 | elif args.net == 'res50': 106 | net = resnetv1(num_layers=50) 107 | elif args.net == 'res101': 108 | net = resnetv1(num_layers=101) 109 | elif args.net == 'res152': 110 | net = resnetv1(num_layers=152) 111 | else: 112 | raise NotImplementedError 113 | 114 | net.create_architecture("TEST", num_classes=1, tag='pre') 115 | # read checkpoint file 116 | if args.ckpt: 117 | ckpt = 
tf.train.get_checkpoint_state(args.ckpt) 118 | else: 119 | raise ValueError("NO checkpoint found in {}".format(args.ckpt)) 120 | 121 | # set config 122 | tfconfig = tf.ConfigProto(allow_soft_placement=True) 123 | tfconfig.gpu_options.allow_growth = True 124 | 125 | # init session 126 | saver = tf.train.Saver() 127 | with tf.Session(config=tfconfig) as sess: 128 | print('Restored from {}'.format(ckpt.model_checkpoint_path)) 129 | saver.restore(sess, ckpt.model_checkpoint_path) 130 | 131 | test_net(sess, net, imdb, 132 | vis=args.vis, use_box_at=args.use_box_at) 133 | -------------------------------------------------------------------------------- /valohai.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | 3 | - step: 4 | name: preprocess data 5 | image: gcr.io/tensorflow/tensorflow:1.3.0-devel-gpu 6 | command: bash ./lib/preprocess.sh {parameters} 7 | inputs: 8 | - name: image_meta 9 | default: http://visualgenome.org/static/data/dataset/image_data.json.zip 10 | - name: regions 11 | default: http://visualgenome.org/static/data/dataset/region_descriptions.json.zip 12 | parameters: 13 | - name: vs 14 | type: float 15 | pass-as: -vs {v} 16 | default: 1.2 17 | - name: path 18 | type: string 19 | pass-as: -p {v} 20 | default: "/valohai/inputs" 21 | - name: output_dir 22 | type: string 23 | pass-as: -od {v} 24 | default: "/valohai/inputs/visual_genome" 25 | - name: max_words 26 | type: integer 27 | pass-as: -mw {v} 28 | default: 10 29 | 30 | - step: 31 | name: download image data 32 | image: gcr.io/tensorflow/tensorflow:1.3.0-devel-gpu 33 | command: bash ./lib/download_data_vh.sh 34 | inputs: 35 | - name: image_1 36 | default: https://cs.stanford.edu/people/rak248/VG_100K_2/images.zip 37 | - name: image_2 38 | default: https://cs.stanford.edu/people/rak248/VG_100K_2/images2.zip 39 | 40 | - step: 41 | name: train model 42 | image: gcr.io/tensorflow/tensorflow:1.3.0-devel-gpu 43 | command: bash ./tests/dencap_oa_test.sh {parameters} 44 | inputs: 45 | - name: vg_data 46 | default: "" 47 | - name: resnet 48 | default: https://drive.google.com/uc?export=download&confirm=aZtH&id=15PxiEp7HP-ZSBG9xHMamZr-zh8iBDeA4 49 | - name: image_1 50 | default: https://cs.stanford.edu/people/rak248/VG_100K_2/images.zip 51 | - name: image_2 52 | default: https://cs.stanford.edu/people/rak248/VG_100K_2/images2.zip 53 | parameters: 54 | - name: iters 55 | type: integer 56 | pass-as: -iters {v} 57 | default: 80000 58 | -------------------------------------------------------------------------------- /vis/README.md: -------------------------------------------------------------------------------- 1 | 2 | ### Visualization interface 3 | 4 | When you run `run_model.lua` with `-output_vis 1` (default) it will write the images and a json struct to this folder's `data/` directory. These can then be viewed with this nice html interface. 5 | 6 | For example, to evaluate a checkpoint on some VG test data: 7 | 8 | ``` 9 | th run_model.lua -checkpoint data/checkpoint.t7 -input_split test -vg_img_root_dir /path/to/visual-genome/images -max_images 10 10 | ``` 11 | 12 | and then start a webbrowser, e.g. `python -m SimpleHTTPServer` and open the `view_results.html` file! 
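For this TensorFlow port, `tools/demo.py` plays the corresponding role: it runs the trained model over the `*.jpg` images in `./data/demo/` and writes `data/results.json` into this folder for the viewer. A minimal invocation sketch (the checkpoint directory and vocabulary path are placeholders for your own setup):

```
python ./tools/demo.py --ckpt /path/to/checkpoint_dir \
                       --cfg scripts/dense_cap_config.yml \
                       --net res50 \
                       --vocab /path/to/vocabulary.txt
```

Then serve this folder, e.g. `python -m SimpleHTTPServer` (Python 2) or `python -m http.server` (Python 3), and open `view_results.html`.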
13 | -------------------------------------------------------------------------------- /vis/style.css: -------------------------------------------------------------------------------- 1 | body { 2 | color: #333; 3 | margin: 0; 4 | padding: 0; 5 | font-family: "HelveticaNeue-Light", "Helvetica Neue Light", "Helvetica Neue", Helvetica, Arial, "Lucida Grande", sans-serif; 6 | font-weight: 300; 7 | } 8 | svg { 9 | border: 1px solid black; 10 | background-color: #FFF; 11 | } 12 | hr { 13 | border: 1px solid black; 14 | } 15 | #wrap { 16 | width:800px; 17 | margin-left: auto; 18 | margin-right: auto; 19 | } 20 | #header { 21 | text-align: center; 22 | } 23 | #image_vis { 24 | background-color: #FFF; 25 | padding: 20px 0px; 26 | } 27 | #image_vis_controls { 28 | text-align: center; 29 | padding: 10px; 30 | background-color: #DDD; 31 | border: 1px solid #999; 32 | margin-bottom: 20px; 33 | } 34 | .bb { 35 | height: 50px; 36 | width: 175px; 37 | margin: 5px; 38 | } 39 | .ddesc { 40 | font-size: 32px; 41 | } 42 | .dcent { 43 | margin-left: auto; 44 | margin-right: auto; 45 | width: 720px; 46 | margin-bottom: 20px; 47 | } 48 | .djust { 49 | text-align: justify; 50 | } -------------------------------------------------------------------------------- /vis/utils.js: -------------------------------------------------------------------------------- 1 | 2 | // helper function to create HSL string from a vector of colors 3 | var renderHSL = function(hsl) { // omg 4 | var ht = Math.min(360, Math.max(0, hsl[0])); 5 | var st = Math.min(100, Math.max(0, hsl[1])); 6 | var lt = Math.min(100, Math.max(0, hsl[2])); 7 | return 'hsl(' + ht + ',' + st + '%,' + lt + '%)'; 8 | } 9 | 10 | // randomly shuffle an array 11 | function shuffle(array) { 12 | var currentIndex = array.length, temporaryValue, randomIndex ; 13 | // While there remain elements to shuffle... 14 | while (0 !== currentIndex) { 15 | // Pick a remaining element... 16 | randomIndex = Math.floor(Math.random() * currentIndex); 17 | currentIndex -= 1; 18 | // And swap it with the current element. 19 | temporaryValue = array[currentIndex]; 20 | array[currentIndex] = array[randomIndex]; 21 | array[randomIndex] = temporaryValue; 22 | } 23 | return array; 24 | } 25 | 26 | // html escaping util 27 | var entityMap = { 28 | "&": "&amp;", 29 | "<": "&lt;", 30 | ">": "&gt;", 31 | '"': '&quot;', 32 | "'": '&#39;', 33 | "/": '&#x2F;' 34 | }; 35 | function escapeHtml(string) { 36 | return String(string).replace(/[&<>"'\/]/g, function (s) { 37 | return entityMap[s]; 38 | }); 39 | } 40 | 41 | 42 | // store colors in a global var because why not 43 | var WAD_COLORS = [ 44 | "rgb(173, 35, 35)", // Red 45 | "rgb(42, 75, 215)", // Blue 46 | "rgb(87, 87, 87)", // Dark Gray 47 | "rgb(29, 105, 20)", // Green 48 | "rgb(129, 74, 25)", // Brown 49 | "rgb(129, 38, 192)", // Purple 50 | "rgb(160, 160, 160)", // Lt Gray 51 | "rgb(129, 197, 122)", // Lt green 52 | "rgb(157, 175, 255)", // Lt blue 53 | "rgb(41, 208, 208)", // Cyan 54 | "rgb(255, 146, 51)", // Orange 55 | "rgb(255, 238, 51)", // Yellow 56 | "rgb(233, 222, 187)", // Tan 57 | "rgb(255, 205, 243)", // Pink 58 | // "rgb(255, 255, 255)", // White 59 | //"rgb(0, 0, 0)", // Black 60 | ]; 61 | 62 | // ---------------------------------------------------------------------------- 63 | // visualization utils 64 | // ---------------------------------------------------------------------------- 65 | 66 | // renders a bounding box and text annotation in svg element elt.
assumes d3js 67 | function renderBox(elt, box, color, width, text) { 68 | if (typeof(width) === 'undefined') width = 1; 69 | elt.append('rect') 70 | .attr('x', box[0]) 71 | .attr('y', box[1]) 72 | .attr('width', box[2]) 73 | .attr('height', box[3]) 74 | .attr('stroke', color) 75 | .attr('fill', 'none') 76 | .attr('stroke-width', width); 77 | if (typeof(text) !== 'undefined' && text != '') { 78 | var t = elt.append('text').text(text) 79 | .attr('x', box[0]).attr('y', box[1]) 80 | .attr('dominant-baseline', 'hanging') 81 | .attr('text-anchor', 'start'); 82 | t = t[0][0]; 83 | var tbox = t.getBBox(); 84 | elt.insert('rect', 'text').attr('fill', color) 85 | .attr('x', tbox.x).attr('y', tbox.y) 86 | .attr('width', tbox.width) 87 | .attr('height', tbox.height); 88 | } 89 | } 90 |
-------------------------------------------------------------------------------- /vis/view_results.html: -------------------------------------------------------------------------------- [view_results.html's markup did not survive text extraction; the recoverable text is the page title "DenseCap results browser" and the on-page hint "Browse the results using the WSAD hotkeys (A,D: prev/next image, W/S: more/less detections)".]
--------------------------------------------------------------------------------
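Usage note (a sketch only; the checkpoint directory below is a placeholder): a model trained with the staged script above can be evaluated on the Visual Genome test split with `tools/test_net.py`, e.g.

```
python ./tools/test_net.py --ckpt /path/to/checkpoint_dir \
                           --cfg scripts/dense_cap_config.yml \
                           --imdb vg_1.2_test \
                           --net res50
```

`--ckpt` is resolved with `tf.train.get_checkpoint_state`, so it should point at a directory containing a TensorFlow checkpoint; `--vis` additionally visualizes detections, and `--use_box_at` selects which time step's predicted box is used (default: the last).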