├── lib
│   ├── db
│   │   ├── __init__.py
│   │   ├── imdb.py
│   │   ├── maskdb.py
│   │   └── roidb.py
│   ├── nms
│   │   ├── __init__.py
│   │   ├── .gitignore
│   │   ├── gpu_nms.hpp
│   │   ├── gpu_mv.hpp
│   │   ├── py_cpu_nms.py
│   │   ├── gpu_nms.pyx
│   │   ├── gpu_mv.pyx
│   │   ├── cpu_nms.pyx
│   │   ├── nms_wrapper.py
│   │   └── nms_kernel.cu
│   ├── pylayer
│   │   ├── __init__.py
│   │   ├── mask_layer.py
│   │   ├── mnc_data_layer.py
│   │   ├── proposal_target_layer.py
│   │   ├── anchor_target_layer.py
│   │   └── proposal_layer.py
│   ├── utils
│   │   ├── __init__.py
│   │   ├── unmap.py
│   │   ├── timer.py
│   │   ├── bbox.pyx
│   │   ├── blob.py
│   │   └── vis_seg.py
│   ├── caffeWrapper
│   │   ├── __init__.py
│   │   └── SolverWrapper.py
│   ├── datasets
│   │   ├── __init__.py
│   │   ├── pascal_voc.py
│   │   └── pascal_voc_seg.py
│   ├── transform
│   │   ├── __init__.py
│   │   ├── anchors.py
│   │   ├── bbox_transform.py
│   │   └── mask_transform.py
│   ├── Makefile
│   ├── setup.py
│   └── mnc_config.py
├── experiments
│   ├── logs
│   │   └── .gitignore
│   ├── cfgs
│   │   └── VGG16
│   │       ├── mnc_5stage.yml
│   │       ├── faster_rcnn_end2end.yml
│   │       └── cfm.yml
│   └── scripts
│       ├── cfm.sh
│       ├── mnc_5stage.sh
│       └── faster_rcnn_end2end.sh
├── data
│   ├── demo
│   │   ├── 2008_000533.jpg
│   │   ├── 2008_000910.jpg
│   │   ├── 2008_001602.jpg
│   │   ├── 2008_001717.jpg
│   │   └── 2008_008093.jpg
│   ├── readme_img
│   │   └── example.png
│   └── scripts
│       ├── fetch_mnc_model.sh
│       ├── fetch_mcg_data.sh
│       ├── fetch_sbd_data.sh
│       └── fetch_imagenet_models.sh
├── .gitmodules
├── models
│   └── VGG16
│       ├── cfm
│       │   ├── solver.prototxt
│       │   └── test.prototxt
│       ├── mnc_5stage
│       │   └── solver.prototxt
│       └── faster_rcnn_end2end
│           ├── solver.prototxt
│           └── test.prototxt
├── .gitignore
├── tools
│   ├── _init_paths.py
│   ├── test_net.py
│   ├── train_net.py
│   ├── demo.py
│   └── prepare_mcg_maskdb.py
├── LICENSE
└── README.md
--------------------------------------------------------------------------------
/lib/db/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/lib/nms/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/lib/pylayer/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/lib/utils/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/lib/caffeWrapper/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/lib/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/lib/transform/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/experiments/logs/.gitignore:
--------------------------------------------------------------------------------
1 | *.txt*
--------------------------------------------------------------------------------
/lib/nms/.gitignore:
--------------------------------------------------------------------------------
1 | *.c
2 | *.cpp
3 | *.so
--------------------------------------------------------------------------------
/lib/Makefile:
--------------------------------------------------------------------------------
1 | all:
2 | 	python setup.py build_ext --inplace
3 | 	rm -rf build
4 | 
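
A quick sanity check after running make in lib/ (a sketch; it uses only the
pure-Python NMS baseline from lib/nms/py_cpu_nms.py listed further below, so it
works even without the compiled Cython/CUDA extensions; run it with lib/ on
PYTHONPATH, cf. tools/_init_paths.py):

    import numpy as np
    from nms.py_cpu_nms import py_cpu_nms

    # Three detections in [x1, y1, x2, y2, score] format.
    dets = np.array([[10, 10, 50, 50, 0.9],
                     [12, 12, 48, 48, 0.8],      # near-duplicate of box 0
                     [100, 100, 150, 150, 0.7]], dtype=np.float32)
    keep = py_cpu_nms(dets, thresh=0.3)
    print(keep)  # [0, 2]: the overlapping, lower-scored box is suppressed
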
-------------------------------------------------------------------------------- /data/demo/2008_000533.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daijifeng001/MNC/HEAD/data/demo/2008_000533.jpg -------------------------------------------------------------------------------- /data/demo/2008_000910.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daijifeng001/MNC/HEAD/data/demo/2008_000910.jpg -------------------------------------------------------------------------------- /data/demo/2008_001602.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daijifeng001/MNC/HEAD/data/demo/2008_001602.jpg -------------------------------------------------------------------------------- /data/demo/2008_001717.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daijifeng001/MNC/HEAD/data/demo/2008_001717.jpg -------------------------------------------------------------------------------- /data/demo/2008_008093.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daijifeng001/MNC/HEAD/data/demo/2008_008093.jpg -------------------------------------------------------------------------------- /data/readme_img/example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daijifeng001/MNC/HEAD/data/readme_img/example.png -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "caffe-mnc"] 2 | path = caffe-mnc 3 | url = https://github.com/daijifeng001/caffe-mnc.git 4 | -------------------------------------------------------------------------------- /lib/nms/gpu_nms.hpp: -------------------------------------------------------------------------------- 1 | void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, 2 | int boxes_dim, float nms_overlap_thresh, int device_id); 3 | -------------------------------------------------------------------------------- /experiments/cfgs/VGG16/mnc_5stage.yml: -------------------------------------------------------------------------------- 1 | EXP_DIR: mnc_5stage 2 | MASK_SIZE: 21 3 | TRAIN: 4 | RPN_POST_NMS_TOP_N: 300 5 | IMS_PER_BATCH: 1 6 | BBOX_NORMALIZE_TARGETS_PRECOMPUTED: True 7 | -------------------------------------------------------------------------------- /experiments/cfgs/VGG16/faster_rcnn_end2end.yml: -------------------------------------------------------------------------------- 1 | EXP_DIR: faster_rcnn_end2end 2 | MNC_MODE: False 3 | TRAIN: 4 | MIX_INDEX: False 5 | IMS_PER_BATCH: 1 6 | BBOX_NORMALIZE_TARGETS_PRECOMPUTED: True 7 | BATCH_SIZE: 128 8 | FG_FRACTION: [0.25] 9 | BG_FRACTION: [1.0] 10 | BG_THRESH_HI: [0.5] 11 | BG_THRESH_LO: [0.1] 12 | -------------------------------------------------------------------------------- /data/scripts/fetch_mnc_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 4 | cd ${DIR} 5 | 6 | URL="https://onedrive.live.com/download?resid=571EABC0F8C2A19C!1103&authkey=!ALXduVujs-7r6Ug" 7 | 8 | echo "Downloading mnc model..." 
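# If the OneDrive link above stops resolving, mnc_model.caffemodel.h5 can be
# downloaded manually and placed in data/mnc_model/ (the directory created below).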
9 | 10 | mkdir ../mnc_model 11 | wget ${URL} -O mnc_model.caffemodel.h5 12 | 13 | mv mnc_model.caffemodel.h5 ../mnc_model/ 14 | -------------------------------------------------------------------------------- /experiments/cfgs/VGG16/cfm.yml: -------------------------------------------------------------------------------- 1 | EXP_DIR: cfm 2 | MASK_SIZE: 21 3 | CFM_MODE: True 4 | MNC_MODE: False 5 | RNG_SEED: 821 6 | TRAIN: 7 | HAS_RPN: False 8 | IMS_PER_BATCH: 1 9 | SCALES: [480, 576, 688, 864, 1024] 10 | MAX_SIZE: 1500 11 | TEST: 12 | SCALES: [480, 576, 688, 864, 1024] 13 | MAX_SIZE: 1500 14 | GROUP_SCALE: 3 15 | MAX_ROIS_GPU: [2000, 500] 16 | USE_TOP_K_MCG: 2000 17 | -------------------------------------------------------------------------------- /models/VGG16/cfm/solver.prototxt: -------------------------------------------------------------------------------- 1 | train_net: "models/VGG16/cfm/train.prototxt" 2 | base_lr: 0.001 3 | lr_policy: "step" 4 | gamma: 0.1 5 | stepsize: 20000 6 | display: 100 7 | average_loss: 100 8 | momentum: 0.9 9 | weight_decay: 0.0005 10 | 11 | # We disable standard caffe solver snapshotting and implement our own snapshot 12 | # function 13 | snapshot: 0 14 | snapshot_prefix: "cfm" 15 | iter_size: 8 16 | -------------------------------------------------------------------------------- /lib/nms/gpu_mv.hpp: -------------------------------------------------------------------------------- 1 | void _mv(const float* all_boxes, const float* all_masks, const int all_boxes_num, 2 | const int* candidate_inds, const int* candidate_start, const float* candidate_weights, const int candidate_num, 3 | const int image_height, const int image_width, const int box_dim, const int mask_size, const int result_num, 4 | float* finalize_output_mask, int* finalize_output_box, const int device_id); 5 | -------------------------------------------------------------------------------- /models/VGG16/mnc_5stage/solver.prototxt: -------------------------------------------------------------------------------- 1 | train_net: "models/VGG16/mnc_5stage/train.prototxt" 2 | base_lr: 0.001 3 | lr_policy: "step" 4 | gamma: 0.1 5 | stepsize: 20000 6 | display: 100 7 | average_loss: 100 8 | momentum: 0.9 9 | weight_decay: 0.0005 10 | # We disable standard caffe solver snapshotting and implement our own snapshot 11 | # function 12 | snapshot: 0 13 | snapshot_prefix: "vgg16_mnc_5stage" 14 | iter_size: 8 15 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files 2 | *.slo 3 | *.lo 4 | *.o 5 | *.obj 6 | 7 | # Precompiled Headers 8 | *.gch 9 | *.pch 10 | 11 | # Compiled Dynamic libraries 12 | *.so 13 | *.dylib 14 | *.dll 15 | 16 | # Fortran module files 17 | *.mod 18 | 19 | # Compiled Static libraries 20 | *.lai 21 | *.la 22 | *.a 23 | *.lib 24 | 25 | # Executables 26 | *.exe 27 | *.out 28 | *.app 29 | 30 | *.pyc 31 | *.png 32 | *.jpg 33 | *~ 34 | .idea -------------------------------------------------------------------------------- /models/VGG16/faster_rcnn_end2end/solver.prototxt: -------------------------------------------------------------------------------- 1 | train_net: "models/VGG16/faster_rcnn_end2end/train.prototxt" 2 | base_lr: 0.001 3 | lr_policy: "step" 4 | gamma: 0.1 5 | stepsize: 50000 6 | display: 20 7 | average_loss: 100 8 | # iter_size: 1 9 | momentum: 0.9 10 | weight_decay: 0.0005 11 | # We disable standard caffe solver snapshotting and 
implement our own snapshot
12 | # function
13 | snapshot: 0
14 | # We still use the snapshot prefix, though
15 | snapshot_prefix: "vgg16_faster_rcnn"
16 | iter_size: 2
17 | 
--------------------------------------------------------------------------------
/tools/_init_paths.py:
--------------------------------------------------------------------------------
1 | 
2 | import os.path
3 | import sys
4 | 
5 | """
6 | Add lib paths and caffe path to system search path
7 | """
8 | 
9 | 
10 | def add_path(path):
11 |     if path not in sys.path:
12 |         sys.path.insert(0, path)
13 | 
14 | cur_dir = os.path.dirname(__file__)
15 | 
16 | # Add caffe python to PYTHONPATH
17 | caffe_path = os.path.join(cur_dir, '..', 'caffe-mnc', 'python')
18 | add_path(caffe_path)
19 | 
20 | # Add lib to PYTHONPATH
21 | lib_path = os.path.join(cur_dir, '..', 'lib')
22 | add_path(lib_path)
--------------------------------------------------------------------------------
/data/scripts/fetch_mcg_data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
4 | cd ${DIR}
5 | 
6 | FILE=MCG-Pascal-Main_trainvaltest_2012-proposals.tgz
7 | URL=https://data.vision.ee.ethz.ch/jpont/mcg/${FILE}
8 | echo "Downloading MCG proposals data..."
9 | 
10 | wget $URL -O ${FILE}
11 | 
12 | echo "Unzipping..."
13 | 
14 | mkdir tmp
15 | tar zxvf ${FILE} -C tmp/ --strip-components=1
16 | 
17 | echo "Moving files to the target directory..."
18 | 
19 | mkdir ../MCG-raw/
20 | 
21 | mv tmp/* ../MCG-raw/
22 | 
23 | rm ${FILE}
24 | rm -r tmp
--------------------------------------------------------------------------------
/data/scripts/fetch_sbd_data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
4 | cd ${DIR}
5 | 
6 | FILE=benchmark.tgz
7 | URL=http://www.eecs.berkeley.edu/Research/Projects/CS/vision/grouping/semantic_contours/${FILE}
8 | echo "Downloading SBD data..."
9 | 
10 | wget $URL -O ${FILE}
11 | 
12 | echo "Unzipping..."
13 | 
14 | mkdir tmp
15 | tar zxvf ${FILE} -C tmp/ --strip-components=1
16 | 
17 | echo "Moving files to the target directory..."
18 | 
19 | mkdir -p ../VOCdevkitSDS/
20 | mv -v tmp/dataset/inst/ tmp/dataset/cls/ tmp/dataset/img/ -t ../VOCdevkitSDS/
21 | 
22 | rm benchmark.tgz
23 | rm -r tmp
--------------------------------------------------------------------------------
/data/scripts/fetch_imagenet_models.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
4 | cd ${DIR}
5 | 
6 | URL="https://onedrive.live.com/download?resid=F371D9563727B96F!91967&authkey=!AKjrYZBFAfb6JBQ"
7 | 
8 | echo "Downloading VGG16.mask.caffemodel model..."
9 | 
10 | mkdir ../imagenet_models/
11 | wget ${URL} -O VGG16.mask.caffemodel
12 | 
13 | mv VGG16.mask.caffemodel ../imagenet_models/
14 | 
15 | URL="https://onedrive.live.com/download?resid=F371D9563727B96F!91966&authkey=!ABoH69DkSk81FwA"
16 | 
17 | echo "Downloading VGG16.v2.caffemodel model..."
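# Note: VGG16.mask.caffemodel (above) is the initialization used by the CFM and
# MNC experiment scripts, while VGG16.v2.caffemodel (below) initializes
# faster_rcnn_end2end (see the NET_INIT lines in experiments/scripts/*.sh).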
18 | 
19 | wget ${URL} -O VGG16.v2.caffemodel
20 | 
21 | mv VGG16.v2.caffemodel ../imagenet_models/
22 | 
--------------------------------------------------------------------------------
/lib/utils/unmap.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Multitask Network Cascade
3 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn)
4 | # Copyright (c) 2016, Haozhi Qi
5 | # Licensed under The MIT License [see LICENSE for details]
6 | # --------------------------------------------------------
7 | 
8 | import numpy as np
9 | 
10 | 
11 | def unmap(data, count, inds, fill=0):
12 |     """ Unmap a subset of items (data) back to the original set of items (of
13 |     size count) """
14 |     if len(data.shape) == 1:
15 |         ret = np.empty((count, ), dtype=np.float32)
16 |         ret.fill(fill)
17 |         ret[inds] = data
18 |     else:
19 |         ret = np.empty((count, ) + data.shape[1:], dtype=np.float32)
20 |         ret.fill(fill)
21 |         ret[inds, :] = data
22 |     return ret
--------------------------------------------------------------------------------
/lib/db/imdb.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Multitask Network Cascade
3 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn)
4 | # Copyright (c) 2016, Haozhi Qi
5 | # Licensed under The MIT License [see LICENSE for details]
6 | # --------------------------------------------------------
7 | 
8 | from datasets.pascal_voc_det import PascalVOCDet
9 | from datasets.pascal_voc_seg import PascalVOCSeg
10 | 
11 | __sets = {
12 |     'voc_2012_seg_train': (lambda: PascalVOCSeg('train', '2012', 'data/VOCdevkitSDS/')),
13 |     'voc_2012_seg_val': (lambda: PascalVOCSeg('val', '2012', 'data/VOCdevkitSDS/')),
14 |     'voc_2007_trainval': (lambda: PascalVOCDet('trainval', '2007')),
15 |     'voc_2007_test': (lambda: PascalVOCDet('test', '2007'))
16 | }
17 | 
18 | 
19 | def get_imdb(name):
20 |     """ Get an imdb (image database) by name.
21 |     """
22 |     if name not in __sets:
23 |         raise KeyError('Unknown dataset: {}'.format(name))
24 |     return __sets[name]()
25 | 
26 | 
27 | def list_imdbs():
28 |     return __sets.keys()
29 | 
--------------------------------------------------------------------------------
/lib/utils/timer.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Multitask Network Cascade
3 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn)
4 | # Copyright (c) 2016, Haozhi Qi
5 | # Licensed under The MIT License [see LICENSE for details]
6 | # --------------------------------------------------------
7 | 
8 | import time
9 | 
10 | 
11 | class Timer(object):
12 |     """A simple timer."""
13 |     def __init__(self):
14 |         self.total_time = 0.
15 |         self.calls = 0
16 |         self.start_time = 0.
17 |         self.diff = 0.
18 |         self.average_time = 0.
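    # Example usage (illustration only, not used elsewhere in the repo):
    #   t = Timer()
    #   t.tic()
    #   ...timed section...
    #   t.toc()                 # returns the running average over all calls
    #   t.toc(average=False)    # returns the most recent interval instead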
19 | 
20 |     def tic(self):
21 |         # using time.time instead of time.clock because time.clock
22 |         # does not normalize for multithreading
23 |         self.start_time = time.time()
24 | 
25 |     def toc(self, average=True):
26 |         self.diff = time.time() - self.start_time
27 |         self.total_time += self.diff
28 |         self.calls += 1
29 |         self.average_time = self.total_time / self.calls
30 |         if average:
31 |             return self.average_time
32 |         else:
33 |             return self.diff
34 | 
--------------------------------------------------------------------------------
/lib/nms/py_cpu_nms.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Multitask Network Cascade
3 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn)
4 | # Copyright (c) 2016, Haozhi Qi
5 | # Licensed under The MIT License [see LICENSE for details]
6 | # --------------------------------------------------------
7 | 
8 | import numpy as np
9 | 
10 | def py_cpu_nms(dets, thresh):
11 |     """Pure Python NMS baseline."""
12 |     x1 = dets[:, 0]
13 |     y1 = dets[:, 1]
14 |     x2 = dets[:, 2]
15 |     y2 = dets[:, 3]
16 |     scores = dets[:, 4]
17 | 
18 |     areas = (x2 - x1 + 1) * (y2 - y1 + 1)
19 |     order = scores.argsort()[::-1]
20 | 
21 |     keep = []
22 |     while order.size > 0:
23 |         i = order[0]
24 |         keep.append(i)
25 |         xx1 = np.maximum(x1[i], x1[order[1:]])
26 |         yy1 = np.maximum(y1[i], y1[order[1:]])
27 |         xx2 = np.minimum(x2[i], x2[order[1:]])
28 |         yy2 = np.minimum(y2[i], y2[order[1:]])
29 | 
30 |         w = np.maximum(0.0, xx2 - xx1 + 1)
31 |         h = np.maximum(0.0, yy2 - yy1 + 1)
32 |         inter = w * h
33 |         ovr = inter / (areas[i] + areas[order[1:]] - inter)
34 | 
35 |         inds = np.where(ovr <= thresh)[0]
36 |         order = order[inds + 1]
37 | 
38 |     return keep
39 | 
--------------------------------------------------------------------------------
/lib/nms/gpu_nms.pyx:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Multitask Network Cascade
3 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn)
4 | # Copyright (c) 2016, Haozhi Qi
5 | # Licensed under The MIT License [see LICENSE for details]
6 | # --------------------------------------------------------
7 | 
8 | import numpy as np
9 | cimport numpy as np
10 | 
11 | assert sizeof(int) == sizeof(np.int32_t)
12 | 
13 | cdef extern from "gpu_nms.hpp":
14 |     void _nms(np.int32_t*, int*, np.float32_t*, int, int, float, int)
15 | 
16 | def gpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh,
17 |             np.int32_t device_id=0):
18 |     cdef int boxes_num = dets.shape[0]
19 |     cdef int boxes_dim = dets.shape[1]
20 |     cdef int num_out
21 |     cdef np.ndarray[np.int32_t, ndim=1] \
22 |         keep = np.zeros(boxes_num, dtype=np.int32)
23 |     cdef np.ndarray[np.float32_t, ndim=1] \
24 |         scores = dets[:, 4]
25 |     cdef np.ndarray[np.int_t, ndim=1] \
26 |         order = scores.argsort()[::-1]
27 |     cdef np.ndarray[np.float32_t, ndim=2] \
28 |         sorted_dets = dets[order, :]
29 |     _nms(&keep[0], &num_out, &sorted_dets[0, 0], boxes_num, boxes_dim, thresh, device_id)
30 |     keep = keep[:num_out]
31 |     return list(order[keep])
32 | 
--------------------------------------------------------------------------------
/lib/db/maskdb.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Multitask Network Cascade
3 | # Written by Haozhi Qi
4 | # Copyright (c) 2016, Haozhi Qi
5 | # Licensed under The MIT License [see LICENSE for details]
6 | # --------------------------------------------------------
7 | 
8 | from mnc_config import cfg
9 | from db.imdb import get_imdb
10 | 
11 | 
12 | def get_maskdb(imdb_name):
13 | 
14 |     imdb = get_imdb(imdb_name)
15 |     print 'Loaded dataset `{:s}` for training'.format(imdb.name)
16 |     # Set the roidb/maskdb handler functions here (e.g. gt_roidb in Faster R-CNN)
17 |     imdb.set_roi_handler(cfg.TRAIN.PROPOSAL_METHOD)
18 |     imdb.set_mask_handler(cfg.TRAIN.PROPOSAL_METHOD)
19 |     print 'Set proposal method: {:s}'.format(cfg.TRAIN.PROPOSAL_METHOD)
20 |     if cfg.TRAIN.USE_FLIPPED:
21 |         print 'Appending horizontally-flipped training examples...'
22 |         imdb.append_flipped_masks()
23 |         print 'done'
24 |     return imdb.maskdb
25 | 
26 | 
27 | def attach_maskdb(imdb_names):
28 |     """
29 |     Only a single maskdb is supported for now.
30 |     """
31 |     maskdbs = [get_maskdb(s) for s in imdb_names.split('+')]
32 |     maskdb = maskdbs[0]
33 |     if len(maskdbs) > 1:
34 |         raise NotImplementedError
35 |     else:
36 |         imdb = get_imdb(imdb_names)
37 |     return imdb, maskdb
38 | 
--------------------------------------------------------------------------------
/experiments/scripts/cfm.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Usage:
3 | # ./experiments/scripts/cfm.sh GPU NET [--set ...]
4 | # Example:
5 | # ./experiments/scripts/cfm.sh 0 VGG16 \
6 | #   --set EXP_DIR foobar RNG_SEED 42 TRAIN.SCALES "[400,500,600,700]"
7 | 
8 | set -x
9 | set -e
10 | 
11 | export PYTHONUNBUFFERED="True"
12 | 
13 | GPU_ID=$1
14 | NET=$2
15 | NET_lc=${NET,,}
16 | ITERS=30000
17 | DATASET_TRAIN=voc_2012_seg_train
18 | DATASET_TEST=voc_2012_seg_val
19 | array=( $@ )
20 | len=${#array[@]}
21 | EXTRA_ARGS=${array[@]:2:$len}
22 | EXTRA_ARGS_SLUG=${EXTRA_ARGS// /_}
23 | 
24 | LOG="experiments/logs/cfm_${NET}_${EXTRA_ARGS_SLUG}.txt.`date +'%Y-%m-%d_%H-%M-%S'`"
25 | exec &> >(tee -a "$LOG")
26 | echo Logging output to "$LOG"
27 | 
28 | NET_INIT=data/imagenet_models/${NET}.mask.caffemodel
29 | time ./tools/train_net.py --gpu ${GPU_ID} \
30 |   --solver models/${NET}/cfm/solver.prototxt \
31 |   --weights ${NET_INIT} \
32 |   --imdb ${DATASET_TRAIN} \
33 |   --iters ${ITERS} \
34 |   --cfg experiments/cfgs/${NET}/cfm.yml \
35 |   ${EXTRA_ARGS}
36 | 
37 | set +x
38 | NET_FINAL=`grep -B 1 "done solving" ${LOG} | grep "Wrote snapshot" | awk '{print $4}'`
39 | set -x
40 | 
41 | time ./tools/test_net.py --gpu ${GPU_ID} \
42 |   --def models/${NET}/cfm/test.prototxt \
43 |   --net ${NET_FINAL} \
44 |   --imdb ${DATASET_TEST} \
45 |   --cfg experiments/cfgs/${NET}/cfm.yml \
46 |   --task cfm
47 | 
--------------------------------------------------------------------------------
/experiments/scripts/mnc_5stage.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Usage:
3 | # ./experiments/scripts/mnc_5stage.sh GPU NET [--set ...]
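# (GPU is a CUDA device id; NET names a subdirectory of models/ and
#  experiments/cfgs/, e.g. VGG16.)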
4 | # Example:
5 | # ./experiments/scripts/mnc_5stage.sh 0 VGG16 \
6 | #   --set EXP_DIR foobar RNG_SEED 42 TRAIN.SCALES "[400,500,600,700]"
7 | 
8 | set -x
9 | set -e
10 | 
11 | export PYTHONUNBUFFERED="True"
12 | 
13 | GPU_ID=$1
14 | NET=$2
15 | NET_lc=${NET,,}
16 | ITERS=25000
17 | DATASET_TRAIN=voc_2012_seg_train
18 | DATASET_TEST=voc_2012_seg_val
19 | array=( $@ )
20 | len=${#array[@]}
21 | EXTRA_ARGS=${array[@]:2:$len}
22 | EXTRA_ARGS_SLUG=${EXTRA_ARGS// /_}
23 | 
24 | LOG="experiments/logs/mnc_5stage_${NET}_${EXTRA_ARGS_SLUG}.txt.`date +'%Y-%m-%d_%H-%M-%S'`"
25 | exec &> >(tee -a "$LOG")
26 | echo Logging output to "$LOG"
27 | 
28 | NET_INIT=data/imagenet_models/${NET}.mask.caffemodel
29 | time ./tools/train_net.py --gpu ${GPU_ID} \
30 |   --solver models/${NET}/mnc_5stage/solver.prototxt \
31 |   --weights ${NET_INIT} \
32 |   --imdb ${DATASET_TRAIN} \
33 |   --iters ${ITERS} \
34 |   --cfg experiments/cfgs/${NET}/mnc_5stage.yml \
35 |   ${EXTRA_ARGS}
36 | 
37 | set +x
38 | NET_FINAL=`grep -B 1 "done solving" ${LOG} | grep "Wrote snapshot" | awk '{print $4}'`
39 | set -x
40 | 
41 | time ./tools/test_net.py --gpu ${GPU_ID} \
42 |   --def models/${NET}/mnc_5stage/test.prototxt \
43 |   --net ${NET_FINAL} \
44 |   --imdb ${DATASET_TEST} \
45 |   --cfg experiments/cfgs/${NET}/mnc_5stage.yml \
46 |   --task seg
47 | 
48 | 
--------------------------------------------------------------------------------
/experiments/scripts/faster_rcnn_end2end.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Usage:
3 | # ./experiments/scripts/faster_rcnn_end2end.sh GPU NET [--set ...]
4 | # Example:
5 | # ./experiments/scripts/faster_rcnn_end2end.sh 0 VGG16 \
6 | #   --set EXP_DIR foobar RNG_SEED 42 TRAIN.SCALES "[400,500,600,700]"
7 | 
8 | set -x
9 | set -e
10 | 
11 | export PYTHONUNBUFFERED="True"
12 | 
13 | GPU_ID=$1
14 | NET=$2
15 | NET_lc=${NET,,}
16 | ITERS=70000
17 | DATASET_TRAIN=voc_2007_trainval
18 | DATASET_TEST=voc_2007_test
19 | 
20 | array=( $@ )
21 | len=${#array[@]}
22 | EXTRA_ARGS=${array[@]:2:$len}
23 | EXTRA_ARGS_SLUG=${EXTRA_ARGS// /_}
24 | 
25 | LOG="experiments/logs/faster_rcnn_end2end_${NET}_${EXTRA_ARGS_SLUG}.txt.`date +'%Y-%m-%d_%H-%M-%S'`"
26 | exec &> >(tee -a "$LOG")
27 | echo Logging output to "$LOG"
28 | 
29 | NET_INIT=data/imagenet_models/${NET}.v2.caffemodel
30 | time ./tools/train_net.py --gpu ${GPU_ID} \
31 |   --solver models/${NET}/faster_rcnn_end2end/solver.prototxt \
32 |   --weights ${NET_INIT} \
33 |   --imdb ${DATASET_TRAIN} \
34 |   --iters ${ITERS} \
35 |   --cfg experiments/cfgs/${NET}/faster_rcnn_end2end.yml \
36 |   ${EXTRA_ARGS}
37 | 
38 | set +x
39 | NET_FINAL=`grep -B 1 "done solving" ${LOG} | grep "Wrote snapshot" | awk '{print $4}'`
40 | set -x
41 | 
42 | time ./tools/test_net.py --gpu ${GPU_ID} \
43 |   --def models/${NET}/faster_rcnn_end2end/test.prototxt \
44 |   --net ${NET_FINAL} \
45 |   --imdb ${DATASET_TEST} \
46 |   --cfg experiments/cfgs/${NET}/faster_rcnn_end2end.yml \
47 |   --task det \
48 |   ${EXTRA_ARGS}
49 | 
--------------------------------------------------------------------------------
/lib/nms/gpu_mv.pyx:
--------------------------------------------------------------------------------
1 | 
2 | import numpy as np
3 | cimport numpy as np
4 | 
5 | assert sizeof(int) == sizeof(np.int32_t)
6 | 
7 | cdef extern from "gpu_mv.hpp":
8 |     void _mv(np.float32_t* all_boxes, np.float32_t* all_masks, np.int32_t all_boxes_num, np.int32_t* candidate_inds, np.int32_t* candidate_start, np.float32_t* candidate_weights, np.int32_t candidate_num, np.int32_t
image_height, np.int32_t image_width, np.int32_t box_dim, np.int32_t mask_size, np.int32_t result_num, np.float32_t* result_mask, np.int32_t* result_box, np.int32_t device_id); 9 | 10 | # boxes: n * 4 11 | # masks: n * 1 * 21 * 21 12 | # scores: n * 21 13 | def mv(np.ndarray[np.float32_t, ndim=2] all_boxes, 14 | np.ndarray[np.float32_t, ndim=4] all_masks, 15 | np.ndarray[np.int32_t, ndim=1] candidate_inds, 16 | np.ndarray[np.int32_t, ndim=1] candidate_start, 17 | np.ndarray[np.float32_t, ndim=1] candidate_weights, 18 | np.int32_t image_height, 19 | np.int32_t image_width, 20 | np.int32_t device_id = 0): 21 | cdef int all_box_num = all_boxes.shape[0] 22 | cdef int boxes_dim = all_boxes.shape[1] 23 | cdef int mask_size = all_masks.shape[3] 24 | cdef int candidate_num = candidate_inds.shape[0] 25 | cdef int result_num = candidate_start.shape[0] 26 | cdef np.ndarray[np.float32_t, ndim=4] \ 27 | result_mask = np.zeros((result_num, 1, all_masks.shape[2], all_masks.shape[3]), dtype=np.float32) 28 | cdef np.ndarray[np.int32_t, ndim=2] \ 29 | result_box = np.zeros((result_num, boxes_dim), dtype=np.int32) 30 | _mv(&all_boxes[0, 0], &all_masks[0, 0, 0, 0], all_box_num, &candidate_inds[0], &candidate_start[0], &candidate_weights[0], candidate_num, image_height, image_width, boxes_dim, mask_size, candidate_start.shape[0], &result_mask[0,0,0,0], &result_box[0,0], device_id) 31 | return result_mask, result_box 32 | -------------------------------------------------------------------------------- /lib/utils/bbox.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Multitask Network Cascade 3 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn) 4 | # Copyright (c) 2016, Haozhi Qi 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # -------------------------------------------------------- 7 | 8 | cimport cython 9 | import numpy as np 10 | cimport numpy as np 11 | 12 | DTYPE = np.float 13 | ctypedef np.float_t DTYPE_t 14 | 15 | def bbox_overlaps( 16 | np.ndarray[DTYPE_t, ndim=2] boxes, 17 | np.ndarray[DTYPE_t, ndim=2] query_boxes): 18 | """ 19 | Parameters 20 | ---------- 21 | boxes: (N, 4) ndarray of float 22 | query_boxes: (K, 4) ndarray of float 23 | Returns 24 | ------- 25 | overlaps: (N, K) ndarray of overlap between boxes and query_boxes 26 | """ 27 | cdef unsigned int N = boxes.shape[0] 28 | cdef unsigned int K = query_boxes.shape[0] 29 | cdef np.ndarray[DTYPE_t, ndim=2] overlaps = np.zeros((N, K), dtype=DTYPE) 30 | cdef DTYPE_t iw, ih, box_area 31 | cdef DTYPE_t ua 32 | cdef unsigned int k, n 33 | for k in range(K): 34 | box_area = ( 35 | (query_boxes[k, 2] - query_boxes[k, 0] + 1) * 36 | (query_boxes[k, 3] - query_boxes[k, 1] + 1) 37 | ) 38 | for n in range(N): 39 | iw = ( 40 | min(boxes[n, 2], query_boxes[k, 2]) - 41 | max(boxes[n, 0], query_boxes[k, 0]) + 1 42 | ) 43 | if iw > 0: 44 | ih = ( 45 | min(boxes[n, 3], query_boxes[k, 3]) - 46 | max(boxes[n, 1], query_boxes[k, 1]) + 1 47 | ) 48 | if ih > 0: 49 | ua = float( 50 | (boxes[n, 2] - boxes[n, 0] + 1) * 51 | (boxes[n, 3] - boxes[n, 1] + 1) + 52 | box_area - iw * ih 53 | ) 54 | overlaps[n, k] = iw * ih / ua 55 | return overlaps 56 | -------------------------------------------------------------------------------- /lib/nms/cpu_nms.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Multitask Network 
Cascade 3 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn) 4 | # Copyright (c) 2016, Haozhi Qi 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | cimport numpy as np 10 | 11 | cdef inline np.float32_t max(np.float32_t a, np.float32_t b): 12 | return a if a >= b else b 13 | 14 | cdef inline np.float32_t min(np.float32_t a, np.float32_t b): 15 | return a if a <= b else b 16 | 17 | def cpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh): 18 | cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0] 19 | cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1] 20 | cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2] 21 | cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3] 22 | cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4] 23 | 24 | cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1) 25 | cdef np.ndarray[np.int_t, ndim=1] order = scores.argsort()[::-1] 26 | 27 | cdef int ndets = dets.shape[0] 28 | cdef np.ndarray[np.int_t, ndim=1] suppressed = \ 29 | np.zeros((ndets), dtype=np.int) 30 | 31 | # nominal indices 32 | cdef int _i, _j 33 | # sorted indices 34 | cdef int i, j 35 | # temp variables for box i's (the box currently under consideration) 36 | cdef np.float32_t ix1, iy1, ix2, iy2, iarea 37 | # variables for computing overlap with box j (lower scoring box) 38 | cdef np.float32_t xx1, yy1, xx2, yy2 39 | cdef np.float32_t w, h 40 | cdef np.float32_t inter, ovr 41 | 42 | keep = [] 43 | for _i in range(ndets): 44 | i = order[_i] 45 | if suppressed[i] == 1: 46 | continue 47 | keep.append(i) 48 | ix1 = x1[i] 49 | iy1 = y1[i] 50 | ix2 = x2[i] 51 | iy2 = y2[i] 52 | iarea = areas[i] 53 | for _j in range(_i + 1, ndets): 54 | j = order[_j] 55 | if suppressed[j] == 1: 56 | continue 57 | xx1 = max(ix1, x1[j]) 58 | yy1 = max(iy1, y1[j]) 59 | xx2 = min(ix2, x2[j]) 60 | yy2 = min(iy2, y2[j]) 61 | w = max(0.0, xx2 - xx1 + 1) 62 | h = max(0.0, yy2 - yy1 + 1) 63 | inter = w * h 64 | ovr = inter / (iarea + areas[j] - inter) 65 | if ovr >= thresh: 66 | suppressed[j] = 1 67 | 68 | return keep 69 | -------------------------------------------------------------------------------- /lib/nms/nms_wrapper.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Multitask Network Cascade 3 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn) 4 | # Copyright (c) 2016, Haozhi Qi 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # -------------------------------------------------------- 7 | 8 | from mnc_config import cfg 9 | from gpu_nms import gpu_nms 10 | from cpu_nms import cpu_nms 11 | 12 | 13 | def nms(dets, thresh): 14 | """Dispatch to either CPU or GPU NMS implementations.""" 15 | 16 | if dets.shape[0] == 0: 17 | return [] 18 | if cfg.USE_GPU_NMS: 19 | return gpu_nms(dets, thresh, device_id=cfg.GPU_ID) 20 | else: 21 | return cpu_nms(dets, thresh) 22 | 23 | 24 | def apply_nms(all_boxes, thresh): 25 | """Apply non-maximum suppression to all predicted boxes output by the 26 | test_net method. 
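    all_boxes is indexed as all_boxes[class][image]; each entry is assumed to
    be an (N, 5) array of [x1, y1, x2, y2, score] rows, matching the column
    layout used by the NMS routines above.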
27 | """ 28 | num_classes = len(all_boxes) 29 | num_images = len(all_boxes[0]) 30 | nms_boxes = [[[] for _ in xrange(num_images)] 31 | for _ in xrange(num_classes)] 32 | for cls_ind in xrange(num_classes): 33 | for im_ind in xrange(num_images): 34 | dets = all_boxes[cls_ind][im_ind] 35 | if dets == []: 36 | continue 37 | keep = nms(dets, thresh) 38 | if len(keep) == 0: 39 | continue 40 | nms_boxes[cls_ind][im_ind] = dets[keep, :].copy() 41 | return nms_boxes 42 | 43 | 44 | def apply_nms_mask(all_boxes, all_masks, thresh): 45 | num_classes = len(all_boxes) 46 | num_images = len(all_boxes[0]) 47 | nms_boxes = [[[] for _ in xrange(num_images)] 48 | for _ in xrange(num_classes)] 49 | nms_masks = [[[] for _ in xrange(num_images)] 50 | for _ in xrange(num_classes)] 51 | for cls_ind in xrange(num_classes): 52 | for im_ind in xrange(num_images): 53 | dets = all_boxes[cls_ind][im_ind] 54 | masks = all_masks[cls_ind][im_ind] 55 | if dets == []: 56 | continue 57 | keep = nms(dets, thresh) 58 | if len(keep) == 0: 59 | continue 60 | nms_boxes[cls_ind][im_ind] = dets[keep, :].copy() 61 | nms_masks[cls_ind][im_ind] = masks[keep, :].copy() 62 | return nms_boxes, nms_masks 63 | 64 | 65 | def apply_nms_mask_single(box, mask, thresh): 66 | if box == []: 67 | return box, mask 68 | keep = nms(box, thresh) 69 | if len(keep) == 0: 70 | return box, mask 71 | return box[keep, :].copy(), mask[keep, :].copy() 72 | -------------------------------------------------------------------------------- /tools/test_net.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # -------------------------------------------------------- 4 | # Multitask Network Cascade 5 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn) 6 | # Copyright (c) 2016, Haozhi Qi 7 | # Licensed under The MIT License [see LICENSE for details] 8 | # -------------------------------------------------------- 9 | 10 | # Standard module 11 | import argparse 12 | import sys 13 | import os 14 | import time 15 | import pprint 16 | # User-defined module 17 | import _init_paths 18 | import caffe 19 | from mnc_config import cfg, cfg_from_file 20 | from db.imdb import get_imdb 21 | from caffeWrapper.TesterWrapper import TesterWrapper 22 | 23 | 24 | def parse_args(): 25 | """ 26 | Parse input arguments 27 | """ 28 | parser = argparse.ArgumentParser(description='Test a Fast R-CNN network') 29 | parser.add_argument('--gpu', dest='gpu_id', help='GPU id to use', 30 | default=0, type=int) 31 | parser.add_argument('--def', dest='prototxt', 32 | help='prototxt file defining the network', 33 | default=None, type=str) 34 | parser.add_argument('--net', dest='caffemodel', 35 | help='model to test', 36 | default=None, type=str) 37 | parser.add_argument('--cfg', dest='cfg_file', 38 | help='optional config file', default=None, type=str) 39 | parser.add_argument('--imdb', dest='imdb_name', 40 | help='dataset to test', 41 | default='voc_2007_test', type=str) 42 | parser.add_argument('--wait', dest='wait', 43 | help='wait until net file exists', 44 | default=True, type=bool) 45 | parser.add_argument('--comp', dest='comp_mode', help='competition mode', 46 | action='store_true') 47 | parser.add_argument('--set', dest='set_cfgs', 48 | help='set config keys', default=None, 49 | nargs=argparse.REMAINDER) 50 | parser.add_argument('--task', dest='task_name', 51 | help='set task name', default='sds', 52 | type=str) 53 | 54 | if len(sys.argv) == 1: 55 | parser.print_help() 56 | sys.exit(1) 57 | 58 | 
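    # Example invocation (mirroring experiments/scripts/mnc_5stage.sh):
    #   ./tools/test_net.py --gpu 0 \
    #       --def models/VGG16/mnc_5stage/test.prototxt \
    #       --net <trained .caffemodel> \
    #       --imdb voc_2012_seg_val --task seg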
    return parser.parse_args()
59 | 
60 | 
61 | if __name__ == '__main__':
62 |     args = parse_args()
63 | 
64 |     print('Called with args:')
65 |     print(args)
66 | 
67 |     if args.cfg_file is not None:
68 |         cfg_from_file(args.cfg_file)
69 | 
70 |     cfg.GPU_ID = args.gpu_id
71 | 
72 |     print('Using config:')
73 |     pprint.pprint(cfg)
74 | 
75 |     while not os.path.exists(args.caffemodel) and args.wait:
76 |         print('Waiting for {} to exist...'.format(args.caffemodel))
77 |         time.sleep(10)
78 | 
79 |     caffe.set_mode_gpu()
80 |     caffe.set_device(args.gpu_id)
81 | 
82 |     imdb = get_imdb(args.imdb_name)
83 |     _tester = TesterWrapper(args.prototxt, imdb, args.caffemodel, args.task_name)
84 |     _tester.get_result()
85 | 
--------------------------------------------------------------------------------
/tools/train_net.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | # --------------------------------------------------------
4 | # Multitask Network Cascade
5 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn)
6 | # Copyright (c) 2016, Haozhi Qi
7 | # Licensed under The MIT License [see LICENSE for details]
8 | # --------------------------------------------------------
9 | 
10 | # Standard module
11 | import argparse
12 | import sys
13 | import pprint
14 | import numpy as np
15 | # User-defined module
16 | import _init_paths
17 | from mnc_config import cfg, cfg_from_file, get_output_dir  # config mnc
18 | from db.roidb import attach_roidb
19 | from db.maskdb import attach_maskdb
20 | from caffeWrapper.SolverWrapper import SolverWrapper
21 | import caffe
22 | 
23 | 
24 | def parse_args():
25 |     """ Parse input arguments
26 |     """
27 |     parser = argparse.ArgumentParser(description='Train a Fast R-CNN network')
28 |     parser.add_argument('--gpu', dest='gpu_id',
29 |                         help='GPU device id to use [0]',
30 |                         default=0, type=int)
31 |     parser.add_argument('--solver', dest='solver',
32 |                         help='solver prototxt',
33 |                         default=None, type=str)
34 |     parser.add_argument('--iters', dest='max_iters',
35 |                         help='number of iterations to train',
36 |                         default=40000, type=int)
37 |     parser.add_argument('--weights', dest='pretrained_model',
38 |                         help='initialize with pretrained model weights',
39 |                         default=None, type=str)
40 |     parser.add_argument('--cfg', dest='cfg_file',
41 |                         help='optional config file',
42 |                         default=None, type=str)
43 |     parser.add_argument('--imdb', dest='imdb_name',
44 |                         help='dataset to train on',
45 |                         default='voc_2007_trainval', type=str)
46 |     parser.add_argument('--rand', dest='randomize',
47 |                         help='randomize (do not use a fixed seed)',
48 |                         action='store_true')
49 |     parser.add_argument('--set', dest='set_cfgs',
50 |                         help='set config keys', default=None,
51 |                         nargs=argparse.REMAINDER)
52 | 
53 |     if len(sys.argv) == 1:
54 |         parser.print_help()
55 |         sys.exit(1)
56 | 
57 |     return parser.parse_args()
58 | 
59 | 
60 | if __name__ == '__main__':
61 |     args = parse_args()
62 |     print('Called with args:')
63 |     print(args)
64 | 
65 |     if args.cfg_file is not None:
66 |         cfg_from_file(args.cfg_file)
67 | 
68 |     cfg.GPU_ID = args.gpu_id
69 |     print('Using config:')
70 |     pprint.pprint(cfg)
71 | 
72 |     caffe.set_mode_gpu()
73 |     caffe.set_device(args.gpu_id)
74 | 
75 |     if not args.randomize:
76 |         # fix the random seeds (numpy and caffe) for reproducibility
77 |         np.random.seed(cfg.RNG_SEED)
78 |         caffe.set_random_seed(cfg.RNG_SEED)
79 | 
80 |     # get imdb and roidb from specified imdb_name
81 |     imdb, roidb = attach_roidb(args.imdb_name)
82 |     # Faster RCNN doesn't need a maskdb
83 |     if
cfg.MNC_MODE or cfg.CFM_MODE: 84 | imdb, maskdb = attach_maskdb(args.imdb_name) 85 | else: 86 | maskdb = None 87 | print '{:d} roidb entries'.format(len(roidb)) 88 | 89 | output_dir = get_output_dir(imdb, None) 90 | print 'Output will be saved to `{:s}`'.format(output_dir) 91 | 92 | _solver = SolverWrapper(args.solver, roidb, maskdb, output_dir, imdb, 93 | pretrained_model=args.pretrained_model) 94 | 95 | print 'Solving...' 96 | _solver.train_model(args.max_iters) 97 | print 'done solving' 98 | 99 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Faster R-CNN 2 | 3 | The MIT License (MIT) 4 | 5 | Copyright (c) 2015 Microsoft Corporation 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in 15 | all copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 23 | THE SOFTWARE. 24 | 25 | ************************************************************************ 26 | 27 | THIRD-PARTY SOFTWARE NOTICES AND INFORMATION 28 | 29 | This project, Faster R-CNN, incorporates material from the project(s) listed below (collectively, "Third Party Code"). Microsoft is not the original author of the Third Party Code. The original copyright notice and license under which Microsoft received such Third Party Code are set out below. This Third Party Code is licensed to you under their original license terms set forth below. Microsoft reserves all other rights not expressly granted, whether by implication, estoppel or otherwise. 30 | 31 | 1. Caffe, version 0.9, (https://github.com/BVLC/caffe/) 32 | 33 | COPYRIGHT 34 | 35 | All contributions by the University of California: 36 | Copyright (c) 2014, 2015, The Regents of the University of California (Regents) 37 | All rights reserved. 38 | 39 | All other contributions: 40 | Copyright (c) 2014, 2015, the respective contributors 41 | All rights reserved. 42 | 43 | Caffe uses a shared copyright model: each contributor holds copyright over their contributions to Caffe. The project versioning records all such contribution and copyright details. If a contributor wants to further mark their specific copyright on a particular contribution, they should indicate their copyright solely in the commit message of the change when it is committed. 44 | 45 | The BSD 2-Clause License 46 | 47 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 48 | 49 | 1. 
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
50 | 
51 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
52 | 
53 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
54 | 
55 | ************END OF THIRD-PARTY SOFTWARE NOTICES AND INFORMATION**********
56 | 
57 | 
--------------------------------------------------------------------------------
/lib/datasets/pascal_voc.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Multitask Network Cascade
3 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn)
4 | # Copyright (c) 2016, Haozhi Qi
5 | # Licensed under The MIT License [see LICENSE for details]
6 | # --------------------------------------------------------
7 | 
8 | import os
9 | import numpy as np
10 | import scipy.sparse
11 | from mnc_config import cfg
12 | 
13 | 
14 | class PascalVOC(object):
15 |     """ A base class for image database."""
16 |     def __init__(self, name):
17 |         self._name = name
18 |         self._num_classes = 0
19 |         self._classes = []
20 |         self._image_index = []
21 |         self._obj_proposer = 'selective_search'
22 |         self._roidb = None
23 |         self._roidb_handler = self.default_roidb
24 |         self._maskdb = None
25 |         self._maskdb_handler = self.default_maskdb
26 |         # Use this dict for storing dataset specific config options
27 |         self.config = {}
28 | 
29 |     @property
30 |     def name(self):
31 |         return self._name
32 | 
33 |     @property
34 |     def num_classes(self):
35 |         return len(self._classes)
36 | 
37 |     @property
38 |     def classes(self):
39 |         return self._classes
40 | 
41 |     @property
42 |     def image_index(self):
43 |         return self._image_index
44 | 
45 |     @property
46 |     def roidb_handler(self):
47 |         return self._roidb_handler
48 | 
49 |     @roidb_handler.setter
50 |     def roidb_handler(self, val):
51 |         self._roidb_handler = val
52 | 
53 |     @property
54 |     def maskdb_handler(self):
55 |         return self._maskdb_handler
56 | 
57 |     @maskdb_handler.setter
58 |     def maskdb_handler(self, val):
59 |         self._maskdb_handler = val
60 | 
61 |     @property
62 |     def roidb(self):
63 |         # A roidb is a 'list of dictionaries', each with the following keys:
64 |         #   boxes: the numpy array for boxes coordinate
65 |         #   gt_overlaps: overlap ratio for ground truth
66 |         #   gt_classes: ground truth class for that box
67 |         #   flipped: whether the image is flipped
68 |         if self._roidb is not None:
69 |             return self._roidb
70 |         self._roidb = self.roidb_handler()
71 |         return self._roidb
72 | 
73 |     @property
74 |     def maskdb(self):
75 |         if self._maskdb is not None:
76 |             return self._maskdb
77 |         else:
78 |             self._maskdb =
self.maskdb_handler() 79 | return self._maskdb 80 | 81 | @property 82 | def cache_path(self): 83 | cache_path = os.path.abspath(os.path.join(cfg.DATA_DIR, 'cache')) 84 | if not os.path.exists(cache_path): 85 | os.makedirs(cache_path) 86 | return cache_path 87 | 88 | @property 89 | def num_images(self): 90 | return len(self.image_index) 91 | 92 | def set_roi_handler(self, method): 93 | method = eval('self.' + method + '_roidb') 94 | self.roidb_handler = method 95 | 96 | def set_mask_handler(self, method): 97 | method = eval('self.' + method + '_maskdb') 98 | self.maskdb_handler = method 99 | 100 | def image_path_at(self, i): 101 | raise NotImplementedError 102 | 103 | def default_roidb(self): 104 | raise NotImplementedError 105 | 106 | def default_maskdb(self): 107 | raise NotImplementedError 108 | 109 | def competition_mode(self, on): 110 | """Turn competition mode on or off.""" 111 | pass 112 | 113 | @staticmethod 114 | def merge_roidbs(a, b): 115 | assert len(a) == len(b) 116 | for i in xrange(len(a)): 117 | a[i]['boxes'] = np.vstack((a[i]['boxes'], b[i]['boxes'])) 118 | a[i]['gt_classes'] = np.hstack((a[i]['gt_classes'], 119 | b[i]['gt_classes'])) 120 | a[i]['gt_overlaps'] = scipy.sparse.vstack([a[i]['gt_overlaps'], 121 | b[i]['gt_overlaps']]) 122 | return a 123 | -------------------------------------------------------------------------------- /lib/utils/blob.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Multitask Network Cascade 3 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn) 4 | # Copyright (c) 2016, Haozhi Qi 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # -------------------------------------------------------- 7 | 8 | """Blob helper functions.""" 9 | 10 | import numpy as np 11 | import random 12 | import cv2 13 | from utils.cython_bbox import bbox_overlaps 14 | from mnc_config import cfg 15 | 16 | 17 | def im_list_to_blob(ims): 18 | """ 19 | Convert a list of images into a network input. 20 | Assumes images are already prepared (means subtracted, BGR order, ...). 21 | """ 22 | max_shape = np.array([im.shape for im in ims]).max(axis=0) 23 | num_images = len(ims) 24 | blob = np.zeros((num_images, max_shape[0], max_shape[1], 3), 25 | dtype=np.float32) 26 | for i in xrange(num_images): 27 | im = ims[i] 28 | blob[i, 0:im.shape[0], 0:im.shape[1], :] = im 29 | # Move channels (axis 3) to axis 1 30 | # Axis order will become: (batch elem, channel, height, width) 31 | channel_swap = (0, 3, 1, 2) 32 | blob = blob.transpose(channel_swap) 33 | return blob 34 | 35 | 36 | def prep_im_for_blob(im, pixel_means, target_size, max_size): 37 | """Mean subtract and scale an image for use in a blob.""" 38 | im = im.astype(np.float32, copy=False) 39 | im -= pixel_means 40 | im_shape = im.shape 41 | im_size_min = np.min(im_shape[0:2]) 42 | im_size_max = np.max(im_shape[0:2]) 43 | im_scale = float(target_size) / float(im_size_min) 44 | # Prevent the biggest axis from being more than MAX_SIZE 45 | if np.round(im_scale * im_size_max) > max_size: 46 | im_scale = float(max_size) / float(im_size_max) 47 | im = cv2.resize(im, None, None, fx=im_scale, fy=im_scale, 48 | interpolation=cv2.INTER_LINEAR) 49 | 50 | return im, im_scale 51 | 52 | 53 | def prep_im_for_blob_cfm(im, input_scales): 54 | """Converts an image into a network input. 
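        One pyramid level is produced per entry of input_scales (target sizes
        for the shorter image side), each capped by cfg.TEST.MAX_SIZE.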
55 |     Arguments:
56 |         im (ndarray): a color image in BGR order
57 |     Returns:
58 |         blob (ndarray): a data blob holding an image pyramid
59 |         im_scale_factors (list): list of image scales (relative to im) used
60 |             in the image pyramid
61 |     """
62 |     im_orig = im.astype(np.float32, copy=True)
63 |     im_orig -= cfg.PIXEL_MEANS
64 | 
65 |     im_shape = im_orig.shape
66 |     im_size_min = np.min(im_shape[0:2])
67 |     im_size_max = np.max(im_shape[0:2])
68 | 
69 |     processed_ims = []
70 |     im_scale_factors = []
71 | 
72 |     for target_size in input_scales:
73 |         im_scale = float(target_size) / float(im_size_min)
74 |         # Prevent the biggest axis from being more than MAX_SIZE
75 |         if np.round(im_scale * im_size_max) > cfg.TEST.MAX_SIZE:
76 |             im_scale = float(cfg.TEST.MAX_SIZE) / float(im_size_max)
77 |         im = cv2.resize(im_orig, None, None, fx=im_scale, fy=im_scale,
78 |                         interpolation=cv2.INTER_LINEAR)
79 |         im_scale_factors.append(im_scale)
80 |         processed_ims.append(im)
81 | 
82 |     # Create a blob to hold the input images
83 |     blob = im_list_to_blob(processed_ims)
84 | 
85 |     return blob, np.array(im_scale_factors)
86 | 
87 | 
88 | def pred_rois_for_blob(im_rois, im_scales):
89 |     """
90 |     Convert rois to the network input format;
91 |     supports multi-scale testing
92 |     """
93 |     im_rois = im_rois.astype(np.float, copy=False)
94 |     if len(im_scales) > 1:
95 |         widths = im_rois[:, 2] - im_rois[:, 0] + 1
96 |         heights = im_rois[:, 3] - im_rois[:, 1] + 1
97 | 
98 |         areas = widths * heights
99 |         scaled_areas = areas[:, np.newaxis] * (im_scales[np.newaxis, :] ** 2)
100 |         diff_areas = np.abs(scaled_areas - 224 * 224)
101 |         levels = diff_areas.argmin(axis=1)[:, np.newaxis]
102 |     else:
103 |         levels = np.zeros((im_rois.shape[0], 1), dtype=np.int)
104 |     im_rois = im_rois * im_scales[levels]
105 |     rois_blob = np.hstack((levels.astype(np.float), im_rois))
106 |     return rois_blob
107 | 
108 | 
--------------------------------------------------------------------------------
/lib/pylayer/mask_layer.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Multitask Network Cascade
3 | # Written by Haozhi Qi
4 | # Copyright (c) 2016, Haozhi Qi
5 | # Licensed under The MIT License [see LICENSE for details]
6 | # --------------------------------------------------------
7 | 
8 | import caffe
9 | import cv2
10 | import numpy as np
11 | from transform.mask_transform import mask_overlap
12 | from mnc_config import cfg
13 | 
14 | 
15 | class MaskLayer(caffe.Layer):
16 |     """
17 |     This layer takes sigmoid-predicted masks as input and
18 |     assigns a label for the segmentation classifier to each
19 |     mask according to region overlap with the ground truth.
20 |     """
21 | 
22 |     def setup(self, bottom, top):
23 |         self._phase = str(self.phase)
24 |         self._top_name_map = {}
25 |         top[0].reshape(1, 1, cfg.MASK_SIZE, cfg.MASK_SIZE)
26 |         self._top_name_map['mask_proposal'] = 0
27 |         if self._phase == 'TRAIN':
28 |             top[1].reshape(1, 1)
29 |             self._top_name_map['mask_proposal_label'] = 1
30 | 
31 |     def reshape(self, bottom, top):
32 |         """
33 |         Reshaping happens during the call to forward
34 |         """
35 |         pass
36 | 
37 |     def forward(self, bottom, top):
38 |         if str(self.phase) == 'TRAIN':
39 |             blobs = self.forward_train(bottom, top)
40 |         elif str(self.phase) == 'TEST':
41 |             blobs = self.forward_test(bottom, top)
42 |         else:
43 |             print 'Unrecognized phase'
44 |             raise NotImplementedError
45 | 
46 |         for blob_name, blob in blobs.iteritems():
47 |             top[self._top_name_map[blob_name]].reshape(*blob.shape)
48 |             top[self._top_name_map[blob_name]].data[...]
= blob.astype(np.float32, copy=False) 49 | 50 | def backward(self, top, propagate_down, bottom): 51 | if propagate_down[0]: 52 | bottom[0].diff.fill(0.) 53 | top_grad = top[0].diff.reshape(top[0].diff.shape[0], cfg.MASK_SIZE * cfg.MASK_SIZE) 54 | bottom[0].diff[self.pos_sample, :] = top_grad[self.pos_sample, :] 55 | 56 | def forward_train(self, bottom, top): 57 | # Take sigmoid prediction as input 58 | mask_pred = bottom[0].data 59 | # get ground truth mask and labels 60 | gt_masks = bottom[1].data 61 | gt_masks_info = bottom[2].data 62 | num_mask_pred = mask_pred.shape[0] 63 | top_label = np.zeros((gt_masks_info.shape[0], 1)) 64 | # 2. Calculate region overlap 65 | # Since the target gt mask may have different size 66 | # We need to resize predicted masks into different sizes 67 | mask_size = cfg.MASK_SIZE 68 | for i in xrange(num_mask_pred): 69 | # if the bounding box is itself background 70 | if gt_masks_info[i][0] == -1: 71 | top_label[i][0] = 0 72 | continue 73 | else: 74 | info = gt_masks_info[i] 75 | gt_mask = gt_masks[info[0]][0:info[1], 0:info[2]] 76 | ex_mask = mask_pred[i].reshape((mask_size, mask_size)) 77 | ex_box = np.round(info[4:8]).astype(int) 78 | gt_box = np.round(info[8:12]).astype(int) 79 | # resize to large gt_masks, note cv2.resize is column first 80 | ex_mask = cv2.resize(ex_mask.astype(np.float32), (ex_box[2] - ex_box[0] + 1, 81 | ex_box[3] - ex_box[1] + 1)) 82 | ex_mask = ex_mask >= cfg.BINARIZE_THRESH 83 | top_label[i][0] = 0 if mask_overlap(ex_box, gt_box, ex_mask, gt_mask) < cfg.TRAIN.FG_SEG_THRESH else info[3] 84 | 85 | # output continuous mask for MNC 86 | resized_mask_pred = mask_pred.reshape((num_mask_pred, 1, cfg.MASK_SIZE, cfg.MASK_SIZE)) 87 | self.pos_sample = np.where(top_label > 0)[0] 88 | 89 | blobs = { 90 | 'mask_proposal': resized_mask_pred, 91 | 'mask_proposal_label': top_label 92 | } 93 | return blobs 94 | 95 | def forward_test(self, bottom, top): 96 | mask_pred = bottom[0].data 97 | num_mask_pred = mask_pred.shape[0] 98 | resized_mask_pred = mask_pred.reshape((num_mask_pred, 1, cfg.MASK_SIZE, cfg.MASK_SIZE)) 99 | blobs = { 100 | 'mask_proposal': resized_mask_pred 101 | } 102 | return blobs 103 | -------------------------------------------------------------------------------- /lib/transform/anchors.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Multitask Network Cascade 3 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn) 4 | # Copyright (c) 2016, Haozhi Qi 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | 10 | # Verify that we compute the same anchors as Shaoqing's matlab implementation: 11 | # 12 | # >> load output/rpn_cachedir/faster_rcnn_VOC2007_ZF_stage1_rpn/anchors.mat 13 | # >> anchors 14 | # 15 | # anchors = 16 | # 17 | # -83 -39 100 56 18 | # -175 -87 192 104 19 | # -359 -183 376 200 20 | # -55 -55 72 72 21 | # -119 -119 136 136 22 | # -247 -247 264 264 23 | # -35 -79 52 96 24 | # -79 -167 96 184 25 | # -167 -343 184 360 26 | 27 | # array([[ -83., -39., 100., 56.], 28 | # [-175., -87., 192., 104.], 29 | # [-359., -183., 376., 200.], 30 | # [ -55., -55., 72., 72.], 31 | # [-119., -119., 136., 136.], 32 | # [-247., -247., 264., 264.], 33 | # [ -35., -79., 52., 96.], 34 | # [ -79., -167., 96., 184.], 35 | # [-167., -343., 184., 360.]]) 36 | 37 | 38 | def generate_anchors(base_size=16, ratios=[0.5, 1, 
2], 39 | scales=2**np.arange(3, 6)): 40 | """ 41 | Generate anchor (reference) windows by enumerating aspect ratios X 42 | scales wrt a reference (0, 0, 15, 15) window. 43 | """ 44 | 45 | base_anchor = np.array([1, 1, base_size, base_size]) - 1 46 | ratio_anchors = _ratio_enum(base_anchor, ratios) 47 | anchors = np.vstack([_scale_enum(ratio_anchors[i, :], scales) 48 | for i in xrange(ratio_anchors.shape[0])]) 49 | return anchors 50 | 51 | 52 | def _whctrs(anchor): 53 | """ 54 | Return width, height, x center, and y center for an anchor (window). 55 | """ 56 | 57 | w = anchor[2] - anchor[0] + 1 58 | h = anchor[3] - anchor[1] + 1 59 | x_ctr = anchor[0] + 0.5 * (w - 1) 60 | y_ctr = anchor[1] + 0.5 * (h - 1) 61 | return w, h, x_ctr, y_ctr 62 | 63 | 64 | def _mkanchors(ws, hs, x_ctr, y_ctr): 65 | """ 66 | Given a vector of widths (ws) and heights (hs) around a center 67 | (x_ctr, y_ctr), output a set of anchors (windows). 68 | """ 69 | 70 | ws = ws[:, np.newaxis] 71 | hs = hs[:, np.newaxis] 72 | anchors = np.hstack((x_ctr - 0.5 * (ws - 1), 73 | y_ctr - 0.5 * (hs - 1), 74 | x_ctr + 0.5 * (ws - 1), 75 | y_ctr + 0.5 * (hs - 1))) 76 | return anchors 77 | 78 | 79 | def _ratio_enum(anchor, ratios): 80 | """ 81 | Enumerate a set of anchors for each aspect ratio wrt an anchor. 82 | """ 83 | 84 | w, h, x_ctr, y_ctr = _whctrs(anchor) 85 | size = w * h 86 | size_ratios = size / ratios 87 | ws = np.round(np.sqrt(size_ratios)) 88 | hs = np.round(ws * ratios) 89 | anchors = _mkanchors(ws, hs, x_ctr, y_ctr) 90 | return anchors 91 | 92 | 93 | def _scale_enum(anchor, scales): 94 | """ 95 | Enumerate a set of anchors for each scale wrt an anchor. 96 | """ 97 | 98 | w, h, x_ctr, y_ctr = _whctrs(anchor) 99 | ws = w * scales 100 | hs = h * scales 101 | anchors = _mkanchors(ws, hs, x_ctr, y_ctr) 102 | return anchors 103 | 104 | 105 | def generate_shifted_anchors(anchors, height, width, feat_stride): 106 | # Enumerate all shifted anchors: 107 | # 108 | # add A anchors (1, A, 4) to 109 | # cell K shifts (K, 1, 4) to get 110 | # shift anchors (K, A, 4) 111 | # reshape to (K*A, 4) shifted anchors 112 | shift_x = np.arange(0, width) * feat_stride 113 | shift_y = np.arange(0, height) * feat_stride 114 | shift_x, shift_y = np.meshgrid(shift_x, shift_y) 115 | shifts = np.vstack((shift_x.ravel(), shift_y.ravel(), 116 | shift_x.ravel(), shift_y.ravel())).transpose() 117 | A = anchors.shape[0] 118 | K = shifts.shape[0] 119 | anchors = anchors.reshape((1, A, 4)) + \ 120 | shifts.reshape((1, K, 4)).transpose((1, 0, 2)) 121 | anchors = anchors.reshape((K * A, 4)) 122 | return anchors 123 | 124 | 125 | if __name__ == '__main__': 126 | import time 127 | t = time.time() 128 | a = generate_anchors() 129 | print time.time() - t 130 | print a 131 | from IPython import embed 132 | embed() 133 | -------------------------------------------------------------------------------- /lib/nms/nms_kernel.cu: -------------------------------------------------------------------------------- 1 | // -------------------------------------------------------- 2 | // Multitask Network Cascade 3 | // Modified from MATLAB Faster R-CNN (https://github.com/shaoqingren/faster_rcnn) 4 | // Copyright (c) 2016, Haozhi Qi 5 | // Licensed under The MIT License [see LICENSE for details] 6 | // -------------------------------------------------------- 7 | 8 | #include "gpu_nms.hpp" 9 | #include 10 | #include 11 | 12 | #define CUDA_CHECK(condition) \ 13 | /* Code block avoids redefinition of cudaError_t error */ \ 14 | do { \ 15 | cudaError_t error = condition; \ 16 | if 
(error != cudaSuccess) { \ 17 | std::cout << cudaGetErrorString(error) << std::endl; \ 18 | } \ 19 | } while (0) 20 | 21 | #define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0)) 22 | int const threadsPerBlock = sizeof(unsigned long long) * 8; 23 | 24 | __device__ inline float devIoU(float const * const a, float const * const b) { 25 | float left = max(a[0], b[0]), right = min(a[2], b[2]); 26 | float top = max(a[1], b[1]), bottom = min(a[3], b[3]); 27 | float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); 28 | float interS = width * height; 29 | float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); 30 | float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1); 31 | return interS / (Sa + Sb - interS); 32 | } 33 | 34 | __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh, 35 | const float *dev_boxes, unsigned long long *dev_mask) { 36 | const int row_start = blockIdx.y; 37 | const int col_start = blockIdx.x; 38 | 39 | // if (row_start > col_start) return; 40 | 41 | const int row_size = 42 | min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); 43 | const int col_size = 44 | min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); 45 | 46 | __shared__ float block_boxes[threadsPerBlock * 5]; 47 | if (threadIdx.x < col_size) { 48 | block_boxes[threadIdx.x * 5 + 0] = 49 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0]; 50 | block_boxes[threadIdx.x * 5 + 1] = 51 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1]; 52 | block_boxes[threadIdx.x * 5 + 2] = 53 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2]; 54 | block_boxes[threadIdx.x * 5 + 3] = 55 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3]; 56 | block_boxes[threadIdx.x * 5 + 4] = 57 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4]; 58 | } 59 | __syncthreads(); 60 | 61 | if (threadIdx.x < row_size) { 62 | const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; 63 | const float *cur_box = dev_boxes + cur_box_idx * 5; 64 | int i = 0; 65 | unsigned long long t = 0; 66 | int start = 0; 67 | if (row_start == col_start) { 68 | start = threadIdx.x + 1; 69 | } 70 | for (i = start; i < col_size; i++) { 71 | if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { 72 | t |= 1ULL << i; 73 | } 74 | } 75 | const int col_blocks = DIVUP(n_boxes, threadsPerBlock); 76 | dev_mask[cur_box_idx * col_blocks + col_start] = t; 77 | } 78 | } 79 | 80 | void _set_device(int device_id) { 81 | int current_device; 82 | CUDA_CHECK(cudaGetDevice(¤t_device)); 83 | if (current_device == device_id) { 84 | return; 85 | } 86 | // The call to cudaSetDevice must come before any calls to Get, which 87 | // may perform initialization using the GPU. 
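  // cudaSetDevice binds the calling host thread to the requested GPU,
  // so it must run before the allocations and copies below.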
88 | CUDA_CHECK(cudaSetDevice(device_id)); 89 | } 90 | 91 | void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, 92 | int boxes_dim, float nms_overlap_thresh, int device_id) { 93 | _set_device(device_id); 94 | 95 | float* boxes_dev = NULL; 96 | unsigned long long* mask_dev = NULL; 97 | 98 | const int col_blocks = DIVUP(boxes_num, threadsPerBlock); 99 | 100 | CUDA_CHECK(cudaMalloc(&boxes_dev, 101 | boxes_num * boxes_dim * sizeof(float))); 102 | CUDA_CHECK(cudaMemcpy(boxes_dev, 103 | boxes_host, 104 | boxes_num * boxes_dim * sizeof(float), 105 | cudaMemcpyHostToDevice)); 106 | 107 | CUDA_CHECK(cudaMalloc(&mask_dev, 108 | boxes_num * col_blocks * sizeof(unsigned long long))); 109 | 110 | dim3 blocks(DIVUP(boxes_num, threadsPerBlock), 111 | DIVUP(boxes_num, threadsPerBlock)); 112 | dim3 threads(threadsPerBlock); 113 | nms_kernel<<>>(boxes_num, 114 | nms_overlap_thresh, 115 | boxes_dev, 116 | mask_dev); 117 | 118 | std::vector mask_host(boxes_num * col_blocks); 119 | CUDA_CHECK(cudaMemcpy(&mask_host[0], 120 | mask_dev, 121 | sizeof(unsigned long long) * boxes_num * col_blocks, 122 | cudaMemcpyDeviceToHost)); 123 | 124 | std::vector remv(col_blocks); 125 | memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); 126 | 127 | int num_to_keep = 0; 128 | for (int i = 0; i < boxes_num; i++) { 129 | int nblock = i / threadsPerBlock; 130 | int inblock = i % threadsPerBlock; 131 | 132 | if (!(remv[nblock] & (1ULL << inblock))) { 133 | keep_out[num_to_keep++] = i; 134 | unsigned long long *p = &mask_host[0] + i * col_blocks; 135 | for (int j = nblock; j < col_blocks; j++) { 136 | remv[j] |= p[j]; 137 | } 138 | } 139 | } 140 | *num_out = num_to_keep; 141 | 142 | CUDA_CHECK(cudaFree(boxes_dev)); 143 | CUDA_CHECK(cudaFree(mask_dev)); 144 | } 145 | -------------------------------------------------------------------------------- /lib/utils/vis_seg.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Multitask Network Cascade 3 | # Written by Haozhi Qi 4 | # Copyright (c) 2016, Haozhi Qi 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | import cPickle 10 | import os 11 | import cv2 12 | from PIL import Image 13 | from mnc_config import cfg 14 | 15 | 16 | def vis_seg(img_names, cls_names, output_dir, gt_dir): 17 | """ 18 | This function plot segmentation results to specific directory 19 | Args: 20 | img_names: list 21 | """ 22 | assert os.path.exists(output_dir) 23 | # a list of dictionary 24 | inst_dir = os.path.join(output_dir, 'SegInst') 25 | cls_dir = os.path.join(output_dir, 'SegCls') 26 | res_dir = os.path.join(output_dir, 'SegRes') 27 | if not os.path.isdir(inst_dir): 28 | os.mkdir(inst_dir) 29 | if not os.path.isdir(cls_dir): 30 | os.mkdir(cls_dir) 31 | if not os.path.isdir(res_dir): 32 | os.mkdir(res_dir) 33 | 34 | res_list = _prepare_dict(img_names, cls_names, output_dir) 35 | for img_ind, image_name in enumerate(img_names): 36 | target_inst_file = os.path.join(inst_dir, image_name + '.jpg') 37 | target_cls_file = os.path.join(cls_dir, image_name + '.jpg') 38 | print image_name 39 | gt_image = gt_dir + '/img/' + image_name + '.jpg' 40 | img_data = cv2.imread(gt_image) 41 | img_width = img_data.shape[1] 42 | img_height = img_data.shape[0] 43 | pred_dict = res_list[img_ind] 44 | inst_img, cls_img = _convert_pred_to_image(img_width, img_height, pred_dict) 45 | color_map 
= _get_voc_color_map() 46 | inst_out_img = np.zeros((img_height, img_width, 3)) 47 | cls_out_img = np.zeros((img_height, img_width, 3)) 48 | for i in xrange(img_height): 49 | for j in xrange(img_width): 50 | inst_out_img[i][j] = color_map[inst_img[i][j]][::-1] 51 | cls_out_img[i][j] = color_map[cls_img[i][j]][::-1] 52 | 53 | cv2.imwrite(target_inst_file, inst_out_img) 54 | cv2.imwrite(target_cls_file, cls_out_img) 55 | background = Image.open(gt_image) 56 | mask = Image.open(target_cls_file) 57 | background = background.convert('RGBA') 58 | mask = mask.convert('RGBA') 59 | superimpose_image = Image.blend(background, mask, 0.8) 60 | name = os.path.join(res_dir, image_name + '.png') 61 | superimpose_image.save(name, 'PNG') 62 | 63 | 64 | def _prepare_dict(img_names, cls_names, cache_dir, vis_thresh=0.5): 65 | """ 66 | Returns: 67 | list, each list is a dictionary contains mask list, box list 68 | """ 69 | res_list = [] 70 | det_file = os.path.join(cache_dir, 'res_boxes.pkl') 71 | with open(det_file, 'rb') as f: 72 | det_pkl = cPickle.load(f) 73 | seg_file = os.path.join(cache_dir, 'res_masks.pkl') 74 | with open(seg_file, 'rb') as f: 75 | seg_pkl = cPickle.load(f) 76 | 77 | for img_ind, image_name in enumerate(img_names): 78 | box_for_img = [] 79 | mask_for_img = [] 80 | cls_for_img = [] 81 | for cls_ind, cls_name in enumerate(cls_names): 82 | if cls_name == '__background__' or len(det_pkl[cls_ind][img_ind]) == 0: 83 | continue 84 | det_for_img = det_pkl[cls_ind][img_ind] 85 | seg_for_img = seg_pkl[cls_ind][img_ind] 86 | keep_inds = np.where(det_for_img[:, -1] >= vis_thresh)[0] 87 | for keep in keep_inds: 88 | box_for_img.append(det_for_img[keep]) 89 | # TODO: remove this annoying 0 90 | mask_for_img.append(seg_for_img[keep][0]) 91 | cls_for_img.append(cls_ind) 92 | res_dict = {'image_name': image_name, 93 | 'cls_name': cls_for_img, 94 | 'boxes': box_for_img, 95 | 'masks': mask_for_img} 96 | res_list.append(res_dict) 97 | 98 | return res_list 99 | 100 | 101 | def _convert_pred_to_image(img_width, img_height, pred_dict): 102 | num_inst = len(pred_dict['boxes']) 103 | inst_img = np.zeros((img_height, img_width)) 104 | cls_img = np.zeros((img_height, img_width)) 105 | for i in xrange(num_inst): 106 | box = np.round(pred_dict['boxes'][i]).astype(int) 107 | mask = pred_dict['masks'][i] 108 | cls_num = pred_dict['cls_name'][i] 109 | # clip box into image space 110 | box[0] = min(max(box[0], 0), img_width - 1) 111 | box[1] = min(max(box[1], 0), img_height - 1) 112 | box[2] = min(max(box[2], 0), img_width - 1) 113 | box[3] = min(max(box[3], 0), img_height - 1) 114 | mask = cv2.resize(mask.astype(np.float32), (box[2]-box[0]+1, box[3]-box[1]+1)) 115 | mask = mask >= cfg.BINARIZE_THRESH 116 | 117 | part1 = (i+1) * mask.astype(np.float32) 118 | part2 = np.multiply(np.logical_not(mask), inst_img[box[1]:box[3]+1, box[0]:box[2]+1]) 119 | part3 = np.multiply(np.logical_not(mask), cls_img[box[1]:box[3]+1, box[0]:box[2]+1]) 120 | inst_img[box[1]:box[3]+1, box[0]:box[2]+1] = part1 + part2 121 | cls_img[box[1]:box[3]+1, box[0]:box[2]+1] = cls_num * mask.astype(np.float32) + part3 122 | # Plot bounding boxes simultaneously 123 | cls_img[box[1]:box[3]+1, box[0]-1:box[0]+1] = 150 124 | cls_img[box[1]:box[3]+1, box[2]-1:box[2]+1] = 150 125 | cls_img[box[1]-1:box[1]+1, box[0]:box[2]+1] = 150 126 | cls_img[box[3]-1:box[3]+1, box[0]:box[2]+1] = 150 127 | 128 | inst_img = inst_img.astype(int) 129 | cls_img = cls_img.astype(int) 130 | return inst_img, cls_img 131 | 132 | 133 | def _get_voc_color_map(n=256): 134 | 
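    """
    Build the standard PASCAL VOC color palette.
    Each label id has its bits spread across the R/G/B channels, three bits
    per round from the most significant position downwards, so consecutive
    ids map to clearly distinct colors.
    Returns:
        (n, 3) ndarray of RGB values in [0, 255]
    """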
    color_map = np.zeros((n, 3))
    for i in xrange(n):
        r = b = g = 0
        cid = i
        for j in xrange(0, 8):
            r = np.bitwise_or(r, np.left_shift(np.unpackbits(np.array([cid], dtype=np.uint8))[-1], 7-j))
            g = np.bitwise_or(g, np.left_shift(np.unpackbits(np.array([cid], dtype=np.uint8))[-2], 7-j))
            b = np.bitwise_or(b, np.left_shift(np.unpackbits(np.array([cid], dtype=np.uint8))[-3], 7-j))
            cid = np.right_shift(cid, 3)

        color_map[i][0] = r
        color_map[i][1] = g
        color_map[i][2] = b
    return color_map
--------------------------------------------------------------------------------
/lib/pylayer/mnc_data_layer.py:
--------------------------------------------------------------------------------
# --------------------------------------------------------
# Multitask Network Cascade
# Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn)
# Copyright (c) 2016, Haozhi Qi
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------

import cv2
import numpy as np
import yaml

import caffe
from mnc_config import cfg
from utils.blob import prep_im_for_blob, im_list_to_blob


class MNCDataLayer(caffe.Layer):
    """
    Provide image, image w/h/scale, gt boxes/masks and mask info to upper layers
    """

    def setup(self, bottom, top):
        layer_params = yaml.load(self.param_str_)
        self._num_classes = layer_params['num_classes']
        self._name_to_top_map = {}
        # data blob: holds a batch of N images, each with 3 channels
        top[0].reshape(cfg.TRAIN.IMS_PER_BATCH, 3, max(cfg.TRAIN.SCALES), cfg.TRAIN.MAX_SIZE)
        self._name_to_top_map['data'] = 0
        assert cfg.TRAIN.HAS_RPN, 'Use RPN for this project'
        # Just pseudo setup
        top[1].reshape(1, 3)
        self._name_to_top_map['im_info'] = 1
        top[2].reshape(1, 4)
        self._name_to_top_map['gt_boxes'] = 2
        if cfg.MNC_MODE:
            top[3].reshape(1, 21, 21)
            self._name_to_top_map['gt_masks'] = 3
            top[4].reshape(1, 3)
            self._name_to_top_map['mask_info'] = 4
        assert len(top) == len(self._name_to_top_map)

    def reshape(self, bottom, top):
        """Reshaping happens during the call to forward."""
        pass

    def forward(self, bottom, top):
        """Get blobs and copy them into this layer's top blob vector."""
        blobs = self._get_next_minibatch()
        for blob_name, blob in blobs.iteritems():
            top_ind = self._name_to_top_map[blob_name]
            # Reshape net's input blobs
            top[top_ind].reshape(*blob.shape)
            # Copy data into net's input blobs
            top[top_ind].data[...]
= blob.astype(np.float32, copy=False) 55 | 56 | def backward(self, top, propagate_down, bottom): 57 | """This layer does not propagate gradients.""" 58 | pass 59 | 60 | def set_roidb(self, roidb): 61 | """Set the roidb to be used by this layer during training.""" 62 | self._roidb = roidb 63 | self._shuffle_roidb_inds() 64 | 65 | def set_maskdb(self, maskdb): 66 | self._maskdb = maskdb 67 | self._shuffle_roidb_inds() 68 | 69 | def _shuffle_roidb_inds(self): 70 | """Randomly permute the training roidb.""" 71 | if cfg.TRAIN.ASPECT_GROUPING: 72 | widths = np.array([r['width'] for r in self._roidb]) 73 | heights = np.array([r['height'] for r in self._roidb]) 74 | horz = (widths >= heights) 75 | vert = np.logical_not(horz) 76 | horz_inds = np.where(horz)[0] 77 | vert_inds = np.where(vert)[0] 78 | inds = np.hstack(( 79 | np.random.permutation(horz_inds), 80 | np.random.permutation(vert_inds))) 81 | inds = np.reshape(inds, (-1, 2)) 82 | row_perm = np.random.permutation(np.arange(inds.shape[0])) 83 | inds = np.reshape(inds[row_perm, :], (-1,)) 84 | self._perm = inds 85 | else: 86 | self._perm = np.random.permutation(np.arange(len(self._roidb))) 87 | self._cur = 0 88 | 89 | def _get_image_blob(self, roidb, scale_inds): 90 | """Builds an input blob from the images in the roidb at the specified 91 | scales. 92 | """ 93 | num_images = 1 # len(roidb) 94 | processed_ims = [] 95 | im_scales = [] 96 | for i in xrange(num_images): 97 | im = cv2.imread(roidb['image']) 98 | if roidb['flipped']: 99 | im = im[:, ::-1, :] 100 | target_size = cfg.TRAIN.SCALES[scale_inds[i]] 101 | im, im_scale = prep_im_for_blob(im, cfg.PIXEL_MEANS, target_size, 102 | cfg.TRAIN.MAX_SIZE) 103 | im_scales.append(im_scale) 104 | processed_ims.append(im) 105 | # Create a blob to hold the input images 106 | blob = im_list_to_blob(processed_ims) 107 | return blob, im_scales 108 | 109 | def _get_next_minibatch(self): 110 | """ 111 | Return the blobs to be used for the next minibatch. 
112 | """ 113 | assert cfg.TRAIN.IMS_PER_BATCH == 1, 'Only single batch forwarding is supported' 114 | 115 | if self._cur + cfg.TRAIN.IMS_PER_BATCH >= len(self._roidb): 116 | self._shuffle_roidb_inds() 117 | db_inds = self._perm[self._cur] 118 | self._cur += 1 119 | roidb = self._roidb[db_inds] 120 | 121 | random_scale_inds = np.random.randint(0, high=len(cfg.TRAIN.SCALES), size=1) 122 | im_blob, im_scales = self._get_image_blob(roidb, random_scale_inds) 123 | 124 | gt_label = np.where(roidb['gt_classes'] != 0)[0] 125 | gt_boxes = np.hstack((roidb['boxes'][gt_label, :] * im_scales[0], 126 | roidb['gt_classes'][gt_label, np.newaxis])).astype(np.float32) 127 | blobs = { 128 | 'data': im_blob, 129 | 'gt_boxes': gt_boxes, 130 | 'im_info': np.array([[im_blob.shape[2], im_blob.shape[3], im_scales[0]]], dtype=np.float32) 131 | } 132 | 133 | if cfg.MNC_MODE: 134 | maskdb = self._maskdb[db_inds] 135 | mask_list = maskdb['gt_masks'] 136 | mask_max_x = maskdb['mask_max'][0] 137 | mask_max_y = maskdb['mask_max'][1] 138 | gt_masks = np.zeros((len(mask_list), mask_max_y, mask_max_x)) 139 | mask_info = np.zeros((len(mask_list), 2)) 140 | for j in xrange(len(mask_list)): 141 | mask = mask_list[j] 142 | mask_x = mask.shape[1] 143 | mask_y = mask.shape[0] 144 | gt_masks[j, 0:mask_y, 0:mask_x] = mask 145 | mask_info[j, 0] = mask_y 146 | mask_info[j, 1] = mask_x 147 | blobs['gt_masks'] = gt_masks 148 | blobs['mask_info'] = mask_info 149 | 150 | return blobs 151 | -------------------------------------------------------------------------------- /lib/caffeWrapper/SolverWrapper.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Multitask Network Cascade 3 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn) 4 | # Copyright (c) 2016, Haozhi Qi 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # -------------------------------------------------------- 7 | 8 | 9 | import os 10 | import numpy as np 11 | 12 | from utils.timer import Timer 13 | from mnc_config import cfg 14 | from db.roidb import add_bbox_regression_targets, compute_mcg_mean_std 15 | import caffe 16 | from caffe.proto import caffe_pb2 17 | import google.protobuf as pb2 18 | 19 | 20 | class SolverWrapper(object): 21 | """ A simple wrapper around Caffe's solver. 22 | This wrapper gives us control over he snapshotting process, which we 23 | use to unnormalize the learned bounding-box regression weights. 24 | """ 25 | def __init__(self, solver_prototxt, roidb, maskdb, output_dir, imdb, 26 | pretrained_model=None): 27 | self.output_dir = output_dir 28 | if (cfg.TRAIN.HAS_RPN and cfg.TRAIN.BBOX_REG and 29 | cfg.TRAIN.BBOX_NORMALIZE_TARGETS): 30 | # RPN can only use precomputed normalization because there are no 31 | # fixed statistics to compute a priori 32 | assert cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED 33 | 34 | if cfg.TRAIN.BBOX_REG: 35 | if not cfg.CFM_MODE: 36 | print 'Computing bounding-box regression targets...' 
37 | self.bbox_means, self.bbox_stds = add_bbox_regression_targets(roidb) 38 | print 'done' 39 | else: 40 | # Pre-defined mcg bbox_mean and bbox_std 41 | # We store them on disk to avoid disk level IO 42 | # multiple times (mcg boxes are stored on disk) 43 | mean_cache = './data/cache/mcg_bbox_mean.npy' 44 | std_cache = './data/cache/mcg_bbox_std.npy' 45 | roidb_dir = imdb._roidb_path 46 | if os.path.exists(mean_cache) and os.path.exists(std_cache): 47 | self.bbox_means = np.load(mean_cache) 48 | self.bbox_stds = np.load(std_cache) 49 | else: 50 | self.bbox_means, self.bbox_stds = compute_mcg_mean_std(roidb_dir, imdb.num_classes) 51 | 52 | self.solver = caffe.SGDSolver(solver_prototxt) 53 | if pretrained_model is not None: 54 | print 'Loading pretrained model weights from {:s}'.format(pretrained_model) 55 | self.solver.net.copy_from(pretrained_model) 56 | 57 | self.solver_param = caffe_pb2.SolverParameter() 58 | with open(solver_prototxt, 'rt') as f: 59 | pb2.text_format.Merge(f.read(), self.solver_param) 60 | if not cfg.CFM_MODE: 61 | self.solver.net.layers[0].set_roidb(roidb) 62 | if cfg.MNC_MODE: 63 | self.solver.net.layers[0].set_maskdb(maskdb) 64 | else: 65 | self.solver.net.layers[0].set_image_info(imdb, self.bbox_means, self.bbox_stds) 66 | 67 | def snapshot(self): 68 | """ Take a snapshot of the network after unnormalizing the learned 69 | bounding-box regression weights. This enables easy use at test-time. 70 | """ 71 | net = self.solver.net 72 | # I'm wondering whether I still need to keep it if only faster-RCNN is needed 73 | scale_bbox_params = (cfg.TRAIN.BBOX_REG and 74 | cfg.TRAIN.BBOX_NORMALIZE_TARGETS and 75 | 'bbox_pred' in net.params) 76 | if scale_bbox_params: 77 | # save original values 78 | orig_0 = net.params['bbox_pred'][0].data.copy() 79 | orig_1 = net.params['bbox_pred'][1].data.copy() 80 | if cfg.CFM_MODE: 81 | cfm_mean = self.bbox_means.ravel() 82 | cfm_std = self.bbox_stds.ravel() 83 | net.params['bbox_pred'][0].data[...] = \ 84 | (net.params['bbox_pred'][0].data * cfm_std[:, np.newaxis]) 85 | net.params['bbox_pred'][1].data[...] = \ 86 | (net.params['bbox_pred'][1].data * cfm_std + cfm_mean) 87 | else: 88 | # scale and shift with transform reg unnormalization; then save snapshot 89 | net.params['bbox_pred'][0].data[...] = \ 90 | (net.params['bbox_pred'][0].data * 91 | self.bbox_stds[:, np.newaxis]) 92 | net.params['bbox_pred'][1].data[...] = \ 93 | (net.params['bbox_pred'][1].data * 94 | self.bbox_stds + self.bbox_means) 95 | 96 | if not os.path.exists(self.output_dir): 97 | os.makedirs(self.output_dir) 98 | 99 | # If we specify an infix in the configuration 100 | infix = ('_' + cfg.TRAIN.SNAPSHOT_INFIX 101 | if cfg.TRAIN.SNAPSHOT_INFIX != '' else '') 102 | filename = (self.solver_param.snapshot_prefix + infix + 103 | '_iter_{:d}'.format(self.solver.iter) + '.caffemodel') 104 | 105 | # For snapshot caffemodel, since MNC use shared parameters 106 | # but caffe save parameters according to layer name instead of 107 | # parameter names, its size will exceed 2GB, which make program crash 108 | # Luckily, we may save it to HDF5 to avoid this issues 109 | if not cfg.MNC_MODE: 110 | filename = os.path.join(self.output_dir, filename) 111 | net.save(str(filename)) 112 | else: 113 | filename = os.path.join(self.output_dir, filename + '.h5') 114 | net.save_to_hdf5(str(filename), False) 115 | print 'Wrote snapshot to: {:s}'.format(filename) 116 | 117 | if scale_bbox_params: 118 | # restore net to original state 119 | net.params['bbox_pred'][0].data[...] 
= orig_0 120 | net.params['bbox_pred'][1].data[...] = orig_1 121 | 122 | def train_model(self, max_iters): 123 | last_snapshot_iter = -1 124 | timer = Timer() 125 | while self.solver.iter < max_iters: 126 | timer.tic() 127 | self.solver.step(1) 128 | timer.toc() 129 | if self.solver.iter % (10 * self.solver_param.display) == 0: 130 | print 'speed: {:.3f}s / iter'.format(timer.average_time) 131 | 132 | if self.solver.iter % cfg.TRAIN.SNAPSHOT_ITERS == 0: 133 | last_snapshot_iter = self.solver.iter 134 | self.snapshot() 135 | 136 | if last_snapshot_iter != self.solver.iter: 137 | self.snapshot() 138 | 139 | -------------------------------------------------------------------------------- /lib/setup.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Multitask Network Cascade 3 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn) 4 | # Copyright (c) 2016, Haozhi Qi 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # -------------------------------------------------------- 7 | 8 | import os 9 | from os.path import join as pjoin 10 | from setuptools import setup 11 | from distutils.extension import Extension 12 | from Cython.Distutils import build_ext 13 | import numpy as np 14 | 15 | 16 | def find_in_path(name, path): 17 | "Find a file in a search path" 18 | # Adapted fom 19 | # http://code.activestate.com/recipes/52224-find-a-file-given-a-search-path/ 20 | for dir in path.split(os.pathsep): 21 | binpath = pjoin(dir, name) 22 | if os.path.exists(binpath): 23 | return os.path.abspath(binpath) 24 | return None 25 | 26 | 27 | def locate_cuda(): 28 | """Locate the CUDA environment on the system 29 | 30 | Returns a dict with keys 'home', 'nvcc', 'include', and 'lib64' 31 | and values giving the absolute path to each directory. 32 | 33 | Starts by looking for the CUDAHOME env variable. If not found, everything 34 | is based on finding 'nvcc' in the PATH. 35 | """ 36 | 37 | # first check if the CUDAHOME env variable is in use 38 | if 'CUDAHOME' in os.environ: 39 | home = os.environ['CUDAHOME'] 40 | nvcc = pjoin(home, 'bin', 'nvcc') 41 | else: 42 | # otherwise, search the PATH for NVCC 43 | default_path = pjoin(os.sep, 'usr', 'local', 'cuda', 'bin') 44 | nvcc = find_in_path('nvcc', os.environ['PATH'] + os.pathsep + default_path) 45 | if nvcc is None: 46 | raise EnvironmentError('The nvcc binary could not be ' 47 | 'located in your $PATH. Either add it to your path, or set $CUDAHOME') 48 | home = os.path.dirname(os.path.dirname(nvcc)) 49 | 50 | cudaconfig = {'home':home, 'nvcc':nvcc, 51 | 'include': pjoin(home, 'include'), 52 | 'lib64': pjoin(home, 'lib64')} 53 | for k, v in cudaconfig.iteritems(): 54 | if not os.path.exists(v): 55 | raise EnvironmentError('The CUDA %s path could not be located in %s' % (k, v)) 56 | 57 | return cudaconfig 58 | CUDA = locate_cuda() 59 | 60 | 61 | # Obtain the numpy include directory. This logic works across numpy versions. 62 | try: 63 | numpy_include = np.get_include() 64 | except AttributeError: 65 | numpy_include = np.get_numpy_include() 66 | 67 | 68 | def customize_compiler_for_nvcc(self): 69 | """inject deep into distutils to customize how the dispatch 70 | to gcc/nvcc works. 71 | 72 | If you subclass UnixCCompiler, it's not trivial to get your subclass 73 | injected in, and still have the right customizations (i.e. 74 | distutils.sysconfig.customize_compiler) run on it. So instead of going 75 | the OO route, I have this. 
Note, it's kindof like a wierd functional 76 | subclassing going on.""" 77 | 78 | # tell the compiler it can processes .cu 79 | self.src_extensions.append('.cu') 80 | 81 | # save references to the default compiler_so and _comple methods 82 | default_compiler_so = self.compiler_so 83 | super = self._compile 84 | 85 | # now redefine the _compile method. This gets executed for each 86 | # object but distutils doesn't have the ability to change compilers 87 | # based on source extension: we add it. 88 | def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts): 89 | if os.path.splitext(src)[1] == '.cu': 90 | # use the cuda for .cu files 91 | self.set_executable('compiler_so', CUDA['nvcc']) 92 | # use only a subset of the extra_postargs, which are 1-1 translated 93 | # from the extra_compile_args in the Extension class 94 | postargs = extra_postargs['nvcc'] 95 | else: 96 | postargs = extra_postargs['gcc'] 97 | 98 | super(obj, src, ext, cc_args, postargs, pp_opts) 99 | # reset the default compiler_so, which we might have changed for cuda 100 | self.compiler_so = default_compiler_so 101 | 102 | # inject our redefined _compile method into the class 103 | self._compile = _compile 104 | 105 | 106 | # run the customize_compiler 107 | class custom_build_ext(build_ext): 108 | def build_extensions(self): 109 | customize_compiler_for_nvcc(self.compiler) 110 | build_ext.build_extensions(self) 111 | 112 | 113 | ext_modules = [ 114 | Extension( 115 | "utils.cython_bbox", 116 | ["utils/bbox.pyx"], 117 | extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]}, 118 | include_dirs = [numpy_include] 119 | ), 120 | Extension( 121 | "nms.cpu_nms", 122 | ["nms/cpu_nms.pyx"], 123 | extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]}, 124 | include_dirs = [numpy_include] 125 | ), 126 | Extension('nms.gpu_nms', 127 | ['nms/nms_kernel.cu', 'nms/gpu_nms.pyx'], 128 | library_dirs=[CUDA['lib64']], 129 | libraries=['cudart'], 130 | language='c++', 131 | runtime_library_dirs=[CUDA['lib64']], 132 | # this syntax is specific to this build system 133 | # we're only going to use certain compiler args with nvcc and not with 134 | # gcc the implementation of this trick is in customize_compiler() below 135 | extra_compile_args={'gcc': ["-Wno-unused-function"], 136 | 'nvcc': ['-arch=sm_35', 137 | '--ptxas-options=-v', 138 | '-c', 139 | '--compiler-options', 140 | "'-fPIC'"]}, 141 | include_dirs = [numpy_include, CUDA['include']] 142 | ), 143 | Extension('nms.mv', 144 | ['nms/mv_kernel.cu', 'nms/gpu_mv.pyx'], 145 | library_dirs=[CUDA['lib64']], 146 | libraries=['cudart'], 147 | language='c++', 148 | runtime_library_dirs=[CUDA['lib64']], 149 | # this syntax is specific to this build system 150 | # we're only going to use certain compiler args with nvcc and not with 151 | # gcc the implementation of this trick is in customize_compiler() below 152 | extra_compile_args={'gcc': ["-Wno-unused-function"], 153 | 'nvcc': ['-arch=sm_35', 154 | '--ptxas-options=-v', 155 | '-c', 156 | '--compiler-options', 157 | "'-fPIC'"]}, 158 | include_dirs = [numpy_include, CUDA['include']] 159 | ), 160 | ] 161 | 162 | setup( 163 | name='MNC', 164 | ext_modules=ext_modules, 165 | # inject our custom trigger 166 | cmdclass={'build_ext': custom_build_ext}, 167 | ) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Instance-aware Semantic Segmentation via Multi-task Network Cascades 2 | 3 | By Jifeng Dai, 
Kaiming He, Jian Sun 4 | 5 | This python version is re-implemented by [Haozhi Qi](https://github.com/Oh233) when he was an intern at Microsoft Research. 6 | 7 | ### Introduction 8 | 9 | MNC is an instance-aware semantic segmentation system based on deep convolutional networks, which won the first place in COCO segmentation challenge 2015, and test at a fraction of a second per image. We decompose the task of instance-aware semantic segmentation into related sub-tasks, which are solved by multi-task network cascades (MNC) with shared features. The entire MNC network is trained end-to-end with error gradients across cascaded stages. 10 | 11 | 12 | ![example](data/readme_img/example.png) 13 | 14 | 15 | MNC was initially described in a [CVPR 2016 oral paper](http://arxiv.org/abs/1512.04412). 16 | 17 | This repository contains a python implementation of MNC, which is ~10% slower than the original matlab implementation. 18 | 19 | This repository includes a bilinear RoI warping layer, which enables gradient back-propagation with respect to RoI coordinates. 20 | 21 | ### Misc. 22 | 23 | This code has been tested on Linux (Ubuntu 14.04), using K40/Titan X GPUs. 24 | 25 | The code is built based on [py-faster-rcnn](https://github.com/rbgirshick/py-faster-rcnn). 26 | 27 | MNC is released under the MIT License (refer to the LICENSE file for details). 28 | 29 | 30 | ### Citing MNC 31 | 32 | If you find MNC useful in your research, please consider citing: 33 | 34 | @inproceedings{dai2016instance, 35 | title={Instance-aware Semantic Segmentation via Multi-task Network Cascades}, 36 | author={Dai, Jifeng and He, Kaiming and Sun, Jian}, 37 | booktitle={CVPR}, 38 | year={2016} 39 | } 40 | 41 | ### Main Results 42 | | | training data | test data | mAP^r@0.5 | mAP^r@0.7 | time (K40) | time (Titian X)| 43 | |-------------------|:-------------------:|:---------------------:|:-----------:|:-----------:|:-------------:|:-------------:| 44 | |MNC, VGG-16 | VOC 12 train | VOC 12 val | 65.0% | 46.3% | 0.42sec/img | 0.33sec/img| 45 | 46 | ### Installation guide 47 | 48 | 1. Clone the MNC repository: 49 | ```Shell 50 | # Make sure to clone with --recursive 51 | git clone --recursive https://github.com/daijifeng001/MNC.git 52 | ``` 53 | 54 | 2. Install Python packages: `numpy`, `scipy`, `cython`, `python-opencv`, `easydict`, `yaml`. 55 | 56 | 3. Build the Cython modules and the gpu_nms, gpu_mask_voting modules by: 57 | ```Shell 58 | cd $MNC_ROOT/lib 59 | make 60 | ``` 61 | 62 | 4. Install `Caffe` and `pycaffe` dependencies (see: [Caffe installation instructions](http://caffe.berkeleyvision.org/installation.html) for official installation guide) 63 | 64 | **Note:** Caffe *must* be built with support for Python layers! 65 | 66 | ```make 67 | # In your Makefile.config, make sure to have this line uncommented 68 | WITH_PYTHON_LAYER := 1 69 | # CUDNN is recommended in building to reduce memory footprint 70 | USE_CUDNN := 1 71 | ``` 72 | 73 | 5. Build Caffe and pycaffe: 74 | ```Shell 75 | cd $MNC_ROOT/caffe-mnc 76 | # If you have all of the requirements installed 77 | # and your Makefile.config in place, then simply do: 78 | make -j8 && make pycaffe 79 | ``` 80 | 81 | ### Demo 82 | 83 | First, download the trained MNC model. 84 | ```Shell 85 | ./data/scripts/fetch_mnc_model.sh 86 | ``` 87 | 88 | Run the demo: 89 | ```Shell 90 | cd $MNC_ROOT 91 | ./tools/demo.py 92 | ``` 93 | Result demo images will be stored to ```data/demo/```. 
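Under the hood, the demo prepares each image and runs the network roughly as in the sketch below. This is a minimal outline rather than a drop-in script: it assumes `lib/` is on your `PYTHONPATH` (the demo arranges this via `_init_paths`) and uses the demo's default model paths; the mask-voting and visualization steps that follow the forward pass are omitted.

```Python
import cv2
import numpy as np
import caffe
from mnc_config import cfg
from utils.blob import prep_im_for_blob, im_list_to_blob

caffe.set_mode_gpu()
net = caffe.Net('./models/VGG16/mnc_5stage/test.prototxt',
                './data/mnc_model/mnc_model.caffemodel.h5', caffe.TEST)
im = cv2.imread('./data/demo/2008_000533.jpg')
# Subtract the pixel means and resize so the shorter side is TEST.SCALES[0]
# (600), capping the longer side at MAX_SIZE (1000)
im, im_scale = prep_im_for_blob(im, cfg.PIXEL_MEANS,
                                cfg.TEST.SCALES[0], cfg.TRAIN.MAX_SIZE)
blob = im_list_to_blob([im])
im_info = np.array([[blob.shape[2], blob.shape[3], im_scale]], dtype=np.float32)
net.blobs['data'].reshape(*blob.shape)
net.blobs['im_info'].reshape(*im_info.shape)
out = net.forward(data=blob.astype(np.float32, copy=False), im_info=im_info)
# `out` holds the detection and mask blobs, which the demo then merges
# with gpu_mask_voting and renders with the VOC color map
```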
94 | 95 | The demo performs instance-aware semantic segmentation with a trained MNC model (using VGG-16 net). The model is pre-trained on ImageNet, and finetuned on VOC 2012 train set with additional annotations from [SBD](http://home.bharathh.info/pubs/codes/SBD/download.html). The mAP^r of the model is 65.0% on VOC 2012 validation set. The test speed per image is ~0.33sec on Titian X and ~0.42sec on K40. 96 | 97 | ### Training 98 | 99 | This repository contains code to **end-to-end** train MNC for instance-aware semantic segmentation, where gradients across cascaded stages are counted in training. 100 | 101 | #### Preparation: 102 | 103 | 0. Run `./data/scripts/fetch_imagenet_models.sh` to download the ImageNet pre-trained VGG-16 net. 104 | 0. Download the VOC 2007 dataset to ./data/VOCdevkit2007 105 | 0. Run `./data/scripts/fetch_sbd_data.sh` to download the VOC 2012 dataset together with the additional segmentation annotations in [SBD](https://9bc0b5eb4c18f1fc9a28517a91305702c68a10ae.googledrive.com/host/0ByUkob0WA1-NQi1sNlg4WkJQbTg/codes/SBD/download.html) to ./data/VOCdevkitSDS. 106 | 107 | #### 1. End-to-end training of MNC for instance-aware semantic segmentation 108 | 109 | To end-to-end train a 5-stage MNC model (on VOC 2012 train), use `experiments/scripts/mnc_5stage.sh`. Final mAP^r@0.5 should be ~65.0% (mAP^r@0.7 should be ~46.3%), on VOC 2012 validation. 110 | 111 | ```Shell 112 | cd $MNC_ROOT 113 | ./experiments/scripts/mnc_5stage.sh [GPU_ID] VGG16 [--set ...] 114 | # GPU_ID is the GPU you want to train on 115 | # --set ... allows you to specify fast_rcnn.config options, e.g. 116 | # --set EXP_DIR seed_rng 1701 RNG_SEED 1701 117 | ``` 118 | 119 | #### 2. Training of CFM for instance-aware semantic segmentation 120 | 121 | The code also includes an entry to train a [convolutional feature masking](https://arxiv.org/abs/1412.1283) (CFM) model for instance aware semantic segmentation. 122 | 123 | @inproceedings{dai2015convolutional, 124 | title={Convolutional Feature Masking for Joint Object and Stuff Segmentation}, 125 | author={Dai, Jifeng and He, Kaiming and Sun, Jian}, 126 | booktitle={CVPR}, 127 | year={2015} 128 | } 129 | 130 | ##### 2.1. Download pre-computed MCG proposals 131 | 132 | Download and process the pre-computed MCG proposals. 133 | 134 | ```Shell 135 | cd $MNC_ROOT 136 | ./data/scripts/fetch_mcg_data.sh 137 | python ./tools/prepare_mcg_maskdb.py --para_job 24 --db train --output data/cache/voc_2012_train_mcg_maskdb/ 138 | python ./tools/prepare_mcg_maskdb.py --para_job 24 --db val --output data/cache/voc_2012_val_mcg_maskdb/ 139 | ``` 140 | Resulting proposals would be at folder ```data/MCG/```. 141 | 142 | ##### 2.2. Train the model 143 | 144 | Run `experiments/scripts/cfm.sh` to train on VOC 2012 train set. Final mAP^r@0.5 should be ~60.5% (mAP^r@0.7 should be ~42.6%), on VOC 2012 validation. 145 | 146 | ```Shell 147 | cd $MNC_ROOT 148 | ./experiments/scripts/cfm.sh [GPU_ID] VGG16 [--set ...] 149 | # GPU_ID is the GPU you want to train on 150 | # --set ... allows you to specify fast_rcnn.config options, e.g. 151 | # --set EXP_DIR seed_rng 1701 RNG_SEED 1701 152 | ``` 153 | 154 | #### 3. End-to-end training of Faster-RCNN for object detection 155 | 156 | Faster-RCNN can be viewed as a 2-stage cascades composed of region proposal network (RPN) and object detection network. Run script `experiments/scripts/faster_rcnn_end2end.sh` to train a Faster-RCNN model on VOC 2007 trainval. Final mAP^b should be ~69.1% on VOC 2007 test. 
157 | 158 | ```Shell 159 | cd $MNC_ROOT 160 | ./experiments/scripts/faster_rcnn_end2end.sh [GPU_ID] VGG16 [--set ...] 161 | # GPU_ID is the GPU you want to train on 162 | # --set ... allows you to specify fast_rcnn.config options, e.g. 163 | # --set EXP_DIR seed_rng1701 RNG_SEED 1701 164 | ``` 165 | -------------------------------------------------------------------------------- /lib/db/roidb.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Multitask Network Cascade 3 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn) 4 | # Copyright (c) 2016, Haozhi Qi 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # -------------------------------------------------------- 7 | 8 | import PIL 9 | import numpy as np 10 | import os 11 | import cPickle 12 | import scipy 13 | 14 | from db.imdb import get_imdb 15 | from mnc_config import cfg 16 | from transform.bbox_transform import compute_targets 17 | 18 | 19 | def prepare_roidb(imdb): 20 | """ Enrich the imdb's roidb by adding some derived quantities that 21 | are useful for training. This function pre-computes the maximum 22 | overlap, taken over ground-truth boxes, between each ROI and 23 | each ground-truth box. The class with maximum overlap is also 24 | recorded. 25 | """ 26 | sizes = [PIL.Image.open(imdb.image_path_at(i)).size 27 | for i in xrange(imdb.num_images)] 28 | roidb = imdb.roidb 29 | for i in xrange(len(imdb.image_index)): 30 | roidb[i]['image'] = imdb.image_path_at(i) 31 | roidb[i]['width'] = sizes[i][0] 32 | roidb[i]['height'] = sizes[i][1] 33 | # need gt_overlaps as a dense array for argmax 34 | gt_overlaps = roidb[i]['gt_overlaps'].toarray() 35 | # max overlap with gt over classes (columns) 36 | max_overlaps = gt_overlaps.max(axis=1) 37 | # gt class that had the max overlap 38 | max_classes = gt_overlaps.argmax(axis=1) 39 | roidb[i]['max_classes'] = max_classes 40 | roidb[i]['max_overlaps'] = max_overlaps 41 | # sanity checks 42 | # max overlap of 0 => class should be zero (background) 43 | zero_inds = np.where(max_overlaps == 0)[0] 44 | assert all(max_classes[zero_inds] == 0) 45 | # max overlap > 0 => class should not be zero (must be a fg class) 46 | nonzero_inds = np.where(max_overlaps > 0)[0] 47 | assert all(max_classes[nonzero_inds] != 0) 48 | 49 | 50 | def add_bbox_regression_targets(roidb): 51 | """Add information needed to train bounding-box regressors.""" 52 | assert len(roidb) > 0 53 | assert 'max_classes' in roidb[0], 'Did you call prepare_roidb first?' 
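    # Attach a (class, tx, ty, tw, th) target row per ROI, then normalize the
    # regression values, either with the precomputed means/stds from the config
    # or with per-class statistics gathered over the whole roidb.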
54 | 55 | num_images = len(roidb) 56 | # Infer number of classes from the number of columns in gt_overlaps 57 | num_classes = roidb[0]['gt_overlaps'].shape[1] 58 | for im_i in xrange(num_images): 59 | rois = roidb[im_i]['boxes'] 60 | max_overlaps = roidb[im_i]['max_overlaps'] 61 | max_classes = roidb[im_i]['max_classes'] 62 | roidb[im_i]['bbox_targets'] = \ 63 | compute_targets(rois, max_overlaps, max_classes) 64 | 65 | if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED: 66 | # Use fixed / precomputed "means" and "stds" instead of empirical values 67 | means = np.tile( 68 | np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS), (num_classes, 1)) 69 | stds = np.tile( 70 | np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS), (num_classes, 1)) 71 | else: 72 | # Compute values needed for means and stds 73 | # var(x) = E(x^2) - E(x)^2 74 | class_counts = np.zeros((num_classes, 1)) + cfg.EPS 75 | sums = np.zeros((num_classes, 4)) 76 | squared_sums = np.zeros((num_classes, 4)) 77 | for im_i in xrange(num_images): 78 | targets = roidb[im_i]['bbox_targets'] 79 | for cls in xrange(1, num_classes): 80 | cls_inds = np.where(targets[:, 0] == cls)[0] 81 | if cls_inds.size > 0: 82 | class_counts[cls] += cls_inds.size 83 | sums[cls, :] += targets[cls_inds, 1:].sum(axis=0) 84 | squared_sums[cls, :] += \ 85 | (targets[cls_inds, 1:] ** 2).sum(axis=0) 86 | 87 | means = sums / class_counts 88 | stds = np.sqrt(squared_sums / class_counts - means ** 2) 89 | 90 | print 'bbox target means:' 91 | print means 92 | print means[1:, :].mean(axis=0) # ignore bg class 93 | print 'bbox target stdevs:' 94 | print stds 95 | print stds[1:, :].mean(axis=0) # ignore bg class 96 | 97 | # Normalize targets 98 | if cfg.TRAIN.BBOX_NORMALIZE_TARGETS: 99 | print "Normalizing targets" 100 | for im_i in xrange(num_images): 101 | targets = roidb[im_i]['bbox_targets'] 102 | for cls in xrange(1, num_classes): 103 | cls_inds = np.where(targets[:, 0] == cls)[0] 104 | roidb[im_i]['bbox_targets'][cls_inds, 1:] -= means[cls, :] 105 | roidb[im_i]['bbox_targets'][cls_inds, 1:] /= stds[cls, :] 106 | else: 107 | print "NOT normalizing targets" 108 | 109 | # These values will be needed for making predictions 110 | # (the predicts will need to be unnormalized and uncentered) 111 | return means.ravel(), stds.ravel() 112 | 113 | 114 | def get_roidb(imdb_name): 115 | imdb = get_imdb(imdb_name) 116 | print 'Loaded dataset `{:s}` for training'.format(imdb.name) 117 | # Here set handler function. (e.g. gt_roidb in faster RCNN) 118 | imdb.set_roi_handler(cfg.TRAIN.PROPOSAL_METHOD) 119 | print 'Set proposal method: {:s}'.format(cfg.TRAIN.PROPOSAL_METHOD) 120 | if cfg.TRAIN.USE_FLIPPED: 121 | print 'Appending horizontally-flipped training examples...' 122 | imdb.append_flipped_rois() 123 | print 'done' 124 | print 'Preparing training data...' 
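    # prepare_roidb() caches image paths/sizes and each ROI's max-overlap
    # class label, which the data layer and target computation rely on.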
125 | prepare_roidb(imdb) 126 | print 'done' 127 | return imdb.roidb 128 | 129 | 130 | def attach_roidb(imdb_names): 131 | """ 132 | only implement single roidb now 133 | """ 134 | roidbs = [get_roidb(s) for s in imdb_names.split('+')] 135 | roidb = roidbs[0] 136 | if len(roidbs) > 1: 137 | raise NotImplementedError 138 | else: 139 | imdb = get_imdb(imdb_names) 140 | return imdb, roidb 141 | 142 | 143 | def compute_mcg_mean_std(roidb_dir, num_classes): 144 | """ 145 | Compute bbox mean and stds for mcg proposals 146 | Since mcg proposal are stored on disk, so we precomputed it here once 147 | and save them to disk to avoid disk I/O next time 148 | Args: 149 | roidb_dir: directory contain all the mcg proposals 150 | """ 151 | file_list = sorted(os.listdir(roidb_dir)) 152 | target_list = [] 153 | cnt = 0 154 | for file_name in file_list: 155 | roidb_cache = os.path.join(roidb_dir, file_name) 156 | roidb = scipy.io.loadmat(roidb_cache) 157 | target_list.append(compute_targets(roidb['boxes'], roidb['det_overlap'], roidb['output_label'].ravel())) 158 | cnt += 1 159 | 160 | class_counts = np.zeros((num_classes, 1)) + cfg.EPS 161 | sums = np.zeros((num_classes, 4)) 162 | squared_sums = np.zeros((num_classes, 4)) 163 | for im_i in xrange(len(target_list)): 164 | targets = target_list[im_i] 165 | for cls in xrange(1, num_classes): 166 | cls_inds = np.where(targets[:, 0] == cls)[0] 167 | if cls_inds.size > 0: 168 | class_counts[cls] += cls_inds.size 169 | sums[cls, :] += targets[cls_inds, 1:].sum(axis=0) 170 | squared_sums[cls, :] += \ 171 | (targets[cls_inds, 1:] ** 2).sum(axis=0) 172 | 173 | means = sums / class_counts 174 | stds = np.sqrt(squared_sums / class_counts - means ** 2) 175 | np.save('data/cache/mcg_bbox_mean.npy', means) 176 | np.save('data/cache/mcg_bbox_std.npy', stds) 177 | return means, stds 178 | -------------------------------------------------------------------------------- /lib/mnc_config.py: -------------------------------------------------------------------------------- 1 | 2 | """MNC config system 3 | """ 4 | import numpy as np 5 | import os.path 6 | from easydict import EasyDict as edict 7 | 8 | __C = edict() 9 | cfg = __C 10 | 11 | # MNC/CFM mode 12 | __C.MNC_MODE = True 13 | __C.CFM_MODE = False 14 | 15 | __C.EXP_DIR = 'default' 16 | __C.USE_GPU_NMS = True 17 | __C.GPU_ID = 0 18 | __C.RNG_SEED = 3 19 | __C.EPS = 1e-14 20 | __C.PIXEL_MEANS = np.array([[[102.9801, 115.9465, 122.7717]]]) 21 | # Root directory of project 22 | __C.ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) 23 | # Data directory 24 | __C.DATA_DIR = os.path.abspath(os.path.join(__C.ROOT_DIR, 'data')) 25 | # Related to mask resizing and binarize predicted masks 26 | __C.BINARIZE_THRESH = 0.4 27 | # Mask estimation (if any) size (may be different from CFM input size) 28 | __C.MASK_SIZE = 21 29 | 30 | # Training options 31 | __C.TRAIN = edict() 32 | 33 | # ------- General setting ---- 34 | __C.TRAIN.IMS_PER_BATCH = 1 35 | # Batch size for training Region CNN (not RPN) 36 | __C.TRAIN.BATCH_SIZE = 64 37 | # Make minibatches from images that have similar aspect ratios (i.e. both 38 | # tall and thin or both short and wide) in order to avoid wasting computation 39 | # on zero-padding. 
__C.TRAIN.ASPECT_GROUPING = True
# Use flipped images for augmentation
__C.TRAIN.USE_FLIPPED = True
# Resize shortest side to 600
__C.TRAIN.SCALES = (600,)
__C.TRAIN.MAX_SIZE = 1000
__C.TRAIN.SNAPSHOT_ITERS = 5000
__C.TRAIN.SNAPSHOT_INFIX = ''
# Sample FG
__C.TRAIN.FG_FRACTION = [0.3]
__C.TRAIN.FG_THRESH_HI = [1.0]
__C.TRAIN.FG_THRESH_LO = [0.5]
# Sample BG according to remaining samples
__C.TRAIN.BG_FRACTION = [0.85, 0.15]
__C.TRAIN.BG_THRESH_HI = [0.5, 0.1]
__C.TRAIN.BG_THRESH_LO = [0.1, 0.0]

# ------- Proposal -------
__C.TRAIN.PROPOSAL_METHOD = 'gt'

# ------- BBOX Regression ---------
# Train bounding-box regressors
__C.TRAIN.BBOX_REG = True
__C.TRAIN.BBOX_NORMALIZE_TARGETS = True
__C.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED = False
__C.TRAIN.BBOX_THRESH = 0.5
__C.TRAIN.BBOX_NORMALIZE_MEANS = (0.0, 0.0, 0.0, 0.0)
__C.TRAIN.BBOX_NORMALIZE_STDS = (0.1, 0.1, 0.2, 0.2)
# Weight of smooth L1 loss
__C.TRAIN.BBOX_INSIDE_WEIGHTS = (1.0, 1.0, 1.0, 1.0)

# -------- RPN ----------
# Use RPN to detect objects
__C.TRAIN.HAS_RPN = True
# IOU >= thresh: positive example
__C.TRAIN.RPN_POSITIVE_OVERLAP = 0.7
# IOU < thresh: negative example
__C.TRAIN.RPN_NEGATIVE_OVERLAP = 0.3
# If an anchor satisfies both the positive and negative conditions, set it to negative
__C.TRAIN.RPN_CLOBBER_POSITIVES = False
# Max number of foreground examples
# Note this is class-agnostic anchors' FG_FRACTION
__C.TRAIN.RPN_FG_FRACTION = 0.5
# Total number of examples
__C.TRAIN.RPN_BATCHSIZE = 256
# NMS threshold used on RPN proposals
__C.TRAIN.RPN_NMS_THRESH = 0.7
# Number of top scoring boxes to keep before applying NMS to RPN proposals
__C.TRAIN.RPN_PRE_NMS_TOP_N = 12000
# Number of top scoring boxes to keep after applying NMS to RPN proposals
__C.TRAIN.RPN_POST_NMS_TOP_N = 2000
# Proposal height and width both need to be greater than RPN_MIN_SIZE (at orig image scale)
__C.TRAIN.RPN_MIN_SIZE = 16
# Deprecated (outside weights)
__C.TRAIN.RPN_BBOX_INSIDE_WEIGHTS = (1.0, 1.0, 1.0, 1.0)
# Give the positive RPN examples weight of p * 1 / {num positives}
# and give negatives a weight of (1 - p)
# Set to -1.0 to use uniform example weighting
__C.TRAIN.RPN_POSITIVE_WEIGHT = -1.0
# Mix anchors used for RPN and later layer
__C.TRAIN.MIX_INDEX = True

# -------- CFM ----------
__C.TRAIN.CFM_INPUT_MASK_SIZE = 14
__C.TRAIN.FG_DET_THRESH = 0.5
__C.TRAIN.FG_SEG_THRESH = 0.5
__C.TRAIN.FRACTION_SAMPLE = [0.3, 0.5, 0.2]
__C.TRAIN.THRESH_LO_SAMPLE = [0.5, 0.1, 0.0]
__C.TRAIN.THRESH_HI_SAMPLE = [1.0, 0.5, 0.1]

# Test options

__C.TEST = edict()
# Scales to use during testing (can list multiple scales)
# Each scale is the pixel size of an image's shortest side
__C.TEST.SCALES = (600,)

# Max pixel size of the longest side of a scaled input image
__C.TEST.MAX_SIZE = 1000

# Overlap threshold used for non-maximum suppression (suppress boxes with
# IoU >= this threshold)
__C.TEST.NMS = 0.3
# Set this to True in the yml file to use RPN-generated proposals
__C.TEST.HAS_RPN = True
# NMS threshold used on RPN proposals
__C.TEST.RPN_NMS_THRESH = 0.7
# Number of top scoring boxes to keep before applying NMS to RPN proposals
128 | __C.TEST.RPN_PRE_NMS_TOP_N = 6000
129 | # Number of top scoring boxes to keep after applying NMS to RPN proposals
130 | __C.TEST.RPN_POST_NMS_TOP_N = 300
131 | # Proposal height and width both need to be greater than RPN_MIN_SIZE (at orig image scale)
132 | __C.TEST.RPN_MIN_SIZE = 16
133 | __C.TEST.BBOX_REG = True
134 | 
135 | # Aggregate nearby masks inside a box; the box IoU threshold
136 | __C.TEST.MASK_MERGE_IOU_THRESH = 0.5
137 | __C.TEST.MASK_MERGE_NMS_THRESH = 0.3
138 | __C.TEST.CFM_INPUT_MASK_SIZE = 14
139 | 
140 | # Used for multi-scale testing: since a naive implementation
141 | # will waste a lot of computation on zero-padding, we group each
142 | # $GROUP_SCALE scales to feed to the GPU, and the max number of rois
143 | # for each group is specified in MAX_ROIS_GPU
144 | __C.TEST.MAX_ROIS_GPU = [2000]
145 | __C.TEST.GROUP_SCALE = 1
146 | 
147 | # 0 means use all the MCG proposals
148 | __C.TEST.USE_TOP_K_MCG = 0
149 | 
150 | # whether to merge masks during testing (and whether to do it on the GPU)
151 | __C.TEST.USE_MASK_MERGE = True
152 | __C.TEST.USE_GPU_MASK_MERGE = True
153 | 
154 | 
155 | def get_output_dir(imdb, net):
156 |     """ Return the directory where experimental artifacts are placed.
157 |     A canonical path is built using the name from an imdb and a network
158 |     (if not None).
159 |     """
160 |     path = os.path.abspath(os.path.join(__C.ROOT_DIR, 'output', __C.EXP_DIR, imdb.name))
161 |     if net is None:
162 |         return path
163 |     else:
164 |         return os.path.join(path, net.name)
165 | 
166 | 
167 | def _merge_two_config(user_cfg, default_cfg):
168 |     """ Merge the user's config into the default config dictionary, clobbering
169 |     the options in default_cfg whenever they are also specified in user_cfg.
170 |     Ensures the two values under the same key have the same type, and
171 |     merges recursively when it encounters a nested dictionary.
172 |     """
173 |     if type(user_cfg) is not edict:
174 |         return
175 |     for key, val in user_cfg.iteritems():
176 |         # user_cfg must be a subset of default_cfg
177 |         if not default_cfg.has_key(key):
178 |             raise KeyError('{} is not a valid config key'.format(key))
179 | 
180 |         if type(default_cfg[key]) is not type(val):
181 |             if isinstance(default_cfg[key], np.ndarray):
182 |                 val = np.array(val, dtype=default_cfg[key].dtype)
183 |             else:
184 |                 raise ValueError(
185 |                     'Type mismatch ({} vs. {}) '
186 |                     'for config key: {}'.format(type(default_cfg[key]),
187 |                                                 type(val), key))
188 |         # Recursively merge config
189 |         if type(val) is edict:
190 |             try:
191 |                 _merge_two_config(user_cfg[key], default_cfg[key])
192 |             except:
193 |                 print 'Error under config key: {}'.format(key)
194 |                 raise
195 |         else:
196 |             default_cfg[key] = val
197 | 
198 | 
199 | def cfg_from_file(file_name):
200 |     """ Load a config file and merge it into the default options.
201 |     """
202 |     import yaml
203 |     with open(file_name, 'r') as f:
204 |         yaml_cfg = edict(yaml.load(f))
205 | 
206 |     _merge_two_config(yaml_cfg, __C)
207 | 
--------------------------------------------------------------------------------
/lib/transform/bbox_transform.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Multitask Network Cascade
3 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn)
4 | # Copyright (c) 2016, Haozhi Qi
5 | # Licensed under The MIT License [see LICENSE for details]
6 | # --------------------------------------------------------
7 | 
8 | import numpy as np
9 | from utils.cython_bbox import bbox_overlaps
10 | from mnc_config import cfg
11 | 
12 | 
13 | def compute_targets(rois, overlaps, labels):
14 |     """
15 |     Compute bounding-box regression targets for an image.
16 |     """
17 |     # Indices of ground-truth ROIs
18 |     gt_inds = np.where(overlaps == 1)[0]
19 |     # Indices of examples for which we try to make predictions
20 |     ex_inds = np.where(overlaps >= cfg.TRAIN.BBOX_THRESH)[0]
21 | 
22 |     # Get IoU overlap between each ex ROI and gt ROI
23 |     ex_gt_overlaps = bbox_overlaps(
24 |         np.ascontiguousarray(rois[ex_inds, :], dtype=np.float),
25 |         np.ascontiguousarray(rois[gt_inds, :], dtype=np.float))
26 | 
27 |     # Find which gt ROI each ex ROI has max overlap with:
28 |     # this will be the ex ROI's gt target
29 |     gt_assignment = ex_gt_overlaps.argmax(axis=1)
30 |     gt_rois = rois[gt_inds[gt_assignment], :]
31 |     ex_rois = rois[ex_inds, :]
32 | 
33 |     targets = np.zeros((rois.shape[0], 5), dtype=np.float32)
34 |     targets[ex_inds, 0] = labels[ex_inds]
35 |     targets[ex_inds, 1:] = bbox_transform(ex_rois, gt_rois)
36 |     return targets
37 | 
38 | 
39 | def bbox_transform(ex_rois, gt_rois):
40 |     """
41 |     Compute bbox regression targets of external rois
42 |     with respect to gt rois
43 |     """
44 |     ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + 1.0
45 |     ex_heights = ex_rois[:, 3] - ex_rois[:, 1] + 1.0
46 |     ex_ctr_x = ex_rois[:, 0] + 0.5 * ex_widths
47 |     ex_ctr_y = ex_rois[:, 1] + 0.5 * ex_heights
48 | 
49 |     gt_widths = gt_rois[:, 2] - gt_rois[:, 0] + 1.0
50 |     gt_heights = gt_rois[:, 3] - gt_rois[:, 1] + 1.0
51 |     gt_ctr_x = gt_rois[:, 0] + 0.5 * gt_widths
52 |     gt_ctr_y = gt_rois[:, 1] + 0.5 * gt_heights
53 | 
54 |     targets_dx = (gt_ctr_x - ex_ctr_x) / ex_widths
55 |     targets_dy = (gt_ctr_y - ex_ctr_y) / ex_heights
56 |     targets_dw = np.log(gt_widths / ex_widths)
57 |     targets_dh = np.log(gt_heights / ex_heights)
58 | 
59 |     targets = np.vstack(
60 |         (targets_dx, targets_dy, targets_dw, targets_dh)).transpose()
61 |     return targets
62 | 
63 | 
64 | def bbox_transform_inv(boxes, deltas):
65 |     """
66 |     Invert the bounding-box transform:
67 |     apply deltas to anchors to get transformed proposals
68 |     """
69 |     if boxes.shape[0] == 0:
70 |         return np.zeros((0, deltas.shape[1]), dtype=deltas.dtype)
71 | 
72 |     boxes = boxes.astype(deltas.dtype, copy=False)
73 | 
74 |     widths = boxes[:, 2] - boxes[:, 0] + 1.0
75 |     heights = boxes[:, 3] - boxes[:, 1] + 1.0
76 |     ctr_x = boxes[:, 0] + 0.5 * widths
77 |     ctr_y = boxes[:, 1] + 0.5 * heights
78 | 
79 |     dx = deltas[:, 0::4]
80 |     dy = deltas[:, 1::4]
81 |     dw = deltas[:, 2::4]
82 |     dh = deltas[:, 3::4]
83 | 
84 |     pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis]
85 |     pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis]
86 |     pred_w = np.exp(dw) * widths[:, np.newaxis]
87 |     pred_h = np.exp(dh) * heights[:, np.newaxis]
88 | 
89 |     pred_boxes = np.zeros(deltas.shape, dtype=deltas.dtype)
90 |     # x1
91 |     pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w
92 |     # y1
93 |     pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h
94 |     # x2
95 |     pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w
96 |     # y2
97 |     pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h
98 | 
99 |     return pred_boxes
100 | 
101 | 
102 | def clip_boxes(boxes, im_shape):
103 |     """
104 |     Clip boxes to lie inside the image boundaries
105 |     """
106 |     x1 = boxes[:, 0::4]
107 |     y1 = boxes[:, 1::4]
108 |     x2 = boxes[:, 2::4]
109 |     y2 = boxes[:, 3::4]
110 |     keep = np.where((x1 >= 0) & (x2 <= im_shape[1] - 1) & (y1 >= 0) & (y2 <= im_shape[0] - 1))[0]
111 |     clipped_boxes = np.zeros(boxes.shape, dtype=boxes.dtype)
112 |     # x1 >= 0
113 |     clipped_boxes[:, 0::4] = np.maximum(np.minimum(boxes[:, 0::4], im_shape[1] - 1), 0)
114 |     # y1 >= 0
115 |     clipped_boxes[:, 1::4] = np.maximum(np.minimum(boxes[:, 1::4], im_shape[0] - 1), 0)
116 |     # x2 < im_shape[1]
117 |     clipped_boxes[:, 2::4] = np.maximum(np.minimum(boxes[:, 2::4], im_shape[1] - 1), 0)
118 |     # y2 < im_shape[0]
119 |     clipped_boxes[:, 3::4] = np.maximum(np.minimum(boxes[:, 3::4], im_shape[0] - 1), 0)
120 |     return clipped_boxes, keep
121 | 
122 | 
123 | def filter_small_boxes(boxes, min_size):
124 |     """
125 |     Remove all boxes with any side smaller than min_size.
126 |     """
127 |     ws = boxes[:, 2] - boxes[:, 0] + 1
128 |     hs = boxes[:, 3] - boxes[:, 1] + 1
129 |     keep = np.where((ws >= min_size) & (hs >= min_size))[0]
130 |     return keep
131 | 
132 | 
133 | def scale_boxes(boxes, alpha):
134 |     """
135 |     Scale boxes from w/h to alpha * w/h while keeping the center unchanged
136 |     Args:
137 |         boxes: a set of boxes specified using x1, y1, x2, y2
138 |         alpha: scaling factor
139 | 
140 |     Returns:
141 |         boxes: boxes after applying scaling
142 |     """
143 |     w = boxes[:, 2] - boxes[:, 0] + 1
144 |     h = boxes[:, 3] - boxes[:, 1] + 1
145 |     ctr_x = boxes[:, 0] + 0.5 * w
146 |     ctr_y = boxes[:, 1] + 0.5 * h
147 |     scaled_w = w * alpha
148 |     scaled_h = h * alpha
149 |     scaled_boxes = np.zeros(boxes.shape, dtype=boxes.dtype)
150 |     scaled_boxes[:, 0] = ctr_x - 0.5 * scaled_w
151 |     scaled_boxes[:, 1] = ctr_y - 0.5 * scaled_h
152 |     scaled_boxes[:, 2] = ctr_x + 0.5 * scaled_w
153 |     scaled_boxes[:, 3] = ctr_y + 0.5 * scaled_h
154 |     return scaled_boxes
155 | 
156 | 
157 | def bbox_compute_targets(ex_rois, gt_rois, normalize):
158 |     """
159 |     Compute bounding-box regression targets for an image
160 |     Parameters:
161 |     -----------
162 |     ex_rois: ROIs from an external source (anchors or proposals)
163 |     gt_rois: ground truth ROIs
164 |     normalize: whether to normalize the targets (RPN targets are not normalized)
165 | 
166 |     Returns:
167 |     -----------
168 |     Relative regression targets for the anchors or proposals
169 |     """
170 |     assert ex_rois.shape == gt_rois.shape
171 | 
172 |     targets = bbox_transform(ex_rois, gt_rois)
173 |     if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED and normalize:
174 |         # Optionally normalize targets by a precomputed mean and std
175 |         targets = ((targets - np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS)) /
176 |                    np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS))
177 | 
178 |     return targets.astype(np.float32, copy=False)
179 | 
180 | 
181 | def get_bbox_regression_label(bbox_target_data, num_class):
182 |     """Bounding-box regression targets (bbox_target_data) are stored in a
183 |     compact form N x (class, tx, ty, tw, th)
184 | 
185 |     This function expands those targets into the 4-of-4*K representation used
186 |     by the network (i.e. only one class has non-zero targets).
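 
    Example (illustrative): with num_class = 21, a RoI whose class label is
    cls = 5 has its four targets written to columns 20:24 (start = 4 * cls),
    and bbox_inside_weights gets cfg.TRAIN.BBOX_INSIDE_WEIGHTS on the same
    slice; all other columns remain zero.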
187 | 188 | Returns: 189 | bbox_target (ndarray): N x 4K blob of regression targets 190 | bbox_inside_weights (ndarray): N x 4K blob of loss weights 191 | """ 192 | assert bbox_target_data.shape[1] == 5 193 | clss = bbox_target_data[:, 0] 194 | bbox_targets = np.zeros((clss.size, 4 * num_class), dtype=np.float32) 195 | bbox_inside_weights = np.zeros(bbox_targets.shape, dtype=np.float32) 196 | inds = np.where(clss > 0)[0] 197 | for ind in inds: 198 | cls = clss[ind] 199 | start = 4 * cls 200 | end = start + 4 201 | bbox_targets[ind, start:end] = bbox_target_data[ind, 1:] 202 | bbox_inside_weights[ind, start:end] = cfg.TRAIN.BBOX_INSIDE_WEIGHTS 203 | return bbox_targets, bbox_inside_weights 204 | -------------------------------------------------------------------------------- /tools/demo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # -------------------------------------------------------- 4 | # Multitask Network Cascade 5 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn) 6 | # Copyright (c) 2016, Haozhi Qi 7 | # Licensed under The MIT License [see LICENSE for details] 8 | # -------------------------------------------------------- 9 | 10 | # Standard module 11 | import os 12 | import argparse 13 | import time 14 | import cv2 15 | import numpy as np 16 | # User-defined module 17 | import _init_paths 18 | import caffe 19 | from mnc_config import cfg 20 | from transform.bbox_transform import clip_boxes 21 | from utils.blob import prep_im_for_blob, im_list_to_blob 22 | from transform.mask_transform import gpu_mask_voting 23 | import matplotlib.pyplot as plt 24 | from utils.vis_seg import _convert_pred_to_image, _get_voc_color_map 25 | from PIL import Image 26 | 27 | # VOC 20 classes 28 | CLASSES = ('aeroplane', 'bicycle', 'bird', 'boat', 29 | 'bottle', 'bus', 'car', 'cat', 'chair', 30 | 'cow', 'diningtable', 'dog', 'horse', 31 | 'motorbike', 'person', 'pottedplant', 32 | 'sheep', 'sofa', 'train', 'tvmonitor') 33 | 34 | 35 | def parse_args(): 36 | """Parse input arguments.""" 37 | parser = argparse.ArgumentParser(description='MNC demo') 38 | parser.add_argument('--gpu', dest='gpu_id', help='GPU device id to use [0]', 39 | default=0, type=int) 40 | parser.add_argument('--cpu', dest='cpu_mode', 41 | help='Use CPU mode (overrides --gpu)', 42 | action='store_true') 43 | parser.add_argument('--def', dest='prototxt', 44 | help='prototxt file defining the network', 45 | default='./models/VGG16/mnc_5stage/test.prototxt', type=str) 46 | parser.add_argument('--net', dest='caffemodel', 47 | help='model to test', 48 | default='./data/mnc_model/mnc_model.caffemodel.h5', type=str) 49 | 50 | args = parser.parse_args() 51 | return args 52 | 53 | 54 | def prepare_mnc_args(im, net): 55 | # Prepare image data blob 56 | blobs = {'data': None} 57 | processed_ims = [] 58 | im, im_scale_factors = \ 59 | prep_im_for_blob(im, cfg.PIXEL_MEANS, cfg.TEST.SCALES[0], cfg.TRAIN.MAX_SIZE) 60 | processed_ims.append(im) 61 | blobs['data'] = im_list_to_blob(processed_ims) 62 | # Prepare image info blob 63 | im_scales = [np.array(im_scale_factors)] 64 | assert len(im_scales) == 1, 'Only single-image batch implemented' 65 | im_blob = blobs['data'] 66 | blobs['im_info'] = np.array( 67 | [[im_blob.shape[2], im_blob.shape[3], im_scales[0]]], 68 | dtype=np.float32) 69 | # Reshape network inputs and do forward 70 | net.blobs['data'].reshape(*blobs['data'].shape) 71 | net.blobs['im_info'].reshape(*blobs['im_info'].shape) 72 | forward_kwargs 
= {
73 |         'data': blobs['data'].astype(np.float32, copy=False),
74 |         'im_info': blobs['im_info'].astype(np.float32, copy=False)
75 |     }
76 |     return forward_kwargs, im_scales
77 | 
78 | 
79 | def im_detect(im, net):
80 |     forward_kwargs, im_scales = prepare_mnc_args(im, net)
81 |     blobs_out = net.forward(**forward_kwargs)
82 |     # outputs we need to collect:
83 |     # 1. output from phase 1
84 |     rois_phase1 = net.blobs['rois'].data.copy()
85 |     masks_phase1 = net.blobs['mask_proposal'].data[...]
86 |     scores_phase1 = net.blobs['seg_cls_prob'].data[...]
87 |     # 2. output from phase 2
88 |     rois_phase2 = net.blobs['rois_ext'].data[...]
89 |     masks_phase2 = net.blobs['mask_proposal_ext'].data[...]
90 |     scores_phase2 = net.blobs['seg_cls_prob_ext'].data[...]
91 |     # Boxes are in resized space, so we un-scale them back
92 |     rois_phase1 = rois_phase1[:, 1:5] / im_scales[0]
93 |     rois_phase2 = rois_phase2[:, 1:5] / im_scales[0]
94 |     rois_phase1, _ = clip_boxes(rois_phase1, im.shape)
95 |     rois_phase2, _ = clip_boxes(rois_phase2, im.shape)
96 |     # Concatenate the two stages to get the final network output
97 |     masks = np.concatenate((masks_phase1, masks_phase2), axis=0)
98 |     boxes = np.concatenate((rois_phase1, rois_phase2), axis=0)
99 |     scores = np.concatenate((scores_phase1, scores_phase2), axis=0)
100 |     return boxes, masks, scores
101 | 
102 | 
103 | def get_vis_dict(result_box, result_mask, img_name, cls_names, vis_thresh=0.5):
104 |     box_for_img = []
105 |     mask_for_img = []
106 |     cls_for_img = []
107 |     for cls_ind, cls_name in enumerate(cls_names):
108 |         det_for_img = result_box[cls_ind]
109 |         seg_for_img = result_mask[cls_ind]
110 |         keep_inds = np.where(det_for_img[:, -1] >= vis_thresh)[0]
111 |         for keep in keep_inds:
112 |             box_for_img.append(det_for_img[keep])
113 |             mask_for_img.append(seg_for_img[keep][0])
114 |             cls_for_img.append(cls_ind + 1)
115 |     res_dict = {'image_name': img_name,
116 |                 'cls_name': cls_for_img,
117 |                 'boxes': box_for_img,
118 |                 'masks': mask_for_img}
119 |     return res_dict
120 | 
121 | if __name__ == '__main__':
122 |     args = parse_args()
123 |     test_prototxt = args.prototxt
124 |     test_model = args.caffemodel
125 | 
126 |     caffe.set_mode_gpu()
127 |     caffe.set_device(args.gpu_id)
128 |     cfg.GPU_ID = args.gpu_id
129 |     net = caffe.Net(test_prototxt, test_model, caffe.TEST)
130 | 
131 |     # Warm up: run two dummy forward passes
132 |     im = 128 * np.ones((300, 500, 3), dtype=np.float32)
133 |     for i in xrange(2):
134 |         _, _, _ = im_detect(im, net)
135 | 
136 |     im_names = ['2008_000533.jpg', '2008_000910.jpg', '2008_001602.jpg',
137 |                 '2008_001717.jpg', '2008_008093.jpg']
138 |     demo_dir = './data/demo'
139 |     for im_name in im_names:
140 |         print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
141 |         print 'Demo for data/demo/{}'.format(im_name)
142 |         gt_image = os.path.join(demo_dir, im_name)
143 |         im = cv2.imread(gt_image)
144 |         start = time.time()
145 |         boxes, masks, seg_scores = im_detect(im, net)
146 |         end = time.time()
147 |         print 'forward time %f' % (end-start)
148 |         result_mask, result_box = gpu_mask_voting(masks, boxes, seg_scores, len(CLASSES) + 1,
149 |                                                   100, im.shape[1], im.shape[0])
150 |         pred_dict = get_vis_dict(result_box, result_mask, 'data/demo/' + im_name, CLASSES)
151 | 
152 |         img_width = im.shape[1]
153 |         img_height = im.shape[0]
154 | 
155 |         inst_img, cls_img = _convert_pred_to_image(img_width, img_height, pred_dict)
156 |         color_map = _get_voc_color_map()
157 |         target_cls_file = os.path.join(demo_dir, 'cls_' + im_name)
158 |         cls_out_img = np.zeros((img_height, img_width, 3))
159 |         for i in xrange(img_height):
160 |             for j in 
xrange(img_width): 161 | cls_out_img[i][j] = color_map[cls_img[i][j]][::-1] 162 | cv2.imwrite(target_cls_file, cls_out_img) 163 | 164 | background = Image.open(gt_image) 165 | mask = Image.open(target_cls_file) 166 | background = background.convert('RGBA') 167 | mask = mask.convert('RGBA') 168 | superimpose_image = Image.blend(background, mask, 0.8) 169 | superimpose_name = os.path.join(demo_dir, 'final_' + im_name) 170 | superimpose_image.save(superimpose_name, 'JPEG') 171 | im = cv2.imread(superimpose_name) 172 | 173 | im = im[:, :, (2, 1, 0)] 174 | fig, ax = plt.subplots(figsize=(12, 12)) 175 | ax.imshow(im, aspect='equal') 176 | classes = pred_dict['cls_name'] 177 | for i in xrange(len(classes)): 178 | score = pred_dict['boxes'][i][-1] 179 | bbox = pred_dict['boxes'][i][:4] 180 | cls_ind = classes[i] - 1 181 | ax.text(bbox[0], bbox[1] - 8, 182 | '{:s} {:.4f}'.format(CLASSES[cls_ind], score), 183 | bbox=dict(facecolor='blue', alpha=0.5), 184 | fontsize=14, color='white') 185 | plt.axis('off') 186 | plt.tight_layout() 187 | plt.draw() 188 | 189 | fig.savefig(os.path.join(demo_dir, im_name[:-4]+'.png')) 190 | os.remove(superimpose_name) 191 | os.remove(target_cls_file) 192 | -------------------------------------------------------------------------------- /lib/pylayer/proposal_target_layer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Multitask Network Cascade 3 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn) 4 | # Copyright (c) 2016, Haozhi Qi 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # -------------------------------------------------------- 7 | 8 | import caffe 9 | import yaml 10 | import numpy as np 11 | import numpy.random as npr 12 | from mnc_config import cfg 13 | from transform.bbox_transform import \ 14 | bbox_transform, bbox_compute_targets, \ 15 | scale_boxes, get_bbox_regression_label 16 | from transform.anchors import generate_anchors 17 | from transform.mask_transform import intersect_mask 18 | from utils.cython_bbox import bbox_overlaps 19 | 20 | 21 | class ProposalTargetLayer(caffe.Layer): 22 | """ 23 | Assign object detection proposals to ground-truth targets. Produces proposal 24 | classification labels and bounding-box regression targets. 
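 
    A sketch of how this layer is typically wired into a train prototxt
    (blob names and the param_str value here are illustrative, following the
    py-faster-rcnn convention; see the repo's model files for the real ones):
 
        layer {
          name: 'proposal_target'
          type: 'Python'
          bottom: 'rpn_rois'
          bottom: 'gt_boxes'
          bottom: 'im_info'
          top: 'rois'
          top: 'labels'
          top: 'bbox_targets'
          top: 'bbox_inside_weights'
          top: 'bbox_outside_weights'
          python_param {
            module: 'pylayer.proposal_target_layer'
            layer: 'ProposalTargetLayer'
            param_str: "'num_classes': 21"
          }
        }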
25 | """ 26 | 27 | def setup(self, bottom, top): 28 | layer_params = yaml.load(self.param_str_) 29 | self._anchors = generate_anchors() 30 | self._num_anchors = self._anchors.shape[0] 31 | self._num_classes = layer_params['num_classes'] 32 | self._bp_all = layer_params.get('bp_all', True) 33 | self._top_name_map = {} 34 | top[0].reshape(1, 5) 35 | self._top_name_map['rois'] = 0 36 | top[1].reshape(1, 1) 37 | self._top_name_map['labels'] = 1 38 | top[2].reshape(1, self._num_classes * 4) 39 | self._top_name_map['bbox_targets'] = 2 40 | top[3].reshape(1, self._num_classes * 4) 41 | self._top_name_map['bbox_inside_weights'] = 3 42 | top[4].reshape(1, self._num_classes * 4) 43 | self._top_name_map['bbox_outside_weights'] = 4 44 | # Add mask-related information 45 | if cfg.MNC_MODE: 46 | top[5].reshape(1, 1, cfg.MASK_SIZE, cfg.MASK_SIZE) 47 | self._top_name_map['mask_targets'] = 5 48 | top[6].reshape(1, 1, cfg.MASK_SIZE, cfg.MASK_SIZE) 49 | self._top_name_map['mask_weight'] = 6 50 | top[7].reshape(1, 4) 51 | self._top_name_map['gt_masks_info'] = 7 52 | if cfg.TRAIN.MIX_INDEX: 53 | top[8].reshape(1, 4) 54 | self._top_name_map['fg_inds'] = 8 55 | top[9].reshape(1, 4) 56 | self._top_name_map['bg_inds'] = 9 57 | 58 | def reshape(self, bottom, top): 59 | """Reshaping happens during the call to forward.""" 60 | pass 61 | 62 | def forward(self, bottom, top): 63 | # Proposal ROIs (0, x1, y1, x2, y2) coming from RPN 64 | # (i.e., rpn.proposal_layer.ProposalLayer), or any other source 65 | all_rois = bottom[0].data 66 | # GT boxes (x1, y1, x2, y2, label) 67 | gt_boxes = bottom[1].data 68 | im_info = bottom[2].data[0, :] 69 | im_scale = im_info[2] 70 | # get original masks 71 | if cfg.MNC_MODE: 72 | gt_masks = bottom[3].data 73 | mask_info = bottom[4].data 74 | else: 75 | gt_masks = None 76 | mask_info = None 77 | # Include ground-truth boxes in the set of candidate rois 78 | zeros = np.zeros((gt_boxes.shape[0], 1), dtype=gt_boxes.dtype) 79 | all_rois = np.vstack( 80 | (all_rois, np.hstack((zeros, gt_boxes[:, :-1]))) 81 | ) 82 | 83 | # Sanity check: single batch only 84 | assert np.all(all_rois[:, 0] == 0), \ 85 | 'Only single item batches are supported' 86 | 87 | num_images = 1 88 | rois_per_image = cfg.TRAIN.BATCH_SIZE / num_images 89 | # Sample rois with classification labels and bounding box regression targets 90 | 91 | blobs, fg_inds, bg_inds, keep_inds = _sample_rois( 92 | all_rois, gt_boxes, rois_per_image, self._num_classes, gt_masks, im_scale, mask_info) 93 | self._keep_ind = keep_inds if self._bp_all else fg_inds 94 | 95 | for blob_name, blob in blobs.iteritems(): 96 | top[self._top_name_map[blob_name]].reshape(*blob.shape) 97 | top[self._top_name_map[blob_name]].data[...] = blob.astype(np.float32, copy=False) 98 | 99 | if cfg.TRAIN.MIX_INDEX: 100 | all_rois_index = bottom[5].data 101 | fg_inds = fg_inds[fg_inds < all_rois_index.shape[1]].astype(int) 102 | fg_inds = all_rois_index[0, fg_inds] 103 | bg_inds = all_rois_index[0, bg_inds.astype(int)] 104 | top[self._top_name_map['fg_inds']].reshape(*fg_inds.shape) 105 | top[self._top_name_map['fg_inds']].data[...] = fg_inds 106 | top[self._top_name_map['bg_inds']].reshape(*bg_inds.shape) 107 | top[self._top_name_map['bg_inds']].data[...] = bg_inds 108 | 109 | def backward(self, top, propagate_down, bottom): 110 | if propagate_down[0]: 111 | bottom[0].diff.fill(0.) 
112 | # Eliminate gt_inds from the keep inds 113 | valid_inds = np.where(self._keep_ind < bottom[0].diff.shape[0])[0] 114 | valid_bot_inds = self._keep_ind[valid_inds].astype(int) 115 | bottom[0].diff[valid_bot_inds, :] = top[0].diff[valid_inds, :] 116 | 117 | 118 | def _sample_rois(all_rois, gt_boxes, rois_per_image, num_classes, gt_masks, im_scale, mask_info): 119 | """ 120 | Generate a random sample of RoIs comprising 121 | foreground and background examples. 122 | """ 123 | # overlaps: (rois x gt_boxes) 124 | overlaps = bbox_overlaps( 125 | np.ascontiguousarray(all_rois[:, 1:5], dtype=np.float), 126 | np.ascontiguousarray(gt_boxes[:, :4], dtype=np.float)) 127 | gt_assignment = overlaps.argmax(axis=1) 128 | max_overlaps = overlaps.max(axis=1) 129 | labels = gt_boxes[gt_assignment, 4] 130 | 131 | # Sample foreground indexes 132 | fg_inds = [] 133 | for i in xrange(len(cfg.TRAIN.FG_FRACTION)): 134 | cur_inds = np.where((max_overlaps >= cfg.TRAIN.FG_THRESH_LO[i]) & 135 | (max_overlaps <= cfg.TRAIN.FG_THRESH_HI[i]))[0] 136 | cur_rois_this_image = min(cur_inds.size, np.round(rois_per_image * 137 | cfg.TRAIN.FG_FRACTION[i])) 138 | if cur_inds.size > 0: 139 | cur_inds = npr.choice(cur_inds, size=cur_rois_this_image, replace=False) 140 | fg_inds = np.hstack((fg_inds, cur_inds)) 141 | fg_inds = np.unique(fg_inds) 142 | fg_rois_per_image = fg_inds.size 143 | # Sample background indexes according to number of foreground 144 | bg_rois_per_this_image = rois_per_image - fg_rois_per_image 145 | bg_inds = [] 146 | for i in xrange(len(cfg.TRAIN.BG_FRACTION)): 147 | cur_inds = np.where((max_overlaps >= cfg.TRAIN.BG_THRESH_LO[i]) & 148 | (max_overlaps <= cfg.TRAIN.BG_THRESH_HI[i]))[0] 149 | cur_rois_this_image = min(cur_inds.size, np.round(bg_rois_per_this_image * 150 | cfg.TRAIN.BG_FRACTION[i])) 151 | if cur_inds.size > 0: 152 | cur_inds = npr.choice(cur_inds, size=cur_rois_this_image, replace=False) 153 | bg_inds = np.hstack((bg_inds, cur_inds)) 154 | bg_inds = np.unique(bg_inds) 155 | 156 | # The indices that we're selecting (both fg and bg) 157 | keep_inds = np.append(fg_inds, bg_inds).astype(int) 158 | # Select sampled values from various arrays: 159 | labels = labels[keep_inds] 160 | # Clamp labels for the background RoIs to 0 161 | labels[fg_rois_per_image:] = 0 162 | rois = all_rois[keep_inds] 163 | 164 | bbox_target_data = bbox_compute_targets( 165 | rois[:, 1:5], gt_boxes[gt_assignment[keep_inds], :4], normalize=True) 166 | bbox_target_data = np.hstack((labels[:, np.newaxis], bbox_target_data))\ 167 | .astype(np.float32, copy=False) 168 | bbox_targets, bbox_inside_weights = get_bbox_regression_label( 169 | bbox_target_data, num_classes) 170 | bbox_outside_weights = np.array(bbox_inside_weights > 0).astype(np.float32) 171 | 172 | blobs = { 173 | 'rois': rois, 174 | 'labels': labels, 175 | 'bbox_targets': bbox_targets, 176 | 'bbox_inside_weights': bbox_inside_weights, 177 | 'bbox_outside_weights': bbox_outside_weights 178 | } 179 | 180 | if cfg.MNC_MODE: 181 | scaled_rois = rois[:, 1:5] / float(im_scale) 182 | 183 | # map to original image space 184 | scaled_gt_boxes = gt_boxes[:, :4] / float(im_scale) 185 | pos_masks = np.zeros((len(keep_inds), 1, cfg.MASK_SIZE, cfg.MASK_SIZE)) 186 | top_mask_info = np.zeros((len(keep_inds), 12)) 187 | top_mask_info[len(fg_inds):, :] = -1 188 | 189 | for i, val in enumerate(fg_inds): 190 | gt_box = scaled_gt_boxes[gt_assignment[val]] 191 | gt_box = np.around(gt_box).astype(int) 192 | ex_box = np.around(scaled_rois[i]).astype(int) 193 | gt_mask = 
gt_masks[gt_assignment[val]] 194 | gt_mask_info = mask_info[gt_assignment[val]] 195 | gt_mask = gt_mask[0:gt_mask_info[0], 0:gt_mask_info[1]] 196 | # calculate mask regression targets 197 | # (intersection of bounding box and gt mask) 198 | ex_mask = intersect_mask(ex_box, gt_box, gt_mask) 199 | 200 | pos_masks[i, ...] = ex_mask 201 | top_mask_info[i, 0] = gt_assignment[val] 202 | top_mask_info[i, 1] = gt_mask_info[0] 203 | top_mask_info[i, 2] = gt_mask_info[1] 204 | top_mask_info[i, 3] = labels[i] 205 | 206 | top_mask_info[i, 4:8] = ex_box 207 | top_mask_info[i, 8:12] = gt_box 208 | 209 | mask_weight = np.zeros((rois.shape[0], 1, cfg.MASK_SIZE, cfg.MASK_SIZE)) 210 | # only assign box-level foreground as positive mask regression 211 | mask_weight[0:len(fg_inds), :, :, :] = 1 212 | blobs['mask_targets'] = pos_masks 213 | blobs['mask_weight'] = mask_weight 214 | blobs['gt_masks_info'] = top_mask_info 215 | 216 | return blobs, fg_inds, bg_inds, keep_inds 217 | -------------------------------------------------------------------------------- /lib/pylayer/anchor_target_layer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Multitask Network Cascade 3 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn) 4 | # Copyright (c) 2016, Haozhi Qi 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # -------------------------------------------------------- 7 | 8 | import yaml 9 | import numpy as np 10 | 11 | import caffe 12 | from transform.anchors import generate_anchors 13 | from utils.cython_bbox import bbox_overlaps 14 | from utils.unmap import unmap 15 | from mnc_config import cfg 16 | from transform.bbox_transform import bbox_transform 17 | 18 | 19 | class AnchorTargetLayer(caffe.Layer): 20 | """ 21 | Assign anchors to ground-truth targets. Produces anchor classification 22 | labels and bounding-box regression targets. 23 | """ 24 | 25 | def setup(self, bottom, top): 26 | self._anchors = generate_anchors() 27 | self._num_anchors = self._anchors.shape[0] 28 | 29 | layer_params = yaml.load(self.param_str_) 30 | self._feat_stride = layer_params['feat_stride'] 31 | 32 | # allow boxes to sit over the edge by a small amount 33 | self._allowed_border = layer_params.get('allowed_border', 0) 34 | 35 | height, width = bottom[0].data.shape[-2:] 36 | 37 | A = self._num_anchors 38 | # labels 39 | top[0].reshape(1, 1, A * height, width) 40 | # bbox_targets 41 | top[1].reshape(1, A * 4, height, width) 42 | # bbox_inside_weights 43 | top[2].reshape(1, A * 4, height, width) 44 | # bbox_outside_weights 45 | top[3].reshape(1, A * 4, height, width) 46 | 47 | def reshape(self, bottom, top): 48 | """Reshaping happens during the call to forward""" 49 | pass 50 | 51 | def forward(self, bottom, top): 52 | # Algorithm: 53 | # 54 | # for each (H, W) location i 55 | # generate 9 anchor boxes centered on cell i 56 | # apply predicted transform deltas at cell i to each of the 9 anchors 57 | # filter out-of-image anchors 58 | # measure GT overlap 59 | # 60 | # Output target referenced value 61 | height, width = bottom[0].data.shape[-2:] 62 | assert bottom[0].data.shape[0] == 1, 'Only single item batches are supported' 63 | gt_boxes = bottom[1].data 64 | im_info = bottom[2].data[0, :] 65 | 66 | # 1. 
Generate proposals from shifted anchors
67 |         # note: unlike the proposal layer, at this stage no deltas are involved
68 |         shift_x = np.arange(0, width) * self._feat_stride
69 |         shift_y = np.arange(0, height) * self._feat_stride
70 |         shift_x, shift_y = np.meshgrid(shift_x, shift_y)
71 |         shifts = np.vstack((shift_x.ravel(), shift_y.ravel(),
72 |                             shift_x.ravel(), shift_y.ravel())).transpose()
73 |         # add A anchors (1, A, 4) to
74 |         # cell K shifts (K, 1, 4) to get
75 |         # shift anchors (K, A, 4)
76 |         # reshape to (K*A, 4) shifted anchors
77 |         A = self._num_anchors
78 |         K = shifts.shape[0]
79 |         all_anchors = (self._anchors.reshape((1, A, 4)) +
80 |                        shifts.reshape((1, K, 4)).transpose((1, 0, 2)))
81 |         all_anchors = all_anchors.reshape((K * A, 4))
82 |         total_anchors = int(K * A)
83 | 
84 |         # only keep anchors inside the image
85 |         inds_inside = np.where(
86 |             (all_anchors[:, 0] >= -self._allowed_border) &
87 |             (all_anchors[:, 1] >= -self._allowed_border) &
88 |             (all_anchors[:, 2] < im_info[1] + self._allowed_border) &  # width
89 |             (all_anchors[:, 3] < im_info[0] + self._allowed_border)  # height
90 |         )[0]
91 | 
92 |         # 2. For each anchor, we assign a positive or negative label
93 |         anchors = all_anchors[inds_inside, :]
94 |         # label: 1 is positive, 0 is negative, -1 is don't care
95 |         labels = np.empty((len(inds_inside), ), dtype=np.float32)
96 |         labels.fill(-1)
97 |         # overlaps between the anchors and the gt boxes
98 |         # overlaps (ex, gt)
99 |         overlaps = bbox_overlaps(
100 |             np.ascontiguousarray(anchors, dtype=np.float),
101 |             np.ascontiguousarray(gt_boxes, dtype=np.float))
102 |         argmax_overlaps = overlaps.argmax(axis=1)
103 |         max_overlaps = overlaps[np.arange(len(inds_inside)), argmax_overlaps]
104 |         gt_argmax_overlaps = overlaps.argmax(axis=0)
105 |         gt_max_overlaps = overlaps[gt_argmax_overlaps,
106 |                                    np.arange(overlaps.shape[1])]
107 |         gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0]
108 | 
109 |         if not cfg.TRAIN.RPN_CLOBBER_POSITIVES:
110 |             # assign bg labels first so that positive labels can clobber them
111 |             labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0
112 | 
113 |         # We assign two types of anchors as positive
114 |         # fg label: for each gt, the anchor with highest overlap
115 |         labels[gt_argmax_overlaps] = 1
116 |         # fg label: above threshold IOU
117 |         labels[max_overlaps >= cfg.TRAIN.RPN_POSITIVE_OVERLAP] = 1
118 | 
119 |         if cfg.TRAIN.RPN_CLOBBER_POSITIVES:
120 |             # assign bg labels last so that negative labels can clobber positives
121 |             labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0
122 | 
123 |         num_fg = int(cfg.TRAIN.RPN_FG_FRACTION * cfg.TRAIN.RPN_BATCHSIZE)
124 |         fg_inds = np.where(labels == 1)[0]
125 |         if len(fg_inds) > num_fg:
126 |             disable_inds = np.random.choice(
127 |                 fg_inds, size=(len(fg_inds) - num_fg), replace=False)
128 |             labels[disable_inds] = -1
129 | 
130 |         # subsample negative labels if we have too many
131 |         num_bg = cfg.TRAIN.RPN_BATCHSIZE - np.sum(labels == 1)
132 |         bg_inds = np.where(labels == 0)[0]
133 |         if len(bg_inds) > num_bg:
134 |             disable_inds = np.random.choice(
135 |                 bg_inds, size=(len(bg_inds) - num_bg), replace=False)
136 |             labels[disable_inds] = -1
137 | 
138 |         if cfg.TRAIN.MIX_INDEX:
139 |             bottom_fg = bottom[3].data
140 |             bottom_bg = bottom[4].data
141 |             unmapped_fg_ind = []
142 |             unmapped_bg_ind = []
143 |             for i in list(bottom_fg):
144 |                 zal = np.where(i == inds_inside)[0]
145 |                 if len(zal) > 0:
146 |                     unmapped_fg_ind.append(zal[0])
147 |             for i in list(bottom_bg):
148 |                 zal = np.where(i == inds_inside)[0]
149 |                 if len(zal) > 0:
150 | 
unmapped_bg_ind.append(zal[0]) 151 | labels[unmapped_bg_ind] = 0 152 | labels[unmapped_fg_ind] = 1 153 | 154 | bbox_targets = np.zeros((len(inds_inside), 4), dtype=np.float32) 155 | bbox_targets = _compute_targets(anchors, gt_boxes[argmax_overlaps, :]) 156 | 157 | bbox_inside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32) 158 | bbox_inside_weights[labels == 1, :] = np.array(cfg.TRAIN.RPN_BBOX_INSIDE_WEIGHTS) 159 | 160 | bbox_outside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32) 161 | if cfg.TRAIN.RPN_POSITIVE_WEIGHT < 0: 162 | # uniform weighting of examples (given non-uniform sampling) 163 | num_examples = np.sum(labels >= 0) 164 | positive_weights = np.ones((1, 4)) * 1.0 / num_examples 165 | negative_weights = np.ones((1, 4)) * 1.0 / num_examples 166 | else: 167 | assert ((cfg.TRAIN.RPN_POSITIVE_WEIGHT > 0) & 168 | (cfg.TRAIN.RPN_POSITIVE_WEIGHT < 1)) 169 | positive_weights = (cfg.TRAIN.RPN_POSITIVE_WEIGHT / 170 | np.sum(labels == 1)) 171 | negative_weights = ((1.0 - cfg.TRAIN.RPN_POSITIVE_WEIGHT) / 172 | np.sum(labels == 0)) 173 | bbox_outside_weights[labels == 1, :] = positive_weights 174 | bbox_outside_weights[labels == 0, :] = negative_weights 175 | 176 | # Currently all the indices are in the clipped index space 177 | # we map up to original set of anchors 178 | # In this process, we need to set clipped boxes as label -1, weights 0 179 | labels = unmap(labels, total_anchors, inds_inside, fill=-1) 180 | bbox_targets = unmap(bbox_targets, total_anchors, inds_inside, fill=0) 181 | bbox_inside_weights = unmap(bbox_inside_weights, total_anchors, inds_inside, fill=0) 182 | bbox_outside_weights = unmap(bbox_outside_weights, total_anchors, inds_inside, fill=0) 183 | # labels 184 | labels = labels.reshape((1, height, width, A)).transpose(0, 3, 1, 2) 185 | labels = labels.reshape((1, 1, A * height, width)) 186 | top[0].reshape(*labels.shape) 187 | top[0].data[...] = labels 188 | 189 | # bbox_targets 190 | bbox_targets = bbox_targets \ 191 | .reshape((1, height, width, A * 4)).transpose(0, 3, 1, 2) 192 | top[1].reshape(*bbox_targets.shape) 193 | top[1].data[...] = bbox_targets 194 | 195 | # bbox_inside_weights 196 | bbox_inside_weights = bbox_inside_weights \ 197 | .reshape((1, height, width, A * 4)).transpose(0, 3, 1, 2) 198 | assert bbox_inside_weights.shape[2] == height 199 | assert bbox_inside_weights.shape[3] == width 200 | top[2].reshape(*bbox_inside_weights.shape) 201 | top[2].data[...] = bbox_inside_weights 202 | 203 | # bbox_outside_weights 204 | bbox_outside_weights = bbox_outside_weights \ 205 | .reshape((1, height, width, A * 4)).transpose(0, 3, 1, 2) 206 | assert bbox_outside_weights.shape[2] == height 207 | assert bbox_outside_weights.shape[3] == width 208 | top[3].reshape(*bbox_outside_weights.shape) 209 | top[3].data[...] = bbox_outside_weights 210 | 211 | def backward(self, top, propagate_down, bottom): 212 | """This layer does not propagate gradients.""" 213 | pass 214 | 215 | 216 | def _compute_targets(ex_rois, gt_rois): 217 | """ 218 | Compute bounding-box regression targets for an image. 
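    Worked example (illustrative): for an anchor ex = (0, 0, 15, 15)
    (w = h = 16, center (7.5, 7.5)) matched to gt = (8, 8, 23, 23, cls)
    (same size, center (15.5, 15.5)), bbox_transform yields
    (dx, dy, dw, dh) = (0.5, 0.5, 0.0, 0.0).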
219 |     Parameters:
220 |     -----------
221 |     ex_rois: ROIs from an external source (selective search or RPN)
222 |     gt_rois: ground truth rois
223 | 
224 |     Returns:
225 |     ---------
226 |     The regression targets for these anchors (combined with the anchors when generating proposals)
227 |     """
228 | 
229 |     assert ex_rois.shape[0] == gt_rois.shape[0]
230 |     assert ex_rois.shape[1] == 4
231 |     assert gt_rois.shape[1] == 5
232 | 
233 |     return bbox_transform(ex_rois, gt_rois[:, :4]).astype(np.float32, copy=False)
--------------------------------------------------------------------------------
/lib/pylayer/proposal_layer.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Multitask Network Cascade
3 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn)
4 | # Copyright (c) 2016, Haozhi Qi
5 | # Licensed under The MIT License [see LICENSE for details]
6 | # --------------------------------------------------------
7 | 
8 | import caffe
9 | import numpy as np
10 | import yaml
11 | 
12 | from mnc_config import cfg
13 | from transform.anchors import generate_anchors
14 | from transform.bbox_transform import clip_boxes, bbox_transform_inv, filter_small_boxes
15 | from nms.nms_wrapper import nms
16 | 
17 | DEBUG = False
18 | PRINT_GRADIENT = 1
19 | 
20 | 
21 | class ProposalLayer(caffe.Layer):
22 |     """
23 |     Outputs object detection proposals by applying estimated bounding-box
24 |     transformations to a set of regular boxes (called "anchors").
25 |     """
26 | 
27 |     def setup(self, bottom, top):
28 |         layer_params = yaml.load(self.param_str_)
29 |         self._feat_stride = layer_params['feat_stride']
30 |         self._anchors = generate_anchors()
31 |         self._num_anchors = self._anchors.shape[0]
32 |         self._use_clip = layer_params.get('use_clip', 0)
33 |         self._clip_denominator = float(layer_params.get('clip_base', 256))
34 |         self._clip_thresh = 1.0 / self._clip_denominator
35 |         # rois blob: holds R regions of interest, each is a 5-tuple
36 |         # (n, x1, y1, x2, y2) specifying an image batch index n and a
37 |         # rectangle (x1, y1, x2, y2)
38 |         self._top_name_map = {}
39 |         top[0].reshape(1, 5)
40 |         self._top_name_map['rois'] = 0
41 |         # For MNC, we force the output proposals to also be used to train the RPN;
42 |         # this is achieved by passing proposal_index to anchor_target_layer
43 |         if str(self.phase) == 'TRAIN':
44 |             if cfg.TRAIN.MIX_INDEX:
45 |                 top[1].reshape(1, 1)
46 |                 self._top_name_map['proposal_index'] = 1
47 | 
48 |     def reshape(self, bottom, top):
49 |         """Reshaping happens during the call to forward."""
50 |         pass
51 | 
52 |     def forward(self, bottom, top):
53 |         # Algorithm:
54 |         #
55 |         # for each (H, W) location i
56 |         #     generate A anchor boxes centered on cell i
57 |         #     apply predicted transform deltas at cell i to each of the A anchors
58 |         # clip predicted boxes to image
59 |         # remove predicted boxes with either height or width < threshold
60 |         # sort all (proposal, score) pairs by score from highest to lowest
61 |         # take top pre_nms_topN proposals before NMS
62 |         # apply NMS with threshold 0.7 to remaining proposals
63 |         # take after_nms_topN proposals after NMS
64 |         # return the top proposals (-> RoIs top, scores top)
65 |         assert bottom[0].data.shape[0] == 1, 'Only single item batches are supported'
66 | 
67 |         cfg_key = str(self.phase)  # either 'TRAIN' or 'TEST'
68 |         pre_nms_topN = cfg[cfg_key].RPN_PRE_NMS_TOP_N
69 |         post_nms_topN = cfg[cfg_key].RPN_POST_NMS_TOP_N
70 |         nms_thresh = cfg[cfg_key].RPN_NMS_THRESH
71 |         min_size = cfg[cfg_key].RPN_MIN_SIZE
72 | 
73 |         # the first set of _num_anchors channels are bg probs
74 |         # the second set are the fg probs, which we want
75 |         scores = bottom[0].data[:, self._num_anchors:, :, :]
76 |         bbox_deltas = bottom[1].data
77 |         im_info = bottom[2].data[0, :]
78 | 
79 |         # 1. Generate proposals from transform deltas and shifted anchors
80 |         height, width = scores.shape[-2:]
81 |         self._height = height
82 |         self._width = width
83 |         # Enumerate all shifts
84 |         shift_x = np.arange(0, self._width) * self._feat_stride
85 |         shift_y = np.arange(0, self._height) * self._feat_stride
86 |         shift_x, shift_y = np.meshgrid(shift_x, shift_y)
87 |         shifts = np.vstack((shift_x.ravel(), shift_y.ravel(),
88 |                             shift_x.ravel(), shift_y.ravel())).transpose()
89 | 
90 |         # Enumerate all shifted anchors:
91 |         #
92 |         # add A anchors (1, A, 4) to
93 |         # cell K shifts (K, 1, 4) to get
94 |         # shift anchors (K, A, 4)
95 |         # reshape to (K*A, 4) shifted anchors
96 |         A = self._num_anchors
97 |         K = shifts.shape[0]
98 |         anchors = self._anchors.reshape((1, A, 4)) + \
99 |                   shifts.reshape((1, K, 4)).transpose((1, 0, 2))
100 |         anchors = anchors.reshape((K * A, 4))
101 |         _, keep = clip_boxes(anchors, im_info[:2])
102 |         self._anchor_index_before_clip = keep
103 | 
104 |         # Transpose and reshape the predicted transform deltas to get them
105 |         # into the same order as the anchors:
106 |         #
107 |         # transform deltas will be (1, 4 * A, H, W) format
108 |         # transpose to (1, H, W, 4 * A)
109 |         # reshape to (1 * H * W * A, 4) where rows are ordered by (h, w, a)
110 |         # in slowest to fastest order
111 |         bbox_deltas = bbox_deltas.transpose((0, 2, 3, 1)).reshape((-1, 4))
112 | 
113 |         # Same story for the scores:
114 |         #
115 |         # scores are (1, A, H, W) format
116 |         # transpose to (1, H, W, A)
117 |         # reshape to (1 * H * W * A, 1) where rows are ordered by (h, w, a)
118 |         scores = scores.transpose((0, 2, 3, 1)).reshape((-1, 1))
119 | 
120 |         # Convert anchors into proposals by applying the transform deltas
121 |         proposals = bbox_transform_inv(anchors, bbox_deltas)
122 | 
123 |         # 2. clip predicted boxes to image
124 |         proposals, keep = clip_boxes(proposals, im_info[:2])
125 |         # Record the corresponding index before and after the clip.
126 |         # This step doesn't need unmapping;
127 |         # we need it to decide whether to back-propagate
128 |         self._proposal_index_before_clip = keep
129 | 
130 |         # 3. remove predicted boxes with either height or width < threshold
131 |         # (NOTE: convert min_size to input image scale stored in im_info[2])
132 |         keep = filter_small_boxes(proposals, min_size * im_info[2])
133 |         proposals = proposals[keep, :]
134 |         scores = scores[keep]
135 |         self._ind_after_filter = keep
136 | 
137 |         # 4. sort all (proposal, score) pairs by score from highest to lowest
138 |         # 5. take top pre_nms_topN (e.g. 6000)
139 |         order = scores.ravel().argsort()[::-1]
140 | 
141 |         if pre_nms_topN > 0:
142 |             order = order[:pre_nms_topN]
143 |         proposals = proposals[order, :]
144 |         scores = scores[order]
145 |         self._ind_after_sort = order
146 |         # 6. apply nms (e.g. threshold = 0.7)
147 |         # 7. take after_nms_topN (e.g. 300)
148 |         # 8. 
return the top proposals (-> RoIs top) 149 | keep = nms(np.hstack((proposals, scores)), nms_thresh) 150 | 151 | if post_nms_topN > 0: 152 | keep = keep[:post_nms_topN] 153 | proposals = proposals[keep, :] 154 | 155 | scores = scores[keep] 156 | # Output rois blob 157 | # Our RPN implementation only supports a single input image, so all 158 | # batch inds are 0 159 | batch_inds = np.zeros((proposals.shape[0], 1), dtype=np.float32) 160 | proposals = np.hstack((batch_inds, proposals.astype(np.float32, copy=False))) 161 | self._proposal_index = keep 162 | 163 | blobs = { 164 | 'rois': proposals 165 | } 166 | 167 | if str(self.phase) == 'TRAIN': 168 | if cfg.TRAIN.MIX_INDEX: 169 | all_rois_index = self._ind_after_filter[self._ind_after_sort[self._proposal_index]].reshape(1, len(keep)) 170 | blobs['proposal_index'] = all_rois_index 171 | 172 | # Copy data to forward to top layer 173 | for blob_name, blob in blobs.iteritems(): 174 | top[self._top_name_map[blob_name]].reshape(*blob.shape) 175 | top[self._top_name_map[blob_name]].data[...] = blob.astype(np.float32, copy=False) 176 | 177 | def backward(self, top, propagate_down, bottom): 178 | 179 | if propagate_down[1]: 180 | bottom[1].diff.fill(0.0) 181 | 182 | # first count only non-zero top gradient to accelerate computing 183 | top_non_zero_ind = np.unique(np.where(abs(top[0].diff[:, :]) > 0)[0]) 184 | proposal_index = np.asarray(self._proposal_index) 185 | # unmap indexes to the original scale 186 | unmap_val = self._ind_after_filter[self._ind_after_sort[proposal_index[top_non_zero_ind]]] 187 | 188 | # not back propagate gradient if proposals/anchors are out of image boundary 189 | # this is a 0/1 mask so we just multiply them when calculating bottom gradient 190 | weight_out_proposal = np.in1d(unmap_val, self._proposal_index_before_clip) 191 | weight_out_anchor = np.in1d(unmap_val, self._anchor_index_before_clip) 192 | 193 | # unmap_val are arranged as (H * W * A) as stated in forward comment 194 | # with A as the fastest dimension (which is different from caffe) 195 | c = unmap_val % self._num_anchors 196 | w = (unmap_val / self._num_anchors) % self._width 197 | h = (unmap_val / self._num_anchors / self._width) % self._height 198 | 199 | # width and height should be in feature map scale 200 | anchor_w = (self._anchors[c, 2] - self._anchors[c, 0]) 201 | anchor_h = (self._anchors[c, 3] - self._anchors[c, 1]) 202 | dfdx1 = top[0].diff[top_non_zero_ind, 1] 203 | dfdy1 = top[0].diff[top_non_zero_ind, 2] 204 | dfdx2 = top[0].diff[top_non_zero_ind, 3] 205 | dfdy2 = top[0].diff[top_non_zero_ind, 4] 206 | 207 | dfdxc = dfdx1 + dfdx2 208 | dfdyc = dfdy1 + dfdy2 209 | dfdw = 0.5 * (dfdx2 - dfdx1) 210 | dfdh = 0.5 * (dfdy2 - dfdy1) 211 | 212 | bottom[1].diff[0, 4*c, h, w] = \ 213 | dfdxc * anchor_w * weight_out_proposal * weight_out_anchor 214 | bottom[1].diff[0, 4*c+1, h, w] = \ 215 | dfdyc * anchor_h * weight_out_proposal * weight_out_anchor 216 | bottom[1].diff[0, 4*c+2, h, w] = \ 217 | dfdw * np.exp(bottom[1].data[0, 4*c+2, h, w]) * anchor_w * weight_out_proposal * weight_out_anchor 218 | bottom[1].diff[0, 4*c+3, h, w] = \ 219 | dfdh * np.exp(bottom[1].data[0, 4*c+3, h, w]) * anchor_h * weight_out_proposal * weight_out_anchor 220 | 221 | # if use gradient clip, constraint gradient inside [-thresh, thresh] 222 | if self._use_clip: 223 | bottom[1].diff[0, 4*c, h, w] = np.minimum(np.maximum( 224 | bottom[1].diff[0, 4*c, h, w], -self._clip_thresh), self._clip_thresh) 225 | bottom[1].diff[0, 4*c+1, h, w] = np.minimum(np.maximum( 226 | bottom[1].diff[0, 
4*c+1, h, w], -self._clip_thresh), self._clip_thresh) 227 | bottom[1].diff[0, 4*c+2, h, w] = np.minimum(np.maximum( 228 | bottom[1].diff[0, 4*c+2, h, w], -self._clip_thresh), self._clip_thresh) 229 | bottom[1].diff[0, 4*c+3, h, w] = np.minimum(np.maximum( 230 | bottom[1].diff[0, 4*c+3, h, w], -self._clip_thresh), self._clip_thresh) 231 | -------------------------------------------------------------------------------- /models/VGG16/cfm/test.prototxt: -------------------------------------------------------------------------------- 1 | name: "VGG16" 2 | 3 | input: "data" 4 | input_shape { 5 | dim: 1 6 | dim: 3 7 | dim: 224 8 | dim: 224 9 | } 10 | 11 | input: "rois" 12 | input_shape { 13 | dim: 1 14 | dim: 5 15 | } 16 | 17 | input: "masks" 18 | input_shape { 19 | dim: 1 20 | dim: 1 21 | dim: 14 22 | dim: 14 23 | } 24 | 25 | layer { 26 | name: "conv1_1" 27 | type: "Convolution" 28 | bottom: "data" 29 | top: "conv1_1" 30 | param { 31 | lr_mult: 0 32 | decay_mult: 0 33 | } 34 | param { 35 | lr_mult: 0 36 | decay_mult: 0 37 | } 38 | convolution_param { 39 | num_output: 64 40 | pad: 1 41 | kernel_size: 3 42 | } 43 | } 44 | layer { 45 | name: "relu1_1" 46 | type: "ReLU" 47 | bottom: "conv1_1" 48 | top: "conv1_1" 49 | } 50 | layer { 51 | name: "conv1_2" 52 | type: "Convolution" 53 | bottom: "conv1_1" 54 | top: "conv1_2" 55 | param { 56 | lr_mult: 0 57 | decay_mult: 0 58 | } 59 | param { 60 | lr_mult: 0 61 | decay_mult: 0 62 | } 63 | convolution_param { 64 | num_output: 64 65 | pad: 1 66 | kernel_size: 3 67 | } 68 | } 69 | layer { 70 | name: "relu1_2" 71 | type: "ReLU" 72 | bottom: "conv1_2" 73 | top: "conv1_2" 74 | } 75 | layer { 76 | name: "pool1" 77 | type: "Pooling" 78 | bottom: "conv1_2" 79 | top: "pool1" 80 | pooling_param { 81 | pool: MAX 82 | kernel_size: 2 83 | stride: 2 84 | } 85 | } 86 | layer { 87 | name: "conv2_1" 88 | type: "Convolution" 89 | bottom: "pool1" 90 | top: "conv2_1" 91 | param { 92 | lr_mult: 0 93 | decay_mult: 0 94 | } 95 | param { 96 | lr_mult: 0 97 | decay_mult: 0 98 | } 99 | convolution_param { 100 | num_output: 128 101 | pad: 1 102 | kernel_size: 3 103 | } 104 | } 105 | layer { 106 | name: "relu2_1" 107 | type: "ReLU" 108 | bottom: "conv2_1" 109 | top: "conv2_1" 110 | } 111 | layer { 112 | name: "conv2_2" 113 | type: "Convolution" 114 | bottom: "conv2_1" 115 | top: "conv2_2" 116 | param { 117 | lr_mult: 0 118 | decay_mult: 0 119 | } 120 | param { 121 | lr_mult: 0 122 | decay_mult: 0 123 | } 124 | convolution_param { 125 | num_output: 128 126 | pad: 1 127 | kernel_size: 3 128 | } 129 | } 130 | layer { 131 | name: "relu2_2" 132 | type: "ReLU" 133 | bottom: "conv2_2" 134 | top: "conv2_2" 135 | } 136 | layer { 137 | name: "pool2" 138 | type: "Pooling" 139 | bottom: "conv2_2" 140 | top: "pool2" 141 | pooling_param { 142 | pool: MAX 143 | kernel_size: 2 144 | stride: 2 145 | } 146 | } 147 | layer { 148 | name: "conv3_1" 149 | type: "Convolution" 150 | bottom: "pool2" 151 | top: "conv3_1" 152 | param { 153 | lr_mult: 1 154 | decay_mult: 1 155 | } 156 | param { 157 | lr_mult: 2 158 | decay_mult: 0 159 | } 160 | convolution_param { 161 | num_output: 256 162 | pad: 1 163 | kernel_size: 3 164 | } 165 | } 166 | layer { 167 | name: "relu3_1" 168 | type: "ReLU" 169 | bottom: "conv3_1" 170 | top: "conv3_1" 171 | } 172 | layer { 173 | name: "conv3_2" 174 | type: "Convolution" 175 | bottom: "conv3_1" 176 | top: "conv3_2" 177 | param { 178 | lr_mult: 1 179 | decay_mult: 1 180 | } 181 | param { 182 | lr_mult: 2 183 | decay_mult: 0 184 | } 185 | convolution_param { 186 | num_output: 256 187 
| pad: 1 188 | kernel_size: 3 189 | } 190 | } 191 | layer { 192 | name: "relu3_2" 193 | type: "ReLU" 194 | bottom: "conv3_2" 195 | top: "conv3_2" 196 | } 197 | layer { 198 | name: "conv3_3" 199 | type: "Convolution" 200 | bottom: "conv3_2" 201 | top: "conv3_3" 202 | param { 203 | lr_mult: 1 204 | decay_mult: 1 205 | } 206 | param { 207 | lr_mult: 2 208 | decay_mult: 0 209 | } 210 | convolution_param { 211 | num_output: 256 212 | pad: 1 213 | kernel_size: 3 214 | } 215 | } 216 | layer { 217 | name: "relu3_3" 218 | type: "ReLU" 219 | bottom: "conv3_3" 220 | top: "conv3_3" 221 | } 222 | layer { 223 | name: "pool3" 224 | type: "Pooling" 225 | bottom: "conv3_3" 226 | top: "pool3" 227 | pooling_param { 228 | pool: MAX 229 | kernel_size: 2 230 | stride: 2 231 | } 232 | } 233 | layer { 234 | name: "conv4_1" 235 | type: "Convolution" 236 | bottom: "pool3" 237 | top: "conv4_1" 238 | param { 239 | lr_mult: 1 240 | decay_mult: 1 241 | } 242 | param { 243 | lr_mult: 2 244 | decay_mult: 0 245 | } 246 | convolution_param { 247 | num_output: 512 248 | pad: 1 249 | kernel_size: 3 250 | } 251 | } 252 | layer { 253 | name: "relu4_1" 254 | type: "ReLU" 255 | bottom: "conv4_1" 256 | top: "conv4_1" 257 | } 258 | layer { 259 | name: "conv4_2" 260 | type: "Convolution" 261 | bottom: "conv4_1" 262 | top: "conv4_2" 263 | param { 264 | lr_mult: 1 265 | decay_mult: 1 266 | } 267 | param { 268 | lr_mult: 2 269 | decay_mult: 0 270 | } 271 | convolution_param { 272 | num_output: 512 273 | pad: 1 274 | kernel_size: 3 275 | } 276 | } 277 | layer { 278 | name: "relu4_2" 279 | type: "ReLU" 280 | bottom: "conv4_2" 281 | top: "conv4_2" 282 | } 283 | layer { 284 | name: "conv4_3" 285 | type: "Convolution" 286 | bottom: "conv4_2" 287 | top: "conv4_3" 288 | param { 289 | lr_mult: 1 290 | decay_mult: 1 291 | } 292 | param { 293 | lr_mult: 2 294 | decay_mult: 0 295 | } 296 | convolution_param { 297 | num_output: 512 298 | pad: 1 299 | kernel_size: 3 300 | } 301 | } 302 | layer { 303 | name: "relu4_3" 304 | type: "ReLU" 305 | bottom: "conv4_3" 306 | top: "conv4_3" 307 | } 308 | layer { 309 | name: "pool4" 310 | type: "Pooling" 311 | bottom: "conv4_3" 312 | top: "pool4" 313 | pooling_param { 314 | pool: MAX 315 | kernel_size: 2 316 | stride: 2 317 | } 318 | } 319 | layer { 320 | name: "conv5_1" 321 | type: "Convolution" 322 | bottom: "pool4" 323 | top: "conv5_1" 324 | param { 325 | lr_mult: 1 326 | decay_mult: 1 327 | } 328 | param { 329 | lr_mult: 2 330 | decay_mult: 0 331 | } 332 | convolution_param { 333 | num_output: 512 334 | pad: 1 335 | kernel_size: 3 336 | } 337 | } 338 | layer { 339 | name: "relu5_1" 340 | type: "ReLU" 341 | bottom: "conv5_1" 342 | top: "conv5_1" 343 | } 344 | layer { 345 | name: "conv5_2" 346 | type: "Convolution" 347 | bottom: "conv5_1" 348 | top: "conv5_2" 349 | param { 350 | lr_mult: 1 351 | decay_mult: 1 352 | } 353 | param { 354 | lr_mult: 2 355 | decay_mult: 0 356 | } 357 | convolution_param { 358 | num_output: 512 359 | pad: 1 360 | kernel_size: 3 361 | } 362 | } 363 | layer { 364 | name: "relu5_2" 365 | type: "ReLU" 366 | bottom: "conv5_2" 367 | top: "conv5_2" 368 | } 369 | layer { 370 | name: "conv5_3" 371 | type: "Convolution" 372 | bottom: "conv5_2" 373 | top: "conv5_3" 374 | param { 375 | lr_mult: 1 376 | decay_mult: 1 377 | } 378 | param { 379 | lr_mult: 2 380 | decay_mult: 0 381 | } 382 | convolution_param { 383 | num_output: 512 384 | pad: 1 385 | kernel_size: 3 386 | } 387 | } 388 | layer { 389 | name: "relu5_3" 390 | type: "ReLU" 391 | bottom: "conv5_3" 392 | top: "conv5_3" 393 | } 394 | 
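# Note: after four 2x2/stride-2 max poolings (pool1-pool4), conv5_3 is at
# 1/16 of the input resolution, which is why the ROIPooling layers below
# use spatial_scale: 0.0625 (= 1/16).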
395 | #-------- Box Feature -------- 396 | 397 | layer { 398 | name: "roi_pooling_conv5" 399 | type: "ROIPooling" 400 | bottom: "conv5_3" 401 | bottom: "rois" 402 | top: "roi_pooling_conv5" 403 | roi_pooling_param { 404 | pooled_w: 7 405 | pooled_h: 7 406 | spatial_scale: 0.0625 # 1/16 407 | } 408 | } 409 | 410 | layer { 411 | name: "fc6" 412 | type: "InnerProduct" 413 | bottom: "roi_pooling_conv5" 414 | top: "fc6" 415 | inner_product_param { 416 | num_output: 4096 417 | } 418 | } 419 | 420 | layer { 421 | name: "relu6" 422 | type: "ReLU" 423 | bottom: "fc6" 424 | top: "fc6" 425 | } 426 | 427 | layer { 428 | name: "fc7" 429 | type: "InnerProduct" 430 | bottom: "fc6" 431 | top: "fc7" 432 | inner_product_param { 433 | num_output: 4096 434 | } 435 | } 436 | 437 | layer { 438 | name: "relu7" 439 | type: "ReLU" 440 | bottom: "fc7" 441 | top: "fc7" 442 | } 443 | 444 | # -------- Mask Feature -------- 445 | 446 | layer { 447 | name: "roi_pooling_conv5_mask" 448 | type: "ROIPooling" 449 | bottom: "conv5_3" 450 | bottom: "rois" 451 | top: "roi_pooling_conv5_mask" 452 | roi_pooling_param { 453 | pooled_w: 14 454 | pooled_h: 14 455 | spatial_scale: 0.0625 # 1/16 456 | } 457 | } 458 | 459 | layer { 460 | name: "mask_pooling" 461 | type: "MaskPooling" 462 | bottom: "roi_pooling_conv5_mask" 463 | bottom: "masks" 464 | top: "roi_mask_conv5" 465 | } 466 | 467 | layer { 468 | name: "roi_mask_conv5" 469 | type: "Pooling" 470 | bottom: "roi_mask_conv5" 471 | top: "roi_mask_conv5_pool" 472 | pooling_param { 473 | kernel_size: 2 474 | stride: 2 475 | pad: 0 476 | pool: MAX 477 | } 478 | } 479 | 480 | layer { 481 | name: "fc6_mask" 482 | type: "InnerProduct" 483 | bottom: "roi_mask_conv5_pool" 484 | top: "fc6_mask" 485 | inner_product_param { 486 | num_output: 4096 487 | } 488 | } 489 | 490 | layer { 491 | name: "relu6_mask" 492 | type: "ReLU" 493 | bottom: "fc6_mask" 494 | top: "fc6_mask" 495 | } 496 | 497 | layer { 498 | name: "fc7_mask" 499 | type: "InnerProduct" 500 | bottom: "fc6_mask" 501 | top: "fc7_mask" 502 | inner_product_param { 503 | num_output: 4096 504 | } 505 | } 506 | 507 | layer { 508 | name: "relu7_mask" 509 | type: "ReLU" 510 | bottom: "fc7_mask" 511 | top: "fc7_mask" 512 | } 513 | 514 | 515 | # -------- Mask Estimation -------- 516 | 517 | layer { 518 | name: "fc6_maskest" 519 | type: "InnerProduct" 520 | bottom: "roi_pooling_conv5_mask" 521 | top: "fc6_maskest" 522 | inner_product_param { 523 | num_output: 256 524 | } 525 | } 526 | 527 | layer { 528 | name: "relu6_maskest" 529 | type: "ReLU" 530 | bottom: "fc6_maskest" 531 | top: "fc6_maskest" 532 | } 533 | 534 | layer { 535 | name: "mask_pred" 536 | type: "InnerProduct" 537 | bottom: "fc6_maskest" 538 | top: "mask_pred" 539 | inner_product_param { 540 | num_output: 441 # 21 * 21 541 | } 542 | } 543 | 544 | layer { 545 | name: "mask_prob" 546 | type: "Sigmoid" 547 | bottom: "mask_pred" 548 | top: "mask_prob" 549 | } 550 | 551 | # ----- Concat Box-Mask Feature ----- 552 | 553 | layer { 554 | name: "join_box_mask" 555 | type: "Concat" 556 | bottom: "fc7_mask" 557 | bottom: "fc7" 558 | top: "join_box_mask" 559 | concat_param { 560 | axis: 1 561 | } 562 | } 563 | 564 | # ---- Box Classification ---- 565 | 566 | layer { 567 | name: "cls_score" 568 | type: "InnerProduct" 569 | bottom: "join_box_mask" 570 | top: "cls_score" 571 | inner_product_param { 572 | num_output: 21 573 | } 574 | } 575 | 576 | layer { 577 | name: "cls_prob" 578 | type: "Softmax" 579 | bottom: "cls_score" 580 | top: "cls_prob" 581 | loss_param { 582 | ignore_label: -1 583 | 
normalize: true 584 | } 585 | } 586 | 587 | # ---- Mask Classification ---- 588 | 589 | layer { 590 | name: "seg_cls_score" 591 | type: "InnerProduct" 592 | bottom: "join_box_mask" 593 | top: "seg_cls_score" 594 | inner_product_param { 595 | num_output: 21 596 | } 597 | } 598 | layer { 599 | name: "seg_cls_prob" 600 | type: "Softmax" 601 | bottom: "seg_cls_score" 602 | top: "seg_cls_prob" 603 | loss_param { 604 | ignore_label: -1 605 | normalize: true 606 | } 607 | } 608 | 609 | layer { 610 | name: "bbox_pred" 611 | type: "InnerProduct" 612 | bottom: "join_box_mask" 613 | top: "bbox_pred" 614 | inner_product_param { 615 | num_output: 84 616 | } 617 | } 618 | -------------------------------------------------------------------------------- /lib/datasets/pascal_voc_seg.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Multitask Network Cascade 3 | # Written by Haozhi Qi 4 | # Copyright (c) 2016, Haozhi Qi 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # -------------------------------------------------------- 7 | 8 | import cPickle 9 | import os 10 | import scipy.io as sio 11 | import numpy as np 12 | from datasets.pascal_voc_det import PascalVOCDet 13 | from mnc_config import cfg 14 | from utils.vis_seg import vis_seg 15 | from utils.voc_eval import voc_eval_sds 16 | import scipy 17 | 18 | 19 | class PascalVOCSeg(PascalVOCDet): 20 | """ 21 | A subclass for datasets.imdb.imdb 22 | This class contains information of ROIDB and MaskDB 23 | This class implements roidb and maskdb related functions 24 | """ 25 | def __init__(self, image_set, year, devkit_path=None): 26 | PascalVOCDet.__init__(self, image_set, year, devkit_path) 27 | self._ori_image_num = len(self._image_index) 28 | self._comp_id = 'comp6' 29 | # PASCAL specific config options 30 | self.config = {'cleanup': True, 31 | 'use_salt': True, 32 | 'top_k': 2000, 33 | 'use_diff': False, 34 | 'matlab_eval': False, 35 | 'rpn_file': None} 36 | self._data_path = os.path.join(self._devkit_path) 37 | self._roidb_path = os.path.join(self.cache_path, 'voc_2012_' + image_set + '_mcg_maskdb') 38 | 39 | def image_path_at(self, i): 40 | image_path = os.path.join(self._data_path, 'img', self._image_index[i] + self._image_ext) 41 | assert os.path.exists(image_path), 'Path does not exist: {}'.format(image_path) 42 | return image_path 43 | 44 | def roidb_path_at(self, i): 45 | if i >= self._ori_image_num: 46 | return os.path.join(self._roidb_path, 47 | self.image_index[i % self._ori_image_num] + '_flip.mat') 48 | else: 49 | return os.path.join(self._roidb_path, 50 | self.image_index[i] + '.mat') 51 | 52 | def gt_maskdb(self): 53 | cache_file = os.path.join(self.cache_path, self.name + '_gt_maskdb.pkl') 54 | if os.path.exists(cache_file): 55 | with open(cache_file, 'rb') as fid: 56 | gt_maskdb = cPickle.load(fid) 57 | print '{} gt maskdb loaded from {}'.format(self.name, cache_file) 58 | else: 59 | num_image = len(self.image_index) 60 | gt_roidbs = self.gt_roidb() 61 | gt_maskdb = [self._load_sbd_mask_annotations(index, gt_roidbs) 62 | for index in xrange(num_image)] 63 | with open(cache_file, 'wb') as fid: 64 | cPickle.dump(gt_maskdb, fid, cPickle.HIGHEST_PROTOCOL) 65 | print 'wrote gt roidb to {}'.format(cache_file) 66 | return gt_maskdb 67 | 68 | def _load_image_set_index(self): 69 | image_set_file = os.path.join(self._data_path, self._image_set + '.txt') 70 | assert os.path.exists(image_set_file), 'Path does not exist: 
{}'.format(image_set_file)
71 |         with open(image_set_file) as f:
72 |             image_index = [x.strip() for x in f.readlines()]
73 |         return image_index
74 | 
75 |     def _load_sbd_mask_annotations(self, index, gt_roidbs):
76 |         """
77 |         Load gt_masks information from SBD's additional data
78 |         """
79 |         if index % 1000 == 0:
80 |             print '%d / %d' % (index, len(self._image_index))
81 |         image_name = self._image_index[index]
82 |         inst_file_name = os.path.join(self._data_path, 'inst', image_name + '.mat')
83 |         gt_inst_mat = scipy.io.loadmat(inst_file_name)
84 |         gt_inst_data = gt_inst_mat['GTinst']['Segmentation'][0][0]
85 |         unique_inst = np.unique(gt_inst_data)
86 |         background_ind = np.where(unique_inst == 0)[0]
87 |         unique_inst = np.delete(unique_inst, background_ind)
88 |         gt_roidb = gt_roidbs[index]
89 |         cls_file_name = os.path.join(self._data_path, 'cls', image_name + '.mat')
90 |         gt_cls_mat = scipy.io.loadmat(cls_file_name)
91 |         gt_cls_data = gt_cls_mat['GTcls']['Segmentation'][0][0]
92 |         gt_masks = []
93 |         for ind, inst_mask in enumerate(unique_inst):
94 |             box = gt_roidb['boxes'][ind]
95 |             im_mask = (gt_inst_data == inst_mask)
96 |             im_cls_mask = np.multiply(gt_cls_data, im_mask)
97 |             unique_cls_inst = np.unique(im_cls_mask)
98 |             background_ind = np.where(unique_cls_inst == 0)[0]
99 |             unique_cls_inst = np.delete(unique_cls_inst, background_ind)
100 |             assert len(unique_cls_inst) == 1
101 |             assert unique_cls_inst[0] == gt_roidb['gt_classes'][ind]
102 |             mask = im_mask[box[1]: box[3]+1, box[0]:box[2]+1]
103 |             gt_masks.append(mask)
104 | 
105 |         # Also record the maximum dimensions, to create fixed-size arrays when forwarding
106 |         mask_max_x = max(gt_masks[i].shape[1] for i in xrange(len(gt_masks)))
107 |         mask_max_y = max(gt_masks[i].shape[0] for i in xrange(len(gt_masks)))
108 |         return {
109 |             'gt_masks': gt_masks,
110 |             'mask_max': [mask_max_x, mask_max_y],
111 |             'flipped': False
112 |         }
113 | 
114 |     def append_flipped_masks(self):
115 |         """
116 |         This method is only accessed when we use maskdb, so it is implemented here.
117 |         Append flipped images to the mask database.
118 |         Note this method doesn't actually flip the 'image'; it flips the masks instead
119 |         """
120 |         cache_file = os.path.join(self.cache_path, self.name + '_' + cfg.TRAIN.PROPOSAL_METHOD + '_maskdb_flip.pkl')
121 |         if os.path.exists(cache_file):
122 |             with open(cache_file, 'rb') as fid:
123 |                 flip_maskdb = cPickle.load(fid)
124 |             print '{} gt flipped roidb loaded from {}'.format(self.name, cache_file)
125 |             self.maskdb.extend(flip_maskdb)
126 |             # Need to check this condition since otherwise we may occasionally *4
127 |             if self._image_index == self.num_images:
128 |                 self._image_index *= 2
129 |         else:
130 |             # keep the original image number for future development;
131 |             # currently unused since append_flipped_masks will only be called once
132 |             num_images = self._ori_image_num
133 |             flip_maskdb = []
134 |             for i in xrange(num_images):
135 |                 masks = self.maskdb[i]['gt_masks']
136 |                 masks_flip = []
137 |                 for mask_ind in xrange(len(masks)):
138 |                     mask_flip = np.fliplr(masks[mask_ind])
139 |                     masks_flip.append(mask_flip)
140 |                 entry = {'gt_masks': masks_flip,
141 |                          'mask_max': self.maskdb[i]['mask_max'],
142 |                          'flipped': True}
143 |                 flip_maskdb.append(entry)
144 |             with open(cache_file, 'wb') as fid:
145 |                 cPickle.dump(flip_maskdb, fid, cPickle.HIGHEST_PROTOCOL)
146 |             print 'wrote gt flipped maskdb to {}'.format(cache_file)
147 |             self.maskdb.extend(flip_maskdb)
148 |             # Need to check this condition since otherwise we may occasionally *4
149 |             if self._image_index == self.num_images:
150 |                 self._image_index *= 2
151 | 
152 |     def visualization_segmentation(self, output_dir):
153 |         vis_seg(self.image_index, self.classes, output_dir, self._data_path)
154 | 
155 |     # --------------------------- Evaluation ---------------------------
156 |     def evaluate_segmentation(self, all_boxes, all_masks, output_dir):
157 |         self._write_voc_seg_results_file(all_boxes, all_masks, output_dir)
158 |         self._py_evaluate_segmentation(output_dir)
159 | 
160 |     def _write_voc_seg_results_file(self, all_boxes, all_masks, output_dir):
161 |         """
162 |         Write results as a pkl file; note this is different from the
163 |         detection task since it's difficult to write masks to txt
164 |         """
165 |         # Always reformat the results, since sometimes the masks are not
166 |         # binary or are in shape (n, sz*sz) instead of (n, sz, sz)
167 |         all_boxes, all_masks = self._reformat_result(all_boxes, all_masks)
168 |         for cls_inds, cls in enumerate(self.classes):
169 |             if cls == '__background__':
170 |                 continue
171 |             print 'Writing {} VOC results file'.format(cls)
172 |             filename = os.path.join(output_dir, cls + '_det.pkl')
173 |             with open(filename, 'wb') as f:
174 |                 cPickle.dump(all_boxes[cls_inds], f, cPickle.HIGHEST_PROTOCOL)
175 |             filename = os.path.join(output_dir, cls + '_seg.pkl')
176 |             with open(filename, 'wb') as f:
177 |                 cPickle.dump(all_masks[cls_inds], f, cPickle.HIGHEST_PROTOCOL)
178 | 
179 |     def _reformat_result(self, boxes, masks):
180 |         num_images = len(self.image_index)
181 |         num_class = len(self.classes)
182 |         reformat_masks = [[[] for _ in xrange(num_images)]
183 |                           for _ in xrange(num_class)]
184 |         for cls_inds in xrange(1, num_class):
185 |             for img_inds in xrange(num_images):
186 |                 if len(masks[cls_inds][img_inds]) == 0:
187 |                     continue
188 |                 num_inst = masks[cls_inds][img_inds].shape[0]
189 |                 reformat_masks[cls_inds][img_inds] = masks[cls_inds][img_inds]\
190 |                     .reshape(num_inst, cfg.MASK_SIZE, cfg.MASK_SIZE)
191 |                 reformat_masks[cls_inds][img_inds] = reformat_masks[cls_inds][img_inds] >= cfg.BINARIZE_THRESH
192 |         all_masks = reformat_masks
193 |         return boxes, all_masks
194 | 
195 |     def _py_evaluate_segmentation(self, output_dir):
196 |         gt_dir = self._data_path
197 |         imageset_file = os.path.join(gt_dir, self._image_set + '.txt')
198 |         cache_dir = os.path.join(self._devkit_path, 'annotations_cache')
199 |         aps = []
200 |         # define this as true according to SDS's evaluation protocol
201 |         use_07_metric = True
202 |         print 'VOC07 metric? ' + ('Yes' if use_07_metric else 'No')
203 |         if not os.path.isdir(output_dir):
204 |             os.mkdir(output_dir)
205 |         print '~~~~~~ Evaluation uses min overlap = 0.5 ~~~~~~'
206 |         for i, cls in enumerate(self._classes):
207 |             if cls == '__background__':
208 |                 continue
209 |             det_filename = os.path.join(output_dir, cls + '_det.pkl')
210 |             seg_filename = os.path.join(output_dir, cls + '_seg.pkl')
211 |             ap = voc_eval_sds(det_filename, seg_filename, gt_dir,
212 |                               imageset_file, cls, cache_dir, self._classes, ov_thresh=0.5)
213 |             aps += [ap]
214 |             print('AP for {} = {:.2f}'.format(cls, ap*100))
215 |         print('Mean AP@0.5 = {:.2f}'.format(np.mean(aps)*100))
216 |         print '~~~~~~ Evaluation uses min overlap = 0.7 ~~~~~~'
217 |         aps = []
218 |         for i, cls in enumerate(self._classes):
219 |             if cls == '__background__':
220 |                 continue
221 |             det_filename = os.path.join(output_dir, cls + '_det.pkl')
222 |             seg_filename = os.path.join(output_dir, cls + '_seg.pkl')
223 |             ap = voc_eval_sds(det_filename, seg_filename, gt_dir,
224 |                               imageset_file, cls, cache_dir, self._classes, ov_thresh=0.7)
225 |             aps += [ap]
226 |             print('AP for {} = {:.2f}'.format(cls, ap*100))
227 |         print('Mean AP@0.7 = {:.2f}'.format(np.mean(aps)*100))
228 | 
229 | 
--------------------------------------------------------------------------------
/tools/prepare_mcg_maskdb.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Multitask Network Cascade
3 | # Written by Haozhi Qi
4 | # Copyright (c) 2016, Haozhi Qi
5 | # Licensed under The MIT License [see LICENSE for details]
6 | # --------------------------------------------------------
7 | 
8 | # System modules
9 | import argparse
10 | import os
11 | import cPickle
12 | import numpy as np
13 | import scipy.io as sio
14 | import cv2
15 | from multiprocessing import Process
16 | import time
17 | import PIL
18 | # User-defined modules
19 | import _init_paths
20 | from mnc_config import cfg
21 | from utils.cython_bbox import bbox_overlaps
22 | from transform.mask_transform import mask_overlap, intersect_mask
23 | from datasets.pascal_voc_seg import PascalVOCSeg
24 | 
25 | 
26 | def parse_args():
27 |     """ Parse input arguments
28 |     """
29 |     parser = argparse.ArgumentParser(description='Prepare MCG roidb')
30 |     parser.add_argument('--input', dest='input_dir',
31 |                         help='folder containing the input MCG proposals',
32 |                         default='data/MCG-raw/', type=str)
33 |     parser.add_argument('--output', dest='output_dir',
34 |                         help='folder containing the output roidb', required=True,
35 |                         type=str)
36 |     parser.add_argument('--gt_roi', dest='roidb', help='roidb',
37 |                         default='data/cache/voc_2012_train_gt_roidb.pkl', type=str)
38 |     parser.add_argument('--gt_mask', dest='maskdb', help='maskdb',
39 |                         default='data/cache/voc_2012_train_gt_maskdb.pkl', type=str)
40 |     parser.add_argument('-mask_sz', dest='mask_size',
41 |                         help='compressed mask resolution',
42 |                         default=21, type=int)
43 |     parser.add_argument('--top_k', dest='top_k',
44 |                         help='number of generated proposals',
45 |                         default=-1, type=int)
46 |     parser.add_argument('--db', dest='db_name',
47 |                         help='train or validation',
48 |                         default='train', type=str)
49 |     parser.add_argument('--para_job', dest='para_job',
50 |                         help='number of parallel processes to launch',
51 |                         default='1', type=int)
52 |     return parser.parse_args()
53 | 
54 | 
55 | def process_roidb(file_start, file_end, db):
56 | 
57 |     for cnt in xrange(file_start, file_end):
58 |         f = file_list[cnt]
59 |         full_file = os.path.join(input_dir, f)
60 | 
output_cache = os.path.join(output_dir, f.split('.')[0] + '.mat') 61 | timer_tic = time.time() 62 | if os.path.exists(output_cache): 63 | continue 64 | mcg_mat = sio.loadmat(full_file) 65 | mcg_mask_label = mcg_mat['labels'] 66 | mcg_superpixels = mcg_mat['superpixels'] 67 | num_proposal = len(mcg_mask_label) 68 | mcg_boxes = np.zeros((num_proposal, 4)) 69 | mcg_masks = np.zeros((num_proposal, mask_size, mask_size), dtype=np.bool) 70 | 71 | for ind_proposal in xrange(num_proposal): 72 | label = mcg_mask_label[ind_proposal][0][0] 73 | proposal = np.in1d(mcg_superpixels, label).reshape(mcg_superpixels.shape) 74 | [r, c] = np.where(proposal == 1) 75 | y1 = np.min(r) 76 | x1 = np.min(c) 77 | y2 = np.max(r) 78 | x2 = np.max(c) 79 | box = np.array([x1, y1, x2, y2]) 80 | proposal = proposal[y1:y2+1, x1:x2+1] 81 | proposal = cv2.resize(proposal.astype(np.float), (mask_size, mask_size), interpolation=cv2.INTER_NEAREST) 82 | mcg_masks[ind_proposal, :, :] = proposal 83 | mcg_boxes[ind_proposal, :] = box 84 | 85 | if top_k != -1: 86 | mcg_boxes = mcg_boxes[:top_k, :] 87 | mcg_masks = mcg_masks[:top_k, :] 88 | 89 | if db == 'val': 90 | # if we prepare validation data, we only need its masks and boxes 91 | roidb = { 92 | 'masks': (mcg_masks >= cfg.BINARIZE_THRESH).astype(bool), 93 | 'boxes': mcg_boxes 94 | } 95 | sio.savemat(output_cache, roidb) 96 | use_time = time.time() - timer_tic 97 | print '%d/%d use time %f' % (cnt, len(file_list), use_time) 98 | 99 | else: 100 | # Otherwise we need to prepare other information like overlaps 101 | num_mcg = mcg_boxes.shape[0] 102 | gt_roidb = gt_roidbs[cnt] 103 | gt_maskdb = gt_maskdbs[cnt] 104 | gt_boxes = gt_roidb['boxes'] 105 | gt_masks = gt_maskdb['gt_masks'] 106 | gt_classes = gt_roidb['gt_classes'] 107 | num_gt = gt_boxes.shape[0] 108 | num_all = num_gt + num_mcg 109 | # define output structure 110 | det_overlaps = np.zeros((num_all, 1)) 111 | seg_overlaps = np.zeros((num_all, 1)) 112 | seg_assignment = np.zeros((num_all, 1)) 113 | mask_targets = np.zeros((num_all, mask_size, mask_size)) 114 | # ------------------------------------------------------ 115 | all_boxes = np.vstack((gt_boxes[:, :4], mcg_boxes)).astype(int) 116 | all_masks = np.zeros((num_all, mask_size, mask_size)) 117 | for i in xrange(num_gt): 118 | all_masks[i, :, :] = (cv2.resize(gt_masks[i].astype(np.float), 119 | (mask_size, mask_size))) 120 | assert all_masks[num_gt:, :, :].shape == mcg_masks.shape 121 | all_masks[num_gt:, :, :] = mcg_masks 122 | # record bounding box overlaps 123 | cur_overlap = bbox_overlaps(all_boxes.astype(np.float), gt_boxes.astype(np.float)) 124 | seg_assignment = cur_overlap.argmax(axis=1) 125 | det_overlaps = cur_overlap.max(axis=1) 126 | seg_assignment[det_overlaps == 0] = -1 127 | # record mask region overlaps 128 | seg_overlaps[:num_gt] = 1.0 129 | for i in xrange(num_gt, num_all): 130 | cur_mask = cv2.resize(all_masks[i, :, :].astype(np.float), 131 | (all_boxes[i, 2] - all_boxes[i, 0] + 1, 132 | all_boxes[i, 3] - all_boxes[i, 1] + 1)) >= cfg.BINARIZE_THRESH 133 | for mask_ind in xrange(len(gt_masks)): 134 | gt_mask = gt_masks[mask_ind] 135 | gt_roi = gt_roidb['boxes'][mask_ind] 136 | cur_ov = mask_overlap(all_boxes[i, :], gt_roi, cur_mask, gt_mask) 137 | seg_overlaps[i] = max(seg_overlaps[i], cur_ov) 138 | 139 | output_label = np.zeros((num_all, 1)) 140 | for i in xrange(num_all): 141 | if seg_assignment[i] == -1: 142 | continue 143 | cur_ind = seg_assignment[i] 144 | output_label[i] = gt_classes[seg_assignment[i]] 145 | mask_targets[i, :, :] = 
intersect_mask(all_boxes[i, :], gt_roidb['boxes'][cur_ind], gt_masks[cur_ind]) 146 | 147 | # Some of the array need to insert a new axis to be consistent of savemat method 148 | roidb = { 149 | 'masks': (all_masks >= cfg.BINARIZE_THRESH).astype(bool), 150 | 'boxes': all_boxes, 151 | 'det_overlap': det_overlaps[:, np.newaxis], 152 | 'seg_overlap': seg_overlaps, 153 | 'mask_targets': (mask_targets >= cfg.BINARIZE_THRESH).astype(bool), 154 | 'gt_classes': gt_classes[:, np.newaxis], 155 | 'output_label': output_label, 156 | 'gt_assignment': seg_assignment[:, np.newaxis], 157 | 'Flip': False 158 | } 159 | 160 | sio.savemat(output_cache, roidb) 161 | use_time = time.time() - timer_tic 162 | print '%d/%d use time %f' % (cnt, len(file_list), use_time) 163 | 164 | 165 | def process_flip_masks(image_names, im_start, im_end): 166 | 167 | widths = [PIL.Image.open('data/VOCdevkitSDS/img/' + im_name + '.jpg').size[0] for im_name in image_names] 168 | cache_dir = output_dir 169 | if not os.path.isdir(cache_dir): 170 | os.makedirs(cache_dir) 171 | 172 | for index in xrange(im_start, im_end): 173 | output_cache = os.path.join(cache_dir, image_names[index] + '_flip.mat') 174 | if os.path.exists(output_cache): 175 | continue 176 | image_cache = os.path.join(cache_dir, image_names[index] + '.mat') 177 | orig_maskdb = sio.loadmat(image_cache) 178 | # Flip mask and mask regression targets 179 | masks = orig_maskdb['masks'] 180 | mask_targets = orig_maskdb['mask_targets'] 181 | mask_flip = masks[:, :, ::-1] 182 | mask_target_flip = mask_targets[:, :, ::-1] 183 | # Flip boxes 184 | boxes = orig_maskdb['boxes'] 185 | oldx1 = boxes[:, 0].copy() 186 | oldx2 = boxes[:, 2].copy() 187 | boxes[:, 0] = widths[index] - oldx2 - 1 188 | boxes[:, 2] = widths[index] - oldx1 - 1 189 | assert (boxes[:, 2] >= boxes[:, 0]).all() 190 | # Other maskdb values are identical with original maskdb 191 | flip_maskdb = { 192 | 'masks': (mask_flip >= cfg.BINARIZE_THRESH).astype(bool), 193 | 'boxes': boxes, 194 | 'det_overlap': orig_maskdb['det_overlap'], 195 | 'seg_overlap': orig_maskdb['seg_overlap'], 196 | 'mask_targets': (mask_target_flip >= cfg.BINARIZE_THRESH).astype(bool), 197 | 'gt_classes': orig_maskdb['gt_classes'], 198 | 'gt_assignment': orig_maskdb['gt_assignment'], 199 | 'Flip': True, 200 | 'output_label': orig_maskdb['output_label'] 201 | } 202 | sio.savemat(output_cache, flip_maskdb) 203 | 204 | 205 | if __name__ == '__main__': 206 | args = parse_args() 207 | input_dir = args.input_dir 208 | assert os.path.exists(input_dir), 'Path does not exist: {}'.format(input_dir) 209 | output_dir = args.output_dir 210 | if not os.path.isdir(output_dir): 211 | os.makedirs(output_dir) 212 | mask_size = args.mask_size 213 | 214 | list_name = 'data/VOCdevkitSDS/train.txt' if args.db_name == 'train' else 'data/VOCdevkitSDS/val.txt' 215 | with open(list_name) as f: 216 | file_list = f.read().splitlines() 217 | 218 | # If we want to prepare training maskdb, first try to load gts 219 | if args.db_name == 'train': 220 | if os.path.exists(args.roidb) and os.path.exists(args.maskdb): 221 | with open(args.roidb, 'rb') as f: 222 | gt_roidbs = cPickle.load(f) 223 | with open(args.maskdb, 'rb') as f: 224 | gt_maskdbs = cPickle.load(f) 225 | else: 226 | db = PascalVOCSeg('train', '2012', 'data/VOCdevkitSDS/') 227 | gt_roidbs = db.gt_roidb() 228 | gt_maskdbs = db.gt_maskdb() 229 | 230 | top_k = args.top_k 231 | num_process = args.para_job 232 | # Prepare train/val maskdb use multi-process 233 | processes = [] 234 | file_start = 0 235 | file_offset = 
int(np.ceil(len(file_list) / float(num_process))) 236 | for process_id in xrange(num_process): 237 | file_end = min(file_start + file_offset, len(file_list)) 238 | p = Process(target=process_roidb, args=(file_start, file_end, args.db_name)) 239 | p.start() 240 | processes.append(p) 241 | file_start += file_offset 242 | 243 | for p in processes: 244 | p.join() 245 | 246 | # If db_name == 'train', we still need to add flipped maskdb into output folder 247 | # Add flipped mask and mask regression targets after prepare the original mcg proposals 248 | if args.db_name == 'train': 249 | print 'Appending flipped MCG to ROI' 250 | processes = [] 251 | file_start = 0 252 | file_offset = int(np.ceil(len(file_list) / float(num_process))) 253 | for process_id in xrange(num_process): 254 | file_end = min(file_start + file_offset, len(file_list)) 255 | p = Process(target=process_flip_masks, args=(file_list, file_start, file_end)) 256 | p.start() 257 | processes.append(p) 258 | file_start += file_offset 259 | for p in processes: 260 | p.join() 261 | -------------------------------------------------------------------------------- /lib/transform/mask_transform.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Multitask Network Cascade 3 | # Written by Haozhi Qi 4 | # Copyright (c) 2016, Haozhi Qi 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | import cv2 10 | from mnc_config import cfg 11 | from nms.nms_wrapper import nms 12 | from utils.cython_bbox import bbox_overlaps 13 | from nms.mv import mv 14 | 15 | 16 | def mask_overlap(box1, box2, mask1, mask2): 17 | """ 18 | This function calculate region IOU when masks are 19 | inside different boxes 20 | Returns: 21 | intersection over unions of this two masks 22 | """ 23 | x1 = max(box1[0], box2[0]) 24 | y1 = max(box1[1], box2[1]) 25 | x2 = min(box1[2], box2[2]) 26 | y2 = min(box1[3], box2[3]) 27 | if x1 > x2 or y1 > y2: 28 | return 0 29 | w = x2 - x1 + 1 30 | h = y2 - y1 + 1 31 | # get masks in the intersection part 32 | start_ya = y1 - box1[1] 33 | start_xa = x1 - box1[0] 34 | inter_maska = mask1[start_ya: start_ya + h, start_xa:start_xa + w] 35 | 36 | start_yb = y1 - box2[1] 37 | start_xb = x1 - box2[0] 38 | inter_maskb = mask2[start_yb: start_yb + h, start_xb:start_xb + w] 39 | 40 | assert inter_maska.shape == inter_maskb.shape 41 | 42 | inter = np.logical_and(inter_maskb, inter_maska).sum() 43 | union = mask1.sum() + mask2.sum() - inter 44 | if union < 1.0: 45 | return 0 46 | return float(inter) / float(union) 47 | 48 | 49 | def intersect_mask(ex_box, gt_box, gt_mask): 50 | """ 51 | This function calculate the intersection part of a external box 52 | and gt_box, mask it according to gt_mask 53 | 54 | Args: 55 | ex_box: external ROIS 56 | gt_box: ground truth boxes 57 | gt_mask: ground truth masks, not been resized yet 58 | Returns: 59 | regression_target: logical numpy array 60 | """ 61 | x1 = max(ex_box[0], gt_box[0]) 62 | y1 = max(ex_box[1], gt_box[1]) 63 | x2 = min(ex_box[2], gt_box[2]) 64 | y2 = min(ex_box[3], gt_box[3]) 65 | if x1 > x2 or y1 > y2: 66 | return np.zeros((21, 21), dtype=bool) 67 | w = x2 - x1 + 1 68 | h = y2 - y1 + 1 69 | ex_starty = y1 - ex_box[1] 70 | ex_startx = x1 - ex_box[0] 71 | 72 | gt_starty = y1 - gt_box[1] 73 | gt_startx = x1 - gt_box[0] 74 | inter_maskb = gt_mask[gt_starty: gt_starty + h, gt_startx: gt_startx + w] 75 
| regression_target = np.zeros((ex_box[3] - ex_box[1] + 1, ex_box[2] - ex_box[0] + 1)) 76 | regression_target[ex_starty: ex_starty + h, ex_startx: ex_startx + w] = inter_maskb 77 | regression_target = regression_target.astype(np.float32) 78 | regression_target = cv2.resize(regression_target, (cfg.MASK_SIZE, cfg.MASK_SIZE)) 79 | regression_target = regression_target >= cfg.BINARIZE_THRESH 80 | return regression_target 81 | 82 | 83 | def clip_masked_boxes(boxes, masks, im_shape): 84 | """ 85 | Clipped masked boxes inside image boundary 86 | """ 87 | num_box = boxes.shape[0] 88 | for i in xrange(num_box): 89 | box = np.round(boxes[i]).astype(int) 90 | mask = cv2.resize(masks[i, 0].astype(np.float32), (box[2] - box[0] + 1, box[3] - box[1] + 1)) 91 | clip_x1 = max(0, 0 - box[0]) 92 | clip_y1 = max(0, 0 - box[1]) 93 | clip_width = min(box[2], im_shape[1] - 1) - clip_x1 94 | clip_height = min(box[3], im_shape[0] - 1) - clip_y1 95 | clip_x2 = clip_x1 + clip_width 96 | clip_y2 = clip_y1 + clip_height 97 | mask = mask[clip_y1:clip_y2, clip_x1:clip_x2] 98 | masks[i, 0] = cv2.resize(mask.astype(np.float32), (cfg.MASK_SIZE, cfg.MASK_SIZE)) 99 | box[0] = clip_x1 100 | box[1] = clip_y1 101 | box[2] = clip_x2 102 | box[3] = clip_y2 103 | boxes[i] = box 104 | return boxes, masks 105 | 106 | 107 | def mask_aggregation(boxes, masks, mask_weights, im_width, im_height): 108 | """ 109 | This function implements mask voting mechanism to give finer mask 110 | n is the candidate boxes (masks) number 111 | Args: 112 | masks: All masks need to be aggregated (n x sz x sz) 113 | mask_weights: class score associated with each mask (n x 1) 114 | boxes: tight box enclose each mask (n x 4) 115 | im_width, im_height: image information 116 | TODO: Ensure mask size is sz x sz or tight box size 117 | """ 118 | assert boxes.shape[0] == len(masks) and boxes.shape[0] == mask_weights.shape[0] 119 | im_mask = np.zeros((im_height, im_width)) 120 | for mask_ind in xrange(len(masks)): 121 | box = np.round(boxes[mask_ind]) 122 | mask = (masks[mask_ind] >= cfg.BINARIZE_THRESH).astype(float) 123 | mask_weight = mask_weights[mask_ind] 124 | im_mask[box[1]:box[3]+1, box[0]:box[2]+1] += mask * mask_weight 125 | [r, c] = np.where(im_mask >= cfg.BINARIZE_THRESH) 126 | if len(r) == 0 or len(c) == 0: 127 | min_y = np.ceil(im_height / 2) 128 | min_x = np.ceil(im_width / 2) 129 | max_y = min_y 130 | max_x = min_x 131 | else: 132 | min_y = np.min(r) 133 | min_x = np.min(c) 134 | max_y = np.max(r) 135 | max_x = np.max(c) 136 | 137 | clipped_mask = im_mask[min_y:max_y+1, min_x:max_x+1] 138 | clipped_box = np.array((min_x, min_y, max_x, max_y), dtype=np.float32) 139 | return clipped_mask, clipped_box 140 | 141 | 142 | def cpu_mask_voting(masks, boxes, scores, num_classes, max_per_image, im_width, im_height): 143 | """ 144 | Wrapper function for mask voting, note we already know the class of boxes and masks 145 | Args: 146 | masks: ~ n x mask_sz x mask_sz 147 | boxes: ~ n x 4 148 | scores: ~ n x 1 149 | max_per_image: default would be 100 150 | im_width: width of image 151 | im_height: height of image 152 | """ 153 | # apply nms and sort to get first images according to their scores 154 | scores = scores[:, 1:] 155 | num_detect = boxes.shape[0] 156 | res_mask = [[] for _ in xrange(num_detect)] 157 | for i in xrange(num_detect): 158 | box = np.round(boxes[i]).astype(int) 159 | mask = cv2.resize(masks[i, 0].astype(np.float32), (box[2] - box[0] + 1, box[3] - box[1] + 1)) 160 | res_mask[i] = mask 161 | # Intermediate results 162 | sup_boxes = [] 163 | 
sup_masks = [] 164 | sup_scores = [] 165 | tobesort_scores = [] 166 | 167 | for i in xrange(num_classes - 1): 168 | dets = np.hstack((boxes.astype(np.float32), scores[:, i:i+1])) 169 | inds = nms(dets, cfg.TEST.MASK_MERGE_NMS_THRESH) 170 | ind_boxes = boxes[inds] 171 | ind_masks = masks[inds] 172 | ind_scores = scores[inds, i] 173 | order = ind_scores.ravel().argsort()[::-1] 174 | num_keep = min(len(order), max_per_image) 175 | order = order[0:num_keep] 176 | sup_boxes.append(ind_boxes[order]) 177 | sup_masks.append(ind_masks[order]) 178 | sup_scores.append(ind_scores[order]) 179 | tobesort_scores.extend(ind_scores[order]) 180 | 181 | sorted_scores = np.sort(tobesort_scores)[::-1] 182 | num_keep = min(len(sorted_scores), max_per_image) 183 | thresh = sorted_scores[num_keep-1] 184 | result_box = [] 185 | result_mask = [] 186 | for c in xrange(num_classes - 1): 187 | cls_box = sup_boxes[c] 188 | cls_score = sup_scores[c] 189 | keep = np.where(cls_score >= thresh)[0] 190 | new_sup_boxes = cls_box[keep] 191 | num_sup_box = len(new_sup_boxes) 192 | masks_ar = np.zeros((num_sup_box, 1, cfg.MASK_SIZE, cfg.MASK_SIZE)) 193 | boxes_ar = np.zeros((num_sup_box, 4)) 194 | for i in xrange(num_sup_box): 195 | # Get weights according to their segmentation scores 196 | cur_ov = bbox_overlaps(boxes.astype(np.float), new_sup_boxes[i, np.newaxis].astype(np.float)) 197 | cur_inds = np.where(cur_ov >= cfg.TEST.MASK_MERGE_IOU_THRESH)[0] 198 | cur_weights = scores[cur_inds, c] 199 | cur_weights = cur_weights / sum(cur_weights) 200 | # Re-format mask when passing it to mask_aggregation 201 | pass_mask = [res_mask[j] for j in list(cur_inds)] 202 | # do mask aggregation 203 | tmp_mask, boxes_ar[i] = mask_aggregation(boxes[cur_inds], pass_mask, cur_weights, im_width, im_height) 204 | tmp_mask = cv2.resize(tmp_mask.astype(np.float32), (cfg.MASK_SIZE, cfg.MASK_SIZE)) 205 | masks_ar[i, 0] = tmp_mask 206 | # make new array such that scores is the last dimension of boxes 207 | boxes_scored_ar = np.hstack((boxes_ar, cls_score[keep, np.newaxis])) 208 | result_box.append(boxes_scored_ar) 209 | result_mask.append(masks_ar) 210 | return result_box, result_mask 211 | 212 | 213 | def gpu_mask_voting(masks, boxes, scores, num_classes, max_per_image, im_width, im_height): 214 | """ 215 | A wrapper function, note we already know the class of boxes and masks 216 | Args: 217 | masks: ~ 300 x 21 x 21 218 | boxes: ~ 300 x 4 219 | scores: ~ 300 x 1 220 | max_per_image: default would be 100 221 | im_width: 222 | im_height: 223 | """ 224 | # Intermediate results 225 | sup_boxes = [] 226 | sup_scores = [] 227 | tobesort_scores = [] 228 | for i in xrange(num_classes): 229 | if i == 0: 230 | sup_boxes.append([]) 231 | sup_scores.append([]) 232 | continue 233 | dets = np.hstack((boxes.astype(np.float32), scores[:, i:i+1])) 234 | inds = nms(dets, cfg.TEST.MASK_MERGE_NMS_THRESH) 235 | ind_boxes = boxes[inds] 236 | ind_scores = scores[inds, i] 237 | num_keep = min(len(ind_scores), max_per_image) 238 | sup_boxes.append(ind_boxes[0:num_keep, :]) 239 | sup_scores.append(ind_scores[0:num_keep]) 240 | tobesort_scores.extend(ind_scores[0:num_keep]) 241 | 242 | sorted_scores = np.sort(tobesort_scores)[::-1] 243 | num_keep = min(len(sorted_scores), max_per_image) 244 | thresh = sorted_scores[num_keep-1] 245 | # inds array to record which mask should be aggregated together 246 | candidate_inds = [] 247 | # weight for each element in the candidate inds 248 | candidate_weights = [] 249 | # start position for candidate array 250 | candidate_start = [] 251 | 
candidate_scores = [] 252 | class_bar = [] 253 | for c in xrange(num_classes): 254 | if c == 0: 255 | continue 256 | cls_box = sup_boxes[c] 257 | cls_score = sup_scores[c] 258 | keep = np.where(cls_score >= thresh)[0] 259 | new_sup_boxes = cls_box[keep] 260 | num_sup_box = len(new_sup_boxes) 261 | for i in xrange(num_sup_box): 262 | cur_ov = bbox_overlaps(boxes.astype(np.float), new_sup_boxes[i, np.newaxis].astype(np.float)) 263 | cur_inds = np.where(cur_ov >= cfg.TEST.MASK_MERGE_IOU_THRESH)[0] 264 | candidate_inds.extend(cur_inds) 265 | cur_weights = scores[cur_inds, c] 266 | cur_weights = cur_weights / sum(cur_weights) 267 | candidate_weights.extend(cur_weights) 268 | candidate_start.append(len(candidate_inds)) 269 | candidate_scores.extend(cls_score[keep]) 270 | class_bar.append(len(candidate_scores)) 271 | candidate_inds = np.array(candidate_inds, dtype=np.int32) 272 | candidate_weights = np.array(candidate_weights, dtype=np.float32) 273 | candidate_start = np.array(candidate_start, dtype=np.int32) 274 | candidate_scores = np.array(candidate_scores, dtype=np.float32) 275 | result_mask, result_box = mv(boxes.astype(np.float32), masks, candidate_inds, candidate_start, candidate_weights, im_height, im_width) 276 | result_box = np.hstack((result_box, candidate_scores[:, np.newaxis])) 277 | list_result_box = [] 278 | list_result_mask = [] 279 | # separate result mask into different classes 280 | for i in xrange(num_classes - 1): 281 | cls_start = class_bar[i - 1] if i > 0 else 0 282 | cls_end = class_bar[i] 283 | list_result_box.append(result_box[cls_start:cls_end, :]) 284 | list_result_mask.append(result_mask[cls_start:cls_end, :, :, :]) 285 | 286 | return list_result_mask, list_result_box 287 | -------------------------------------------------------------------------------- /models/VGG16/faster_rcnn_end2end/test.prototxt: -------------------------------------------------------------------------------- 1 | name: "VGG_ILSVRC_16_layers" 2 | 3 | input: "data" 4 | input_shape { 5 | dim: 1 6 | dim: 3 7 | dim: 224 8 | dim: 224 9 | } 10 | 11 | input: "im_info" 12 | input_shape { 13 | dim: 1 14 | dim: 3 15 | } 16 | 17 | # ------------ Convolution ----------- 18 | 19 | layer { 20 | name: "conv1_1" 21 | type: "Convolution" 22 | bottom: "data" 23 | top: "conv1_1" 24 | param { 25 | lr_mult: 0 26 | decay_mult: 0 27 | } 28 | param { 29 | lr_mult: 0 30 | decay_mult: 0 31 | } 32 | convolution_param { 33 | num_output: 64 34 | pad: 1 35 | kernel_size: 3 36 | } 37 | } 38 | layer { 39 | name: "relu1_1" 40 | type: "ReLU" 41 | bottom: "conv1_1" 42 | top: "conv1_1" 43 | } 44 | layer { 45 | name: "conv1_2" 46 | type: "Convolution" 47 | bottom: "conv1_1" 48 | top: "conv1_2" 49 | param { 50 | lr_mult: 0 51 | decay_mult: 0 52 | } 53 | param { 54 | lr_mult: 0 55 | decay_mult: 0 56 | } 57 | convolution_param { 58 | num_output: 64 59 | pad: 1 60 | kernel_size: 3 61 | } 62 | } 63 | layer { 64 | name: "relu1_2" 65 | type: "ReLU" 66 | bottom: "conv1_2" 67 | top: "conv1_2" 68 | } 69 | layer { 70 | name: "pool1" 71 | type: "Pooling" 72 | bottom: "conv1_2" 73 | top: "pool1" 74 | pooling_param { 75 | pool: MAX 76 | kernel_size: 2 77 | stride: 2 78 | } 79 | } 80 | layer { 81 | name: "conv2_1" 82 | type: "Convolution" 83 | bottom: "pool1" 84 | top: "conv2_1" 85 | param { 86 | lr_mult: 0 87 | decay_mult: 0 88 | } 89 | param { 90 | lr_mult: 0 91 | decay_mult: 0 92 | } 93 | convolution_param { 94 | num_output: 128 95 | pad: 1 96 | kernel_size: 3 97 | } 98 | } 99 | layer { 100 | name: "relu2_1" 101 | type: "ReLU" 102 | bottom: 
"conv2_1" 103 | top: "conv2_1" 104 | } 105 | layer { 106 | name: "conv2_2" 107 | type: "Convolution" 108 | bottom: "conv2_1" 109 | top: "conv2_2" 110 | param { 111 | lr_mult: 0 112 | decay_mult: 0 113 | } 114 | param { 115 | lr_mult: 0 116 | decay_mult: 0 117 | } 118 | convolution_param { 119 | num_output: 128 120 | pad: 1 121 | kernel_size: 3 122 | } 123 | } 124 | layer { 125 | name: "relu2_2" 126 | type: "ReLU" 127 | bottom: "conv2_2" 128 | top: "conv2_2" 129 | } 130 | layer { 131 | name: "pool2" 132 | type: "Pooling" 133 | bottom: "conv2_2" 134 | top: "pool2" 135 | pooling_param { 136 | pool: MAX 137 | kernel_size: 2 138 | stride: 2 139 | } 140 | } 141 | layer { 142 | name: "conv3_1" 143 | type: "Convolution" 144 | bottom: "pool2" 145 | top: "conv3_1" 146 | param { 147 | lr_mult: 1 148 | decay_mult: 1 149 | } 150 | param { 151 | lr_mult: 2 152 | decay_mult: 0 153 | } 154 | convolution_param { 155 | num_output: 256 156 | pad: 1 157 | kernel_size: 3 158 | } 159 | } 160 | layer { 161 | name: "relu3_1" 162 | type: "ReLU" 163 | bottom: "conv3_1" 164 | top: "conv3_1" 165 | } 166 | layer { 167 | name: "conv3_2" 168 | type: "Convolution" 169 | bottom: "conv3_1" 170 | top: "conv3_2" 171 | param { 172 | lr_mult: 1 173 | decay_mult: 1 174 | } 175 | param { 176 | lr_mult: 2 177 | decay_mult: 0 178 | } 179 | convolution_param { 180 | num_output: 256 181 | pad: 1 182 | kernel_size: 3 183 | } 184 | } 185 | layer { 186 | name: "relu3_2" 187 | type: "ReLU" 188 | bottom: "conv3_2" 189 | top: "conv3_2" 190 | } 191 | layer { 192 | name: "conv3_3" 193 | type: "Convolution" 194 | bottom: "conv3_2" 195 | top: "conv3_3" 196 | param { 197 | lr_mult: 1 198 | decay_mult: 1 199 | } 200 | param { 201 | lr_mult: 2 202 | decay_mult: 0 203 | } 204 | convolution_param { 205 | num_output: 256 206 | pad: 1 207 | kernel_size: 3 208 | } 209 | } 210 | layer { 211 | name: "relu3_3" 212 | type: "ReLU" 213 | bottom: "conv3_3" 214 | top: "conv3_3" 215 | } 216 | layer { 217 | name: "pool3" 218 | type: "Pooling" 219 | bottom: "conv3_3" 220 | top: "pool3" 221 | pooling_param { 222 | pool: MAX 223 | kernel_size: 2 224 | stride: 2 225 | } 226 | } 227 | layer { 228 | name: "conv4_1" 229 | type: "Convolution" 230 | bottom: "pool3" 231 | top: "conv4_1" 232 | param { 233 | lr_mult: 1 234 | decay_mult: 1 235 | } 236 | param { 237 | lr_mult: 2 238 | decay_mult: 0 239 | } 240 | convolution_param { 241 | num_output: 512 242 | pad: 1 243 | kernel_size: 3 244 | } 245 | } 246 | layer { 247 | name: "relu4_1" 248 | type: "ReLU" 249 | bottom: "conv4_1" 250 | top: "conv4_1" 251 | } 252 | layer { 253 | name: "conv4_2" 254 | type: "Convolution" 255 | bottom: "conv4_1" 256 | top: "conv4_2" 257 | param { 258 | lr_mult: 1 259 | decay_mult: 1 260 | } 261 | param { 262 | lr_mult: 2 263 | decay_mult: 0 264 | } 265 | convolution_param { 266 | num_output: 512 267 | pad: 1 268 | kernel_size: 3 269 | } 270 | } 271 | layer { 272 | name: "relu4_2" 273 | type: "ReLU" 274 | bottom: "conv4_2" 275 | top: "conv4_2" 276 | } 277 | layer { 278 | name: "conv4_3" 279 | type: "Convolution" 280 | bottom: "conv4_2" 281 | top: "conv4_3" 282 | param { 283 | lr_mult: 1 284 | decay_mult: 1 285 | } 286 | param { 287 | lr_mult: 2 288 | decay_mult: 0 289 | } 290 | convolution_param { 291 | num_output: 512 292 | pad: 1 293 | kernel_size: 3 294 | } 295 | } 296 | layer { 297 | name: "relu4_3" 298 | type: "ReLU" 299 | bottom: "conv4_3" 300 | top: "conv4_3" 301 | } 302 | layer { 303 | name: "pool4" 304 | type: "Pooling" 305 | bottom: "conv4_3" 306 | top: "pool4" 307 | pooling_param { 
308 | pool: MAX 309 | kernel_size: 2 310 | stride: 2 311 | } 312 | } 313 | layer { 314 | name: "conv5_1" 315 | type: "Convolution" 316 | bottom: "pool4" 317 | top: "conv5_1" 318 | param { 319 | lr_mult: 1 320 | decay_mult: 1 321 | } 322 | param { 323 | lr_mult: 2 324 | decay_mult: 0 325 | } 326 | convolution_param { 327 | num_output: 512 328 | pad: 1 329 | kernel_size: 3 330 | } 331 | } 332 | layer { 333 | name: "relu5_1" 334 | type: "ReLU" 335 | bottom: "conv5_1" 336 | top: "conv5_1" 337 | } 338 | layer { 339 | name: "conv5_2" 340 | type: "Convolution" 341 | bottom: "conv5_1" 342 | top: "conv5_2" 343 | param { 344 | lr_mult: 1 345 | decay_mult: 1 346 | } 347 | param { 348 | lr_mult: 2 349 | decay_mult: 0 350 | } 351 | convolution_param { 352 | num_output: 512 353 | pad: 1 354 | kernel_size: 3 355 | } 356 | } 357 | layer { 358 | name: "relu5_2" 359 | type: "ReLU" 360 | bottom: "conv5_2" 361 | top: "conv5_2" 362 | } 363 | layer { 364 | name: "conv5_3" 365 | type: "Convolution" 366 | bottom: "conv5_2" 367 | top: "conv5_3" 368 | param { 369 | lr_mult: 1 370 | decay_mult: 1 371 | } 372 | param { 373 | lr_mult: 2 374 | decay_mult: 0 375 | } 376 | convolution_param { 377 | num_output: 512 378 | pad: 1 379 | kernel_size: 3 380 | } 381 | } 382 | layer { 383 | name: "relu5_3" 384 | type: "ReLU" 385 | bottom: "conv5_3" 386 | top: "conv5_3" 387 | } 388 | 389 | #------------ RPN ------------ 390 | 391 | layer { 392 | name: "rpn_conv/3x3" 393 | type: "Convolution" 394 | bottom: "conv5_3" 395 | top: "rpn/output" 396 | param { lr_mult: 1.0 decay_mult: 1.0 } 397 | param { lr_mult: 2.0 decay_mult: 0 } 398 | convolution_param { 399 | num_output: 512 400 | kernel_size: 3 pad: 1 stride: 1 401 | weight_filler { type: "gaussian" std: 0.01 } 402 | bias_filler { type: "constant" value: 0 } 403 | } 404 | } 405 | layer { 406 | name: "rpn_relu/3x3" 407 | type: "ReLU" 408 | bottom: "rpn/output" 409 | top: "rpn/output" 410 | } 411 | 412 | layer { 413 | name: "rpn_cls_score" 414 | type: "Convolution" 415 | bottom: "rpn/output" 416 | top: "rpn_cls_score" 417 | param { lr_mult: 1.0 decay_mult: 1.0 } 418 | param { lr_mult: 2.0 decay_mult: 0 } 419 | convolution_param { 420 | num_output: 18 # 2(bg/fg) * 9(anchors) 421 | kernel_size: 1 pad: 0 stride: 1 422 | weight_filler { type: "gaussian" std: 0.01 } 423 | bias_filler { type: "constant" value: 0 } 424 | } 425 | } 426 | layer { 427 | name: "rpn_bbox_pred" 428 | type: "Convolution" 429 | bottom: "rpn/output" 430 | top: "rpn_bbox_pred" 431 | param { lr_mult: 1.0 decay_mult: 1.0 } 432 | param { lr_mult: 2.0 decay_mult: 0 } 433 | convolution_param { 434 | num_output: 36 # 4 * 9(anchors) 435 | kernel_size: 1 pad: 0 stride: 1 436 | weight_filler { type: "gaussian" std: 0.01 } 437 | bias_filler { type: "constant" value: 0 } 438 | } 439 | } 440 | layer { 441 | bottom: "rpn_cls_score" 442 | top: "rpn_cls_score_reshape" 443 | name: "rpn_cls_score_reshape" 444 | type: "Reshape" 445 | reshape_param { shape { dim: 0 dim: 2 dim: -1 dim: 0 } } 446 | } 447 | 448 | #------------ Anchor to Proposal ------------ 449 | 450 | layer { 451 | name: "rpn_cls_prob" 452 | type: "Softmax" 453 | bottom: "rpn_cls_score_reshape" 454 | top: "rpn_cls_prob" 455 | } 456 | layer { 457 | name: 'rpn_cls_prob_reshape' 458 | type: 'Reshape' 459 | bottom: 'rpn_cls_prob' 460 | top: 'rpn_cls_prob_reshape' 461 | reshape_param { shape { dim: 0 dim: 18 dim: -1 dim: 0 } } 462 | } 463 | layer { 464 | name: 'proposal' 465 | type: 'Python' 466 | bottom: 'rpn_cls_prob_reshape' 467 | bottom: 'rpn_bbox_pred' 468 | bottom: 
'im_info' 469 | top: 'rois' 470 | python_param { 471 | module: 'pylayer.proposal_layer' 472 | layer: 'ProposalLayer' 473 | param_str: "{'feat_stride': 16, 'gradient_scale': 1}" 474 | } 475 | } 476 | 477 | #------------ Roi Warping ------------ 478 | 479 | layer { 480 | name: "roi_pool5" 481 | type: "ROIWarping" 482 | bottom: "conv5_3" 483 | bottom: "rois" 484 | top: "pool5" 485 | roi_warping_param { 486 | pooled_w: 7 487 | pooled_h: 7 488 | spatial_scale: 0.0625 # 1/16 489 | } 490 | } 491 | layer { 492 | name: "fc6" 493 | type: "InnerProduct" 494 | bottom: "pool5" 495 | top: "fc6" 496 | param { 497 | lr_mult: 1 498 | decay_mult: 1 499 | } 500 | param { 501 | lr_mult: 2 502 | decay_mult: 0 503 | } 504 | inner_product_param { 505 | num_output: 4096 506 | } 507 | } 508 | layer { 509 | name: "relu6" 510 | type: "ReLU" 511 | bottom: "fc6" 512 | top: "fc6" 513 | } 514 | layer { 515 | name: "drop6" 516 | type: "Dropout" 517 | bottom: "fc6" 518 | top: "fc6" 519 | dropout_param { 520 | dropout_ratio: 0.5 521 | } 522 | } 523 | layer { 524 | name: "fc7" 525 | type: "InnerProduct" 526 | bottom: "fc6" 527 | top: "fc7" 528 | param { 529 | lr_mult: 1 530 | decay_mult: 1 531 | } 532 | param { 533 | lr_mult: 2 534 | decay_mult: 0 535 | } 536 | inner_product_param { 537 | num_output: 4096 538 | } 539 | } 540 | layer { 541 | name: "relu7" 542 | type: "ReLU" 543 | bottom: "fc7" 544 | top: "fc7" 545 | } 546 | layer { 547 | name: "drop7" 548 | type: "Dropout" 549 | bottom: "fc7" 550 | top: "fc7" 551 | dropout_param { 552 | dropout_ratio: 0.5 553 | } 554 | } 555 | 556 | #----- Classification ----- 557 | 558 | layer { 559 | name: "cls_score" 560 | type: "InnerProduct" 561 | bottom: "fc7" 562 | top: "cls_score" 563 | param { 564 | lr_mult: 1 565 | decay_mult: 1 566 | } 567 | param { 568 | lr_mult: 2 569 | decay_mult: 0 570 | } 571 | inner_product_param { 572 | num_output: 21 573 | weight_filler { 574 | type: "gaussian" 575 | std: 0.01 576 | } 577 | bias_filler { 578 | type: "constant" 579 | value: 0 580 | } 581 | } 582 | } 583 | 584 | layer { 585 | name: "cls_prob" 586 | type: "Softmax" 587 | bottom: "cls_score" 588 | top: "cls_prob" 589 | } 590 | 591 | #----- Bounding-box Regression ----- 592 | 593 | layer { 594 | name: "bbox_pred" 595 | type: "InnerProduct" 596 | bottom: "fc7" 597 | top: "bbox_pred" 598 | param { 599 | lr_mult: 1 600 | decay_mult: 1 601 | } 602 | param { 603 | lr_mult: 2 604 | decay_mult: 0 605 | } 606 | inner_product_param { 607 | num_output: 84 608 | weight_filler { 609 | type: "gaussian" 610 | std: 0.001 611 | } 612 | bias_filler { 613 | type: "constant" 614 | value: 0 615 | } 616 | } 617 | } 618 | --------------------------------------------------------------------------------