├── lib
│   ├── db
│   │   ├── __init__.py
│   │   ├── imdb.py
│   │   ├── maskdb.py
│   │   └── roidb.py
│   ├── nms
│   │   ├── __init__.py
│   │   ├── .gitignore
│   │   ├── gpu_nms.hpp
│   │   ├── gpu_mv.hpp
│   │   ├── py_cpu_nms.py
│   │   ├── gpu_nms.pyx
│   │   ├── gpu_mv.pyx
│   │   ├── cpu_nms.pyx
│   │   ├── nms_wrapper.py
│   │   └── nms_kernel.cu
│   ├── pylayer
│   │   ├── __init__.py
│   │   ├── mask_layer.py
│   │   ├── mnc_data_layer.py
│   │   ├── proposal_target_layer.py
│   │   ├── anchor_target_layer.py
│   │   └── proposal_layer.py
│   ├── utils
│   │   ├── __init__.py
│   │   ├── unmap.py
│   │   ├── timer.py
│   │   ├── bbox.pyx
│   │   ├── blob.py
│   │   └── vis_seg.py
│   ├── caffeWrapper
│   │   ├── __init__.py
│   │   └── SolverWrapper.py
│   ├── datasets
│   │   ├── __init__.py
│   │   ├── pascal_voc.py
│   │   └── pascal_voc_seg.py
│   ├── transform
│   │   ├── __init__.py
│   │   ├── anchors.py
│   │   ├── bbox_transform.py
│   │   └── mask_transform.py
│   ├── Makefile
│   ├── setup.py
│   └── mnc_config.py
├── experiments
│   ├── logs
│   │   └── .gitignore
│   ├── cfgs
│   │   └── VGG16
│   │       ├── mnc_5stage.yml
│   │       ├── faster_rcnn_end2end.yml
│   │       └── cfm.yml
│   └── scripts
│       ├── cfm.sh
│       ├── mnc_5stage.sh
│       └── faster_rcnn_end2end.sh
├── data
│   ├── demo
│   │   ├── 2008_000533.jpg
│   │   ├── 2008_000910.jpg
│   │   ├── 2008_001602.jpg
│   │   ├── 2008_001717.jpg
│   │   └── 2008_008093.jpg
│   ├── readme_img
│   │   └── example.png
│   └── scripts
│       ├── fetch_mnc_model.sh
│       ├── fetch_mcg_data.sh
│       ├── fetch_sbd_data.sh
│       └── fetch_imagenet_models.sh
├── .gitmodules
├── models
│   └── VGG16
│       ├── cfm
│       │   ├── solver.prototxt
│       │   └── test.prototxt
│       ├── mnc_5stage
│       │   └── solver.prototxt
│       └── faster_rcnn_end2end
│           ├── solver.prototxt
│           └── test.prototxt
├── .gitignore
├── tools
│   ├── _init_paths.py
│   ├── test_net.py
│   ├── train_net.py
│   ├── demo.py
│   └── prepare_mcg_maskdb.py
├── LICENSE
└── README.md
--------------------------------------------------------------------------------
/lib/db/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/lib/nms/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/lib/pylayer/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/lib/utils/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/lib/caffeWrapper/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/lib/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/lib/transform/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/experiments/logs/.gitignore:
--------------------------------------------------------------------------------
1 | *.txt*
--------------------------------------------------------------------------------
/lib/nms/.gitignore:
--------------------------------------------------------------------------------
1 | *.c
2 | *.cpp
3 | *.so
--------------------------------------------------------------------------------
/lib/Makefile:
--------------------------------------------------------------------------------
1 | all:
2 | 	python setup.py build_ext --inplace
3 | 	rm -rf build
4 | 
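
A quick sanity check after running make in lib/ (a sketch; it uses only the
pure-Python NMS baseline from lib/nms/py_cpu_nms.py listed further below, so it
works even without the compiled Cython/CUDA extensions; run it with lib/ on
PYTHONPATH, cf. tools/_init_paths.py):

    import numpy as np
    from nms.py_cpu_nms import py_cpu_nms

    # Three detections in [x1, y1, x2, y2, score] format.
    dets = np.array([[10, 10, 50, 50, 0.9],
                     [12, 12, 48, 48, 0.8],      # near-duplicate of box 0
                     [100, 100, 150, 150, 0.7]], dtype=np.float32)
    keep = py_cpu_nms(dets, thresh=0.3)
    print(keep)  # [0, 2]: the overlapping, lower-scored box is suppressed
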
-------------------------------------------------------------------------------- /data/demo/2008_000533.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daijifeng001/MNC/HEAD/data/demo/2008_000533.jpg -------------------------------------------------------------------------------- /data/demo/2008_000910.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daijifeng001/MNC/HEAD/data/demo/2008_000910.jpg -------------------------------------------------------------------------------- /data/demo/2008_001602.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daijifeng001/MNC/HEAD/data/demo/2008_001602.jpg -------------------------------------------------------------------------------- /data/demo/2008_001717.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daijifeng001/MNC/HEAD/data/demo/2008_001717.jpg -------------------------------------------------------------------------------- /data/demo/2008_008093.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daijifeng001/MNC/HEAD/data/demo/2008_008093.jpg -------------------------------------------------------------------------------- /data/readme_img/example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daijifeng001/MNC/HEAD/data/readme_img/example.png -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "caffe-mnc"] 2 | path = caffe-mnc 3 | url = https://github.com/daijifeng001/caffe-mnc.git 4 | -------------------------------------------------------------------------------- /lib/nms/gpu_nms.hpp: -------------------------------------------------------------------------------- 1 | void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, 2 | int boxes_dim, float nms_overlap_thresh, int device_id); 3 | -------------------------------------------------------------------------------- /experiments/cfgs/VGG16/mnc_5stage.yml: -------------------------------------------------------------------------------- 1 | EXP_DIR: mnc_5stage 2 | MASK_SIZE: 21 3 | TRAIN: 4 | RPN_POST_NMS_TOP_N: 300 5 | IMS_PER_BATCH: 1 6 | BBOX_NORMALIZE_TARGETS_PRECOMPUTED: True 7 | -------------------------------------------------------------------------------- /experiments/cfgs/VGG16/faster_rcnn_end2end.yml: -------------------------------------------------------------------------------- 1 | EXP_DIR: faster_rcnn_end2end 2 | MNC_MODE: False 3 | TRAIN: 4 | MIX_INDEX: False 5 | IMS_PER_BATCH: 1 6 | BBOX_NORMALIZE_TARGETS_PRECOMPUTED: True 7 | BATCH_SIZE: 128 8 | FG_FRACTION: [0.25] 9 | BG_FRACTION: [1.0] 10 | BG_THRESH_HI: [0.5] 11 | BG_THRESH_LO: [0.1] 12 | -------------------------------------------------------------------------------- /data/scripts/fetch_mnc_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 4 | cd ${DIR} 5 | 6 | URL="https://onedrive.live.com/download?resid=571EABC0F8C2A19C!1103&authkey=!ALXduVujs-7r6Ug" 7 | 8 | echo "Downloading mnc model..." 
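# If the OneDrive link above stops resolving, mnc_model.caffemodel.h5 can be
# downloaded manually and placed in data/mnc_model/ (the directory created below).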
9 | 10 | mkdir ../mnc_model 11 | wget ${URL} -O mnc_model.caffemodel.h5 12 | 13 | mv mnc_model.caffemodel.h5 ../mnc_model/ 14 | -------------------------------------------------------------------------------- /experiments/cfgs/VGG16/cfm.yml: -------------------------------------------------------------------------------- 1 | EXP_DIR: cfm 2 | MASK_SIZE: 21 3 | CFM_MODE: True 4 | MNC_MODE: False 5 | RNG_SEED: 821 6 | TRAIN: 7 | HAS_RPN: False 8 | IMS_PER_BATCH: 1 9 | SCALES: [480, 576, 688, 864, 1024] 10 | MAX_SIZE: 1500 11 | TEST: 12 | SCALES: [480, 576, 688, 864, 1024] 13 | MAX_SIZE: 1500 14 | GROUP_SCALE: 3 15 | MAX_ROIS_GPU: [2000, 500] 16 | USE_TOP_K_MCG: 2000 17 | -------------------------------------------------------------------------------- /models/VGG16/cfm/solver.prototxt: -------------------------------------------------------------------------------- 1 | train_net: "models/VGG16/cfm/train.prototxt" 2 | base_lr: 0.001 3 | lr_policy: "step" 4 | gamma: 0.1 5 | stepsize: 20000 6 | display: 100 7 | average_loss: 100 8 | momentum: 0.9 9 | weight_decay: 0.0005 10 | 11 | # We disable standard caffe solver snapshotting and implement our own snapshot 12 | # function 13 | snapshot: 0 14 | snapshot_prefix: "cfm" 15 | iter_size: 8 16 | -------------------------------------------------------------------------------- /lib/nms/gpu_mv.hpp: -------------------------------------------------------------------------------- 1 | void _mv(const float* all_boxes, const float* all_masks, const int all_boxes_num, 2 | const int* candidate_inds, const int* candidate_start, const float* candidate_weights, const int candidate_num, 3 | const int image_height, const int image_width, const int box_dim, const int mask_size, const int result_num, 4 | float* finalize_output_mask, int* finalize_output_box, const int device_id); 5 | -------------------------------------------------------------------------------- /models/VGG16/mnc_5stage/solver.prototxt: -------------------------------------------------------------------------------- 1 | train_net: "models/VGG16/mnc_5stage/train.prototxt" 2 | base_lr: 0.001 3 | lr_policy: "step" 4 | gamma: 0.1 5 | stepsize: 20000 6 | display: 100 7 | average_loss: 100 8 | momentum: 0.9 9 | weight_decay: 0.0005 10 | # We disable standard caffe solver snapshotting and implement our own snapshot 11 | # function 12 | snapshot: 0 13 | snapshot_prefix: "vgg16_mnc_5stage" 14 | iter_size: 8 15 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files 2 | *.slo 3 | *.lo 4 | *.o 5 | *.obj 6 | 7 | # Precompiled Headers 8 | *.gch 9 | *.pch 10 | 11 | # Compiled Dynamic libraries 12 | *.so 13 | *.dylib 14 | *.dll 15 | 16 | # Fortran module files 17 | *.mod 18 | 19 | # Compiled Static libraries 20 | *.lai 21 | *.la 22 | *.a 23 | *.lib 24 | 25 | # Executables 26 | *.exe 27 | *.out 28 | *.app 29 | 30 | *.pyc 31 | *.png 32 | *.jpg 33 | *~ 34 | .idea -------------------------------------------------------------------------------- /models/VGG16/faster_rcnn_end2end/solver.prototxt: -------------------------------------------------------------------------------- 1 | train_net: "models/VGG16/faster_rcnn_end2end/train.prototxt" 2 | base_lr: 0.001 3 | lr_policy: "step" 4 | gamma: 0.1 5 | stepsize: 50000 6 | display: 20 7 | average_loss: 100 8 | # iter_size: 1 9 | momentum: 0.9 10 | weight_decay: 0.0005 11 | # We disable standard caffe solver snapshotting and 
implement our own snapshot
12 | # function
13 | snapshot: 0
14 | # We still use the snapshot prefix, though
15 | snapshot_prefix: "vgg16_faster_rcnn"
16 | iter_size: 2
17 | 
--------------------------------------------------------------------------------
/tools/_init_paths.py:
--------------------------------------------------------------------------------
1 | 
2 | import os.path
3 | import sys
4 | 
5 | """
6 | Add lib paths and caffe path to system search path
7 | """
8 | 
9 | 
10 | def add_path(path):
11 |     if path not in sys.path:
12 |         sys.path.insert(0, path)
13 | 
14 | cur_dir = os.path.dirname(__file__)
15 | 
16 | # Add caffe python to PYTHONPATH
17 | caffe_path = os.path.join(cur_dir, '..', 'caffe-mnc', 'python')
18 | add_path(caffe_path)
19 | 
20 | # Add lib to PYTHONPATH
21 | lib_path = os.path.join(cur_dir, '..', 'lib')
22 | add_path(lib_path)
--------------------------------------------------------------------------------
/data/scripts/fetch_mcg_data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
4 | cd ${DIR}
5 | 
6 | FILE=MCG-Pascal-Main_trainvaltest_2012-proposals.tgz
7 | URL=https://data.vision.ee.ethz.ch/jpont/mcg/${FILE}
8 | echo "Downloading MCG proposals data..."
9 | 
10 | wget $URL -O ${FILE}
11 | 
12 | echo "Unzipping..."
13 | 
14 | mkdir tmp
15 | tar zxvf ${FILE} -C tmp/ --strip-components=1
16 | 
17 | echo "Moving files to the target directory..."
18 | 
19 | mkdir ../MCG-raw/
20 | 
21 | mv tmp/* ../MCG-raw/
22 | 
23 | rm ${FILE}
24 | rm -r tmp
--------------------------------------------------------------------------------
/data/scripts/fetch_sbd_data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
4 | cd ${DIR}
5 | 
6 | FILE=benchmark.tgz
7 | URL=http://www.eecs.berkeley.edu/Research/Projects/CS/vision/grouping/semantic_contours/${FILE}
8 | echo "Downloading SBD data..."
9 | 
10 | wget $URL -O ${FILE}
11 | 
12 | echo "Unzipping..."
13 | 
14 | mkdir tmp
15 | tar zxvf ${FILE} -C tmp/ --strip-components=1
16 | 
17 | echo "Moving files to the target directory..."
18 | 
19 | mkdir -p ../VOCdevkitSDS/
20 | mv -v tmp/dataset/inst/ tmp/dataset/cls/ tmp/dataset/img/ -t ../VOCdevkitSDS/
21 | 
22 | rm benchmark.tgz
23 | rm -r tmp
--------------------------------------------------------------------------------
/data/scripts/fetch_imagenet_models.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
4 | cd ${DIR}
5 | 
6 | URL="https://onedrive.live.com/download?resid=F371D9563727B96F!91967&authkey=!AKjrYZBFAfb6JBQ"
7 | 
8 | echo "Downloading VGG16.mask.caffemodel model..."
9 | 
10 | mkdir ../imagenet_models/
11 | wget ${URL} -O VGG16.mask.caffemodel
12 | 
13 | mv VGG16.mask.caffemodel ../imagenet_models/
14 | 
15 | URL="https://onedrive.live.com/download?resid=F371D9563727B96F!91966&authkey=!ABoH69DkSk81FwA"
16 | 
17 | echo "Downloading VGG16.v2.caffemodel model..."
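# Note: VGG16.mask.caffemodel (above) is the initialization used by the CFM and
# MNC experiment scripts, while VGG16.v2.caffemodel (below) initializes
# faster_rcnn_end2end (see the NET_INIT lines in experiments/scripts/*.sh).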
18 | 
19 | wget ${URL} -O VGG16.v2.caffemodel
20 | 
21 | mv VGG16.v2.caffemodel ../imagenet_models/
22 | 
--------------------------------------------------------------------------------
/lib/utils/unmap.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Multitask Network Cascade
3 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn)
4 | # Copyright (c) 2016, Haozhi Qi
5 | # Licensed under The MIT License [see LICENSE for details]
6 | # --------------------------------------------------------
7 | 
8 | import numpy as np
9 | 
10 | 
11 | def unmap(data, count, inds, fill=0):
12 |     """ Unmap a subset of items (data) back to the original set of items (of
13 |     size count) """
14 |     if len(data.shape) == 1:
15 |         ret = np.empty((count, ), dtype=np.float32)
16 |         ret.fill(fill)
17 |         ret[inds] = data
18 |     else:
19 |         ret = np.empty((count, ) + data.shape[1:], dtype=np.float32)
20 |         ret.fill(fill)
21 |         ret[inds, :] = data
22 |     return ret
--------------------------------------------------------------------------------
/lib/db/imdb.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Multitask Network Cascade
3 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn)
4 | # Copyright (c) 2016, Haozhi Qi
5 | # Licensed under The MIT License [see LICENSE for details]
6 | # --------------------------------------------------------
7 | 
8 | from datasets.pascal_voc_det import PascalVOCDet
9 | from datasets.pascal_voc_seg import PascalVOCSeg
10 | 
11 | __sets = {
12 |     'voc_2012_seg_train': (lambda: PascalVOCSeg('train', '2012', 'data/VOCdevkitSDS/')),
13 |     'voc_2012_seg_val': (lambda: PascalVOCSeg('val', '2012', 'data/VOCdevkitSDS/')),
14 |     'voc_2007_trainval': (lambda: PascalVOCDet('trainval', '2007')),
15 |     'voc_2007_test': (lambda: PascalVOCDet('test', '2007'))
16 | }
17 | 
18 | 
19 | def get_imdb(name):
20 |     """ Get an imdb (image database) by name.
21 |     """
22 |     if name not in __sets:
23 |         raise KeyError('Unknown dataset: {}'.format(name))
24 |     return __sets[name]()
25 | 
26 | 
27 | def list_imdbs():
28 |     return __sets.keys()
29 | 
--------------------------------------------------------------------------------
/lib/utils/timer.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Multitask Network Cascade
3 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn)
4 | # Copyright (c) 2016, Haozhi Qi
5 | # Licensed under The MIT License [see LICENSE for details]
6 | # --------------------------------------------------------
7 | 
8 | import time
9 | 
10 | 
11 | class Timer(object):
12 |     """A simple timer."""
13 |     def __init__(self):
14 |         self.total_time = 0.
15 |         self.calls = 0
16 |         self.start_time = 0.
17 |         self.diff = 0.
18 |         self.average_time = 0.
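    # Example usage (illustration only, not used elsewhere in the repo):
    #   t = Timer()
    #   t.tic()
    #   ...timed section...
    #   t.toc()                 # returns the running average over all calls
    #   t.toc(average=False)    # returns the most recent interval instead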
19 | 
20 |     def tic(self):
21 |         # using time.time instead of time.clock because time.clock
22 |         # does not normalize for multithreading
23 |         self.start_time = time.time()
24 | 
25 |     def toc(self, average=True):
26 |         self.diff = time.time() - self.start_time
27 |         self.total_time += self.diff
28 |         self.calls += 1
29 |         self.average_time = self.total_time / self.calls
30 |         if average:
31 |             return self.average_time
32 |         else:
33 |             return self.diff
34 | 
--------------------------------------------------------------------------------
/lib/nms/py_cpu_nms.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Multitask Network Cascade
3 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn)
4 | # Copyright (c) 2016, Haozhi Qi
5 | # Licensed under The MIT License [see LICENSE for details]
6 | # --------------------------------------------------------
7 | 
8 | import numpy as np
9 | 
10 | def py_cpu_nms(dets, thresh):
11 |     """Pure Python NMS baseline."""
12 |     x1 = dets[:, 0]
13 |     y1 = dets[:, 1]
14 |     x2 = dets[:, 2]
15 |     y2 = dets[:, 3]
16 |     scores = dets[:, 4]
17 | 
18 |     areas = (x2 - x1 + 1) * (y2 - y1 + 1)
19 |     order = scores.argsort()[::-1]
20 | 
21 |     keep = []
22 |     while order.size > 0:
23 |         i = order[0]
24 |         keep.append(i)
25 |         xx1 = np.maximum(x1[i], x1[order[1:]])
26 |         yy1 = np.maximum(y1[i], y1[order[1:]])
27 |         xx2 = np.minimum(x2[i], x2[order[1:]])
28 |         yy2 = np.minimum(y2[i], y2[order[1:]])
29 | 
30 |         w = np.maximum(0.0, xx2 - xx1 + 1)
31 |         h = np.maximum(0.0, yy2 - yy1 + 1)
32 |         inter = w * h
33 |         ovr = inter / (areas[i] + areas[order[1:]] - inter)
34 | 
35 |         inds = np.where(ovr <= thresh)[0]
36 |         order = order[inds + 1]
37 | 
38 |     return keep
39 | 
--------------------------------------------------------------------------------
/lib/nms/gpu_nms.pyx:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Multitask Network Cascade
3 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn)
4 | # Copyright (c) 2016, Haozhi Qi
5 | # Licensed under The MIT License [see LICENSE for details]
6 | # --------------------------------------------------------
7 | 
8 | import numpy as np
9 | cimport numpy as np
10 | 
11 | assert sizeof(int) == sizeof(np.int32_t)
12 | 
13 | cdef extern from "gpu_nms.hpp":
14 |     void _nms(np.int32_t*, int*, np.float32_t*, int, int, float, int)
15 | 
16 | def gpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh,
17 |             np.int32_t device_id=0):
18 |     cdef int boxes_num = dets.shape[0]
19 |     cdef int boxes_dim = dets.shape[1]
20 |     cdef int num_out
21 |     cdef np.ndarray[np.int32_t, ndim=1] \
22 |         keep = np.zeros(boxes_num, dtype=np.int32)
23 |     cdef np.ndarray[np.float32_t, ndim=1] \
24 |         scores = dets[:, 4]
25 |     cdef np.ndarray[np.int_t, ndim=1] \
26 |         order = scores.argsort()[::-1]
27 |     cdef np.ndarray[np.float32_t, ndim=2] \
28 |         sorted_dets = dets[order, :]
29 |     _nms(&keep[0], &num_out, &sorted_dets[0, 0], boxes_num, boxes_dim, thresh, device_id)
30 |     keep = keep[:num_out]
31 |     return list(order[keep])
32 | 
--------------------------------------------------------------------------------
/lib/db/maskdb.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Multitask Network Cascade
3 | # Written by Haozhi Qi
4 | # Copyright (c) 2016, Haozhi Qi
5 | # Licensed under The MIT License [see LICENSE for details]
6 | # --------------------------------------------------------
7 | 
8 | from mnc_config import cfg
9 | from db.imdb import get_imdb
10 | 
11 | 
12 | def get_maskdb(imdb_name):
13 | 
14 |     imdb = get_imdb(imdb_name)
15 |     print 'Loaded dataset `{:s}` for training'.format(imdb.name)
16 |     # Set the roidb/maskdb handler functions here (e.g. gt_roidb in Faster R-CNN)
17 |     imdb.set_roi_handler(cfg.TRAIN.PROPOSAL_METHOD)
18 |     imdb.set_mask_handler(cfg.TRAIN.PROPOSAL_METHOD)
19 |     print 'Set proposal method: {:s}'.format(cfg.TRAIN.PROPOSAL_METHOD)
20 |     if cfg.TRAIN.USE_FLIPPED:
21 |         print 'Appending horizontally-flipped training examples...'
22 |         imdb.append_flipped_masks()
23 |         print 'done'
24 |     return imdb.maskdb
25 | 
26 | 
27 | def attach_maskdb(imdb_names):
28 |     """
29 |     Only a single maskdb is supported for now.
30 |     """
31 |     maskdbs = [get_maskdb(s) for s in imdb_names.split('+')]
32 |     maskdb = maskdbs[0]
33 |     if len(maskdbs) > 1:
34 |         raise NotImplementedError
35 |     else:
36 |         imdb = get_imdb(imdb_names)
37 |     return imdb, maskdb
38 | 
--------------------------------------------------------------------------------
/experiments/scripts/cfm.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Usage:
3 | # ./experiments/scripts/cfm.sh GPU NET [--set ...]
4 | # Example:
5 | # ./experiments/scripts/cfm.sh 0 VGG16 \
6 | #   --set EXP_DIR foobar RNG_SEED 42 TRAIN.SCALES "[400,500,600,700]"
7 | 
8 | set -x
9 | set -e
10 | 
11 | export PYTHONUNBUFFERED="True"
12 | 
13 | GPU_ID=$1
14 | NET=$2
15 | NET_lc=${NET,,}
16 | ITERS=30000
17 | DATASET_TRAIN=voc_2012_seg_train
18 | DATASET_TEST=voc_2012_seg_val
19 | array=( $@ )
20 | len=${#array[@]}
21 | EXTRA_ARGS=${array[@]:2:$len}
22 | EXTRA_ARGS_SLUG=${EXTRA_ARGS// /_}
23 | 
24 | LOG="experiments/logs/cfm_${NET}_${EXTRA_ARGS_SLUG}.txt.`date +'%Y-%m-%d_%H-%M-%S'`"
25 | exec &> >(tee -a "$LOG")
26 | echo Logging output to "$LOG"
27 | 
28 | NET_INIT=data/imagenet_models/${NET}.mask.caffemodel
29 | time ./tools/train_net.py --gpu ${GPU_ID} \
30 |   --solver models/${NET}/cfm/solver.prototxt \
31 |   --weights ${NET_INIT} \
32 |   --imdb ${DATASET_TRAIN} \
33 |   --iters ${ITERS} \
34 |   --cfg experiments/cfgs/${NET}/cfm.yml \
35 |   ${EXTRA_ARGS}
36 | 
37 | set +x
38 | NET_FINAL=`grep -B 1 "done solving" ${LOG} | grep "Wrote snapshot" | awk '{print $4}'`
39 | set -x
40 | 
41 | time ./tools/test_net.py --gpu ${GPU_ID} \
42 |   --def models/${NET}/cfm/test.prototxt \
43 |   --net ${NET_FINAL} \
44 |   --imdb ${DATASET_TEST} \
45 |   --cfg experiments/cfgs/${NET}/cfm.yml \
46 |   --task cfm
47 | 
--------------------------------------------------------------------------------
/experiments/scripts/mnc_5stage.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Usage:
3 | # ./experiments/scripts/mnc_5stage.sh GPU NET [--set ...]
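# (GPU is a CUDA device id; NET names a subdirectory of models/ and
#  experiments/cfgs/, e.g. VGG16.)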
4 | # Example:
5 | # ./experiments/scripts/mnc_5stage.sh 0 VGG16 \
6 | #   --set EXP_DIR foobar RNG_SEED 42 TRAIN.SCALES "[400,500,600,700]"
7 | 
8 | set -x
9 | set -e
10 | 
11 | export PYTHONUNBUFFERED="True"
12 | 
13 | GPU_ID=$1
14 | NET=$2
15 | NET_lc=${NET,,}
16 | ITERS=25000
17 | DATASET_TRAIN=voc_2012_seg_train
18 | DATASET_TEST=voc_2012_seg_val
19 | array=( $@ )
20 | len=${#array[@]}
21 | EXTRA_ARGS=${array[@]:2:$len}
22 | EXTRA_ARGS_SLUG=${EXTRA_ARGS// /_}
23 | 
24 | LOG="experiments/logs/mnc_5stage_${NET}_${EXTRA_ARGS_SLUG}.txt.`date +'%Y-%m-%d_%H-%M-%S'`"
25 | exec &> >(tee -a "$LOG")
26 | echo Logging output to "$LOG"
27 | 
28 | NET_INIT=data/imagenet_models/${NET}.mask.caffemodel
29 | time ./tools/train_net.py --gpu ${GPU_ID} \
30 |   --solver models/${NET}/mnc_5stage/solver.prototxt \
31 |   --weights ${NET_INIT} \
32 |   --imdb ${DATASET_TRAIN} \
33 |   --iters ${ITERS} \
34 |   --cfg experiments/cfgs/${NET}/mnc_5stage.yml \
35 |   ${EXTRA_ARGS}
36 | 
37 | set +x
38 | NET_FINAL=`grep -B 1 "done solving" ${LOG} | grep "Wrote snapshot" | awk '{print $4}'`
39 | set -x
40 | 
41 | time ./tools/test_net.py --gpu ${GPU_ID} \
42 |   --def models/${NET}/mnc_5stage/test.prototxt \
43 |   --net ${NET_FINAL} \
44 |   --imdb ${DATASET_TEST} \
45 |   --cfg experiments/cfgs/${NET}/mnc_5stage.yml \
46 |   --task seg
47 | 
48 | 
--------------------------------------------------------------------------------
/experiments/scripts/faster_rcnn_end2end.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Usage:
3 | # ./experiments/scripts/faster_rcnn_end2end.sh GPU NET [--set ...]
4 | # Example:
5 | # ./experiments/scripts/faster_rcnn_end2end.sh 0 VGG16 \
6 | #   --set EXP_DIR foobar RNG_SEED 42 TRAIN.SCALES "[400,500,600,700]"
7 | 
8 | set -x
9 | set -e
10 | 
11 | export PYTHONUNBUFFERED="True"
12 | 
13 | GPU_ID=$1
14 | NET=$2
15 | NET_lc=${NET,,}
16 | ITERS=70000
17 | DATASET_TRAIN=voc_2007_trainval
18 | DATASET_TEST=voc_2007_test
19 | 
20 | array=( $@ )
21 | len=${#array[@]}
22 | EXTRA_ARGS=${array[@]:2:$len}
23 | EXTRA_ARGS_SLUG=${EXTRA_ARGS// /_}
24 | 
25 | LOG="experiments/logs/faster_rcnn_end2end_${NET}_${EXTRA_ARGS_SLUG}.txt.`date +'%Y-%m-%d_%H-%M-%S'`"
26 | exec &> >(tee -a "$LOG")
27 | echo Logging output to "$LOG"
28 | 
29 | NET_INIT=data/imagenet_models/${NET}.v2.caffemodel
30 | time ./tools/train_net.py --gpu ${GPU_ID} \
31 |   --solver models/${NET}/faster_rcnn_end2end/solver.prototxt \
32 |   --weights ${NET_INIT} \
33 |   --imdb ${DATASET_TRAIN} \
34 |   --iters ${ITERS} \
35 |   --cfg experiments/cfgs/${NET}/faster_rcnn_end2end.yml \
36 |   ${EXTRA_ARGS}
37 | 
38 | set +x
39 | NET_FINAL=`grep -B 1 "done solving" ${LOG} | grep "Wrote snapshot" | awk '{print $4}'`
40 | set -x
41 | 
42 | time ./tools/test_net.py --gpu ${GPU_ID} \
43 |   --def models/${NET}/faster_rcnn_end2end/test.prototxt \
44 |   --net ${NET_FINAL} \
45 |   --imdb ${DATASET_TEST} \
46 |   --cfg experiments/cfgs/${NET}/faster_rcnn_end2end.yml \
47 |   --task det \
48 |   ${EXTRA_ARGS}
49 | 
--------------------------------------------------------------------------------
/lib/nms/gpu_mv.pyx:
--------------------------------------------------------------------------------
1 | 
2 | import numpy as np
3 | cimport numpy as np
4 | 
5 | assert sizeof(int) == sizeof(np.int32_t)
6 | 
7 | cdef extern from "gpu_mv.hpp":
8 |     void _mv(np.float32_t* all_boxes, np.float32_t* all_masks, np.int32_t all_boxes_num, np.int32_t* candidate_inds, np.int32_t* candidate_start, np.float32_t* candidate_weights, np.int32_t candidate_num, np.int32_t
image_height, np.int32_t image_width, np.int32_t box_dim, np.int32_t mask_size, np.int32_t result_num, np.float32_t* result_mask, np.int32_t* result_box, np.int32_t device_id); 9 | 10 | # boxes: n * 4 11 | # masks: n * 1 * 21 * 21 12 | # scores: n * 21 13 | def mv(np.ndarray[np.float32_t, ndim=2] all_boxes, 14 | np.ndarray[np.float32_t, ndim=4] all_masks, 15 | np.ndarray[np.int32_t, ndim=1] candidate_inds, 16 | np.ndarray[np.int32_t, ndim=1] candidate_start, 17 | np.ndarray[np.float32_t, ndim=1] candidate_weights, 18 | np.int32_t image_height, 19 | np.int32_t image_width, 20 | np.int32_t device_id = 0): 21 | cdef int all_box_num = all_boxes.shape[0] 22 | cdef int boxes_dim = all_boxes.shape[1] 23 | cdef int mask_size = all_masks.shape[3] 24 | cdef int candidate_num = candidate_inds.shape[0] 25 | cdef int result_num = candidate_start.shape[0] 26 | cdef np.ndarray[np.float32_t, ndim=4] \ 27 | result_mask = np.zeros((result_num, 1, all_masks.shape[2], all_masks.shape[3]), dtype=np.float32) 28 | cdef np.ndarray[np.int32_t, ndim=2] \ 29 | result_box = np.zeros((result_num, boxes_dim), dtype=np.int32) 30 | _mv(&all_boxes[0, 0], &all_masks[0, 0, 0, 0], all_box_num, &candidate_inds[0], &candidate_start[0], &candidate_weights[0], candidate_num, image_height, image_width, boxes_dim, mask_size, candidate_start.shape[0], &result_mask[0,0,0,0], &result_box[0,0], device_id) 31 | return result_mask, result_box 32 | -------------------------------------------------------------------------------- /lib/utils/bbox.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Multitask Network Cascade 3 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn) 4 | # Copyright (c) 2016, Haozhi Qi 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # -------------------------------------------------------- 7 | 8 | cimport cython 9 | import numpy as np 10 | cimport numpy as np 11 | 12 | DTYPE = np.float 13 | ctypedef np.float_t DTYPE_t 14 | 15 | def bbox_overlaps( 16 | np.ndarray[DTYPE_t, ndim=2] boxes, 17 | np.ndarray[DTYPE_t, ndim=2] query_boxes): 18 | """ 19 | Parameters 20 | ---------- 21 | boxes: (N, 4) ndarray of float 22 | query_boxes: (K, 4) ndarray of float 23 | Returns 24 | ------- 25 | overlaps: (N, K) ndarray of overlap between boxes and query_boxes 26 | """ 27 | cdef unsigned int N = boxes.shape[0] 28 | cdef unsigned int K = query_boxes.shape[0] 29 | cdef np.ndarray[DTYPE_t, ndim=2] overlaps = np.zeros((N, K), dtype=DTYPE) 30 | cdef DTYPE_t iw, ih, box_area 31 | cdef DTYPE_t ua 32 | cdef unsigned int k, n 33 | for k in range(K): 34 | box_area = ( 35 | (query_boxes[k, 2] - query_boxes[k, 0] + 1) * 36 | (query_boxes[k, 3] - query_boxes[k, 1] + 1) 37 | ) 38 | for n in range(N): 39 | iw = ( 40 | min(boxes[n, 2], query_boxes[k, 2]) - 41 | max(boxes[n, 0], query_boxes[k, 0]) + 1 42 | ) 43 | if iw > 0: 44 | ih = ( 45 | min(boxes[n, 3], query_boxes[k, 3]) - 46 | max(boxes[n, 1], query_boxes[k, 1]) + 1 47 | ) 48 | if ih > 0: 49 | ua = float( 50 | (boxes[n, 2] - boxes[n, 0] + 1) * 51 | (boxes[n, 3] - boxes[n, 1] + 1) + 52 | box_area - iw * ih 53 | ) 54 | overlaps[n, k] = iw * ih / ua 55 | return overlaps 56 | -------------------------------------------------------------------------------- /lib/nms/cpu_nms.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Multitask Network 
Cascade 3 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn) 4 | # Copyright (c) 2016, Haozhi Qi 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | cimport numpy as np 10 | 11 | cdef inline np.float32_t max(np.float32_t a, np.float32_t b): 12 | return a if a >= b else b 13 | 14 | cdef inline np.float32_t min(np.float32_t a, np.float32_t b): 15 | return a if a <= b else b 16 | 17 | def cpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh): 18 | cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0] 19 | cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1] 20 | cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2] 21 | cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3] 22 | cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4] 23 | 24 | cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1) 25 | cdef np.ndarray[np.int_t, ndim=1] order = scores.argsort()[::-1] 26 | 27 | cdef int ndets = dets.shape[0] 28 | cdef np.ndarray[np.int_t, ndim=1] suppressed = \ 29 | np.zeros((ndets), dtype=np.int) 30 | 31 | # nominal indices 32 | cdef int _i, _j 33 | # sorted indices 34 | cdef int i, j 35 | # temp variables for box i's (the box currently under consideration) 36 | cdef np.float32_t ix1, iy1, ix2, iy2, iarea 37 | # variables for computing overlap with box j (lower scoring box) 38 | cdef np.float32_t xx1, yy1, xx2, yy2 39 | cdef np.float32_t w, h 40 | cdef np.float32_t inter, ovr 41 | 42 | keep = [] 43 | for _i in range(ndets): 44 | i = order[_i] 45 | if suppressed[i] == 1: 46 | continue 47 | keep.append(i) 48 | ix1 = x1[i] 49 | iy1 = y1[i] 50 | ix2 = x2[i] 51 | iy2 = y2[i] 52 | iarea = areas[i] 53 | for _j in range(_i + 1, ndets): 54 | j = order[_j] 55 | if suppressed[j] == 1: 56 | continue 57 | xx1 = max(ix1, x1[j]) 58 | yy1 = max(iy1, y1[j]) 59 | xx2 = min(ix2, x2[j]) 60 | yy2 = min(iy2, y2[j]) 61 | w = max(0.0, xx2 - xx1 + 1) 62 | h = max(0.0, yy2 - yy1 + 1) 63 | inter = w * h 64 | ovr = inter / (iarea + areas[j] - inter) 65 | if ovr >= thresh: 66 | suppressed[j] = 1 67 | 68 | return keep 69 | -------------------------------------------------------------------------------- /lib/nms/nms_wrapper.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Multitask Network Cascade 3 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn) 4 | # Copyright (c) 2016, Haozhi Qi 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # -------------------------------------------------------- 7 | 8 | from mnc_config import cfg 9 | from gpu_nms import gpu_nms 10 | from cpu_nms import cpu_nms 11 | 12 | 13 | def nms(dets, thresh): 14 | """Dispatch to either CPU or GPU NMS implementations.""" 15 | 16 | if dets.shape[0] == 0: 17 | return [] 18 | if cfg.USE_GPU_NMS: 19 | return gpu_nms(dets, thresh, device_id=cfg.GPU_ID) 20 | else: 21 | return cpu_nms(dets, thresh) 22 | 23 | 24 | def apply_nms(all_boxes, thresh): 25 | """Apply non-maximum suppression to all predicted boxes output by the 26 | test_net method. 
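    all_boxes is indexed as all_boxes[class][image]; each entry is assumed to
    be an (N, 5) array of [x1, y1, x2, y2, score] rows, matching the column
    layout used by the NMS routines above.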
27 | """ 28 | num_classes = len(all_boxes) 29 | num_images = len(all_boxes[0]) 30 | nms_boxes = [[[] for _ in xrange(num_images)] 31 | for _ in xrange(num_classes)] 32 | for cls_ind in xrange(num_classes): 33 | for im_ind in xrange(num_images): 34 | dets = all_boxes[cls_ind][im_ind] 35 | if dets == []: 36 | continue 37 | keep = nms(dets, thresh) 38 | if len(keep) == 0: 39 | continue 40 | nms_boxes[cls_ind][im_ind] = dets[keep, :].copy() 41 | return nms_boxes 42 | 43 | 44 | def apply_nms_mask(all_boxes, all_masks, thresh): 45 | num_classes = len(all_boxes) 46 | num_images = len(all_boxes[0]) 47 | nms_boxes = [[[] for _ in xrange(num_images)] 48 | for _ in xrange(num_classes)] 49 | nms_masks = [[[] for _ in xrange(num_images)] 50 | for _ in xrange(num_classes)] 51 | for cls_ind in xrange(num_classes): 52 | for im_ind in xrange(num_images): 53 | dets = all_boxes[cls_ind][im_ind] 54 | masks = all_masks[cls_ind][im_ind] 55 | if dets == []: 56 | continue 57 | keep = nms(dets, thresh) 58 | if len(keep) == 0: 59 | continue 60 | nms_boxes[cls_ind][im_ind] = dets[keep, :].copy() 61 | nms_masks[cls_ind][im_ind] = masks[keep, :].copy() 62 | return nms_boxes, nms_masks 63 | 64 | 65 | def apply_nms_mask_single(box, mask, thresh): 66 | if box == []: 67 | return box, mask 68 | keep = nms(box, thresh) 69 | if len(keep) == 0: 70 | return box, mask 71 | return box[keep, :].copy(), mask[keep, :].copy() 72 | -------------------------------------------------------------------------------- /tools/test_net.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # -------------------------------------------------------- 4 | # Multitask Network Cascade 5 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn) 6 | # Copyright (c) 2016, Haozhi Qi 7 | # Licensed under The MIT License [see LICENSE for details] 8 | # -------------------------------------------------------- 9 | 10 | # Standard module 11 | import argparse 12 | import sys 13 | import os 14 | import time 15 | import pprint 16 | # User-defined module 17 | import _init_paths 18 | import caffe 19 | from mnc_config import cfg, cfg_from_file 20 | from db.imdb import get_imdb 21 | from caffeWrapper.TesterWrapper import TesterWrapper 22 | 23 | 24 | def parse_args(): 25 | """ 26 | Parse input arguments 27 | """ 28 | parser = argparse.ArgumentParser(description='Test a Fast R-CNN network') 29 | parser.add_argument('--gpu', dest='gpu_id', help='GPU id to use', 30 | default=0, type=int) 31 | parser.add_argument('--def', dest='prototxt', 32 | help='prototxt file defining the network', 33 | default=None, type=str) 34 | parser.add_argument('--net', dest='caffemodel', 35 | help='model to test', 36 | default=None, type=str) 37 | parser.add_argument('--cfg', dest='cfg_file', 38 | help='optional config file', default=None, type=str) 39 | parser.add_argument('--imdb', dest='imdb_name', 40 | help='dataset to test', 41 | default='voc_2007_test', type=str) 42 | parser.add_argument('--wait', dest='wait', 43 | help='wait until net file exists', 44 | default=True, type=bool) 45 | parser.add_argument('--comp', dest='comp_mode', help='competition mode', 46 | action='store_true') 47 | parser.add_argument('--set', dest='set_cfgs', 48 | help='set config keys', default=None, 49 | nargs=argparse.REMAINDER) 50 | parser.add_argument('--task', dest='task_name', 51 | help='set task name', default='sds', 52 | type=str) 53 | 54 | if len(sys.argv) == 1: 55 | parser.print_help() 56 | sys.exit(1) 57 | 58 | 
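    # Example invocation (mirroring experiments/scripts/mnc_5stage.sh):
    #   ./tools/test_net.py --gpu 0 \
    #       --def models/VGG16/mnc_5stage/test.prototxt \
    #       --net <trained .caffemodel> \
    #       --imdb voc_2012_seg_val --task seg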
    return parser.parse_args()
59 | 
60 | 
61 | if __name__ == '__main__':
62 |     args = parse_args()
63 | 
64 |     print('Called with args:')
65 |     print(args)
66 | 
67 |     if args.cfg_file is not None:
68 |         cfg_from_file(args.cfg_file)
69 | 
70 |     cfg.GPU_ID = args.gpu_id
71 | 
72 |     print('Using config:')
73 |     pprint.pprint(cfg)
74 | 
75 |     while not os.path.exists(args.caffemodel) and args.wait:
76 |         print('Waiting for {} to exist...'.format(args.caffemodel))
77 |         time.sleep(10)
78 | 
79 |     caffe.set_mode_gpu()
80 |     caffe.set_device(args.gpu_id)
81 | 
82 |     imdb = get_imdb(args.imdb_name)
83 |     _tester = TesterWrapper(args.prototxt, imdb, args.caffemodel, args.task_name)
84 |     _tester.get_result()
85 | 
--------------------------------------------------------------------------------
/tools/train_net.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | # --------------------------------------------------------
4 | # Multitask Network Cascade
5 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn)
6 | # Copyright (c) 2016, Haozhi Qi
7 | # Licensed under The MIT License [see LICENSE for details]
8 | # --------------------------------------------------------
9 | 
10 | # Standard module
11 | import argparse
12 | import sys
13 | import pprint
14 | import numpy as np
15 | # User-defined module
16 | import _init_paths
17 | from mnc_config import cfg, cfg_from_file, get_output_dir  # config mnc
18 | from db.roidb import attach_roidb
19 | from db.maskdb import attach_maskdb
20 | from caffeWrapper.SolverWrapper import SolverWrapper
21 | import caffe
22 | 
23 | 
24 | def parse_args():
25 |     """ Parse input arguments
26 |     """
27 |     parser = argparse.ArgumentParser(description='Train a Fast R-CNN network')
28 |     parser.add_argument('--gpu', dest='gpu_id',
29 |                         help='GPU device id to use [0]',
30 |                         default=0, type=int)
31 |     parser.add_argument('--solver', dest='solver',
32 |                         help='solver prototxt',
33 |                         default=None, type=str)
34 |     parser.add_argument('--iters', dest='max_iters',
35 |                         help='number of iterations to train',
36 |                         default=40000, type=int)
37 |     parser.add_argument('--weights', dest='pretrained_model',
38 |                         help='initialize with pretrained model weights',
39 |                         default=None, type=str)
40 |     parser.add_argument('--cfg', dest='cfg_file',
41 |                         help='optional config file',
42 |                         default=None, type=str)
43 |     parser.add_argument('--imdb', dest='imdb_name',
44 |                         help='dataset to train on',
45 |                         default='voc_2007_trainval', type=str)
46 |     parser.add_argument('--rand', dest='randomize',
47 |                         help='randomize (do not use a fixed seed)',
48 |                         action='store_true')
49 |     parser.add_argument('--set', dest='set_cfgs',
50 |                         help='set config keys', default=None,
51 |                         nargs=argparse.REMAINDER)
52 | 
53 |     if len(sys.argv) == 1:
54 |         parser.print_help()
55 |         sys.exit(1)
56 | 
57 |     return parser.parse_args()
58 | 
59 | 
60 | if __name__ == '__main__':
61 |     args = parse_args()
62 |     print('Called with args:')
63 |     print(args)
64 | 
65 |     if args.cfg_file is not None:
66 |         cfg_from_file(args.cfg_file)
67 | 
68 |     cfg.GPU_ID = args.gpu_id
69 |     print('Using config:')
70 |     pprint.pprint(cfg)
71 | 
72 |     caffe.set_mode_gpu()
73 |     caffe.set_device(args.gpu_id)
74 | 
75 |     if not args.randomize:
76 |         # fix the random seeds (numpy and caffe) for reproducibility
77 |         np.random.seed(cfg.RNG_SEED)
78 |         caffe.set_random_seed(cfg.RNG_SEED)
79 | 
80 |     # get imdb and roidb from specified imdb_name
81 |     imdb, roidb = attach_roidb(args.imdb_name)
82 |     # Faster RCNN doesn't need a maskdb
83 |     if
cfg.MNC_MODE or cfg.CFM_MODE: 84 | imdb, maskdb = attach_maskdb(args.imdb_name) 85 | else: 86 | maskdb = None 87 | print '{:d} roidb entries'.format(len(roidb)) 88 | 89 | output_dir = get_output_dir(imdb, None) 90 | print 'Output will be saved to `{:s}`'.format(output_dir) 91 | 92 | _solver = SolverWrapper(args.solver, roidb, maskdb, output_dir, imdb, 93 | pretrained_model=args.pretrained_model) 94 | 95 | print 'Solving...' 96 | _solver.train_model(args.max_iters) 97 | print 'done solving' 98 | 99 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Faster R-CNN 2 | 3 | The MIT License (MIT) 4 | 5 | Copyright (c) 2015 Microsoft Corporation 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy 8 | of this software and associated documentation files (the "Software"), to deal 9 | in the Software without restriction, including without limitation the rights 10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | copies of the Software, and to permit persons to whom the Software is 12 | furnished to do so, subject to the following conditions: 13 | 14 | The above copyright notice and this permission notice shall be included in 15 | all copies or substantial portions of the Software. 16 | 17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 23 | THE SOFTWARE. 24 | 25 | ************************************************************************ 26 | 27 | THIRD-PARTY SOFTWARE NOTICES AND INFORMATION 28 | 29 | This project, Faster R-CNN, incorporates material from the project(s) listed below (collectively, "Third Party Code"). Microsoft is not the original author of the Third Party Code. The original copyright notice and license under which Microsoft received such Third Party Code are set out below. This Third Party Code is licensed to you under their original license terms set forth below. Microsoft reserves all other rights not expressly granted, whether by implication, estoppel or otherwise. 30 | 31 | 1. Caffe, version 0.9, (https://github.com/BVLC/caffe/) 32 | 33 | COPYRIGHT 34 | 35 | All contributions by the University of California: 36 | Copyright (c) 2014, 2015, The Regents of the University of California (Regents) 37 | All rights reserved. 38 | 39 | All other contributions: 40 | Copyright (c) 2014, 2015, the respective contributors 41 | All rights reserved. 42 | 43 | Caffe uses a shared copyright model: each contributor holds copyright over their contributions to Caffe. The project versioning records all such contribution and copyright details. If a contributor wants to further mark their specific copyright on a particular contribution, they should indicate their copyright solely in the commit message of the change when it is committed. 44 | 45 | The BSD 2-Clause License 46 | 47 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 48 | 49 | 1. 
Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
50 | 
51 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
52 | 
53 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
54 | 
55 | ************END OF THIRD-PARTY SOFTWARE NOTICES AND INFORMATION**********
56 | 
57 | 
--------------------------------------------------------------------------------
/lib/datasets/pascal_voc.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Multitask Network Cascade
3 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn)
4 | # Copyright (c) 2016, Haozhi Qi
5 | # Licensed under The MIT License [see LICENSE for details]
6 | # --------------------------------------------------------
7 | 
8 | import os
9 | import numpy as np
10 | import scipy.sparse
11 | from mnc_config import cfg
12 | 
13 | 
14 | class PascalVOC(object):
15 |     """ A base class for image database."""
16 |     def __init__(self, name):
17 |         self._name = name
18 |         self._num_classes = 0
19 |         self._classes = []
20 |         self._image_index = []
21 |         self._obj_proposer = 'selective_search'
22 |         self._roidb = None
23 |         self._roidb_handler = self.default_roidb
24 |         self._maskdb = None
25 |         self._maskdb_handler = self.default_maskdb
26 |         # Use this dict for storing dataset specific config options
27 |         self.config = {}
28 | 
29 |     @property
30 |     def name(self):
31 |         return self._name
32 | 
33 |     @property
34 |     def num_classes(self):
35 |         return len(self._classes)
36 | 
37 |     @property
38 |     def classes(self):
39 |         return self._classes
40 | 
41 |     @property
42 |     def image_index(self):
43 |         return self._image_index
44 | 
45 |     @property
46 |     def roidb_handler(self):
47 |         return self._roidb_handler
48 | 
49 |     @roidb_handler.setter
50 |     def roidb_handler(self, val):
51 |         self._roidb_handler = val
52 | 
53 |     @property
54 |     def maskdb_handler(self):
55 |         return self._maskdb_handler
56 | 
57 |     @maskdb_handler.setter
58 |     def maskdb_handler(self, val):
59 |         self._maskdb_handler = val
60 | 
61 |     @property
62 |     def roidb(self):
63 |         # A roidb is a 'list of dictionaries', each with the following keys:
64 |         #   boxes: the numpy array for boxes coordinate
65 |         #   gt_overlaps: overlap ratio for ground truth
66 |         #   gt_classes: ground truth class for that box
67 |         #   flipped: whether the image is flipped
68 |         if self._roidb is not None:
69 |             return self._roidb
70 |         self._roidb = self.roidb_handler()
71 |         return self._roidb
72 | 
73 |     @property
74 |     def maskdb(self):
75 |         if self._maskdb is not None:
76 |             return self._maskdb
77 |         else:
78 |             self._maskdb =
self.maskdb_handler() 79 | return self._maskdb 80 | 81 | @property 82 | def cache_path(self): 83 | cache_path = os.path.abspath(os.path.join(cfg.DATA_DIR, 'cache')) 84 | if not os.path.exists(cache_path): 85 | os.makedirs(cache_path) 86 | return cache_path 87 | 88 | @property 89 | def num_images(self): 90 | return len(self.image_index) 91 | 92 | def set_roi_handler(self, method): 93 | method = eval('self.' + method + '_roidb') 94 | self.roidb_handler = method 95 | 96 | def set_mask_handler(self, method): 97 | method = eval('self.' + method + '_maskdb') 98 | self.maskdb_handler = method 99 | 100 | def image_path_at(self, i): 101 | raise NotImplementedError 102 | 103 | def default_roidb(self): 104 | raise NotImplementedError 105 | 106 | def default_maskdb(self): 107 | raise NotImplementedError 108 | 109 | def competition_mode(self, on): 110 | """Turn competition mode on or off.""" 111 | pass 112 | 113 | @staticmethod 114 | def merge_roidbs(a, b): 115 | assert len(a) == len(b) 116 | for i in xrange(len(a)): 117 | a[i]['boxes'] = np.vstack((a[i]['boxes'], b[i]['boxes'])) 118 | a[i]['gt_classes'] = np.hstack((a[i]['gt_classes'], 119 | b[i]['gt_classes'])) 120 | a[i]['gt_overlaps'] = scipy.sparse.vstack([a[i]['gt_overlaps'], 121 | b[i]['gt_overlaps']]) 122 | return a 123 | -------------------------------------------------------------------------------- /lib/utils/blob.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Multitask Network Cascade 3 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn) 4 | # Copyright (c) 2016, Haozhi Qi 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # -------------------------------------------------------- 7 | 8 | """Blob helper functions.""" 9 | 10 | import numpy as np 11 | import random 12 | import cv2 13 | from utils.cython_bbox import bbox_overlaps 14 | from mnc_config import cfg 15 | 16 | 17 | def im_list_to_blob(ims): 18 | """ 19 | Convert a list of images into a network input. 20 | Assumes images are already prepared (means subtracted, BGR order, ...). 21 | """ 22 | max_shape = np.array([im.shape for im in ims]).max(axis=0) 23 | num_images = len(ims) 24 | blob = np.zeros((num_images, max_shape[0], max_shape[1], 3), 25 | dtype=np.float32) 26 | for i in xrange(num_images): 27 | im = ims[i] 28 | blob[i, 0:im.shape[0], 0:im.shape[1], :] = im 29 | # Move channels (axis 3) to axis 1 30 | # Axis order will become: (batch elem, channel, height, width) 31 | channel_swap = (0, 3, 1, 2) 32 | blob = blob.transpose(channel_swap) 33 | return blob 34 | 35 | 36 | def prep_im_for_blob(im, pixel_means, target_size, max_size): 37 | """Mean subtract and scale an image for use in a blob.""" 38 | im = im.astype(np.float32, copy=False) 39 | im -= pixel_means 40 | im_shape = im.shape 41 | im_size_min = np.min(im_shape[0:2]) 42 | im_size_max = np.max(im_shape[0:2]) 43 | im_scale = float(target_size) / float(im_size_min) 44 | # Prevent the biggest axis from being more than MAX_SIZE 45 | if np.round(im_scale * im_size_max) > max_size: 46 | im_scale = float(max_size) / float(im_size_max) 47 | im = cv2.resize(im, None, None, fx=im_scale, fy=im_scale, 48 | interpolation=cv2.INTER_LINEAR) 49 | 50 | return im, im_scale 51 | 52 | 53 | def prep_im_for_blob_cfm(im, input_scales): 54 | """Converts an image into a network input. 
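        One pyramid level is produced per entry of input_scales (target sizes
        for the shorter image side), each capped by cfg.TEST.MAX_SIZE.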
55 |     Arguments:
56 |         im (ndarray): a color image in BGR order
57 |     Returns:
58 |         blob (ndarray): a data blob holding an image pyramid
59 |         im_scale_factors (list): list of image scales (relative to im) used
60 |             in the image pyramid
61 |     """
62 |     im_orig = im.astype(np.float32, copy=True)
63 |     im_orig -= cfg.PIXEL_MEANS
64 | 
65 |     im_shape = im_orig.shape
66 |     im_size_min = np.min(im_shape[0:2])
67 |     im_size_max = np.max(im_shape[0:2])
68 | 
69 |     processed_ims = []
70 |     im_scale_factors = []
71 | 
72 |     for target_size in input_scales:
73 |         im_scale = float(target_size) / float(im_size_min)
74 |         # Prevent the biggest axis from being more than MAX_SIZE
75 |         if np.round(im_scale * im_size_max) > cfg.TEST.MAX_SIZE:
76 |             im_scale = float(cfg.TEST.MAX_SIZE) / float(im_size_max)
77 |         im = cv2.resize(im_orig, None, None, fx=im_scale, fy=im_scale,
78 |                         interpolation=cv2.INTER_LINEAR)
79 |         im_scale_factors.append(im_scale)
80 |         processed_ims.append(im)
81 | 
82 |     # Create a blob to hold the input images
83 |     blob = im_list_to_blob(processed_ims)
84 | 
85 |     return blob, np.array(im_scale_factors)
86 | 
87 | 
88 | def pred_rois_for_blob(im_rois, im_scales):
89 |     """
90 |     Convert rois to the network input format;
91 |     supports multi-scale testing
92 |     """
93 |     im_rois = im_rois.astype(np.float, copy=False)
94 |     if len(im_scales) > 1:
95 |         widths = im_rois[:, 2] - im_rois[:, 0] + 1
96 |         heights = im_rois[:, 3] - im_rois[:, 1] + 1
97 | 
98 |         areas = widths * heights
99 |         scaled_areas = areas[:, np.newaxis] * (im_scales[np.newaxis, :] ** 2)
100 |         diff_areas = np.abs(scaled_areas - 224 * 224)
101 |         levels = diff_areas.argmin(axis=1)[:, np.newaxis]
102 |     else:
103 |         levels = np.zeros((im_rois.shape[0], 1), dtype=np.int)
104 |     im_rois = im_rois * im_scales[levels]
105 |     rois_blob = np.hstack((levels.astype(np.float), im_rois))
106 |     return rois_blob
107 | 
108 | 
--------------------------------------------------------------------------------
/lib/pylayer/mask_layer.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Multitask Network Cascade
3 | # Written by Haozhi Qi
4 | # Copyright (c) 2016, Haozhi Qi
5 | # Licensed under The MIT License [see LICENSE for details]
6 | # --------------------------------------------------------
7 | 
8 | import caffe
9 | import cv2
10 | import numpy as np
11 | from transform.mask_transform import mask_overlap
12 | from mnc_config import cfg
13 | 
14 | 
15 | class MaskLayer(caffe.Layer):
16 |     """
17 |     This layer takes sigmoid-predicted masks as input and
18 |     assigns a label for the segmentation classifier to each
19 |     mask according to region overlap with the ground truth.
20 |     """
21 | 
22 |     def setup(self, bottom, top):
23 |         self._phase = str(self.phase)
24 |         self._top_name_map = {}
25 |         top[0].reshape(1, 1, cfg.MASK_SIZE, cfg.MASK_SIZE)
26 |         self._top_name_map['mask_proposal'] = 0
27 |         if self._phase == 'TRAIN':
28 |             top[1].reshape(1, 1)
29 |             self._top_name_map['mask_proposal_label'] = 1
30 | 
31 |     def reshape(self, bottom, top):
32 |         """
33 |         Reshaping happens during the call to forward
34 |         """
35 |         pass
36 | 
37 |     def forward(self, bottom, top):
38 |         if str(self.phase) == 'TRAIN':
39 |             blobs = self.forward_train(bottom, top)
40 |         elif str(self.phase) == 'TEST':
41 |             blobs = self.forward_test(bottom, top)
42 |         else:
43 |             print 'Unrecognized phase'
44 |             raise NotImplementedError
45 | 
46 |         for blob_name, blob in blobs.iteritems():
47 |             top[self._top_name_map[blob_name]].reshape(*blob.shape)
48 |             top[self._top_name_map[blob_name]].data[...]
= blob.astype(np.float32, copy=False) 49 | 50 | def backward(self, top, propagate_down, bottom): 51 | if propagate_down[0]: 52 | bottom[0].diff.fill(0.) 53 | top_grad = top[0].diff.reshape(top[0].diff.shape[0], cfg.MASK_SIZE * cfg.MASK_SIZE) 54 | bottom[0].diff[self.pos_sample, :] = top_grad[self.pos_sample, :] 55 | 56 | def forward_train(self, bottom, top): 57 | # Take sigmoid prediction as input 58 | mask_pred = bottom[0].data 59 | # get ground truth mask and labels 60 | gt_masks = bottom[1].data 61 | gt_masks_info = bottom[2].data 62 | num_mask_pred = mask_pred.shape[0] 63 | top_label = np.zeros((gt_masks_info.shape[0], 1)) 64 | # 2. Calculate region overlap 65 | # Since the target gt mask may have different size 66 | # We need to resize predicted masks into different sizes 67 | mask_size = cfg.MASK_SIZE 68 | for i in xrange(num_mask_pred): 69 | # if the bounding box is itself background 70 | if gt_masks_info[i][0] == -1: 71 | top_label[i][0] = 0 72 | continue 73 | else: 74 | info = gt_masks_info[i] 75 | gt_mask = gt_masks[info[0]][0:info[1], 0:info[2]] 76 | ex_mask = mask_pred[i].reshape((mask_size, mask_size)) 77 | ex_box = np.round(info[4:8]).astype(int) 78 | gt_box = np.round(info[8:12]).astype(int) 79 | # resize to large gt_masks, note cv2.resize is column first 80 | ex_mask = cv2.resize(ex_mask.astype(np.float32), (ex_box[2] - ex_box[0] + 1, 81 | ex_box[3] - ex_box[1] + 1)) 82 | ex_mask = ex_mask >= cfg.BINARIZE_THRESH 83 | top_label[i][0] = 0 if mask_overlap(ex_box, gt_box, ex_mask, gt_mask) < cfg.TRAIN.FG_SEG_THRESH else info[3] 84 | 85 | # output continuous mask for MNC 86 | resized_mask_pred = mask_pred.reshape((num_mask_pred, 1, cfg.MASK_SIZE, cfg.MASK_SIZE)) 87 | self.pos_sample = np.where(top_label > 0)[0] 88 | 89 | blobs = { 90 | 'mask_proposal': resized_mask_pred, 91 | 'mask_proposal_label': top_label 92 | } 93 | return blobs 94 | 95 | def forward_test(self, bottom, top): 96 | mask_pred = bottom[0].data 97 | num_mask_pred = mask_pred.shape[0] 98 | resized_mask_pred = mask_pred.reshape((num_mask_pred, 1, cfg.MASK_SIZE, cfg.MASK_SIZE)) 99 | blobs = { 100 | 'mask_proposal': resized_mask_pred 101 | } 102 | return blobs 103 | -------------------------------------------------------------------------------- /lib/transform/anchors.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Multitask Network Cascade 3 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn) 4 | # Copyright (c) 2016, Haozhi Qi 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | 10 | # Verify that we compute the same anchors as Shaoqing's matlab implementation: 11 | # 12 | # >> load output/rpn_cachedir/faster_rcnn_VOC2007_ZF_stage1_rpn/anchors.mat 13 | # >> anchors 14 | # 15 | # anchors = 16 | # 17 | # -83 -39 100 56 18 | # -175 -87 192 104 19 | # -359 -183 376 200 20 | # -55 -55 72 72 21 | # -119 -119 136 136 22 | # -247 -247 264 264 23 | # -35 -79 52 96 24 | # -79 -167 96 184 25 | # -167 -343 184 360 26 | 27 | # array([[ -83., -39., 100., 56.], 28 | # [-175., -87., 192., 104.], 29 | # [-359., -183., 376., 200.], 30 | # [ -55., -55., 72., 72.], 31 | # [-119., -119., 136., 136.], 32 | # [-247., -247., 264., 264.], 33 | # [ -35., -79., 52., 96.], 34 | # [ -79., -167., 96., 184.], 35 | # [-167., -343., 184., 360.]]) 36 | 37 | 38 | def generate_anchors(base_size=16, ratios=[0.5, 1, 
2], 39 | scales=2**np.arange(3, 6)): 40 | """ 41 | Generate anchor (reference) windows by enumerating aspect ratios X 42 | scales wrt a reference (0, 0, 15, 15) window. 43 | """ 44 | 45 | base_anchor = np.array([1, 1, base_size, base_size]) - 1 46 | ratio_anchors = _ratio_enum(base_anchor, ratios) 47 | anchors = np.vstack([_scale_enum(ratio_anchors[i, :], scales) 48 | for i in xrange(ratio_anchors.shape[0])]) 49 | return anchors 50 | 51 | 52 | def _whctrs(anchor): 53 | """ 54 | Return width, height, x center, and y center for an anchor (window). 55 | """ 56 | 57 | w = anchor[2] - anchor[0] + 1 58 | h = anchor[3] - anchor[1] + 1 59 | x_ctr = anchor[0] + 0.5 * (w - 1) 60 | y_ctr = anchor[1] + 0.5 * (h - 1) 61 | return w, h, x_ctr, y_ctr 62 | 63 | 64 | def _mkanchors(ws, hs, x_ctr, y_ctr): 65 | """ 66 | Given a vector of widths (ws) and heights (hs) around a center 67 | (x_ctr, y_ctr), output a set of anchors (windows). 68 | """ 69 | 70 | ws = ws[:, np.newaxis] 71 | hs = hs[:, np.newaxis] 72 | anchors = np.hstack((x_ctr - 0.5 * (ws - 1), 73 | y_ctr - 0.5 * (hs - 1), 74 | x_ctr + 0.5 * (ws - 1), 75 | y_ctr + 0.5 * (hs - 1))) 76 | return anchors 77 | 78 | 79 | def _ratio_enum(anchor, ratios): 80 | """ 81 | Enumerate a set of anchors for each aspect ratio wrt an anchor. 82 | """ 83 | 84 | w, h, x_ctr, y_ctr = _whctrs(anchor) 85 | size = w * h 86 | size_ratios = size / ratios 87 | ws = np.round(np.sqrt(size_ratios)) 88 | hs = np.round(ws * ratios) 89 | anchors = _mkanchors(ws, hs, x_ctr, y_ctr) 90 | return anchors 91 | 92 | 93 | def _scale_enum(anchor, scales): 94 | """ 95 | Enumerate a set of anchors for each scale wrt an anchor. 96 | """ 97 | 98 | w, h, x_ctr, y_ctr = _whctrs(anchor) 99 | ws = w * scales 100 | hs = h * scales 101 | anchors = _mkanchors(ws, hs, x_ctr, y_ctr) 102 | return anchors 103 | 104 | 105 | def generate_shifted_anchors(anchors, height, width, feat_stride): 106 | # Enumerate all shifted anchors: 107 | # 108 | # add A anchors (1, A, 4) to 109 | # cell K shifts (K, 1, 4) to get 110 | # shift anchors (K, A, 4) 111 | # reshape to (K*A, 4) shifted anchors 112 | shift_x = np.arange(0, width) * feat_stride 113 | shift_y = np.arange(0, height) * feat_stride 114 | shift_x, shift_y = np.meshgrid(shift_x, shift_y) 115 | shifts = np.vstack((shift_x.ravel(), shift_y.ravel(), 116 | shift_x.ravel(), shift_y.ravel())).transpose() 117 | A = anchors.shape[0] 118 | K = shifts.shape[0] 119 | anchors = anchors.reshape((1, A, 4)) + \ 120 | shifts.reshape((1, K, 4)).transpose((1, 0, 2)) 121 | anchors = anchors.reshape((K * A, 4)) 122 | return anchors 123 | 124 | 125 | if __name__ == '__main__': 126 | import time 127 | t = time.time() 128 | a = generate_anchors() 129 | print time.time() - t 130 | print a 131 | from IPython import embed 132 | embed() 133 | -------------------------------------------------------------------------------- /lib/nms/nms_kernel.cu: -------------------------------------------------------------------------------- 1 | // -------------------------------------------------------- 2 | // Multitask Network Cascade 3 | // Modified from MATLAB Faster R-CNN (https://github.com/shaoqingren/faster_rcnn) 4 | // Copyright (c) 2016, Haozhi Qi 5 | // Licensed under The MIT License [see LICENSE for details] 6 | // -------------------------------------------------------- 7 | 8 | #include "gpu_nms.hpp" 9 | #include 10 | #include 11 | 12 | #define CUDA_CHECK(condition) \ 13 | /* Code block avoids redefinition of cudaError_t error */ \ 14 | do { \ 15 | cudaError_t error = condition; \ 16 | if 
(error != cudaSuccess) { \ 17 | std::cout << cudaGetErrorString(error) << std::endl; \ 18 | } \ 19 | } while (0) 20 | 21 | #define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0)) 22 | int const threadsPerBlock = sizeof(unsigned long long) * 8; 23 | 24 | __device__ inline float devIoU(float const * const a, float const * const b) { 25 | float left = max(a[0], b[0]), right = min(a[2], b[2]); 26 | float top = max(a[1], b[1]), bottom = min(a[3], b[3]); 27 | float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); 28 | float interS = width * height; 29 | float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); 30 | float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1); 31 | return interS / (Sa + Sb - interS); 32 | } 33 | 34 | __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh, 35 | const float *dev_boxes, unsigned long long *dev_mask) { 36 | const int row_start = blockIdx.y; 37 | const int col_start = blockIdx.x; 38 | 39 | // if (row_start > col_start) return; 40 | 41 | const int row_size = 42 | min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); 43 | const int col_size = 44 | min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); 45 | 46 | __shared__ float block_boxes[threadsPerBlock * 5]; 47 | if (threadIdx.x < col_size) { 48 | block_boxes[threadIdx.x * 5 + 0] = 49 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0]; 50 | block_boxes[threadIdx.x * 5 + 1] = 51 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1]; 52 | block_boxes[threadIdx.x * 5 + 2] = 53 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2]; 54 | block_boxes[threadIdx.x * 5 + 3] = 55 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3]; 56 | block_boxes[threadIdx.x * 5 + 4] = 57 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4]; 58 | } 59 | __syncthreads(); 60 | 61 | if (threadIdx.x < row_size) { 62 | const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; 63 | const float *cur_box = dev_boxes + cur_box_idx * 5; 64 | int i = 0; 65 | unsigned long long t = 0; 66 | int start = 0; 67 | if (row_start == col_start) { 68 | start = threadIdx.x + 1; 69 | } 70 | for (i = start; i < col_size; i++) { 71 | if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { 72 | t |= 1ULL << i; 73 | } 74 | } 75 | const int col_blocks = DIVUP(n_boxes, threadsPerBlock); 76 | dev_mask[cur_box_idx * col_blocks + col_start] = t; 77 | } 78 | } 79 | 80 | void _set_device(int device_id) { 81 | int current_device; 82 | CUDA_CHECK(cudaGetDevice(¤t_device)); 83 | if (current_device == device_id) { 84 | return; 85 | } 86 | // The call to cudaSetDevice must come before any calls to Get, which 87 | // may perform initialization using the GPU. 
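  // cudaSetDevice binds the calling host thread to the requested GPU,
  // so it must run before the allocations and copies below.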
88 | CUDA_CHECK(cudaSetDevice(device_id)); 89 | } 90 | 91 | void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, 92 | int boxes_dim, float nms_overlap_thresh, int device_id) { 93 | _set_device(device_id); 94 | 95 | float* boxes_dev = NULL; 96 | unsigned long long* mask_dev = NULL; 97 | 98 | const int col_blocks = DIVUP(boxes_num, threadsPerBlock); 99 | 100 | CUDA_CHECK(cudaMalloc(&boxes_dev, 101 | boxes_num * boxes_dim * sizeof(float))); 102 | CUDA_CHECK(cudaMemcpy(boxes_dev, 103 | boxes_host, 104 | boxes_num * boxes_dim * sizeof(float), 105 | cudaMemcpyHostToDevice)); 106 | 107 | CUDA_CHECK(cudaMalloc(&mask_dev, 108 | boxes_num * col_blocks * sizeof(unsigned long long))); 109 | 110 | dim3 blocks(DIVUP(boxes_num, threadsPerBlock), 111 | DIVUP(boxes_num, threadsPerBlock)); 112 | dim3 threads(threadsPerBlock); 113 | nms_kernel<<>>(boxes_num, 114 | nms_overlap_thresh, 115 | boxes_dev, 116 | mask_dev); 117 | 118 | std::vector mask_host(boxes_num * col_blocks); 119 | CUDA_CHECK(cudaMemcpy(&mask_host[0], 120 | mask_dev, 121 | sizeof(unsigned long long) * boxes_num * col_blocks, 122 | cudaMemcpyDeviceToHost)); 123 | 124 | std::vector remv(col_blocks); 125 | memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); 126 | 127 | int num_to_keep = 0; 128 | for (int i = 0; i < boxes_num; i++) { 129 | int nblock = i / threadsPerBlock; 130 | int inblock = i % threadsPerBlock; 131 | 132 | if (!(remv[nblock] & (1ULL << inblock))) { 133 | keep_out[num_to_keep++] = i; 134 | unsigned long long *p = &mask_host[0] + i * col_blocks; 135 | for (int j = nblock; j < col_blocks; j++) { 136 | remv[j] |= p[j]; 137 | } 138 | } 139 | } 140 | *num_out = num_to_keep; 141 | 142 | CUDA_CHECK(cudaFree(boxes_dev)); 143 | CUDA_CHECK(cudaFree(mask_dev)); 144 | } 145 | -------------------------------------------------------------------------------- /lib/utils/vis_seg.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Multitask Network Cascade 3 | # Written by Haozhi Qi 4 | # Copyright (c) 2016, Haozhi Qi 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | import cPickle 10 | import os 11 | import cv2 12 | from PIL import Image 13 | from mnc_config import cfg 14 | 15 | 16 | def vis_seg(img_names, cls_names, output_dir, gt_dir): 17 | """ 18 | This function plot segmentation results to specific directory 19 | Args: 20 | img_names: list 21 | """ 22 | assert os.path.exists(output_dir) 23 | # a list of dictionary 24 | inst_dir = os.path.join(output_dir, 'SegInst') 25 | cls_dir = os.path.join(output_dir, 'SegCls') 26 | res_dir = os.path.join(output_dir, 'SegRes') 27 | if not os.path.isdir(inst_dir): 28 | os.mkdir(inst_dir) 29 | if not os.path.isdir(cls_dir): 30 | os.mkdir(cls_dir) 31 | if not os.path.isdir(res_dir): 32 | os.mkdir(res_dir) 33 | 34 | res_list = _prepare_dict(img_names, cls_names, output_dir) 35 | for img_ind, image_name in enumerate(img_names): 36 | target_inst_file = os.path.join(inst_dir, image_name + '.jpg') 37 | target_cls_file = os.path.join(cls_dir, image_name + '.jpg') 38 | print image_name 39 | gt_image = gt_dir + '/img/' + image_name + '.jpg' 40 | img_data = cv2.imread(gt_image) 41 | img_width = img_data.shape[1] 42 | img_height = img_data.shape[0] 43 | pred_dict = res_list[img_ind] 44 | inst_img, cls_img = _convert_pred_to_image(img_width, img_height, pred_dict) 45 | color_map 
= _get_voc_color_map() 46 | inst_out_img = np.zeros((img_height, img_width, 3)) 47 | cls_out_img = np.zeros((img_height, img_width, 3)) 48 | for i in xrange(img_height): 49 | for j in xrange(img_width): 50 | inst_out_img[i][j] = color_map[inst_img[i][j]][::-1] 51 | cls_out_img[i][j] = color_map[cls_img[i][j]][::-1] 52 | 53 | cv2.imwrite(target_inst_file, inst_out_img) 54 | cv2.imwrite(target_cls_file, cls_out_img) 55 | background = Image.open(gt_image) 56 | mask = Image.open(target_cls_file) 57 | background = background.convert('RGBA') 58 | mask = mask.convert('RGBA') 59 | superimpose_image = Image.blend(background, mask, 0.8) 60 | name = os.path.join(res_dir, image_name + '.png') 61 | superimpose_image.save(name, 'PNG') 62 | 63 | 64 | def _prepare_dict(img_names, cls_names, cache_dir, vis_thresh=0.5): 65 | """ 66 | Returns: 67 | list, each list is a dictionary contains mask list, box list 68 | """ 69 | res_list = [] 70 | det_file = os.path.join(cache_dir, 'res_boxes.pkl') 71 | with open(det_file, 'rb') as f: 72 | det_pkl = cPickle.load(f) 73 | seg_file = os.path.join(cache_dir, 'res_masks.pkl') 74 | with open(seg_file, 'rb') as f: 75 | seg_pkl = cPickle.load(f) 76 | 77 | for img_ind, image_name in enumerate(img_names): 78 | box_for_img = [] 79 | mask_for_img = [] 80 | cls_for_img = [] 81 | for cls_ind, cls_name in enumerate(cls_names): 82 | if cls_name == '__background__' or len(det_pkl[cls_ind][img_ind]) == 0: 83 | continue 84 | det_for_img = det_pkl[cls_ind][img_ind] 85 | seg_for_img = seg_pkl[cls_ind][img_ind] 86 | keep_inds = np.where(det_for_img[:, -1] >= vis_thresh)[0] 87 | for keep in keep_inds: 88 | box_for_img.append(det_for_img[keep]) 89 | # TODO: remove this annoying 0 90 | mask_for_img.append(seg_for_img[keep][0]) 91 | cls_for_img.append(cls_ind) 92 | res_dict = {'image_name': image_name, 93 | 'cls_name': cls_for_img, 94 | 'boxes': box_for_img, 95 | 'masks': mask_for_img} 96 | res_list.append(res_dict) 97 | 98 | return res_list 99 | 100 | 101 | def _convert_pred_to_image(img_width, img_height, pred_dict): 102 | num_inst = len(pred_dict['boxes']) 103 | inst_img = np.zeros((img_height, img_width)) 104 | cls_img = np.zeros((img_height, img_width)) 105 | for i in xrange(num_inst): 106 | box = np.round(pred_dict['boxes'][i]).astype(int) 107 | mask = pred_dict['masks'][i] 108 | cls_num = pred_dict['cls_name'][i] 109 | # clip box into image space 110 | box[0] = min(max(box[0], 0), img_width - 1) 111 | box[1] = min(max(box[1], 0), img_height - 1) 112 | box[2] = min(max(box[2], 0), img_width - 1) 113 | box[3] = min(max(box[3], 0), img_height - 1) 114 | mask = cv2.resize(mask.astype(np.float32), (box[2]-box[0]+1, box[3]-box[1]+1)) 115 | mask = mask >= cfg.BINARIZE_THRESH 116 | 117 | part1 = (i+1) * mask.astype(np.float32) 118 | part2 = np.multiply(np.logical_not(mask), inst_img[box[1]:box[3]+1, box[0]:box[2]+1]) 119 | part3 = np.multiply(np.logical_not(mask), cls_img[box[1]:box[3]+1, box[0]:box[2]+1]) 120 | inst_img[box[1]:box[3]+1, box[0]:box[2]+1] = part1 + part2 121 | cls_img[box[1]:box[3]+1, box[0]:box[2]+1] = cls_num * mask.astype(np.float32) + part3 122 | # Plot bounding boxes simultaneously 123 | cls_img[box[1]:box[3]+1, box[0]-1:box[0]+1] = 150 124 | cls_img[box[1]:box[3]+1, box[2]-1:box[2]+1] = 150 125 | cls_img[box[1]-1:box[1]+1, box[0]:box[2]+1] = 150 126 | cls_img[box[3]-1:box[3]+1, box[0]:box[2]+1] = 150 127 | 128 | inst_img = inst_img.astype(int) 129 | cls_img = cls_img.astype(int) 130 | return inst_img, cls_img 131 | 132 | 133 | def _get_voc_color_map(n=256): 134 | 
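    """
    Build the standard PASCAL VOC color palette.
    Each label id has its bits spread across the R/G/B channels, three bits
    per round from the most significant position downwards, so consecutive
    ids map to clearly distinct colors.
    Returns:
        (n, 3) ndarray of RGB values in [0, 255]
    """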
    color_map = np.zeros((n, 3))
    for i in xrange(n):
        r = b = g = 0
        cid = i
        for j in xrange(0, 8):
            r = np.bitwise_or(r, np.left_shift(np.unpackbits(np.array([cid], dtype=np.uint8))[-1], 7-j))
            g = np.bitwise_or(g, np.left_shift(np.unpackbits(np.array([cid], dtype=np.uint8))[-2], 7-j))
            b = np.bitwise_or(b, np.left_shift(np.unpackbits(np.array([cid], dtype=np.uint8))[-3], 7-j))
            cid = np.right_shift(cid, 3)

        color_map[i][0] = r
        color_map[i][1] = g
        color_map[i][2] = b
    return color_map
--------------------------------------------------------------------------------
/lib/pylayer/mnc_data_layer.py:
--------------------------------------------------------------------------------
# --------------------------------------------------------
# Multitask Network Cascade
# Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn)
# Copyright (c) 2016, Haozhi Qi
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------

import cv2
import numpy as np
import yaml

import caffe
from mnc_config import cfg
from utils.blob import prep_im_for_blob, im_list_to_blob


class MNCDataLayer(caffe.Layer):
    """
    Provide image, image w/h/scale, gt boxes/masks and mask info to upper layers
    """

    def setup(self, bottom, top):
        layer_params = yaml.load(self.param_str_)
        self._num_classes = layer_params['num_classes']
        self._name_to_top_map = {}
        # data blob: holds a batch of N images, each with 3 channels
        top[0].reshape(cfg.TRAIN.IMS_PER_BATCH, 3, max(cfg.TRAIN.SCALES), cfg.TRAIN.MAX_SIZE)
        self._name_to_top_map['data'] = 0
        assert cfg.TRAIN.HAS_RPN, 'Use RPN for this project'
        # Just pseudo setup
        top[1].reshape(1, 3)
        self._name_to_top_map['im_info'] = 1
        top[2].reshape(1, 4)
        self._name_to_top_map['gt_boxes'] = 2
        if cfg.MNC_MODE:
            top[3].reshape(1, 21, 21)
            self._name_to_top_map['gt_masks'] = 3
            top[4].reshape(1, 3)
            self._name_to_top_map['mask_info'] = 4
        assert len(top) == len(self._name_to_top_map)

    def reshape(self, bottom, top):
        """Reshaping happens during the call to forward."""
        pass

    def forward(self, bottom, top):
        """Get blobs and copy them into this layer's top blob vector."""
        blobs = self._get_next_minibatch()
        for blob_name, blob in blobs.iteritems():
            top_ind = self._name_to_top_map[blob_name]
            # Reshape net's input blobs
            top[top_ind].reshape(*blob.shape)
            # Copy data into net's input blobs
            top[top_ind].data[...]
= blob.astype(np.float32, copy=False) 55 | 56 | def backward(self, top, propagate_down, bottom): 57 | """This layer does not propagate gradients.""" 58 | pass 59 | 60 | def set_roidb(self, roidb): 61 | """Set the roidb to be used by this layer during training.""" 62 | self._roidb = roidb 63 | self._shuffle_roidb_inds() 64 | 65 | def set_maskdb(self, maskdb): 66 | self._maskdb = maskdb 67 | self._shuffle_roidb_inds() 68 | 69 | def _shuffle_roidb_inds(self): 70 | """Randomly permute the training roidb.""" 71 | if cfg.TRAIN.ASPECT_GROUPING: 72 | widths = np.array([r['width'] for r in self._roidb]) 73 | heights = np.array([r['height'] for r in self._roidb]) 74 | horz = (widths >= heights) 75 | vert = np.logical_not(horz) 76 | horz_inds = np.where(horz)[0] 77 | vert_inds = np.where(vert)[0] 78 | inds = np.hstack(( 79 | np.random.permutation(horz_inds), 80 | np.random.permutation(vert_inds))) 81 | inds = np.reshape(inds, (-1, 2)) 82 | row_perm = np.random.permutation(np.arange(inds.shape[0])) 83 | inds = np.reshape(inds[row_perm, :], (-1,)) 84 | self._perm = inds 85 | else: 86 | self._perm = np.random.permutation(np.arange(len(self._roidb))) 87 | self._cur = 0 88 | 89 | def _get_image_blob(self, roidb, scale_inds): 90 | """Builds an input blob from the images in the roidb at the specified 91 | scales. 92 | """ 93 | num_images = 1 # len(roidb) 94 | processed_ims = [] 95 | im_scales = [] 96 | for i in xrange(num_images): 97 | im = cv2.imread(roidb['image']) 98 | if roidb['flipped']: 99 | im = im[:, ::-1, :] 100 | target_size = cfg.TRAIN.SCALES[scale_inds[i]] 101 | im, im_scale = prep_im_for_blob(im, cfg.PIXEL_MEANS, target_size, 102 | cfg.TRAIN.MAX_SIZE) 103 | im_scales.append(im_scale) 104 | processed_ims.append(im) 105 | # Create a blob to hold the input images 106 | blob = im_list_to_blob(processed_ims) 107 | return blob, im_scales 108 | 109 | def _get_next_minibatch(self): 110 | """ 111 | Return the blobs to be used for the next minibatch. 
112 | """ 113 | assert cfg.TRAIN.IMS_PER_BATCH == 1, 'Only single batch forwarding is supported' 114 | 115 | if self._cur + cfg.TRAIN.IMS_PER_BATCH >= len(self._roidb): 116 | self._shuffle_roidb_inds() 117 | db_inds = self._perm[self._cur] 118 | self._cur += 1 119 | roidb = self._roidb[db_inds] 120 | 121 | random_scale_inds = np.random.randint(0, high=len(cfg.TRAIN.SCALES), size=1) 122 | im_blob, im_scales = self._get_image_blob(roidb, random_scale_inds) 123 | 124 | gt_label = np.where(roidb['gt_classes'] != 0)[0] 125 | gt_boxes = np.hstack((roidb['boxes'][gt_label, :] * im_scales[0], 126 | roidb['gt_classes'][gt_label, np.newaxis])).astype(np.float32) 127 | blobs = { 128 | 'data': im_blob, 129 | 'gt_boxes': gt_boxes, 130 | 'im_info': np.array([[im_blob.shape[2], im_blob.shape[3], im_scales[0]]], dtype=np.float32) 131 | } 132 | 133 | if cfg.MNC_MODE: 134 | maskdb = self._maskdb[db_inds] 135 | mask_list = maskdb['gt_masks'] 136 | mask_max_x = maskdb['mask_max'][0] 137 | mask_max_y = maskdb['mask_max'][1] 138 | gt_masks = np.zeros((len(mask_list), mask_max_y, mask_max_x)) 139 | mask_info = np.zeros((len(mask_list), 2)) 140 | for j in xrange(len(mask_list)): 141 | mask = mask_list[j] 142 | mask_x = mask.shape[1] 143 | mask_y = mask.shape[0] 144 | gt_masks[j, 0:mask_y, 0:mask_x] = mask 145 | mask_info[j, 0] = mask_y 146 | mask_info[j, 1] = mask_x 147 | blobs['gt_masks'] = gt_masks 148 | blobs['mask_info'] = mask_info 149 | 150 | return blobs 151 | -------------------------------------------------------------------------------- /lib/caffeWrapper/SolverWrapper.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Multitask Network Cascade 3 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn) 4 | # Copyright (c) 2016, Haozhi Qi 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # -------------------------------------------------------- 7 | 8 | 9 | import os 10 | import numpy as np 11 | 12 | from utils.timer import Timer 13 | from mnc_config import cfg 14 | from db.roidb import add_bbox_regression_targets, compute_mcg_mean_std 15 | import caffe 16 | from caffe.proto import caffe_pb2 17 | import google.protobuf as pb2 18 | 19 | 20 | class SolverWrapper(object): 21 | """ A simple wrapper around Caffe's solver. 22 | This wrapper gives us control over he snapshotting process, which we 23 | use to unnormalize the learned bounding-box regression weights. 24 | """ 25 | def __init__(self, solver_prototxt, roidb, maskdb, output_dir, imdb, 26 | pretrained_model=None): 27 | self.output_dir = output_dir 28 | if (cfg.TRAIN.HAS_RPN and cfg.TRAIN.BBOX_REG and 29 | cfg.TRAIN.BBOX_NORMALIZE_TARGETS): 30 | # RPN can only use precomputed normalization because there are no 31 | # fixed statistics to compute a priori 32 | assert cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED 33 | 34 | if cfg.TRAIN.BBOX_REG: 35 | if not cfg.CFM_MODE: 36 | print 'Computing bounding-box regression targets...' 
37 | self.bbox_means, self.bbox_stds = add_bbox_regression_targets(roidb) 38 | print 'done' 39 | else: 40 | # Pre-defined mcg bbox_mean and bbox_std 41 | # We store them on disk to avoid disk level IO 42 | # multiple times (mcg boxes are stored on disk) 43 | mean_cache = './data/cache/mcg_bbox_mean.npy' 44 | std_cache = './data/cache/mcg_bbox_std.npy' 45 | roidb_dir = imdb._roidb_path 46 | if os.path.exists(mean_cache) and os.path.exists(std_cache): 47 | self.bbox_means = np.load(mean_cache) 48 | self.bbox_stds = np.load(std_cache) 49 | else: 50 | self.bbox_means, self.bbox_stds = compute_mcg_mean_std(roidb_dir, imdb.num_classes) 51 | 52 | self.solver = caffe.SGDSolver(solver_prototxt) 53 | if pretrained_model is not None: 54 | print 'Loading pretrained model weights from {:s}'.format(pretrained_model) 55 | self.solver.net.copy_from(pretrained_model) 56 | 57 | self.solver_param = caffe_pb2.SolverParameter() 58 | with open(solver_prototxt, 'rt') as f: 59 | pb2.text_format.Merge(f.read(), self.solver_param) 60 | if not cfg.CFM_MODE: 61 | self.solver.net.layers[0].set_roidb(roidb) 62 | if cfg.MNC_MODE: 63 | self.solver.net.layers[0].set_maskdb(maskdb) 64 | else: 65 | self.solver.net.layers[0].set_image_info(imdb, self.bbox_means, self.bbox_stds) 66 | 67 | def snapshot(self): 68 | """ Take a snapshot of the network after unnormalizing the learned 69 | bounding-box regression weights. This enables easy use at test-time. 70 | """ 71 | net = self.solver.net 72 | # I'm wondering whether I still need to keep it if only faster-RCNN is needed 73 | scale_bbox_params = (cfg.TRAIN.BBOX_REG and 74 | cfg.TRAIN.BBOX_NORMALIZE_TARGETS and 75 | 'bbox_pred' in net.params) 76 | if scale_bbox_params: 77 | # save original values 78 | orig_0 = net.params['bbox_pred'][0].data.copy() 79 | orig_1 = net.params['bbox_pred'][1].data.copy() 80 | if cfg.CFM_MODE: 81 | cfm_mean = self.bbox_means.ravel() 82 | cfm_std = self.bbox_stds.ravel() 83 | net.params['bbox_pred'][0].data[...] = \ 84 | (net.params['bbox_pred'][0].data * cfm_std[:, np.newaxis]) 85 | net.params['bbox_pred'][1].data[...] = \ 86 | (net.params['bbox_pred'][1].data * cfm_std + cfm_mean) 87 | else: 88 | # scale and shift with transform reg unnormalization; then save snapshot 89 | net.params['bbox_pred'][0].data[...] = \ 90 | (net.params['bbox_pred'][0].data * 91 | self.bbox_stds[:, np.newaxis]) 92 | net.params['bbox_pred'][1].data[...] = \ 93 | (net.params['bbox_pred'][1].data * 94 | self.bbox_stds + self.bbox_means) 95 | 96 | if not os.path.exists(self.output_dir): 97 | os.makedirs(self.output_dir) 98 | 99 | # If we specify an infix in the configuration 100 | infix = ('_' + cfg.TRAIN.SNAPSHOT_INFIX 101 | if cfg.TRAIN.SNAPSHOT_INFIX != '' else '') 102 | filename = (self.solver_param.snapshot_prefix + infix + 103 | '_iter_{:d}'.format(self.solver.iter) + '.caffemodel') 104 | 105 | # For snapshot caffemodel, since MNC use shared parameters 106 | # but caffe save parameters according to layer name instead of 107 | # parameter names, its size will exceed 2GB, which make program crash 108 | # Luckily, we may save it to HDF5 to avoid this issues 109 | if not cfg.MNC_MODE: 110 | filename = os.path.join(self.output_dir, filename) 111 | net.save(str(filename)) 112 | else: 113 | filename = os.path.join(self.output_dir, filename + '.h5') 114 | net.save_to_hdf5(str(filename), False) 115 | print 'Wrote snapshot to: {:s}'.format(filename) 116 | 117 | if scale_bbox_params: 118 | # restore net to original state 119 | net.params['bbox_pred'][0].data[...] 
= orig_0 120 | net.params['bbox_pred'][1].data[...] = orig_1 121 | 122 | def train_model(self, max_iters): 123 | last_snapshot_iter = -1 124 | timer = Timer() 125 | while self.solver.iter < max_iters: 126 | timer.tic() 127 | self.solver.step(1) 128 | timer.toc() 129 | if self.solver.iter % (10 * self.solver_param.display) == 0: 130 | print 'speed: {:.3f}s / iter'.format(timer.average_time) 131 | 132 | if self.solver.iter % cfg.TRAIN.SNAPSHOT_ITERS == 0: 133 | last_snapshot_iter = self.solver.iter 134 | self.snapshot() 135 | 136 | if last_snapshot_iter != self.solver.iter: 137 | self.snapshot() 138 | 139 | -------------------------------------------------------------------------------- /lib/setup.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Multitask Network Cascade 3 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn) 4 | # Copyright (c) 2016, Haozhi Qi 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # -------------------------------------------------------- 7 | 8 | import os 9 | from os.path import join as pjoin 10 | from setuptools import setup 11 | from distutils.extension import Extension 12 | from Cython.Distutils import build_ext 13 | import numpy as np 14 | 15 | 16 | def find_in_path(name, path): 17 | "Find a file in a search path" 18 | # Adapted fom 19 | # http://code.activestate.com/recipes/52224-find-a-file-given-a-search-path/ 20 | for dir in path.split(os.pathsep): 21 | binpath = pjoin(dir, name) 22 | if os.path.exists(binpath): 23 | return os.path.abspath(binpath) 24 | return None 25 | 26 | 27 | def locate_cuda(): 28 | """Locate the CUDA environment on the system 29 | 30 | Returns a dict with keys 'home', 'nvcc', 'include', and 'lib64' 31 | and values giving the absolute path to each directory. 32 | 33 | Starts by looking for the CUDAHOME env variable. If not found, everything 34 | is based on finding 'nvcc' in the PATH. 35 | """ 36 | 37 | # first check if the CUDAHOME env variable is in use 38 | if 'CUDAHOME' in os.environ: 39 | home = os.environ['CUDAHOME'] 40 | nvcc = pjoin(home, 'bin', 'nvcc') 41 | else: 42 | # otherwise, search the PATH for NVCC 43 | default_path = pjoin(os.sep, 'usr', 'local', 'cuda', 'bin') 44 | nvcc = find_in_path('nvcc', os.environ['PATH'] + os.pathsep + default_path) 45 | if nvcc is None: 46 | raise EnvironmentError('The nvcc binary could not be ' 47 | 'located in your $PATH. Either add it to your path, or set $CUDAHOME') 48 | home = os.path.dirname(os.path.dirname(nvcc)) 49 | 50 | cudaconfig = {'home':home, 'nvcc':nvcc, 51 | 'include': pjoin(home, 'include'), 52 | 'lib64': pjoin(home, 'lib64')} 53 | for k, v in cudaconfig.iteritems(): 54 | if not os.path.exists(v): 55 | raise EnvironmentError('The CUDA %s path could not be located in %s' % (k, v)) 56 | 57 | return cudaconfig 58 | CUDA = locate_cuda() 59 | 60 | 61 | # Obtain the numpy include directory. This logic works across numpy versions. 62 | try: 63 | numpy_include = np.get_include() 64 | except AttributeError: 65 | numpy_include = np.get_numpy_include() 66 | 67 | 68 | def customize_compiler_for_nvcc(self): 69 | """inject deep into distutils to customize how the dispatch 70 | to gcc/nvcc works. 71 | 72 | If you subclass UnixCCompiler, it's not trivial to get your subclass 73 | injected in, and still have the right customizations (i.e. 74 | distutils.sysconfig.customize_compiler) run on it. So instead of going 75 | the OO route, I have this. 
Note, it's kindof like a wierd functional 76 | subclassing going on.""" 77 | 78 | # tell the compiler it can processes .cu 79 | self.src_extensions.append('.cu') 80 | 81 | # save references to the default compiler_so and _comple methods 82 | default_compiler_so = self.compiler_so 83 | super = self._compile 84 | 85 | # now redefine the _compile method. This gets executed for each 86 | # object but distutils doesn't have the ability to change compilers 87 | # based on source extension: we add it. 88 | def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts): 89 | if os.path.splitext(src)[1] == '.cu': 90 | # use the cuda for .cu files 91 | self.set_executable('compiler_so', CUDA['nvcc']) 92 | # use only a subset of the extra_postargs, which are 1-1 translated 93 | # from the extra_compile_args in the Extension class 94 | postargs = extra_postargs['nvcc'] 95 | else: 96 | postargs = extra_postargs['gcc'] 97 | 98 | super(obj, src, ext, cc_args, postargs, pp_opts) 99 | # reset the default compiler_so, which we might have changed for cuda 100 | self.compiler_so = default_compiler_so 101 | 102 | # inject our redefined _compile method into the class 103 | self._compile = _compile 104 | 105 | 106 | # run the customize_compiler 107 | class custom_build_ext(build_ext): 108 | def build_extensions(self): 109 | customize_compiler_for_nvcc(self.compiler) 110 | build_ext.build_extensions(self) 111 | 112 | 113 | ext_modules = [ 114 | Extension( 115 | "utils.cython_bbox", 116 | ["utils/bbox.pyx"], 117 | extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]}, 118 | include_dirs = [numpy_include] 119 | ), 120 | Extension( 121 | "nms.cpu_nms", 122 | ["nms/cpu_nms.pyx"], 123 | extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]}, 124 | include_dirs = [numpy_include] 125 | ), 126 | Extension('nms.gpu_nms', 127 | ['nms/nms_kernel.cu', 'nms/gpu_nms.pyx'], 128 | library_dirs=[CUDA['lib64']], 129 | libraries=['cudart'], 130 | language='c++', 131 | runtime_library_dirs=[CUDA['lib64']], 132 | # this syntax is specific to this build system 133 | # we're only going to use certain compiler args with nvcc and not with 134 | # gcc the implementation of this trick is in customize_compiler() below 135 | extra_compile_args={'gcc': ["-Wno-unused-function"], 136 | 'nvcc': ['-arch=sm_35', 137 | '--ptxas-options=-v', 138 | '-c', 139 | '--compiler-options', 140 | "'-fPIC'"]}, 141 | include_dirs = [numpy_include, CUDA['include']] 142 | ), 143 | Extension('nms.mv', 144 | ['nms/mv_kernel.cu', 'nms/gpu_mv.pyx'], 145 | library_dirs=[CUDA['lib64']], 146 | libraries=['cudart'], 147 | language='c++', 148 | runtime_library_dirs=[CUDA['lib64']], 149 | # this syntax is specific to this build system 150 | # we're only going to use certain compiler args with nvcc and not with 151 | # gcc the implementation of this trick is in customize_compiler() below 152 | extra_compile_args={'gcc': ["-Wno-unused-function"], 153 | 'nvcc': ['-arch=sm_35', 154 | '--ptxas-options=-v', 155 | '-c', 156 | '--compiler-options', 157 | "'-fPIC'"]}, 158 | include_dirs = [numpy_include, CUDA['include']] 159 | ), 160 | ] 161 | 162 | setup( 163 | name='MNC', 164 | ext_modules=ext_modules, 165 | # inject our custom trigger 166 | cmdclass={'build_ext': custom_build_ext}, 167 | ) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Instance-aware Semantic Segmentation via Multi-task Network Cascades 2 | 3 | By Jifeng Dai, 
Kaiming He, Jian Sun 4 | 5 | This python version is re-implemented by [Haozhi Qi](https://github.com/Oh233) when he was an intern at Microsoft Research. 6 | 7 | ### Introduction 8 | 9 | MNC is an instance-aware semantic segmentation system based on deep convolutional networks, which won the first place in COCO segmentation challenge 2015, and test at a fraction of a second per image. We decompose the task of instance-aware semantic segmentation into related sub-tasks, which are solved by multi-task network cascades (MNC) with shared features. The entire MNC network is trained end-to-end with error gradients across cascaded stages. 10 | 11 | 12 | ![example](data/readme_img/example.png) 13 | 14 | 15 | MNC was initially described in a [CVPR 2016 oral paper](http://arxiv.org/abs/1512.04412). 16 | 17 | This repository contains a python implementation of MNC, which is ~10% slower than the original matlab implementation. 18 | 19 | This repository includes a bilinear RoI warping layer, which enables gradient back-propagation with respect to RoI coordinates. 20 | 21 | ### Misc. 22 | 23 | This code has been tested on Linux (Ubuntu 14.04), using K40/Titan X GPUs. 24 | 25 | The code is built based on [py-faster-rcnn](https://github.com/rbgirshick/py-faster-rcnn). 26 | 27 | MNC is released under the MIT License (refer to the LICENSE file for details). 28 | 29 | 30 | ### Citing MNC 31 | 32 | If you find MNC useful in your research, please consider citing: 33 | 34 | @inproceedings{dai2016instance, 35 | title={Instance-aware Semantic Segmentation via Multi-task Network Cascades}, 36 | author={Dai, Jifeng and He, Kaiming and Sun, Jian}, 37 | booktitle={CVPR}, 38 | year={2016} 39 | } 40 | 41 | ### Main Results 42 | | | training data | test data | mAP^r@0.5 | mAP^r@0.7 | time (K40) | time (Titian X)| 43 | |-------------------|:-------------------:|:---------------------:|:-----------:|:-----------:|:-------------:|:-------------:| 44 | |MNC, VGG-16 | VOC 12 train | VOC 12 val | 65.0% | 46.3% | 0.42sec/img | 0.33sec/img| 45 | 46 | ### Installation guide 47 | 48 | 1. Clone the MNC repository: 49 | ```Shell 50 | # Make sure to clone with --recursive 51 | git clone --recursive https://github.com/daijifeng001/MNC.git 52 | ``` 53 | 54 | 2. Install Python packages: `numpy`, `scipy`, `cython`, `python-opencv`, `easydict`, `yaml`. 55 | 56 | 3. Build the Cython modules and the gpu_nms, gpu_mask_voting modules by: 57 | ```Shell 58 | cd $MNC_ROOT/lib 59 | make 60 | ``` 61 | 62 | 4. Install `Caffe` and `pycaffe` dependencies (see: [Caffe installation instructions](http://caffe.berkeleyvision.org/installation.html) for official installation guide) 63 | 64 | **Note:** Caffe *must* be built with support for Python layers! 65 | 66 | ```make 67 | # In your Makefile.config, make sure to have this line uncommented 68 | WITH_PYTHON_LAYER := 1 69 | # CUDNN is recommended in building to reduce memory footprint 70 | USE_CUDNN := 1 71 | ``` 72 | 73 | 5. Build Caffe and pycaffe: 74 | ```Shell 75 | cd $MNC_ROOT/caffe-mnc 76 | # If you have all of the requirements installed 77 | # and your Makefile.config in place, then simply do: 78 | make -j8 && make pycaffe 79 | ``` 80 | 81 | ### Demo 82 | 83 | First, download the trained MNC model. 84 | ```Shell 85 | ./data/scripts/fetch_mnc_model.sh 86 | ``` 87 | 88 | Run the demo: 89 | ```Shell 90 | cd $MNC_ROOT 91 | ./tools/demo.py 92 | ``` 93 | Result demo images will be stored to ```data/demo/```. 
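Under the hood, the demo prepares each image and runs the network roughly as in the sketch below. This is a minimal outline rather than a drop-in script: it assumes `lib/` is on your `PYTHONPATH` (the demo arranges this via `_init_paths`) and uses the demo's default model paths; the mask-voting and visualization steps that follow the forward pass are omitted.

```Python
import cv2
import numpy as np
import caffe
from mnc_config import cfg
from utils.blob import prep_im_for_blob, im_list_to_blob

caffe.set_mode_gpu()
net = caffe.Net('./models/VGG16/mnc_5stage/test.prototxt',
                './data/mnc_model/mnc_model.caffemodel.h5', caffe.TEST)
im = cv2.imread('./data/demo/2008_000533.jpg')
# Subtract the pixel means and resize so the shorter side is TEST.SCALES[0]
# (600), capping the longer side at MAX_SIZE (1000)
im, im_scale = prep_im_for_blob(im, cfg.PIXEL_MEANS,
                                cfg.TEST.SCALES[0], cfg.TRAIN.MAX_SIZE)
blob = im_list_to_blob([im])
im_info = np.array([[blob.shape[2], blob.shape[3], im_scale]], dtype=np.float32)
net.blobs['data'].reshape(*blob.shape)
net.blobs['im_info'].reshape(*im_info.shape)
out = net.forward(data=blob.astype(np.float32, copy=False), im_info=im_info)
# `out` holds the detection and mask blobs, which the demo then merges
# with gpu_mask_voting and renders with the VOC color map
```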
94 | 95 | The demo performs instance-aware semantic segmentation with a trained MNC model (using VGG-16 net). The model is pre-trained on ImageNet, and finetuned on VOC 2012 train set with additional annotations from [SBD](http://home.bharathh.info/pubs/codes/SBD/download.html). The mAP^r of the model is 65.0% on VOC 2012 validation set. The test speed per image is ~0.33sec on Titian X and ~0.42sec on K40. 96 | 97 | ### Training 98 | 99 | This repository contains code to **end-to-end** train MNC for instance-aware semantic segmentation, where gradients across cascaded stages are counted in training. 100 | 101 | #### Preparation: 102 | 103 | 0. Run `./data/scripts/fetch_imagenet_models.sh` to download the ImageNet pre-trained VGG-16 net. 104 | 0. Download the VOC 2007 dataset to ./data/VOCdevkit2007 105 | 0. Run `./data/scripts/fetch_sbd_data.sh` to download the VOC 2012 dataset together with the additional segmentation annotations in [SBD](https://9bc0b5eb4c18f1fc9a28517a91305702c68a10ae.googledrive.com/host/0ByUkob0WA1-NQi1sNlg4WkJQbTg/codes/SBD/download.html) to ./data/VOCdevkitSDS. 106 | 107 | #### 1. End-to-end training of MNC for instance-aware semantic segmentation 108 | 109 | To end-to-end train a 5-stage MNC model (on VOC 2012 train), use `experiments/scripts/mnc_5stage.sh`. Final mAP^r@0.5 should be ~65.0% (mAP^r@0.7 should be ~46.3%), on VOC 2012 validation. 110 | 111 | ```Shell 112 | cd $MNC_ROOT 113 | ./experiments/scripts/mnc_5stage.sh [GPU_ID] VGG16 [--set ...] 114 | # GPU_ID is the GPU you want to train on 115 | # --set ... allows you to specify fast_rcnn.config options, e.g. 116 | # --set EXP_DIR seed_rng 1701 RNG_SEED 1701 117 | ``` 118 | 119 | #### 2. Training of CFM for instance-aware semantic segmentation 120 | 121 | The code also includes an entry to train a [convolutional feature masking](https://arxiv.org/abs/1412.1283) (CFM) model for instance aware semantic segmentation. 122 | 123 | @inproceedings{dai2015convolutional, 124 | title={Convolutional Feature Masking for Joint Object and Stuff Segmentation}, 125 | author={Dai, Jifeng and He, Kaiming and Sun, Jian}, 126 | booktitle={CVPR}, 127 | year={2015} 128 | } 129 | 130 | ##### 2.1. Download pre-computed MCG proposals 131 | 132 | Download and process the pre-computed MCG proposals. 133 | 134 | ```Shell 135 | cd $MNC_ROOT 136 | ./data/scripts/fetch_mcg_data.sh 137 | python ./tools/prepare_mcg_maskdb.py --para_job 24 --db train --output data/cache/voc_2012_train_mcg_maskdb/ 138 | python ./tools/prepare_mcg_maskdb.py --para_job 24 --db val --output data/cache/voc_2012_val_mcg_maskdb/ 139 | ``` 140 | Resulting proposals would be at folder ```data/MCG/```. 141 | 142 | ##### 2.2. Train the model 143 | 144 | Run `experiments/scripts/cfm.sh` to train on VOC 2012 train set. Final mAP^r@0.5 should be ~60.5% (mAP^r@0.7 should be ~42.6%), on VOC 2012 validation. 145 | 146 | ```Shell 147 | cd $MNC_ROOT 148 | ./experiments/scripts/cfm.sh [GPU_ID] VGG16 [--set ...] 149 | # GPU_ID is the GPU you want to train on 150 | # --set ... allows you to specify fast_rcnn.config options, e.g. 151 | # --set EXP_DIR seed_rng 1701 RNG_SEED 1701 152 | ``` 153 | 154 | #### 3. End-to-end training of Faster-RCNN for object detection 155 | 156 | Faster-RCNN can be viewed as a 2-stage cascades composed of region proposal network (RPN) and object detection network. Run script `experiments/scripts/faster_rcnn_end2end.sh` to train a Faster-RCNN model on VOC 2007 trainval. Final mAP^b should be ~69.1% on VOC 2007 test. 
157 | 158 | ```Shell 159 | cd $MNC_ROOT 160 | ./experiments/scripts/faster_rcnn_end2end.sh [GPU_ID] VGG16 [--set ...] 161 | # GPU_ID is the GPU you want to train on 162 | # --set ... allows you to specify fast_rcnn.config options, e.g. 163 | # --set EXP_DIR seed_rng1701 RNG_SEED 1701 164 | ``` 165 | -------------------------------------------------------------------------------- /lib/db/roidb.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Multitask Network Cascade 3 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn) 4 | # Copyright (c) 2016, Haozhi Qi 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # -------------------------------------------------------- 7 | 8 | import PIL 9 | import numpy as np 10 | import os 11 | import cPickle 12 | import scipy 13 | 14 | from db.imdb import get_imdb 15 | from mnc_config import cfg 16 | from transform.bbox_transform import compute_targets 17 | 18 | 19 | def prepare_roidb(imdb): 20 | """ Enrich the imdb's roidb by adding some derived quantities that 21 | are useful for training. This function pre-computes the maximum 22 | overlap, taken over ground-truth boxes, between each ROI and 23 | each ground-truth box. The class with maximum overlap is also 24 | recorded. 25 | """ 26 | sizes = [PIL.Image.open(imdb.image_path_at(i)).size 27 | for i in xrange(imdb.num_images)] 28 | roidb = imdb.roidb 29 | for i in xrange(len(imdb.image_index)): 30 | roidb[i]['image'] = imdb.image_path_at(i) 31 | roidb[i]['width'] = sizes[i][0] 32 | roidb[i]['height'] = sizes[i][1] 33 | # need gt_overlaps as a dense array for argmax 34 | gt_overlaps = roidb[i]['gt_overlaps'].toarray() 35 | # max overlap with gt over classes (columns) 36 | max_overlaps = gt_overlaps.max(axis=1) 37 | # gt class that had the max overlap 38 | max_classes = gt_overlaps.argmax(axis=1) 39 | roidb[i]['max_classes'] = max_classes 40 | roidb[i]['max_overlaps'] = max_overlaps 41 | # sanity checks 42 | # max overlap of 0 => class should be zero (background) 43 | zero_inds = np.where(max_overlaps == 0)[0] 44 | assert all(max_classes[zero_inds] == 0) 45 | # max overlap > 0 => class should not be zero (must be a fg class) 46 | nonzero_inds = np.where(max_overlaps > 0)[0] 47 | assert all(max_classes[nonzero_inds] != 0) 48 | 49 | 50 | def add_bbox_regression_targets(roidb): 51 | """Add information needed to train bounding-box regressors.""" 52 | assert len(roidb) > 0 53 | assert 'max_classes' in roidb[0], 'Did you call prepare_roidb first?' 
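    # Attach a (class, tx, ty, tw, th) target row per ROI, then normalize the
    # regression values, either with the precomputed means/stds from the config
    # or with per-class statistics gathered over the whole roidb.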
54 | 55 | num_images = len(roidb) 56 | # Infer number of classes from the number of columns in gt_overlaps 57 | num_classes = roidb[0]['gt_overlaps'].shape[1] 58 | for im_i in xrange(num_images): 59 | rois = roidb[im_i]['boxes'] 60 | max_overlaps = roidb[im_i]['max_overlaps'] 61 | max_classes = roidb[im_i]['max_classes'] 62 | roidb[im_i]['bbox_targets'] = \ 63 | compute_targets(rois, max_overlaps, max_classes) 64 | 65 | if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED: 66 | # Use fixed / precomputed "means" and "stds" instead of empirical values 67 | means = np.tile( 68 | np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS), (num_classes, 1)) 69 | stds = np.tile( 70 | np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS), (num_classes, 1)) 71 | else: 72 | # Compute values needed for means and stds 73 | # var(x) = E(x^2) - E(x)^2 74 | class_counts = np.zeros((num_classes, 1)) + cfg.EPS 75 | sums = np.zeros((num_classes, 4)) 76 | squared_sums = np.zeros((num_classes, 4)) 77 | for im_i in xrange(num_images): 78 | targets = roidb[im_i]['bbox_targets'] 79 | for cls in xrange(1, num_classes): 80 | cls_inds = np.where(targets[:, 0] == cls)[0] 81 | if cls_inds.size > 0: 82 | class_counts[cls] += cls_inds.size 83 | sums[cls, :] += targets[cls_inds, 1:].sum(axis=0) 84 | squared_sums[cls, :] += \ 85 | (targets[cls_inds, 1:] ** 2).sum(axis=0) 86 | 87 | means = sums / class_counts 88 | stds = np.sqrt(squared_sums / class_counts - means ** 2) 89 | 90 | print 'bbox target means:' 91 | print means 92 | print means[1:, :].mean(axis=0) # ignore bg class 93 | print 'bbox target stdevs:' 94 | print stds 95 | print stds[1:, :].mean(axis=0) # ignore bg class 96 | 97 | # Normalize targets 98 | if cfg.TRAIN.BBOX_NORMALIZE_TARGETS: 99 | print "Normalizing targets" 100 | for im_i in xrange(num_images): 101 | targets = roidb[im_i]['bbox_targets'] 102 | for cls in xrange(1, num_classes): 103 | cls_inds = np.where(targets[:, 0] == cls)[0] 104 | roidb[im_i]['bbox_targets'][cls_inds, 1:] -= means[cls, :] 105 | roidb[im_i]['bbox_targets'][cls_inds, 1:] /= stds[cls, :] 106 | else: 107 | print "NOT normalizing targets" 108 | 109 | # These values will be needed for making predictions 110 | # (the predicts will need to be unnormalized and uncentered) 111 | return means.ravel(), stds.ravel() 112 | 113 | 114 | def get_roidb(imdb_name): 115 | imdb = get_imdb(imdb_name) 116 | print 'Loaded dataset `{:s}` for training'.format(imdb.name) 117 | # Here set handler function. (e.g. gt_roidb in faster RCNN) 118 | imdb.set_roi_handler(cfg.TRAIN.PROPOSAL_METHOD) 119 | print 'Set proposal method: {:s}'.format(cfg.TRAIN.PROPOSAL_METHOD) 120 | if cfg.TRAIN.USE_FLIPPED: 121 | print 'Appending horizontally-flipped training examples...' 122 | imdb.append_flipped_rois() 123 | print 'done' 124 | print 'Preparing training data...' 
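    # prepare_roidb() caches image paths/sizes and each ROI's max-overlap
    # class label, which the data layer and target computation rely on.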
125 | prepare_roidb(imdb) 126 | print 'done' 127 | return imdb.roidb 128 | 129 | 130 | def attach_roidb(imdb_names): 131 | """ 132 | only implement single roidb now 133 | """ 134 | roidbs = [get_roidb(s) for s in imdb_names.split('+')] 135 | roidb = roidbs[0] 136 | if len(roidbs) > 1: 137 | raise NotImplementedError 138 | else: 139 | imdb = get_imdb(imdb_names) 140 | return imdb, roidb 141 | 142 | 143 | def compute_mcg_mean_std(roidb_dir, num_classes): 144 | """ 145 | Compute bbox mean and stds for mcg proposals 146 | Since mcg proposal are stored on disk, so we precomputed it here once 147 | and save them to disk to avoid disk I/O next time 148 | Args: 149 | roidb_dir: directory contain all the mcg proposals 150 | """ 151 | file_list = sorted(os.listdir(roidb_dir)) 152 | target_list = [] 153 | cnt = 0 154 | for file_name in file_list: 155 | roidb_cache = os.path.join(roidb_dir, file_name) 156 | roidb = scipy.io.loadmat(roidb_cache) 157 | target_list.append(compute_targets(roidb['boxes'], roidb['det_overlap'], roidb['output_label'].ravel())) 158 | cnt += 1 159 | 160 | class_counts = np.zeros((num_classes, 1)) + cfg.EPS 161 | sums = np.zeros((num_classes, 4)) 162 | squared_sums = np.zeros((num_classes, 4)) 163 | for im_i in xrange(len(target_list)): 164 | targets = target_list[im_i] 165 | for cls in xrange(1, num_classes): 166 | cls_inds = np.where(targets[:, 0] == cls)[0] 167 | if cls_inds.size > 0: 168 | class_counts[cls] += cls_inds.size 169 | sums[cls, :] += targets[cls_inds, 1:].sum(axis=0) 170 | squared_sums[cls, :] += \ 171 | (targets[cls_inds, 1:] ** 2).sum(axis=0) 172 | 173 | means = sums / class_counts 174 | stds = np.sqrt(squared_sums / class_counts - means ** 2) 175 | np.save('data/cache/mcg_bbox_mean.npy', means) 176 | np.save('data/cache/mcg_bbox_std.npy', stds) 177 | return means, stds 178 | -------------------------------------------------------------------------------- /lib/mnc_config.py: -------------------------------------------------------------------------------- 1 | 2 | """MNC config system 3 | """ 4 | import numpy as np 5 | import os.path 6 | from easydict import EasyDict as edict 7 | 8 | __C = edict() 9 | cfg = __C 10 | 11 | # MNC/CFM mode 12 | __C.MNC_MODE = True 13 | __C.CFM_MODE = False 14 | 15 | __C.EXP_DIR = 'default' 16 | __C.USE_GPU_NMS = True 17 | __C.GPU_ID = 0 18 | __C.RNG_SEED = 3 19 | __C.EPS = 1e-14 20 | __C.PIXEL_MEANS = np.array([[[102.9801, 115.9465, 122.7717]]]) 21 | # Root directory of project 22 | __C.ROOT_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) 23 | # Data directory 24 | __C.DATA_DIR = os.path.abspath(os.path.join(__C.ROOT_DIR, 'data')) 25 | # Related to mask resizing and binarize predicted masks 26 | __C.BINARIZE_THRESH = 0.4 27 | # Mask estimation (if any) size (may be different from CFM input size) 28 | __C.MASK_SIZE = 21 29 | 30 | # Training options 31 | __C.TRAIN = edict() 32 | 33 | # ------- General setting ---- 34 | __C.TRAIN.IMS_PER_BATCH = 1 35 | # Batch size for training Region CNN (not RPN) 36 | __C.TRAIN.BATCH_SIZE = 64 37 | # Make minibatches from images that have similar aspect ratios (i.e. both 38 | # tall and thin or both short and wide) in order to avoid wasting computation 39 | # on zero-padding. 
__C.TRAIN.ASPECT_GROUPING = True
# Use flipped images for augmentation
__C.TRAIN.USE_FLIPPED = True
# Resize shortest side to 600
__C.TRAIN.SCALES = (600,)
__C.TRAIN.MAX_SIZE = 1000
__C.TRAIN.SNAPSHOT_ITERS = 5000
__C.TRAIN.SNAPSHOT_INFIX = ''
# Sample FG
__C.TRAIN.FG_FRACTION = [0.3]
__C.TRAIN.FG_THRESH_HI = [1.0]
__C.TRAIN.FG_THRESH_LO = [0.5]
# Sample BG according to remaining samples
__C.TRAIN.BG_FRACTION = [0.85, 0.15]
__C.TRAIN.BG_THRESH_HI = [0.5, 0.1]
__C.TRAIN.BG_THRESH_LO = [0.1, 0.0]

# ------- Proposal -------
__C.TRAIN.PROPOSAL_METHOD = 'gt'

# ------- BBOX Regression ---------
# Train bounding-box regressors
__C.TRAIN.BBOX_REG = True
__C.TRAIN.BBOX_NORMALIZE_TARGETS = True
__C.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED = False
__C.TRAIN.BBOX_THRESH = 0.5
__C.TRAIN.BBOX_NORMALIZE_MEANS = (0.0, 0.0, 0.0, 0.0)
__C.TRAIN.BBOX_NORMALIZE_STDS = (0.1, 0.1, 0.2, 0.2)
# Weight of smooth L1 loss
__C.TRAIN.BBOX_INSIDE_WEIGHTS = (1.0, 1.0, 1.0, 1.0)

# -------- RPN ----------
# Use RPN to detect objects
__C.TRAIN.HAS_RPN = True
# IOU >= thresh: positive example
__C.TRAIN.RPN_POSITIVE_OVERLAP = 0.7
# IOU < thresh: negative example
__C.TRAIN.RPN_NEGATIVE_OVERLAP = 0.3
# If an anchor satisfies both the positive and negative conditions, set it to negative
__C.TRAIN.RPN_CLOBBER_POSITIVES = False
# Max number of foreground examples
# Note this is class-agnostic anchors' FG_FRACTION
__C.TRAIN.RPN_FG_FRACTION = 0.5
# Total number of examples
__C.TRAIN.RPN_BATCHSIZE = 256
# NMS threshold used on RPN proposals
__C.TRAIN.RPN_NMS_THRESH = 0.7
# Number of top scoring boxes to keep before applying NMS to RPN proposals
__C.TRAIN.RPN_PRE_NMS_TOP_N = 12000
# Number of top scoring boxes to keep after applying NMS to RPN proposals
__C.TRAIN.RPN_POST_NMS_TOP_N = 2000
# Proposal height and width both need to be greater than RPN_MIN_SIZE (at orig image scale)
__C.TRAIN.RPN_MIN_SIZE = 16
# Deprecated (outside weights)
__C.TRAIN.RPN_BBOX_INSIDE_WEIGHTS = (1.0, 1.0, 1.0, 1.0)
# Give the positive RPN examples weight of p * 1 / {num positives}
# and give negatives a weight of (1 - p)
# Set to -1.0 to use uniform example weighting
__C.TRAIN.RPN_POSITIVE_WEIGHT = -1.0
# Mix anchors used for RPN and later layer
__C.TRAIN.MIX_INDEX = True

# -------- CFM ----------
__C.TRAIN.CFM_INPUT_MASK_SIZE = 14
__C.TRAIN.FG_DET_THRESH = 0.5
__C.TRAIN.FG_SEG_THRESH = 0.5
__C.TRAIN.FRACTION_SAMPLE = [0.3, 0.5, 0.2]
__C.TRAIN.THRESH_LO_SAMPLE = [0.5, 0.1, 0.0]
__C.TRAIN.THRESH_HI_SAMPLE = [1.0, 0.5, 0.1]

# Test options

__C.TEST = edict()
# Scales to use during testing (can list multiple scales)
# Each scale is the pixel size of an image's shortest side
__C.TEST.SCALES = (600,)

# Max pixel size of the longest side of a scaled input image
__C.TEST.MAX_SIZE = 1000

# Overlap threshold used for non-maximum suppression (suppress boxes with
# IoU >= this threshold)
__C.TEST.NMS = 0.3
# Set this to True in the yml file to use RPN-generated proposals
__C.TEST.HAS_RPN = True
# NMS threshold used on RPN proposals
__C.TEST.RPN_NMS_THRESH = 0.7
# Number of top scoring boxes to keep before applying NMS to RPN proposals
128 | __C.TEST.RPN_PRE_NMS_TOP_N = 6000
129 | # Number of top scoring boxes to keep after applying NMS to RPN proposals
130 | __C.TEST.RPN_POST_NMS_TOP_N = 300
131 | # Proposal height and width both need to be greater than RPN_MIN_SIZE (at orig image scale)
132 | __C.TEST.RPN_MIN_SIZE = 16
133 | __C.TEST.BBOX_REG = True
134 | 
135 | # Aggregate nearby masks inside a box; the box IoU threshold
136 | __C.TEST.MASK_MERGE_IOU_THRESH = 0.5
137 | __C.TEST.MASK_MERGE_NMS_THRESH = 0.3
138 | __C.TEST.CFM_INPUT_MASK_SIZE = 14
139 | 
140 | # Used for multi-scale testing: since a naive implementation
141 | # will waste a lot of computation on zero-padding, we group each
142 | # $GROUP_SCALE scales to feed to the GPU, and the max number of rois
143 | # for each group is specified in MAX_ROIS_GPU
144 | __C.TEST.MAX_ROIS_GPU = [2000]
145 | __C.TEST.GROUP_SCALE = 1
146 | 
147 | # 0 means use all the MCG proposals
148 | __C.TEST.USE_TOP_K_MCG = 0
149 | 
150 | # whether to merge masks during testing (and whether to do it on the GPU)
151 | __C.TEST.USE_MASK_MERGE = True
152 | __C.TEST.USE_GPU_MASK_MERGE = True
153 | 
154 | 
155 | def get_output_dir(imdb, net):
156 |     """ Return the directory where experimental artifacts are placed.
157 |     A canonical path is built using the name from an imdb and a network
158 |     (if not None).
159 |     """
160 |     path = os.path.abspath(os.path.join(__C.ROOT_DIR, 'output', __C.EXP_DIR, imdb.name))
161 |     if net is None:
162 |         return path
163 |     else:
164 |         return os.path.join(path, net.name)
165 | 
166 | 
167 | def _merge_two_config(user_cfg, default_cfg):
168 |     """ Merge the user's config into the default config dictionary, clobbering
169 |     the options in default_cfg whenever they are also specified in user_cfg.
170 |     Ensures the two values under the same key have the same type, and
171 |     merges recursively when it encounters a nested dictionary.
172 |     """
173 |     if type(user_cfg) is not edict:
174 |         return
175 |     for key, val in user_cfg.iteritems():
176 |         # user_cfg must be a subset of default_cfg
177 |         if not default_cfg.has_key(key):
178 |             raise KeyError('{} is not a valid config key'.format(key))
179 | 
180 |         if type(default_cfg[key]) is not type(val):
181 |             if isinstance(default_cfg[key], np.ndarray):
182 |                 val = np.array(val, dtype=default_cfg[key].dtype)
183 |             else:
184 |                 raise ValueError(
185 |                     'Type mismatch ({} vs. {}) '
186 |                     'for config key: {}'.format(type(default_cfg[key]),
187 |                                                 type(val), key))
188 |         # Recursively merge config
189 |         if type(val) is edict:
190 |             try:
191 |                 _merge_two_config(user_cfg[key], default_cfg[key])
192 |             except:
193 |                 print 'Error under config key: {}'.format(key)
194 |                 raise
195 |         else:
196 |             default_cfg[key] = val
197 | 
198 | 
199 | def cfg_from_file(file_name):
200 |     """ Load a config file and merge it into the default options.
201 |     """
202 |     import yaml
203 |     with open(file_name, 'r') as f:
204 |         yaml_cfg = edict(yaml.load(f))
205 | 
206 |     _merge_two_config(yaml_cfg, __C)
207 | 
--------------------------------------------------------------------------------
/lib/transform/bbox_transform.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Multitask Network Cascade
3 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn)
4 | # Copyright (c) 2016, Haozhi Qi
5 | # Licensed under The MIT License [see LICENSE for details]
6 | # --------------------------------------------------------
7 | 
8 | import numpy as np
9 | from utils.cython_bbox import bbox_overlaps
10 | from mnc_config import cfg
11 | 
12 | 
13 | def compute_targets(rois, overlaps, labels):
14 |     """
15 |     Compute bounding-box regression targets for an image.
16 |     """
17 |     # Indices of ground-truth ROIs
18 |     gt_inds = np.where(overlaps == 1)[0]
19 |     # Indices of examples for which we try to make predictions
20 |     ex_inds = np.where(overlaps >= cfg.TRAIN.BBOX_THRESH)[0]
21 | 
22 |     # Get IoU overlap between each ex ROI and gt ROI
23 |     ex_gt_overlaps = bbox_overlaps(
24 |         np.ascontiguousarray(rois[ex_inds, :], dtype=np.float),
25 |         np.ascontiguousarray(rois[gt_inds, :], dtype=np.float))
26 | 
27 |     # Find which gt ROI each ex ROI has max overlap with:
28 |     # this will be the ex ROI's gt target
29 |     gt_assignment = ex_gt_overlaps.argmax(axis=1)
30 |     gt_rois = rois[gt_inds[gt_assignment], :]
31 |     ex_rois = rois[ex_inds, :]
32 | 
33 |     targets = np.zeros((rois.shape[0], 5), dtype=np.float32)
34 |     targets[ex_inds, 0] = labels[ex_inds]
35 |     targets[ex_inds, 1:] = bbox_transform(ex_rois, gt_rois)
36 |     return targets
37 | 
38 | 
39 | def bbox_transform(ex_rois, gt_rois):
40 |     """
41 |     Compute bbox regression targets of external rois
42 |     with respect to gt rois
43 |     """
44 |     ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + 1.0
45 |     ex_heights = ex_rois[:, 3] - ex_rois[:, 1] + 1.0
46 |     ex_ctr_x = ex_rois[:, 0] + 0.5 * ex_widths
47 |     ex_ctr_y = ex_rois[:, 1] + 0.5 * ex_heights
48 | 
49 |     gt_widths = gt_rois[:, 2] - gt_rois[:, 0] + 1.0
50 |     gt_heights = gt_rois[:, 3] - gt_rois[:, 1] + 1.0
51 |     gt_ctr_x = gt_rois[:, 0] + 0.5 * gt_widths
52 |     gt_ctr_y = gt_rois[:, 1] + 0.5 * gt_heights
53 | 
54 |     targets_dx = (gt_ctr_x - ex_ctr_x) / ex_widths
55 |     targets_dy = (gt_ctr_y - ex_ctr_y) / ex_heights
56 |     targets_dw = np.log(gt_widths / ex_widths)
57 |     targets_dh = np.log(gt_heights / ex_heights)
58 | 
59 |     targets = np.vstack(
60 |         (targets_dx, targets_dy, targets_dw, targets_dh)).transpose()
61 |     return targets
62 | 
63 | 
64 | def bbox_transform_inv(boxes, deltas):
65 |     """
66 |     Invert the bounding-box transform:
67 |     apply deltas to anchors to get transformed proposals
68 |     """
69 |     if boxes.shape[0] == 0:
70 |         return np.zeros((0, deltas.shape[1]), dtype=deltas.dtype)
71 | 
72 |     boxes = boxes.astype(deltas.dtype, copy=False)
73 | 
74 |     widths = boxes[:, 2] - boxes[:, 0] + 1.0
75 |     heights = boxes[:, 3] - boxes[:, 1] + 1.0
76 |     ctr_x = boxes[:, 0] + 0.5 * widths
77 |     ctr_y = boxes[:, 1] + 0.5 * heights
78 | 
79 |     dx = deltas[:, 0::4]
80 |     dy = deltas[:, 1::4]
81 |     dw = deltas[:, 2::4]
82 |     dh = deltas[:, 3::4]
83 | 
84 |     pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis]
85 |     pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis]
86 |     pred_w = np.exp(dw) * widths[:, np.newaxis]
87 |     pred_h = np.exp(dh) * heights[:, np.newaxis]
88 | 
89 |     pred_boxes = np.zeros(deltas.shape, dtype=deltas.dtype)
90 |     # x1
91 |     pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w
92 |     # y1
93 |     pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h
94 |     # x2
95 |     pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w
96 |     # y2
97 |     pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h
98 | 
99 |     return pred_boxes
100 | 
101 | 
102 | def clip_boxes(boxes, im_shape):
103 |     """
104 |     Clip boxes to lie inside the image boundaries
105 |     """
106 |     x1 = boxes[:, 0::4]
107 |     y1 = boxes[:, 1::4]
108 |     x2 = boxes[:, 2::4]
109 |     y2 = boxes[:, 3::4]
110 |     keep = np.where((x1 >= 0) & (x2 <= im_shape[1] - 1) & (y1 >= 0) & (y2 <= im_shape[0] - 1))[0]
111 |     clipped_boxes = np.zeros(boxes.shape, dtype=boxes.dtype)
112 |     # x1 >= 0
113 |     clipped_boxes[:, 0::4] = np.maximum(np.minimum(boxes[:, 0::4], im_shape[1] - 1), 0)
114 |     # y1 >= 0
115 |     clipped_boxes[:, 1::4] = np.maximum(np.minimum(boxes[:, 1::4], im_shape[0] - 1), 0)
116 |     # x2 < im_shape[1]
117 |     clipped_boxes[:, 2::4] = np.maximum(np.minimum(boxes[:, 2::4], im_shape[1] - 1), 0)
118 |     # y2 < im_shape[0]
119 |     clipped_boxes[:, 3::4] = np.maximum(np.minimum(boxes[:, 3::4], im_shape[0] - 1), 0)
120 |     return clipped_boxes, keep
121 | 
122 | 
123 | def filter_small_boxes(boxes, min_size):
124 |     """
125 |     Remove all boxes with any side smaller than min_size.
126 |     """
127 |     ws = boxes[:, 2] - boxes[:, 0] + 1
128 |     hs = boxes[:, 3] - boxes[:, 1] + 1
129 |     keep = np.where((ws >= min_size) & (hs >= min_size))[0]
130 |     return keep
131 | 
132 | 
133 | def scale_boxes(boxes, alpha):
134 |     """
135 |     Scale boxes from w/h to alpha * w/h while keeping the center unchanged
136 |     Args:
137 |         boxes: a set of boxes specified using x1, y1, x2, y2
138 |         alpha: scaling factor
139 | 
140 |     Returns:
141 |         boxes: boxes after applying scaling
142 |     """
143 |     w = boxes[:, 2] - boxes[:, 0] + 1
144 |     h = boxes[:, 3] - boxes[:, 1] + 1
145 |     ctr_x = boxes[:, 0] + 0.5 * w
146 |     ctr_y = boxes[:, 1] + 0.5 * h
147 |     scaled_w = w * alpha
148 |     scaled_h = h * alpha
149 |     scaled_boxes = np.zeros(boxes.shape, dtype=boxes.dtype)
150 |     scaled_boxes[:, 0] = ctr_x - 0.5 * scaled_w
151 |     scaled_boxes[:, 1] = ctr_y - 0.5 * scaled_h
152 |     scaled_boxes[:, 2] = ctr_x + 0.5 * scaled_w
153 |     scaled_boxes[:, 3] = ctr_y + 0.5 * scaled_h
154 |     return scaled_boxes
155 | 
156 | 
157 | def bbox_compute_targets(ex_rois, gt_rois, normalize):
158 |     """
159 |     Compute bounding-box regression targets for an image
160 |     Parameters:
161 |     -----------
162 |     ex_rois: ROIs from an external source (anchors or proposals)
163 |     gt_rois: ground truth ROIs
164 |     normalize: whether to normalize the targets (RPN targets are not normalized)
165 | 
166 |     Returns:
167 |     -----------
168 |     Relative regression targets for the anchors or proposals
169 |     """
170 |     assert ex_rois.shape == gt_rois.shape
171 | 
172 |     targets = bbox_transform(ex_rois, gt_rois)
173 |     if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED and normalize:
174 |         # Optionally normalize targets by a precomputed mean and std
175 |         targets = ((targets - np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS)) /
176 |                    np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS))
177 | 
178 |     return targets.astype(np.float32, copy=False)
179 | 
180 | 
181 | def get_bbox_regression_label(bbox_target_data, num_class):
182 |     """Bounding-box regression targets (bbox_target_data) are stored in a
183 |     compact form N x (class, tx, ty, tw, th)
184 | 
185 |     This function expands those targets into the 4-of-4*K representation used
186 |     by the network (i.e. only one class has non-zero targets).
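 
    Example (illustrative): with num_class = 21, a RoI whose class label is
    cls = 5 has its four targets written to columns 20:24 (start = 4 * cls),
    and bbox_inside_weights gets cfg.TRAIN.BBOX_INSIDE_WEIGHTS on the same
    slice; all other columns remain zero.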
187 | 188 | Returns: 189 | bbox_target (ndarray): N x 4K blob of regression targets 190 | bbox_inside_weights (ndarray): N x 4K blob of loss weights 191 | """ 192 | assert bbox_target_data.shape[1] == 5 193 | clss = bbox_target_data[:, 0] 194 | bbox_targets = np.zeros((clss.size, 4 * num_class), dtype=np.float32) 195 | bbox_inside_weights = np.zeros(bbox_targets.shape, dtype=np.float32) 196 | inds = np.where(clss > 0)[0] 197 | for ind in inds: 198 | cls = clss[ind] 199 | start = 4 * cls 200 | end = start + 4 201 | bbox_targets[ind, start:end] = bbox_target_data[ind, 1:] 202 | bbox_inside_weights[ind, start:end] = cfg.TRAIN.BBOX_INSIDE_WEIGHTS 203 | return bbox_targets, bbox_inside_weights 204 | -------------------------------------------------------------------------------- /tools/demo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # -------------------------------------------------------- 4 | # Multitask Network Cascade 5 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn) 6 | # Copyright (c) 2016, Haozhi Qi 7 | # Licensed under The MIT License [see LICENSE for details] 8 | # -------------------------------------------------------- 9 | 10 | # Standard module 11 | import os 12 | import argparse 13 | import time 14 | import cv2 15 | import numpy as np 16 | # User-defined module 17 | import _init_paths 18 | import caffe 19 | from mnc_config import cfg 20 | from transform.bbox_transform import clip_boxes 21 | from utils.blob import prep_im_for_blob, im_list_to_blob 22 | from transform.mask_transform import gpu_mask_voting 23 | import matplotlib.pyplot as plt 24 | from utils.vis_seg import _convert_pred_to_image, _get_voc_color_map 25 | from PIL import Image 26 | 27 | # VOC 20 classes 28 | CLASSES = ('aeroplane', 'bicycle', 'bird', 'boat', 29 | 'bottle', 'bus', 'car', 'cat', 'chair', 30 | 'cow', 'diningtable', 'dog', 'horse', 31 | 'motorbike', 'person', 'pottedplant', 32 | 'sheep', 'sofa', 'train', 'tvmonitor') 33 | 34 | 35 | def parse_args(): 36 | """Parse input arguments.""" 37 | parser = argparse.ArgumentParser(description='MNC demo') 38 | parser.add_argument('--gpu', dest='gpu_id', help='GPU device id to use [0]', 39 | default=0, type=int) 40 | parser.add_argument('--cpu', dest='cpu_mode', 41 | help='Use CPU mode (overrides --gpu)', 42 | action='store_true') 43 | parser.add_argument('--def', dest='prototxt', 44 | help='prototxt file defining the network', 45 | default='./models/VGG16/mnc_5stage/test.prototxt', type=str) 46 | parser.add_argument('--net', dest='caffemodel', 47 | help='model to test', 48 | default='./data/mnc_model/mnc_model.caffemodel.h5', type=str) 49 | 50 | args = parser.parse_args() 51 | return args 52 | 53 | 54 | def prepare_mnc_args(im, net): 55 | # Prepare image data blob 56 | blobs = {'data': None} 57 | processed_ims = [] 58 | im, im_scale_factors = \ 59 | prep_im_for_blob(im, cfg.PIXEL_MEANS, cfg.TEST.SCALES[0], cfg.TRAIN.MAX_SIZE) 60 | processed_ims.append(im) 61 | blobs['data'] = im_list_to_blob(processed_ims) 62 | # Prepare image info blob 63 | im_scales = [np.array(im_scale_factors)] 64 | assert len(im_scales) == 1, 'Only single-image batch implemented' 65 | im_blob = blobs['data'] 66 | blobs['im_info'] = np.array( 67 | [[im_blob.shape[2], im_blob.shape[3], im_scales[0]]], 68 | dtype=np.float32) 69 | # Reshape network inputs and do forward 70 | net.blobs['data'].reshape(*blobs['data'].shape) 71 | net.blobs['im_info'].reshape(*blobs['im_info'].shape) 72 | forward_kwargs 
= {
73 |         'data': blobs['data'].astype(np.float32, copy=False),
74 |         'im_info': blobs['im_info'].astype(np.float32, copy=False)
75 |     }
76 |     return forward_kwargs, im_scales
77 | 
78 | 
79 | def im_detect(im, net):
80 |     forward_kwargs, im_scales = prepare_mnc_args(im, net)
81 |     blobs_out = net.forward(**forward_kwargs)
82 |     # outputs we need to collect:
83 |     # 1. output from phase 1
84 |     rois_phase1 = net.blobs['rois'].data.copy()
85 |     masks_phase1 = net.blobs['mask_proposal'].data[...]
86 |     scores_phase1 = net.blobs['seg_cls_prob'].data[...]
87 |     # 2. output from phase 2
88 |     rois_phase2 = net.blobs['rois_ext'].data[...]
89 |     masks_phase2 = net.blobs['mask_proposal_ext'].data[...]
90 |     scores_phase2 = net.blobs['seg_cls_prob_ext'].data[...]
91 |     # Boxes are in resized space, so we un-scale them back
92 |     rois_phase1 = rois_phase1[:, 1:5] / im_scales[0]
93 |     rois_phase2 = rois_phase2[:, 1:5] / im_scales[0]
94 |     rois_phase1, _ = clip_boxes(rois_phase1, im.shape)
95 |     rois_phase2, _ = clip_boxes(rois_phase2, im.shape)
96 |     # Concatenate the two stages to get the final network output
97 |     masks = np.concatenate((masks_phase1, masks_phase2), axis=0)
98 |     boxes = np.concatenate((rois_phase1, rois_phase2), axis=0)
99 |     scores = np.concatenate((scores_phase1, scores_phase2), axis=0)
100 |     return boxes, masks, scores
101 | 
102 | 
103 | def get_vis_dict(result_box, result_mask, img_name, cls_names, vis_thresh=0.5):
104 |     box_for_img = []
105 |     mask_for_img = []
106 |     cls_for_img = []
107 |     for cls_ind, cls_name in enumerate(cls_names):
108 |         det_for_img = result_box[cls_ind]
109 |         seg_for_img = result_mask[cls_ind]
110 |         keep_inds = np.where(det_for_img[:, -1] >= vis_thresh)[0]
111 |         for keep in keep_inds:
112 |             box_for_img.append(det_for_img[keep])
113 |             mask_for_img.append(seg_for_img[keep][0])
114 |             cls_for_img.append(cls_ind + 1)
115 |     res_dict = {'image_name': img_name,
116 |                 'cls_name': cls_for_img,
117 |                 'boxes': box_for_img,
118 |                 'masks': mask_for_img}
119 |     return res_dict
120 | 
121 | if __name__ == '__main__':
122 |     args = parse_args()
123 |     test_prototxt = args.prototxt
124 |     test_model = args.caffemodel
125 | 
126 |     caffe.set_mode_gpu()
127 |     caffe.set_device(args.gpu_id)
128 |     cfg.GPU_ID = args.gpu_id
129 |     net = caffe.Net(test_prototxt, test_model, caffe.TEST)
130 | 
131 |     # Warm up: run two dummy forward passes
132 |     im = 128 * np.ones((300, 500, 3), dtype=np.float32)
133 |     for i in xrange(2):
134 |         _, _, _ = im_detect(im, net)
135 | 
136 |     im_names = ['2008_000533.jpg', '2008_000910.jpg', '2008_001602.jpg',
137 |                 '2008_001717.jpg', '2008_008093.jpg']
138 |     demo_dir = './data/demo'
139 |     for im_name in im_names:
140 |         print '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~'
141 |         print 'Demo for data/demo/{}'.format(im_name)
142 |         gt_image = os.path.join(demo_dir, im_name)
143 |         im = cv2.imread(gt_image)
144 |         start = time.time()
145 |         boxes, masks, seg_scores = im_detect(im, net)
146 |         end = time.time()
147 |         print 'forward time %f' % (end-start)
148 |         result_mask, result_box = gpu_mask_voting(masks, boxes, seg_scores, len(CLASSES) + 1,
149 |                                                   100, im.shape[1], im.shape[0])
150 |         pred_dict = get_vis_dict(result_box, result_mask, 'data/demo/' + im_name, CLASSES)
151 | 
152 |         img_width = im.shape[1]
153 |         img_height = im.shape[0]
154 | 
155 |         inst_img, cls_img = _convert_pred_to_image(img_width, img_height, pred_dict)
156 |         color_map = _get_voc_color_map()
157 |         target_cls_file = os.path.join(demo_dir, 'cls_' + im_name)
158 |         cls_out_img = np.zeros((img_height, img_width, 3))
159 |         for i in xrange(img_height):
160 |             for j in 
xrange(img_width): 161 | cls_out_img[i][j] = color_map[cls_img[i][j]][::-1] 162 | cv2.imwrite(target_cls_file, cls_out_img) 163 | 164 | background = Image.open(gt_image) 165 | mask = Image.open(target_cls_file) 166 | background = background.convert('RGBA') 167 | mask = mask.convert('RGBA') 168 | superimpose_image = Image.blend(background, mask, 0.8) 169 | superimpose_name = os.path.join(demo_dir, 'final_' + im_name) 170 | superimpose_image.save(superimpose_name, 'JPEG') 171 | im = cv2.imread(superimpose_name) 172 | 173 | im = im[:, :, (2, 1, 0)] 174 | fig, ax = plt.subplots(figsize=(12, 12)) 175 | ax.imshow(im, aspect='equal') 176 | classes = pred_dict['cls_name'] 177 | for i in xrange(len(classes)): 178 | score = pred_dict['boxes'][i][-1] 179 | bbox = pred_dict['boxes'][i][:4] 180 | cls_ind = classes[i] - 1 181 | ax.text(bbox[0], bbox[1] - 8, 182 | '{:s} {:.4f}'.format(CLASSES[cls_ind], score), 183 | bbox=dict(facecolor='blue', alpha=0.5), 184 | fontsize=14, color='white') 185 | plt.axis('off') 186 | plt.tight_layout() 187 | plt.draw() 188 | 189 | fig.savefig(os.path.join(demo_dir, im_name[:-4]+'.png')) 190 | os.remove(superimpose_name) 191 | os.remove(target_cls_file) 192 | -------------------------------------------------------------------------------- /lib/pylayer/proposal_target_layer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Multitask Network Cascade 3 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn) 4 | # Copyright (c) 2016, Haozhi Qi 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # -------------------------------------------------------- 7 | 8 | import caffe 9 | import yaml 10 | import numpy as np 11 | import numpy.random as npr 12 | from mnc_config import cfg 13 | from transform.bbox_transform import \ 14 | bbox_transform, bbox_compute_targets, \ 15 | scale_boxes, get_bbox_regression_label 16 | from transform.anchors import generate_anchors 17 | from transform.mask_transform import intersect_mask 18 | from utils.cython_bbox import bbox_overlaps 19 | 20 | 21 | class ProposalTargetLayer(caffe.Layer): 22 | """ 23 | Assign object detection proposals to ground-truth targets. Produces proposal 24 | classification labels and bounding-box regression targets. 
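 
    A sketch of how this layer is typically wired into a train prototxt
    (blob names and the param_str value here are illustrative, following the
    py-faster-rcnn convention; see the repo's model files for the real ones):
 
        layer {
          name: 'proposal_target'
          type: 'Python'
          bottom: 'rpn_rois'
          bottom: 'gt_boxes'
          bottom: 'im_info'
          top: 'rois'
          top: 'labels'
          top: 'bbox_targets'
          top: 'bbox_inside_weights'
          top: 'bbox_outside_weights'
          python_param {
            module: 'pylayer.proposal_target_layer'
            layer: 'ProposalTargetLayer'
            param_str: "'num_classes': 21"
          }
        }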
25 | """ 26 | 27 | def setup(self, bottom, top): 28 | layer_params = yaml.load(self.param_str_) 29 | self._anchors = generate_anchors() 30 | self._num_anchors = self._anchors.shape[0] 31 | self._num_classes = layer_params['num_classes'] 32 | self._bp_all = layer_params.get('bp_all', True) 33 | self._top_name_map = {} 34 | top[0].reshape(1, 5) 35 | self._top_name_map['rois'] = 0 36 | top[1].reshape(1, 1) 37 | self._top_name_map['labels'] = 1 38 | top[2].reshape(1, self._num_classes * 4) 39 | self._top_name_map['bbox_targets'] = 2 40 | top[3].reshape(1, self._num_classes * 4) 41 | self._top_name_map['bbox_inside_weights'] = 3 42 | top[4].reshape(1, self._num_classes * 4) 43 | self._top_name_map['bbox_outside_weights'] = 4 44 | # Add mask-related information 45 | if cfg.MNC_MODE: 46 | top[5].reshape(1, 1, cfg.MASK_SIZE, cfg.MASK_SIZE) 47 | self._top_name_map['mask_targets'] = 5 48 | top[6].reshape(1, 1, cfg.MASK_SIZE, cfg.MASK_SIZE) 49 | self._top_name_map['mask_weight'] = 6 50 | top[7].reshape(1, 4) 51 | self._top_name_map['gt_masks_info'] = 7 52 | if cfg.TRAIN.MIX_INDEX: 53 | top[8].reshape(1, 4) 54 | self._top_name_map['fg_inds'] = 8 55 | top[9].reshape(1, 4) 56 | self._top_name_map['bg_inds'] = 9 57 | 58 | def reshape(self, bottom, top): 59 | """Reshaping happens during the call to forward.""" 60 | pass 61 | 62 | def forward(self, bottom, top): 63 | # Proposal ROIs (0, x1, y1, x2, y2) coming from RPN 64 | # (i.e., rpn.proposal_layer.ProposalLayer), or any other source 65 | all_rois = bottom[0].data 66 | # GT boxes (x1, y1, x2, y2, label) 67 | gt_boxes = bottom[1].data 68 | im_info = bottom[2].data[0, :] 69 | im_scale = im_info[2] 70 | # get original masks 71 | if cfg.MNC_MODE: 72 | gt_masks = bottom[3].data 73 | mask_info = bottom[4].data 74 | else: 75 | gt_masks = None 76 | mask_info = None 77 | # Include ground-truth boxes in the set of candidate rois 78 | zeros = np.zeros((gt_boxes.shape[0], 1), dtype=gt_boxes.dtype) 79 | all_rois = np.vstack( 80 | (all_rois, np.hstack((zeros, gt_boxes[:, :-1]))) 81 | ) 82 | 83 | # Sanity check: single batch only 84 | assert np.all(all_rois[:, 0] == 0), \ 85 | 'Only single item batches are supported' 86 | 87 | num_images = 1 88 | rois_per_image = cfg.TRAIN.BATCH_SIZE / num_images 89 | # Sample rois with classification labels and bounding box regression targets 90 | 91 | blobs, fg_inds, bg_inds, keep_inds = _sample_rois( 92 | all_rois, gt_boxes, rois_per_image, self._num_classes, gt_masks, im_scale, mask_info) 93 | self._keep_ind = keep_inds if self._bp_all else fg_inds 94 | 95 | for blob_name, blob in blobs.iteritems(): 96 | top[self._top_name_map[blob_name]].reshape(*blob.shape) 97 | top[self._top_name_map[blob_name]].data[...] = blob.astype(np.float32, copy=False) 98 | 99 | if cfg.TRAIN.MIX_INDEX: 100 | all_rois_index = bottom[5].data 101 | fg_inds = fg_inds[fg_inds < all_rois_index.shape[1]].astype(int) 102 | fg_inds = all_rois_index[0, fg_inds] 103 | bg_inds = all_rois_index[0, bg_inds.astype(int)] 104 | top[self._top_name_map['fg_inds']].reshape(*fg_inds.shape) 105 | top[self._top_name_map['fg_inds']].data[...] = fg_inds 106 | top[self._top_name_map['bg_inds']].reshape(*bg_inds.shape) 107 | top[self._top_name_map['bg_inds']].data[...] = bg_inds 108 | 109 | def backward(self, top, propagate_down, bottom): 110 | if propagate_down[0]: 111 | bottom[0].diff.fill(0.) 
112 | # Eliminate gt_inds from the keep inds 113 | valid_inds = np.where(self._keep_ind < bottom[0].diff.shape[0])[0] 114 | valid_bot_inds = self._keep_ind[valid_inds].astype(int) 115 | bottom[0].diff[valid_bot_inds, :] = top[0].diff[valid_inds, :] 116 | 117 | 118 | def _sample_rois(all_rois, gt_boxes, rois_per_image, num_classes, gt_masks, im_scale, mask_info): 119 | """ 120 | Generate a random sample of RoIs comprising 121 | foreground and background examples. 122 | """ 123 | # overlaps: (rois x gt_boxes) 124 | overlaps = bbox_overlaps( 125 | np.ascontiguousarray(all_rois[:, 1:5], dtype=np.float), 126 | np.ascontiguousarray(gt_boxes[:, :4], dtype=np.float)) 127 | gt_assignment = overlaps.argmax(axis=1) 128 | max_overlaps = overlaps.max(axis=1) 129 | labels = gt_boxes[gt_assignment, 4] 130 | 131 | # Sample foreground indexes 132 | fg_inds = [] 133 | for i in xrange(len(cfg.TRAIN.FG_FRACTION)): 134 | cur_inds = np.where((max_overlaps >= cfg.TRAIN.FG_THRESH_LO[i]) & 135 | (max_overlaps <= cfg.TRAIN.FG_THRESH_HI[i]))[0] 136 | cur_rois_this_image = min(cur_inds.size, np.round(rois_per_image * 137 | cfg.TRAIN.FG_FRACTION[i])) 138 | if cur_inds.size > 0: 139 | cur_inds = npr.choice(cur_inds, size=cur_rois_this_image, replace=False) 140 | fg_inds = np.hstack((fg_inds, cur_inds)) 141 | fg_inds = np.unique(fg_inds) 142 | fg_rois_per_image = fg_inds.size 143 | # Sample background indexes according to number of foreground 144 | bg_rois_per_this_image = rois_per_image - fg_rois_per_image 145 | bg_inds = [] 146 | for i in xrange(len(cfg.TRAIN.BG_FRACTION)): 147 | cur_inds = np.where((max_overlaps >= cfg.TRAIN.BG_THRESH_LO[i]) & 148 | (max_overlaps <= cfg.TRAIN.BG_THRESH_HI[i]))[0] 149 | cur_rois_this_image = min(cur_inds.size, np.round(bg_rois_per_this_image * 150 | cfg.TRAIN.BG_FRACTION[i])) 151 | if cur_inds.size > 0: 152 | cur_inds = npr.choice(cur_inds, size=cur_rois_this_image, replace=False) 153 | bg_inds = np.hstack((bg_inds, cur_inds)) 154 | bg_inds = np.unique(bg_inds) 155 | 156 | # The indices that we're selecting (both fg and bg) 157 | keep_inds = np.append(fg_inds, bg_inds).astype(int) 158 | # Select sampled values from various arrays: 159 | labels = labels[keep_inds] 160 | # Clamp labels for the background RoIs to 0 161 | labels[fg_rois_per_image:] = 0 162 | rois = all_rois[keep_inds] 163 | 164 | bbox_target_data = bbox_compute_targets( 165 | rois[:, 1:5], gt_boxes[gt_assignment[keep_inds], :4], normalize=True) 166 | bbox_target_data = np.hstack((labels[:, np.newaxis], bbox_target_data))\ 167 | .astype(np.float32, copy=False) 168 | bbox_targets, bbox_inside_weights = get_bbox_regression_label( 169 | bbox_target_data, num_classes) 170 | bbox_outside_weights = np.array(bbox_inside_weights > 0).astype(np.float32) 171 | 172 | blobs = { 173 | 'rois': rois, 174 | 'labels': labels, 175 | 'bbox_targets': bbox_targets, 176 | 'bbox_inside_weights': bbox_inside_weights, 177 | 'bbox_outside_weights': bbox_outside_weights 178 | } 179 | 180 | if cfg.MNC_MODE: 181 | scaled_rois = rois[:, 1:5] / float(im_scale) 182 | 183 | # map to original image space 184 | scaled_gt_boxes = gt_boxes[:, :4] / float(im_scale) 185 | pos_masks = np.zeros((len(keep_inds), 1, cfg.MASK_SIZE, cfg.MASK_SIZE)) 186 | top_mask_info = np.zeros((len(keep_inds), 12)) 187 | top_mask_info[len(fg_inds):, :] = -1 188 | 189 | for i, val in enumerate(fg_inds): 190 | gt_box = scaled_gt_boxes[gt_assignment[val]] 191 | gt_box = np.around(gt_box).astype(int) 192 | ex_box = np.around(scaled_rois[i]).astype(int) 193 | gt_mask = 
gt_masks[gt_assignment[val]] 194 | gt_mask_info = mask_info[gt_assignment[val]] 195 | gt_mask = gt_mask[0:gt_mask_info[0], 0:gt_mask_info[1]] 196 | # calculate mask regression targets 197 | # (intersection of bounding box and gt mask) 198 | ex_mask = intersect_mask(ex_box, gt_box, gt_mask) 199 | 200 | pos_masks[i, ...] = ex_mask 201 | top_mask_info[i, 0] = gt_assignment[val] 202 | top_mask_info[i, 1] = gt_mask_info[0] 203 | top_mask_info[i, 2] = gt_mask_info[1] 204 | top_mask_info[i, 3] = labels[i] 205 | 206 | top_mask_info[i, 4:8] = ex_box 207 | top_mask_info[i, 8:12] = gt_box 208 | 209 | mask_weight = np.zeros((rois.shape[0], 1, cfg.MASK_SIZE, cfg.MASK_SIZE)) 210 | # only assign box-level foreground as positive mask regression 211 | mask_weight[0:len(fg_inds), :, :, :] = 1 212 | blobs['mask_targets'] = pos_masks 213 | blobs['mask_weight'] = mask_weight 214 | blobs['gt_masks_info'] = top_mask_info 215 | 216 | return blobs, fg_inds, bg_inds, keep_inds 217 | -------------------------------------------------------------------------------- /lib/pylayer/anchor_target_layer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Multitask Network Cascade 3 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn) 4 | # Copyright (c) 2016, Haozhi Qi 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # -------------------------------------------------------- 7 | 8 | import yaml 9 | import numpy as np 10 | 11 | import caffe 12 | from transform.anchors import generate_anchors 13 | from utils.cython_bbox import bbox_overlaps 14 | from utils.unmap import unmap 15 | from mnc_config import cfg 16 | from transform.bbox_transform import bbox_transform 17 | 18 | 19 | class AnchorTargetLayer(caffe.Layer): 20 | """ 21 | Assign anchors to ground-truth targets. Produces anchor classification 22 | labels and bounding-box regression targets. 23 | """ 24 | 25 | def setup(self, bottom, top): 26 | self._anchors = generate_anchors() 27 | self._num_anchors = self._anchors.shape[0] 28 | 29 | layer_params = yaml.load(self.param_str_) 30 | self._feat_stride = layer_params['feat_stride'] 31 | 32 | # allow boxes to sit over the edge by a small amount 33 | self._allowed_border = layer_params.get('allowed_border', 0) 34 | 35 | height, width = bottom[0].data.shape[-2:] 36 | 37 | A = self._num_anchors 38 | # labels 39 | top[0].reshape(1, 1, A * height, width) 40 | # bbox_targets 41 | top[1].reshape(1, A * 4, height, width) 42 | # bbox_inside_weights 43 | top[2].reshape(1, A * 4, height, width) 44 | # bbox_outside_weights 45 | top[3].reshape(1, A * 4, height, width) 46 | 47 | def reshape(self, bottom, top): 48 | """Reshaping happens during the call to forward""" 49 | pass 50 | 51 | def forward(self, bottom, top): 52 | # Algorithm: 53 | # 54 | # for each (H, W) location i 55 | # generate 9 anchor boxes centered on cell i 56 | # apply predicted transform deltas at cell i to each of the 9 anchors 57 | # filter out-of-image anchors 58 | # measure GT overlap 59 | # 60 | # Output target referenced value 61 | height, width = bottom[0].data.shape[-2:] 62 | assert bottom[0].data.shape[0] == 1, 'Only single item batches are supported' 63 | gt_boxes = bottom[1].data 64 | im_info = bottom[2].data[0, :] 65 | 66 | # 1. 
Generate proposals from shifted anchors
67 |         # note: unlike the proposal layer, at this stage no deltas are involved
68 |         shift_x = np.arange(0, width) * self._feat_stride
69 |         shift_y = np.arange(0, height) * self._feat_stride
70 |         shift_x, shift_y = np.meshgrid(shift_x, shift_y)
71 |         shifts = np.vstack((shift_x.ravel(), shift_y.ravel(),
72 |                             shift_x.ravel(), shift_y.ravel())).transpose()
73 |         # add A anchors (1, A, 4) to
74 |         # cell K shifts (K, 1, 4) to get
75 |         # shift anchors (K, A, 4)
76 |         # reshape to (K*A, 4) shifted anchors
77 |         A = self._num_anchors
78 |         K = shifts.shape[0]
79 |         all_anchors = (self._anchors.reshape((1, A, 4)) +
80 |                        shifts.reshape((1, K, 4)).transpose((1, 0, 2)))
81 |         all_anchors = all_anchors.reshape((K * A, 4))
82 |         total_anchors = int(K * A)
83 | 
84 |         # only keep anchors inside the image
85 |         inds_inside = np.where(
86 |             (all_anchors[:, 0] >= -self._allowed_border) &
87 |             (all_anchors[:, 1] >= -self._allowed_border) &
88 |             (all_anchors[:, 2] < im_info[1] + self._allowed_border) &  # width
89 |             (all_anchors[:, 3] < im_info[0] + self._allowed_border)  # height
90 |         )[0]
91 | 
92 |         # 2. For each anchor, we assign a positive or negative label
93 |         anchors = all_anchors[inds_inside, :]
94 |         # label: 1 is positive, 0 is negative, -1 is don't care
95 |         labels = np.empty((len(inds_inside), ), dtype=np.float32)
96 |         labels.fill(-1)
97 |         # overlaps between the anchors and the gt boxes
98 |         # overlaps (ex, gt)
99 |         overlaps = bbox_overlaps(
100 |             np.ascontiguousarray(anchors, dtype=np.float),
101 |             np.ascontiguousarray(gt_boxes, dtype=np.float))
102 |         argmax_overlaps = overlaps.argmax(axis=1)
103 |         max_overlaps = overlaps[np.arange(len(inds_inside)), argmax_overlaps]
104 |         gt_argmax_overlaps = overlaps.argmax(axis=0)
105 |         gt_max_overlaps = overlaps[gt_argmax_overlaps,
106 |                                    np.arange(overlaps.shape[1])]
107 |         gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0]
108 | 
109 |         if not cfg.TRAIN.RPN_CLOBBER_POSITIVES:
110 |             # assign bg labels first so that positive labels can clobber them
111 |             labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0
112 | 
113 |         # We assign two types of anchors as positive
114 |         # fg label: for each gt, the anchor with highest overlap
115 |         labels[gt_argmax_overlaps] = 1
116 |         # fg label: above threshold IOU
117 |         labels[max_overlaps >= cfg.TRAIN.RPN_POSITIVE_OVERLAP] = 1
118 | 
119 |         if cfg.TRAIN.RPN_CLOBBER_POSITIVES:
120 |             # assign bg labels last so that negative labels can clobber positives
121 |             labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0
122 | 
123 |         num_fg = int(cfg.TRAIN.RPN_FG_FRACTION * cfg.TRAIN.RPN_BATCHSIZE)
124 |         fg_inds = np.where(labels == 1)[0]
125 |         if len(fg_inds) > num_fg:
126 |             disable_inds = np.random.choice(
127 |                 fg_inds, size=(len(fg_inds) - num_fg), replace=False)
128 |             labels[disable_inds] = -1
129 | 
130 |         # subsample negative labels if we have too many
131 |         num_bg = cfg.TRAIN.RPN_BATCHSIZE - np.sum(labels == 1)
132 |         bg_inds = np.where(labels == 0)[0]
133 |         if len(bg_inds) > num_bg:
134 |             disable_inds = np.random.choice(
135 |                 bg_inds, size=(len(bg_inds) - num_bg), replace=False)
136 |             labels[disable_inds] = -1
137 | 
138 |         if cfg.TRAIN.MIX_INDEX:
139 |             bottom_fg = bottom[3].data
140 |             bottom_bg = bottom[4].data
141 |             unmapped_fg_ind = []
142 |             unmapped_bg_ind = []
143 |             for i in list(bottom_fg):
144 |                 zal = np.where(i == inds_inside)[0]
145 |                 if len(zal) > 0:
146 |                     unmapped_fg_ind.append(zal[0])
147 |             for i in list(bottom_bg):
148 |                 zal = np.where(i == inds_inside)[0]
149 |                 if len(zal) > 0:
150 | 
unmapped_bg_ind.append(zal[0]) 151 | labels[unmapped_bg_ind] = 0 152 | labels[unmapped_fg_ind] = 1 153 | 154 | bbox_targets = np.zeros((len(inds_inside), 4), dtype=np.float32) 155 | bbox_targets = _compute_targets(anchors, gt_boxes[argmax_overlaps, :]) 156 | 157 | bbox_inside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32) 158 | bbox_inside_weights[labels == 1, :] = np.array(cfg.TRAIN.RPN_BBOX_INSIDE_WEIGHTS) 159 | 160 | bbox_outside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32) 161 | if cfg.TRAIN.RPN_POSITIVE_WEIGHT < 0: 162 | # uniform weighting of examples (given non-uniform sampling) 163 | num_examples = np.sum(labels >= 0) 164 | positive_weights = np.ones((1, 4)) * 1.0 / num_examples 165 | negative_weights = np.ones((1, 4)) * 1.0 / num_examples 166 | else: 167 | assert ((cfg.TRAIN.RPN_POSITIVE_WEIGHT > 0) & 168 | (cfg.TRAIN.RPN_POSITIVE_WEIGHT < 1)) 169 | positive_weights = (cfg.TRAIN.RPN_POSITIVE_WEIGHT / 170 | np.sum(labels == 1)) 171 | negative_weights = ((1.0 - cfg.TRAIN.RPN_POSITIVE_WEIGHT) / 172 | np.sum(labels == 0)) 173 | bbox_outside_weights[labels == 1, :] = positive_weights 174 | bbox_outside_weights[labels == 0, :] = negative_weights 175 | 176 | # Currently all the indices are in the clipped index space 177 | # we map up to original set of anchors 178 | # In this process, we need to set clipped boxes as label -1, weights 0 179 | labels = unmap(labels, total_anchors, inds_inside, fill=-1) 180 | bbox_targets = unmap(bbox_targets, total_anchors, inds_inside, fill=0) 181 | bbox_inside_weights = unmap(bbox_inside_weights, total_anchors, inds_inside, fill=0) 182 | bbox_outside_weights = unmap(bbox_outside_weights, total_anchors, inds_inside, fill=0) 183 | # labels 184 | labels = labels.reshape((1, height, width, A)).transpose(0, 3, 1, 2) 185 | labels = labels.reshape((1, 1, A * height, width)) 186 | top[0].reshape(*labels.shape) 187 | top[0].data[...] = labels 188 | 189 | # bbox_targets 190 | bbox_targets = bbox_targets \ 191 | .reshape((1, height, width, A * 4)).transpose(0, 3, 1, 2) 192 | top[1].reshape(*bbox_targets.shape) 193 | top[1].data[...] = bbox_targets 194 | 195 | # bbox_inside_weights 196 | bbox_inside_weights = bbox_inside_weights \ 197 | .reshape((1, height, width, A * 4)).transpose(0, 3, 1, 2) 198 | assert bbox_inside_weights.shape[2] == height 199 | assert bbox_inside_weights.shape[3] == width 200 | top[2].reshape(*bbox_inside_weights.shape) 201 | top[2].data[...] = bbox_inside_weights 202 | 203 | # bbox_outside_weights 204 | bbox_outside_weights = bbox_outside_weights \ 205 | .reshape((1, height, width, A * 4)).transpose(0, 3, 1, 2) 206 | assert bbox_outside_weights.shape[2] == height 207 | assert bbox_outside_weights.shape[3] == width 208 | top[3].reshape(*bbox_outside_weights.shape) 209 | top[3].data[...] = bbox_outside_weights 210 | 211 | def backward(self, top, propagate_down, bottom): 212 | """This layer does not propagate gradients.""" 213 | pass 214 | 215 | 216 | def _compute_targets(ex_rois, gt_rois): 217 | """ 218 | Compute bounding-box regression targets for an image. 
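    Worked example (illustrative): for an anchor ex = (0, 0, 15, 15)
    (w = h = 16, center (7.5, 7.5)) matched to gt = (8, 8, 23, 23, cls)
    (same size, center (15.5, 15.5)), bbox_transform yields
    (dx, dy, dw, dh) = (0.5, 0.5, 0.0, 0.0).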
219 |     Parameters:
220 |     -----------
221 |     ex_rois: ROIs from an external source (selective search or RPN)
222 |     gt_rois: ground truth rois
223 | 
224 |     Returns:
225 |     ---------
226 |     The regression targets for these anchors (combined with the anchors when generating proposals)
227 |     """
228 | 
229 |     assert ex_rois.shape[0] == gt_rois.shape[0]
230 |     assert ex_rois.shape[1] == 4
231 |     assert gt_rois.shape[1] == 5
232 | 
233 |     return bbox_transform(ex_rois, gt_rois[:, :4]).astype(np.float32, copy=False)
--------------------------------------------------------------------------------
/lib/pylayer/proposal_layer.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Multitask Network Cascade
3 | # Modified from py-faster-rcnn (https://github.com/rbgirshick/py-faster-rcnn)
4 | # Copyright (c) 2016, Haozhi Qi
5 | # Licensed under The MIT License [see LICENSE for details]
6 | # --------------------------------------------------------
7 | 
8 | import caffe
9 | import numpy as np
10 | import yaml
11 | 
12 | from mnc_config import cfg
13 | from transform.anchors import generate_anchors
14 | from transform.bbox_transform import clip_boxes, bbox_transform_inv, filter_small_boxes
15 | from nms.nms_wrapper import nms
16 | 
17 | DEBUG = False
18 | PRINT_GRADIENT = 1
19 | 
20 | 
21 | class ProposalLayer(caffe.Layer):
22 |     """
23 |     Outputs object detection proposals by applying estimated bounding-box
24 |     transformations to a set of regular boxes (called "anchors").
25 |     """
26 | 
27 |     def setup(self, bottom, top):
28 |         layer_params = yaml.load(self.param_str_)
29 |         self._feat_stride = layer_params['feat_stride']
30 |         self._anchors = generate_anchors()
31 |         self._num_anchors = self._anchors.shape[0]
32 |         self._use_clip = layer_params.get('use_clip', 0)
33 |         self._clip_denominator = float(layer_params.get('clip_base', 256))
34 |         self._clip_thresh = 1.0 / self._clip_denominator
35 |         # rois blob: holds R regions of interest, each is a 5-tuple
36 |         # (n, x1, y1, x2, y2) specifying an image batch index n and a
37 |         # rectangle (x1, y1, x2, y2)
38 |         self._top_name_map = {}
39 |         top[0].reshape(1, 5)
40 |         self._top_name_map['rois'] = 0
41 |         # For MNC, we force the output proposals to also be used to train the RPN;
42 |         # this is achieved by passing proposal_index to anchor_target_layer
43 |         if str(self.phase) == 'TRAIN':
44 |             if cfg.TRAIN.MIX_INDEX:
45 |                 top[1].reshape(1, 1)
46 |                 self._top_name_map['proposal_index'] = 1
47 | 
48 |     def reshape(self, bottom, top):
49 |         """Reshaping happens during the call to forward."""
50 |         pass
51 | 
52 |     def forward(self, bottom, top):
53 |         # Algorithm:
54 |         #
55 |         # for each (H, W) location i
56 |         #     generate A anchor boxes centered on cell i
57 |         #     apply predicted transform deltas at cell i to each of the A anchors
58 |         # clip predicted boxes to image
59 |         # remove predicted boxes with either height or width < threshold
60 |         # sort all (proposal, score) pairs by score from highest to lowest
61 |         # take top pre_nms_topN proposals before NMS
62 |         # apply NMS with threshold 0.7 to remaining proposals
63 |         # take after_nms_topN proposals after NMS
64 |         # return the top proposals (-> RoIs top, scores top)
65 |         assert bottom[0].data.shape[0] == 1, 'Only single item batches are supported'
66 | 
67 |         cfg_key = str(self.phase)  # either 'TRAIN' or 'TEST'
68 |         pre_nms_topN = cfg[cfg_key].RPN_PRE_NMS_TOP_N
69 |         post_nms_topN = cfg[cfg_key].RPN_POST_NMS_TOP_N
70 |         nms_thresh = cfg[cfg_key].RPN_NMS_THRESH
71 |         min_size = cfg[cfg_key].RPN_MIN_SIZE
72 | 
73 |         # the first set of _num_anchors channels are bg probs
74 |         # the second set are the fg probs, which we want
75 |         scores = bottom[0].data[:, self._num_anchors:, :, :]
76 |         bbox_deltas = bottom[1].data
77 |         im_info = bottom[2].data[0, :]
78 | 
79 |         # 1. Generate proposals from transform deltas and shifted anchors
80 |         height, width = scores.shape[-2:]
81 |         self._height = height
82 |         self._width = width
83 |         # Enumerate all shifts
84 |         shift_x = np.arange(0, self._width) * self._feat_stride
85 |         shift_y = np.arange(0, self._height) * self._feat_stride
86 |         shift_x, shift_y = np.meshgrid(shift_x, shift_y)
87 |         shifts = np.vstack((shift_x.ravel(), shift_y.ravel(),
88 |                             shift_x.ravel(), shift_y.ravel())).transpose()
89 | 
90 |         # Enumerate all shifted anchors:
91 |         #
92 |         # add A anchors (1, A, 4) to
93 |         # cell K shifts (K, 1, 4) to get
94 |         # shift anchors (K, A, 4)
95 |         # reshape to (K*A, 4) shifted anchors
96 |         A = self._num_anchors
97 |         K = shifts.shape[0]
98 |         anchors = self._anchors.reshape((1, A, 4)) + \
99 |                   shifts.reshape((1, K, 4)).transpose((1, 0, 2))
100 |         anchors = anchors.reshape((K * A, 4))
101 |         _, keep = clip_boxes(anchors, im_info[:2])
102 |         self._anchor_index_before_clip = keep
103 | 
104 |         # Transpose and reshape the predicted transform deltas to get them
105 |         # into the same order as the anchors:
106 |         #
107 |         # transform deltas will be (1, 4 * A, H, W) format
108 |         # transpose to (1, H, W, 4 * A)
109 |         # reshape to (1 * H * W * A, 4) where rows are ordered by (h, w, a)
110 |         # in slowest to fastest order
111 |         bbox_deltas = bbox_deltas.transpose((0, 2, 3, 1)).reshape((-1, 4))
112 | 
113 |         # Same story for the scores:
114 |         #
115 |         # scores are (1, A, H, W) format
116 |         # transpose to (1, H, W, A)
117 |         # reshape to (1 * H * W * A, 1) where rows are ordered by (h, w, a)
118 |         scores = scores.transpose((0, 2, 3, 1)).reshape((-1, 1))
119 | 
120 |         # Convert anchors into proposals by applying the transform deltas
121 |         proposals = bbox_transform_inv(anchors, bbox_deltas)
122 | 
123 |         # 2. clip predicted boxes to image
124 |         proposals, keep = clip_boxes(proposals, im_info[:2])
125 |         # Record the corresponding index before and after the clip.
126 |         # This step doesn't need unmapping;
127 |         # we need it to decide whether to back-propagate
128 |         self._proposal_index_before_clip = keep
129 | 
130 |         # 3. remove predicted boxes with either height or width < threshold
131 |         # (NOTE: convert min_size to input image scale stored in im_info[2])
132 |         keep = filter_small_boxes(proposals, min_size * im_info[2])
133 |         proposals = proposals[keep, :]
134 |         scores = scores[keep]
135 |         self._ind_after_filter = keep
136 | 
137 |         # 4. sort all (proposal, score) pairs by score from highest to lowest
138 |         # 5. take top pre_nms_topN (e.g. 6000)
139 |         order = scores.ravel().argsort()[::-1]
140 | 
141 |         if pre_nms_topN > 0:
142 |             order = order[:pre_nms_topN]
143 |         proposals = proposals[order, :]
144 |         scores = scores[order]
145 |         self._ind_after_sort = order
146 |         # 6. apply nms (e.g. threshold = 0.7)
147 |         # 7. take after_nms_topN (e.g. 300)
148 |         # 8. 
return the top proposals (-> RoIs top) 149 | keep = nms(np.hstack((proposals, scores)), nms_thresh) 150 | 151 | if post_nms_topN > 0: 152 | keep = keep[:post_nms_topN] 153 | proposals = proposals[keep, :] 154 | 155 | scores = scores[keep] 156 | # Output rois blob 157 | # Our RPN implementation only supports a single input image, so all 158 | # batch inds are 0 159 | batch_inds = np.zeros((proposals.shape[0], 1), dtype=np.float32) 160 | proposals = np.hstack((batch_inds, proposals.astype(np.float32, copy=False))) 161 | self._proposal_index = keep 162 | 163 | blobs = { 164 | 'rois': proposals 165 | } 166 | 167 | if str(self.phase) == 'TRAIN': 168 | if cfg.TRAIN.MIX_INDEX: 169 | all_rois_index = self._ind_after_filter[self._ind_after_sort[self._proposal_index]].reshape(1, len(keep)) 170 | blobs['proposal_index'] = all_rois_index 171 | 172 | # Copy data to forward to top layer 173 | for blob_name, blob in blobs.iteritems(): 174 | top[self._top_name_map[blob_name]].reshape(*blob.shape) 175 | top[self._top_name_map[blob_name]].data[...] = blob.astype(np.float32, copy=False) 176 | 177 | def backward(self, top, propagate_down, bottom): 178 | 179 | if propagate_down[1]: 180 | bottom[1].diff.fill(0.0) 181 | 182 | # first count only non-zero top gradient to accelerate computing 183 | top_non_zero_ind = np.unique(np.where(abs(top[0].diff[:, :]) > 0)[0]) 184 | proposal_index = np.asarray(self._proposal_index) 185 | # unmap indexes to the original scale 186 | unmap_val = self._ind_after_filter[self._ind_after_sort[proposal_index[top_non_zero_ind]]] 187 | 188 | # not back propagate gradient if proposals/anchors are out of image boundary 189 | # this is a 0/1 mask so we just multiply them when calculating bottom gradient 190 | weight_out_proposal = np.in1d(unmap_val, self._proposal_index_before_clip) 191 | weight_out_anchor = np.in1d(unmap_val, self._anchor_index_before_clip) 192 | 193 | # unmap_val are arranged as (H * W * A) as stated in forward comment 194 | # with A as the fastest dimension (which is different from caffe) 195 | c = unmap_val % self._num_anchors 196 | w = (unmap_val / self._num_anchors) % self._width 197 | h = (unmap_val / self._num_anchors / self._width) % self._height 198 | 199 | # width and height should be in feature map scale 200 | anchor_w = (self._anchors[c, 2] - self._anchors[c, 0]) 201 | anchor_h = (self._anchors[c, 3] - self._anchors[c, 1]) 202 | dfdx1 = top[0].diff[top_non_zero_ind, 1] 203 | dfdy1 = top[0].diff[top_non_zero_ind, 2] 204 | dfdx2 = top[0].diff[top_non_zero_ind, 3] 205 | dfdy2 = top[0].diff[top_non_zero_ind, 4] 206 | 207 | dfdxc = dfdx1 + dfdx2 208 | dfdyc = dfdy1 + dfdy2 209 | dfdw = 0.5 * (dfdx2 - dfdx1) 210 | dfdh = 0.5 * (dfdy2 - dfdy1) 211 | 212 | bottom[1].diff[0, 4*c, h, w] = \ 213 | dfdxc * anchor_w * weight_out_proposal * weight_out_anchor 214 | bottom[1].diff[0, 4*c+1, h, w] = \ 215 | dfdyc * anchor_h * weight_out_proposal * weight_out_anchor 216 | bottom[1].diff[0, 4*c+2, h, w] = \ 217 | dfdw * np.exp(bottom[1].data[0, 4*c+2, h, w]) * anchor_w * weight_out_proposal * weight_out_anchor 218 | bottom[1].diff[0, 4*c+3, h, w] = \ 219 | dfdh * np.exp(bottom[1].data[0, 4*c+3, h, w]) * anchor_h * weight_out_proposal * weight_out_anchor 220 | 221 | # if use gradient clip, constraint gradient inside [-thresh, thresh] 222 | if self._use_clip: 223 | bottom[1].diff[0, 4*c, h, w] = np.minimum(np.maximum( 224 | bottom[1].diff[0, 4*c, h, w], -self._clip_thresh), self._clip_thresh) 225 | bottom[1].diff[0, 4*c+1, h, w] = np.minimum(np.maximum( 226 | bottom[1].diff[0, 
4*c+1, h, w], -self._clip_thresh), self._clip_thresh) 227 | bottom[1].diff[0, 4*c+2, h, w] = np.minimum(np.maximum( 228 | bottom[1].diff[0, 4*c+2, h, w], -self._clip_thresh), self._clip_thresh) 229 | bottom[1].diff[0, 4*c+3, h, w] = np.minimum(np.maximum( 230 | bottom[1].diff[0, 4*c+3, h, w], -self._clip_thresh), self._clip_thresh) 231 | -------------------------------------------------------------------------------- /models/VGG16/cfm/test.prototxt: -------------------------------------------------------------------------------- 1 | name: "VGG16" 2 | 3 | input: "data" 4 | input_shape { 5 | dim: 1 6 | dim: 3 7 | dim: 224 8 | dim: 224 9 | } 10 | 11 | input: "rois" 12 | input_shape { 13 | dim: 1 14 | dim: 5 15 | } 16 | 17 | input: "masks" 18 | input_shape { 19 | dim: 1 20 | dim: 1 21 | dim: 14 22 | dim: 14 23 | } 24 | 25 | layer { 26 | name: "conv1_1" 27 | type: "Convolution" 28 | bottom: "data" 29 | top: "conv1_1" 30 | param { 31 | lr_mult: 0 32 | decay_mult: 0 33 | } 34 | param { 35 | lr_mult: 0 36 | decay_mult: 0 37 | } 38 | convolution_param { 39 | num_output: 64 40 | pad: 1 41 | kernel_size: 3 42 | } 43 | } 44 | layer { 45 | name: "relu1_1" 46 | type: "ReLU" 47 | bottom: "conv1_1" 48 | top: "conv1_1" 49 | } 50 | layer { 51 | name: "conv1_2" 52 | type: "Convolution" 53 | bottom: "conv1_1" 54 | top: "conv1_2" 55 | param { 56 | lr_mult: 0 57 | decay_mult: 0 58 | } 59 | param { 60 | lr_mult: 0 61 | decay_mult: 0 62 | } 63 | convolution_param { 64 | num_output: 64 65 | pad: 1 66 | kernel_size: 3 67 | } 68 | } 69 | layer { 70 | name: "relu1_2" 71 | type: "ReLU" 72 | bottom: "conv1_2" 73 | top: "conv1_2" 74 | } 75 | layer { 76 | name: "pool1" 77 | type: "Pooling" 78 | bottom: "conv1_2" 79 | top: "pool1" 80 | pooling_param { 81 | pool: MAX 82 | kernel_size: 2 83 | stride: 2 84 | } 85 | } 86 | layer { 87 | name: "conv2_1" 88 | type: "Convolution" 89 | bottom: "pool1" 90 | top: "conv2_1" 91 | param { 92 | lr_mult: 0 93 | decay_mult: 0 94 | } 95 | param { 96 | lr_mult: 0 97 | decay_mult: 0 98 | } 99 | convolution_param { 100 | num_output: 128 101 | pad: 1 102 | kernel_size: 3 103 | } 104 | } 105 | layer { 106 | name: "relu2_1" 107 | type: "ReLU" 108 | bottom: "conv2_1" 109 | top: "conv2_1" 110 | } 111 | layer { 112 | name: "conv2_2" 113 | type: "Convolution" 114 | bottom: "conv2_1" 115 | top: "conv2_2" 116 | param { 117 | lr_mult: 0 118 | decay_mult: 0 119 | } 120 | param { 121 | lr_mult: 0 122 | decay_mult: 0 123 | } 124 | convolution_param { 125 | num_output: 128 126 | pad: 1 127 | kernel_size: 3 128 | } 129 | } 130 | layer { 131 | name: "relu2_2" 132 | type: "ReLU" 133 | bottom: "conv2_2" 134 | top: "conv2_2" 135 | } 136 | layer { 137 | name: "pool2" 138 | type: "Pooling" 139 | bottom: "conv2_2" 140 | top: "pool2" 141 | pooling_param { 142 | pool: MAX 143 | kernel_size: 2 144 | stride: 2 145 | } 146 | } 147 | layer { 148 | name: "conv3_1" 149 | type: "Convolution" 150 | bottom: "pool2" 151 | top: "conv3_1" 152 | param { 153 | lr_mult: 1 154 | decay_mult: 1 155 | } 156 | param { 157 | lr_mult: 2 158 | decay_mult: 0 159 | } 160 | convolution_param { 161 | num_output: 256 162 | pad: 1 163 | kernel_size: 3 164 | } 165 | } 166 | layer { 167 | name: "relu3_1" 168 | type: "ReLU" 169 | bottom: "conv3_1" 170 | top: "conv3_1" 171 | } 172 | layer { 173 | name: "conv3_2" 174 | type: "Convolution" 175 | bottom: "conv3_1" 176 | top: "conv3_2" 177 | param { 178 | lr_mult: 1 179 | decay_mult: 1 180 | } 181 | param { 182 | lr_mult: 2 183 | decay_mult: 0 184 | } 185 | convolution_param { 186 | num_output: 256 187 
| pad: 1 188 | kernel_size: 3 189 | } 190 | } 191 | layer { 192 | name: "relu3_2" 193 | type: "ReLU" 194 | bottom: "conv3_2" 195 | top: "conv3_2" 196 | } 197 | layer { 198 | name: "conv3_3" 199 | type: "Convolution" 200 | bottom: "conv3_2" 201 | top: "conv3_3" 202 | param { 203 | lr_mult: 1 204 | decay_mult: 1 205 | } 206 | param { 207 | lr_mult: 2 208 | decay_mult: 0 209 | } 210 | convolution_param { 211 | num_output: 256 212 | pad: 1 213 | kernel_size: 3 214 | } 215 | } 216 | layer { 217 | name: "relu3_3" 218 | type: "ReLU" 219 | bottom: "conv3_3" 220 | top: "conv3_3" 221 | } 222 | layer { 223 | name: "pool3" 224 | type: "Pooling" 225 | bottom: "conv3_3" 226 | top: "pool3" 227 | pooling_param { 228 | pool: MAX 229 | kernel_size: 2 230 | stride: 2 231 | } 232 | } 233 | layer { 234 | name: "conv4_1" 235 | type: "Convolution" 236 | bottom: "pool3" 237 | top: "conv4_1" 238 | param { 239 | lr_mult: 1 240 | decay_mult: 1 241 | } 242 | param { 243 | lr_mult: 2 244 | decay_mult: 0 245 | } 246 | convolution_param { 247 | num_output: 512 248 | pad: 1 249 | kernel_size: 3 250 | } 251 | } 252 | layer { 253 | name: "relu4_1" 254 | type: "ReLU" 255 | bottom: "conv4_1" 256 | top: "conv4_1" 257 | } 258 | layer { 259 | name: "conv4_2" 260 | type: "Convolution" 261 | bottom: "conv4_1" 262 | top: "conv4_2" 263 | param { 264 | lr_mult: 1 265 | decay_mult: 1 266 | } 267 | param { 268 | lr_mult: 2 269 | decay_mult: 0 270 | } 271 | convolution_param { 272 | num_output: 512 273 | pad: 1 274 | kernel_size: 3 275 | } 276 | } 277 | layer { 278 | name: "relu4_2" 279 | type: "ReLU" 280 | bottom: "conv4_2" 281 | top: "conv4_2" 282 | } 283 | layer { 284 | name: "conv4_3" 285 | type: "Convolution" 286 | bottom: "conv4_2" 287 | top: "conv4_3" 288 | param { 289 | lr_mult: 1 290 | decay_mult: 1 291 | } 292 | param { 293 | lr_mult: 2 294 | decay_mult: 0 295 | } 296 | convolution_param { 297 | num_output: 512 298 | pad: 1 299 | kernel_size: 3 300 | } 301 | } 302 | layer { 303 | name: "relu4_3" 304 | type: "ReLU" 305 | bottom: "conv4_3" 306 | top: "conv4_3" 307 | } 308 | layer { 309 | name: "pool4" 310 | type: "Pooling" 311 | bottom: "conv4_3" 312 | top: "pool4" 313 | pooling_param { 314 | pool: MAX 315 | kernel_size: 2 316 | stride: 2 317 | } 318 | } 319 | layer { 320 | name: "conv5_1" 321 | type: "Convolution" 322 | bottom: "pool4" 323 | top: "conv5_1" 324 | param { 325 | lr_mult: 1 326 | decay_mult: 1 327 | } 328 | param { 329 | lr_mult: 2 330 | decay_mult: 0 331 | } 332 | convolution_param { 333 | num_output: 512 334 | pad: 1 335 | kernel_size: 3 336 | } 337 | } 338 | layer { 339 | name: "relu5_1" 340 | type: "ReLU" 341 | bottom: "conv5_1" 342 | top: "conv5_1" 343 | } 344 | layer { 345 | name: "conv5_2" 346 | type: "Convolution" 347 | bottom: "conv5_1" 348 | top: "conv5_2" 349 | param { 350 | lr_mult: 1 351 | decay_mult: 1 352 | } 353 | param { 354 | lr_mult: 2 355 | decay_mult: 0 356 | } 357 | convolution_param { 358 | num_output: 512 359 | pad: 1 360 | kernel_size: 3 361 | } 362 | } 363 | layer { 364 | name: "relu5_2" 365 | type: "ReLU" 366 | bottom: "conv5_2" 367 | top: "conv5_2" 368 | } 369 | layer { 370 | name: "conv5_3" 371 | type: "Convolution" 372 | bottom: "conv5_2" 373 | top: "conv5_3" 374 | param { 375 | lr_mult: 1 376 | decay_mult: 1 377 | } 378 | param { 379 | lr_mult: 2 380 | decay_mult: 0 381 | } 382 | convolution_param { 383 | num_output: 512 384 | pad: 1 385 | kernel_size: 3 386 | } 387 | } 388 | layer { 389 | name: "relu5_3" 390 | type: "ReLU" 391 | bottom: "conv5_3" 392 | top: "conv5_3" 393 | } 394 | 
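# Note: after four 2x2/stride-2 max poolings (pool1-pool4), conv5_3 is at
# 1/16 of the input resolution, which is why the ROIPooling layers below
# use spatial_scale: 0.0625 (= 1/16).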
395 | #-------- Box Feature -------- 396 | 397 | layer { 398 | name: "roi_pooling_conv5" 399 | type: "ROIPooling" 400 | bottom: "conv5_3" 401 | bottom: "rois" 402 | top: "roi_pooling_conv5" 403 | roi_pooling_param { 404 | pooled_w: 7 405 | pooled_h: 7 406 | spatial_scale: 0.0625 # 1/16 407 | } 408 | } 409 | 410 | layer { 411 | name: "fc6" 412 | type: "InnerProduct" 413 | bottom: "roi_pooling_conv5" 414 | top: "fc6" 415 | inner_product_param { 416 | num_output: 4096 417 | } 418 | } 419 | 420 | layer { 421 | name: "relu6" 422 | type: "ReLU" 423 | bottom: "fc6" 424 | top: "fc6" 425 | } 426 | 427 | layer { 428 | name: "fc7" 429 | type: "InnerProduct" 430 | bottom: "fc6" 431 | top: "fc7" 432 | inner_product_param { 433 | num_output: 4096 434 | } 435 | } 436 | 437 | layer { 438 | name: "relu7" 439 | type: "ReLU" 440 | bottom: "fc7" 441 | top: "fc7" 442 | } 443 | 444 | # -------- Mask Feature -------- 445 | 446 | layer { 447 | name: "roi_pooling_conv5_mask" 448 | type: "ROIPooling" 449 | bottom: "conv5_3" 450 | bottom: "rois" 451 | top: "roi_pooling_conv5_mask" 452 | roi_pooling_param { 453 | pooled_w: 14 454 | pooled_h: 14 455 | spatial_scale: 0.0625 # 1/16 456 | } 457 | } 458 | 459 | layer { 460 | name: "mask_pooling" 461 | type: "MaskPooling" 462 | bottom: "roi_pooling_conv5_mask" 463 | bottom: "masks" 464 | top: "roi_mask_conv5" 465 | } 466 | 467 | layer { 468 | name: "roi_mask_conv5" 469 | type: "Pooling" 470 | bottom: "roi_mask_conv5" 471 | top: "roi_mask_conv5_pool" 472 | pooling_param { 473 | kernel_size: 2 474 | stride: 2 475 | pad: 0 476 | pool: MAX 477 | } 478 | } 479 | 480 | layer { 481 | name: "fc6_mask" 482 | type: "InnerProduct" 483 | bottom: "roi_mask_conv5_pool" 484 | top: "fc6_mask" 485 | inner_product_param { 486 | num_output: 4096 487 | } 488 | } 489 | 490 | layer { 491 | name: "relu6_mask" 492 | type: "ReLU" 493 | bottom: "fc6_mask" 494 | top: "fc6_mask" 495 | } 496 | 497 | layer { 498 | name: "fc7_mask" 499 | type: "InnerProduct" 500 | bottom: "fc6_mask" 501 | top: "fc7_mask" 502 | inner_product_param { 503 | num_output: 4096 504 | } 505 | } 506 | 507 | layer { 508 | name: "relu7_mask" 509 | type: "ReLU" 510 | bottom: "fc7_mask" 511 | top: "fc7_mask" 512 | } 513 | 514 | 515 | # -------- Mask Estimation -------- 516 | 517 | layer { 518 | name: "fc6_maskest" 519 | type: "InnerProduct" 520 | bottom: "roi_pooling_conv5_mask" 521 | top: "fc6_maskest" 522 | inner_product_param { 523 | num_output: 256 524 | } 525 | } 526 | 527 | layer { 528 | name: "relu6_maskest" 529 | type: "ReLU" 530 | bottom: "fc6_maskest" 531 | top: "fc6_maskest" 532 | } 533 | 534 | layer { 535 | name: "mask_pred" 536 | type: "InnerProduct" 537 | bottom: "fc6_maskest" 538 | top: "mask_pred" 539 | inner_product_param { 540 | num_output: 441 # 21 * 21 541 | } 542 | } 543 | 544 | layer { 545 | name: "mask_prob" 546 | type: "Sigmoid" 547 | bottom: "mask_pred" 548 | top: "mask_prob" 549 | } 550 | 551 | # ----- Concat Box-Mask Feature ----- 552 | 553 | layer { 554 | name: "join_box_mask" 555 | type: "Concat" 556 | bottom: "fc7_mask" 557 | bottom: "fc7" 558 | top: "join_box_mask" 559 | concat_param { 560 | axis: 1 561 | } 562 | } 563 | 564 | # ---- Box Classification ---- 565 | 566 | layer { 567 | name: "cls_score" 568 | type: "InnerProduct" 569 | bottom: "join_box_mask" 570 | top: "cls_score" 571 | inner_product_param { 572 | num_output: 21 573 | } 574 | } 575 | 576 | layer { 577 | name: "cls_prob" 578 | type: "Softmax" 579 | bottom: "cls_score" 580 | top: "cls_prob" 581 | loss_param { 582 | ignore_label: -1 583 | 
normalize: true 584 | } 585 | } 586 | 587 | # ---- Mask Classification ---- 588 | 589 | layer { 590 | name: "seg_cls_score" 591 | type: "InnerProduct" 592 | bottom: "join_box_mask" 593 | top: "seg_cls_score" 594 | inner_product_param { 595 | num_output: 21 596 | } 597 | } 598 | layer { 599 | name: "seg_cls_prob" 600 | type: "Softmax" 601 | bottom: "seg_cls_score" 602 | top: "seg_cls_prob" 603 | loss_param { 604 | ignore_label: -1 605 | normalize: true 606 | } 607 | } 608 | 609 | layer { 610 | name: "bbox_pred" 611 | type: "InnerProduct" 612 | bottom: "join_box_mask" 613 | top: "bbox_pred" 614 | inner_product_param { 615 | num_output: 84 616 | } 617 | } 618 | -------------------------------------------------------------------------------- /lib/datasets/pascal_voc_seg.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Multitask Network Cascade 3 | # Written by Haozhi Qi 4 | # Copyright (c) 2016, Haozhi Qi 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # -------------------------------------------------------- 7 | 8 | import cPickle 9 | import os 10 | import scipy.io as sio 11 | import numpy as np 12 | from datasets.pascal_voc_det import PascalVOCDet 13 | from mnc_config import cfg 14 | from utils.vis_seg import vis_seg 15 | from utils.voc_eval import voc_eval_sds 16 | import scipy 17 | 18 | 19 | class PascalVOCSeg(PascalVOCDet): 20 | """ 21 | A subclass for datasets.imdb.imdb 22 | This class contains information of ROIDB and MaskDB 23 | This class implements roidb and maskdb related functions 24 | """ 25 | def __init__(self, image_set, year, devkit_path=None): 26 | PascalVOCDet.__init__(self, image_set, year, devkit_path) 27 | self._ori_image_num = len(self._image_index) 28 | self._comp_id = 'comp6' 29 | # PASCAL specific config options 30 | self.config = {'cleanup': True, 31 | 'use_salt': True, 32 | 'top_k': 2000, 33 | 'use_diff': False, 34 | 'matlab_eval': False, 35 | 'rpn_file': None} 36 | self._data_path = os.path.join(self._devkit_path) 37 | self._roidb_path = os.path.join(self.cache_path, 'voc_2012_' + image_set + '_mcg_maskdb') 38 | 39 | def image_path_at(self, i): 40 | image_path = os.path.join(self._data_path, 'img', self._image_index[i] + self._image_ext) 41 | assert os.path.exists(image_path), 'Path does not exist: {}'.format(image_path) 42 | return image_path 43 | 44 | def roidb_path_at(self, i): 45 | if i >= self._ori_image_num: 46 | return os.path.join(self._roidb_path, 47 | self.image_index[i % self._ori_image_num] + '_flip.mat') 48 | else: 49 | return os.path.join(self._roidb_path, 50 | self.image_index[i] + '.mat') 51 | 52 | def gt_maskdb(self): 53 | cache_file = os.path.join(self.cache_path, self.name + '_gt_maskdb.pkl') 54 | if os.path.exists(cache_file): 55 | with open(cache_file, 'rb') as fid: 56 | gt_maskdb = cPickle.load(fid) 57 | print '{} gt maskdb loaded from {}'.format(self.name, cache_file) 58 | else: 59 | num_image = len(self.image_index) 60 | gt_roidbs = self.gt_roidb() 61 | gt_maskdb = [self._load_sbd_mask_annotations(index, gt_roidbs) 62 | for index in xrange(num_image)] 63 | with open(cache_file, 'wb') as fid: 64 | cPickle.dump(gt_maskdb, fid, cPickle.HIGHEST_PROTOCOL) 65 | print 'wrote gt roidb to {}'.format(cache_file) 66 | return gt_maskdb 67 | 68 | def _load_image_set_index(self): 69 | image_set_file = os.path.join(self._data_path, self._image_set + '.txt') 70 | assert os.path.exists(image_set_file), 'Path does not exist: 
{}'.format(image_set_file)
71 |         with open(image_set_file) as f:
72 |             image_index = [x.strip() for x in f.readlines()]
73 |         return image_index
74 | 
75 |     def _load_sbd_mask_annotations(self, index, gt_roidbs):
76 |         """
77 |         Load gt_masks information from SBD's additional data
78 |         """
79 |         if index % 1000 == 0:
80 |             print '%d / %d' % (index, len(self._image_index))
81 |         image_name = self._image_index[index]
82 |         inst_file_name = os.path.join(self._data_path, 'inst', image_name + '.mat')
83 |         gt_inst_mat = scipy.io.loadmat(inst_file_name)
84 |         gt_inst_data = gt_inst_mat['GTinst']['Segmentation'][0][0]
85 |         unique_inst = np.unique(gt_inst_data)
86 |         background_ind = np.where(unique_inst == 0)[0]
87 |         unique_inst = np.delete(unique_inst, background_ind)
88 |         gt_roidb = gt_roidbs[index]
89 |         cls_file_name = os.path.join(self._data_path, 'cls', image_name + '.mat')
90 |         gt_cls_mat = scipy.io.loadmat(cls_file_name)
91 |         gt_cls_data = gt_cls_mat['GTcls']['Segmentation'][0][0]
92 |         gt_masks = []
93 |         for ind, inst_mask in enumerate(unique_inst):
94 |             box = gt_roidb['boxes'][ind]
95 |             im_mask = (gt_inst_data == inst_mask)
96 |             im_cls_mask = np.multiply(gt_cls_data, im_mask)
97 |             unique_cls_inst = np.unique(im_cls_mask)
98 |             background_ind = np.where(unique_cls_inst == 0)[0]
99 |             unique_cls_inst = np.delete(unique_cls_inst, background_ind)
100 |             assert len(unique_cls_inst) == 1
101 |             assert unique_cls_inst[0] == gt_roidb['gt_classes'][ind]
102 |             mask = im_mask[box[1]: box[3]+1, box[0]:box[2]+1]
103 |             gt_masks.append(mask)
104 | 
105 |         # Also record the maximum dimensions, to create fixed-size arrays when forwarding
106 |         mask_max_x = max(gt_masks[i].shape[1] for i in xrange(len(gt_masks)))
107 |         mask_max_y = max(gt_masks[i].shape[0] for i in xrange(len(gt_masks)))
108 |         return {
109 |             'gt_masks': gt_masks,
110 |             'mask_max': [mask_max_x, mask_max_y],
111 |             'flipped': False
112 |         }
113 | 
114 |     def append_flipped_masks(self):
115 |         """
116 |         This method is only accessed when we use maskdb, so it is implemented here.
117 |         Append flipped images to the mask database.
118 |         Note this method doesn't actually flip the 'image'; it flips the masks instead
119 |         """
120 |         cache_file = os.path.join(self.cache_path, self.name + '_' + cfg.TRAIN.PROPOSAL_METHOD + '_maskdb_flip.pkl')
121 |         if os.path.exists(cache_file):
122 |             with open(cache_file, 'rb') as fid:
123 |                 flip_maskdb = cPickle.load(fid)
124 |             print '{} gt flipped roidb loaded from {}'.format(self.name, cache_file)
125 |             self.maskdb.extend(flip_maskdb)
126 |             # Need to check this condition since otherwise we may occasionally *4
127 |             if self._image_index == self.num_images:
128 |                 self._image_index *= 2
129 |         else:
130 |             # keep the original image number for future development;
131 |             # currently unused since append_flipped_masks will only be called once
132 |             num_images = self._ori_image_num
133 |             flip_maskdb = []
134 |             for i in xrange(num_images):
135 |                 masks = self.maskdb[i]['gt_masks']
136 |                 masks_flip = []
137 |                 for mask_ind in xrange(len(masks)):
138 |                     mask_flip = np.fliplr(masks[mask_ind])
139 |                     masks_flip.append(mask_flip)
140 |                 entry = {'gt_masks': masks_flip,
141 |                          'mask_max': self.maskdb[i]['mask_max'],
142 |                          'flipped': True}
143 |                 flip_maskdb.append(entry)
144 |             with open(cache_file, 'wb') as fid:
145 |                 cPickle.dump(flip_maskdb, fid, cPickle.HIGHEST_PROTOCOL)
146 |             print 'wrote gt flipped maskdb to {}'.format(cache_file)
147 |             self.maskdb.extend(flip_maskdb)
148 |             # Need to check this condition since otherwise we may occasionally *4
149 |             if self._image_index == self.num_images:
150 |                 self._image_index *= 2
151 | 
152 |     def visualization_segmentation(self, output_dir):
153 |         vis_seg(self.image_index, self.classes, output_dir, self._data_path)
154 | 
155 |     # --------------------------- Evaluation ---------------------------
156 |     def evaluate_segmentation(self, all_boxes, all_masks, output_dir):
157 |         self._write_voc_seg_results_file(all_boxes, all_masks, output_dir)
158 |         self._py_evaluate_segmentation(output_dir)
159 | 
160 |     def _write_voc_seg_results_file(self, all_boxes, all_masks, output_dir):
161 |         """
162 |         Write results as a pkl file; note this is different from the
163 |         detection task since it's difficult to write masks to txt
164 |         """
165 |         # Always reformat the results, since sometimes the masks are not
166 |         # binary or are in shape (n, sz*sz) instead of (n, sz, sz)
167 |         all_boxes, all_masks = self._reformat_result(all_boxes, all_masks)
168 |         for cls_inds, cls in enumerate(self.classes):
169 |             if cls == '__background__':
170 |                 continue
171 |             print 'Writing {} VOC results file'.format(cls)
172 |             filename = os.path.join(output_dir, cls + '_det.pkl')
173 |             with open(filename, 'wb') as f:
174 |                 cPickle.dump(all_boxes[cls_inds], f, cPickle.HIGHEST_PROTOCOL)
175 |             filename = os.path.join(output_dir, cls + '_seg.pkl')
176 |             with open(filename, 'wb') as f:
177 |                 cPickle.dump(all_masks[cls_inds], f, cPickle.HIGHEST_PROTOCOL)
178 | 
179 |     def _reformat_result(self, boxes, masks):
180 |         num_images = len(self.image_index)
181 |         num_class = len(self.classes)
182 |         reformat_masks = [[[] for _ in xrange(num_images)]
183 |                           for _ in xrange(num_class)]
184 |         for cls_inds in xrange(1, num_class):
185 |             for img_inds in xrange(num_images):
186 |                 if len(masks[cls_inds][img_inds]) == 0:
187 |                     continue
188 |                 num_inst = masks[cls_inds][img_inds].shape[0]
189 |                 reformat_masks[cls_inds][img_inds] = masks[cls_inds][img_inds]\
190 |                     .reshape(num_inst, cfg.MASK_SIZE, cfg.MASK_SIZE)
191 |                 reformat_masks[cls_inds][img_inds] = reformat_masks[cls_inds][img_inds] >= cfg.BINARIZE_THRESH
192 |         all_masks = reformat_masks
193 |         return boxes, all_masks
194 | 
195 |     def _py_evaluate_segmentation(self, output_dir):
196 |         gt_dir = self._data_path
197 |         imageset_file = os.path.join(gt_dir, self._image_set + '.txt')
198 |         cache_dir = os.path.join(self._devkit_path, 'annotations_cache')
199 |         aps = []
200 |         # define this as true according to SDS's evaluation protocol
201 |         use_07_metric = True
202 |         print 'VOC07 metric? ' + ('Yes' if use_07_metric else 'No')
203 |         if not os.path.isdir(output_dir):
204 |             os.mkdir(output_dir)
205 |         print '~~~~~~ Evaluation uses min overlap = 0.5 ~~~~~~'
206 |         for i, cls in enumerate(self._classes):
207 |             if cls == '__background__':
208 |                 continue
209 |             det_filename = os.path.join(output_dir, cls + '_det.pkl')
210 |             seg_filename = os.path.join(output_dir, cls + '_seg.pkl')
211 |             ap = voc_eval_sds(det_filename, seg_filename, gt_dir,
212 |                               imageset_file, cls, cache_dir, self._classes, ov_thresh=0.5)
213 |             aps += [ap]
214 |             print('AP for {} = {:.2f}'.format(cls, ap*100))
215 |         print('Mean AP@0.5 = {:.2f}'.format(np.mean(aps)*100))
216 |         print '~~~~~~ Evaluation uses min overlap = 0.7 ~~~~~~'
217 |         aps = []
218 |         for i, cls in enumerate(self._classes):
219 |             if cls == '__background__':
220 |                 continue
221 |             det_filename = os.path.join(output_dir, cls + '_det.pkl')
222 |             seg_filename = os.path.join(output_dir, cls + '_seg.pkl')
223 |             ap = voc_eval_sds(det_filename, seg_filename, gt_dir,
224 |                               imageset_file, cls, cache_dir, self._classes, ov_thresh=0.7)
225 |             aps += [ap]
226 |             print('AP for {} = {:.2f}'.format(cls, ap*100))
227 |         print('Mean AP@0.7 = {:.2f}'.format(np.mean(aps)*100))
228 | 
229 | 
--------------------------------------------------------------------------------
/tools/prepare_mcg_maskdb.py:
--------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Multitask Network Cascade
3 | # Written by Haozhi Qi
4 | # Copyright (c) 2016, Haozhi Qi
5 | # Licensed under The MIT License [see LICENSE for details]
6 | # --------------------------------------------------------
7 | 
8 | # System modules
9 | import argparse
10 | import os
11 | import cPickle
12 | import numpy as np
13 | import scipy.io as sio
14 | import cv2
15 | from multiprocessing import Process
16 | import time
17 | import PIL
18 | # User-defined modules
19 | import _init_paths
20 | from mnc_config import cfg
21 | from utils.cython_bbox import bbox_overlaps
22 | from transform.mask_transform import mask_overlap, intersect_mask
23 | from datasets.pascal_voc_seg import PascalVOCSeg
24 | 
25 | 
26 | def parse_args():
27 |     """ Parse input arguments
28 |     """
29 |     parser = argparse.ArgumentParser(description='Prepare MCG roidb')
30 |     parser.add_argument('--input', dest='input_dir',
31 |                         help='folder containing the input MCG proposals',
32 |                         default='data/MCG-raw/', type=str)
33 |     parser.add_argument('--output', dest='output_dir',
34 |                         help='folder containing the output roidb', required=True,
35 |                         type=str)
36 |     parser.add_argument('--gt_roi', dest='roidb', help='roidb',
37 |                         default='data/cache/voc_2012_train_gt_roidb.pkl', type=str)
38 |     parser.add_argument('--gt_mask', dest='maskdb', help='maskdb',
39 |                         default='data/cache/voc_2012_train_gt_maskdb.pkl', type=str)
40 |     parser.add_argument('-mask_sz', dest='mask_size',
41 |                         help='compressed mask resolution',
42 |                         default=21, type=int)
43 |     parser.add_argument('--top_k', dest='top_k',
44 |                         help='number of generated proposals',
45 |                         default=-1, type=int)
46 |     parser.add_argument('--db', dest='db_name',
47 |                         help='train or validation',
48 |                         default='train', type=str)
49 |     parser.add_argument('--para_job', dest='para_job',
50 |                         help='number of parallel processes to launch',
51 |                         default='1', type=int)
52 |     return parser.parse_args()
53 | 
54 | 
55 | def process_roidb(file_start, file_end, db):
56 | 
57 |     for cnt in xrange(file_start, file_end):
58 |         f = file_list[cnt]
59 |         full_file = os.path.join(input_dir, f)
60 | 
output_cache = os.path.join(output_dir, f.split('.')[0] + '.mat') 61 | timer_tic = time.time() 62 | if os.path.exists(output_cache): 63 | continue 64 | mcg_mat = sio.loadmat(full_file) 65 | mcg_mask_label = mcg_mat['labels'] 66 | mcg_superpixels = mcg_mat['superpixels'] 67 | num_proposal = len(mcg_mask_label) 68 | mcg_boxes = np.zeros((num_proposal, 4)) 69 | mcg_masks = np.zeros((num_proposal, mask_size, mask_size), dtype=np.bool) 70 | 71 | for ind_proposal in xrange(num_proposal): 72 | label = mcg_mask_label[ind_proposal][0][0] 73 | proposal = np.in1d(mcg_superpixels, label).reshape(mcg_superpixels.shape) 74 | [r, c] = np.where(proposal == 1) 75 | y1 = np.min(r) 76 | x1 = np.min(c) 77 | y2 = np.max(r) 78 | x2 = np.max(c) 79 | box = np.array([x1, y1, x2, y2]) 80 | proposal = proposal[y1:y2+1, x1:x2+1] 81 | proposal = cv2.resize(proposal.astype(np.float), (mask_size, mask_size), interpolation=cv2.INTER_NEAREST) 82 | mcg_masks[ind_proposal, :, :] = proposal 83 | mcg_boxes[ind_proposal, :] = box 84 | 85 | if top_k != -1: 86 | mcg_boxes = mcg_boxes[:top_k, :] 87 | mcg_masks = mcg_masks[:top_k, :] 88 | 89 | if db == 'val': 90 | # if we prepare validation data, we only need its masks and boxes 91 | roidb = { 92 | 'masks': (mcg_masks >= cfg.BINARIZE_THRESH).astype(bool), 93 | 'boxes': mcg_boxes 94 | } 95 | sio.savemat(output_cache, roidb) 96 | use_time = time.time() - timer_tic 97 | print '%d/%d use time %f' % (cnt, len(file_list), use_time) 98 | 99 | else: 100 | # Otherwise we need to prepare other information like overlaps 101 | num_mcg = mcg_boxes.shape[0] 102 | gt_roidb = gt_roidbs[cnt] 103 | gt_maskdb = gt_maskdbs[cnt] 104 | gt_boxes = gt_roidb['boxes'] 105 | gt_masks = gt_maskdb['gt_masks'] 106 | gt_classes = gt_roidb['gt_classes'] 107 | num_gt = gt_boxes.shape[0] 108 | num_all = num_gt + num_mcg 109 | # define output structure 110 | det_overlaps = np.zeros((num_all, 1)) 111 | seg_overlaps = np.zeros((num_all, 1)) 112 | seg_assignment = np.zeros((num_all, 1)) 113 | mask_targets = np.zeros((num_all, mask_size, mask_size)) 114 | # ------------------------------------------------------ 115 | all_boxes = np.vstack((gt_boxes[:, :4], mcg_boxes)).astype(int) 116 | all_masks = np.zeros((num_all, mask_size, mask_size)) 117 | for i in xrange(num_gt): 118 | all_masks[i, :, :] = (cv2.resize(gt_masks[i].astype(np.float), 119 | (mask_size, mask_size))) 120 | assert all_masks[num_gt:, :, :].shape == mcg_masks.shape 121 | all_masks[num_gt:, :, :] = mcg_masks 122 | # record bounding box overlaps 123 | cur_overlap = bbox_overlaps(all_boxes.astype(np.float), gt_boxes.astype(np.float)) 124 | seg_assignment = cur_overlap.argmax(axis=1) 125 | det_overlaps = cur_overlap.max(axis=1) 126 | seg_assignment[det_overlaps == 0] = -1 127 | # record mask region overlaps 128 | seg_overlaps[:num_gt] = 1.0 129 | for i in xrange(num_gt, num_all): 130 | cur_mask = cv2.resize(all_masks[i, :, :].astype(np.float), 131 | (all_boxes[i, 2] - all_boxes[i, 0] + 1, 132 | all_boxes[i, 3] - all_boxes[i, 1] + 1)) >= cfg.BINARIZE_THRESH 133 | for mask_ind in xrange(len(gt_masks)): 134 | gt_mask = gt_masks[mask_ind] 135 | gt_roi = gt_roidb['boxes'][mask_ind] 136 | cur_ov = mask_overlap(all_boxes[i, :], gt_roi, cur_mask, gt_mask) 137 | seg_overlaps[i] = max(seg_overlaps[i], cur_ov) 138 | 139 | output_label = np.zeros((num_all, 1)) 140 | for i in xrange(num_all): 141 | if seg_assignment[i] == -1: 142 | continue 143 | cur_ind = seg_assignment[i] 144 | output_label[i] = gt_classes[seg_assignment[i]] 145 | mask_targets[i, :, :] = 
intersect_mask(all_boxes[i, :], gt_roidb['boxes'][cur_ind], gt_masks[cur_ind]) 146 | 147 | # Some of the array need to insert a new axis to be consistent of savemat method 148 | roidb = { 149 | 'masks': (all_masks >= cfg.BINARIZE_THRESH).astype(bool), 150 | 'boxes': all_boxes, 151 | 'det_overlap': det_overlaps[:, np.newaxis], 152 | 'seg_overlap': seg_overlaps, 153 | 'mask_targets': (mask_targets >= cfg.BINARIZE_THRESH).astype(bool), 154 | 'gt_classes': gt_classes[:, np.newaxis], 155 | 'output_label': output_label, 156 | 'gt_assignment': seg_assignment[:, np.newaxis], 157 | 'Flip': False 158 | } 159 | 160 | sio.savemat(output_cache, roidb) 161 | use_time = time.time() - timer_tic 162 | print '%d/%d use time %f' % (cnt, len(file_list), use_time) 163 | 164 | 165 | def process_flip_masks(image_names, im_start, im_end): 166 | 167 | widths = [PIL.Image.open('data/VOCdevkitSDS/img/' + im_name + '.jpg').size[0] for im_name in image_names] 168 | cache_dir = output_dir 169 | if not os.path.isdir(cache_dir): 170 | os.makedirs(cache_dir) 171 | 172 | for index in xrange(im_start, im_end): 173 | output_cache = os.path.join(cache_dir, image_names[index] + '_flip.mat') 174 | if os.path.exists(output_cache): 175 | continue 176 | image_cache = os.path.join(cache_dir, image_names[index] + '.mat') 177 | orig_maskdb = sio.loadmat(image_cache) 178 | # Flip mask and mask regression targets 179 | masks = orig_maskdb['masks'] 180 | mask_targets = orig_maskdb['mask_targets'] 181 | mask_flip = masks[:, :, ::-1] 182 | mask_target_flip = mask_targets[:, :, ::-1] 183 | # Flip boxes 184 | boxes = orig_maskdb['boxes'] 185 | oldx1 = boxes[:, 0].copy() 186 | oldx2 = boxes[:, 2].copy() 187 | boxes[:, 0] = widths[index] - oldx2 - 1 188 | boxes[:, 2] = widths[index] - oldx1 - 1 189 | assert (boxes[:, 2] >= boxes[:, 0]).all() 190 | # Other maskdb values are identical with original maskdb 191 | flip_maskdb = { 192 | 'masks': (mask_flip >= cfg.BINARIZE_THRESH).astype(bool), 193 | 'boxes': boxes, 194 | 'det_overlap': orig_maskdb['det_overlap'], 195 | 'seg_overlap': orig_maskdb['seg_overlap'], 196 | 'mask_targets': (mask_target_flip >= cfg.BINARIZE_THRESH).astype(bool), 197 | 'gt_classes': orig_maskdb['gt_classes'], 198 | 'gt_assignment': orig_maskdb['gt_assignment'], 199 | 'Flip': True, 200 | 'output_label': orig_maskdb['output_label'] 201 | } 202 | sio.savemat(output_cache, flip_maskdb) 203 | 204 | 205 | if __name__ == '__main__': 206 | args = parse_args() 207 | input_dir = args.input_dir 208 | assert os.path.exists(input_dir), 'Path does not exist: {}'.format(input_dir) 209 | output_dir = args.output_dir 210 | if not os.path.isdir(output_dir): 211 | os.makedirs(output_dir) 212 | mask_size = args.mask_size 213 | 214 | list_name = 'data/VOCdevkitSDS/train.txt' if args.db_name == 'train' else 'data/VOCdevkitSDS/val.txt' 215 | with open(list_name) as f: 216 | file_list = f.read().splitlines() 217 | 218 | # If we want to prepare training maskdb, first try to load gts 219 | if args.db_name == 'train': 220 | if os.path.exists(args.roidb) and os.path.exists(args.maskdb): 221 | with open(args.roidb, 'rb') as f: 222 | gt_roidbs = cPickle.load(f) 223 | with open(args.maskdb, 'rb') as f: 224 | gt_maskdbs = cPickle.load(f) 225 | else: 226 | db = PascalVOCSeg('train', '2012', 'data/VOCdevkitSDS/') 227 | gt_roidbs = db.gt_roidb() 228 | gt_maskdbs = db.gt_maskdb() 229 | 230 | top_k = args.top_k 231 | num_process = args.para_job 232 | # Prepare train/val maskdb use multi-process 233 | processes = [] 234 | file_start = 0 235 | file_offset = 
int(np.ceil(len(file_list) / float(num_process))) 236 | for process_id in xrange(num_process): 237 | file_end = min(file_start + file_offset, len(file_list)) 238 | p = Process(target=process_roidb, args=(file_start, file_end, args.db_name)) 239 | p.start() 240 | processes.append(p) 241 | file_start += file_offset 242 | 243 | for p in processes: 244 | p.join() 245 | 246 | # If db_name == 'train', we still need to add flipped maskdb into output folder 247 | # Add flipped mask and mask regression targets after prepare the original mcg proposals 248 | if args.db_name == 'train': 249 | print 'Appending flipped MCG to ROI' 250 | processes = [] 251 | file_start = 0 252 | file_offset = int(np.ceil(len(file_list) / float(num_process))) 253 | for process_id in xrange(num_process): 254 | file_end = min(file_start + file_offset, len(file_list)) 255 | p = Process(target=process_flip_masks, args=(file_list, file_start, file_end)) 256 | p.start() 257 | processes.append(p) 258 | file_start += file_offset 259 | for p in processes: 260 | p.join() 261 | -------------------------------------------------------------------------------- /lib/transform/mask_transform.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Multitask Network Cascade 3 | # Written by Haozhi Qi 4 | # Copyright (c) 2016, Haozhi Qi 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | import cv2 10 | from mnc_config import cfg 11 | from nms.nms_wrapper import nms 12 | from utils.cython_bbox import bbox_overlaps 13 | from nms.mv import mv 14 | 15 | 16 | def mask_overlap(box1, box2, mask1, mask2): 17 | """ 18 | This function calculate region IOU when masks are 19 | inside different boxes 20 | Returns: 21 | intersection over unions of this two masks 22 | """ 23 | x1 = max(box1[0], box2[0]) 24 | y1 = max(box1[1], box2[1]) 25 | x2 = min(box1[2], box2[2]) 26 | y2 = min(box1[3], box2[3]) 27 | if x1 > x2 or y1 > y2: 28 | return 0 29 | w = x2 - x1 + 1 30 | h = y2 - y1 + 1 31 | # get masks in the intersection part 32 | start_ya = y1 - box1[1] 33 | start_xa = x1 - box1[0] 34 | inter_maska = mask1[start_ya: start_ya + h, start_xa:start_xa + w] 35 | 36 | start_yb = y1 - box2[1] 37 | start_xb = x1 - box2[0] 38 | inter_maskb = mask2[start_yb: start_yb + h, start_xb:start_xb + w] 39 | 40 | assert inter_maska.shape == inter_maskb.shape 41 | 42 | inter = np.logical_and(inter_maskb, inter_maska).sum() 43 | union = mask1.sum() + mask2.sum() - inter 44 | if union < 1.0: 45 | return 0 46 | return float(inter) / float(union) 47 | 48 | 49 | def intersect_mask(ex_box, gt_box, gt_mask): 50 | """ 51 | This function calculate the intersection part of a external box 52 | and gt_box, mask it according to gt_mask 53 | 54 | Args: 55 | ex_box: external ROIS 56 | gt_box: ground truth boxes 57 | gt_mask: ground truth masks, not been resized yet 58 | Returns: 59 | regression_target: logical numpy array 60 | """ 61 | x1 = max(ex_box[0], gt_box[0]) 62 | y1 = max(ex_box[1], gt_box[1]) 63 | x2 = min(ex_box[2], gt_box[2]) 64 | y2 = min(ex_box[3], gt_box[3]) 65 | if x1 > x2 or y1 > y2: 66 | return np.zeros((21, 21), dtype=bool) 67 | w = x2 - x1 + 1 68 | h = y2 - y1 + 1 69 | ex_starty = y1 - ex_box[1] 70 | ex_startx = x1 - ex_box[0] 71 | 72 | gt_starty = y1 - gt_box[1] 73 | gt_startx = x1 - gt_box[0] 74 | inter_maskb = gt_mask[gt_starty: gt_starty + h, gt_startx: gt_startx + w] 75 
| regression_target = np.zeros((ex_box[3] - ex_box[1] + 1, ex_box[2] - ex_box[0] + 1)) 76 | regression_target[ex_starty: ex_starty + h, ex_startx: ex_startx + w] = inter_maskb 77 | regression_target = regression_target.astype(np.float32) 78 | regression_target = cv2.resize(regression_target, (cfg.MASK_SIZE, cfg.MASK_SIZE)) 79 | regression_target = regression_target >= cfg.BINARIZE_THRESH 80 | return regression_target 81 | 82 | 83 | def clip_masked_boxes(boxes, masks, im_shape): 84 | """ 85 | Clipped masked boxes inside image boundary 86 | """ 87 | num_box = boxes.shape[0] 88 | for i in xrange(num_box): 89 | box = np.round(boxes[i]).astype(int) 90 | mask = cv2.resize(masks[i, 0].astype(np.float32), (box[2] - box[0] + 1, box[3] - box[1] + 1)) 91 | clip_x1 = max(0, 0 - box[0]) 92 | clip_y1 = max(0, 0 - box[1]) 93 | clip_width = min(box[2], im_shape[1] - 1) - clip_x1 94 | clip_height = min(box[3], im_shape[0] - 1) - clip_y1 95 | clip_x2 = clip_x1 + clip_width 96 | clip_y2 = clip_y1 + clip_height 97 | mask = mask[clip_y1:clip_y2, clip_x1:clip_x2] 98 | masks[i, 0] = cv2.resize(mask.astype(np.float32), (cfg.MASK_SIZE, cfg.MASK_SIZE)) 99 | box[0] = clip_x1 100 | box[1] = clip_y1 101 | box[2] = clip_x2 102 | box[3] = clip_y2 103 | boxes[i] = box 104 | return boxes, masks 105 | 106 | 107 | def mask_aggregation(boxes, masks, mask_weights, im_width, im_height): 108 | """ 109 | This function implements mask voting mechanism to give finer mask 110 | n is the candidate boxes (masks) number 111 | Args: 112 | masks: All masks need to be aggregated (n x sz x sz) 113 | mask_weights: class score associated with each mask (n x 1) 114 | boxes: tight box enclose each mask (n x 4) 115 | im_width, im_height: image information 116 | TODO: Ensure mask size is sz x sz or tight box size 117 | """ 118 | assert boxes.shape[0] == len(masks) and boxes.shape[0] == mask_weights.shape[0] 119 | im_mask = np.zeros((im_height, im_width)) 120 | for mask_ind in xrange(len(masks)): 121 | box = np.round(boxes[mask_ind]) 122 | mask = (masks[mask_ind] >= cfg.BINARIZE_THRESH).astype(float) 123 | mask_weight = mask_weights[mask_ind] 124 | im_mask[box[1]:box[3]+1, box[0]:box[2]+1] += mask * mask_weight 125 | [r, c] = np.where(im_mask >= cfg.BINARIZE_THRESH) 126 | if len(r) == 0 or len(c) == 0: 127 | min_y = np.ceil(im_height / 2) 128 | min_x = np.ceil(im_width / 2) 129 | max_y = min_y 130 | max_x = min_x 131 | else: 132 | min_y = np.min(r) 133 | min_x = np.min(c) 134 | max_y = np.max(r) 135 | max_x = np.max(c) 136 | 137 | clipped_mask = im_mask[min_y:max_y+1, min_x:max_x+1] 138 | clipped_box = np.array((min_x, min_y, max_x, max_y), dtype=np.float32) 139 | return clipped_mask, clipped_box 140 | 141 | 142 | def cpu_mask_voting(masks, boxes, scores, num_classes, max_per_image, im_width, im_height): 143 | """ 144 | Wrapper function for mask voting, note we already know the class of boxes and masks 145 | Args: 146 | masks: ~ n x mask_sz x mask_sz 147 | boxes: ~ n x 4 148 | scores: ~ n x 1 149 | max_per_image: default would be 100 150 | im_width: width of image 151 | im_height: height of image 152 | """ 153 | # apply nms and sort to get first images according to their scores 154 | scores = scores[:, 1:] 155 | num_detect = boxes.shape[0] 156 | res_mask = [[] for _ in xrange(num_detect)] 157 | for i in xrange(num_detect): 158 | box = np.round(boxes[i]).astype(int) 159 | mask = cv2.resize(masks[i, 0].astype(np.float32), (box[2] - box[0] + 1, box[3] - box[1] + 1)) 160 | res_mask[i] = mask 161 | # Intermediate results 162 | sup_boxes = [] 163 | 
sup_masks = [] 164 | sup_scores = [] 165 | tobesort_scores = [] 166 | 167 | for i in xrange(num_classes - 1): 168 | dets = np.hstack((boxes.astype(np.float32), scores[:, i:i+1])) 169 | inds = nms(dets, cfg.TEST.MASK_MERGE_NMS_THRESH) 170 | ind_boxes = boxes[inds] 171 | ind_masks = masks[inds] 172 | ind_scores = scores[inds, i] 173 | order = ind_scores.ravel().argsort()[::-1] 174 | num_keep = min(len(order), max_per_image) 175 | order = order[0:num_keep] 176 | sup_boxes.append(ind_boxes[order]) 177 | sup_masks.append(ind_masks[order]) 178 | sup_scores.append(ind_scores[order]) 179 | tobesort_scores.extend(ind_scores[order]) 180 | 181 | sorted_scores = np.sort(tobesort_scores)[::-1] 182 | num_keep = min(len(sorted_scores), max_per_image) 183 | thresh = sorted_scores[num_keep-1] 184 | result_box = [] 185 | result_mask = [] 186 | for c in xrange(num_classes - 1): 187 | cls_box = sup_boxes[c] 188 | cls_score = sup_scores[c] 189 | keep = np.where(cls_score >= thresh)[0] 190 | new_sup_boxes = cls_box[keep] 191 | num_sup_box = len(new_sup_boxes) 192 | masks_ar = np.zeros((num_sup_box, 1, cfg.MASK_SIZE, cfg.MASK_SIZE)) 193 | boxes_ar = np.zeros((num_sup_box, 4)) 194 | for i in xrange(num_sup_box): 195 | # Get weights according to their segmentation scores 196 | cur_ov = bbox_overlaps(boxes.astype(np.float), new_sup_boxes[i, np.newaxis].astype(np.float)) 197 | cur_inds = np.where(cur_ov >= cfg.TEST.MASK_MERGE_IOU_THRESH)[0] 198 | cur_weights = scores[cur_inds, c] 199 | cur_weights = cur_weights / sum(cur_weights) 200 | # Re-format mask when passing it to mask_aggregation 201 | pass_mask = [res_mask[j] for j in list(cur_inds)] 202 | # do mask aggregation 203 | tmp_mask, boxes_ar[i] = mask_aggregation(boxes[cur_inds], pass_mask, cur_weights, im_width, im_height) 204 | tmp_mask = cv2.resize(tmp_mask.astype(np.float32), (cfg.MASK_SIZE, cfg.MASK_SIZE)) 205 | masks_ar[i, 0] = tmp_mask 206 | # make new array such that scores is the last dimension of boxes 207 | boxes_scored_ar = np.hstack((boxes_ar, cls_score[keep, np.newaxis])) 208 | result_box.append(boxes_scored_ar) 209 | result_mask.append(masks_ar) 210 | return result_box, result_mask 211 | 212 | 213 | def gpu_mask_voting(masks, boxes, scores, num_classes, max_per_image, im_width, im_height): 214 | """ 215 | A wrapper function, note we already know the class of boxes and masks 216 | Args: 217 | masks: ~ 300 x 21 x 21 218 | boxes: ~ 300 x 4 219 | scores: ~ 300 x 1 220 | max_per_image: default would be 100 221 | im_width: 222 | im_height: 223 | """ 224 | # Intermediate results 225 | sup_boxes = [] 226 | sup_scores = [] 227 | tobesort_scores = [] 228 | for i in xrange(num_classes): 229 | if i == 0: 230 | sup_boxes.append([]) 231 | sup_scores.append([]) 232 | continue 233 | dets = np.hstack((boxes.astype(np.float32), scores[:, i:i+1])) 234 | inds = nms(dets, cfg.TEST.MASK_MERGE_NMS_THRESH) 235 | ind_boxes = boxes[inds] 236 | ind_scores = scores[inds, i] 237 | num_keep = min(len(ind_scores), max_per_image) 238 | sup_boxes.append(ind_boxes[0:num_keep, :]) 239 | sup_scores.append(ind_scores[0:num_keep]) 240 | tobesort_scores.extend(ind_scores[0:num_keep]) 241 | 242 | sorted_scores = np.sort(tobesort_scores)[::-1] 243 | num_keep = min(len(sorted_scores), max_per_image) 244 | thresh = sorted_scores[num_keep-1] 245 | # inds array to record which mask should be aggregated together 246 | candidate_inds = [] 247 | # weight for each element in the candidate inds 248 | candidate_weights = [] 249 | # start position for candidate array 250 | candidate_start = [] 251 | 
candidate_scores = [] 252 | class_bar = [] 253 | for c in xrange(num_classes): 254 | if c == 0: 255 | continue 256 | cls_box = sup_boxes[c] 257 | cls_score = sup_scores[c] 258 | keep = np.where(cls_score >= thresh)[0] 259 | new_sup_boxes = cls_box[keep] 260 | num_sup_box = len(new_sup_boxes) 261 | for i in xrange(num_sup_box): 262 | cur_ov = bbox_overlaps(boxes.astype(np.float), new_sup_boxes[i, np.newaxis].astype(np.float)) 263 | cur_inds = np.where(cur_ov >= cfg.TEST.MASK_MERGE_IOU_THRESH)[0] 264 | candidate_inds.extend(cur_inds) 265 | cur_weights = scores[cur_inds, c] 266 | cur_weights = cur_weights / sum(cur_weights) 267 | candidate_weights.extend(cur_weights) 268 | candidate_start.append(len(candidate_inds)) 269 | candidate_scores.extend(cls_score[keep]) 270 | class_bar.append(len(candidate_scores)) 271 | candidate_inds = np.array(candidate_inds, dtype=np.int32) 272 | candidate_weights = np.array(candidate_weights, dtype=np.float32) 273 | candidate_start = np.array(candidate_start, dtype=np.int32) 274 | candidate_scores = np.array(candidate_scores, dtype=np.float32) 275 | result_mask, result_box = mv(boxes.astype(np.float32), masks, candidate_inds, candidate_start, candidate_weights, im_height, im_width) 276 | result_box = np.hstack((result_box, candidate_scores[:, np.newaxis])) 277 | list_result_box = [] 278 | list_result_mask = [] 279 | # separate result mask into different classes 280 | for i in xrange(num_classes - 1): 281 | cls_start = class_bar[i - 1] if i > 0 else 0 282 | cls_end = class_bar[i] 283 | list_result_box.append(result_box[cls_start:cls_end, :]) 284 | list_result_mask.append(result_mask[cls_start:cls_end, :, :, :]) 285 | 286 | return list_result_mask, list_result_box 287 | -------------------------------------------------------------------------------- /models/VGG16/faster_rcnn_end2end/test.prototxt: -------------------------------------------------------------------------------- 1 | name: "VGG_ILSVRC_16_layers" 2 | 3 | input: "data" 4 | input_shape { 5 | dim: 1 6 | dim: 3 7 | dim: 224 8 | dim: 224 9 | } 10 | 11 | input: "im_info" 12 | input_shape { 13 | dim: 1 14 | dim: 3 15 | } 16 | 17 | # ------------ Convolution ----------- 18 | 19 | layer { 20 | name: "conv1_1" 21 | type: "Convolution" 22 | bottom: "data" 23 | top: "conv1_1" 24 | param { 25 | lr_mult: 0 26 | decay_mult: 0 27 | } 28 | param { 29 | lr_mult: 0 30 | decay_mult: 0 31 | } 32 | convolution_param { 33 | num_output: 64 34 | pad: 1 35 | kernel_size: 3 36 | } 37 | } 38 | layer { 39 | name: "relu1_1" 40 | type: "ReLU" 41 | bottom: "conv1_1" 42 | top: "conv1_1" 43 | } 44 | layer { 45 | name: "conv1_2" 46 | type: "Convolution" 47 | bottom: "conv1_1" 48 | top: "conv1_2" 49 | param { 50 | lr_mult: 0 51 | decay_mult: 0 52 | } 53 | param { 54 | lr_mult: 0 55 | decay_mult: 0 56 | } 57 | convolution_param { 58 | num_output: 64 59 | pad: 1 60 | kernel_size: 3 61 | } 62 | } 63 | layer { 64 | name: "relu1_2" 65 | type: "ReLU" 66 | bottom: "conv1_2" 67 | top: "conv1_2" 68 | } 69 | layer { 70 | name: "pool1" 71 | type: "Pooling" 72 | bottom: "conv1_2" 73 | top: "pool1" 74 | pooling_param { 75 | pool: MAX 76 | kernel_size: 2 77 | stride: 2 78 | } 79 | } 80 | layer { 81 | name: "conv2_1" 82 | type: "Convolution" 83 | bottom: "pool1" 84 | top: "conv2_1" 85 | param { 86 | lr_mult: 0 87 | decay_mult: 0 88 | } 89 | param { 90 | lr_mult: 0 91 | decay_mult: 0 92 | } 93 | convolution_param { 94 | num_output: 128 95 | pad: 1 96 | kernel_size: 3 97 | } 98 | } 99 | layer { 100 | name: "relu2_1" 101 | type: "ReLU" 102 | bottom: 
"conv2_1" 103 | top: "conv2_1" 104 | } 105 | layer { 106 | name: "conv2_2" 107 | type: "Convolution" 108 | bottom: "conv2_1" 109 | top: "conv2_2" 110 | param { 111 | lr_mult: 0 112 | decay_mult: 0 113 | } 114 | param { 115 | lr_mult: 0 116 | decay_mult: 0 117 | } 118 | convolution_param { 119 | num_output: 128 120 | pad: 1 121 | kernel_size: 3 122 | } 123 | } 124 | layer { 125 | name: "relu2_2" 126 | type: "ReLU" 127 | bottom: "conv2_2" 128 | top: "conv2_2" 129 | } 130 | layer { 131 | name: "pool2" 132 | type: "Pooling" 133 | bottom: "conv2_2" 134 | top: "pool2" 135 | pooling_param { 136 | pool: MAX 137 | kernel_size: 2 138 | stride: 2 139 | } 140 | } 141 | layer { 142 | name: "conv3_1" 143 | type: "Convolution" 144 | bottom: "pool2" 145 | top: "conv3_1" 146 | param { 147 | lr_mult: 1 148 | decay_mult: 1 149 | } 150 | param { 151 | lr_mult: 2 152 | decay_mult: 0 153 | } 154 | convolution_param { 155 | num_output: 256 156 | pad: 1 157 | kernel_size: 3 158 | } 159 | } 160 | layer { 161 | name: "relu3_1" 162 | type: "ReLU" 163 | bottom: "conv3_1" 164 | top: "conv3_1" 165 | } 166 | layer { 167 | name: "conv3_2" 168 | type: "Convolution" 169 | bottom: "conv3_1" 170 | top: "conv3_2" 171 | param { 172 | lr_mult: 1 173 | decay_mult: 1 174 | } 175 | param { 176 | lr_mult: 2 177 | decay_mult: 0 178 | } 179 | convolution_param { 180 | num_output: 256 181 | pad: 1 182 | kernel_size: 3 183 | } 184 | } 185 | layer { 186 | name: "relu3_2" 187 | type: "ReLU" 188 | bottom: "conv3_2" 189 | top: "conv3_2" 190 | } 191 | layer { 192 | name: "conv3_3" 193 | type: "Convolution" 194 | bottom: "conv3_2" 195 | top: "conv3_3" 196 | param { 197 | lr_mult: 1 198 | decay_mult: 1 199 | } 200 | param { 201 | lr_mult: 2 202 | decay_mult: 0 203 | } 204 | convolution_param { 205 | num_output: 256 206 | pad: 1 207 | kernel_size: 3 208 | } 209 | } 210 | layer { 211 | name: "relu3_3" 212 | type: "ReLU" 213 | bottom: "conv3_3" 214 | top: "conv3_3" 215 | } 216 | layer { 217 | name: "pool3" 218 | type: "Pooling" 219 | bottom: "conv3_3" 220 | top: "pool3" 221 | pooling_param { 222 | pool: MAX 223 | kernel_size: 2 224 | stride: 2 225 | } 226 | } 227 | layer { 228 | name: "conv4_1" 229 | type: "Convolution" 230 | bottom: "pool3" 231 | top: "conv4_1" 232 | param { 233 | lr_mult: 1 234 | decay_mult: 1 235 | } 236 | param { 237 | lr_mult: 2 238 | decay_mult: 0 239 | } 240 | convolution_param { 241 | num_output: 512 242 | pad: 1 243 | kernel_size: 3 244 | } 245 | } 246 | layer { 247 | name: "relu4_1" 248 | type: "ReLU" 249 | bottom: "conv4_1" 250 | top: "conv4_1" 251 | } 252 | layer { 253 | name: "conv4_2" 254 | type: "Convolution" 255 | bottom: "conv4_1" 256 | top: "conv4_2" 257 | param { 258 | lr_mult: 1 259 | decay_mult: 1 260 | } 261 | param { 262 | lr_mult: 2 263 | decay_mult: 0 264 | } 265 | convolution_param { 266 | num_output: 512 267 | pad: 1 268 | kernel_size: 3 269 | } 270 | } 271 | layer { 272 | name: "relu4_2" 273 | type: "ReLU" 274 | bottom: "conv4_2" 275 | top: "conv4_2" 276 | } 277 | layer { 278 | name: "conv4_3" 279 | type: "Convolution" 280 | bottom: "conv4_2" 281 | top: "conv4_3" 282 | param { 283 | lr_mult: 1 284 | decay_mult: 1 285 | } 286 | param { 287 | lr_mult: 2 288 | decay_mult: 0 289 | } 290 | convolution_param { 291 | num_output: 512 292 | pad: 1 293 | kernel_size: 3 294 | } 295 | } 296 | layer { 297 | name: "relu4_3" 298 | type: "ReLU" 299 | bottom: "conv4_3" 300 | top: "conv4_3" 301 | } 302 | layer { 303 | name: "pool4" 304 | type: "Pooling" 305 | bottom: "conv4_3" 306 | top: "pool4" 307 | pooling_param { 
308 | pool: MAX 309 | kernel_size: 2 310 | stride: 2 311 | } 312 | } 313 | layer { 314 | name: "conv5_1" 315 | type: "Convolution" 316 | bottom: "pool4" 317 | top: "conv5_1" 318 | param { 319 | lr_mult: 1 320 | decay_mult: 1 321 | } 322 | param { 323 | lr_mult: 2 324 | decay_mult: 0 325 | } 326 | convolution_param { 327 | num_output: 512 328 | pad: 1 329 | kernel_size: 3 330 | } 331 | } 332 | layer { 333 | name: "relu5_1" 334 | type: "ReLU" 335 | bottom: "conv5_1" 336 | top: "conv5_1" 337 | } 338 | layer { 339 | name: "conv5_2" 340 | type: "Convolution" 341 | bottom: "conv5_1" 342 | top: "conv5_2" 343 | param { 344 | lr_mult: 1 345 | decay_mult: 1 346 | } 347 | param { 348 | lr_mult: 2 349 | decay_mult: 0 350 | } 351 | convolution_param { 352 | num_output: 512 353 | pad: 1 354 | kernel_size: 3 355 | } 356 | } 357 | layer { 358 | name: "relu5_2" 359 | type: "ReLU" 360 | bottom: "conv5_2" 361 | top: "conv5_2" 362 | } 363 | layer { 364 | name: "conv5_3" 365 | type: "Convolution" 366 | bottom: "conv5_2" 367 | top: "conv5_3" 368 | param { 369 | lr_mult: 1 370 | decay_mult: 1 371 | } 372 | param { 373 | lr_mult: 2 374 | decay_mult: 0 375 | } 376 | convolution_param { 377 | num_output: 512 378 | pad: 1 379 | kernel_size: 3 380 | } 381 | } 382 | layer { 383 | name: "relu5_3" 384 | type: "ReLU" 385 | bottom: "conv5_3" 386 | top: "conv5_3" 387 | } 388 | 389 | #------------ RPN ------------ 390 | 391 | layer { 392 | name: "rpn_conv/3x3" 393 | type: "Convolution" 394 | bottom: "conv5_3" 395 | top: "rpn/output" 396 | param { lr_mult: 1.0 decay_mult: 1.0 } 397 | param { lr_mult: 2.0 decay_mult: 0 } 398 | convolution_param { 399 | num_output: 512 400 | kernel_size: 3 pad: 1 stride: 1 401 | weight_filler { type: "gaussian" std: 0.01 } 402 | bias_filler { type: "constant" value: 0 } 403 | } 404 | } 405 | layer { 406 | name: "rpn_relu/3x3" 407 | type: "ReLU" 408 | bottom: "rpn/output" 409 | top: "rpn/output" 410 | } 411 | 412 | layer { 413 | name: "rpn_cls_score" 414 | type: "Convolution" 415 | bottom: "rpn/output" 416 | top: "rpn_cls_score" 417 | param { lr_mult: 1.0 decay_mult: 1.0 } 418 | param { lr_mult: 2.0 decay_mult: 0 } 419 | convolution_param { 420 | num_output: 18 # 2(bg/fg) * 9(anchors) 421 | kernel_size: 1 pad: 0 stride: 1 422 | weight_filler { type: "gaussian" std: 0.01 } 423 | bias_filler { type: "constant" value: 0 } 424 | } 425 | } 426 | layer { 427 | name: "rpn_bbox_pred" 428 | type: "Convolution" 429 | bottom: "rpn/output" 430 | top: "rpn_bbox_pred" 431 | param { lr_mult: 1.0 decay_mult: 1.0 } 432 | param { lr_mult: 2.0 decay_mult: 0 } 433 | convolution_param { 434 | num_output: 36 # 4 * 9(anchors) 435 | kernel_size: 1 pad: 0 stride: 1 436 | weight_filler { type: "gaussian" std: 0.01 } 437 | bias_filler { type: "constant" value: 0 } 438 | } 439 | } 440 | layer { 441 | bottom: "rpn_cls_score" 442 | top: "rpn_cls_score_reshape" 443 | name: "rpn_cls_score_reshape" 444 | type: "Reshape" 445 | reshape_param { shape { dim: 0 dim: 2 dim: -1 dim: 0 } } 446 | } 447 | 448 | #------------ Anchor to Proposal ------------ 449 | 450 | layer { 451 | name: "rpn_cls_prob" 452 | type: "Softmax" 453 | bottom: "rpn_cls_score_reshape" 454 | top: "rpn_cls_prob" 455 | } 456 | layer { 457 | name: 'rpn_cls_prob_reshape' 458 | type: 'Reshape' 459 | bottom: 'rpn_cls_prob' 460 | top: 'rpn_cls_prob_reshape' 461 | reshape_param { shape { dim: 0 dim: 18 dim: -1 dim: 0 } } 462 | } 463 | layer { 464 | name: 'proposal' 465 | type: 'Python' 466 | bottom: 'rpn_cls_prob_reshape' 467 | bottom: 'rpn_bbox_pred' 468 | bottom: 
'im_info' 469 | top: 'rois' 470 | python_param { 471 | module: 'pylayer.proposal_layer' 472 | layer: 'ProposalLayer' 473 | param_str: "{'feat_stride': 16, 'gradient_scale': 1}" 474 | } 475 | } 476 | 477 | #------------ Roi Warping ------------ 478 | 479 | layer { 480 | name: "roi_pool5" 481 | type: "ROIWarping" 482 | bottom: "conv5_3" 483 | bottom: "rois" 484 | top: "pool5" 485 | roi_warping_param { 486 | pooled_w: 7 487 | pooled_h: 7 488 | spatial_scale: 0.0625 # 1/16 489 | } 490 | } 491 | layer { 492 | name: "fc6" 493 | type: "InnerProduct" 494 | bottom: "pool5" 495 | top: "fc6" 496 | param { 497 | lr_mult: 1 498 | decay_mult: 1 499 | } 500 | param { 501 | lr_mult: 2 502 | decay_mult: 0 503 | } 504 | inner_product_param { 505 | num_output: 4096 506 | } 507 | } 508 | layer { 509 | name: "relu6" 510 | type: "ReLU" 511 | bottom: "fc6" 512 | top: "fc6" 513 | } 514 | layer { 515 | name: "drop6" 516 | type: "Dropout" 517 | bottom: "fc6" 518 | top: "fc6" 519 | dropout_param { 520 | dropout_ratio: 0.5 521 | } 522 | } 523 | layer { 524 | name: "fc7" 525 | type: "InnerProduct" 526 | bottom: "fc6" 527 | top: "fc7" 528 | param { 529 | lr_mult: 1 530 | decay_mult: 1 531 | } 532 | param { 533 | lr_mult: 2 534 | decay_mult: 0 535 | } 536 | inner_product_param { 537 | num_output: 4096 538 | } 539 | } 540 | layer { 541 | name: "relu7" 542 | type: "ReLU" 543 | bottom: "fc7" 544 | top: "fc7" 545 | } 546 | layer { 547 | name: "drop7" 548 | type: "Dropout" 549 | bottom: "fc7" 550 | top: "fc7" 551 | dropout_param { 552 | dropout_ratio: 0.5 553 | } 554 | } 555 | 556 | #----- Classification ----- 557 | 558 | layer { 559 | name: "cls_score" 560 | type: "InnerProduct" 561 | bottom: "fc7" 562 | top: "cls_score" 563 | param { 564 | lr_mult: 1 565 | decay_mult: 1 566 | } 567 | param { 568 | lr_mult: 2 569 | decay_mult: 0 570 | } 571 | inner_product_param { 572 | num_output: 21 573 | weight_filler { 574 | type: "gaussian" 575 | std: 0.01 576 | } 577 | bias_filler { 578 | type: "constant" 579 | value: 0 580 | } 581 | } 582 | } 583 | 584 | layer { 585 | name: "cls_prob" 586 | type: "Softmax" 587 | bottom: "cls_score" 588 | top: "cls_prob" 589 | } 590 | 591 | #----- Bounding-box Regression ----- 592 | 593 | layer { 594 | name: "bbox_pred" 595 | type: "InnerProduct" 596 | bottom: "fc7" 597 | top: "bbox_pred" 598 | param { 599 | lr_mult: 1 600 | decay_mult: 1 601 | } 602 | param { 603 | lr_mult: 2 604 | decay_mult: 0 605 | } 606 | inner_product_param { 607 | num_output: 84 608 | weight_filler { 609 | type: "gaussian" 610 | std: 0.001 611 | } 612 | bias_filler { 613 | type: "constant" 614 | value: 0 615 | } 616 | } 617 | } 618 | --------------------------------------------------------------------------------