├── .gitignore ├── README.md ├── __init__.py ├── lib ├── Makefile ├── __init__.py ├── datasets │ ├── VRD_loader.py │ ├── __init__.py │ ├── sVG_loader.py │ └── visual_genome_loader.py ├── fast_rcnn │ ├── __init__.py │ ├── bbox_transform.py │ ├── config.py │ ├── config.py.FN │ ├── config.py.hdn │ ├── config2.py │ └── nms_wrapper.py ├── layer_utils │ ├── __init__.py │ ├── anchor_target_layer.py │ ├── csrc │ │ ├── ROIAlign.h │ │ ├── ROIPool.h │ │ ├── cpu │ │ │ ├── ROIAlign_cpu.cpp │ │ │ ├── nms_cpu.cpp │ │ │ └── vision.h │ │ ├── cuda │ │ │ ├── ROIAlign_cuda.cu │ │ │ ├── ROIPool_cuda.cu │ │ │ ├── nms.cu │ │ │ └── vision.h │ │ ├── nms.h │ │ └── vision.cpp │ ├── generate_anchors.py │ ├── proposal_layer.py │ ├── proposal_target_layer.py │ ├── proposal_top_layer.py │ ├── roi_layers │ │ ├── __init__.py │ │ ├── nms.py │ │ ├── roi_align.py │ │ └── roi_pool.py │ └── snippets.py ├── network.py ├── nms │ ├── .gitignore │ ├── __init__.py │ ├── _ext │ │ ├── __init__.py │ │ └── nms │ │ │ └── __init__.py │ ├── build.py │ ├── make.sh │ ├── nms_gpu.py │ ├── nms_kernel.cu │ ├── nms_retain_all.pyx │ └── src │ │ ├── nms_cuda.c │ │ ├── nms_cuda.h │ │ ├── nms_cuda_kernel.cu │ │ └── nms_cuda_kernel.h ├── pycocotools │ ├── UPSTREAM_REV │ ├── __init__.py │ ├── _mask.c │ ├── _mask.pyx │ ├── coco.py │ ├── cocoeval.py │ ├── license.txt │ ├── mask.py │ ├── maskApi.c │ └── maskApi.h ├── rpn_msr │ ├── __init__.py │ ├── anchor_target_layer.py │ ├── generate.py │ ├── generate_anchors.py │ ├── proposal_layer.py │ ├── proposal_target_layer.py │ ├── proposal_target_layer_FN.py │ ├── proposal_target_layer_hdn_v0.py │ ├── proposal_target_layer_hdn_v1.py │ └── proposal_target_layer_hdn_v2.py ├── setup_cuda.py ├── setup_cython.py ├── utils │ ├── .gitignore │ ├── FN_utils.py │ ├── HDN_utils.py │ ├── __init__.py │ ├── bbox.pyx │ ├── blob.py │ ├── boxes_grid.py │ ├── general_utils.py │ ├── logger.py │ ├── metrics.py │ ├── nms.py │ ├── nms.pyx │ ├── proposal_target_layer_v0.py │ ├── proposal_target_layer_v1.py │ ├── proposal_target_layer_v2.py │ ├── proposal_target_layer_v3.py │ ├── timer.py │ └── voc_eval.py └── visualize_graph │ ├── __init__.py │ ├── vis_utils.py │ └── visualize.py ├── models ├── HDN_v2 │ ├── __init__.py │ ├── criteria.py │ ├── engines_v1.py │ ├── factorizable_network_v4.py │ ├── factorizable_network_v4s.py │ └── utils.py ├── RPN │ ├── RPN.py │ ├── RPN_region.py │ ├── __init__.py │ └── utils.py ├── __init__.py ├── modules │ ├── NMS.py │ ├── __init__.py │ ├── dataParallel.py │ ├── factor_updating_structure.py │ ├── factor_updating_structure_v3.py │ ├── factor_updating_structure_v3r.py │ ├── geometry_transform.py │ ├── phrase_inference_structure.py │ └── relation_module.py └── utils │ ├── __init__.py │ └── vgg16.py ├── options ├── RPN │ ├── RPN_FN.yaml │ ├── RPN_FN_VRD.yaml │ └── RPN_FN_svg.yaml ├── __init__.py ├── config_FN.py ├── data.yaml ├── data_VRD.yaml ├── data_sVG.yaml └── models │ ├── VG-DR-Net.yaml │ ├── VG-MSDN.yaml │ └── VRD.yaml ├── requirements.txt ├── scripts ├── collect_samples.py ├── preprocessing_data.py ├── setup_eval.sh ├── voc_converter.py └── voc_converter_vg.py ├── train.sh ├── train_FN.py ├── train_rpn.py ├── train_rpn_VRD.py ├── visualize_graph.py └── visualize_gt_graphs.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### Python template 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | demo.py 10 | *.o 11 | *.so 12 | 13 | # 
Distribution / packaging 14 | .Python 15 | env/ 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *,cover 49 | .hypothesis/ 50 | 51 | # Translations 52 | *.mo *.pot # Django stuff: *.log 53 | local_settings.py 54 | 55 | # Flask stuff: 56 | instance/ 57 | .webassets-cache 58 | 59 | # Scrapy stuff: 60 | .scrapy 61 | 62 | # Sphinx documentation 63 | docs/_build/ 64 | 65 | # PyBuilder 66 | target/ 67 | 68 | # Jupyter Notebook 69 | .ipynb_checkpoints 70 | 71 | # pyenv 72 | .python-version 73 | 74 | # celery beat schedule file 75 | celerybeat-schedule 76 | 77 | # dotenv 78 | .env 79 | 80 | # virtualenv 81 | .venv/ 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | 91 | .idea 92 | extension-ffi 93 | demo_mot.py 94 | 95 | 96 | # defined by yikang 97 | model/ 98 | data/ 99 | data 100 | model 101 | output 102 | output/ 103 | log/ 104 | *.mat 105 | Debug_Code.ipynb 106 | eval/*.json 107 | *.DS_Store 108 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yikang-li/FactorizableNet/1e3e1813630d8d647ab0f01bce7782198272e2e8/__init__.py -------------------------------------------------------------------------------- /lib/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | python setup_cuda.py build develop 3 | python setup_cython.py build_ext --inplace 4 | 5 | clean: 6 | rm -rf nms/*.c nms/*.cpp nms/*.so 7 | rm -rf utils/*.c utils/*.cpp utils/*.so 8 | -------------------------------------------------------------------------------- /lib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yikang-li/FactorizableNet/1e3e1813630d8d647ab0f01bce7782198272e2e8/lib/__init__.py -------------------------------------------------------------------------------- /lib/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | # TODO: make this fold self-contained, only depends on utils package 9 | 10 | from .VRD_loader import VRD 11 | from .visual_genome_loader import visual_genome 12 | from .sVG_loader import sVG 13 | -------------------------------------------------------------------------------- /lib/datasets/sVG_loader.py: -------------------------------------------------------------------------------- 1 | from .VRD_loader import VRD 2 | import os.path as osp 3 | 4 | class sVG(VRD): 5 | def __init__(self, opts, image_set='train', batch_size=1, dataset_option=None, 
use_region=False): 6 | image_set = image_set + '_' + dataset_option 7 | super(sVG, self).__init__(opts, image_set, batch_size, dataset_option, use_region) 8 | self._data_path = osp.join(self.opts['dir'], 'images') 9 | -------------------------------------------------------------------------------- /lib/fast_rcnn/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | from . import config 9 | from . import nms_wrapper 10 | # from nms_wrapper import nms -------------------------------------------------------------------------------- /lib/fast_rcnn/bbox_transform.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | from sympy.physics.paulialgebra import delta 10 | from config import cfg 11 | 12 | np.seterr(all='warn') 13 | 14 | def bbox_transform(ex_rois, gt_rois): 15 | 16 | # print 'ex_rois', ex_rois 17 | # print 'gt_rois', gt_rois 18 | 19 | 20 | ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + 1.0 21 | ex_heights = ex_rois[:, 3] - ex_rois[:, 1] + 1.0 22 | ex_ctr_x = ex_rois[:, 0] + 0.5 * ex_widths 23 | ex_ctr_y = ex_rois[:, 1] + 0.5 * ex_heights 24 | 25 | # print 'ex_widths', ex_widths 26 | # print 'ex_heights', ex_heights 27 | # print 'ex_ctr_x', ex_ctr_x 28 | # print 'ex_ctr_y', ex_ctr_y 29 | 30 | 31 | gt_widths = gt_rois[:, 2] - gt_rois[:, 0] + 1.0 32 | gt_heights = gt_rois[:, 3] - gt_rois[:, 1] + 1.0 33 | gt_ctr_x = gt_rois[:, 0] + 0.5 * gt_widths 34 | gt_ctr_y = gt_rois[:, 1] + 0.5 * gt_heights 35 | 36 | 37 | # print 'gt_widths', gt_widths 38 | # print 'gt_heights', gt_heights 39 | # print 'gt_ctr_x', gt_ctr_x 40 | # print 'gt_ctr_y', gt_ctr_y 41 | 42 | 43 | targets_dx = (gt_ctr_x - ex_ctr_x) / ex_widths 44 | targets_dy = (gt_ctr_y - ex_ctr_y) / ex_heights 45 | targets_dw = np.log(gt_widths / ex_widths) 46 | targets_dh = np.log(gt_heights / ex_heights) 47 | 48 | # print 'targets_dx', targets_dx.mean(), targets_dx.std() 49 | # print 'targets_dy', targets_dy.mean(), targets_dy.std() 50 | # print 'targets_dw', targets_dw.mean(), targets_dw.std() 51 | # print 'targets_dh', targets_dh.mean(), targets_dh.std() 52 | 53 | 54 | targets = np.vstack( 55 | (targets_dx, targets_dy, targets_dw, targets_dh)).transpose() 56 | 57 | 58 | if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED: 59 | targets = ((targets - np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS)) 60 | / np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS)) 61 | 62 | # print 'targets_dx(normalized)', targets[:, 0].mean(), targets[:, 0].std() 63 | # print 'targets_dy(normalized)', targets[:, 1].mean(), targets[:, 1].std() 64 | # print 'targets_dw(normalized)', targets[:, 2].mean(), targets[:, 2].std() 65 | # print 'targets_dh(normalized)', targets[:, 3].mean(), targets[:, 3].std() 66 | 67 | return targets 68 | 69 | 70 | def bbox_transform_inv(boxes, deltas): 71 | return bbox_transform_inv_hdn(boxes, deltas) 72 | 73 | 74 | def bbox_transform_inv_hdn(boxes, deltas): 75 | if boxes.shape[0] == 0: 76 | return np.zeros((0,), 
dtype=deltas.dtype) 77 | 78 | boxes = boxes.astype(deltas.dtype, copy=False) 79 | 80 | widths = boxes[:, 2] - boxes[:, 0] + 1.0 81 | heights = boxes[:, 3] - boxes[:, 1] + 1.0 82 | ctr_x = boxes[:, 0] + 0.5 * widths 83 | ctr_y = boxes[:, 1] + 0.5 * heights 84 | 85 | if cfg.TRAIN.BBOX_NORMALIZE_TARGETS_PRECOMPUTED: 86 | deltas = deltas * np.array(cfg.TRAIN.BBOX_NORMALIZE_STDS) + np.array(cfg.TRAIN.BBOX_NORMALIZE_MEANS) 87 | 88 | dx = deltas[:, 0::4] 89 | dy = deltas[:, 1::4] 90 | dw = deltas[:, 2::4] 91 | dh = deltas[:, 3::4] 92 | 93 | pred_ctr_x = dx * widths[:, np.newaxis] + ctr_x[:, np.newaxis] 94 | pred_ctr_y = dy * heights[:, np.newaxis] + ctr_y[:, np.newaxis] 95 | pred_w = np.exp(dw) * widths[:, np.newaxis] 96 | pred_h = np.exp(dh) * heights[:, np.newaxis] 97 | 98 | pred_boxes = np.zeros(deltas.shape, dtype=deltas.dtype) 99 | # x1 100 | pred_boxes[:, 0::4] = pred_ctr_x - 0.5 * pred_w 101 | # y1 102 | pred_boxes[:, 1::4] = pred_ctr_y - 0.5 * pred_h 103 | # x2 104 | pred_boxes[:, 2::4] = pred_ctr_x + 0.5 * pred_w - 1.0 105 | # y2 106 | pred_boxes[:, 3::4] = pred_ctr_y + 0.5 * pred_h - 1.0 107 | 108 | return pred_boxes 109 | 110 | 111 | def clip_boxes(boxes, im_shape): 112 | """ 113 | Clip boxes to image boundaries. 114 | """ 115 | if boxes.shape[0] == 0: 116 | return boxes 117 | 118 | # x1 >= 0 119 | boxes[:, 0::4] = np.maximum(np.minimum(boxes[:, 0::4], im_shape[1] - 1), 0) 120 | # y1 >= 0 121 | boxes[:, 1::4] = np.maximum(np.minimum(boxes[:, 1::4], im_shape[0] - 1), 0) 122 | # x2 < im_shape[1] 123 | boxes[:, 2::4] = np.maximum(np.minimum(boxes[:, 2::4], im_shape[1] - 1), 0) 124 | # y2 < im_shape[0] 125 | boxes[:, 3::4] = np.maximum(np.minimum(boxes[:, 3::4], im_shape[0] - 1), 0) 126 | return boxes 127 | -------------------------------------------------------------------------------- /lib/fast_rcnn/nms_wrapper.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | from lib.layer_utils.roi_layers import nms as nms_gpu 9 | from lib.nms.nms_retain_all import nms_retain_all 10 | import torch 11 | # from ..nms import cpu_nms 12 | # from ..nms import gpu_nms 13 | 14 | def nms(dets, thresh, retain_all=False): 15 | """Dispatch to either CPU or GPU NMS implementations.""" 16 | 17 | if dets.shape[0] == 0: 18 | return [] 19 | # ---numpy version--- 20 | # original: return gpu_nms(dets, thresh, device_id=cfg.GPU_ID) 21 | # ---pytorch version--- 22 | if retain_all: 23 | return nms_retain_all(dets, thresh) 24 | else: 25 | dets = torch.Tensor(dets).cuda() 26 | return nms_gpu(dets[:, :4], dets[:, 4], thresh).cpu().numpy() -------------------------------------------------------------------------------- /lib/layer_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yikang-li/FactorizableNet/1e3e1813630d8d647ab0f01bce7782198272e2e8/lib/layer_utils/__init__.py -------------------------------------------------------------------------------- /lib/layer_utils/anchor_target_layer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE 
for details] 5 | # Written by Ross Girshick and Xinlei Chen 6 | # -------------------------------------------------------- 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import os 12 | from model.config import cfg 13 | import numpy as np 14 | import numpy.random as npr 15 | from utils.bbox import bbox_overlaps 16 | from model.bbox_transform import bbox_transform 17 | import torch 18 | 19 | 20 | def anchor_target_layer(rpn_cls_score, gt_boxes, im_info, _feat_stride, 21 | all_anchors, num_anchors): 22 | """Same as the anchor target layer in original Fast/er RCNN """ 23 | A = num_anchors 24 | total_anchors = all_anchors.shape[0] 25 | K = total_anchors / num_anchors 26 | 27 | # allow boxes to sit over the edge by a small amount 28 | _allowed_border = 0 29 | 30 | # map of shape (..., H, W) 31 | height, width = rpn_cls_score.shape[1:3] 32 | 33 | # only keep anchors inside the image 34 | inds_inside = np.where( 35 | (all_anchors[:, 0] >= -_allowed_border) & 36 | (all_anchors[:, 1] >= -_allowed_border) & 37 | (all_anchors[:, 2] < im_info[1] + _allowed_border) & # width 38 | (all_anchors[:, 3] < im_info[0] + _allowed_border) # height 39 | )[0] 40 | 41 | # keep only inside anchors 42 | anchors = all_anchors[inds_inside, :] 43 | 44 | # label: 1 is positive, 0 is negative, -1 is dont care 45 | labels = np.empty((len(inds_inside), ), dtype=np.float32) 46 | labels.fill(-1) 47 | 48 | # overlaps between the anchors and the gt boxes 49 | # overlaps (ex, gt) 50 | overlaps = bbox_overlaps( 51 | np.ascontiguousarray(anchors, dtype=np.float), 52 | np.ascontiguousarray(gt_boxes, dtype=np.float)) 53 | argmax_overlaps = overlaps.argmax(axis=1) 54 | max_overlaps = overlaps[np.arange(len(inds_inside)), argmax_overlaps] 55 | gt_argmax_overlaps = overlaps.argmax(axis=0) 56 | gt_max_overlaps = overlaps[gt_argmax_overlaps, 57 | np.arange(overlaps.shape[1])] 58 | gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0] 59 | 60 | if not cfg.TRAIN.RPN_CLOBBER_POSITIVES: 61 | # assign bg labels first so that positive labels can clobber them 62 | # first set the negatives 63 | labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0 64 | 65 | # fg label: for each gt, anchor with highest overlap 66 | labels[gt_argmax_overlaps] = 1 67 | 68 | # fg label: above threshold IOU 69 | labels[max_overlaps >= cfg.TRAIN.RPN_POSITIVE_OVERLAP] = 1 70 | 71 | if cfg.TRAIN.RPN_CLOBBER_POSITIVES: 72 | # assign bg labels last so that negative labels can clobber positives 73 | labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0 74 | 75 | # subsample positive labels if we have too many 76 | num_fg = int(cfg.TRAIN.RPN_FG_FRACTION * cfg.TRAIN.RPN_BATCHSIZE) 77 | fg_inds = np.where(labels == 1)[0] 78 | if len(fg_inds) > num_fg: 79 | disable_inds = npr.choice( 80 | fg_inds, size=(len(fg_inds) - num_fg), replace=False) 81 | labels[disable_inds] = -1 82 | 83 | # subsample negative labels if we have too many 84 | num_bg = cfg.TRAIN.RPN_BATCHSIZE - np.sum(labels == 1) 85 | bg_inds = np.where(labels == 0)[0] 86 | if len(bg_inds) > num_bg: 87 | disable_inds = npr.choice( 88 | bg_inds, size=(len(bg_inds) - num_bg), replace=False) 89 | labels[disable_inds] = -1 90 | 91 | bbox_targets = np.zeros((len(inds_inside), 4), dtype=np.float32) 92 | bbox_targets = _compute_targets(anchors, gt_boxes[argmax_overlaps, :]) 93 | 94 | bbox_inside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32) 95 | # only the positive ones have regression targets 96 | 
bbox_inside_weights[labels == 1, :] = np.array( 97 | cfg.TRAIN.RPN_BBOX_INSIDE_WEIGHTS) 98 | 99 | bbox_outside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32) 100 | if cfg.TRAIN.RPN_POSITIVE_WEIGHT < 0: 101 | # uniform weighting of examples (given non-uniform sampling) 102 | num_examples = np.sum(labels >= 0) 103 | positive_weights = np.ones((1, 4)) * 1.0 / num_examples 104 | negative_weights = np.ones((1, 4)) * 1.0 / num_examples 105 | else: 106 | assert ((cfg.TRAIN.RPN_POSITIVE_WEIGHT > 0) & 107 | (cfg.TRAIN.RPN_POSITIVE_WEIGHT < 1)) 108 | positive_weights = ( 109 | cfg.TRAIN.RPN_POSITIVE_WEIGHT / np.sum(labels == 1)) 110 | negative_weights = ( 111 | (1.0 - cfg.TRAIN.RPN_POSITIVE_WEIGHT) / np.sum(labels == 0)) 112 | bbox_outside_weights[labels == 1, :] = positive_weights 113 | bbox_outside_weights[labels == 0, :] = negative_weights 114 | 115 | # map up to original set of anchors 116 | labels = _unmap(labels, total_anchors, inds_inside, fill=-1) 117 | bbox_targets = _unmap(bbox_targets, total_anchors, inds_inside, fill=0) 118 | bbox_inside_weights = _unmap( 119 | bbox_inside_weights, total_anchors, inds_inside, fill=0) 120 | bbox_outside_weights = _unmap( 121 | bbox_outside_weights, total_anchors, inds_inside, fill=0) 122 | 123 | # labels 124 | labels = labels.reshape((1, height, width, A)).transpose(0, 3, 1, 2) 125 | labels = labels.reshape((1, 1, A * height, width)) 126 | rpn_labels = labels 127 | 128 | # bbox_targets 129 | bbox_targets = bbox_targets \ 130 | .reshape((1, height, width, A * 4)) 131 | 132 | rpn_bbox_targets = bbox_targets 133 | # bbox_inside_weights 134 | bbox_inside_weights = bbox_inside_weights \ 135 | .reshape((1, height, width, A * 4)) 136 | 137 | rpn_bbox_inside_weights = bbox_inside_weights 138 | 139 | # bbox_outside_weights 140 | bbox_outside_weights = bbox_outside_weights \ 141 | .reshape((1, height, width, A * 4)) 142 | 143 | rpn_bbox_outside_weights = bbox_outside_weights 144 | return rpn_labels, rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights 145 | 146 | 147 | def _unmap(data, count, inds, fill=0): 148 | """ Unmap a subset of item (data) back to the original set of items (of 149 | size count) """ 150 | if len(data.shape) == 1: 151 | ret = np.empty((count, ), dtype=np.float32) 152 | ret.fill(fill) 153 | ret[inds] = data 154 | else: 155 | ret = np.empty((count, ) + data.shape[1:], dtype=np.float32) 156 | ret.fill(fill) 157 | ret[inds, :] = data 158 | return ret 159 | 160 | 161 | def _compute_targets(ex_rois, gt_rois): 162 | """Compute bounding-box regression targets for an image.""" 163 | 164 | assert ex_rois.shape[0] == gt_rois.shape[0] 165 | assert ex_rois.shape[1] == 4 166 | assert gt_rois.shape[1] == 5 167 | 168 | return bbox_transform( 169 | torch.from_numpy(ex_rois), torch.from_numpy(gt_rois[:, :4])).numpy() -------------------------------------------------------------------------------- /lib/layer_utils/csrc/ROIAlign.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
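// Dispatch summary: ROIAlign_forward routes to the CUDA implementation when
// the input tensor lives on the GPU and the extension was built WITH_CUDA,
// falling back to ROIAlign_forward_cpu otherwise; the backward pass is
// implemented only for CUDA and raises AT_ERROR when called with CPU tensors.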
2 | #pragma once
3 | 
4 | #include "cpu/vision.h"
5 | 
6 | #ifdef WITH_CUDA
7 | #include "cuda/vision.h"
8 | #endif
9 | 
10 | // Interface for Python
11 | at::Tensor ROIAlign_forward(const at::Tensor& input,
12 |                             const at::Tensor& rois,
13 |                             const float spatial_scale,
14 |                             const int pooled_height,
15 |                             const int pooled_width,
16 |                             const int sampling_ratio) {
17 |   if (input.type().is_cuda()) {
18 | #ifdef WITH_CUDA
19 |     return ROIAlign_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio);
20 | #else
21 |     AT_ERROR("Not compiled with GPU support");
22 | #endif
23 |   }
24 |   return ROIAlign_forward_cpu(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio);
25 | }
26 | 
27 | at::Tensor ROIAlign_backward(const at::Tensor& grad,
28 |                              const at::Tensor& rois,
29 |                              const float spatial_scale,
30 |                              const int pooled_height,
31 |                              const int pooled_width,
32 |                              const int batch_size,
33 |                              const int channels,
34 |                              const int height,
35 |                              const int width,
36 |                              const int sampling_ratio) {
37 |   if (grad.type().is_cuda()) {
38 | #ifdef WITH_CUDA
39 |     return ROIAlign_backward_cuda(grad, rois, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width, sampling_ratio);
40 | #else
41 |     AT_ERROR("Not compiled with GPU support");
42 | #endif
43 |   }
44 |   AT_ERROR("Not implemented on the CPU");
45 | }
46 | 
47 | 
-------------------------------------------------------------------------------- /lib/layer_utils/csrc/ROIPool.h: --------------------------------------------------------------------------------
1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2 | #pragma once
3 | 
4 | #include "cpu/vision.h"
5 | 
6 | #ifdef WITH_CUDA
7 | #include "cuda/vision.h"
8 | #endif
9 | 
10 | 
11 | std::tuple<at::Tensor, at::Tensor> ROIPool_forward(const at::Tensor& input,
12 |                                                    const at::Tensor& rois,
13 |                                                    const float spatial_scale,
14 |                                                    const int pooled_height,
15 |                                                    const int pooled_width) {
16 |   if (input.type().is_cuda()) {
17 | #ifdef WITH_CUDA
18 |     return ROIPool_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width);
19 | #else
20 |     AT_ERROR("Not compiled with GPU support");
21 | #endif
22 |   }
23 |   AT_ERROR("Not implemented on the CPU");
24 | }
25 | 
26 | at::Tensor ROIPool_backward(const at::Tensor& grad,
27 |                             const at::Tensor& input,
28 |                             const at::Tensor& rois,
29 |                             const at::Tensor& argmax,
30 |                             const float spatial_scale,
31 |                             const int pooled_height,
32 |                             const int pooled_width,
33 |                             const int batch_size,
34 |                             const int channels,
35 |                             const int height,
36 |                             const int width) {
37 |   if (grad.type().is_cuda()) {
38 | #ifdef WITH_CUDA
39 |     return ROIPool_backward_cuda(grad, input, rois, argmax, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width);
40 | #else
41 |     AT_ERROR("Not compiled with GPU support");
42 | #endif
43 |   }
44 |   AT_ERROR("Not implemented on the CPU");
45 | }
46 | 
47 | 
48 | 
49 | 
-------------------------------------------------------------------------------- /lib/layer_utils/csrc/cpu/nms_cpu.cpp: --------------------------------------------------------------------------------
1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
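// Greedy CPU NMS: boxes are visited in descending score order, and each
// surviving box suppresses every later box whose IoU with it reaches the
// threshold; the returned tensor holds the indices of the kept boxes.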
2 | #include "cpu/vision.h" 3 | 4 | 5 | template 6 | at::Tensor nms_cpu_kernel(const at::Tensor& dets, 7 | const at::Tensor& scores, 8 | const float threshold) { 9 | AT_ASSERTM(!dets.type().is_cuda(), "dets must be a CPU tensor"); 10 | AT_ASSERTM(!scores.type().is_cuda(), "scores must be a CPU tensor"); 11 | AT_ASSERTM(dets.type() == scores.type(), "dets should have the same type as scores"); 12 | 13 | if (dets.numel() == 0) { 14 | return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)); 15 | } 16 | 17 | auto x1_t = dets.select(1, 0).contiguous(); 18 | auto y1_t = dets.select(1, 1).contiguous(); 19 | auto x2_t = dets.select(1, 2).contiguous(); 20 | auto y2_t = dets.select(1, 3).contiguous(); 21 | 22 | at::Tensor areas_t = (x2_t - x1_t + 1) * (y2_t - y1_t + 1); 23 | 24 | auto order_t = std::get<1>(scores.sort(0, /* descending=*/true)); 25 | 26 | auto ndets = dets.size(0); 27 | at::Tensor suppressed_t = at::zeros({ndets}, dets.options().dtype(at::kByte).device(at::kCPU)); 28 | 29 | auto suppressed = suppressed_t.data(); 30 | auto order = order_t.data(); 31 | auto x1 = x1_t.data(); 32 | auto y1 = y1_t.data(); 33 | auto x2 = x2_t.data(); 34 | auto y2 = y2_t.data(); 35 | auto areas = areas_t.data(); 36 | 37 | for (int64_t _i = 0; _i < ndets; _i++) { 38 | auto i = order[_i]; 39 | if (suppressed[i] == 1) 40 | continue; 41 | auto ix1 = x1[i]; 42 | auto iy1 = y1[i]; 43 | auto ix2 = x2[i]; 44 | auto iy2 = y2[i]; 45 | auto iarea = areas[i]; 46 | 47 | for (int64_t _j = _i + 1; _j < ndets; _j++) { 48 | auto j = order[_j]; 49 | if (suppressed[j] == 1) 50 | continue; 51 | auto xx1 = std::max(ix1, x1[j]); 52 | auto yy1 = std::max(iy1, y1[j]); 53 | auto xx2 = std::min(ix2, x2[j]); 54 | auto yy2 = std::min(iy2, y2[j]); 55 | 56 | auto w = std::max(static_cast(0), xx2 - xx1 + 1); 57 | auto h = std::max(static_cast(0), yy2 - yy1 + 1); 58 | auto inter = w * h; 59 | auto ovr = inter / (iarea + areas[j] - inter); 60 | if (ovr >= threshold) 61 | suppressed[j] = 1; 62 | } 63 | } 64 | return at::nonzero(suppressed_t == 0).squeeze(1); 65 | } 66 | 67 | at::Tensor nms_cpu(const at::Tensor& dets, 68 | const at::Tensor& scores, 69 | const float threshold) { 70 | at::Tensor result; 71 | AT_DISPATCH_FLOATING_TYPES(dets.type(), "nms", [&] { 72 | result = nms_cpu_kernel(dets, scores, threshold); 73 | }); 74 | return result; 75 | } 76 | -------------------------------------------------------------------------------- /lib/layer_utils/csrc/cpu/vision.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | #pragma once 3 | #include 4 | 5 | 6 | at::Tensor ROIAlign_forward_cpu(const at::Tensor& input, 7 | const at::Tensor& rois, 8 | const float spatial_scale, 9 | const int pooled_height, 10 | const int pooled_width, 11 | const int sampling_ratio); 12 | 13 | 14 | at::Tensor nms_cpu(const at::Tensor& dets, 15 | const at::Tensor& scores, 16 | const float threshold); 17 | -------------------------------------------------------------------------------- /lib/layer_utils/csrc/cuda/nms.cu: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
2 | #include <ATen/ATen.h>
3 | #include <ATen/cuda/CUDAContext.h>
4 | 
5 | #include <THC/THC.h>
6 | #include <THC/THCDeviceUtils.cuh>
7 | 
8 | #include <vector>
9 | #include <iostream>
10 | 
11 | int const threadsPerBlock = sizeof(unsigned long long) * 8;
12 | 
13 | __device__ inline float devIoU(float const * const a, float const * const b) {
14 |   float left = max(a[0], b[0]), right = min(a[2], b[2]);
15 |   float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
16 |   float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
17 |   float interS = width * height;
18 |   float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
19 |   float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
20 |   return interS / (Sa + Sb - interS);
21 | }
22 | 
23 | __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
24 |                            const float *dev_boxes, unsigned long long *dev_mask) {
25 |   const int row_start = blockIdx.y;
26 |   const int col_start = blockIdx.x;
27 | 
28 |   // if (row_start > col_start) return;
29 | 
30 |   const int row_size =
31 |         min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
32 |   const int col_size =
33 |         min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
34 | 
35 |   __shared__ float block_boxes[threadsPerBlock * 5];
36 |   if (threadIdx.x < col_size) {
37 |     block_boxes[threadIdx.x * 5 + 0] =
38 |         dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0];
39 |     block_boxes[threadIdx.x * 5 + 1] =
40 |         dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1];
41 |     block_boxes[threadIdx.x * 5 + 2] =
42 |         dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2];
43 |     block_boxes[threadIdx.x * 5 + 3] =
44 |         dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3];
45 |     block_boxes[threadIdx.x * 5 + 4] =
46 |         dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4];
47 |   }
48 |   __syncthreads();
49 | 
50 |   if (threadIdx.x < row_size) {
51 |     const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
52 |     const float *cur_box = dev_boxes + cur_box_idx * 5;
53 |     int i = 0;
54 |     unsigned long long t = 0;
55 |     int start = 0;
56 |     if (row_start == col_start) {
57 |       start = threadIdx.x + 1;
58 |     }
59 |     for (i = start; i < col_size; i++) {
60 |       if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
61 |         t |= 1ULL << i;
62 |       }
63 |     }
64 |     const int col_blocks = THCCeilDiv(n_boxes, threadsPerBlock);
65 |     dev_mask[cur_box_idx * col_blocks + col_start] = t;
66 |   }
67 | }
68 | 
69 | // boxes is a N x 5 tensor
70 | at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh) {
71 |   using scalar_t = float;
72 |   AT_ASSERTM(boxes.type().is_cuda(), "boxes must be a CUDA tensor");
73 |   auto scores = boxes.select(1, 4);
74 |   auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
75 |   auto boxes_sorted = boxes.index_select(0, order_t);
76 | 
77 |   int boxes_num = boxes.size(0);
78 | 
79 |   const int col_blocks = THCCeilDiv(boxes_num, threadsPerBlock);
80 | 
81 |   scalar_t* boxes_dev = boxes_sorted.data<scalar_t>();
82 | 
83 |   THCState *state = at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState
84 | 
85 |   unsigned long long* mask_dev = NULL;
86 |   //THCudaCheck(THCudaMalloc(state, (void**) &mask_dev,
87 |   //                      boxes_num * col_blocks * sizeof(unsigned long long)));
88 | 
89 |   mask_dev = (unsigned long long*) THCudaMalloc(state, boxes_num * col_blocks * sizeof(unsigned long long));
90 | 
91 |   dim3 blocks(THCCeilDiv(boxes_num, threadsPerBlock),
92 |               THCCeilDiv(boxes_num, threadsPerBlock));
93 |   dim3 threads(threadsPerBlock);
94 |   nms_kernel<<<blocks, threads>>>(boxes_num,
95 |                                   nms_overlap_thresh,
96 |                                   boxes_dev,
97 |                                   mask_dev);
98 | 
99 |   std::vector<unsigned long long> mask_host(boxes_num * col_blocks);
100 |   THCudaCheck(cudaMemcpy(&mask_host[0],
101 |                         mask_dev,
102 |                         sizeof(unsigned long long) * boxes_num * col_blocks,
103 |                         cudaMemcpyDeviceToHost));
104 | 
105 |   std::vector<unsigned long long> remv(col_blocks);
106 |   memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
107 | 
108 |   at::Tensor keep = at::empty({boxes_num}, boxes.options().dtype(at::kLong).device(at::kCPU));
109 |   int64_t* keep_out = keep.data<int64_t>();
110 | 
111 |   int num_to_keep = 0;
112 |   for (int i = 0; i < boxes_num; i++) {
113 |     int nblock = i / threadsPerBlock;
114 |     int inblock = i % threadsPerBlock;
115 | 
116 |     if (!(remv[nblock] & (1ULL << inblock))) {
117 |       keep_out[num_to_keep++] = i;
118 |       unsigned long long *p = &mask_host[0] + i * col_blocks;
119 |       for (int j = nblock; j < col_blocks; j++) {
120 |         remv[j] |= p[j];
121 |       }
122 |     }
123 |   }
124 | 
125 |   THCudaFree(state, mask_dev);
126 |   // TODO improve this part
127 |   return std::get<0>(order_t.index({
128 |                        keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep).to(
129 |                          order_t.device(), keep.scalar_type())
130 |                      }).sort(0, false));
131 | }
-------------------------------------------------------------------------------- /lib/layer_utils/csrc/cuda/vision.h: --------------------------------------------------------------------------------
1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
2 | #pragma once
3 | #include <torch/extension.h>
4 | 
5 | 
6 | at::Tensor ROIAlign_forward_cuda(const at::Tensor& input,
7 |                                  const at::Tensor& rois,
8 |                                  const float spatial_scale,
9 |                                  const int pooled_height,
10 |                                  const int pooled_width,
11 |                                  const int sampling_ratio);
12 | 
13 | at::Tensor ROIAlign_backward_cuda(const at::Tensor& grad,
14 |                                   const at::Tensor& rois,
15 |                                   const float spatial_scale,
16 |                                   const int pooled_height,
17 |                                   const int pooled_width,
18 |                                   const int batch_size,
19 |                                   const int channels,
20 |                                   const int height,
21 |                                   const int width,
22 |                                   const int sampling_ratio);
23 | 
24 | 
25 | std::tuple<at::Tensor, at::Tensor> ROIPool_forward_cuda(const at::Tensor& input,
26 |                                                         const at::Tensor& rois,
27 |                                                         const float spatial_scale,
28 |                                                         const int pooled_height,
29 |                                                         const int pooled_width);
30 | 
31 | at::Tensor ROIPool_backward_cuda(const at::Tensor& grad,
32 |                                  const at::Tensor& input,
33 |                                  const at::Tensor& rois,
34 |                                  const at::Tensor& argmax,
35 |                                  const float spatial_scale,
36 |                                  const int pooled_height,
37 |                                  const int pooled_width,
38 |                                  const int batch_size,
39 |                                  const int channels,
40 |                                  const int height,
41 |                                  const int width);
42 | 
43 | at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh);
44 | 
45 | 
46 | at::Tensor compute_flow_cuda(const at::Tensor& boxes,
47 |                              const int height,
48 |                              const int width);
49 | 
-------------------------------------------------------------------------------- /lib/layer_utils/csrc/nms.h: --------------------------------------------------------------------------------
1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
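// Unified NMS entry point exported to Python through vision.cpp: for CUDA
// tensors the scores are concatenated onto the boxes to form the N x 5
// layout that nms_cuda expects, while CPU tensors go through nms_cpu.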
2 | #pragma once 3 | #include "cpu/vision.h" 4 | 5 | #ifdef WITH_CUDA 6 | #include "cuda/vision.h" 7 | #endif 8 | 9 | 10 | at::Tensor nms(const at::Tensor& dets, 11 | const at::Tensor& scores, 12 | const float threshold) { 13 | 14 | if (dets.type().is_cuda()) { 15 | #ifdef WITH_CUDA 16 | // TODO raise error if not compiled with CUDA 17 | if (dets.numel() == 0) 18 | return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU)); 19 | auto b = at::cat({dets, scores.unsqueeze(1)}, 1); 20 | return nms_cuda(b, threshold); 21 | #else 22 | AT_ERROR("Not compiled with GPU support"); 23 | #endif 24 | } 25 | 26 | at::Tensor result = nms_cpu(dets, scores, threshold); 27 | return result; 28 | } 29 | -------------------------------------------------------------------------------- /lib/layer_utils/csrc/vision.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | #include "nms.h" 3 | #include "ROIAlign.h" 4 | #include "ROIPool.h" 5 | 6 | 7 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 8 | m.def("nms", &nms, "non-maximum suppression"); 9 | m.def("roi_align_forward", &ROIAlign_forward, "ROIAlign_forward"); 10 | m.def("roi_align_backward", &ROIAlign_backward, "ROIAlign_backward"); 11 | m.def("roi_pool_forward", &ROIPool_forward, "ROIPool_forward"); 12 | m.def("roi_pool_backward", &ROIPool_backward, "ROIPool_backward"); 13 | } 14 | -------------------------------------------------------------------------------- /lib/layer_utils/generate_anchors.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick and Sean Bell 6 | # -------------------------------------------------------- 7 | from __future__ import absolute_import 8 | from __future__ import division 9 | from __future__ import print_function 10 | 11 | import numpy as np 12 | 13 | # Verify that we compute the same anchors as Shaoqing's matlab implementation: 14 | # 15 | # >> load output/rpn_cachedir/faster_rcnn_VOC2007_ZF_stage1_rpn/anchors.mat 16 | # >> anchors 17 | # 18 | # anchors = 19 | # 20 | # -83 -39 100 56 21 | # -175 -87 192 104 22 | # -359 -183 376 200 23 | # -55 -55 72 72 24 | # -119 -119 136 136 25 | # -247 -247 264 264 26 | # -35 -79 52 96 27 | # -79 -167 96 184 28 | # -167 -343 184 360 29 | 30 | # array([[ -83., -39., 100., 56.], 31 | # [-175., -87., 192., 104.], 32 | # [-359., -183., 376., 200.], 33 | # [ -55., -55., 72., 72.], 34 | # [-119., -119., 136., 136.], 35 | # [-247., -247., 264., 264.], 36 | # [ -35., -79., 52., 96.], 37 | # [ -79., -167., 96., 184.], 38 | # [-167., -343., 184., 360.]]) 39 | 40 | 41 | def generate_anchors(base_size=16, 42 | ratios=[0.5, 1, 2], 43 | scales=2**np.arange(3, 6)): 44 | """ 45 | Generate anchor (reference) windows by enumerating aspect ratios X 46 | scales wrt a reference (0, 0, 15, 15) window. 47 | """ 48 | 49 | base_anchor = np.array([1, 1, base_size, base_size]) - 1 50 | ratio_anchors = _ratio_enum(base_anchor, ratios) 51 | anchors = np.vstack([ 52 | _scale_enum(ratio_anchors[i, :], scales) 53 | for i in range(ratio_anchors.shape[0]) 54 | ]) 55 | return anchors 56 | 57 | 58 | def _whctrs(anchor): 59 | """ 60 | Return width, height, x center, and y center for an anchor (window). 
61 | """ 62 | 63 | w = anchor[2] - anchor[0] + 1 64 | h = anchor[3] - anchor[1] + 1 65 | x_ctr = anchor[0] + 0.5 * (w - 1) 66 | y_ctr = anchor[1] + 0.5 * (h - 1) 67 | return w, h, x_ctr, y_ctr 68 | 69 | 70 | def _mkanchors(ws, hs, x_ctr, y_ctr): 71 | """ 72 | Given a vector of widths (ws) and heights (hs) around a center 73 | (x_ctr, y_ctr), output a set of anchors (windows). 74 | """ 75 | 76 | ws = ws[:, np.newaxis] 77 | hs = hs[:, np.newaxis] 78 | anchors = np.hstack((x_ctr - 0.5 * (ws - 1), y_ctr - 0.5 * (hs - 1), 79 | x_ctr + 0.5 * (ws - 1), y_ctr + 0.5 * (hs - 1))) 80 | return anchors 81 | 82 | 83 | def _ratio_enum(anchor, ratios): 84 | """ 85 | Enumerate a set of anchors for each aspect ratio wrt an anchor. 86 | """ 87 | 88 | w, h, x_ctr, y_ctr = _whctrs(anchor) 89 | size = w * h 90 | size_ratios = size / ratios 91 | ws = np.round(np.sqrt(size_ratios)) 92 | hs = np.round(ws * ratios) 93 | anchors = _mkanchors(ws, hs, x_ctr, y_ctr) 94 | return anchors 95 | 96 | 97 | def _scale_enum(anchor, scales): 98 | """ 99 | Enumerate a set of anchors for each scale wrt an anchor. 100 | """ 101 | 102 | w, h, x_ctr, y_ctr = _whctrs(anchor) 103 | ws = w * scales 104 | hs = h * scales 105 | anchors = _mkanchors(ws, hs, x_ctr, y_ctr) 106 | return anchors 107 | 108 | 109 | if __name__ == '__main__': 110 | import time 111 | 112 | t = time.time() 113 | a = generate_anchors() 114 | print(time.time() - t) 115 | print(a) 116 | from IPython import embed 117 | 118 | embed() 119 | -------------------------------------------------------------------------------- /lib/layer_utils/proposal_layer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Ross Girshick and Xinlei Chen 5 | # -------------------------------------------------------- 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import numpy as np 11 | from model.config import cfg 12 | from model.bbox_transform import bbox_transform_inv, clip_boxes 13 | from layer_utils.roi_layers import nms 14 | 15 | import torch 16 | 17 | 18 | def proposal_layer(rpn_cls_prob, rpn_bbox_pred, im_info, cfg_key, _feat_stride, 19 | anchors, num_anchors): 20 | """A simplified version compared to fast/er RCNN 21 | For details please see the technical report 22 | """ 23 | if type(cfg_key) == bytes: 24 | cfg_key = cfg_key.decode('utf-8') 25 | pre_nms_topN = cfg[cfg_key].RPN_PRE_NMS_TOP_N 26 | post_nms_topN = cfg[cfg_key].RPN_POST_NMS_TOP_N 27 | nms_thresh = cfg[cfg_key].RPN_NMS_THRESH 28 | 29 | # Get the scores and bounding boxes 30 | scores = rpn_cls_prob[:, :, :, num_anchors:] 31 | rpn_bbox_pred = rpn_bbox_pred.view((-1, 4)) 32 | scores = scores.contiguous().view(-1, 1) 33 | proposals = bbox_transform_inv(anchors, rpn_bbox_pred) 34 | proposals = clip_boxes(proposals, im_info[:2]) 35 | 36 | # Pick the top region proposals 37 | scores, order = scores.view(-1).sort(descending=True) 38 | if pre_nms_topN > 0: 39 | order = order[:pre_nms_topN] 40 | scores = scores[:pre_nms_topN].view(-1, 1) 41 | proposals = proposals[order.data, :] 42 | 43 | # Non-maximal suppression 44 | keep = nms(proposals, scores.squeeze(1), nms_thresh) 45 | 46 | # Pick th top region proposals after NMS 47 | if post_nms_topN > 0: 48 | keep = keep[:post_nms_topN] 49 | proposals = proposals[keep, :] 50 | scores = scores[keep, ] 51 | 52 | # Only 
support single image as input 53 | batch_inds = proposals.new_zeros(proposals.size(0), 1) 54 | blob = torch.cat((batch_inds, proposals), 1) 55 | 56 | return blob, scores 57 | -------------------------------------------------------------------------------- /lib/layer_utils/proposal_top_layer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Xinlei Chen 5 | # -------------------------------------------------------- 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import numpy as np 11 | from model.config import cfg 12 | from model.bbox_transform import bbox_transform_inv, clip_boxes 13 | import numpy.random as npr 14 | 15 | import torch 16 | 17 | 18 | def proposal_top_layer(rpn_cls_prob, rpn_bbox_pred, im_info, _feat_stride, 19 | anchors, num_anchors): 20 | """A layer that just selects the top region proposals 21 | without using non-maximal suppression, 22 | For details please see the technical report 23 | """ 24 | rpn_top_n = cfg.TEST.RPN_TOP_N 25 | 26 | scores = rpn_cls_prob[:, :, :, num_anchors:] 27 | 28 | rpn_bbox_pred = rpn_bbox_pred.view(-1, 4) 29 | scores = scores.contiguous().view(-1, 1) 30 | 31 | length = scores.size(0) 32 | if length < rpn_top_n: 33 | # Random selection, maybe unnecessary and loses good proposals 34 | # But such case rarely happens 35 | top_inds = torch.from_numpy( 36 | npr.choice(length, size=rpn_top_n, 37 | replace=True)).long().to(anchors.device) 38 | else: 39 | top_inds = scores.sort(0, descending=True)[1] 40 | top_inds = top_inds[:rpn_top_n] 41 | top_inds = top_inds.view(rpn_top_n) 42 | 43 | # Do the selection here 44 | anchors = anchors[top_inds, :].contiguous() 45 | rpn_bbox_pred = rpn_bbox_pred[top_inds, :].contiguous() 46 | scores = scores[top_inds].contiguous() 47 | 48 | # Convert anchors into proposals via bbox transformations 49 | proposals = bbox_transform_inv(anchors, rpn_bbox_pred) 50 | 51 | # Clip predicted boxes to image 52 | proposals = clip_boxes(proposals, im_info[:2]) 53 | 54 | # Output rois blob 55 | # Our RPN implementation only supports a single input image, so all 56 | # batch inds are 0 57 | batch_inds = proposals.new_zeros(proposals.size(0), 1) 58 | blob = torch.cat([batch_inds, proposals], 1) 59 | return blob, scores 60 | -------------------------------------------------------------------------------- /lib/layer_utils/roi_layers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import torch 3 | from .nms import nms 4 | from .roi_align import ROIAlign 5 | from .roi_align import roi_align 6 | from .roi_pool import ROIPool 7 | from .roi_pool import roi_pool 8 | 9 | __all__ = ["nms", "roi_align", "ROIAlign", "roi_pool", "ROIPool"] 10 | -------------------------------------------------------------------------------- /lib/layer_utils/roi_layers/nms.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
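# `nms` is re-exported from the compiled C++/CUDA extension `layer_utils._C`
# (declared in lib/layer_utils/csrc/nms.h and built by `make` in lib/).
# Usage, matching the call in lib/fast_rcnn/nms_wrapper.py:
#     keep = nms(boxes, scores, iou_threshold)
# where `boxes` is an (N, 4) tensor of (x1, y1, x2, y2) corners and `keep`
# holds the indices of the retained boxes, in descending score order.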
2 | # from ._utils import _C 3 | from layer_utils import _C 4 | 5 | nms = _C.nms 6 | # nms.__doc__ = """ 7 | # This function performs Non-maximum suppresion""" 8 | 9 | -------------------------------------------------------------------------------- /lib/layer_utils/roi_layers/roi_align.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import torch 3 | from torch import nn 4 | from torch.autograd import Function 5 | from torch.autograd.function import once_differentiable 6 | from torch.nn.modules.utils import _pair 7 | 8 | from layer_utils import _C 9 | 10 | 11 | class _ROIAlign(Function): 12 | @staticmethod 13 | def forward(ctx, input, roi, output_size, spatial_scale, sampling_ratio): 14 | ctx.save_for_backward(roi) 15 | ctx.output_size = _pair(output_size) 16 | ctx.spatial_scale = spatial_scale 17 | ctx.sampling_ratio = sampling_ratio 18 | ctx.input_shape = input.size() 19 | output = _C.roi_align_forward(input, roi, spatial_scale, 20 | output_size[0], output_size[1], 21 | sampling_ratio) 22 | return output 23 | 24 | @staticmethod 25 | @once_differentiable 26 | def backward(ctx, grad_output): 27 | rois, = ctx.saved_tensors 28 | output_size = ctx.output_size 29 | spatial_scale = ctx.spatial_scale 30 | sampling_ratio = ctx.sampling_ratio 31 | bs, ch, h, w = ctx.input_shape 32 | grad_input = _C.roi_align_backward( 33 | grad_output, 34 | rois, 35 | spatial_scale, 36 | output_size[0], 37 | output_size[1], 38 | bs, 39 | ch, 40 | h, 41 | w, 42 | sampling_ratio, 43 | ) 44 | return grad_input, None, None, None, None 45 | 46 | 47 | roi_align = _ROIAlign.apply 48 | 49 | 50 | class ROIAlign(nn.Module): 51 | def __init__(self, output_size, spatial_scale, sampling_ratio): 52 | super(ROIAlign, self).__init__() 53 | self.output_size = output_size 54 | self.spatial_scale = spatial_scale 55 | self.sampling_ratio = sampling_ratio 56 | 57 | def forward(self, input, rois): 58 | return roi_align(input, rois, self.output_size, self.spatial_scale, 59 | self.sampling_ratio) 60 | 61 | def __repr__(self): 62 | tmpstr = self.__class__.__name__ + "(" 63 | tmpstr += "output_size=" + str(self.output_size) 64 | tmpstr += ", spatial_scale=" + str(self.spatial_scale) 65 | tmpstr += ", sampling_ratio=" + str(self.sampling_ratio) 66 | tmpstr += ")" 67 | return tmpstr 68 | -------------------------------------------------------------------------------- /lib/layer_utils/roi_layers/roi_pool.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
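# Autograd wrapper around the compiled ROIPool kernels: the forward pass
# stores the argmax locations chosen by max pooling so that the backward
# pass can route each output gradient to the single input cell that
# produced it.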
2 | import torch 3 | from torch import nn 4 | from torch.autograd import Function 5 | from torch.autograd.function import once_differentiable 6 | from torch.nn.modules.utils import _pair 7 | 8 | from layer_utils import _C 9 | 10 | 11 | class _ROIPool(Function): 12 | @staticmethod 13 | def forward(ctx, input, roi, output_size, spatial_scale): 14 | ctx.output_size = _pair(output_size) 15 | ctx.spatial_scale = spatial_scale 16 | ctx.input_shape = input.size() 17 | output, argmax = _C.roi_pool_forward(input, roi, spatial_scale, 18 | output_size[0], output_size[1]) 19 | ctx.save_for_backward(input, roi, argmax) 20 | return output 21 | 22 | @staticmethod 23 | @once_differentiable 24 | def backward(ctx, grad_output): 25 | input, rois, argmax = ctx.saved_tensors 26 | output_size = ctx.output_size 27 | spatial_scale = ctx.spatial_scale 28 | bs, ch, h, w = ctx.input_shape 29 | grad_input = _C.roi_pool_backward( 30 | grad_output, 31 | input, 32 | rois, 33 | argmax, 34 | spatial_scale, 35 | output_size[0], 36 | output_size[1], 37 | bs, 38 | ch, 39 | h, 40 | w, 41 | ) 42 | return grad_input, None, None, None 43 | 44 | 45 | roi_pool = _ROIPool.apply 46 | 47 | 48 | class ROIPool(nn.Module): 49 | def __init__(self, output_size, spatial_scale): 50 | super(ROIPool, self).__init__() 51 | self.output_size = output_size 52 | self.spatial_scale = spatial_scale 53 | 54 | def forward(self, input, rois): 55 | return roi_pool(input, rois, self.output_size, self.spatial_scale) 56 | 57 | def __repr__(self): 58 | tmpstr = self.__class__.__name__ + "(" 59 | tmpstr += "output_size=" + str(self.output_size) 60 | tmpstr += ", spatial_scale=" + str(self.spatial_scale) 61 | tmpstr += ")" 62 | return tmpstr 63 | -------------------------------------------------------------------------------- /lib/layer_utils/snippets.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Tensorflow Faster R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Xinlei Chen 5 | # -------------------------------------------------------- 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | 10 | import numpy as np 11 | from layer_utils.generate_anchors import generate_anchors 12 | 13 | 14 | def generate_anchors_pre(height, 15 | width, 16 | feat_stride, 17 | anchor_scales=(8, 16, 32), 18 | anchor_ratios=(0.5, 1, 2)): 19 | """ A wrapper function to generate anchors given different scales 20 | Also return the number of anchors in variable 'length' 21 | """ 22 | anchors = generate_anchors( 23 | ratios=np.array(anchor_ratios), scales=np.array(anchor_scales)) 24 | A = anchors.shape[0] 25 | shift_x = np.arange(0, width) * feat_stride 26 | shift_y = np.arange(0, height) * feat_stride 27 | shift_x, shift_y = np.meshgrid(shift_x, shift_y) 28 | shifts = np.vstack((shift_x.ravel(), shift_y.ravel(), shift_x.ravel(), 29 | shift_y.ravel())).transpose() 30 | K = shifts.shape[0] 31 | # width changes faster, so here it is H, W, C 32 | anchors = anchors.reshape((1, A, 4)) + shifts.reshape((1, K, 4)).transpose( 33 | (1, 0, 2)) 34 | anchors = anchors.reshape((K * A, 4)).astype(np.float32, copy=False) 35 | length = np.int32(anchors.shape[0]) 36 | 37 | return anchors, length 38 | -------------------------------------------------------------------------------- /lib/nms/.gitignore: -------------------------------------------------------------------------------- 1 | *.c 2 | 
*.cpp 3 | *.so 4 | -------------------------------------------------------------------------------- /lib/nms/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yikang-li/FactorizableNet/1e3e1813630d8d647ab0f01bce7782198272e2e8/lib/nms/__init__.py -------------------------------------------------------------------------------- /lib/nms/_ext/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yikang-li/FactorizableNet/1e3e1813630d8d647ab0f01bce7782198272e2e8/lib/nms/_ext/__init__.py -------------------------------------------------------------------------------- /lib/nms/_ext/nms/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from torch.utils.ffi import _wrap_function 3 | from ._nms import lib as _lib, ffi as _ffi 4 | 5 | __all__ = [] 6 | def _import_symbols(locals): 7 | for symbol in dir(_lib): 8 | fn = getattr(_lib, symbol) 9 | if callable(fn): 10 | locals[symbol] = _wrap_function(fn, _ffi) 11 | else: 12 | locals[symbol] = fn 13 | __all__.append(symbol) 14 | 15 | _import_symbols(locals()) 16 | -------------------------------------------------------------------------------- /lib/nms/build.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from torch.utils.ffi import create_extension 4 | 5 | #this_file = os.path.dirname(__file__) 6 | 7 | sources = [] 8 | headers = [] 9 | defines = [] 10 | with_cuda = False 11 | 12 | if torch.cuda.is_available(): 13 | print('Including CUDA code.') 14 | sources += ['src/nms_cuda.c'] 15 | headers += ['src/nms_cuda.h'] 16 | defines += [('WITH_CUDA', None)] 17 | with_cuda = True 18 | 19 | this_file = os.path.dirname(os.path.realpath(__file__)) 20 | print(this_file) 21 | extra_objects = ['src/nms_cuda_kernel.cu.o'] 22 | extra_objects = [os.path.join(this_file, fname) for fname in extra_objects] 23 | print(extra_objects) 24 | 25 | ffi = create_extension( 26 | '_ext.nms', 27 | headers=headers, 28 | sources=sources, 29 | define_macros=defines, 30 | relative_to=__file__, 31 | with_cuda=with_cuda, 32 | extra_objects=extra_objects 33 | ) 34 | 35 | if __name__ == '__main__': 36 | ffi.build() 37 | -------------------------------------------------------------------------------- /lib/nms/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # CUDA_PATH=/usr/local/cuda/ 4 | 5 | cd src 6 | echo "Compiling stnm kernels by nvcc..." 
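# Note: -arch=sm_52 below targets Maxwell GPUs; adjust the compute
# capability (e.g. -arch=sm_70 for Volta) to match your hardware.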
7 | nvcc -c -o nms_cuda_kernel.cu.o nms_cuda_kernel.cu -x cu -Xcompiler -fPIC -arch=sm_52
8 | 
9 | cd ../
10 | python build.py
-------------------------------------------------------------------------------- /lib/nms/nms_gpu.py: --------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | from _ext import nms
4 | import pdb
5 | 
6 | def nms_gpu(dets, thresh):
7 |     dets = torch.Tensor(dets).cuda()
8 |     keep = dets.new(dets.size(0), 1).zero_().int()
9 |     num_out = dets.new(1).zero_().int()
10 |     nms.nms_cuda(keep, dets, num_out, thresh)
11 |     keep = keep[:num_out[0]]
12 |     return keep.view(-1)
-------------------------------------------------------------------------------- /lib/nms/nms_kernel.cu: --------------------------------------------------------------------------------
1 | // ------------------------------------------------------------------
2 | // Faster R-CNN
3 | // Copyright (c) 2015 Microsoft
4 | // Licensed under The MIT License [see fast-rcnn/LICENSE for details]
5 | // Written by Shaoqing Ren
6 | // ------------------------------------------------------------------
7 | 
8 | #include "gpu_nms.hpp"
9 | #include <vector>
10 | #include <iostream>
11 | 
12 | #define CUDA_CHECK(condition) \
13 |   /* Code block avoids redefinition of cudaError_t error */ \
14 |   do { \
15 |     cudaError_t error = condition; \
16 |     if (error != cudaSuccess) { \
17 |       std::cout << cudaGetErrorString(error) << std::endl; \
18 |     } \
19 |   } while (0)
20 | 
21 | #define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
22 | int const threadsPerBlock = sizeof(unsigned long long) * 8;
23 | 
24 | __device__ inline float devIoU(float const * const a, float const * const b) {
25 |   float left = max(a[0], b[0]), right = min(a[2], b[2]);
26 |   float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
27 |   float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f);
28 |   float interS = width * height;
29 |   float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1);
30 |   float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1);
31 |   return interS / (Sa + Sb - interS);
32 | }
33 | 
34 | __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
35 |                            const float *dev_boxes, unsigned long long *dev_mask) {
36 |   const int row_start = blockIdx.y;
37 |   const int col_start = blockIdx.x;
38 | 
39 |   // if (row_start > col_start) return;
40 | 
41 |   const int row_size =
42 |         min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
43 |   const int col_size =
44 |         min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);
45 | 
46 |   __shared__ float block_boxes[threadsPerBlock * 5];
47 |   if (threadIdx.x < col_size) {
48 |     block_boxes[threadIdx.x * 5 + 0] =
49 |         dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0];
50 |     block_boxes[threadIdx.x * 5 + 1] =
51 |         dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1];
52 |     block_boxes[threadIdx.x * 5 + 2] =
53 |         dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2];
54 |     block_boxes[threadIdx.x * 5 + 3] =
55 |         dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3];
56 |     block_boxes[threadIdx.x * 5 + 4] =
57 |         dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4];
58 |   }
59 |   __syncthreads();
60 | 
61 |   if (threadIdx.x < row_size) {
62 |     const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
63 |     const float *cur_box = dev_boxes + cur_box_idx * 5;
64 |     int i = 0;
65 |     unsigned long long t = 0;
66 |     int start = 0;
67 |     if (row_start == col_start) {
68 |       start = threadIdx.x + 1;
69 |     }
70 |     for (i = start; i < col_size; i++) {
71 |       if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
72 |         t |= 1ULL << i;
73 |       }
74 |     }
75 |     const int col_blocks = DIVUP(n_boxes, threadsPerBlock);
76 |     dev_mask[cur_box_idx * col_blocks + col_start] = t;
77 |   }
78 | }
79 | 
80 | void _set_device(int device_id) {
81 |   int current_device;
82 |   CUDA_CHECK(cudaGetDevice(&current_device));
83 |   if (current_device == device_id) {
84 |     return;
85 |   }
86 |   // The call to cudaSetDevice must come before any calls to Get, which
87 |   // may perform initialization using the GPU.
88 |   CUDA_CHECK(cudaSetDevice(device_id));
89 | }
90 | 
91 | void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num,
92 |           int boxes_dim, float nms_overlap_thresh, int device_id) {
93 |   _set_device(device_id);
94 | 
95 |   float* boxes_dev = NULL;
96 |   unsigned long long* mask_dev = NULL;
97 | 
98 |   const int col_blocks = DIVUP(boxes_num, threadsPerBlock);
99 | 
100 |   CUDA_CHECK(cudaMalloc(&boxes_dev,
101 |                         boxes_num * boxes_dim * sizeof(float)));
102 |   CUDA_CHECK(cudaMemcpy(boxes_dev,
103 |                         boxes_host,
104 |                         boxes_num * boxes_dim * sizeof(float),
105 |                         cudaMemcpyHostToDevice));
106 | 
107 |   CUDA_CHECK(cudaMalloc(&mask_dev,
108 |                         boxes_num * col_blocks * sizeof(unsigned long long)));
109 | 
110 |   dim3 blocks(DIVUP(boxes_num, threadsPerBlock),
111 |               DIVUP(boxes_num, threadsPerBlock));
112 |   dim3 threads(threadsPerBlock);
113 |   nms_kernel<<<blocks, threads>>>(boxes_num,
114 |                                   nms_overlap_thresh,
115 |                                   boxes_dev,
116 |                                   mask_dev);
117 | 
118 |   std::vector<unsigned long long> mask_host(boxes_num * col_blocks);
119 |   CUDA_CHECK(cudaMemcpy(&mask_host[0],
120 |                         mask_dev,
121 |                         sizeof(unsigned long long) * boxes_num * col_blocks,
122 |                         cudaMemcpyDeviceToHost));
123 | 
124 |   std::vector<unsigned long long> remv(col_blocks);
125 |   memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);
126 | 
127 |   int num_to_keep = 0;
128 |   for (int i = 0; i < boxes_num; i++) {
129 |     int nblock = i / threadsPerBlock;
130 |     int inblock = i % threadsPerBlock;
131 | 
132 |     if (!(remv[nblock] & (1ULL << inblock))) {
133 |       keep_out[num_to_keep++] = i;
134 |       unsigned long long *p = &mask_host[0] + i * col_blocks;
135 |       for (int j = nblock; j < col_blocks; j++) {
136 |         remv[j] |= p[j];
137 |       }
138 |     }
139 |   }
140 |   *num_out = num_to_keep;
141 | 
142 |   CUDA_CHECK(cudaFree(boxes_dev));
143 |   CUDA_CHECK(cudaFree(mask_dev));
144 | }
145 | 
-------------------------------------------------------------------------------- /lib/nms/nms_retain_all.pyx: --------------------------------------------------------------------------------
1 | # --------------------------------------------------------
2 | # Fast R-CNN
3 | # Copyright (c) 2015 Microsoft
4 | # Licensed under The MIT License [see LICENSE for details]
5 | # Written by Ross Girshick
6 | # --------------------------------------------------------
7 | 
8 | import numpy as np
9 | cimport numpy as np
10 | 
11 | cdef inline np.float32_t max(np.float32_t a, np.float32_t b):
12 |     return a if a >= b else b
13 | 
14 | cdef inline np.float32_t min(np.float32_t a, np.float32_t b):
15 |     return a if a <= b else b
16 | 
17 | def nms_retain_all(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh):
18 |     cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0]
19 |     cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1]
20 |     cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2]
21 |     cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3]
22 |     cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4]
23 | 
24 |     cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1)
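    # Unlike standard NMS, every box gets an entry in `keep`: keep[i] = i
    # when box i survives, and keep[j] = i when box j is suppressed by the
    # higher-scoring box i, so each detection maps to its surviving
    # representative.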
25 | cdef np.ndarray[np.int_t, ndim=1] order = scores.argsort()[::-1] 26 | 27 | cdef int ndets = dets.shape[0] 28 | cdef np.ndarray[np.int_t, ndim=1] suppressed = \ 29 | np.zeros((ndets), dtype=np.int) 30 | 31 | cdef np.ndarray[np.int_t, ndim=1] keep = \ 32 | np.zeros((ndets), dtype=np.int) 33 | 34 | # nominal indices 35 | cdef int _i, _j 36 | # sorted indices 37 | cdef int i, j 38 | # temp variables for box i's (the box currently under consideration) 39 | cdef np.float32_t ix1, iy1, ix2, iy2, iarea 40 | # variables for computing overlap with box j (lower scoring box) 41 | cdef np.float32_t xx1, yy1, xx2, yy2 42 | cdef np.float32_t w, h 43 | cdef np.float32_t inter, ovr 44 | 45 | for _i in range(ndets): 46 | i = order[_i] 47 | if suppressed[i] == 1: 48 | continue 49 | keep[i] = i 50 | ix1 = x1[i] 51 | iy1 = y1[i] 52 | ix2 = x2[i] 53 | iy2 = y2[i] 54 | iarea = areas[i] 55 | for _j in range(_i + 1, ndets): 56 | j = order[_j] 57 | if suppressed[j] == 1: 58 | continue 59 | xx1 = max(ix1, x1[j]) 60 | yy1 = max(iy1, y1[j]) 61 | xx2 = min(ix2, x2[j]) 62 | yy2 = min(iy2, y2[j]) 63 | w = max(0.0, xx2 - xx1 + 1) 64 | h = max(0.0, yy2 - yy1 + 1) 65 | inter = w * h 66 | ovr = inter / (iarea + areas[j] - inter) 67 | if ovr >= thresh: 68 | keep[j] = i 69 | suppressed[j] = 1 70 | 71 | return keep -------------------------------------------------------------------------------- /lib/nms/src/nms_cuda.c: -------------------------------------------------------------------------------- 1 | #include <THC/THC.h> 2 | #include <stdio.h> 3 | #include "nms_cuda_kernel.h" 4 | 5 | // this symbol will be resolved automatically from PyTorch libs 6 | extern THCState *state; 7 | 8 | int nms_cuda(THCudaIntTensor *keep_out, THCudaTensor *boxes_host, 9 | THCudaIntTensor *num_out, float nms_overlap_thresh) { 10 | 11 | nms_cuda_compute(THCudaIntTensor_data(state, keep_out), 12 | THCudaIntTensor_data(state, num_out), 13 | THCudaTensor_data(state, boxes_host), 14 | boxes_host->size[0], 15 | boxes_host->size[1], 16 | nms_overlap_thresh); 17 | 18 | return 1; 19 | } 20 | -------------------------------------------------------------------------------- /lib/nms/src/nms_cuda.h: -------------------------------------------------------------------------------- 1 | // int nms_cuda(THCudaTensor *keep_out, THCudaTensor *num_out, 2 | // THCudaTensor *boxes_host, THCudaTensor *nms_overlap_thresh); 3 | 4 | int nms_cuda(THCudaIntTensor *keep_out, THCudaTensor *boxes_host, 5 | THCudaIntTensor *num_out, float nms_overlap_thresh); 6 | -------------------------------------------------------------------------------- /lib/nms/src/nms_cuda_kernel.cu: -------------------------------------------------------------------------------- 1 | // ------------------------------------------------------------------ 2 | // Faster R-CNN 3 | // Copyright (c) 2015 Microsoft 4 | // Licensed under The MIT License [see fast-rcnn/LICENSE for details] 5 | // Written by Shaoqing Ren 6 | // ------------------------------------------------------------------ 7 | 8 | #include <stdio.h> 9 | #include <string.h> 10 | #include <vector> 11 | #include <iostream> 12 | #include "nms_cuda_kernel.h" 13 | 14 | #define CUDA_WARN(XXX) \ 15 | do { if (XXX != cudaSuccess) std::cout << "CUDA Error: " << \ 16 | cudaGetErrorString(XXX) << ", at line " << __LINE__ \ 17 | << std::endl; cudaDeviceSynchronize(); } while (0) 18 | 19 | #define CUDA_CHECK(condition) \ 20 | /* Code block avoids redefinition of cudaError_t error */ \ 21 | do { \ 22 | cudaError_t error = condition; \ 23 | if (error != cudaSuccess) { \ 24 | std::cout << cudaGetErrorString(error) <<
std::endl; \ 25 | } \ 26 | } while (0) 27 | 28 | #define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0)) 29 | int const threadsPerBlock = sizeof(unsigned long long) * 8; 30 | 31 | __device__ inline float devIoU(float const * const a, float const * const b) { 32 | float left = max(a[0], b[0]), right = min(a[2], b[2]); 33 | float top = max(a[1], b[1]), bottom = min(a[3], b[3]); 34 | float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); 35 | float interS = width * height; 36 | float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); 37 | float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1); 38 | return interS / (Sa + Sb - interS); 39 | } 40 | 41 | __global__ void nms_kernel(int n_boxes, float nms_overlap_thresh, 42 | float *dev_boxes, unsigned long long *dev_mask) { 43 | const int row_start = blockIdx.y; 44 | const int col_start = blockIdx.x; 45 | 46 | // if (row_start > col_start) return; 47 | 48 | const int row_size = 49 | min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); 50 | const int col_size = 51 | min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); 52 | 53 | __shared__ float block_boxes[threadsPerBlock * 5]; 54 | if (threadIdx.x < col_size) { 55 | block_boxes[threadIdx.x * 5 + 0] = 56 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0]; 57 | block_boxes[threadIdx.x * 5 + 1] = 58 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1]; 59 | block_boxes[threadIdx.x * 5 + 2] = 60 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2]; 61 | block_boxes[threadIdx.x * 5 + 3] = 62 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3]; 63 | block_boxes[threadIdx.x * 5 + 4] = 64 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4]; 65 | } 66 | __syncthreads(); 67 | 68 | if (threadIdx.x < row_size) { 69 | const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; 70 | const float *cur_box = dev_boxes + cur_box_idx * 5; 71 | int i = 0; 72 | unsigned long long t = 0; 73 | int start = 0; 74 | if (row_start == col_start) { 75 | start = threadIdx.x + 1; 76 | } 77 | for (i = start; i < col_size; i++) { 78 | if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { 79 | t |= 1ULL << i; 80 | } 81 | } 82 | const int col_blocks = DIVUP(n_boxes, threadsPerBlock); 83 | dev_mask[cur_box_idx * col_blocks + col_start] = t; 84 | } 85 | } 86 | 87 | void nms_cuda_compute(int* keep_out, int *num_out, float* boxes_host, int boxes_num, 88 | int boxes_dim, float nms_overlap_thresh) { 89 | 90 | float* boxes_dev = NULL; 91 | unsigned long long* mask_dev = NULL; 92 | 93 | const int col_blocks = DIVUP(boxes_num, threadsPerBlock); 94 | 95 | CUDA_CHECK(cudaMalloc(&boxes_dev, 96 | boxes_num * boxes_dim * sizeof(float))); 97 | CUDA_CHECK(cudaMemcpy(boxes_dev, 98 | boxes_host, 99 | boxes_num * boxes_dim * sizeof(float), 100 | cudaMemcpyHostToDevice)); 101 | 102 | CUDA_CHECK(cudaMalloc(&mask_dev, 103 | boxes_num * col_blocks * sizeof(unsigned long long))); 104 | 105 | dim3 blocks(DIVUP(boxes_num, threadsPerBlock), 106 | DIVUP(boxes_num, threadsPerBlock)); 107 | dim3 threads(threadsPerBlock); 108 | 109 | // printf("i am at line %d\n", boxes_num); 110 | // printf("i am at line %d\n", boxes_dim); 111 | 112 | nms_kernel<<<blocks, threads>>>(boxes_num, 113 | nms_overlap_thresh, 114 | boxes_dev, 115 | mask_dev); 116 | 117 | std::vector<unsigned long long> mask_host(boxes_num * col_blocks); 118 | CUDA_CHECK(cudaMemcpy(&mask_host[0], 119 | mask_dev, 120 | sizeof(unsigned long long) * boxes_num * col_blocks, 121 | cudaMemcpyDeviceToHost)); 122 | 123 | std::vector<unsigned long long>
remv(col_blocks); 124 | memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); 125 | 126 | // we need to allocate memory for keep_out on the CPU, 127 | // otherwise the following code cannot run 128 | 129 | int* keep_out_cpu = new int[boxes_num]; 130 | 131 | int num_to_keep = 0; 132 | for (int i = 0; i < boxes_num; i++) { 133 | int nblock = i / threadsPerBlock; 134 | int inblock = i % threadsPerBlock; 135 | 136 | if (!(remv[nblock] & (1ULL << inblock))) { 137 | // original: keep_out[num_to_keep++] = i; 138 | keep_out_cpu[num_to_keep++] = i; 139 | unsigned long long *p = &mask_host[0] + i * col_blocks; 140 | for (int j = nblock; j < col_blocks; j++) { 141 | remv[j] |= p[j]; 142 | } 143 | } 144 | } 145 | 146 | // copy keep_out_cpu to keep_out on gpu 147 | CUDA_WARN(cudaMemcpy(keep_out, keep_out_cpu, boxes_num * sizeof(int),cudaMemcpyHostToDevice)); 148 | 149 | // *num_out = num_to_keep; 150 | 151 | // original: *num_out = num_to_keep; 152 | // copy num_to_keep to num_out on gpu 153 | 154 | CUDA_WARN(cudaMemcpy(num_out, &num_to_keep, 1 * sizeof(int),cudaMemcpyHostToDevice)); 155 | 156 | // release cuda memory 157 | CUDA_CHECK(cudaFree(boxes_dev)); 158 | CUDA_CHECK(cudaFree(mask_dev)); 159 | // release cpu memory 160 | delete []keep_out_cpu; 161 | } 162 | -------------------------------------------------------------------------------- /lib/nms/src/nms_cuda_kernel.h: -------------------------------------------------------------------------------- 1 | #ifdef __cplusplus 2 | extern "C" { 3 | #endif 4 | 5 | void nms_cuda_compute(int* keep_out, int *num_out, float* boxes_host, int boxes_num, 6 | int boxes_dim, float nms_overlap_thresh); 7 | 8 | #ifdef __cplusplus 9 | } 10 | #endif 11 | -------------------------------------------------------------------------------- /lib/pycocotools/UPSTREAM_REV: -------------------------------------------------------------------------------- 1 | https://github.com/pdollar/coco/commit/3ac47c77ebd5a1ed4254a98b7fbf2ef4765a3574 2 | -------------------------------------------------------------------------------- /lib/pycocotools/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /lib/pycocotools/license.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014, Piotr Dollar and Tsung-Yi Lin 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 2. Redistributions in binary form must reproduce the above copyright notice, 10 | this list of conditions and the following disclaimer in the documentation 11 | and/or other materials provided with the distribution. 12 | 13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 14 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 15 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 16 | DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 17 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 18 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 19 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 20 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 21 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 22 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23 | 24 | The views and conclusions contained in the software and documentation are those 25 | of the authors and should not be interpreted as representing official policies, 26 | either expressed or implied, of the FreeBSD Project. 27 | -------------------------------------------------------------------------------- /lib/pycocotools/mask.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tsungyi' 2 | 3 | from . import _mask 4 | 5 | # Interface for manipulating masks stored in RLE format. 6 | # 7 | # RLE is a simple yet efficient format for storing binary masks. RLE 8 | # first divides a vector (or vectorized image) into a series of piecewise 9 | # constant regions and then for each piece simply stores the length of 10 | # that piece. For example, given M=[0 0 1 1 1 0 1] the RLE counts would 11 | # be [2 3 1 1], or for M=[1 1 1 1 1 1 0] the counts would be [0 6 1] 12 | # (note that the odd counts are always the numbers of zeros). Instead of 13 | # storing the counts directly, additional compression is achieved with a 14 | # variable bitrate representation based on a common scheme called LEB128. 15 | # 16 | # Compression is greatest given large piecewise constant regions. 17 | # Specifically, the size of the RLE is proportional to the number of 18 | # *boundaries* in M (or for an image the number of boundaries in the y 19 | # direction). Assuming fairly simple shapes, the RLE representation is 20 | # O(sqrt(n)) where n is number of pixels in the object. Hence space usage 21 | # is substantially lower, especially for large simple objects (large n). 22 | # 23 | # Many common operations on masks can be computed directly using the RLE 24 | # (without need for decoding). This includes computations such as area, 25 | # union, intersection, etc. All of these operations are linear in the 26 | # size of the RLE, in other words they are O(sqrt(n)) where n is the area 27 | # of the object. Computing these operations on the original mask is O(n). 28 | # Thus, using the RLE can result in substantial computational savings. 29 | # 30 | # The following API functions are defined: 31 | # encode - Encode binary masks using RLE. 32 | # decode - Decode binary masks encoded via RLE. 33 | # merge - Compute union or intersection of encoded masks. 34 | # iou - Compute intersection over union between masks. 35 | # area - Compute area of encoded masks. 36 | # toBbox - Get bounding boxes surrounding encoded masks. 37 | # frPyObjects - Convert polygon, bbox, and uncompressed RLE to encoded RLE mask. 
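# A minimal pure-Python sketch of the counts computation described above
# (illustrative only, not part of the pycocotools API; the LEB128-style
# string compression is omitted):
#
#   def rle_counts(m):  # m: flat list of 0/1 pixel values
#       counts, prev, run = [], 0, 0
#       for v in m:
#           if v == prev:
#               run += 1
#           else:
#               counts.append(run)
#               prev, run = v, 1
#       counts.append(run)
#       return counts
#
#   rle_counts([0, 0, 1, 1, 1, 0, 1]) == [2, 3, 1, 1]
#   rle_counts([1, 1, 1, 1, 1, 1, 0]) == [0, 6, 1]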
38 | # 39 | # Usage: 40 | # Rs = encode( masks ) 41 | # masks = decode( Rs ) 42 | # R = merge( Rs, intersect=false ) 43 | # o = iou( dt, gt, iscrowd ) 44 | # a = area( Rs ) 45 | # bbs = toBbox( Rs ) 46 | # Rs = frPyObjects( [pyObjects], h, w ) 47 | # 48 | # In the API the following formats are used: 49 | # Rs - [dict] Run-length encoding of binary masks 50 | # R - dict Run-length encoding of binary mask 51 | # masks - [hxwxn] Binary mask(s) (must have type np.ndarray(dtype=uint8) in column-major order) 52 | # iscrowd - [nx1] list of np.ndarray. 1 indicates corresponding gt image has crowd region to ignore 53 | # bbs - [nx4] Bounding box(es) stored as [x y w h] 54 | # poly - Polygon stored as [[x1 y1 x2 y2...],[x1 y1 ...],...] (2D list) 55 | # dt,gt - May be either bounding boxes or encoded masks 56 | # Both poly and bbs are 0-indexed (bbox=[0 0 1 1] encloses first pixel). 57 | # 58 | # Finally, a note about the intersection over union (iou) computation. 59 | # The standard iou of a ground truth (gt) and detected (dt) object is 60 | # iou(gt,dt) = area(intersect(gt,dt)) / area(union(gt,dt)) 61 | # For "crowd" regions, we use a modified criterion. If a gt object is 62 | # marked as "iscrowd", we allow a dt to match any subregion of the gt. 63 | # Choosing gt' in the crowd gt that best matches the dt can be done using 64 | # gt'=intersect(dt,gt). Since by definition union(gt',dt)=dt, computing 65 | # iou(gt,dt,iscrowd) = iou(gt',dt) = area(intersect(gt,dt)) / area(dt) 66 | # For crowd gt regions we use this modified criterion above for the iou. 67 | # 68 | # To compile run "python setup.py build_ext --inplace" 69 | # Please do not contact us for help with compiling. 70 | # 71 | # Microsoft COCO Toolbox. version 2.0 72 | # Data, paper, and tutorials available at: http://mscoco.org/ 73 | # Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 74 | # Licensed under the Simplified BSD License [see coco/license.txt] 75 | 76 | encode = _mask.encode 77 | decode = _mask.decode 78 | iou = _mask.iou 79 | merge = _mask.merge 80 | area = _mask.area 81 | toBbox = _mask.toBbox 82 | frPyObjects = _mask.frPyObjects -------------------------------------------------------------------------------- /lib/pycocotools/maskApi.h: -------------------------------------------------------------------------------- 1 | /************************************************************************** 2 | * Microsoft COCO Toolbox. version 2.0 3 | * Data, paper, and tutorials available at: http://mscoco.org/ 4 | * Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 5 | * Licensed under the Simplified BSD License [see coco/license.txt] 6 | **************************************************************************/ 7 | #pragma once 8 | #include <stdbool.h> 9 | 10 | typedef unsigned int uint; 11 | typedef unsigned long siz; 12 | typedef unsigned char byte; 13 | typedef double* BB; 14 | typedef struct { siz h, w, m; uint *cnts; } RLE; 15 | 16 | // Initialize/destroy RLE. 17 | void rleInit( RLE *R, siz h, siz w, siz m, uint *cnts ); 18 | void rleFree( RLE *R ); 19 | 20 | // Initialize/destroy RLE array. 21 | void rlesInit( RLE **R, siz n ); 22 | void rlesFree( RLE **R, siz n ); 23 | 24 | // Encode binary masks using RLE. 25 | void rleEncode( RLE *R, const byte *mask, siz h, siz w, siz n ); 26 | 27 | // Decode binary masks encoded via RLE. 28 | void rleDecode( const RLE *R, byte *mask, siz n ); 29 | 30 | // Compute union or intersection of encoded masks.
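// (Here M receives the merge of the n RLEs in R: their union when intersect is
// false, their intersection when intersect is true.)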
31 | void rleMerge( const RLE *R, RLE *M, siz n, bool intersect ); 32 | 33 | // Compute area of encoded masks. 34 | void rleArea( const RLE *R, siz n, uint *a ); 35 | 36 | // Compute intersection over union between masks. 37 | void rleIou( RLE *dt, RLE *gt, siz m, siz n, byte *iscrowd, double *o ); 38 | 39 | // Compute intersection over union between bounding boxes. 40 | void bbIou( BB dt, BB gt, siz m, siz n, byte *iscrowd, double *o ); 41 | 42 | // Get bounding boxes surrounding encoded masks. 43 | void rleToBbox( const RLE *R, BB bb, siz n ); 44 | 45 | // Convert bounding boxes to encoded masks. 46 | void rleFrBbox( RLE *R, const BB bb, siz h, siz w, siz n ); 47 | 48 | // Convert polygon to encoded mask. 49 | void rleFrPoly( RLE *R, const double *xy, siz k, siz h, siz w ); 50 | 51 | // Get compressed string representation of encoded mask. 52 | char* rleToString( const RLE *R ); 53 | 54 | // Convert from compressed string representation of encoded mask. 55 | void rleFrString( RLE *R, char *s, siz h, siz w ); 56 | -------------------------------------------------------------------------------- /lib/rpn_msr/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick and Sean Bell 6 | # -------------------------------------------------------- 7 | -------------------------------------------------------------------------------- /lib/rpn_msr/generate.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | import cv2 10 | import matplotlib.pyplot as plt 11 | 12 | from ..utils.blob import im_list_to_blob 13 | from ..utils.timer import Timer 14 | 15 | # TODO: make fast_rcnn irrelevant 16 | # >>>> obsolete, because it depends on sth outside of this project 17 | from ..fast_rcnn.config import cfg 18 | # <<<< obsolete 19 | 20 | 21 | def _vis_proposals(im, dets, thresh=0.5): 22 | """Draw detected bounding boxes.""" 23 | inds = np.where(dets[:, -1] >= thresh)[0] 24 | if len(inds) == 0: 25 | return 26 | 27 | class_name = 'obj' 28 | im = im[:, :, (2, 1, 0)] 29 | fig, ax = plt.subplots(figsize=(12, 12)) 30 | ax.imshow(im, aspect='equal') 31 | for i in inds: 32 | bbox = dets[i, :4] 33 | score = dets[i, -1] 34 | 35 | ax.add_patch( 36 | plt.Rectangle((bbox[0], bbox[1]), 37 | bbox[2] - bbox[0], 38 | bbox[3] - bbox[1], fill=False, 39 | edgecolor='red', linewidth=3.5) 40 | ) 41 | ax.text(bbox[0], bbox[1] - 2, 42 | '{:s} {:.3f}'.format(class_name, score), 43 | bbox=dict(facecolor='blue', alpha=0.5), 44 | fontsize=14, color='white') 45 | 46 | ax.set_title(('{} detections with ' 47 | 'p({} | box) >= {:.1f}').format(class_name, class_name, 48 | thresh), 49 | fontsize=14) 50 | plt.axis('off') 51 | plt.tight_layout() 52 | plt.draw() 53 | 54 | def _get_image_blob(im): 55 | """Converts an image into a network input. 
56 | 57 | Arguments: 58 | im (ndarray): a color image in BGR order 59 | 60 | Returns: 61 | blob (ndarray): a data blob holding an image pyramid 62 | im_scale_factors (list): list of image scales (relative to im) used 63 | in the image pyramid 64 | """ 65 | im_orig = im.astype(np.float32, copy=True) 66 | im_orig -= cfg.PIXEL_MEANS 67 | 68 | processed_ims = [] 69 | 70 | assert len(cfg.TEST.SCALES_BASE) == 1 71 | im_scale = cfg.TRAIN.SCALES_BASE[0] 72 | 73 | im = cv2.resize(im_orig, None, None, fx=im_scale, fy=im_scale, 74 | interpolation=cv2.INTER_LINEAR) 75 | im_info = np.hstack((im.shape[:2], im_scale))[np.newaxis, :] 76 | processed_ims.append(im) 77 | 78 | # Create a blob to hold the input images 79 | blob = im_list_to_blob(processed_ims) 80 | 81 | return blob, im_info 82 | 83 | def im_proposals(net, im): 84 | """Generate RPN proposals on a single image.""" 85 | blobs = {} 86 | blobs['data'], blobs['im_info'] = _get_image_blob(im) 87 | net.blobs['data'].reshape(*(blobs['data'].shape)) 88 | net.blobs['im_info'].reshape(*(blobs['im_info'].shape)) 89 | blobs_out = net.forward( 90 | data=blobs['data'].astype(np.float32, copy=False), 91 | im_info=blobs['im_info'].astype(np.float32, copy=False)) 92 | 93 | scale = blobs['im_info'][0, 2] 94 | boxes = blobs_out['rois'][:, 1:].copy() / scale 95 | scores = blobs_out['scores'].copy() 96 | return boxes, scores 97 | 98 | def imdb_proposals(net, imdb): 99 | """Generate RPN proposals on all images in an imdb.""" 100 | 101 | _t = Timer() 102 | imdb_boxes = [[] for _ in xrange(imdb.num_images)] 103 | for i in xrange(imdb.num_images): 104 | im = cv2.imread(imdb.image_path_at(i)) 105 | _t.tic() 106 | imdb_boxes[i], scores = im_proposals(net, im) 107 | _t.toc() 108 | print 'im_proposals: {:d}/{:d} {:.3f}s' \ 109 | .format(i + 1, imdb.num_images, _t.average_time) 110 | if 0: 111 | dets = np.hstack((imdb_boxes[i], scores)) 112 | # from IPython import embed; embed() 113 | _vis_proposals(im, dets[:3, :], thresh=0.9) 114 | plt.show() 115 | 116 | return imdb_boxes 117 | 118 | def imdb_proposals_det(net, imdb): 119 | """Generate RPN proposals on all images in an imdb.""" 120 | 121 | _t = Timer() 122 | imdb_boxes = [[] for _ in xrange(imdb.num_images)] 123 | for i in xrange(imdb.num_images): 124 | im = cv2.imread(imdb.image_path_at(i)) 125 | _t.tic() 126 | boxes, scores = im_proposals(net, im) 127 | _t.toc() 128 | print 'im_proposals: {:d}/{:d} {:.3f}s' \ 129 | .format(i + 1, imdb.num_images, _t.average_time) 130 | dets = np.hstack((boxes, scores)) 131 | imdb_boxes[i] = dets 132 | 133 | if 0: 134 | # from IPython import embed; embed() 135 | _vis_proposals(im, dets[:3, :], thresh=0.9) 136 | plt.show() 137 | 138 | return imdb_boxes 139 | -------------------------------------------------------------------------------- /lib/rpn_msr/generate_anchors.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick and Sean Bell 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | 10 | # Verify that we compute the same anchors as Shaoqing's matlab implementation: 11 | # 12 | # >> load output/rpn_cachedir/faster_rcnn_VOC2007_ZF_stage1_rpn/anchors.mat 13 | # >> anchors 14 | # 15 | # anchors = 16 | # 17 | # -83 -39 100 56 18 | # -175 -87 192 104 19 | # -359 -183 376 200 20 | # -55 -55 72 72 21 | # -119 -119 
136 136 22 | # -247 -247 264 264 23 | # -35 -79 52 96 24 | # -79 -167 96 184 25 | # -167 -343 184 360 26 | 27 | #array([[ -83., -39., 100., 56.], 28 | # [-175., -87., 192., 104.], 29 | # [-359., -183., 376., 200.], 30 | # [ -55., -55., 72., 72.], 31 | # [-119., -119., 136., 136.], 32 | # [-247., -247., 264., 264.], 33 | # [ -35., -79., 52., 96.], 34 | # [ -79., -167., 96., 184.], 35 | # [-167., -343., 184., 360.]]) 36 | 37 | def generate_anchors_bak(ratios=[0.5, 1, 2], 38 | scales=2**np.arange(3, 6), base_size=16): 39 | """ 40 | Generate anchor (reference) windows by enumerating aspect ratios X 41 | scales wrt a reference (0, 0, 15, 15) window. 42 | """ 43 | ratios = np.array(ratios) 44 | scales = np.array(scales) 45 | 46 | base_anchor = np.array([1, 1, base_size, base_size]) - 1 47 | ratio_anchors = _ratio_enum(base_anchor, ratios) 48 | anchors = np.vstack([_scale_enum(ratio_anchors[i, :], scales) 49 | for i in xrange(ratio_anchors.shape[0])]) 50 | return anchors 51 | 52 | 53 | 54 | def generate_anchors(ratios, scales, base_size=16): 55 | # print 'ratios', ratios 56 | # print 'scales', scales 57 | base_anchor = np.array([1, 1, base_size, base_size]) - 1 58 | w, h, x_ctr, y_ctr = _whctrs(base_anchor) 59 | size = w * h 60 | size_ratios = size / ratios 61 | ws = np.round(np.sqrt(size_ratios)) 62 | hs = np.round(ws * ratios) 63 | ws = ws * np.array(scales) 64 | hs = hs * np.array(scales) 65 | # print 'ws', ws 66 | # print 'hs', hs 67 | anchors = _mkanchors(ws, hs, x_ctr, y_ctr) 68 | # print 'anchors', anchors 69 | return anchors 70 | 71 | 72 | 73 | def _whctrs(anchor): 74 | """ 75 | Return width, height, x center, and y center for an anchor (window). 76 | """ 77 | 78 | w = anchor[2] - anchor[0] + 1 79 | h = anchor[3] - anchor[1] + 1 80 | x_ctr = anchor[0] + 0.5 * (w - 1) 81 | y_ctr = anchor[1] + 0.5 * (h - 1) 82 | return w, h, x_ctr, y_ctr 83 | 84 | def _mkanchors(ws, hs, x_ctr, y_ctr): 85 | """ 86 | Given a vector of widths (ws) and heights (hs) around a center 87 | (x_ctr, y_ctr), output a set of anchors (windows). 88 | """ 89 | # print 'ws', ws 90 | ws = ws[:, np.newaxis] 91 | hs = hs[:, np.newaxis] 92 | anchors = np.hstack((x_ctr - 0.5 * (ws - 1), 93 | y_ctr - 0.5 * (hs - 1), 94 | x_ctr + 0.5 * (ws - 1), 95 | y_ctr + 0.5 * (hs - 1))) 96 | return anchors 97 | 98 | def _ratio_enum(anchor, ratios): 99 | """ 100 | Enumerate a set of anchors for each aspect ratio wrt an anchor. 101 | """ 102 | 103 | w, h, x_ctr, y_ctr = _whctrs(anchor) 104 | size = w * h 105 | size_ratios = size / ratios 106 | ws = np.round(np.sqrt(size_ratios)) 107 | hs = np.round(ws * ratios) 108 | anchors = _mkanchors(ws, hs, x_ctr, y_ctr) 109 | return anchors 110 | 111 | def _scale_enum(anchor, scales): 112 | """ 113 | Enumerate a set of anchors for each scale wrt an anchor.
114 | """ 115 | 116 | w, h, x_ctr, y_ctr = _whctrs(anchor) 117 | ws = w * scales 118 | hs = h * scales 119 | anchors = _mkanchors(ws, hs, x_ctr, y_ctr) 120 | return anchors 121 | 122 | if __name__ == '__main__': 123 | import time 124 | t = time.time() 125 | a = generate_anchors() 126 | print time.time() - t 127 | print a 128 | from IPython import embed; embed() 129 | -------------------------------------------------------------------------------- /lib/rpn_msr/proposal_layer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick and Sean Bell 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | from lib.fast_rcnn.nms_wrapper import nms 10 | 11 | from lib.fast_rcnn.bbox_transform import bbox_transform_inv, clip_boxes 12 | from generate_anchors import generate_anchors 13 | 14 | import pdb 15 | 16 | 17 | DEBUG = False 18 | """ 19 | Outputs object detection proposals by applying estimated bounding-box 20 | transformations to a set of regular boxes (called "anchors"). 21 | """ 22 | 23 | 24 | def proposal_layer(rpn_cls_prob_reshape, rpn_bbox_pred, im_infos, 25 | _feat_stride, opts, anchor_scales, anchor_ratios, 26 | mappings): 27 | # Algorithm: 28 | # 29 | # for each (H, W) location i 30 | # generate A anchor boxes centered on cell i 31 | # apply predicted bbox deltas at cell i to each of the A anchors 32 | # clip predicted boxes to image 33 | # remove predicted boxes with either height or width < threshold 34 | # sort all (proposal, score) pairs by score from highest to lowest 35 | # take top pre_nms_topN proposals before NMS 36 | # apply NMS with threshold 0.7 to remaining proposals 37 | # take after_nms_topN proposals after NMS 38 | # return the top proposals (-> RoIs top, scores top) 39 | # layer_params = yaml.load(self.param_str_) 40 | batch_size = rpn_cls_prob_reshape.shape[0] 41 | _anchors = generate_anchors(scales=anchor_scales, ratios=anchor_ratios) 42 | _num_anchors = _anchors.shape[0] 43 | pre_nms_topN = opts['num_box_pre_NMS'] 44 | post_nms_topN = opts['num_box_post_NMS'] 45 | nms_thres = opts['nms_thres'] 46 | min_size = opts['min_size'] 47 | 48 | blob = [] 49 | 50 | for i in range(batch_size): 51 | im_info = im_infos[i] 52 | # the first set of _num_anchors channels are bg probs 53 | # the second set are the fg probs, which we want 54 | height = mappings[int(im_info[0])] 55 | width = mappings[int(im_info[1])] 56 | scores = rpn_cls_prob_reshape[i, _num_anchors:, :height, :width] 57 | bbox_deltas = rpn_bbox_pred[i, :, :height, :width] 58 | 59 | if DEBUG: 60 | print 'im_size: ({}, {})'.format(im_info[0], im_info[1]) 61 | print 'scale: {}'.format(im_info[2]) 62 | if DEBUG: 63 | print 'score map size: {}'.format(scores.shape) 64 | 65 | # Enumerate all shifts 66 | shift_x = np.arange(0, width) * _feat_stride 67 | shift_y = np.arange(0, height) * _feat_stride 68 | shift_x, shift_y = np.meshgrid(shift_x, shift_y) 69 | shifts = np.vstack((shift_x.ravel(), shift_y.ravel(), 70 | shift_x.ravel(), shift_y.ravel())).transpose() 71 | 72 | # Enumerate all shifted anchors: 73 | # 74 | # add A anchors (1, A, 4) to 75 | # cell K shifts (K, 1, 4) to get 76 | # shift anchors (K, A, 4) 77 | # reshape to (K*A, 4) shifted anchors 78 | A = _num_anchors 79 | K = shifts.shape[0] 80 | anchors = _anchors.reshape((1, A, 4)) + \ 81 | 
shifts.reshape((1, K, 4)).transpose((1, 0, 2)) 82 | anchors = anchors.reshape((K * A, 4)) 83 | 84 | # Transpose and reshape predicted bbox transformations to get them 85 | # into the same order as the anchors: 86 | # 87 | # bbox deltas will be (1, 4 * A, H, W) format 88 | # transpose to (1, H, W, 4 * A) 89 | # reshape to (1 * H * W * A, 4) where rows are ordered by (h, w, a) 90 | # in slowest to fastest order 91 | bbox_deltas = bbox_deltas.transpose((1, 2, 0)).reshape((-1, 4)) 92 | 93 | # Same story for the scores: 94 | # 95 | # scores are (1, A, H, W) format 96 | # transpose to (1, H, W, A) 97 | # reshape to (1 * H * W * A, 1) where rows are ordered by (h, w, a) 98 | scores = scores.transpose((1, 2, 0)).reshape((-1, 1)) 99 | 100 | # Convert anchors into proposals via bbox transformations 101 | proposals = bbox_transform_inv(anchors, bbox_deltas) 102 | 103 | # 2. clip predicted boxes to image 104 | if opts['dropout_box_runoff_image']: 105 | _allowed_border = 16 106 | inds_inside = np.where( 107 | (proposals[:, 0] >= -_allowed_border) & 108 | (proposals[:, 1] >= -_allowed_border) & 109 | (proposals[:, 2] < im_info[1] + _allowed_border) & # width 110 | (proposals[:, 3] < im_info[0] + _allowed_border) # height 111 | )[0] 112 | proposals = proposals[inds_inside, :] 113 | proposals = clip_boxes(proposals, im_info[:2]) 114 | 115 | # 3. remove predicted boxes with either height or width < threshold 116 | # (NOTE: convert min_size to input image scale stored in im_info[2]) 117 | keep = _filter_boxes(proposals, min_size * im_info[2]) 118 | proposals = proposals[keep, :] 119 | scores = scores[keep] 120 | 121 | # 4. sort all (proposal, score) pairs by score from highest to lowest 122 | # 5. take top pre_nms_topN (e.g. 6000) 123 | order = scores.ravel().argsort()[::-1] 124 | if pre_nms_topN > 0: 125 | order = order[:pre_nms_topN] 126 | proposals = proposals[order, :] 127 | scores = scores[order] 128 | 129 | # 6. apply nms (e.g. threshold = 0.7) 130 | # 7. take after_nms_topN (e.g. 300) 131 | # 8. return the top proposals (-> RoIs top) 132 | # print 'proposals', proposals 133 | # print 'scores', scores 134 | keep = nms(np.hstack((proposals, scores)).astype(np.float32), nms_thres) 135 | if post_nms_topN > 0: 136 | keep = keep[:post_nms_topN] 137 | proposals = proposals[keep, :] 138 | scores = scores[keep] 139 | # Output rois blob 140 | # Our RPN implementation only supports a single input image, so all 141 | # batch inds are 0 142 | batch_inds = np.ones((proposals.shape[0], 1), dtype=np.float32) * i 143 | blob.append(np.hstack((batch_inds, proposals.astype(np.float32, copy=False), scores.astype(np.float32, copy=False)))) 144 | 145 | return np.concatenate(blob, axis=0) 146 | 147 | 148 | def _filter_boxes(boxes, min_size): 149 | """Remove all boxes with any side smaller than min_size.""" 150 | ws = boxes[:, 2] - boxes[:, 0] + 1 151 | hs = boxes[:, 3] - boxes[:, 1] + 1 152 | keep = np.where((ws >= min_size) & (hs >= min_size))[0] 153 | return keep 154 | -------------------------------------------------------------------------------- /lib/setup_cuda.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
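# Build script for the layer_utils._C extension: get_extensions() below picks up
# the C++ sources under layer_utils/csrc and, only when torch.cuda.is_available()
# and CUDA_HOME is set, also compiles the cuda/ sources with WITH_CUDA defined;
# otherwise it falls back to a CPU-only CppExtension.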
2 | #!/usr/bin/env python 3 | 4 | import glob 5 | import os 6 | 7 | import torch 8 | from setuptools import find_packages 9 | from setuptools import setup 10 | from torch.utils.cpp_extension import CUDA_HOME 11 | from torch.utils.cpp_extension import CppExtension 12 | from torch.utils.cpp_extension import CUDAExtension 13 | 14 | requirements = ["torch", "torchvision"] 15 | 16 | 17 | def get_extensions(): 18 | this_dir = os.path.dirname(os.path.abspath(__file__)) 19 | extensions_dir = os.path.join(this_dir, "layer_utils", "csrc") 20 | 21 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 22 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 23 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 24 | 25 | sources = main_file + source_cpu 26 | extension = CppExtension 27 | 28 | extra_compile_args = {"cxx": []} 29 | define_macros = [] 30 | 31 | if torch.cuda.is_available() and CUDA_HOME is not None: 32 | extension = CUDAExtension 33 | sources += source_cuda 34 | define_macros += [("WITH_CUDA", None)] 35 | extra_compile_args["nvcc"] = [ 36 | "-DCUDA_HAS_FP16=1", 37 | "-D__CUDA_NO_HALF_OPERATORS__", 38 | "-D__CUDA_NO_HALF_CONVERSIONS__", 39 | "-D__CUDA_NO_HALF2_OPERATORS__", 40 | ] 41 | 42 | sources = [os.path.join(extensions_dir, s) for s in sources] 43 | 44 | include_dirs = [extensions_dir] 45 | 46 | ext_modules = [ 47 | extension( 48 | "layer_utils._C", 49 | sources, 50 | include_dirs=include_dirs, 51 | define_macros=define_macros, 52 | extra_compile_args=extra_compile_args, 53 | ) 54 | ] 55 | 56 | return ext_modules 57 | 58 | 59 | setup( 60 | name="faster_rcnn", 61 | version="0.1", 62 | description="object detection in pytorch", 63 | packages=find_packages(exclude=("configs", "tests",)), 64 | # install_requires=requirements, 65 | ext_modules=get_extensions(), 66 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 67 | ) -------------------------------------------------------------------------------- /lib/setup_cython.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import os 9 | from os.path import join as pjoin 10 | import numpy as np 11 | from distutils.core import setup 12 | from distutils.extension import Extension 13 | from Cython.Distutils import build_ext 14 | 15 | 16 | def find_in_path(name, path): 17 | "Find a file in a search path" 18 | # adapted fom http://code.activestate.com/recipes/52224-find-a-file-given-a-search-path/ 19 | for dir in path.split(os.pathsep): 20 | binpath = pjoin(dir, name) 21 | if os.path.exists(binpath): 22 | return os.path.abspath(binpath) 23 | return None 24 | 25 | 26 | # def locate_cuda(): 27 | # """Locate the CUDA environment on the system 28 | # 29 | # Returns a dict with keys 'home', 'nvcc', 'include', and 'lib64' 30 | # and values giving the absolute path to each directory. 31 | # 32 | # Starts by looking for the CUDAHOME env variable. If not found, everything 33 | # is based on finding 'nvcc' in the PATH. 
34 | # """ 35 | # 36 | # # first check if the CUDAHOME env variable is in use 37 | # if 'CUDAHOME' in os.environ: 38 | # home = os.environ['CUDAHOME'] 39 | # nvcc = pjoin(home, 'bin', 'nvcc') 40 | # else: 41 | # # otherwise, search the PATH for NVCC 42 | # default_path = pjoin(os.sep, 'usr', 'local', 'cuda', 'bin') 43 | # nvcc = find_in_path('nvcc', os.environ['PATH'] + os.pathsep + default_path) 44 | # if nvcc is None: 45 | # raise EnvironmentError('The nvcc binary could not be ' 46 | # 'located in your $PATH. Either add it to your path, or set $CUDAHOME') 47 | # home = os.path.dirname(os.path.dirname(nvcc)) 48 | # 49 | # cudaconfig = {'home': home, 'nvcc': nvcc, 50 | # 'include': pjoin(home, 'include'), 51 | # 'lib64': pjoin(home, 'lib64')} 52 | # for k, v in cudaconfig.iteritems(): 53 | # if not os.path.exists(v): 54 | # raise EnvironmentError('The CUDA %s path could not be located in %s' % (k, v)) 55 | # 56 | # return cudaconfig 57 | 58 | 59 | # CUDA = locate_cuda() 60 | 61 | # Obtain the numpy include directory. This logic works across numpy versions. 62 | try: 63 | numpy_include = np.get_include() 64 | except AttributeError: 65 | numpy_include = np.get_numpy_include() 66 | 67 | 68 | def customize_compiler_for_nvcc(self): 69 | """inject deep into distutils to customize how the dispatch 70 | to gcc/nvcc works. 71 | If you subclass UnixCCompiler, it's not trivial to get your subclass 72 | injected in, and still have the right customizations (i.e. 73 | distutils.sysconfig.customize_compiler) run on it. So instead of going 74 | the OO route, I have this. Note, it's kindof like a wierd functional 75 | subclassing going on.""" 76 | 77 | # tell the compiler it can processes .cu 78 | self.src_extensions.append('.cu') 79 | 80 | # save references to the default compiler_so and _comple methods 81 | default_compiler_so = self.compiler_so 82 | super = self._compile 83 | 84 | # now redefine the _compile method. This gets executed for each 85 | # object but distutils doesn't have the ability to change compilers 86 | # based on source extension: we add it. 
87 | def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts): 88 | print extra_postargs 89 | if os.path.splitext(src)[1] == '.cu': 90 | # use the cuda for .cu files 91 | self.set_executable('compiler_so', CUDA['nvcc']) 92 | # use only a subset of the extra_postargs, which are 1-1 translated 93 | # from the extra_compile_args in the Extension class 94 | postargs = extra_postargs['nvcc'] 95 | else: 96 | postargs = extra_postargs['gcc'] 97 | 98 | super(obj, src, ext, cc_args, postargs, pp_opts) 99 | # reset the default compiler_so, which we might have changed for cuda 100 | self.compiler_so = default_compiler_so 101 | 102 | # inject our redefined _compile method into the class 103 | self._compile = _compile 104 | 105 | 106 | # run the customize_compiler 107 | class custom_build_ext(build_ext): 108 | def build_extensions(self): 109 | customize_compiler_for_nvcc(self.compiler) 110 | build_ext.build_extensions(self) 111 | 112 | 113 | ext_modules = [ 114 | Extension( 115 | "utils.cython_bbox", 116 | ["utils/bbox.pyx"], 117 | extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]}, 118 | include_dirs=[numpy_include] 119 | ), 120 | Extension( 121 | "utils.cython_nms", 122 | ["utils/nms.pyx"], 123 | extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]}, 124 | include_dirs=[numpy_include] 125 | ), 126 | Extension( 127 | "nms.nms_retain_all", 128 | ["nms/nms_retain_all.pyx"], 129 | extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]}, 130 | include_dirs=[numpy_include] 131 | ), 132 | Extension( 133 | 'pycocotools._mask', 134 | sources=['pycocotools/maskApi.c', 'pycocotools/_mask.pyx'], 135 | include_dirs=[numpy_include, 'pycocotools'], 136 | extra_compile_args={ 137 | 'gcc': ['-Wno-cpp', '-Wno-unused-function', '-std=c99']}, 138 | ), 139 | ] 140 | 141 | setup( 142 | name='faster_rcnn', 143 | ext_modules=ext_modules, 144 | # inject our custom trigger 145 | cmdclass={'build_ext': custom_build_ext}, 146 | ) -------------------------------------------------------------------------------- /lib/utils/.gitignore: -------------------------------------------------------------------------------- 1 | *.c 2 | *.cpp 3 | *.so 4 | -------------------------------------------------------------------------------- /lib/utils/HDN_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.autograd import Variable 4 | import numpy as np 5 | import pdb 6 | from .cython_bbox import bbox_overlaps, bbox_intersections 7 | 8 | 9 | def get_model_name(arguments): 10 | 11 | 12 | if arguments.nesterov: 13 | arguments.model_name += '_nesterov' 14 | 15 | if arguments.MPS_iter < 0: 16 | print 'Using random MPS iterations to training' 17 | arguments.model_name += '_rand_iters' 18 | else: 19 | arguments.model_name += '_{}_iters'.format(arguments.MPS_iter) 20 | 21 | 22 | if arguments.use_kernel_function: 23 | arguments.model_name += '_with_kernel' 24 | if arguments.load_RPN or arguments.resume_training: 25 | arguments.model_name += '_alt' 26 | else: 27 | arguments.model_name += '_end2end' 28 | if arguments.dropout: 29 | arguments.model_name += '_dropout' 30 | arguments.model_name += '_{}'.format(arguments.dataset_option) 31 | if arguments.disable_language_model: 32 | arguments.model_name += '_no_caption' 33 | else: 34 | if arguments.rnn_type == 'LSTM_im': 35 | arguments.model_name += '_H_LSTM' 36 | elif arguments.rnn_type == 'LSTM_normal': 37 | arguments.model_name += '_I_LSTM' 38 | elif arguments.rnn_type == 
'LSTM_baseline': 39 | arguments.model_name += '_B_LSTM' 40 | else: 41 | raise Exception('Error in RNN type') 42 | if arguments.caption_use_bias: 43 | arguments.model_name += '_with_bias' 44 | else: 45 | arguments.model_name += '_no_bias' 46 | if arguments.caption_use_dropout > 0: 47 | arguments.model_name += '_with_dropout_{}'.format(arguments.caption_use_dropout).replace('.', '_') 48 | else: 49 | arguments.model_name += '_no_dropout' 50 | arguments.model_name += '_nembed_{}'.format(arguments.nembedding) 51 | arguments.model_name += '_nhidden_{}'.format(arguments.nhidden_caption) 52 | 53 | if arguments.region_bbox_reg: 54 | arguments.model_name += '_with_region_regression' 55 | 56 | if arguments.resume_training: 57 | arguments.model_name += '_resume' 58 | 59 | if arguments.finetune_language_model: 60 | arguments.model_name += '_finetune' 61 | if arguments.optimizer == 0: 62 | arguments.model_name += '_SGD' 63 | arguments.solver = 'SGD' 64 | elif arguments.optimizer == 1: 65 | arguments.model_name += '_Adam' 66 | arguments.solver = 'Adam' 67 | elif arguments.optimizer == 2: 68 | arguments.model_name += '_Adagrad' 69 | arguments.solver = 'Adagrad' 70 | else: 71 | raise Exception('Unrecognized optimization algorithm specified!') 72 | 73 | return arguments 74 | 75 | 76 | def group_features(net_): 77 | vgg_features_fix = list(net_.rpn.features.parameters())[:8] 78 | vgg_features_var = list(net_.rpn.features.parameters())[8:] 79 | vgg_feature_len = len(list(net_.rpn.features.parameters())) 80 | rpn_feature_len = len(list(net_.rpn.parameters())) - vgg_feature_len 81 | rpn_features = list(net_.rpn.parameters())[vgg_feature_len:] 82 | language_features = list(net_.caption_prediction.parameters()) 83 | language_feature_len = len(language_features) 84 | hdn_features = list(net_.parameters())[(rpn_feature_len + vgg_feature_len):(-1 * language_feature_len)] 85 | print 'vgg feature length:', vgg_feature_len 86 | print 'rpn feature length:', rpn_feature_len 87 | print 'HDN feature length:', len(hdn_features) 88 | print 'language_feature_len:', language_feature_len 89 | return vgg_features_fix, vgg_features_var, rpn_features, hdn_features, language_features 90 | 91 | -------------------------------------------------------------------------------- /lib/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | from . import cython_nms 8 | from . 
import cython_bbox 9 | import blob 10 | import nms 11 | import timer -------------------------------------------------------------------------------- /lib/utils/bbox.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Sergey Karayev 6 | # -------------------------------------------------------- 7 | 8 | cimport cython 9 | import numpy as np 10 | cimport numpy as np 11 | 12 | DTYPE = np.float 13 | ctypedef np.float_t DTYPE_t 14 | 15 | def bbox_overlaps( 16 | np.ndarray[DTYPE_t, ndim=2] boxes, 17 | np.ndarray[DTYPE_t, ndim=2] query_boxes): 18 | """ 19 | Parameters 20 | ---------- 21 | boxes: (N, 4) ndarray of float 22 | query_boxes: (K, 4) ndarray of float 23 | Returns 24 | ------- 25 | overlaps: (N, K) ndarray of overlap between boxes and query_boxes 26 | """ 27 | cdef unsigned int N = boxes.shape[0] 28 | cdef unsigned int K = query_boxes.shape[0] 29 | cdef np.ndarray[DTYPE_t, ndim=2] overlaps = np.zeros((N, K), dtype=DTYPE) 30 | cdef DTYPE_t iw, ih, box_area 31 | cdef DTYPE_t ua 32 | cdef unsigned int k, n 33 | for k in range(K): 34 | box_area = ( 35 | (query_boxes[k, 2] - query_boxes[k, 0] + 1) * 36 | (query_boxes[k, 3] - query_boxes[k, 1] + 1) 37 | ) 38 | for n in range(N): 39 | iw = ( 40 | min(boxes[n, 2], query_boxes[k, 2]) - 41 | max(boxes[n, 0], query_boxes[k, 0]) + 1 42 | ) 43 | if iw > 0: 44 | ih = ( 45 | min(boxes[n, 3], query_boxes[k, 3]) - 46 | max(boxes[n, 1], query_boxes[k, 1]) + 1 47 | ) 48 | if ih > 0: 49 | ua = float( 50 | (boxes[n, 2] - boxes[n, 0] + 1) * 51 | (boxes[n, 3] - boxes[n, 1] + 1) + 52 | box_area - iw * ih 53 | ) 54 | overlaps[n, k] = iw * ih / ua 55 | return overlaps 56 | 57 | 58 | def bbox_intersections( 59 | np.ndarray[DTYPE_t, ndim=2] boxes, 60 | np.ndarray[DTYPE_t, ndim=2] query_boxes): 61 | """ 62 | For each query box compute the intersection ratio covered by boxes 63 | ---------- 64 | Parameters 65 | ---------- 66 | boxes: (N, 4) ndarray of float 67 | query_boxes: (K, 4) ndarray of float 68 | Returns 69 | ------- 70 | overlaps: (N, K) ndarray of intersec between boxes and query_boxes 71 | """ 72 | cdef unsigned int N = boxes.shape[0] 73 | cdef unsigned int K = query_boxes.shape[0] 74 | cdef np.ndarray[DTYPE_t, ndim=2] intersec = np.zeros((N, K), dtype=DTYPE) 75 | cdef DTYPE_t iw, ih, box_area 76 | cdef DTYPE_t ua 77 | cdef unsigned int k, n 78 | for k in range(K): 79 | box_area = ( 80 | (query_boxes[k, 2] - query_boxes[k, 0] + 1) * 81 | (query_boxes[k, 3] - query_boxes[k, 1] + 1) 82 | ) 83 | for n in range(N): 84 | iw = ( 85 | min(boxes[n, 2], query_boxes[k, 2]) - 86 | max(boxes[n, 0], query_boxes[k, 0]) + 1 87 | ) 88 | if iw > 0: 89 | ih = ( 90 | min(boxes[n, 3], query_boxes[k, 3]) - 91 | max(boxes[n, 1], query_boxes[k, 1]) + 1 92 | ) 93 | if ih > 0: 94 | intersec[n, k] = iw * ih / box_area 95 | return intersec -------------------------------------------------------------------------------- /lib/utils/blob.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | """Blob helper functions.""" 9 | 10 | import numpy as np 11 | 
import cv2 12 | 13 | def im_list_to_blob(ims): 14 | """Convert a list of images into a network input. 15 | 16 | Assumes images are already prepared (means subtracted, BGR order, ...). 17 | """ 18 | max_shape = np.array([im.shape for im in ims]).max(axis=0) 19 | num_images = len(ims) 20 | blob = np.zeros((num_images, max_shape[0], max_shape[1], 3), 21 | dtype=np.float32) 22 | for i in xrange(num_images): 23 | im = ims[i] 24 | blob[i, 0:im.shape[0], 0:im.shape[1], :] = im 25 | 26 | return blob 27 | 28 | def prep_im_for_blob(im, pixel_means, target_size, max_size): 29 | """Mean subtract and scale an image for use in a blob.""" 30 | im = im.astype(np.float32, copy=False) 31 | im -= pixel_means 32 | im_shape = im.shape 33 | im_size_min = np.min(im_shape[0:2]) 34 | im_size_max = np.max(im_shape[0:2]) 35 | im_scale = float(target_size) / float(im_size_min) 36 | # Prevent the biggest axis from being more than MAX_SIZE 37 | if np.round(im_scale * im_size_max) > max_size: 38 | im_scale = float(max_size) / float(im_size_max) 39 | im = cv2.resize(im, None, None, fx=im_scale, fy=im_scale, 40 | interpolation=cv2.INTER_LINEAR) 41 | 42 | return im, im_scale 43 | -------------------------------------------------------------------------------- /lib/utils/boxes_grid.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Subcategory CNN 3 | # Copyright (c) 2015 CVGL Stanford 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Yu Xiang 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | import math 10 | # TODO: make fast_rcnn irrelevant 11 | # >>>> obsolete, because it depends on sth outside of this project 12 | from ..fast_rcnn.config import cfg 13 | # <<<< obsolete 14 | 15 | def get_boxes_grid(image_height, image_width): 16 | """ 17 | Return the boxes on the image grid. 18 | Call this function when cfg.IS_MULTISCALE is True; otherwise, call rdl_roidb.prepare_roidb(imdb) instead. 19 | """ 20 | 21 | # fixed a bug: changed cfg.TRAIN.SCALES to cfg.TRAIN.SCALES_BASE, 22 | # because a ratio around 1.0 is needed here, not the actual size. 23 | # height and width of the feature map 24 | if cfg.NET_NAME == 'CaffeNet': 25 | height = np.floor((image_height * max(cfg.TRAIN.SCALES_BASE) - 1) / 4.0 + 1) 26 | height = np.floor((height - 1) / 2.0 + 1 + 0.5) 27 | height = np.floor((height - 1) / 2.0 + 1 + 0.5) 28 | 29 | width = np.floor((image_width * max(cfg.TRAIN.SCALES_BASE) - 1) / 4.0 + 1) 30 | width = np.floor((width - 1) / 2.0 + 1 + 0.5) 31 | width = np.floor((width - 1) / 2.0 + 1 + 0.5) 32 | elif cfg.NET_NAME == 'VGGnet': 33 | height = np.floor(image_height * max(cfg.TRAIN.SCALES_BASE) / 2.0 + 0.5) 34 | height = np.floor(height / 2.0 + 0.5) 35 | height = np.floor(height / 2.0 + 0.5) 36 | height = np.floor(height / 2.0 + 0.5) 37 | 38 | width = np.floor(image_width * max(cfg.TRAIN.SCALES_BASE) / 2.0 + 0.5) 39 | width = np.floor(width / 2.0 + 0.5) 40 | width = np.floor(width / 2.0 + 0.5) 41 | width = np.floor(width / 2.0 + 0.5) 42 | else: 43 | assert False, 'The network architecture is not supported in utils.get_boxes_grid!'
44 | 45 | # compute the grid box centers 46 | h = np.arange(height) 47 | w = np.arange(width) 48 | y, x = np.meshgrid(h, w, indexing='ij') 49 | centers = np.dstack((x, y)) 50 | centers = np.reshape(centers, (-1, 2)) 51 | num = centers.shape[0] 52 | 53 | # compute width and height of grid box 54 | area = cfg.TRAIN.KERNEL_SIZE * cfg.TRAIN.KERNEL_SIZE 55 | aspect = cfg.TRAIN.ASPECTS # height / width 56 | num_aspect = len(aspect) 57 | widths = np.zeros((1, num_aspect), dtype=np.float32) 58 | heights = np.zeros((1, num_aspect), dtype=np.float32) 59 | for i in xrange(num_aspect): 60 | widths[0,i] = math.sqrt(area / aspect[i]) 61 | heights[0,i] = widths[0,i] * aspect[i] 62 | 63 | # construct grid boxes 64 | centers = np.repeat(centers, num_aspect, axis=0) 65 | widths = np.tile(widths, num).transpose() 66 | heights = np.tile(heights, num).transpose() 67 | 68 | x1 = np.reshape(centers[:,0], (-1, 1)) - widths * 0.5 69 | x2 = np.reshape(centers[:,0], (-1, 1)) + widths * 0.5 70 | y1 = np.reshape(centers[:,1], (-1, 1)) - heights * 0.5 71 | y2 = np.reshape(centers[:,1], (-1, 1)) + heights * 0.5 72 | 73 | boxes_grid = np.hstack((x1, y1, x2, y2)) / cfg.TRAIN.SPATIAL_SCALE 74 | 75 | return boxes_grid, centers[:,0], centers[:,1] 76 | -------------------------------------------------------------------------------- /lib/utils/general_utils.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import collections 3 | import torch 4 | import numpy as np 5 | 6 | 7 | def update_values(dict_from, dict_to): 8 | for key, value in dict_from.items(): 9 | if isinstance(value, dict): 10 | update_values(dict_from[key], dict_to[key]) 11 | elif value is not None: 12 | dict_to[key] = dict_from[key] 13 | 14 | return dict_to 15 | 16 | 17 | def params_count(model): 18 | count = 0 19 | for p in model.parameters(): 20 | c = 1 21 | for i in range(p.dim()): 22 | c *= p.size(i) 23 | count += c 24 | return count -------------------------------------------------------------------------------- /lib/utils/logger.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import time 3 | import json 4 | import numpy as np 5 | import os 6 | from collections import defaultdict 7 | 8 | class Experiment(object): 9 | 10 | def __init__(self, name, options=dict()): 11 | """ Create an experiment 12 | """ 13 | super(Experiment, self).__init__() 14 | 15 | self.name = name 16 | self.options = options 17 | self.date_and_time = time.strftime('%d-%m-%Y--%H-%M-%S') 18 | 19 | self.info = defaultdict(dict) 20 | self.logged = defaultdict(dict) 21 | self.meters = defaultdict(dict) 22 | 23 | def add_meters(self, tag, meters_dict): 24 | assert tag not in (self.meters.keys()) 25 | for name, meter in meters_dict.items(): 26 | self.add_meter(tag, name, meter) 27 | 28 | def add_meter(self, tag, name, meter): 29 | assert name not in list(self.meters[tag].keys()), \ 30 | "meter with tag {} and name {} already exists".format(tag, name) 31 | self.meters[tag][name] = meter 32 | 33 | def update_options(self, options_dict): 34 | self.options.update(options_dict) 35 | 36 | def log_meter(self, tag, name, n=1): 37 | meter = self.get_meter(tag, name) 38 | if name not in self.logged[tag]: 39 | self.logged[tag][name] = {} 40 | self.logged[tag][name][n] = meter.value() 41 | 42 | def log_meters(self, tag, n=1): 43 | for name, meter in self.get_meters(tag).items(): 44 | self.log_meter(tag, name, n=n) 45 | 46 | def reset_meters(self, tag): 47 | meters = self.get_meters(tag) 48 | 
for name, meter in meters.items(): 49 | meter.reset() 50 | return meters 51 | 52 | def get_meters(self, tag): 53 | assert tag in list(self.meters.keys()) 54 | return self.meters[tag] 55 | 56 | def get_meter(self, tag, name): 57 | assert tag in list(self.meters.keys()) 58 | assert name in list(self.meters[tag].keys()) 59 | return self.meters[tag][name] 60 | 61 | def to_json(self, filename): 62 | os.system('mkdir -p ' + os.path.dirname(filename)) 63 | var_dict = copy.copy(vars(self)) 64 | var_dict.pop('meters') 65 | for key in ('viz', 'viz_dict'): 66 | if key in list(var_dict.keys()): 67 | var_dict.pop(key) 68 | with open(filename, 'w') as f: 69 | json.dump(var_dict, f) 70 | 71 | def from_json(filename): 72 | with open(filename, 'r') as f: 73 | var_dict = json.load(f) 74 | xp = Experiment('') 75 | xp.date_and_time = var_dict['date_and_time'] 76 | xp.logged = var_dict['logged'] 77 | # TODO: Remove 78 | if 'info' in var_dict: 79 | xp.info = var_dict['info'] 80 | xp.options = var_dict['options'] 81 | xp.name = var_dict['name'] 82 | return xp 83 | 84 | 85 | class AvgMeter(object): 86 | """Computes and stores the average and current value""" 87 | def __init__(self): 88 | self.reset() 89 | 90 | def reset(self): 91 | self.val = 0 92 | self.avg = 0 93 | self.sum = 0 94 | self.count = 0 95 | 96 | def update(self, val, n=1): 97 | self.val = val 98 | self.sum += val * n 99 | self.count += n 100 | self.avg = self.sum / self.count 101 | 102 | def value(self): 103 | return self.avg 104 | 105 | 106 | class SumMeter(object): 107 | """Computes and stores the sum and current value""" 108 | def __init__(self): 109 | self.reset() 110 | 111 | def reset(self): 112 | self.val = 0 113 | self.sum = 0 114 | self.count = 0 115 | 116 | def update(self, val, n=1): 117 | self.val = val 118 | self.sum += val * n 119 | self.count += n 120 | 121 | def value(self): 122 | return self.sum 123 | 124 | 125 | class ValueMeter(object): 126 | """Computes and stores the average and current value""" 127 | def __init__(self): 128 | self.reset() 129 | 130 | def reset(self): 131 | self.val = 0 132 | 133 | def update(self, val): 134 | self.val = val 135 | 136 | def value(self): 137 | return self.val -------------------------------------------------------------------------------- /lib/utils/metrics.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.autograd import Variable 4 | import numpy as np 5 | import pdb 6 | from lib.visualize_graph.vis_utils import expand_relationships_mat, expand_relationships_list 7 | from .cython_bbox import bbox_overlaps, bbox_intersections 8 | 9 | def recall(rois, gt_objects, top_N, thres): 10 | overlaps = bbox_overlaps( 11 | np.ascontiguousarray(rois[:top_N, 1:5], dtype=np.float), 12 | np.ascontiguousarray(gt_objects[:, :4], dtype=np.float)) 13 | 14 | overlap_gt = np.amax(overlaps, axis=0) 15 | correct_cnt = np.sum(overlap_gt >= thres) 16 | total_cnt = overlap_gt.size 17 | return correct_cnt, total_cnt 18 | 19 | def check_recall(rois, gt_objects, top_N, thres=0.5): 20 | 21 | rois = rois.cpu().data.numpy() 22 | if isinstance(gt_objects, list): 23 | correct_cnt, total_cnt = 0, 0 24 | for i, gt in enumerate(gt_objects): 25 | im_rois = rois[np.where(rois[:, 0] == i)[0]] 26 | r = recall(im_rois, gt, top_N, thres) 27 | correct_cnt += r[0] 28 | total_cnt += r[1] 29 | return correct_cnt, total_cnt 30 | else: 31 | return recall(rois, gt_objects, top_N, thres) 32 | 33 | def get_phrase_boxes(sub_boxes, obj_boxes): 34 | 
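# The phrase box is the tightest box enclosing both subject and object boxes:
# elementwise min over the top-left corners, max over the bottom-right corners.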
phrase_boxes = [np.minimum(sub_boxes[:, 0], obj_boxes[:, 0]), 35 | np.minimum(sub_boxes[:, 1], obj_boxes[:, 1]), 36 | np.maximum(sub_boxes[:, 2], obj_boxes[:, 2]), 37 | np.maximum(sub_boxes[:, 3], obj_boxes[:, 3])] 38 | phrase_boxes = np.stack(phrase_boxes, axis=1) 39 | return phrase_boxes 40 | 41 | def check_phrase_recall(gt_objects, gt_relationships, 42 | subject_inds, object_inds, predicate_inds, 43 | subject_boxes, object_boxes, top_Ns, thres=0.5): 44 | # rearrange the ground truth 45 | gt_rel_sub_idx, gt_rel_obj_idx = np.where(gt_relationships > 0) # ground truth number 46 | gt_sub = gt_objects[gt_rel_sub_idx, :5] 47 | gt_obj = gt_objects[gt_rel_obj_idx, :5] 48 | gt_rel = gt_relationships[gt_rel_sub_idx, gt_rel_obj_idx] 49 | 50 | rel_cnt = len(gt_rel) 51 | rel_correct_cnt = np.zeros(len(top_Ns)) 52 | max_topN = max(top_Ns) 53 | 54 | # compute the overlap 55 | try: 56 | phrase_overlaps = bbox_overlaps( 57 | np.ascontiguousarray( 58 | get_phrase_boxes(subject_boxes[:max_topN], object_boxes[:max_topN]), dtype=np.float), 59 | np.ascontiguousarray( 60 | get_phrase_boxes(gt_sub[:, :4], gt_obj[:, :4]), dtype=np.float)) 61 | except: 62 | print('[Warning] No relationship remaining.') 63 | return rel_cnt, rel_correct_cnt 64 | 65 | 66 | for idx, top_N in enumerate(top_Ns): 67 | for gt_id in xrange(rel_cnt): 68 | fg_candidate = np.where(phrase_overlaps[:top_N, gt_id] >= thres)[0] 69 | 70 | for candidate_id in fg_candidate: 71 | if predicate_inds[candidate_id] == gt_rel[gt_id] and \ 72 | subject_inds[candidate_id] == gt_sub[gt_id, 4] and \ 73 | object_inds[candidate_id] == gt_obj[gt_id, 4]: 74 | rel_correct_cnt[idx] += 1 75 | break 76 | return rel_cnt, rel_correct_cnt 77 | 78 | 79 | def check_relationship_recall(gt_objects, gt_relationships, 80 | subject_inds, object_inds, predicate_inds, 81 | subject_boxes, object_boxes, top_Ns, thres=0.5): 82 | # rearrange the ground truth 83 | gt_rel_sub_idx, gt_rel_obj_idx = np.where(gt_relationships > 0) # ground truth number 84 | gt_sub = gt_objects[gt_rel_sub_idx, :5] 85 | gt_obj = gt_objects[gt_rel_obj_idx, :5] 86 | gt_rel = gt_relationships[gt_rel_sub_idx, gt_rel_obj_idx] 87 | 88 | rel_cnt = len(gt_rel) 89 | pred_correct_cnt = np.zeros(len(top_Ns)) 90 | rel_correct_cnt = np.zeros(len(top_Ns)) 91 | max_topN = max(top_Ns) 92 | 93 | # compute the overlap 94 | try: 95 | sub_overlaps = bbox_overlaps( 96 | np.ascontiguousarray(subject_boxes[:max_topN], dtype=np.float), 97 | np.ascontiguousarray(gt_sub[:, :4], dtype=np.float)) 98 | obj_overlaps = bbox_overlaps( 99 | np.ascontiguousarray(object_boxes[:max_topN], dtype=np.float), 100 | np.ascontiguousarray(gt_obj[:, :4], dtype=np.float)) 101 | except: 102 | print('[Warning] No relationship remaining.') 103 | return rel_cnt, rel_correct_cnt, pred_correct_cnt 104 | 105 | 106 | for idx, top_N in enumerate(top_Ns): 107 | for gt_id in xrange(rel_cnt): 108 | fg_candidate = np.where(np.logical_and( 109 | sub_overlaps[:top_N, gt_id] >= thres, 110 | obj_overlaps[:top_N, gt_id] >= thres))[0] 111 | 112 | pred_flag = 1 113 | for candidate_id in fg_candidate: 114 | if predicate_inds[candidate_id] == gt_rel[gt_id]: 115 | pred_correct_cnt[idx] += pred_flag 116 | pred_flag = 0 # only add once 117 | if subject_inds[candidate_id] == gt_sub[gt_id, 4] and \ 118 | object_inds[candidate_id] == gt_obj[gt_id, 4]: 119 | 120 | rel_correct_cnt[idx] += 1 121 | break 122 | return rel_cnt, rel_correct_cnt, pred_correct_cnt 123 | 124 | 125 | def check_hit_detections(gt_objects, gt_relationships, 126 | pred_objects, pred_relationships, 
thres=0.5): 127 | 128 | 129 | # rearrange the ground truth 130 | gt_sub, gt_obj, gt_rel,_, _ = expand_relationships_mat(gt_objects, gt_relationships) 131 | pred_sub, pred_obj, pred_rel = expand_relationships_list(pred_objects, pred_relationships) 132 | hit_pred = np.zeros_like(pred_rel) 133 | assigned_gt = np.ones_like(gt_rel) 134 | # compute the overlap 135 | try: 136 | sub_overlaps = bbox_overlaps( 137 | np.ascontiguousarray(pred_sub[:, :4], dtype=np.float), 138 | np.ascontiguousarray(gt_sub[:, :4], dtype=np.float)) 139 | obj_overlaps = bbox_overlaps( 140 | np.ascontiguousarray(pred_obj[:, :4], dtype=np.float), 141 | np.ascontiguousarray(gt_obj[:, :4], dtype=np.float)) 142 | except: 143 | print('[Warning] No relationship remaining.') 144 | return hit_pred 145 | 146 | 147 | 148 | for pred_id in xrange(pred_rel.shape[0]): 149 | 150 | fg_candidate = np.where(np.logical_and( 151 | sub_overlaps[pred_id] >= thres, 152 | obj_overlaps[pred_id] >= thres))[0] 153 | for candidate_id in fg_candidate: 154 | if pred_rel[pred_id] == gt_rel[candidate_id] and \ 155 | pred_sub[pred_id, 4] == gt_sub[candidate_id, 4] and \ 156 | pred_obj[pred_id, 4] == gt_obj[candidate_id, 4] and assigned_gt[candidate_id]: 157 | 158 | hit_pred[pred_id] = 1 159 | assigned_gt[candidate_id] = 0 160 | break 161 | 162 | return hit_pred 163 | 164 | -------------------------------------------------------------------------------- /lib/utils/nms.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | import pdb 10 | 11 | def nms(dets, thresh): 12 | x1 = dets[:, 0] 13 | y1 = dets[:, 1] 14 | x2 = dets[:, 2] 15 | y2 = dets[:, 3] 16 | scores = dets[:, 4] 17 | 18 | areas = (x2 - x1 + 1) * (y2 - y1 + 1) 19 | order = scores.argsort()[::-1] 20 | 21 | keep = [] 22 | while order.size > 0: 23 | i = order[0] 24 | keep.append(i) 25 | xx1 = np.maximum(x1[i], x1[order[1:]]) 26 | yy1 = np.maximum(y1[i], y1[order[1:]]) 27 | xx2 = np.minimum(x2[i], x2[order[1:]]) 28 | yy2 = np.minimum(y2[i], y2[order[1:]]) 29 | 30 | w = np.maximum(0.0, xx2 - xx1 + 1) 31 | h = np.maximum(0.0, yy2 - yy1 + 1) 32 | inter = w * h 33 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 34 | 35 | inds = np.where(ovr <= thresh)[0] 36 | order = order[inds + 1] 37 | 38 | return keep 39 | 40 | def unary_nms(dets, classes, thresh): 41 | x1 = dets[:, 0] 42 | y1 = dets[:, 1] 43 | x2 = dets[:, 2] 44 | y2 = dets[:, 3] 45 | scores = dets[:, 4] 46 | 47 | areas = (x2 - x1 + 1) * (y2 - y1 + 1) 48 | order = scores.argsort()[::-1] 49 | 50 | keep = [] 51 | while order.size > 0: 52 | i = order[0] 53 | keep.append(i) 54 | xx1 = np.maximum(x1[i], x1[order[1:]]) 55 | yy1 = np.maximum(y1[i], y1[order[1:]]) 56 | xx2 = np.minimum(x2[i], x2[order[1:]]) 57 | yy2 = np.minimum(y2[i], y2[order[1:]]) 58 | 59 | w = np.maximum(0.0, xx2 - xx1 + 1) 60 | h = np.maximum(0.0, yy2 - yy1 + 1) 61 | inter = w * h 62 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 63 | 64 | inds = np.where((ovr <= thresh) | (classes[i] != classes[order[1:]]))[0] 65 | order = order[inds + 1] 66 | 67 | return keep 68 | 69 | def triplet_nms(sub_ids, obj_ids, pred_ids, sub_boxes, obj_boxes, thresh): 70 | #print('before: {}'.format(len(sub_ids))), 71 | sub_x1 = sub_boxes[:, 0] 72 | sub_y1 = sub_boxes[:, 
1] 73 | sub_x2 = sub_boxes[:, 2] 74 | sub_y2 = sub_boxes[:, 3] 75 | obj_x1 = obj_boxes[:, 0] 76 | obj_y1 = obj_boxes[:, 1] 77 | obj_x2 = obj_boxes[:, 2] 78 | obj_y2 = obj_boxes[:, 3] 79 | 80 | 81 | sub_areas = (sub_x2 - sub_x1 + 1) * (sub_y2 - sub_y1 + 1) 82 | obj_areas = (obj_x2 - obj_x1 + 1) * (obj_y2 - obj_y1 + 1) 83 | order = np.array(range(len(sub_ids))) 84 | 85 | keep = [] 86 | while order.size > 0: 87 | i = order[0] 88 | keep.append(i) 89 | sub_xx1 = np.maximum(sub_x1[i], sub_x1[order[1:]]) 90 | sub_yy1 = np.maximum(sub_y1[i], sub_y1[order[1:]]) 91 | sub_xx2 = np.minimum(sub_x2[i], sub_x2[order[1:]]) 92 | sub_yy2 = np.minimum(sub_y2[i], sub_y2[order[1:]]) 93 | sub_id = sub_ids[i] 94 | obj_xx1 = np.maximum(obj_x1[i], obj_x1[order[1:]]) 95 | obj_yy1 = np.maximum(obj_y1[i], obj_y1[order[1:]]) 96 | obj_xx2 = np.minimum(obj_x2[i], obj_x2[order[1:]]) 97 | obj_yy2 = np.minimum(obj_y2[i], obj_y2[order[1:]]) 98 | obj_id = obj_ids[i] 99 | pred_id = pred_ids[i] 100 | 101 | w = np.maximum(0.0, sub_xx2 - sub_xx1 + 1) 102 | h = np.maximum(0.0, sub_yy2 - sub_yy1 + 1) 103 | inter = w * h 104 | sub_ovr = inter / (sub_areas[i] + sub_areas[order[1:]] - inter) 105 | 106 | w = np.maximum(0.0, obj_xx2 - obj_xx1 + 1) 107 | h = np.maximum(0.0, obj_yy2 - obj_yy1 + 1) 108 | inter = w * h 109 | obj_ovr = inter / (obj_areas[i] + obj_areas[order[1:]] - inter) 110 | inds = np.where( (sub_ovr <= thresh) | 111 | (obj_ovr <= thresh) | 112 | (sub_ids[order[1:]] != sub_id) | 113 | (obj_ids[order[1:]] != obj_id) | 114 | (pred_ids[order[1:]] != pred_id) )[0] 115 | order = order[inds + 1] 116 | #print(' After: {}'.format(len(keep))) 117 | return sub_ids[keep], obj_ids[keep], pred_ids[keep], sub_boxes[keep], obj_boxes[keep], keep 118 | -------------------------------------------------------------------------------- /lib/utils/nms.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | cimport numpy as np 10 | 11 | cdef inline np.float32_t max(np.float32_t a, np.float32_t b): 12 | return a if a >= b else b 13 | 14 | cdef inline np.float32_t min(np.float32_t a, np.float32_t b): 15 | return a if a <= b else b 16 | 17 | def nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh): 18 | cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0] 19 | cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1] 20 | cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2] 21 | cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3] 22 | cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4] 23 | 24 | cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1) 25 | cdef np.ndarray[np.int_t, ndim=1] order = scores.argsort()[::-1] 26 | 27 | cdef int ndets = dets.shape[0] 28 | cdef np.ndarray[np.int_t, ndim=1] suppressed = \ 29 | np.zeros((ndets), dtype=np.int) 30 | 31 | # nominal indices 32 | cdef int _i, _j 33 | # sorted indices 34 | cdef int i, j 35 | # temp variables for box i's (the box currently under consideration) 36 | cdef np.float32_t ix1, iy1, ix2, iy2, iarea 37 | # variables for computing overlap with box j (lower scoring box) 38 | cdef np.float32_t xx1, yy1, xx2, yy2 39 | cdef np.float32_t w, h 40 | cdef np.float32_t inter, ovr 41 | 42 | keep = [] 43 | for _i in range(ndets): 44 | i = 
order[_i] 45 | if suppressed[i] == 1: 46 | continue 47 | keep.append(i) 48 | ix1 = x1[i] 49 | iy1 = y1[i] 50 | ix2 = x2[i] 51 | iy2 = y2[i] 52 | iarea = areas[i] 53 | for _j in range(_i + 1, ndets): 54 | j = order[_j] 55 | if suppressed[j] == 1: 56 | continue 57 | xx1 = max(ix1, x1[j]) 58 | yy1 = max(iy1, y1[j]) 59 | xx2 = min(ix2, x2[j]) 60 | yy2 = min(iy2, y2[j]) 61 | w = max(0.0, xx2 - xx1 + 1) 62 | h = max(0.0, yy2 - yy1 + 1) 63 | inter = w * h 64 | ovr = inter / (iarea + areas[j] - inter) 65 | if ovr >= thresh: 66 | suppressed[j] = 1 67 | 68 | return keep 69 | 70 | def nms_new(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh): 71 | cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0] 72 | cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1] 73 | cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2] 74 | cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3] 75 | cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4] 76 | 77 | cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1) 78 | cdef np.ndarray[np.int_t, ndim=1] order = scores.argsort()[::-1] 79 | 80 | cdef int ndets = dets.shape[0] 81 | cdef np.ndarray[np.int_t, ndim=1] suppressed = \ 82 | np.zeros((ndets), dtype=np.int) 83 | 84 | # nominal indices 85 | cdef int _i, _j 86 | # sorted indices 87 | cdef int i, j 88 | # temp variables for box i's (the box currently under consideration) 89 | cdef np.float32_t ix1, iy1, ix2, iy2, iarea 90 | # variables for computing overlap with box j (lower scoring box) 91 | cdef np.float32_t xx1, yy1, xx2, yy2 92 | cdef np.float32_t w, h 93 | cdef np.float32_t inter, ovr 94 | 95 | keep = [] 96 | for _i in range(ndets): 97 | i = order[_i] 98 | if suppressed[i] == 1: 99 | continue 100 | keep.append(i) 101 | ix1 = x1[i] 102 | iy1 = y1[i] 103 | ix2 = x2[i] 104 | iy2 = y2[i] 105 | iarea = areas[i] 106 | for _j in range(_i + 1, ndets): 107 | j = order[_j] 108 | if suppressed[j] == 1: 109 | continue 110 | xx1 = max(ix1, x1[j]) 111 | yy1 = max(iy1, y1[j]) 112 | xx2 = min(ix2, x2[j]) 113 | yy2 = min(iy2, y2[j]) 114 | w = max(0.0, xx2 - xx1 + 1) 115 | h = max(0.0, yy2 - yy1 + 1) 116 | inter = w * h 117 | ovr = inter / (iarea + areas[j] - inter) 118 | ovr1 = inter / iarea 119 | ovr2 = inter / areas[j] 120 | if ovr >= thresh or ovr1 > 0.95 or ovr2 > 0.95: 121 | suppressed[j] = 1 122 | 123 | return keep 124 | -------------------------------------------------------------------------------- /lib/utils/timer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import time 9 | 10 | class Timer(object): 11 | """A simple timer.""" 12 | def __init__(self): 13 | self.total_time = 0. 14 | self.calls = 0 15 | self.start_time = 0. 16 | self.diff = 0. 17 | self.average_time = 0. 
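# Typical tic()/toc() usage (illustrative sketch; run_step is a
# hypothetical workload, not part of this repo):
#   timer = Timer()
#   timer.tic()
#   run_step()
#   avg = timer.toc()  # running average over all timed intervals so far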
18 | 19 | def tic(self): 20 | # using time.time instead of time.clock because time time.clock 21 | # does not normalize for multithreading 22 | self.start_time = time.time() 23 | 24 | def toc(self, average=True): 25 | self.diff = time.time() - self.start_time 26 | self.total_time += self.diff 27 | self.calls += 1 28 | self.average_time = self.total_time / self.calls 29 | if average: 30 | return self.average_time 31 | else: 32 | return self.diff 33 | -------------------------------------------------------------------------------- /lib/visualize_graph/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yikang-li/FactorizableNet/1e3e1813630d8d647ab0f01bce7782198272e2e8/lib/visualize_graph/__init__.py -------------------------------------------------------------------------------- /lib/visualize_graph/vis_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pdb 3 | from ..utils.cython_bbox import bbox_overlaps 4 | 5 | def _compute_gt_target(pred_boxes, gt_boxes): 6 | """ 7 | compute which gt gets mapped to each predicted box 8 | [Modified from Danfei's implementation. 9 | Directly use top-1-score boxes. 10 | In Danfei's implementation, per-class-boxes 11 | are used.] 12 | """ 13 | 14 | num_boxes = pred_boxes.shape[0] 15 | # map predicted boxes to ground-truth 16 | gt_targets = np.zeros(num_boxes).astype(np.int32) 17 | gt_target_iou = np.zeros(num_boxes) 18 | gt_target_iou.fill(-1) 19 | 20 | for j in xrange(num_boxes): 21 | # prepare inputs 22 | bb = pred_boxes[j].astype(float) 23 | # # compute max IoU over classes 24 | # # for c in xrange(1, num_classes): 25 | # for c in xrange(pred_class_scores.shape[1]): 26 | # bb = bbox[4*c:4*(c+1)] 27 | if gt_boxes.size > 0: 28 | # compute overlaps 29 | # intersection 30 | ixmin = np.maximum(gt_boxes[:, 0], bb[0]) 31 | iymin = np.maximum(gt_boxes[:, 1], bb[1]) 32 | ixmax = np.minimum(gt_boxes[:, 2], bb[2]) 33 | iymax = np.minimum(gt_boxes[:, 3], bb[3]) 34 | iw = np.maximum(ixmax - ixmin + 1., 0.) 35 | ih = np.maximum(iymax - iymin + 1., 0.) 36 | inters = iw * ih 37 | 38 | # union 39 | uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) + 40 | (gt_boxes[:, 2] - gt_boxes[:, 0] + 1.) * 41 | (gt_boxes[:, 3] - gt_boxes[:, 1] + 1.) 
- inters) 42 | 43 | overlaps = inters / uni 44 | max_iou_class = np.max(overlaps) 45 | max_k_class = np.argmax(overlaps) 46 | 47 | # select max over classes 48 | if max_iou_class > gt_target_iou[j]: 49 | gt_target_iou[j] = max_iou_class 50 | gt_targets[j] = max_k_class 51 | 52 | return gt_targets, gt_target_iou 53 | 54 | 55 | def ground_predictions(boxes, gt_boxes, ovthresh=0.5): 56 | """ 57 | ground graph predictions onto ground truth annotations 58 | boxes: predicted boxes 59 | """ 60 | 61 | # get predictions 62 | num_boxes = boxes.shape[0] 63 | 64 | 65 | # compute which gt index each roi gets mapped to 66 | gt_targets, gt_target_iou = _compute_gt_target(boxes, gt_boxes) 67 | 68 | # filter out predictions with low IoUs 69 | filter_inds = np.where(gt_target_iou > ovthresh)[0] 70 | 71 | # make sure each gt box is referenced only once 72 | # if referenced more than once, use the one that 73 | # has the maximum IoU 74 | gt_to_pred = {} # {gt_ind: pred_ind} 75 | for j in xrange(num_boxes): 76 | gti = gt_targets[j] # referenced gt ind 77 | if gti in gt_to_pred: 78 | pred_ind = gt_to_pred[gti] 79 | if gt_target_iou[j] > gt_target_iou[pred_ind]: 80 | gt_to_pred[gti] = j 81 | elif j in filter_inds: # also must survive filtering 82 | gt_to_pred[gti] = j 83 | 84 | return gt_to_pred 85 | 86 | def expand_relationships_mat(objects, relationships): 87 | rel_sub_idx, rel_obj_idx = np.where(relationships > 0) # ground truth number 88 | sub = objects[rel_sub_idx, :5] 89 | obj = objects[rel_obj_idx, :5] 90 | rel = relationships[rel_sub_idx, rel_obj_idx] 91 | return sub, obj, rel, rel_sub_idx, rel_obj_idx 92 | def expand_relationships_list(objects, relationships): 93 | relationships = np.array(relationships, dtype=np.int) 94 | sub = objects[relationships[:, 0]][:, :5] 95 | obj = objects[relationships[:, 1]][:, :5] 96 | rel = relationships[:, 2] 97 | return sub, obj, rel 98 | 99 | 100 | def check_recalled_graph(gt_objects, gt_relationships, 101 | pred_objects, pred_relationships, thres=0.5): 102 | # rearrange the ground truth 103 | gt_sub, gt_obj, gt_rel,gt_sub_assign, gt_obj_assign = expand_relationships_mat(gt_objects, gt_relationships) 104 | pred_sub, pred_obj, pred_rel,_, _ = expand_relationships_mat(pred_objects, pred_relationships) 105 | rec_rel = np.zeros_like(gt_relationships) 106 | # compute the overlap 107 | try: 108 | sub_overlaps = bbox_overlaps( 109 | np.ascontiguousarray(pred_sub[:, :4], dtype=np.float), 110 | np.ascontiguousarray(gt_sub[:, :4], dtype=np.float)) 111 | obj_overlaps = bbox_overlaps( 112 | np.ascontiguousarray(pred_obj[:, :4], dtype=np.float), 113 | np.ascontiguousarray(gt_obj[:, :4], dtype=np.float)) 114 | except: 115 | print('[Warning] No relationship remaining.') 116 | return gt_objects, gt_relationships 117 | 118 | 119 | for gt_id in xrange(gt_sub.shape[0]): 120 | fg_candidate = np.where(np.logical_and( 121 | sub_overlaps[:, gt_id] >= thres, 122 | obj_overlaps[:, gt_id] >= thres))[0] 123 | 124 | for candidate_id in fg_candidate: 125 | if pred_rel[candidate_id] == gt_rel[gt_id] and \ 126 | pred_sub[candidate_id, 4] == gt_sub[gt_id, 4] and \ 127 | pred_obj[candidate_id, 4] == gt_obj[gt_id, 4]: 128 | 129 | rec_rel[gt_sub_assign[gt_id], gt_obj_assign[gt_id]] = gt_rel[gt_id] 130 | break 131 | 132 | rec_sub, rec_obj = np.where(rec_rel > 0) 133 | rec_objects = np.union1d(rec_sub, rec_obj) 134 | 135 | return gt_objects[rec_objects], rec_rel[rec_objects][:,rec_objects] 136 | -------------------------------------------------------------------------------- 
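A minimal sketch of how ground_predictions from vis_utils.py above can be exercised (illustrative only: the box arrays are made-up placeholders, the import assumes this repo's package layout, and the xrange calls mean the module targets Python 2):

    import numpy as np
    from lib.visualize_graph.vis_utils import ground_predictions

    pred_boxes = np.array([[10., 10., 50., 50.],
                           [40., 40., 90., 90.]])
    gt_boxes = np.array([[12., 11., 52., 49.]])
    # {gt_index: index of the best-IoU prediction above the 0.5 threshold}
    gt_to_pred = ground_predictions(pred_boxes, gt_boxes, ovthresh=0.5)
    print(gt_to_pred)  # expected: {0: 0}
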
/lib/visualize_graph/visualize.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Scene Graph Generation by Iterative Message Passing 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Danfei Xu 5 | # -------------------------------------------------------- 6 | 7 | import matplotlib.pyplot as plt 8 | import numpy as np 9 | from graphviz import Digraph 10 | import cv2 11 | 12 | import pdb 13 | 14 | """ 15 | Utility for visualizing a scene graph 16 | """ 17 | 18 | 19 | 20 | 21 | def draw_scene_graph(labels, inds, rels, ind_to_class, ind_to_predicate, filename): 22 | """ 23 | draw a graphviz graph of the scene graph topology 24 | """ 25 | viz_labels = labels[inds] 26 | viz_rels = None 27 | if rels is not None: 28 | viz_rels = [] 29 | for rel in rels: 30 | if rel[0] in inds and rel[1] in inds : 31 | sub_idx = np.where(inds == rel[0])[0][0] 32 | obj_idx = np.where(inds == rel[1])[0][0] 33 | viz_rels.append([sub_idx, obj_idx, rel[2]]) 34 | return draw_graph(viz_labels, viz_rels, ind_to_class, ind_to_predicate, filename) 35 | 36 | 37 | def draw_graph(labels, rels, ind_to_class, ind_to_predicate, filename): 38 | u = Digraph('sg', filename=filename) 39 | u.body.append('size="6,6"') 40 | u.body.append('rankdir="LR"') 41 | u.node_attr.update(style='filled') 42 | 43 | out_dict = {'ind_to_class': ind_to_class, 'ind_to_predicate': ind_to_predicate} 44 | out_dict['labels'] = labels.tolist() 45 | out_dict['relations'] = rels 46 | 47 | rels = np.array(rels) 48 | rel_inds = rels[:,:2].ravel().tolist() 49 | name_list = [] 50 | for i, l in enumerate(labels): 51 | if i in rel_inds: 52 | name = ind_to_class[l] 53 | name_suffix = 1 54 | obj_name = name 55 | while obj_name in name_list: 56 | obj_name = name + '_' + str(name_suffix) 57 | name_suffix += 1 58 | name_list.append(obj_name) 59 | u.node(str(i), label=obj_name, color='lightblue2') 60 | 61 | for rel in rels: 62 | edge_key = '%s_%s' % (rel[0], rel[1]) 63 | u.node(edge_key, label=ind_to_predicate[rel[2]], color='red') 64 | 65 | u.edge(str(rel[0]), edge_key) 66 | u.edge(edge_key, str(rel[1])) 67 | 68 | u.render(cleanup=True) # save the graph to file and remove the source 69 | return out_dict 70 | 71 | 72 | def viz_scene_graph(im, rois, labels, ind_to_class, ind_to_predicate, inds=None, rels=None, filename=None): 73 | """ 74 | visualize a scene graph on an image 75 | """ 76 | if inds is None: 77 | inds = np.arange(rois.shape[0]) 78 | viz_rois = rois[inds] 79 | viz_labels = labels[inds] 80 | viz_rels = None 81 | if rels is not None: 82 | viz_rels = [] 83 | for rel in rels: 84 | if rel[0] in inds and rel[1] in inds : 85 | sub_idx = np.where(inds == rel[0])[0][0] 86 | obj_idx = np.where(inds == rel[1])[0][0] 87 | viz_rels.append([sub_idx, obj_idx, rel[2]]) 88 | viz_rels = np.array(viz_rels) 89 | return _viz_scene_graph(im, viz_rois, viz_labels, ind_to_class, ind_to_predicate, viz_rels, filename) 90 | 91 | 92 | def _viz_scene_graph(im, rois, labels, ind_to_class, ind_to_predicate, rels, filename): 93 | fig, ax = plt.subplots(figsize=(12, 12)) 94 | ax.imshow(cv2.cvtColor(im, cv2.COLOR_BGR2RGB), aspect='equal') 95 | if rels.size > 0: 96 | rel_inds = rels[:,:2].ravel().tolist() 97 | else: 98 | rel_inds = [] 99 | # draw bounding boxes 100 | for i, bbox in enumerate(rois): 101 | if int(labels[i]) == 0 and i not in rel_inds: 102 | continue 103 | ax.add_patch( 104 | plt.Rectangle((bbox[0], bbox[1]), 105 | bbox[2] - bbox[0], 106 | bbox[3] - 
bbox[1], fill=False, 107 | edgecolor='red', linewidth=3.5) 108 | ) 109 | label_str = ind_to_class[int(labels[i])] 110 | ax.text(bbox[0], bbox[1] - 2, 111 | label_str, 112 | bbox=dict(facecolor='blue', alpha=0.5), 113 | fontsize=14, color='white') 114 | 115 | # draw relations 116 | for i, rel in enumerate(rels): 117 | if rel[2] == 0: # ignore bachground 118 | continue 119 | sub_box = rois[rel[0], :] 120 | obj_box = rois[rel[1], :] 121 | obj_ctr = [obj_box[0], obj_box[1] - 2] 122 | sub_ctr = [sub_box[0], sub_box[1] - 2] 123 | line_ctr = [(sub_ctr[0] + obj_ctr[0]) / 2, (sub_ctr[1] + obj_ctr[1]) / 2] 124 | predicate = ind_to_predicate[int(rel[2])] 125 | ax.arrow(sub_ctr[0], sub_ctr[1], obj_ctr[0]-sub_ctr[0], obj_ctr[1]-sub_ctr[1], color='green') 126 | 127 | ax.text(line_ctr[0], line_ctr[1], predicate, 128 | bbox=dict(facecolor='green', alpha=0.5), 129 | fontsize=14, color='white') 130 | 131 | ax.set_title('Scene Graph Visualization', fontsize=14) 132 | ax.axis('off') 133 | fig.tight_layout() 134 | if filename is not None: 135 | fig.savefig(filename + '.png') 136 | plt.close(fig) 137 | -------------------------------------------------------------------------------- /models/HDN_v2/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yikang-li/FactorizableNet/1e3e1813630d8d647ab0f01bce7782198272e2e8/models/HDN_v2/__init__.py -------------------------------------------------------------------------------- /models/HDN_v2/criteria.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from .utils import build_loss_bbox, build_loss_cls 4 | 5 | import lib.network as network 6 | 7 | 8 | def loss_FN_v1(pred_obj, pred_rel, roi_data_object, roi_data_predicate, 9 | obj_loss_weight, rel_loss_weight): 10 | roi_data_object = [network.np_to_variable(roi_data_object[0], is_cuda=True, dtype=torch.LongTensor), 11 | network.np_to_variable(roi_data_object[1], is_cuda=True), 12 | network.np_to_variable(roi_data_object[2], is_cuda=True), 13 | network.np_to_variable(roi_data_object[3], is_cuda=True), ] 14 | roi_data_predicate = [network.np_to_variable(roi_data_predicate[0], is_cuda=True, dtype=torch.LongTensor)] 15 | # object cls loss 16 | loss_cls_obj, acc_obj = build_loss_cls(pred_obj[0], roi_data_object[0], obj_loss_weight) 17 | loss_reg_obj= build_loss_bbox(pred_obj[1], roi_data_object, acc_obj[2]) 18 | loss_cls_rel, acc_rel= build_loss_cls(pred_rel[0], roi_data_predicate[0], rel_loss_weight) 19 | 20 | return loss_cls_obj, loss_reg_obj, loss_cls_rel -------------------------------------------------------------------------------- /models/RPN/RPN.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import os.path as osp 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | 8 | from lib.utils.timer import Timer 9 | from lib.utils.blob import im_list_to_blob 10 | from lib.rpn_msr.proposal_layer import proposal_layer as proposal_layer_py 11 | from lib.rpn_msr.anchor_target_layer import anchor_target_layer as anchor_target_layer_py 12 | from lib.fast_rcnn.bbox_transform import bbox_transform_inv, clip_boxes 13 | 14 | from lib import network 15 | from lib.network import Conv2d, FC 16 | import torchvision.models as models 17 | import math 18 | import json 19 | import yaml 20 | import pdb 21 | 22 | from .utils import nms_detections, build_loss, reshape_layer, generate_output_mapping 23 | 24 | 
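# Construction sketch (illustrative, not part of the original file; opts
# normally comes from an options/RPN/*.yaml such as options/RPN/RPN_FN.yaml
# shown later in this repo, and the forward call mirrors the signature below):
#   import yaml
#   with open('options/RPN/RPN_FN.yaml') as f:
#       opts = yaml.safe_load(f)
#   rpn = RPN(opts)
#   features, rois, losses = rpn(im_data, im_info, rpn_data=rpn_data)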
DEBUG = False 25 | 26 | 27 | 28 | class RPN(nn.Module): 29 | _feat_stride = 16 30 | 31 | anchor_scales_normal = [2, 4, 8, 16, 32, 64] 32 | anchor_ratios_normal = [0.25, 0.5, 1, 2, 4] 33 | anchor_scales_normal_region = [4, 8, 16, 32, 64] 34 | anchor_ratios_normal_region = [0.25, 0.5, 1, 2, 4] 35 | 36 | def __init__(self, opts): 37 | super(RPN, self).__init__() 38 | 39 | # loading RPN configs 40 | self.opts = opts 41 | if self.opts['kmeans_anchors']: 42 | # Loading k-means anchors 43 | kmeans_anchors_file = osp.join(self.opts['anchor_dir'], 'kmeans_anchors.json') 44 | print 'using k-means anchors: {}'.format(kmeans_anchors_file) 45 | anchors = json.load(open(kmeans_anchors_file)) 46 | if 'scale' not in self.opts: 47 | print('No RPN scale is given, default [600] is set') 48 | self.opts['object']['anchor_scales'] = list(np.array(anchors['anchor_scales_kmeans']) / 600.0 * self.opts.get('scale', 600.)) 49 | self.opts['object']['anchor_ratios'] = anchors['anchor_ratios_kmeans'] 50 | else: 51 | print 'using normal anchors' 52 | anchor_scales, anchor_ratios = \ 53 | np.meshgrid(self.anchor_scales_normal, self.anchor_ratios_normal, indexing='ij') 54 | self.opts['object']['anchor_scales'] = anchor_scales.reshape(-1) 55 | self.opts['object']['anchor_ratios'] = anchor_ratios.reshape(-1) 56 | 57 | self.anchor_num = len(self.opts['object']['anchor_scales']) 58 | 59 | self.features = models.vgg16(pretrained=True).features 60 | self.features.__delattr__('30') # to delete the max pooling 61 | # by default, fix the first four layers 62 | network.set_trainable_param(list(self.features.parameters())[:8], requires_grad=False) 63 | 64 | # self.features = models.vgg16().features 65 | self.conv1 = Conv2d(512, 512, 3, same_padding=True) 66 | self.score_conv = Conv2d(512, self.anchor_num * 2, 1, relu=False, same_padding=False) 67 | self.bbox_conv = Conv2d(512, self.anchor_num * 4, 1, relu=False, same_padding=False) 68 | 69 | # initialize the parameters 70 | self.initialize_parameters() 71 | self.opts['mappings'] = generate_output_mapping(osp.join(self.opts['anchor_dir'], 'vgg16_mappings.json'), 72 | self.features) 73 | 74 | def initialize_parameters(self, normal_method='normal'): 75 | 76 | if normal_method == 'normal': 77 | normal_fun = network.weights_normal_init 78 | elif normal_method == 'MSRA': 79 | normal_fun = network.weights_MSRA_init 80 | else: 81 | raise(Exception('Cannot recognize the normal method:'.format(normal_method))) 82 | 83 | normal_fun(self.conv1, 0.025) 84 | normal_fun(self.score_conv, 0.025) 85 | normal_fun(self.bbox_conv, 0.01) 86 | 87 | 88 | # @property 89 | # def loss(self): 90 | # return self.loss_cls + self.loss_box * 0.2 91 | 92 | def forward(self, im_data, im_info, gt_objects=None, dontcare_areas=None, rpn_data=None): 93 | 94 | features = self.features(im_data) 95 | # print 'features.std()', features.data.std() 96 | rpn_conv1 = self.conv1(features) 97 | # print 'rpn_conv1.std()', rpn_conv1.data.std() 98 | # object proposal score 99 | rpn_cls_score = self.score_conv(rpn_conv1) 100 | # print 'rpn_cls_score.std()', rpn_cls_score.data.std() 101 | rpn_cls_score_reshape = reshape_layer(rpn_cls_score, 2) 102 | rpn_cls_prob = F.softmax(rpn_cls_score_reshape, dim=1) 103 | rpn_cls_prob_reshape = reshape_layer(rpn_cls_prob, self.anchor_num*2) 104 | # rpn boxes 105 | rpn_bbox_pred = self.bbox_conv(rpn_conv1) 106 | # print 'rpn_bbox_pred.std()', rpn_bbox_pred.data.std() * 4 107 | 108 | 109 | # proposal layer 110 | cfg_key = 'train' if self.training else 'test' 111 | rois = 
self.proposal_layer(rpn_cls_prob_reshape, rpn_bbox_pred, im_info, 112 | self._feat_stride, self.opts['object'][cfg_key], 113 | self.opts['object']['anchor_scales'], 114 | self.opts['object']['anchor_ratios'], 115 | mappings=self.opts['mappings']) 116 | 117 | # generating training labels and build the rpn loss 118 | losses = {} 119 | if self.training and rpn_data is not None: 120 | loss_cls, loss_box, accs = build_loss(rpn_cls_score_reshape, rpn_bbox_pred, rpn_data) 121 | tp, tf, fg_cnt, bg_cnt = accs 122 | losses = { 123 | 'loss_cls': loss_cls, 124 | 'loss_box': loss_box, 125 | 'loss': loss_cls + loss_box * 0.2, 126 | 'tp': tp, 127 | 'tf': tf, 128 | 'fg_cnt': fg_cnt, 129 | 'bg_cnt': bg_cnt, 130 | 131 | } 132 | return features, rois, losses 133 | 134 | 135 | @staticmethod 136 | def proposal_layer(rpn_cls_prob_reshape, rpn_bbox_pred, im_info, 137 | _feat_stride, opts, anchor_scales, anchor_ratios, mappings): 138 | rpn_cls_prob_reshape = rpn_cls_prob_reshape.data.cpu().numpy() 139 | rpn_bbox_pred = rpn_bbox_pred.data.cpu().numpy() 140 | x = proposal_layer_py(rpn_cls_prob_reshape, rpn_bbox_pred, im_info, 141 | _feat_stride, opts, anchor_scales, anchor_ratios, mappings) 142 | x = network.np_to_variable(x, is_cuda=True) 143 | return x.view(-1, 6) 144 | -------------------------------------------------------------------------------- /models/RPN/__init__.py: -------------------------------------------------------------------------------- 1 | from .RPN import RPN 2 | from .RPN_region import RPN as RPN_region -------------------------------------------------------------------------------- /models/RPN/utils.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import os.path as osp 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | import numpy as np 9 | import json 10 | import shutil 11 | 12 | from lib.fast_rcnn.nms_wrapper import nms 13 | from lib import network 14 | 15 | import pdb 16 | 17 | def save_checkpoint(filename, model, epoch, is_best): 18 | model_name = '{}_epoch_{}.h5'.format(filename, epoch) 19 | model_name_best = '{}_best.h5'.format(filename) 20 | info_name = '{}_epoch_{}_info.json'.format(filename, epoch) 21 | info_name_best = '{}_best_info.json'.format(filename) 22 | network.save_net(model_name, model) 23 | with open(info_name, 'w') as f: 24 | json.dump(model.opts, f) 25 | print('save model: {}'.format(model_name)) 26 | if is_best: 27 | shutil.copyfile(model_name, model_name_best) 28 | shutil.copyfile(info_name, info_name_best) 29 | 30 | def load_checkpoint(filename, model): 31 | model_name = '{}.h5'.format(filename) 32 | info_name = '{}_info.json'.format(filename) 33 | network.load_net(model_name, model) 34 | if False: # disable info loading #osp.isfile(info_name): 35 | with open(info_name, 'r') as f: 36 | model.opts = json.load(f) 37 | else: 38 | print('Info file missed, using the default options') 39 | 40 | 41 | 42 | def reshape_layer(x, d): 43 | input_shape = x.size() 44 | # x = x.permute(0, 3, 1, 2) 45 | # b c w h 46 | x = x.view( 47 | input_shape[0], 48 | int(d), 49 | int(float(input_shape[1] * input_shape[2]) / float(d)), 50 | input_shape[3] 51 | ) 52 | 53 | return x 54 | # x = x.permute(0, 2, 3, 1) 55 | 56 | def nms_detections(pred_boxes, scores, nms_thresh, inds=None): 57 | dets = np.hstack((pred_boxes, 58 | scores[:, np.newaxis])).astype(np.float32) 59 | keep = nms(dets, nms_thresh) 60 | if inds is None: 61 | return 
pred_boxes[keep], scores[keep] 62 | return pred_boxes[keep], scores[keep], inds[keep] 63 | 64 | def build_loss(rpn_cls_score_reshape, rpn_bbox_pred, rpn_data): 65 | # classification loss 66 | rpn_cls_score = rpn_cls_score_reshape.permute(0, 2, 3, 1).contiguous().view(-1, 2) 67 | rpn_label = rpn_data[0].view(-1) 68 | # print rpn_label.size(), rpn_cls_score.size() 69 | rpn_keep = Variable(rpn_label.data.ne(-1).nonzero().squeeze()) 70 | rpn_cls_score = torch.index_select(rpn_cls_score, 0, rpn_keep) 71 | rpn_label = torch.index_select(rpn_label, 0, rpn_keep) 72 | 73 | fg_cnt = torch.sum(rpn_label.data.ne(0)) 74 | bg_cnt = rpn_label.data.numel() - fg_cnt 75 | 76 | _, predict = torch.max(rpn_cls_score.data, 1) 77 | error = torch.sum(torch.abs(predict - rpn_label.data)) 78 | # try: 79 | if fg_cnt == 0: 80 | tp = 0. 81 | tf = tf = torch.sum(predict.eq(rpn_label.data)) 82 | else: 83 | tp = torch.sum(predict[:fg_cnt].eq(rpn_label.data[:fg_cnt])) 84 | tf = torch.sum(predict[fg_cnt:].eq(rpn_label.data[fg_cnt:])) 85 | fg_cnt = fg_cnt 86 | bg_cnt = bg_cnt 87 | # print 'accuracy: %2.2f%%' % ((self.tp + self.tf) / float(fg_cnt + bg_cnt) * 100) 88 | rpn_cross_entropy = F.cross_entropy(rpn_cls_score, rpn_label) 89 | # print rpn_cross_entropy 90 | 91 | # box loss 92 | rpn_bbox_targets, rpn_bbox_inside_weights, rpn_bbox_outside_weights = rpn_data[1:] 93 | rpn_bbox_targets = torch.mul(rpn_bbox_targets, rpn_bbox_inside_weights) 94 | rpn_bbox_pred = torch.mul(rpn_bbox_pred, rpn_bbox_inside_weights) 95 | rpn_loss_box = F.smooth_l1_loss(rpn_bbox_pred, rpn_bbox_targets, size_average=False) / (fg_cnt + 1e-4) 96 | 97 | return rpn_cross_entropy, rpn_loss_box, (tp, tf, fg_cnt, bg_cnt) 98 | 99 | 100 | def generate_output_mapping(mapping_file, conv_layers, min_size=16, max_size=1001): 101 | if osp.isfile(mapping_file): 102 | with open(mapping_file, 'r') as f: 103 | mappings = json.load(f) 104 | 105 | mappings = {int(k):int(v) for k,v in mappings.items()} 106 | return mappings 107 | else: 108 | conv_layers.cuda() 109 | print('Generating input/output size mappings') 110 | mappings = {} 111 | for i in range(min_size, max_size): 112 | t_in = Variable(torch.zeros(1, 3, i, i).cuda()) 113 | t_out = conv_layers(t_in) 114 | mappings[i] = t_out.size(2) 115 | 116 | with open(mapping_file, 'w') as f: 117 | json.dump(mappings, f) 118 | print('Done') 119 | return mappings 120 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | from HDN_v2.factorizable_network_v4 import Factorizable_network as FN_v4 2 | from HDN_v2.factorizable_network_v4s import Factorizable_network as FN_v4s 3 | -------------------------------------------------------------------------------- /models/modules/NMS.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.autograd import Variable 6 | from torch.nn import Parameter 7 | import pdb 8 | import relation_module 9 | #from options.config_FN import cfg 10 | 11 | class Dumplicate_Removal(nn.Module): 12 | def __init__(self, opts): 13 | super(Dumplicate_Removal, self).__init__() 14 | self.opts = opts 15 | self.relation_transform = relation_module.Relation_Module( 16 | self.opts['dim_mm'], 17 | self.opts['dim_mm'], 18 | self.opts['dim_mm'] // 2, 19 | geometry_trans=self.opts.get('geometry', 'Geometry_Transform_v2') 20 | ) 21 | 
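# Learned duplicate removal in the spirit of "Relation Networks for Object
# Detection": boxes are ranked by class score, a rank embedding is added to
# the transformed appearance feature, the relation module mixes information
# across boxes, and a final linear layer emits a per-box rescoring factor.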
self.transform_visual = nn.Linear(self.opts['dim_ho'], self.opts['dim_mm']) 22 | self.rank_embeddings = nn.Embedding(256, self.opts['dim_mm']) # cfg.TRAIN.BATCH_SIZE, self.opts['dim_mm']) 23 | self.transform_rescore = nn.Linear(self.opts['dim_mm'], 1) 24 | 25 | 26 | def forward(self, feature_obj, highest_prob, rois_obj): 27 | ''' 28 | Training stage: object probability is that of the assigned ground truth label 29 | Testing stage: object probability is the one with highest probability 30 | ''' 31 | assert highest_prob.size(0) <= self.rank_embeddings.num_embeddings 32 | if isinstance(highest_prob, Variable): 33 | highest_prob = highest_prob.data 34 | _, rank = torch.sort(highest_prob, descending=True, dim=0) 35 | rank = Variable(rank) 36 | feature_rank = self.rank_embeddings(rank) 37 | feature_obj = self.transform_visual(feature_obj) 38 | feature_visual = feature_rank + feature_obj 39 | feature_visual = self.relation_transform(feature_visual, rois_obj) 40 | reranked_score = self.transform_rescore(F.relu(feature_visual, inplace=True)) 41 | reranked_score = torch.sigmoid(reranked_score) 42 | 43 | return reranked_score 44 | 45 | 46 | 47 | 48 | 49 | if __name__ == '__main__': 50 | opts = { 51 | 'dim_mm': 6, 52 | 'dim_ho': 4, 53 | } 54 | nms_module = Dumplicate_Removal(opts) 55 | visual_features = Variable(torch.normal(torch.zeros(10, 4))) 56 | rois = Variable(torch.cat((torch.zeros(10, 1), (torch.rand(10, 4) + torch.FloatTensor([[0, 1, 2, 3], ])) * 100 ), dim=1)) 57 | duplicate_labels = Variable(torch.ones(5, 1)).type(torch.LongTensor) 58 | cls_prob_object = Variable(torch.rand(10, 20)) 59 | 60 | mask = torch.zeros_like(cls_prob_object[:duplicate_labels.size(0)]).type(torch.ByteTensor) 61 | for i in range(duplicate_labels.size(0)): 62 | mask[i, duplicate_labels.data[i][0]] = 1 63 | selected_prob = torch.masked_select(cls_prob_object[:duplicate_labels.size(0)], mask) 64 | reranked_score = nms_module(visual_features[:duplicate_labels.size(0)], selected_prob, rois[:duplicate_labels.size(0)]) 65 | selected_prob = selected_prob.unsqueeze(1) * reranked_score 66 | 67 | loss = F.binary_cross_entropy(selected_prob, duplicate_labels.float()) 68 | loss.backward() 69 | print(nms_module.transform_rescore.weight.grad) 70 | -------------------------------------------------------------------------------- /models/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .factor_updating_structure_v3 import factor_updating_structure as factor_updating_structure_v3 2 | from .factor_updating_structure_v3r import factor_updating_structure as factor_updating_structure_v3r 3 | from .NMS import Dumplicate_Removal 4 | -------------------------------------------------------------------------------- /models/modules/dataParallel.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.nn import DataParallel as DataParallel_raw 5 | import numpy as np 6 | 7 | 8 | class DataParallel(DataParallel_raw): 9 | """ 10 | we do the scatter outside of the DataPrallel. 11 | input: Scattered Inputs without kwargs. 
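    Each keyword argument must already be a per-device list: forward()
    routes kwargs[key][i] to self.device_ids[i] ('im_data' and
    'rpn_anchor_targets*' tensors are moved with .to(device); every other
    value is wrapped in a singleton list).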
12 | """ 13 | 14 | def __init__(self, module): 15 | # Disable all the other parameters 16 | super(DataParallel, self).__init__(module) 17 | 18 | 19 | def forward(self, *inputs, **kwargs): 20 | assert len(inputs) == 0, "Only support arguments like [variable_name = xxx]" 21 | new_inputs = [{} for _ in self.device_ids] 22 | for key in kwargs: 23 | if key == 'im_data': 24 | for i, device in enumerate(self.device_ids): 25 | new_inputs[i][key] = kwargs[key][i].to(device) 26 | elif key.startswith("rpn_anchor_targets"): 27 | for i, device in enumerate(self.device_ids): 28 | new_inputs[i][key] = [item.to(device) for item in kwargs[key][i]] 29 | 30 | else: 31 | assert isinstance(kwargs[key], list) 32 | for i in range(len(self.device_ids)): 33 | new_inputs[i][key] = [kwargs[key][i], ] 34 | nones = [[] for _ in self.device_ids] 35 | replicas = self.replicate(self.module, self.device_ids) 36 | outputs = self.parallel_apply(replicas, nones, new_inputs) 37 | return self.gather(outputs, self.output_device) -------------------------------------------------------------------------------- /models/modules/factor_updating_structure.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.autograd import Variable 6 | from torch.nn import Parameter 7 | from lib.utils.timer import Timer 8 | import pdb 9 | 10 | 11 | VISUALIZE_RESULTS = False 12 | 13 | class Kernel_Attention_Module(nn.Module): 14 | def __init__(self, dim_source, dim_target, dim_mm): 15 | super(Kernel_Attention_Module, self).__init__() 16 | self.ws = nn.Linear(dim_source, dim_mm, bias=False) 17 | self.wt = nn.Linear(dim_target, dim_mm, bias=False) 18 | 19 | def forward(self, source_feat, target_feat, return_gate_value=False): 20 | # print '[unary_term, pair_term]', [unary_term, pair_term] 21 | gate = torch.sigmoid(torch.mean((self.ws(source_feat) * self.wt(target_feat)), 1, keepdim=True)) 22 | # print 'gate', gate 23 | output = source_feat * gate.expand(gate.size(0), source_feat.size(1)) 24 | if return_gate_value: 25 | return output, gate 26 | else: 27 | return output 28 | 29 | class Attention_Module(nn.Module): 30 | def __init__(self, dim_source, dim_target, filter_size = 128): 31 | super(Attention_Module, self).__init__() 32 | self.filter_size = filter_size 33 | if filter_size > 0: 34 | self.w = nn.Linear(dim_source+dim_target, filter_size, bias=True) 35 | 36 | def forward(self, source_feat, target_feat, return_gate_value=False): 37 | 38 | if self.filter_size > 0: 39 | gate = torch.cat([source_feat, target_feat], 1) 40 | gate = F.relu(gate) 41 | gate = torch.mean(torch.sigmoid(self.w(gate)), 1, keepdim=True) 42 | # print 'gate', gate 43 | output = source_feat * gate.expand_as(source_feat) 44 | if return_gate_value: 45 | return output, gate 46 | else: 47 | return output 48 | else: 49 | return source_feat 50 | 51 | 52 | class factor_updating_structure(nn.Module): 53 | def __init__(self, opts): 54 | super(factor_updating_structure, self).__init__() 55 | 56 | # Attention modules 57 | if opts['use_attention']: 58 | if opts['use_kernel']: 59 | self.gate_object2region = Kernel_Attention_Module(opts['dim_ho'], opts['dim_hr'], opts['dim_mm']) 60 | self.gate_region2object = Kernel_Attention_Module(opts['dim_hr'], opts['dim_ho'], opts['dim_mm']) 61 | else: 62 | self.gate_object2region = Attention_Module(opts['dim_ho'], opts['dim_hr'], opts['gate_width']) 63 | self.gate_region2object = 
Attention_Module(opts['dim_hr'], opts['dim_ho'], opts['gate_width']) 64 | else: 65 | self.gate_object2region = None 66 | self.gate_region2object = None 67 | # To transform the attentioned features 68 | self.transform_object2region = nn.Sequential( 69 | nn.ReLU(), 70 | nn.Linear(opts['dim_ho'], opts['dim_hr'], bias=opts['use_bias'])) 71 | self.transform_region2object = nn.Sequential( 72 | nn.ReLU(), 73 | nn.Linear(opts['dim_hr'], opts['dim_ho'], bias=opts['use_bias'])) 74 | 75 | self.use_average = opts['mps_use_average'] 76 | 77 | 78 | 79 | def forward(self, feature_obj, feature_region, mat_object, mat_region): 80 | 81 | feature_region2object = self.prepare_message(feature_obj, feature_region, mat_object, self.gate_region2object) 82 | # Transform the features 83 | out_feature_object = feature_obj + self.transform_region2object(feature_region2object) 84 | # gather the attentioned features 85 | feature_object2region = self.prepare_message(feature_region, feature_obj, mat_region, self.gate_object2region) 86 | # Transform the features 87 | out_feature_region = feature_region + self.transform_object2region(feature_object2region) 88 | 89 | return out_feature_object, out_feature_region 90 | 91 | def prepare_message(self, target_features, source_features, select_mat, attend_module=None): 92 | feature_data = [] 93 | transfer_list = np.where(select_mat > 0) 94 | 95 | for f_id in range(target_features.size(0)): 96 | if len(np.where(select_mat[f_id, :] > 0)[0]) > 0: 97 | source_indices = transfer_list[1][transfer_list[0] == f_id] 98 | source_indices = Variable(torch.from_numpy(source_indices).type(torch.LongTensor)).cuda() 99 | features = torch.index_select(source_features, 0, source_indices) 100 | if attend_module is not None: 101 | target_f = target_features[f_id].view(1, -1).expand(features.size(0), -1) 102 | features = attend_module(features, target_f) 103 | if self.use_average: 104 | features = features.mean(0) 105 | else: 106 | features = features.sum(0) 107 | feature_data.append(features) 108 | else: 109 | temp = Variable(torch.zeros(target_features.size()[1:]), requires_grad=False).type(torch.FloatTensor).cuda() 110 | feature_data.append(temp) 111 | return torch.stack(feature_data, 0) 112 | 113 | 114 | 115 | -------------------------------------------------------------------------------- /models/modules/factor_updating_structure_v3r.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.autograd import Variable 6 | from torch.nn import Parameter 7 | from lib.utils.timer import Timer 8 | import pdb 9 | from lib.network import GroupDropout 10 | 11 | from .factor_updating_structure_v3 import factor_updating_structure as FS_v3 12 | from .relation_module import Relation_Module 13 | 14 | 15 | VISUALIZE_RESULTS = False 16 | TIME_IT = False 17 | 18 | 19 | class factor_updating_structure(FS_v3): 20 | def __init__(self, opts): 21 | super(factor_updating_structure, self).__init__(opts) 22 | 23 | kernel_size = opts.get('kernel_size', 1) 24 | assert kernel_size % 2, 'Odd kernel size required.' 
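# An odd kernel keeps the window centred, so SAME padding is simply
# (kernel_size - 1) // 2 and the spatial size of the feature map is preserved.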
25 | padding = (kernel_size - 1) // 2 26 | # To transform the attentioned features 27 | self.transform_object2object = Relation_Module(opts['dim_ho'], opts['dim_ho'], opts['dim_ho'] // 2, 28 | geometry_trans=self.opts.get('geometry', 'Geometry_Transform_v2')) 29 | 30 | 31 | 32 | def forward(self, feature_obj, feature_region, mat_object, mat_region, object_rois, region_rois): 33 | 34 | self.timer_r2o.tic() 35 | feature_region2object = self.region_to_object(feature_obj, feature_region, mat_object) 36 | # Transform the features 37 | out_feature_object = feature_obj + self.transform_region2object(feature_region2object) \ 38 | + self.transform_object2object(feature_obj, object_rois) 39 | self.timer_r2o.toc() 40 | 41 | 42 | self.timer_o2r.tic() 43 | # gather the attentioned features 44 | feature_object2region = self.object_to_region(feature_region, feature_obj, mat_region) 45 | # Transform the features 46 | out_feature_region = feature_region + self.transform_object2region(feature_object2region) 47 | self.timer_o2r.toc() 48 | 49 | if TIME_IT: 50 | print('[MPS Timing:]') 51 | print('\t[R2O]: {0:.3f} s'.format(self.timer_r2o.average_time)) 52 | print('\t[O2R]: {0:.3f} s'.format(self.timer_o2r.average_time)) 53 | 54 | return out_feature_object, out_feature_region 55 | 56 | 57 | 58 | 59 | -------------------------------------------------------------------------------- /models/modules/geometry_transform.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.autograd import Variable 6 | import pdb 7 | 8 | 9 | def geometry_transform(rois_keys, rois_queries=None): 10 | if rois_queries is None: 11 | rois_queries = rois_keys 12 | if isinstance(rois_keys, Variable): # transform to Tensor 13 | rois_keys = rois_keys.data 14 | rois_queries = rois_queries.data 15 | if rois_keys.size(1) == 5: # Remove the ID 16 | rois_keys = rois_keys[:, 1:] 17 | rois_queries = rois_queries[:, 1:] 18 | 19 | assert rois_keys.size(1) == 4 20 | # keys 21 | w_keys = (rois_keys[:, 2] - rois_keys[:, 0] + 1e-10).unsqueeze(1) 22 | h_keys = (rois_keys[:, 3] - rois_keys[:, 1] + 1e-10).unsqueeze(1) 23 | x_keys = ((rois_keys[:, 2] + rois_keys[:, 0]) / 2).unsqueeze(1) 24 | y_keys = ((rois_keys[:, 3] + rois_keys[:, 1]) / 2).unsqueeze(1) 25 | # queries 26 | w_queries = (rois_queries[:, 2] - rois_queries[:, 0] + 1e-10).unsqueeze(0) 27 | h_queries = (rois_queries[:, 3] - rois_queries[:, 1] + 1e-10).unsqueeze(0) 28 | x_queries = ((rois_queries[:, 2] + rois_queries[:, 0]) / 2).unsqueeze(0) 29 | y_queries = ((rois_queries[:, 3] + rois_queries[:, 1]) / 2).unsqueeze(0) 30 | 31 | # slightly different from [Relation Networks for Object Detection] 32 | geometry_feature = torch.stack( 33 | [ (x_keys - x_queries).abs() / w_keys, 34 | (y_keys - y_queries).abs() / h_keys, 35 | w_keys / w_queries, 36 | h_keys / h_queries,], dim=2) 37 | 38 | geometry_log = geometry_feature.log() 39 | geometry_log[geometry_feature == 0] = 0 40 | 41 | return geometry_log 42 | 43 | def positional_encoding(position_mat, dim_output, wave_length=1000): 44 | '''Sinusoidal Positional_Encoding. 
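    Each of the 4 relative-geometry channels is expanded over
    dim_output / 8 wavelengths and given both a sin and a cos component,
    so the concatenated embedding has exactly dim_output features.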
45 | Returns: 46 | Sinusoidal Positional embedding of different objects 47 | ''' 48 | # position_mat: [num_keys, num_queries, 4] 49 | assert dim_output % 8 == 0, "[dim_output] is expected to be an integral multiple of 8" 50 | position_enc = torch.Tensor([np.power(wave_length, 8.*i/dim_output) for i in range(dim_output / 8)]).view(1, 1, 1, -1).type_as(position_mat) 51 | # position_enc: [num_keys, num_queries, 4, dim_output / 8] 52 | position_enc = position_mat.unsqueeze(-1) * 100 / position_enc 53 | # Second part, apply the cosine to even columns and sin to odds. 54 | # position_enc: [num_keys, num_queries, 4, dim_output / 4] 55 | position_enc = torch.cat([torch.sin(position_enc), torch.cos(position_enc)], dim=3) 56 | position_enc = position_enc.view(position_enc.size(0), position_enc.size(1), -1) 57 | 58 | return position_enc 59 | 60 | class Geometry_Transform_v1(nn.Module): 61 | def __init__(self, dim_mm): 62 | super(Geometry_Transform_v1, self).__init__() 63 | self.transform_geometry = nn.Sequential( 64 | nn.Linear(4, dim_mm), 65 | nn.ReLU(), 66 | nn.Linear(dim_mm, 1), 67 | nn.ReLU(),) 68 | 69 | def forward(self, rois_keys, rois_queries=None): 70 | position_mat = Variable(geometry_transform(rois_keys, rois_queries), requires_grad=True) 71 | geometry_weight = self.transform_geometry(position_mat).squeeze(2) 72 | return geometry_weight 73 | 74 | 75 | class Geometry_Transform_v2(nn.Module): 76 | ''' 77 | expand the geometry features 78 | ''' 79 | def __init__(self, dim_mm): 80 | super(Geometry_Transform_v2, self).__init__() 81 | self.transform_geometry = nn.Sequential( 82 | nn.Linear(dim_mm, 1), 83 | nn.ReLU(),) 84 | self.dim_mm = dim_mm 85 | 86 | def forward(self, rois_keys, rois_queries=None): 87 | position_mat = geometry_transform(rois_keys, rois_queries) 88 | geometry_weight = positional_encoding(position_mat, self.dim_mm) 89 | geometry_weight = Variable(geometry_weight, requires_grad=True) 90 | geometry_weight = self.transform_geometry(geometry_weight).squeeze(2) 91 | return geometry_weight -------------------------------------------------------------------------------- /models/modules/phrase_inference_structure.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.autograd import Variable 6 | from torch.nn import Parameter 7 | from lib.utils.timer import Timer 8 | import pdb 9 | from lib.network import GroupDropout 10 | from copy import deepcopy 11 | 12 | class Abstract_Phrase_Inference_Structure(nn.Module): 13 | def __init__(self, opts): 14 | super(Abstract_Phrase_Inference_Structure, self).__init__() 15 | self.opts = opts 16 | 17 | def forward(self, feature_obj, feature_region, mat_predicate): 18 | 19 | raise NotImplementedError 20 | 21 | 22 | class Basic_Phrase_Inference_Structure(Abstract_Phrase_Inference_Structure): 23 | def __init__(self, opts): 24 | super(Basic_Phrase_Inference_Structure, self).__init__(opts) 25 | self.opts = opts 26 | #self.w_object = Parameter() 27 | 28 | # To transform the attentioned features 29 | self.transform_subject = nn.Sequential( 30 | nn.ReLU(), 31 | #nn.BatchNorm1d(opts['dim_ho'], eps=0.001, momentum=0, affine=True), 32 | nn.Linear(opts['dim_ho'], opts['dim_mm'], bias=opts['use_bias'])) 33 | self.transform_object = nn.Sequential( 34 | nn.ReLU(), 35 | #nn.BatchNorm1d(opts['dim_ho'], eps=0.001, momentum=0, affine=True), 36 | nn.Linear(opts['dim_ho'], opts['dim_mm'], bias=opts['use_bias'])) 37 | 
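# transform_region stays None in this base class; concrete subclasses
# (e.g. PI_v5 below) replace it with a 1x1 conv so the pooled region
# feature map can be fused with the subject/object feature vectors.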
self.transform_region = None 38 | 39 | def _fusion(self, transformed_feat_sub, transformed_feat_obj, transformed_feat_region): 40 | raise NotImplementedError 41 | 42 | def _prepare(self, feature_obj, feature_region, indices_sub, indices_obj, indices_region): 43 | raise NotImplementedError 44 | 45 | def forward(self, feature_obj, feature_region, mat_predicate): 46 | indices_sub = Variable(torch.from_numpy(mat_predicate[:, 0]).type(torch.LongTensor)).cuda().detach() 47 | indices_obj = Variable(torch.from_numpy(mat_predicate[:, 1]).type(torch.LongTensor)).cuda().detach() 48 | indices_region = Variable(torch.from_numpy(mat_predicate[:, 2]).type(torch.LongTensor)).cuda().detach() 49 | transformed_feat_sub, transformed_feat_obj, transformed_feat_region = self._prepare( 50 | feature_obj, feature_region, indices_sub, indices_obj, indices_region) 51 | # y = x_[p] + W_[s,p] * x_[s] + W_[o,p] * x_[o] 52 | out_feature_phrase = self._fusion(transformed_feat_sub, transformed_feat_obj, transformed_feat_region) 53 | return out_feature_phrase 54 | 55 | 56 | class PI_v5(Basic_Phrase_Inference_Structure): 57 | ''' 58 | sub/obj feature vector --> feature map --> merge with region 59 | --> Full connection for inference 60 | ''' 61 | def __init__(self, opts): 62 | super(PI_v5, self).__init__(opts) 63 | self.transform_region = nn.Sequential( 64 | nn.ReLU(), 65 | #nn.BatchNorm2d(opts['dim_hr'], eps=0.001, momentum=0, affine=True), 66 | nn.Conv2d(opts['dim_hr'], opts['dim_mm'], kernel_size=1, bias=opts['use_bias']), 67 | GroupDropout(p=opts['dropout'], inplace=True),) 68 | if opts.get('bottleneck', False): 69 | print('Bottleneck enabled.') 70 | self.predicate_feat_pre = nn.Sequential( 71 | nn.ReLU(), 72 | nn.Conv2d(opts['dim_mm'], opts['dim_mm'] // 2, kernel_size=1, bias=opts['use_bias']), 73 | GroupDropout(p=opts['dropout'], inplace=True), 74 | nn.ReLU(),) 75 | #nn.BatchNorm2d(opts['dim_mm'], eps=0.001, momentum=0, affine=True),) 76 | self.predicate_feat_fc = nn.Sequential( 77 | nn.Linear((opts['dim_mm'] // 2)* opts['pool_size'] * opts['pool_size'] , 78 | opts['dim_hp'], bias=opts['use_bias']), 79 | GroupDropout(p=opts['dropout'], inplace=True),) 80 | else: 81 | print('Bottleneck disabled.') 82 | self.predicate_feat_pre = nn.Sequential( 83 | nn.ReLU(),) 84 | self.predicate_feat_fc = nn.Sequential( 85 | nn.Linear(opts['dim_mm'] * opts['pool_size'] * opts['pool_size'] , 86 | opts['dim_hp'], bias=opts['use_bias']), 87 | GroupDropout(p=opts['dropout'], inplace=True),) 88 | 89 | 90 | 91 | 92 | 93 | def _prepare(self, feature_obj, feature_region, indices_sub, indices_obj, indices_region): 94 | transformed_feat_sub = self.transform_subject(feature_obj) 95 | transformed_feat_sub = torch.index_select(transformed_feat_sub, 0, indices_sub) 96 | transformed_feat_obj = self.transform_object(feature_obj) 97 | transformed_feat_obj = torch.index_select(transformed_feat_obj, 0, indices_obj) 98 | transformed_feat_region = self.transform_region(feature_region) 99 | transformed_feat_region = torch.index_select(transformed_feat_region, 0, indices_region) 100 | return transformed_feat_sub, transformed_feat_obj, transformed_feat_region 101 | 102 | # @staticmethod 103 | # def _attention_merge(reference, query): 104 | # B, C, H, W = reference.size() 105 | # similarity = torch.sum(query * reference, dim=1, keepdim=True) 106 | # prob = F.sigmoid(similarity) # use sigmoid to retain scale of feature 107 | # weighted_feature = query * prob 108 | # return weighted_feature 109 | 110 | 111 | def _fusion(self, transformed_feat_sub, 
transformed_feat_obj, transformed_feat_region): 112 | batch_size = transformed_feat_sub.size(0) 113 | transformed_feat_sub = transformed_feat_sub.view(batch_size, -1, 1, 1) 114 | transformed_feat_obj = transformed_feat_obj.view(batch_size, -1, 1, 1) 115 | op = self.opts.get('op', 'Sum') 116 | if op == 'Sum': 117 | output_feature = transformed_feat_region + transformed_feat_sub + transformed_feat_obj 118 | elif op == 'Prod': 119 | output_feature = transformed_feat_region * transformed_feat_sub * transformed_feat_obj 120 | elif op == 'Sum_Prod': 121 | output_feature = transformed_feat_region * (transformed_feat_sub + transformed_feat_obj) 122 | output_feature = self.predicate_feat_pre(output_feature).view(batch_size, -1) 123 | output_feature = self.predicate_feat_fc(output_feature) 124 | return output_feature 125 | -------------------------------------------------------------------------------- /models/modules/relation_module.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.autograd import Variable 6 | import pdb 7 | 8 | import geometry_transform 9 | 10 | 11 | class Relation_Module(nn.Module): 12 | def __init__(self, dim_v, dim_o, dim_mm, geometry_trans='Geometry_Transform_v2'): 13 | super(Relation_Module, self).__init__() 14 | self.dim_key = dim_mm 15 | self.transform_key = nn.Linear(dim_v, dim_mm) 16 | self.transform_query = nn.Linear(dim_v, dim_mm) 17 | self.transform_visual = nn.Linear(dim_v, dim_o) 18 | self.transform_geometry = getattr(geometry_transform, geometry_trans)(dim_mm) 19 | 20 | 21 | def forward(self, feature_visual, rois): 22 | ''' 23 | The Relation Module operates on pre-activation features (ReLU is applied internally) 24 | ''' 25 | feature_visual = nn.functional.relu(feature_visual) 26 | feature_key = self.transform_key(feature_visual) 27 | feature_query = self.transform_query(feature_visual) 28 | feature_visual = self.transform_visual(feature_visual) 29 | 30 | visual_weight = (feature_query.unsqueeze(0) * feature_key.unsqueeze(1)).sum(dim=2, keepdim=False) / np.sqrt(self.dim_key) 31 | geometry_weight = self.transform_geometry(rois) 32 | 33 | attention = visual_weight.exp() * geometry_weight 34 | for i in range(attention.size(0)): 35 | attention[i, i] = 0 36 | attention = attention / (attention.sum(dim=1, keepdim=True) + 1e-10) 37 | feature_out = torch.sum(attention.unsqueeze(2) * feature_visual.unsqueeze(0), dim=1, keepdim=False) 38 | 39 | return feature_out 40 | 41 | if __name__ == '__main__': 42 | relation_module = Relation_Module(4, 5, 8) 43 | visual_features = Variable(torch.normal(torch.zeros(10, 4))) 44 | rois = Variable(torch.cat((torch.zeros(10, 1), (torch.rand(10, 4) + torch.FloatTensor([[0, 1, 2, 3], ])) * 100 ), dim=1)) 45 | feature_out = relation_module(visual_features, rois) 46 | 47 | print(feature_out) 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | -------------------------------------------------------------------------------- /models/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yikang-li/FactorizableNet/1e3e1813630d8d647ab0f01bce7782198272e2e8/models/utils/__init__.py -------------------------------------------------------------------------------- /models/utils/vgg16.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import torch 4 | import torch.nn as nn 5 | from
torch.autograd import Variable 6 | 7 | from utils.blob import im_list_to_blob 8 | from network import Conv2d 9 | import network 10 | 11 | 12 | class VGG16(nn.Module): 13 | def __init__(self, bn=False): 14 | super(VGG16, self).__init__() 15 | 16 | self.conv1 = nn.Sequential(Conv2d(3, 64, 3, same_padding=True, bn=bn), 17 | Conv2d(64, 64, 3, same_padding=True, bn=bn), 18 | nn.MaxPool2d(2)) 19 | self.conv2 = nn.Sequential(Conv2d(64, 128, 3, same_padding=True, bn=bn), 20 | Conv2d(128, 128, 3, same_padding=True, bn=bn), 21 | nn.MaxPool2d(2)) 22 | network.set_trainable(self.conv1, requires_grad=False) 23 | network.set_trainable(self.conv2, requires_grad=False) 24 | 25 | self.conv3 = nn.Sequential(Conv2d(128, 256, 3, same_padding=True, bn=bn), 26 | Conv2d(256, 256, 3, same_padding=True, bn=bn), 27 | Conv2d(256, 256, 3, same_padding=True, bn=bn), 28 | nn.MaxPool2d(2)) 29 | self.conv4 = nn.Sequential(Conv2d(256, 512, 3, same_padding=True, bn=bn), 30 | Conv2d(512, 512, 3, same_padding=True, bn=bn), 31 | Conv2d(512, 512, 3, same_padding=True, bn=bn), 32 | nn.MaxPool2d(2)) 33 | self.conv5 = nn.Sequential(Conv2d(512, 512, 3, same_padding=True, bn=bn), 34 | Conv2d(512, 512, 3, same_padding=True, bn=bn), 35 | Conv2d(512, 512, 3, same_padding=True, bn=bn)) 36 | 37 | def forward(self, im_data): 38 | # im_data, im_scales = get_blobs(image) 39 | # im_info = np.array( 40 | # [[im_data.shape[1], im_data.shape[2], im_scales[0]]], 41 | # dtype=np.float32) 42 | # data = Variable(torch.from_numpy(im_data)).cuda() 43 | # x = data.permute(0, 3, 1, 2) 44 | 45 | x = self.conv1(im_data) 46 | x = self.conv2(x) 47 | x = self.conv3(x) 48 | x = self.conv4(x) 49 | x = self.conv5(x) 50 | return x 51 | 52 | def load_from_npz(self, params): 53 | # params = np.load(npz_file) 54 | own_dict = self.state_dict() 55 | for name, val in own_dict.items(): 56 | i, j = int(name[4]), int(name[6]) + 1 57 | ptype = 'weights' if name[-1] == 't' else 'biases' 58 | key = 'conv{}_{}/{}:0'.format(i, j, ptype) 59 | param = torch.from_numpy(params[key]) 60 | if ptype == 'weights': 61 | param = param.permute(3, 2, 0, 1) 62 | val.copy_(param) 63 | 64 | def load_from_npy_file(self, fname): 65 | own_dict = self.state_dict() 66 | params = np.load(fname).item() 67 | for name, val in own_dict.items(): 68 | # # print name 69 | # # print val.size() 70 | # # print param.size() 71 | # if name.find('bn.') >= 0: 72 | # continue 73 | 74 | i, j = int(name[4]), int(name[6]) + 1 75 | ptype = 'weights' if name[-1] == 't' else 'biases' 76 | key = 'conv{}_{}'.format(i, j) 77 | param = torch.from_numpy(params[key][ptype]) 78 | 79 | if ptype == 'weights': 80 | param = param.permute(3, 2, 0, 1) 81 | 82 | val.copy_(param) 83 | 84 | 85 | if __name__ == '__main__': 86 | vgg = VGG16() 87 | vgg.load_from_npy_file('/media/longc/Data/models/VGG_imagenet.npy') -------------------------------------------------------------------------------- /options/RPN/RPN_FN.yaml: -------------------------------------------------------------------------------- 1 | # Training Settings 2 | 3 | kmeans_anchors: True 4 | anchor_dir: data/visual_genome # set 5 | 6 | object: 7 | train: 8 | num_box_pre_NMS: 12000 9 | num_box_post_NMS: 2000 10 | nms_thres: 0.7 11 | min_size: 16 12 | dropout_box_runoff_image: False 13 | allowed_border: 128 14 | clobber_positives: False 15 | negative_overlap: 0.35 16 | positive_overlap: 0.5 17 | dontcare_area_intersection_hi: 0.5 18 | fg_fraction: 0.5 19 | batch_size: 512 20 | BBOX_INSIDE_WEIGHTS: [1.0, 1.0, 1.0, 1.0] 21 | POSITIVE_WEIGHT: -1.0 22 | test: 23 | 
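    # The test-time numbers below follow the standard Faster R-CNN RPN filtering
    # pipeline (a sketch of the assumed behaviour): proposals are ranked by
    # objectness score, the top num_box_pre_NMS are kept, NMS is applied at
    # nms_thres, and only the top num_box_post_NMS survivors are forwarded;
    # min_size drops proposals smaller than 16 px on a side at the input scale.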
num_box_pre_NMS: 12000 24 | num_box_post_NMS: 300 25 | nms_thres: 0.6 26 | min_size: 16 27 | dropout_box_runoff_image: False 28 | allowed_border: 128 29 | -------------------------------------------------------------------------------- /options/RPN/RPN_FN_VRD.yaml: -------------------------------------------------------------------------------- 1 | # Training Settings 2 | 3 | kmeans_anchors: True 4 | anchor_dir: data/VRD # set 5 | 6 | object: 7 | train: 8 | num_box_pre_NMS: 12000 9 | num_box_post_NMS: 2000 10 | nms_thres: 0.7 11 | min_size: 16 12 | dropout_box_runoff_image: False 13 | allowed_border: 128 14 | clobber_positives: False 15 | negative_overlap: 0.3 16 | positive_overlap: 0.7 17 | dontcare_area_intersection_hi: 0.5 18 | fg_fraction: 0.5 19 | batch_size: 512 20 | BBOX_INSIDE_WEIGHTS: [1.0, 1.0, 1.0, 1.0] 21 | POSITIVE_WEIGHT: -1.0 22 | test: 23 | num_box_pre_NMS: 12000 24 | num_box_post_NMS: 300 25 | nms_thres: 0.6 26 | min_size: 16 27 | dropout_box_runoff_image: False 28 | allowed_border: 128 29 | 30 | 31 | -------------------------------------------------------------------------------- /options/RPN/RPN_FN_svg.yaml: -------------------------------------------------------------------------------- 1 | # Training Settings 2 | 3 | kmeans_anchors: True 4 | anchor_dir: data/svg # set 5 | 6 | object: 7 | train: 8 | num_box_pre_NMS: 12000 9 | num_box_post_NMS: 2000 10 | nms_thres: 0.7 11 | min_size: 16 12 | dropout_box_runoff_image: False 13 | allowed_border: 128 14 | clobber_positives: False 15 | negative_overlap: 0.3 16 | positive_overlap: 0.7 17 | dontcare_area_intersection_hi: 0.5 18 | fg_fraction: 0.5 19 | batch_size: 512 20 | BBOX_INSIDE_WEIGHTS: [1.0, 1.0, 1.0, 1.0] 21 | POSITIVE_WEIGHT: -1.0 22 | test: 23 | num_box_pre_NMS: 12000 24 | num_box_post_NMS: 300 25 | nms_thres: 0.6 26 | min_size: 16 27 | dropout_box_runoff_image: False 28 | allowed_border: 128 29 | 30 | -------------------------------------------------------------------------------- /options/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yikang-li/FactorizableNet/1e3e1813630d8d647ab0f01bce7782198272e2e8/options/__init__.py -------------------------------------------------------------------------------- /options/config_FN.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | """Fast R-CNN config system. 9 | 10 | This file specifies default config options for Fast R-CNN. You should not 11 | change values in this file. Instead, you should write a config file (in yaml) 12 | and use cfg_from_file(yaml_file) to load it and override the default options. 13 | 14 | Most tools in $ROOT/tools take a --cfg option to specify an override file. 
15 | - See tools/{train,test}_net.py for example code that uses cfg_from_file() 16 | - See experiments/cfgs/*.yml for example YAML config override files 17 | """ 18 | 19 | import os 20 | import os.path as osp 21 | import numpy as np 22 | from time import strftime, localtime 23 | from easydict import EasyDict as edict 24 | 25 | __C = edict() 26 | # Consumers can get config by: 27 | # from fast_rcnn_config import cfg 28 | cfg = __C 29 | 30 | ## Training settings 31 | __C.TRAIN = edict() 32 | __C.TRAIN.BATCH_SIZE = 256 33 | __C.TRAIN.BATCH_SIZE_RELATIONSHIP = 512 34 | __C.TRAIN.BATCH_SIZE_CAPTION = 128 35 | 36 | __C.TRAIN.FG_FRACTION = 0.5 # [pending] higher fraction may be different from the inference case (since we introduce message passing) 37 | __C.TRAIN.FG_FRACTION_RELATIONSHIP = 0.5 38 | __C.TRAIN.FG_FRACTION_CAPTION = 0.5 39 | 40 | # Overlap threshold for a ROI to be considered foreground (if >= FG_THRESH) 41 | __C.TRAIN.FG_THRESH = 0.5 # change to 0.5 from [Feb 2], previously 0.6 42 | __C.TRAIN.FG_THRESH_REGION = 0.5 43 | 44 | # used for assigning weights for each coords (x1, y1, w, h) 45 | __C.TRAIN.BBOX_INSIDE_WEIGHTS = (1.0, 1.0, 1.0, 1.0) 46 | 47 | # Overlap threshold for a ROI to be considered background (class = 0 if 48 | # overlap in [LO, HI)) 49 | __C.TRAIN.BG_THRESH_HI = 0.4 50 | __C.TRAIN.BG_THRESH_LO = 0.0 # in Faster R-CNN by Shaoqing Ren, it is set to 0.1 51 | __C.TRAIN.BG_THRESH_HI_REGION = 0.4 52 | __C.TRAIN.BG_THRESH_LO_REGION = 0.0 53 | 54 | 55 | 56 | # Config for ROI-merging 57 | __C.TRAIN.REGION_NMS_THRES =0.5 58 | __C.TRAIN.CAPTION_COVERAGE_THRES =0.8 59 | 60 | 61 | ## Testing settings 62 | __C.TEST = edict() 63 | __C.TEST.BBOX_NUM = 200 64 | __C.TEST.REGION_NUM = 128 65 | 66 | # Config for ROI-merging 67 | __C.TEST.CAPTION_COVERAGE_THRES =0.8 68 | __C.TEST.REGION_NMS_THRES = 0.5 69 | 70 | 71 | 72 | def get_output_dir(imdb, weights_filename): 73 | """Return the directory where experimental artifacts are placed. 74 | If the directory does not exist, it is created. 75 | 76 | A canonical path is built using the name from an imdb and a network 77 | (if not None). 78 | """ 79 | outdir = osp.abspath(osp.join(__C.ROOT_DIR, 'output', __C.EXP_DIR, imdb.name)) 80 | if weights_filename is not None: 81 | outdir = osp.join(outdir, weights_filename) 82 | if not os.path.exists(outdir): 83 | os.makedirs(outdir) 84 | return outdir 85 | 86 | 87 | def get_log_dir(imdb): 88 | """Return the directory where experimental artifacts are placed. 89 | If the directory does not exist, it is created. 90 | A canonical path is built using the name from an imdb and a network 91 | (if not None). 92 | """ 93 | log_dir = osp.abspath( \ 94 | osp.join(__C.ROOT_DIR, 'logs', __C.LOG_DIR, imdb.name, strftime("%Y-%m-%d-%H-%M-%S", localtime()))) 95 | if not os.path.exists(log_dir): 96 | os.makedirs(log_dir) 97 | return log_dir 98 | 99 | 100 | def _merge_a_into_b(a, b): 101 | """Merge config dictionary a into config dictionary b, clobbering the 102 | options in b whenever they are also specified in a. 103 | """ 104 | if type(a) is not edict: 105 | return 106 | 107 | for k, v in a.iteritems(): 108 | # a must specify keys that are in b 109 | if not b.has_key(k): 110 | raise KeyError('{} is not a valid config key'.format(k)) 111 | 112 | # the types must match, too 113 | old_type = type(b[k]) 114 | if old_type is not type(v): 115 | if isinstance(b[k], np.ndarray): 116 | v = np.array(v, dtype=b[k].dtype) 117 | else: 118 | raise ValueError(('Type mismatch ({} vs. 
{}) ' 119 | 'for config key: {}').format(type(b[k]), 120 | type(v), k)) 121 | 122 | # recursively merge dicts 123 | if type(v) is edict: 124 | try: 125 | _merge_a_into_b(a[k], b[k]) 126 | except: 127 | print('Error under config key: {}'.format(k)) 128 | raise 129 | else: 130 | b[k] = v 131 | 132 | 133 | def cfg_from_file(filename): 134 | """Load a config file and merge it into the default options.""" 135 | import yaml 136 | with open(filename, 'r') as f: 137 | yaml_cfg = edict(yaml.load(f)) 138 | 139 | _merge_a_into_b(yaml_cfg, __C) 140 | 141 | 142 | def cfg_from_list(cfg_list): 143 | """Set config keys via list (e.g., from command line).""" 144 | from ast import literal_eval 145 | assert len(cfg_list) % 2 == 0 146 | for k, v in zip(cfg_list[0::2], cfg_list[1::2]): 147 | key_list = k.split('.') 148 | d = __C 149 | for subkey in key_list[:-1]: 150 | assert d.has_key(subkey) 151 | d = d[subkey] 152 | subkey = key_list[-1] 153 | assert d.has_key(subkey) 154 | try: 155 | value = literal_eval(v) 156 | except: 157 | # handle the case when v is a string literal 158 | value = v 159 | assert type(value) == type(d[subkey]), \ 160 | 'type {} does not match original type {}'.format( 161 | type(value), type(d[subkey])) 162 | d[subkey] = value 163 | -------------------------------------------------------------------------------- /options/data.yaml: -------------------------------------------------------------------------------- 1 | dir: data/visual_genome 2 | dataset_version: top_150_50_new 3 | train: 4 | SCALES: [500, 550, 600, 600, 650, 700] # more probability to 600 5 | MAX_SIZE: 1000 6 | test: 7 | SCALES: [600,] 8 | MAX_SIZE: 1000 9 | -------------------------------------------------------------------------------- /options/data_VRD.yaml: -------------------------------------------------------------------------------- 1 | dir: data/VRD 2 | train: 3 | SCALES: [450, 500, 550, 600, 600, 650, 700, 750]# more probability to 600 4 | MAX_SIZE: 1000 5 | test: 6 | SCALES: [600,] 7 | MAX_SIZE: 1000 8 | -------------------------------------------------------------------------------- /options/data_sVG.yaml: -------------------------------------------------------------------------------- 1 | dir: data/svg 2 | train: 3 | SCALES: [450, 500, 550, 600, 600, 650, 700, 750] # more probability to 600 4 | MAX_SIZE: 1000 5 | test: 6 | SCALES: [600,] 7 | MAX_SIZE: 1000 8 | -------------------------------------------------------------------------------- /options/models/VG-DR-Net.yaml: -------------------------------------------------------------------------------- 1 | logs: 2 | dir_logs: output/ 3 | model_name: FN_VG-DR-Net 4 | data: 5 | dataset: sVG 6 | opts: options/data_sVG.yaml 7 | use_region: false 8 | batch_size: 1 9 | model: 10 | arch: FN_v4s 11 | rpn_opts: options/RPN/RPN_FN_svg.yaml 12 | # feature vector size 13 | dim_hr: 512 14 | dim_ho: 512 15 | dim_hp: 512 16 | pool_size: 5 17 | op: Sum 18 | # for both kernel-based attention and Mutan 19 | dim_mm: 256 20 | activation: relu 21 | # Iters for message passing, 0 means disable that 22 | MPS_iter: 1 23 | geometry: Geometry_Transform_v1 24 | # settings for attention gate 25 | use_bias: True 26 | dropout: 0. 27 | # Settings for inference part 28 | fusion: PI_v5 # PI is short for [Predicate Inference] 29 | # loss_weight 30 | cls_obj: 1. 31 | cls_pred: 2. 
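  # cls_obj / cls_pred (above) and reg_obj (below) presumably scale the terms of the
  # total training loss, roughly L = cls_obj*L_obj_cls + cls_pred*L_pred_cls + reg_obj*L_box_reg
  # (combined in models/HDN_v2/criteria.py); predicate classification gets the largest weight.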
32 | reg_obj: 0.5 33 | 34 | optim: 35 | lr: 0.01 36 | lr_decay: 0.1 37 | lr_decay_epoch: 3 38 | epochs: 15 39 | optimizer: 0 # [0: SGD | 1: Adam | 2: Adagrad] 40 | nesterov: True 41 | weight_decay: 0.00001 42 | momentum: 0.9 43 | clip_gradient: True 44 | -------------------------------------------------------------------------------- /options/models/VG-MSDN.yaml: -------------------------------------------------------------------------------- 1 | logs: 2 | dir_logs: output/ 3 | model_name: FN_VG-MSDN 4 | data: 5 | dataset: visual_genome 6 | opts: options/data.yaml 7 | dataset_option: small # (small | normal | fat) 8 | batch_size: 1 9 | use_region: False 10 | model: 11 | arch: FN_v4 12 | rpn_opts: options/RPN/RPN_FN.yaml 13 | # feature vector size 14 | dim_hr: 512 15 | dim_ho: 512 16 | dim_hp: 512 17 | use_shortcut: True 18 | pool_size: 7 19 | op: Sum 20 | # for both kernel-based attention and Mutan 21 | dim_mm: 256 22 | activation: relu 23 | # Iters for message passing, 0 means disable that 24 | MPS_iter: 1 25 | # settings for attention gate 26 | use_bias: True 27 | dropout: 0. 28 | # Settings for inference part 29 | fusion: PI_v5 # PI is short for [Predicate Inference] 30 | bottleneck: True 31 | # loss_weight 32 | cls_obj: 1. 33 | cls_pred: 2. 34 | reg_obj: 0.5 35 | 36 | optim: 37 | lr: 0.01 38 | lr_decay: 0.1 39 | lr_decay_epoch: 2 40 | epochs: 10 41 | optimizer: 0 # [0: SGD | 1: Adam | 2: Adagrad] 42 | nesterov: True 43 | weight_decay: 0.00001 44 | momentum: 0.9 45 | clip_gradient: True 46 | 47 | -------------------------------------------------------------------------------- /options/models/VRD.yaml: -------------------------------------------------------------------------------- 1 | logs: 2 | dir_logs: output/ 3 | model_name: FN_VRD 4 | data: 5 | dataset: VRD 6 | opts: options/data_VRD.yaml 7 | use_region: false 8 | batch_size: 1 9 | model: 10 | arch: FN_v4s 11 | rpn_opts: options/RPN/RPN_FN_VRD.yaml 12 | # feature vector size 13 | dim_hr: 512 14 | dim_ho: 512 15 | dim_hp: 512 16 | pool_size: 5 17 | op: Sum 18 | # for both kernel-based attention and Mutan 19 | dim_mm: 256 20 | activation: relu 21 | # Iters for message passing, 0 means disable that 22 | MPS_iter: 1 23 | # settings for attention gate 24 | use_bias: True 25 | dropout: 0. 26 | # Settings for inference part 27 | fusion: PI_v5 # PI is short for [Predicate Inference] 28 | bottleneck: True 29 | # loss_weight 30 | cls_obj: 1. 31 | cls_pred: 2. 
32 | reg_obj: 0.5 33 | 34 | optim: 35 | lr: 0.01 36 | lr_decay: 0.1 37 | lr_decay_epoch: 3 38 | epochs: 15 39 | optimizer: 0 # [0: SGD | 1: Adam | 2: Adagrad] 40 | nesterov: True 41 | weight_decay: 0.00001 42 | momentum: 0.9 43 | clip_gradient: True 44 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cython 2 | cffi 3 | opencv-python 4 | scipy 5 | easydict 6 | matplotlib 7 | pyyaml 8 | -------------------------------------------------------------------------------- /scripts/collect_samples.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import os 3 | import os.path as osp 4 | 5 | import argparse 6 | import pdb 7 | 8 | 9 | parser = argparse.ArgumentParser('Options') 10 | 11 | parser.add_argument('--path_files', default='output/graph_top_100/high_recall_cases.txt', type=str, 12 | help='path to a data file') 13 | parser.add_argument('--output_dir', default='output/graph_top_100/high_recall_cases', type=str) 14 | parser.add_argument('--base_dir', default='output/graph_top_100') 15 | 16 | 17 | args = parser.parse_args() 18 | 19 | def main(): 20 | global args 21 | 22 | if osp.isdir(args.output_dir): 23 | shutil.rmtree(args.output_dir) 24 | 25 | os.makedirs(args.output_dir) 26 | 27 | 28 | with open(args.path_files, 'r') as f: 29 | data = f.readlines() 30 | data = [v.strip('\n') for v in data] 31 | for f in data: 32 | try: 33 | shutil.copyfile(osp.join(args.base_dir, f+'.png'), 34 | osp.join(args.output_dir, f+'.png')) 35 | shutil.copyfile(osp.join(args.base_dir, f+'.pdf'), 36 | osp.join(args.output_dir, f+'.pdf')) 37 | except: 38 | continue 39 | print('Done.') 40 | 41 | 42 | 43 | if __name__ == '__main__': 44 | main() -------------------------------------------------------------------------------- /scripts/preprocessing_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import nltk 4 | import json 5 | import enchant 6 | from nltk.corpus import wordnet as wn 7 | 8 | current_dir = os.getcwd() 9 | os.chdir('../') 10 | 11 | 12 | ## Loading data 13 | image_data = json.load(open('image_data.json')) 14 | print('image data length: ' + str(len(image_data))) 15 | relationships_data = json.load(open('relationships.json')) 16 | print('relationship data length: ' + str(len(relationships_data))) 17 | 18 | ## The subject and object should be nouns 19 | en_dict = enchant.Dict("en_US") 20 | nouns = {x.name().split('.', 1)[0] for x in wn.all_synsets('n')} 21 | 22 | 23 | 24 | relationship_count = 0 25 | predicate_dataset = {} 26 | 27 | spelling_error_counter = 0 28 | length_matching_counter = 0 29 | # word_mismatch_counter = 0 30 | 31 | relationships = {} 32 | 33 | for d_id,rs in enumerate(relationships_data): 34 | im_relationships = {} 35 | for r_id,r in enumerate(rs['relationships']): 36 | try: 37 | normalized_predicate = '_'.join([nltk.stem.WordNetLemmatizer().lemmatize(x, 'v') for x in 38 | r['predicate'].strip('.').strip(',').encode('ascii', 'replace').split()]) 39 | normalized_subject = '_'.join([nltk.stem.WordNetLemmatizer().lemmatize(x, 'n') for x in 40 | r['subject']['name'].strip('.').strip(',').encode('ascii', 'replace').split()]) 41 | normalized_object = '_'.join([nltk.stem.WordNetLemmatizer().lemmatize(x, 'n') for x in 42 | r['object']['name'].strip('.').strip(',').encode('ascii', 'replace').split()]) 43 | 44 | if (not
en_dict.check(normalized_predicate.replace('_', '-'))) or \ 45 | (not en_dict.check(normalized_subject.replace('_', '-'))) or \ 46 | (not en_dict.check(normalized_object.replace('_', '-'))): 47 | spelling_error_counter += 1 48 | # print('Wrong spelling({}):{}-{}-{}\n'.format(spelling_error_counter, normalized_subject, normalized_predicate, normalized_object)); 49 | continue 50 | 51 | normalized_predicate = normalized_predicate.lower().replace('-', '_') 52 | normalized_subject = normalized_subject.lower().replace('-', '_') 53 | normalized_object = normalized_object.lower().replace('-', '_') 54 | 55 | if len(normalized_predicate) <= 1 or len(normalized_subject) <=1 or len(normalized_object) <=1: 56 | length_matching_counter += 1 57 | # print('length not matched:{}-{}-{}\n'.format(r['subject']['name'], r['predicate'], r['object']['name'])) 58 | continue 59 | 60 | # if normalized_object not in nouns or normalized_subject not in nouns: 61 | # # print('Subject or Object no in Nouns:{}-{}-{}\n'.format(r['subject']['name'], r['predicate'], r['object']['name'])) 62 | # word_mismatch_counter += 1 63 | # continue 64 | relationship_item = {} 65 | relationship_item['object'] = normalized_object 66 | relationship_item['subject'] = normalized_subject 67 | relationship_item['sub_box'] = \ 68 | (r['subject']['x'], r['subject']['y'], r['subject']['x'] + r['subject']['w'], \ 69 | r['subject']['y'] + r['subject']['h']) 70 | relationship_item['obj_box'] = \ 71 | (r['object']['x'], r['object']['y'], r['object']['x'] + r['object']['w'], \ 72 | r['object']['y'] + r['object']['h']) 73 | relationship_item['predicate'] = normalized_predicate 74 | if 'relationships' not in im_relationships.keys(): 75 | im_relationships['relationships'] = [relationship_item] 76 | else: 77 | im_relationships['relationships'].append(relationship_item) 78 | relationship_count += 1 79 | except Exception as inst: 80 | print inst 81 | print d_id 82 | print r_id 83 | # raw_input("Press Enter to continue...") 84 | print('({}, {}): [{}]-[{}]-[{}]\n'.format(d_id, r_id, r['subject']['name'], r['predicate'], r['object']['name'])) 85 | print('Error: [{}]-[{}]-[{}]\n'.format(r['subject']['name'].encode('ascii', 'replace'), r['predicate'].encode('ascii', 'replace'), r['object']['name'].encode('ascii', 'replace'))) 86 | # raw_input('Press Enter to continue...') 87 | pass 88 | if d_id%5000 == 0: 89 | print(str(d_id) + ' images processed, ' + str(relationship_count) + ' relationships') 90 | 91 | if 'relationships' in im_relationships.keys(): 92 | im_relationships['path'] = str(image_data[d_id]['image_id']) + '.jpg' 93 | im_relationships['width'] = image_data[d_id]['width'] 94 | im_relationships['height'] = image_data[d_id]['height'] 95 | relationships[d_id] = im_relationships 96 | 97 | del relationships_data 98 | print('Currently, we have ' + str(relationship_count) + ' relationship tuples and {} images'.format(len(relationships.keys()))) 99 | print('Spelling error: {}'.format(spelling_error_counter)) 100 | print('Length matching error: {}'.format(length_matching_counter)) 101 | # print('word mismatch error: {}'.format(word_mismatch_counter)) 102 | 103 | if __name__ == "__main__": 104 | def output_annoatation(output_path, relationships): 105 | with open(output_path, 'w') as f: 106 | output_counter = 0 107 | for item_key in relationships: 108 | im_item = relationships[item_key] 109 | f.write('# {}\n'.format(item_key)) # to output the item key 110 | f.write(im_item['path'] + '\n') # to output the image path 111 | 
f.write('{}\n{}\n'.format(im_item['height'], im_item['width'])) # output the height and width 112 | f.write('{}\n'.format(len(im_item['relationships']))) 113 | for r in im_item['relationships']: # output the relationship item [subject]-[predicate]-[object]-[sub_box]-[obj_box] 114 | f.write(r['subject'].replace(' ', '_')) 115 | f.write(' ' + r['predicate'].replace(' ', '_')) 116 | f.write(' ' + r['object'].replace(' ', '_')) 117 | for item in r['sub_box']: 118 | f.write(' ' + str(item)) 119 | for item in r['obj_box']: 120 | f.write(' ' + str(item)) 121 | f.write('\n') 122 | output_counter += 1 123 | if output_counter % 1000 == 0: 124 | print('{}/{} images processed'.format(output_counter, len(relationships.keys()))) 125 | 126 | print('Result output to: {}'.format(output_path)) 127 | 128 | 129 | os.chdir(current_dir) 130 | output_annoatation('output/filtered_relationship.txt', relationships) 131 | 132 | 133 | 134 | os.chdir(current_dir) 135 | -------------------------------------------------------------------------------- /scripts/setup_eval.sh: -------------------------------------------------------------------------------- 1 | cd eval 2 | wget http://www.cs.cmu.edu/~alavie/METEOR/download/meteor-1.5.tar.gz 3 | tar -xzvf meteor-1.5.tar.gz 4 | cp meteor-1.5/meteor-1.5.jar . 5 | mkdir data 6 | cp meteor-1.5/data/paraphrase-en.gz data/ 7 | rm -r meteor-1.5 8 | rm meteor-1.5.tar.gz 9 | cd .. 10 | -------------------------------------------------------------------------------- /scripts/voc_converter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ''' 3 | This file is a tool to parse json file and generate voc format xml file. 4 | ''' 5 | import json 6 | import xml.etree.ElementTree as ET 7 | import cv2 8 | import os 9 | import os.path as osp 10 | 11 | 12 | def main(): 13 | base_data_dir = '/DATA/ykli/workspace/scene_generation/data/VRD' 14 | out_xml_path = osp.join(base_data_dir, "object_xml") 15 | image_dir = osp.join(base_data_dir, 'images', 'sg_test_images') 16 | 17 | if not osp.isdir(out_xml_path): 18 | os.makedirs(out_xml_path) 19 | 20 | annotations = json.load(open(osp.join(base_data_dir, "test.json"))) 21 | classes_object = json.load(open(osp.join(base_data_dir, "objects.json"))) 22 | 23 | #jpg files folder 24 | counter = 0 25 | 26 | for i in range(len(annotations)): 27 | jpg_name = annotations[i]['path'] 28 | xml_file_name = os.path.splitext(jpg_name)[0] + ".xml" 29 | 30 | img_path = osp.join(image_dir, jpg_name) 31 | image = cv2.imread(img_path) 32 | im_height = image.shape[0] 33 | im_width = image.shape[1] 34 | im_ch = image.shape[2] 35 | counter += 1 36 | 37 | #create a xml 38 | out = ET.Element('annotation') 39 | #folder 40 | folder = ET.SubElement(out,"folder") 41 | folder.text = "VOC2007" 42 | #filename 43 | filename = ET.SubElement(out,"filename") 44 | filename.text = jpg_name 45 | #filesource 46 | file_source = ET.SubElement(out,"source") 47 | database = ET.SubElement(file_source,"database") 48 | database.text = "VRD Database" 49 | annotation = ET.SubElement(file_source,"annotation") 50 | annotation.text = "VRD" 51 | image = ET.SubElement(file_source,"image") 52 | image.text = "flickr" 53 | flickid = ET.SubElement(file_source,"flickrid") 54 | flickid.text = "Yikang" 55 | 56 | #file owner 57 | owner = ET.SubElement(out,"owner") 58 | flickid = ET.SubElement(owner,"flickrid") 59 | flickid.text = "Yikang" 60 | name = ET.SubElement(owner,"name") 61 | name.text = "Yikang" 62 | 63 | #file size 64 | file_size = 
ET.SubElement(out,"size") 65 | file_width = ET.SubElement(file_size,"width") 66 | file_width.text = str(im_height) 67 | file_height = ET.SubElement(file_size,"height") 68 | file_height.text = str(im_width) 69 | file_depth = ET.SubElement(file_size,"depth") 70 | file_depth.text = str(im_ch) 71 | 72 | #file segmented 73 | file_segmented = ET.SubElement(out,"segmented") 74 | file_segmented.text = "0" 75 | 76 | for obj in annotations[i]['objects']: 77 | idx = obj['class'] 78 | bbox_x1 = obj['bbox'][0] 79 | bbox_y1 = obj['bbox'][1] 80 | bbox_x2 = obj['bbox'][2] 81 | bbox_y2 = obj['bbox'][3] 82 | #create a car obj 83 | obj = ET.SubElement(out,'object') 84 | obj_name = ET.SubElement(obj,"name") 85 | obj_name.text = classes_object[idx-1] 86 | 87 | obj_pose = ET.SubElement(obj,"pose") 88 | obj_pose.text = "Unspecified" 89 | 90 | obj_truncated = ET.SubElement(obj,"truncated") 91 | obj_truncated.text = "1" 92 | 93 | obj_difficult = ET.SubElement(obj,"difficult") 94 | obj_difficult.text = "0" 95 | 96 | #create boundingbox 97 | bndbox = ET.SubElement(obj,"bndbox") 98 | xmin = ET.SubElement(bndbox,'xmin') 99 | xmin.text = str(bbox_x1) 100 | 101 | ymin = ET.SubElement(bndbox,'ymin') 102 | ymin.text = str(bbox_y1) 103 | 104 | xmax = ET.SubElement(bndbox,'xmax') 105 | xmax.text = str(bbox_x2) 106 | 107 | ymax = ET.SubElement(bndbox,'ymax') 108 | ymax.text = str(bbox_y2) 109 | 110 | out_tree = ET.ElementTree(out) 111 | out_tree.write(out_xml_path + xml_file_name) 112 | 113 | if (i+1) % 100 == 0: 114 | print('{} / {} images processed'.format(i+1, len(annotations))) 115 | 116 | print "Process done" 117 | 118 | 119 | if __name__ == '__main__': 120 | main() 121 | -------------------------------------------------------------------------------- /scripts/voc_converter_vg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | ''' 3 | This file is a tool to parse json file and generate voc format xml file. 
4 | ''' 5 | import json 6 | import xml.etree.ElementTree as ET 7 | import cv2 8 | import os 9 | import os.path as osp 10 | import pdb 11 | 12 | 13 | def main(): 14 | base_data_dir = '/DATA/ykli/workspace/scene_generation/data/visual_genome/vg_cleansing/output/top_150_50_new' 15 | out_xml_path = osp.join(base_data_dir, "object_xml") 16 | 17 | if not osp.isdir(out_xml_path): 18 | os.makedirs(out_xml_path) 19 | 20 | annotations = json.load(open(osp.join(base_data_dir, "test.json"))) 21 | 22 | #jpg files folder 23 | counter = 0 24 | 25 | for i in range(len(annotations)): 26 | jpg_name = annotations[i]['path'] 27 | xml_file_name = os.path.splitext(jpg_name)[0] + ".xml" 28 | im_height = annotations[i]['height'] 29 | im_width = annotations[i]['width'] 30 | im_ch = 3 31 | counter += 1 32 | 33 | #create a xml 34 | out = ET.Element('annotation') 35 | #folder 36 | folder = ET.SubElement(out,"folder") 37 | folder.text = "VOC2007" 38 | #filename 39 | filename = ET.SubElement(out,"filename") 40 | filename.text = jpg_name 41 | #filesource 42 | file_source = ET.SubElement(out,"source") 43 | database = ET.SubElement(file_source,"database") 44 | database.text = "VRD Database" 45 | annotation = ET.SubElement(file_source,"annotation") 46 | annotation.text = "VRD" 47 | image = ET.SubElement(file_source,"image") 48 | image.text = "flickr" 49 | flickid = ET.SubElement(file_source,"flickrid") 50 | flickid.text = "Yikang" 51 | 52 | #file owner 53 | owner = ET.SubElement(out,"owner") 54 | flickid = ET.SubElement(owner,"flickrid") 55 | flickid.text = "Yikang" 56 | name = ET.SubElement(owner,"name") 57 | name.text = "Yikang" 58 | 59 | #file size 60 | file_size = ET.SubElement(out,"size") 61 | file_width = ET.SubElement(file_size,"width") 62 | file_width.text = str(im_width) 63 | file_height = ET.SubElement(file_size,"height") 64 | file_height.text = str(im_height) 65 | file_depth = ET.SubElement(file_size,"depth") 66 | file_depth.text = str(im_ch) 67 | 68 | #file segmented 69 | file_segmented = ET.SubElement(out,"segmented") 70 | file_segmented.text = "0" 71 | 72 | for obj in annotations[i]['objects']: 73 | bbox_x1 = obj['box'][0] 74 | bbox_y1 = obj['box'][1] 75 | bbox_x2 = obj['box'][2] 76 | bbox_y2 = obj['box'][3] 77 | obj_class = obj['class'] 78 | #create an object node 79 | obj = ET.SubElement(out,'object') 80 | obj_name = ET.SubElement(obj,"name") 81 | obj_name.text = obj_class 82 | 83 | obj_pose = ET.SubElement(obj,"pose") 84 | obj_pose.text = "Unspecified" 85 | 86 | obj_truncated = ET.SubElement(obj,"truncated") 87 | obj_truncated.text = "1" 88 | 89 | obj_difficult = ET.SubElement(obj,"difficult") 90 | obj_difficult.text = "0" 91 | 92 | #create boundingbox 93 | bndbox = ET.SubElement(obj,"bndbox") 94 | xmin = ET.SubElement(bndbox,'xmin') 95 | xmin.text = str(bbox_x1) 96 | 97 | ymin = ET.SubElement(bndbox,'ymin') 98 | ymin.text = str(bbox_y1) 99 | 100 | xmax = ET.SubElement(bndbox,'xmax') 101 | xmax.text = str(bbox_x2) 102 | 103 | ymax = ET.SubElement(bndbox,'ymax') 104 | ymax.text = str(bbox_y2) 105 | 106 | out_tree = ET.ElementTree(out) 107 | out_tree.write(osp.join(out_xml_path, xml_file_name)) 108 | 109 | if (i+1) % 100 == 0: 110 | print('{} / {} images processed'.format(i+1, len(annotations))) 111 | 112 | print "Process done" 113 | 114 | 115 | if __name__ == '__main__': 116 | main() 117 | -------------------------------------------------------------------------------- /train.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 |
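# train_FN.py fine-tunes the full Factorizable Net on top of a pre-trained RPN:
# --path_opt picks the model/data config and --rpn points at the RPN weights.
# A hypothetical VRD run (the weight filename is an assumption) would look like:
# CUDA_VISIBLE_DEVICES=0 python train_FN.py --path_opt options/models/VRD.yaml --rpn output/RPN_VRD.h5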
CUDA_VISIBLE_DEVICES=0 python train_FN.py --dataset_option=normal --path_opt options/models/VG-MSDN.yaml --rpn output/RPN.h5 4 | -------------------------------------------------------------------------------- /visualize_graph.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path as osp 3 | import torch 4 | import numpy as np 5 | import random 6 | import numpy.random as npr 7 | import json 8 | import cPickle as pickle 9 | import yaml 10 | import cv2 11 | 12 | from pprint import pprint 13 | 14 | # from faster_rcnn.datasets.factory import get_imdb 15 | import lib.datasets as datasets 16 | from lib.visualize_graph.vis_utils import ground_predictions 17 | from lib.visualize_graph.visualize import viz_scene_graph, draw_scene_graph 18 | 19 | 20 | import argparse 21 | import pdb 22 | 23 | from PIL import Image 24 | 25 | from eval.evaluator import DenseCaptioningEvaluator 26 | 27 | 28 | parser = argparse.ArgumentParser('Options for scene graph visualization') 29 | 30 | parser.add_argument('--path_data_opts', default='options/data_VRD.yaml', type=str, 31 | help='path to a data file') 32 | parser.add_argument('--path_result', default='output/testing_result.pkl', type=str, 33 | help='path to the evaluation result file') 34 | parser.add_argument('--output_dir', default='output/graph_results/VRD', type=str, 35 | help='directory to save the generated scene graphs') 36 | parser.add_argument('--dataset_option', default='small', type=str, 37 | help='dataset option for visual_genome (small | normal | fat)') 38 | parser.add_argument('--dataset', default='VRD', type=str, 39 | help='dataset name (VRD | visual_genome)') 40 | 41 | args = parser.parse_args() 42 | 43 | if args.dataset != 'visual_genome': 44 | args.dataset_option = None 45 | 46 | # def prepare_rel_matrix(relationships, object_num): 47 | # rel_mat = np.zeros() 48 | # for rel in len(relationships): 49 | # rel_mat[rel[0], rel[1]] = rel_cls[i] 50 | # return rel_mat 51 | 52 | 53 | def visualize(): 54 | 55 | global args 56 | print('=========== Visualizing Scene Graph =========') 57 | 58 | 59 | print('Loading dataset...'), 60 | with open(args.path_data_opts, 'r') as handle: 61 | options = yaml.load(handle) 62 | test_set = getattr(datasets, args.dataset)(options, 'test', 63 | dataset_option=args.dataset_option, 64 | use_region=True) 65 | test_loader = torch.utils.data.DataLoader(test_set, batch_size=1, 66 | shuffle=False, num_workers=4, 67 | pin_memory=True, 68 | collate_fn=getattr(datasets, args.dataset).collate) 69 | print('Done Loading') 70 | 71 | with open(args.path_result, 'rb') as f: 72 | print('Loading result....'), 73 | result = pickle.load(f) 74 | print('Done') 75 | print('Total: {} images'.format(len(result))) 76 | 77 | for i, sample in enumerate(test_loader): # (im_data, im_info, gt_objects, gt_relationships) 78 | objects = result[i]['objects'] 79 | relationships = result[i]['relationships'] 80 | gt_boxes = sample['objects'][0][:, :4] / sample['image_info'][0][2] 81 | gt_relations = sample['relations'][0] 82 | gt_relations = zip(*np.where(gt_relations > 0)) 83 | gt_to_pred = ground_predictions(objects['bbox'], gt_boxes, 0.5) 84 | assert sample['path'][0] == result[i]['path'], 'Image mismatch.'
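        # ground_predictions (lib/visualize_graph/vis_utils.py) presumably matches each
        # ground-truth box to a predicted box at IoU >= 0.5 (the threshold passed above),
        # so gt_to_pred maps GT box indices to predicted box indices; draw_graph_pred
        # below only keeps predicted relations whose endpoints ground to annotated objects.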
85 | im = cv2.imread(osp.join(test_set._data_path, sample['path'][0])) 86 | image_name = sample['path'][0].split('/')[-1].split('.')[0] 87 | image_name = osp.join(args.output_dir, image_name) 88 | draw_graph_pred(im, objects['bbox'], objects['class'], relationships, 89 | gt_to_pred, gt_relations, test_set._object_classes, 90 | test_set._predicate_classes, filename=image_name) 91 | 92 | print('Done generating scene graphs.') 93 | 94 | 95 | def draw_graph_pred(im, boxes, obj_ids, pred_relationships, gt_to_pred, 96 | gt_relations, ind_to_class, ind_to_predicate, filename): 97 | """ 98 | Draw a predicted scene graph. To keep the graph interpretable, only draw 99 | the node and edge predictions that have corresponding ground truth 100 | labels. 101 | args: 102 | im: image 103 | boxes: predicted boxes 104 | obj_ids: object id list 105 | pred_relationships: predicted relation triplets 106 | gt_to_pred: a mapping from ground truth box indices to predicted box indices 107 | filename: output path prefix for saving 108 | gt_relations: gt_relationships 109 | """ 110 | rel_pred = [] 111 | all_rels = [] 112 | 113 | for pred_rel in pred_relationships: 114 | for rel in gt_relations: 115 | if rel[0] not in gt_to_pred or rel[1] not in gt_to_pred: 116 | continue 117 | 118 | # discard duplicate grounding 119 | if pred_rel[0] == gt_to_pred[rel[0]] and pred_rel[1] == gt_to_pred[rel[1]]: 120 | rel_pred.append(pred_rel) 121 | all_rels.append([pred_rel[0], pred_rel[1]]) 122 | break 123 | # rel_pred = pred_relationships[:5] # uncomment to visualize top-5 relationships 124 | rel_pred = np.array(rel_pred) 125 | if rel_pred.size < 4: 126 | print('Image Skipped.') 127 | return 128 | # indices of predicted boxes 129 | pred_inds = rel_pred[:, :2].ravel() 130 | 131 | # draw graph predictions 132 | graph_dict = draw_scene_graph(obj_ids, pred_inds, rel_pred, ind_to_class, ind_to_predicate, filename=filename) 133 | viz_scene_graph(im, boxes, obj_ids, ind_to_class, ind_to_predicate, pred_inds, rel_pred, filename=filename) 134 | """ 135 | out_boxes = [] 136 | for box, cls in zip(boxes[pred_inds], cls_pred[pred_inds]): 137 | out_boxes.append(box[cls*4:(cls+1)*4].tolist()) 138 | 139 | graph_dict['boxes'] = out_boxes 140 | 141 | if do_save == 'y': 142 | scipy.misc.imsave('cherry/im_%i.png' % idx, im) 143 | fn = open('cherry/graph_%i.json' % idx, 'w+') 144 | json.dump(graph_dict, fn) 145 | print(idx) 146 | """ 147 | 148 | 149 | if __name__ == '__main__': 150 | visualize() 151 | 152 | 153 | --------------------------------------------------------------------------------
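For reference, here is a minimal, self-contained sketch of the sinusoidal geometry encoding implemented in models/modules/geometry_transform.py above. It is written in current PyTorch style (no torch.autograd.Variable, unlike the repo itself), and the wave_length default of 1000 is an assumption, since the function signature is truncated in this dump:

import numpy as np
import torch

def positional_encoding_sketch(position_mat, dim_output, wave_length=1000):
    # position_mat: [num_keys, num_queries, 4] relative-geometry features
    assert dim_output % 8 == 0, "[dim_output] is expected to be an integer multiple of 8"
    freqs = torch.tensor([np.power(wave_length, 8. * i / dim_output)
                          for i in range(dim_output // 8)], dtype=torch.float32)
    # broadcast each of the 4 geometry channels against dim_output / 8 frequencies
    enc = position_mat.unsqueeze(-1) * 100 / freqs.view(1, 1, 1, -1)
    # sin on the first half of the new channel axis, cos on the second half
    enc = torch.cat([torch.sin(enc), torch.cos(enc)], dim=3)
    return enc.view(enc.size(0), enc.size(1), -1)  # [num_keys, num_queries, dim_output]

position_mat = torch.rand(3, 5, 4)  # made-up geometry features for 3 keys x 5 queries
print(positional_encoding_sketch(position_mat, 256).shape)  # torch.Size([3, 5, 256])

With dim_mm: 256 as in the model YAMLs, Geometry_Transform_v2 then feeds this 256-d encoding through Linear(dim_mm, 1) + ReLU to obtain one non-negative geometry weight per key/query pair, which Relation_Module multiplies into the exponentiated visual attention logits.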