├── lib ├── core │ ├── __init__.py │ ├── test_rel.py │ └── test_engine_rel.py ├── datasets_rel │ ├── __init__.py │ ├── pytorch_misc.py │ ├── dataset_catalog_rel.py │ ├── ap_eval_rel.py │ ├── task_evaluation_vg_and_vrd.py │ ├── roidb_rel.py │ └── task_evaluation_sg.py ├── modeling_rel │ ├── __init__.py │ ├── sparse_targets_rel.py │ ├── generate_rel_proposal_labels.py │ ├── VGG16.py │ ├── rel_pyramid_module.py │ ├── get_dataset_counts_rel.py │ ├── relpn_heads.py │ └── fast_rcnn_heads.py ├── roi_data_rel │ ├── __init__.py │ ├── minibatch_rel.py │ ├── loader_rel.py │ └── fast_rcnn_rel.py ├── utils_rel │ ├── __init__.py │ ├── cython_bbox_rel.pyx │ ├── net_rel.py │ ├── logging_rel.py │ ├── boxes_rel.py │ ├── subprocess_rel.py │ └── training_stats_rel.py ├── make.sh └── setup.py ├── Examples.PNG ├── Loss_illustration.PNG ├── .gitmodules ├── docker └── Dockerfile ├── tools ├── _init_paths.py ├── rename_vrd_with_numbers.py ├── convert_vrd_anno_to_coco_format.py └── test_net_rel.py ├── configs ├── vg │ ├── e2e_faster_rcnn_VGG16_8_epochs_vg_v3_default_node_contrastive_loss_w_so_p_aware_margin_point2_so_weight_point5_no_spt.yaml │ └── e2e_faster_rcnn_X-101-64x4d-FPN_8_epochs_vg_v3_default_node_contrastive_loss_w_so_p_aware_margin_point2_so_weight_point5.yaml ├── vrd │ ├── e2e_faster_rcnn_VGG16_16_epochs_vrd_v3_default_node_contrastive_loss_w_so_p_aware_margin_point2_so_weight_point5_IN_pretrained.yaml │ └── e2e_faster_rcnn_VGG16_16_epochs_vrd_v3_default_node_contrastive_loss_w_so_p_aware_margin_point2_so_weight_point5_COCO_pretrained.yaml ├── oi_rel_mini │ └── e2e_faster_rcnn_X-101-64x4d-FPN_12_epochs_oi_rel_mini_default_node_contrastive_loss_w_so_p_aware_margin_point2_so_weight_point5.yaml └── oi_rel │ └── e2e_faster_rcnn_X-101-64x4d-FPN_12_epochs_oi_rel_default_node_contrastive_loss_w_so_p_aware_margin_point2_so_weight_point5.yaml ├── LICENSE └── README.md /lib/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/datasets_rel/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/modeling_rel/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/roi_data_rel/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/utils_rel/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Examples.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/ContrastiveLosses4VRD/master/Examples.PNG -------------------------------------------------------------------------------- /Loss_illustration.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/ContrastiveLosses4VRD/master/Loss_illustration.PNG -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "Detectron_pytorch"] 2 | path = Detectron_pytorch 3 | 
url = https://github.com/roytseng-tw/Detectron.pytorch 4 | -------------------------------------------------------------------------------- /lib/make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | 4 | CUDA_PATH=/usr/local/cuda/ 5 | 6 | python3 setup.py build_ext --inplace 7 | rm -rf build 8 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:0.4-cuda9-cudnn7-devel 2 | RUN apt-get update --fix-missing 3 | RUN apt-get install -y software-properties-common 4 | RUN apt-get install -y libsm6 libxext6 libxrender1 libfontconfig1 5 | RUN pip install --upgrade pip 6 | RUN pip install Cython matplotlib numpy scipy pyyaml packaging tensorboardX scikit-image pillow tqdm gensim 7 | RUN pip install pycocotools 8 | RUN conda install opencv -------------------------------------------------------------------------------- /tools/_init_paths.py: -------------------------------------------------------------------------------- 1 | # Based on Detectron.pytorch/tools/_init_paths.py by Roy Tseng 2 | # modified for this project by Ji Zhang 3 | 4 | """Add {PROJECT_ROOT}/lib. to PYTHONPATH 5 | 6 | Usage: 7 | import this module before import any modules under lib/ 8 | e.g 9 | import _init_paths 10 | from core.config import cfg 11 | """ 12 | 13 | import os.path as osp 14 | import sys 15 | 16 | 17 | def add_path(path): 18 | if path not in sys.path: 19 | sys.path.insert(0, path) 20 | 21 | this_dir = osp.abspath(osp.dirname(osp.dirname(__file__))) 22 | 23 | # add Detectron.PyTorch/lib 24 | detectron_path = osp.join(this_dir, 'Detectron_pytorch', 'lib') 25 | add_path(detectron_path) 26 | 27 | # Add lib to PYTHONPATH 28 | lib_path = osp.join(this_dir, 'lib') 29 | add_path(lib_path) 30 | 31 | -------------------------------------------------------------------------------- /lib/setup.py: -------------------------------------------------------------------------------- 1 | # Based on: 2 | # Detectron.pytorch/lib/setup.py 3 | # and modified for this project 4 | # Original source license text: 5 | # -------------------------------------------------------- 6 | # Fast R-CNN 7 | # Copyright (c) 2015 Microsoft 8 | # Licensed under The MIT License [see LICENSE for details] 9 | # Written by Ross Girshick 10 | # -------------------------------------------------------- 11 | 12 | from __future__ import print_function 13 | 14 | from Cython.Build import cythonize 15 | from Cython.Distutils import build_ext 16 | from setuptools import Extension 17 | from setuptools import setup 18 | 19 | import numpy as np 20 | 21 | 22 | # Obtain the numpy include directory. This logic works across numpy versions. 
23 | try: 24 | numpy_include = np.get_include() 25 | except AttributeError: 26 | numpy_include = np.get_numpy_include() 27 | 28 | 29 | ext_modules = [ 30 | Extension( 31 | name='utils_rel.cython_bbox_rel', 32 | sources=['utils_rel/cython_bbox_rel.pyx'], 33 | extra_compile_args=['-Wno-cpp'], 34 | include_dirs=[numpy_include] 35 | ) 36 | ] 37 | 38 | setup( 39 | name='mask_rcnn_rel', 40 | ext_modules=cythonize(ext_modules) 41 | ) 42 | 43 | -------------------------------------------------------------------------------- /lib/datasets_rel/pytorch_misc.py: -------------------------------------------------------------------------------- 1 | 2 | # This file is from https://github.com/rowanz/neural-motifs/blob/master/lib/pytorch_misc.py 3 | # Unused imports and functions are deleted 4 | 5 | """ 6 | Miscellaneous functions that might be useful for pytorch 7 | """ 8 | 9 | import numpy as np 10 | 11 | 12 | def intersect_2d(x1, x2): 13 | """ 14 | Given two arrays [m1, n], [m2,n], returns a [m1, m2] array where each entry is True if those 15 | rows match. 16 | :param x1: [m1, n] numpy array 17 | :param x2: [m2, n] numpy array 18 | :return: [m1, m2] bool array of the intersections 19 | """ 20 | if x1.shape[1] != x2.shape[1]: 21 | raise ValueError("Input arrays must have same #columns") 22 | 23 | # This performs a matrix multiplication-esque thing between the two arrays 24 | # Instead of summing, we want the equality, so we reduce in that way 25 | res = (x1[..., None] == x2.T[None, ...]).all(1) 26 | return res 27 | 28 | def argsort_desc(scores): 29 | """ 30 | Returns the indices that sort scores descending in a smart way 31 | :param scores: Numpy array of arbitrary size 32 | :return: an array of size [numel(scores), dim(scores)] where each row is the index you'd 33 | need to get the score. 
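Example (illustrative)::

    >>> argsort_desc(np.array([[0.1, 0.9], [0.5, 0.2]]))
    array([[0, 1],
           [1, 0],
           [1, 1],
           [0, 0]])

i.e. the first row indexes the largest score (0.9) and the last row the smallest (0.1).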
34 | """ 35 | return np.column_stack(np.unravel_index(np.argsort(-scores.ravel()), scores.shape)) -------------------------------------------------------------------------------- /configs/vg/e2e_faster_rcnn_VGG16_8_epochs_vg_v3_default_node_contrastive_loss_w_so_p_aware_margin_point2_so_weight_point5_no_spt.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | TYPE: generalized_rcnn 3 | SUBTYPE: 3 4 | USE_OVLP_FILTER: True 5 | USE_FREQ_BIAS: True 6 | NO_FC7_RELU: True 7 | USE_SPATIAL_FEAT: False 8 | ADD_SO_SCORES: True 9 | ADD_SCORES_ALL: True 10 | USE_BG: True 11 | CONV_BODY: VGG16.VGG16_conv_body 12 | USE_NODE_CONTRASTIVE_LOSS: True 13 | NODE_CONTRASTIVE_MARGIN: 0.2 14 | USE_NODE_CONTRASTIVE_SO_AWARE_LOSS: True 15 | NODE_CONTRASTIVE_SO_AWARE_MARGIN: 0.2 16 | NODE_CONTRASTIVE_SO_AWARE_WEIGHT: 0.5 17 | USE_NODE_CONTRASTIVE_P_AWARE_LOSS: True 18 | NODE_CONTRASTIVE_P_AWARE_MARGIN: 0.2 19 | NODE_CONTRASTIVE_P_AWARE_WEIGHT: 0.1 20 | NODE_SAMPLE_SIZE: 128 21 | FASTER_RCNN: True 22 | NUM_GPUS: 8 23 | SOLVER: 24 | WEIGHT_DECAY: 0.0001 25 | LR_POLICY: steps_with_decay 26 | BASE_LR: 0.01 27 | GAMMA: 0.1 28 | MAX_ITER: 62723 # 62723 images 29 | STEPS: [0, 41815, 55754] 30 | VGG16: 31 | VG_PRETRAINED_WEIGHTS: 'detection_models/vg/VGG16/model_step479999.pth' 32 | VG_PRD_PRETRAINED_WEIGHTS: 'detection_models/vg/VGG16/model_step479999.pth' 33 | FAST_RCNN: 34 | ROI_BOX_HEAD: VGG16.VGG16_roi_conv5_head 35 | ROI_XFORM_METHOD: RoIAlign 36 | RPN: 37 | SIZES: (32, 64, 128, 256, 512) 38 | TRAIN: 39 | SCALES: (800,) 40 | MAX_SIZE: 1333 41 | IMS_PER_BATCH: 1 42 | BATCH_SIZE_PER_IM: 512 43 | TEST: 44 | FORCE_JSON_DATASET_EVAL: True 45 | SCALE: 800 46 | MAX_SIZE: 1333 47 | NMS: 0.5 48 | RPN_PRE_NMS_TOP_N: 6000 49 | RPN_POST_NMS_TOP_N: 1000 50 | -------------------------------------------------------------------------------- /configs/vrd/e2e_faster_rcnn_VGG16_16_epochs_vrd_v3_default_node_contrastive_loss_w_so_p_aware_margin_point2_so_weight_point5_IN_pretrained.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | TYPE: generalized_rcnn 3 | SUBTYPE: 3 4 | USE_OVLP_FILTER: True 5 | USE_FREQ_BIAS: True 6 | NO_FC7_RELU: True 7 | USE_SPATIAL_FEAT: True 8 | ADD_SO_SCORES: True 9 | ADD_SCORES_ALL: True 10 | USE_BG: True 11 | CONV_BODY: VGG16.VGG16_conv_body 12 | USE_NODE_CONTRASTIVE_LOSS: True 13 | NODE_CONTRASTIVE_MARGIN: 0.2 14 | USE_NODE_CONTRASTIVE_SO_AWARE_LOSS: True 15 | NODE_CONTRASTIVE_SO_AWARE_MARGIN: 0.2 16 | NODE_CONTRASTIVE_SO_AWARE_WEIGHT: 0.5 17 | USE_NODE_CONTRASTIVE_P_AWARE_LOSS: True 18 | NODE_CONTRASTIVE_P_AWARE_MARGIN: 0.2 19 | NODE_CONTRASTIVE_P_AWARE_WEIGHT: 0.1 20 | NODE_SAMPLE_SIZE: 128 21 | FASTER_RCNN: True 22 | NUM_GPUS: 8 23 | SOLVER: 24 | WEIGHT_DECAY: 0.0001 25 | LR_POLICY: steps_with_decay 26 | BASE_LR: 0.01 27 | GAMMA: 0.1 28 | MAX_ITER: 7560 # 7560 roidbs 29 | STEPS: [0, 5040, 6720] 30 | VGG16: 31 | VRD_PRETRAINED_WEIGHTS: 'detection_models/vrd/VGG16/IN_pretrained/model_step8999.pth' 32 | VRD_PRD_PRETRAINED_WEIGHTS: 'detection_models/vrd/VGG16/IN_pretrained/model_step8999.pth' 33 | FAST_RCNN: 34 | ROI_BOX_HEAD: VGG16.VGG16_roi_conv5_head 35 | ROI_XFORM_METHOD: RoIAlign 36 | RPN: 37 | SIZES: (32, 64, 128, 256, 512) 38 | TRAIN: 39 | SCALES: (800,) 40 | MAX_SIZE: 1333 41 | IMS_PER_BATCH: 1 42 | BATCH_SIZE_PER_IM: 512 43 | TEST: 44 | FORCE_JSON_DATASET_EVAL: True 45 | SCALE: 800 46 | MAX_SIZE: 1333 47 | NMS: 0.5 48 | RPN_PRE_NMS_TOP_N: 6000 49 | RPN_POST_NMS_TOP_N: 1000 50 | 
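A note on the SOLVER blocks in these training YAMLs: with one image per GPU (IMS_PER_BATCH: 1), MAX_ITER is set to roughly (number of training images * epochs) / (NUM_GPUS * IMS_PER_BATCH), and the two learning-rate decay STEPS sit at about 2/3 and 8/9 of MAX_ITER (see the inline comments such as "62723 images" and "53953 images * 12 / 8 = 80929.5"). A minimal sketch of that arithmetic; the helper below is illustrative and not part of the repository, and the exact rounding differs by an iteration or two in some configs:

def solver_schedule(num_images, epochs, num_gpus=8, ims_per_batch=1):
    # Effective batch size is num_gpus * ims_per_batch images per iteration.
    max_iter = int(round(num_images * epochs / (num_gpus * ims_per_batch)))
    # LR is multiplied by GAMMA (0.1) at roughly 2/3 and 8/9 of training.
    steps = [0, int(round(max_iter * 2.0 / 3.0)), int(round(max_iter * 8.0 / 9.0))]
    return max_iter, steps

# Visual Genome VGG16 config above: 62723 images, 8 epochs, 8 GPUs
# -> (62723, [0, 41815, 55754]), matching its MAX_ITER and STEPS.
print(solver_schedule(62723, 8))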
-------------------------------------------------------------------------------- /configs/vrd/e2e_faster_rcnn_VGG16_16_epochs_vrd_v3_default_node_contrastive_loss_w_so_p_aware_margin_point2_so_weight_point5_COCO_pretrained.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | TYPE: generalized_rcnn 3 | SUBTYPE: 3 4 | USE_OVLP_FILTER: True 5 | USE_FREQ_BIAS: True 6 | NO_FC7_RELU: True 7 | USE_SPATIAL_FEAT: True 8 | ADD_SO_SCORES: True 9 | ADD_SCORES_ALL: True 10 | USE_BG: True 11 | CONV_BODY: VGG16.VGG16_conv_body 12 | USE_NODE_CONTRASTIVE_LOSS: True 13 | NODE_CONTRASTIVE_MARGIN: 0.2 14 | USE_NODE_CONTRASTIVE_SO_AWARE_LOSS: True 15 | NODE_CONTRASTIVE_SO_AWARE_MARGIN: 0.2 16 | NODE_CONTRASTIVE_SO_AWARE_WEIGHT: 0.5 17 | USE_NODE_CONTRASTIVE_P_AWARE_LOSS: True 18 | NODE_CONTRASTIVE_P_AWARE_MARGIN: 0.2 19 | NODE_CONTRASTIVE_P_AWARE_WEIGHT: 0.1 20 | NODE_SAMPLE_SIZE: 128 21 | FASTER_RCNN: True 22 | NUM_GPUS: 8 23 | SOLVER: 24 | WEIGHT_DECAY: 0.0001 25 | LR_POLICY: steps_with_decay 26 | BASE_LR: 0.01 27 | GAMMA: 0.1 28 | MAX_ITER: 7560 # 7560 roidbs 29 | STEPS: [0, 5040, 6720] 30 | VGG16: 31 | VRD_PRETRAINED_WEIGHTS: 'detection_models/vrd/VGG16/COCO_pretrained/model_step4499.pth' 32 | VRD_PRD_PRETRAINED_WEIGHTS: 'detection_models/vrd/VGG16/COCO_pretrained/model_step4499.pth' 33 | FAST_RCNN: 34 | ROI_BOX_HEAD: VGG16.VGG16_roi_conv5_head 35 | ROI_XFORM_METHOD: RoIAlign 36 | RPN: 37 | SIZES: (32, 64, 128, 256, 512) 38 | TRAIN: 39 | SCALES: (800,) 40 | MAX_SIZE: 1333 41 | IMS_PER_BATCH: 1 42 | BATCH_SIZE_PER_IM: 512 43 | TEST: 44 | FORCE_JSON_DATASET_EVAL: True 45 | SCALE: 800 46 | MAX_SIZE: 1333 47 | NMS: 0.5 48 | RPN_PRE_NMS_TOP_N: 6000 49 | RPN_POST_NMS_TOP_N: 1000 50 | -------------------------------------------------------------------------------- /configs/vg/e2e_faster_rcnn_X-101-64x4d-FPN_8_epochs_vg_v3_default_node_contrastive_loss_w_so_p_aware_margin_point2_so_weight_point5.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | TYPE: generalized_rcnn 3 | SUBTYPE: 3 4 | USE_OVLP_FILTER: True 5 | USE_FREQ_BIAS: True 6 | NO_FC7_RELU: True 7 | USE_SPATIAL_FEAT: True 8 | ADD_SO_SCORES: True 9 | ADD_SCORES_ALL: True 10 | USE_BG: True 11 | CONV_BODY: FPN.fpn_ResNet101_conv5_body 12 | USE_NODE_CONTRASTIVE_LOSS: True 13 | NODE_CONTRASTIVE_MARGIN: 0.2 14 | USE_NODE_CONTRASTIVE_SO_AWARE_LOSS: True 15 | NODE_CONTRASTIVE_SO_AWARE_MARGIN: 0.2 16 | NODE_CONTRASTIVE_SO_AWARE_WEIGHT: 0.5 17 | USE_NODE_CONTRASTIVE_P_AWARE_LOSS: True 18 | NODE_CONTRASTIVE_P_AWARE_MARGIN: 0.2 19 | NODE_CONTRASTIVE_P_AWARE_WEIGHT: 0.1 20 | NODE_SAMPLE_SIZE: 128 21 | FASTER_RCNN: True 22 | NUM_GPUS: 8 23 | SOLVER: 24 | WEIGHT_DECAY: 0.0001 25 | LR_POLICY: steps_with_decay 26 | # 1x schedule (note TRAIN.IMS_PER_BATCH: 1) 27 | BASE_LR: 0.01 28 | GAMMA: 0.1 29 | MAX_ITER: 62723 # 62723 images 30 | STEPS: [0, 41815, 55754] 31 | FPN: 32 | FPN_ON: True 33 | MULTILEVEL_ROIS: True 34 | MULTILEVEL_RPN: True 35 | RESNETS: 36 | VG_PRETRAINED_WEIGHTS: 'detection_models/vg/X-101-64x4d-FPN/model_step119999.pth' 37 | VG_PRD_PRETRAINED_WEIGHTS: 'detection_models/vg/X-101-64x4d-FPN/model_step119999.pth' 38 | STRIDE_1X1: False # default True for MSRA; False for C2 or Torch models 39 | TRANS_FUNC: bottleneck_transformation 40 | NUM_GROUPS: 64 41 | WIDTH_PER_GROUP: 4 42 | FAST_RCNN: 43 | ROI_BOX_HEAD: fast_rcnn_heads.roi_2mlp_head 44 | ROI_XFORM_METHOD: RoIAlign 45 | ROI_XFORM_RESOLUTION: 7 46 | ROI_XFORM_SAMPLING_RATIO: 2 47 | TRAIN: 48 | SCALES: (800,) 
49 | MAX_SIZE: 1333 50 | IMS_PER_BATCH: 1 51 | BATCH_SIZE_PER_IM: 512 52 | RPN_PRE_NMS_TOP_N: 2000 # Per FPN level 53 | TEST: 54 | FORCE_JSON_DATASET_EVAL: True 55 | SCALE: 800 56 | MAX_SIZE: 1333 57 | NMS: 0.5 58 | RPN_PRE_NMS_TOP_N: 1000 # Per FPN level 59 | RPN_POST_NMS_TOP_N: 1000 60 | -------------------------------------------------------------------------------- /configs/oi_rel_mini/e2e_faster_rcnn_X-101-64x4d-FPN_12_epochs_oi_rel_mini_default_node_contrastive_loss_w_so_p_aware_margin_point2_so_weight_point5.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | TYPE: generalized_rcnn 3 | SUBTYPE: 3 4 | USE_OVLP_FILTER: True 5 | USE_FREQ_BIAS: True 6 | NO_FC7_RELU: True 7 | USE_SPATIAL_FEAT: True 8 | ADD_SO_SCORES: True 9 | ADD_SCORES_ALL: True 10 | USE_BG: True 11 | CONV_BODY: FPN.fpn_ResNet101_conv5_body 12 | USE_NODE_CONTRASTIVE_LOSS: True 13 | NODE_CONTRASTIVE_MARGIN: 0.2 14 | USE_NODE_CONTRASTIVE_SO_AWARE_LOSS: True 15 | NODE_CONTRASTIVE_SO_AWARE_MARGIN: 0.2 16 | NODE_CONTRASTIVE_SO_AWARE_WEIGHT: 0.5 17 | USE_NODE_CONTRASTIVE_P_AWARE_LOSS: True 18 | NODE_CONTRASTIVE_P_AWARE_MARGIN: 0.2 19 | NODE_CONTRASTIVE_P_AWARE_WEIGHT: 0.1 20 | NODE_SAMPLE_SIZE: 128 21 | FASTER_RCNN: True 22 | NUM_GPUS: 8 23 | SOLVER: 24 | WEIGHT_DECAY: 0.0001 25 | LR_POLICY: steps_with_decay 26 | # 1x schedule (note TRAIN.IMS_PER_BATCH: 1) 27 | BASE_LR: 0.01 28 | GAMMA: 0.1 29 | MAX_ITER: 6750 # 4500 images 30 | STEPS: [0, 4500, 6000] 31 | FPN: 32 | FPN_ON: True 33 | MULTILEVEL_ROIS: True 34 | MULTILEVEL_RPN: True 35 | RESNETS: 36 | OI_REL_PRETRAINED_WEIGHTS: 'detection_models/oi_rel/X-101-64x4d-FPN/model_step599999.pth' 37 | OI_REL_PRD_PRETRAINED_WEIGHTS: 'detection_models/oi_rel/X-101-64x4d-FPN/model_step599999.pth' 38 | STRIDE_1X1: False # default True for MSRA; False for C2 or Torch models 39 | TRANS_FUNC: bottleneck_transformation 40 | NUM_GROUPS: 64 41 | WIDTH_PER_GROUP: 4 42 | FAST_RCNN: 43 | ROI_BOX_HEAD: fast_rcnn_heads.roi_2mlp_head 44 | ROI_XFORM_METHOD: RoIAlign 45 | ROI_XFORM_RESOLUTION: 7 46 | ROI_XFORM_SAMPLING_RATIO: 2 47 | TRAIN: 48 | SCALES: (800,) 49 | MAX_SIZE: 1333 50 | IMS_PER_BATCH: 1 51 | BATCH_SIZE_PER_IM: 512 52 | RPN_PRE_NMS_TOP_N: 2000 # Per FPN level 53 | TEST: 54 | FORCE_JSON_DATASET_EVAL: True 55 | SCALE: 800 56 | MAX_SIZE: 1333 57 | NMS: 0.5 58 | # DETECTIONS_PER_IM: 100 59 | # SCORE_THRESH: 0.05 # sometimes the number of sbj_rois is 0 if SCORE_THRESH is 0.05(default) 60 | RPN_PRE_NMS_TOP_N: 1000 # Per FPN level 61 | RPN_POST_NMS_TOP_N: 1000 62 | PRD_Ks: (1, 10) 63 | -------------------------------------------------------------------------------- /lib/modeling_rel/sparse_targets_rel.py: -------------------------------------------------------------------------------- 1 | """ 2 | Some functions are adapted from Rowan Zellers: 3 | https://github.com/rowanz/neural-motifs 4 | """ 5 | import os 6 | import torch.nn as nn 7 | import torch 8 | from torch.autograd import Variable 9 | import numpy as np 10 | import logging 11 | from six.moves import cPickle as pickle 12 | 13 | from core.config import cfg 14 | from modeling_rel.get_dataset_counts_rel import get_rel_counts 15 | 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | 20 | # This module is adapted from Rowan Zellers: 21 | # https://github.com/rowanz/neural-motifs/blob/master/lib/sparse_targets.py 22 | # Modified for this project 23 | class FrequencyBias(nn.Module): 24 | """ 25 | The goal of this is to provide a simplified way of computing 26 | P(predicate | obj1, obj2, img). 
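The smoothed log-distribution is flattened into an nn.Embedding of shape
(num_objs * num_objs, num_prd_classes + 1), so the row for a (subject, object)
class pair is fetched with the flat index sbj_label * num_objs + obj_label
(see rel_index_with_labels below). Illustrative example: with num_objs = 3,
the pair (sbj=2, obj=1) maps to row 2 * 3 + 1 = 7.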
27 | """ 28 | 29 | def __init__(self, ds_name, eps=1e-3): 30 | super(FrequencyBias, self).__init__() 31 | 32 | if ds_name.find('vg') >= 0: 33 | ds_name = 'vg' 34 | elif ds_name.find('oi') >= 0: 35 | ds_name = 'oi' 36 | elif ds_name.find('vrd') >= 0: 37 | ds_name = 'vrd' 38 | else: 39 | raise NotImplementedError 40 | 41 | if cfg.MODEL.USE_OVLP_FILTER: 42 | must_overlap = True 43 | else: 44 | must_overlap = False 45 | fg_matrix, bg_matrix = get_rel_counts(ds_name, must_overlap=must_overlap) 46 | bg_matrix += 1 47 | fg_matrix[:, :, 0] = bg_matrix 48 | 49 | pred_dist = np.log(fg_matrix / (fg_matrix.sum(2)[:, :, None] + 1e-08) + eps) 50 | 51 | self.num_objs = pred_dist.shape[0] 52 | pred_dist = torch.FloatTensor(pred_dist).view(-1, pred_dist.shape[2]) 53 | 54 | self.rel_baseline = nn.Embedding(pred_dist.size(0), pred_dist.size(1)) 55 | self.rel_baseline.weight.data = pred_dist 56 | 57 | logger.info('Frequency bias tables loaded.') 58 | 59 | def rel_index_with_labels(self, labels): 60 | """ 61 | :param labels: [batch_size, 2] 62 | :return: 63 | """ 64 | return self.rel_baseline(labels[:, 0] * self.num_objs + labels[:, 1]) 65 | -------------------------------------------------------------------------------- /configs/oi_rel/e2e_faster_rcnn_X-101-64x4d-FPN_12_epochs_oi_rel_default_node_contrastive_loss_w_so_p_aware_margin_point2_so_weight_point5.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | TYPE: generalized_rcnn 3 | SUBTYPE: 3 4 | USE_OVLP_FILTER: True 5 | USE_FREQ_BIAS: True 6 | NO_FC7_RELU: True 7 | USE_SPATIAL_FEAT: True 8 | ADD_SO_SCORES: True 9 | ADD_SCORES_ALL: True 10 | USE_BG: True 11 | CONV_BODY: FPN.fpn_ResNet101_conv5_body 12 | USE_NODE_CONTRASTIVE_LOSS: True 13 | NODE_CONTRASTIVE_MARGIN: 0.2 14 | USE_NODE_CONTRASTIVE_SO_AWARE_LOSS: True 15 | NODE_CONTRASTIVE_SO_AWARE_MARGIN: 0.2 16 | NODE_CONTRASTIVE_SO_AWARE_WEIGHT: 0.5 17 | USE_NODE_CONTRASTIVE_P_AWARE_LOSS: True 18 | NODE_CONTRASTIVE_P_AWARE_MARGIN: 0.2 19 | NODE_CONTRASTIVE_P_AWARE_WEIGHT: 0.1 20 | NODE_SAMPLE_SIZE: 128 21 | FASTER_RCNN: True 22 | NUM_GPUS: 8 23 | SOLVER: 24 | WEIGHT_DECAY: 0.0001 25 | LR_POLICY: steps_with_decay 26 | # 1x schedule (note TRAIN.IMS_PER_BATCH: 1) 27 | BASE_LR: 0.01 28 | GAMMA: 0.1 29 | MAX_ITER: 80930 # 53953 images * 12 / 8 = 80929.5 30 | STEPS: [0, 53954, 71937] 31 | FPN: 32 | FPN_ON: True 33 | MULTILEVEL_ROIS: True 34 | MULTILEVEL_RPN: True 35 | RESNETS: 36 | OI_REL_PRETRAINED_WEIGHTS: 'detection_models/oi_rel/X-101-64x4d-FPN/model_step599999.pth' 37 | OI_REL_PRD_PRETRAINED_WEIGHTS: 'detection_models/oi_rel/X-101-64x4d-FPN/model_step599999.pth' 38 | STRIDE_1X1: False # default True for MSRA; False for C2 or Torch models 39 | TRANS_FUNC: bottleneck_transformation 40 | NUM_GROUPS: 64 41 | WIDTH_PER_GROUP: 4 42 | FAST_RCNN: 43 | ROI_BOX_HEAD: fast_rcnn_heads.roi_2mlp_head 44 | ROI_XFORM_METHOD: RoIAlign 45 | ROI_XFORM_RESOLUTION: 7 46 | ROI_XFORM_SAMPLING_RATIO: 2 47 | TRAIN: 48 | SCALES: (800,) 49 | MAX_SIZE: 1333 50 | IMS_PER_BATCH: 1 51 | BATCH_SIZE_PER_IM: 512 52 | RPN_PRE_NMS_TOP_N: 2000 # Per FPN level 53 | TEST: 54 | FORCE_JSON_DATASET_EVAL: True 55 | SCALE: 800 56 | MAX_SIZE: 1333 57 | NMS: 0.5 58 | # DETECTIONS_PER_IM: 100 59 | # SCORE_THRESH: 0.05 # sometimes the number of sbj_rois is 0 if SCORE_THRESH is 0.05(default) 60 | RPN_PRE_NMS_TOP_N: 1000 # Per FPN level 61 | RPN_POST_NMS_TOP_N: 1000 62 | PRD_Ks: (1, 10) 63 | -------------------------------------------------------------------------------- /tools/rename_vrd_with_numbers.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[23]: 5 | 6 | 7 | import json 8 | import numpy as np 9 | import os 10 | from PIL import Image 11 | from tqdm import tqdm 12 | import copy 13 | from shutil import copyfile 14 | 15 | 16 | # take the images from the sg_dataset folder and rename them 17 | # Also converts the gif and png images into jpg 18 | 19 | def process_vrd_split(in_split, out_split): 20 | vrd_dir = 'data/vrd/sg_dataset/sg_' + in_split + '_images/' 21 | new_dir = 'data/vrd/'+ out_split + '_images/' 22 | os.mkdir(new_dir) 23 | 24 | cnt = 1 25 | name_map = {} 26 | for f in tqdm(sorted(os.listdir(vrd_dir))): 27 | # for f in os.listdir(vrd_dir): 28 | ext = f.split('.')[1] 29 | if ext.find('png') >= 0 or ext.find('gif') >= 0: 30 | img = Image.open(vrd_dir + f).convert('RGB') 31 | else: 32 | copyfile(vrd_dir + f, new_dir + '{:012d}'.format(cnt) + '.jpg') 33 | 34 | 35 | if ext.find('gif') >= 0: 36 | img.save(new_dir + '{:012d}'.format(cnt) + '.jpg') 37 | elif ext.find('png') >= 0: 38 | img.save(new_dir + '{:012d}'.format(cnt) + '.jpg') 39 | name_map[f] = cnt 40 | cnt += 1 41 | 42 | print(len(name_map)) 43 | 44 | 45 | # store the filename mappings here 46 | name_map_fname = 'data/vrd/%s_fname_mapping.json' %(out_split) 47 | with open(name_map_fname, 'w') as f: 48 | json.dump(name_map, f, sort_keys=True, indent=4) 49 | f.close() 50 | 51 | # load the original annotations 52 | with open('data/vrd/annotations_' + in_split + '.json', 'r') as f: 53 | vrd_anns = json.load(f) 54 | f.close() 55 | new_anns = {} 56 | for k, v in tqdm(vrd_anns.items()): 57 | # apparently this gif file has been renamed in the original annotations 58 | if k == '4392556686_44d71ff5a0_o.jpg': 59 | k = '4392556686_44d71ff5a0_o.gif' 60 | new_k = '{:012d}'.format(name_map[k]) + '.jpg' 61 | 62 | new_anns[new_k] = v 63 | 64 | 65 | # create the new annotations 66 | with open('data/vrd/new_annotations_' + out_split + '.json', 'w') as outfile: 67 | json.dump(new_anns, outfile) 68 | 69 | 70 | if __name__ == '__main__': 71 | 72 | # using the test split as our val. 
We won't have a true test split for VRD 73 | process_vrd_split('test', 'val') 74 | 75 | process_vrd_split('train', 'train') 76 | -------------------------------------------------------------------------------- /tools/convert_vrd_anno_to_coco_format.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | 5 | 6 | 7 | import json 8 | import numpy as np 9 | from PIL import Image 10 | from tqdm import tqdm 11 | 12 | 13 | 14 | # [ymin, ymax, xmin, xmax] to [x, y, w, h] 15 | def box_transform(box): 16 | x = box[2] 17 | y = box[0] 18 | w = box[3] - box[2] + 1 19 | h = box[1] - box[0] + 1 20 | return [x, y, w, h] 21 | 22 | 23 | 24 | def convert_anno(split): 25 | 26 | with open('data/vrd/new_annotations_' + split + '.json', 'r') as f: 27 | vrd_anns = json.load(f) 28 | 29 | 30 | print(len(vrd_anns)) 31 | 32 | img_dir = 'data/vrd/' + split + '_images/' 33 | new_imgs = [] 34 | new_anns = [] 35 | ann_id = 1 36 | for f, anns in tqdm(vrd_anns.items()): 37 | im_w, im_h = Image.open(img_dir + f).size 38 | image_id = int(f.split('.')[0]) 39 | new_imgs.append(dict(file_name=f, height=im_h, width=im_w, id=image_id)) 40 | # used for duplicate checking 41 | bbox_set = set() 42 | for ann in anns: 43 | # "area" in COCO is the area of segmentation mask, while here it's the area of bbox 44 | # also need to fake a 'iscrowd' which is always 0 45 | s_box = ann['subject']['bbox'] 46 | bbox = box_transform(s_box) 47 | if not tuple(bbox) in bbox_set: 48 | bbox_set.add(tuple(bbox)) 49 | area = bbox[2] * bbox[3] 50 | cat = ann['subject']['category'] 51 | new_anns.append(dict(area=area, bbox=bbox, category_id=cat, id=ann_id, image_id=image_id, iscrowd=0)) 52 | ann_id += 1 53 | 54 | o_box = ann['object']['bbox'] 55 | bbox = box_transform(o_box) 56 | if not tuple(bbox) in bbox_set: 57 | bbox_set.add(tuple(bbox)) 58 | area = bbox[2] * bbox[3] 59 | cat = ann['object']['category'] 60 | new_anns.append(dict(area=area, bbox=bbox, category_id=cat, id=ann_id, image_id=image_id, iscrowd=0)) 61 | ann_id += 1 62 | 63 | with open('data/vrd/objects.json', 'r') as f: 64 | vrd_objs = json.load(f) 65 | 66 | 67 | new_objs = [] 68 | for i, obj in enumerate(vrd_objs): 69 | new_objs.append(dict(id=i, name=obj, supercategory=obj)) 70 | 71 | 72 | new_data = dict(images=new_imgs, annotations=new_anns, categories=new_objs) 73 | 74 | with open('data/vrd/detections_' + split + '.json', 'w') as outfile: 75 | json.dump(new_data, outfile) 76 | 77 | 78 | 79 | if __name__ == '__main__': 80 | convert_anno('train') 81 | convert_anno('val') 82 | 83 | -------------------------------------------------------------------------------- /lib/modeling_rel/generate_rel_proposal_labels.py: -------------------------------------------------------------------------------- 1 | # Adapted from Detectron.pytorch/lib/modeling/generate_proposal_labels.py 2 | # for this project by Ji Zhang, 2019 3 | 4 | from torch import nn 5 | 6 | from core.config import cfg 7 | from datasets_rel import json_dataset_rel 8 | from roi_data_rel.fast_rcnn_rel import add_rel_blobs 9 | 10 | 11 | class GenerateRelProposalLabelsOp(nn.Module): 12 | def __init__(self): 13 | super().__init__() 14 | 15 | def forward(self, sbj_rois, obj_rois, det_rois, roidb, im_info): 16 | 17 | im_scales = im_info.data.numpy()[:, 2] 18 | # For historical consistency with the original Faster R-CNN 19 | # implementation we are *not* filtering crowd proposals. 
20 | # This choice should be investigated in the future (it likely does 21 | # not matter). 22 | # Note: crowd_thresh=0 will ignore _filter_crowd_proposals 23 | json_dataset_rel.add_rel_proposals(roidb, sbj_rois, obj_rois, det_rois, im_scales) 24 | output_blob_names = ['sbj_rois', 'obj_rois', 'rel_rois', 'fg_prd_labels_int32', 'all_prd_labels_int32', 'fg_size'] 25 | if cfg.MODEL.USE_SPATIAL_FEAT: 26 | output_blob_names += ['spt_feat'] 27 | if cfg.MODEL.USE_FREQ_BIAS: 28 | output_blob_names += ['all_sbj_labels_int32'] 29 | output_blob_names += ['all_obj_labels_int32'] 30 | if cfg.MODEL.USE_NODE_CONTRASTIVE_LOSS or cfg.MODEL.USE_NODE_CONTRASTIVE_SO_AWARE_LOSS or cfg.MODEL.USE_NODE_CONTRASTIVE_P_AWARE_LOSS: 31 | output_blob_names += ['binary_labels_sbj_pos_int32', 32 | 'sbj_rois_sbj_pos', 'obj_rois_sbj_pos', 'rel_rois_sbj_pos', 33 | 'spt_feat_sbj_pos', 34 | 'sbj_labels_sbj_pos_int32', 'obj_labels_sbj_pos_int32', 'prd_labels_sbj_pos_int32', 35 | 'sbj_labels_sbj_pos_fg_int32', 'obj_labels_sbj_pos_fg_int32', 36 | 'inds_unique_sbj_pos', 37 | 'inds_reverse_sbj_pos', 38 | 'binary_labels_obj_pos_int32', 39 | 'sbj_rois_obj_pos', 'obj_rois_obj_pos', 'rel_rois_obj_pos', 40 | 'spt_feat_obj_pos', 41 | 'sbj_labels_obj_pos_int32', 'obj_labels_obj_pos_int32', 'prd_labels_obj_pos_int32', 42 | 'sbj_labels_obj_pos_fg_int32', 'obj_labels_obj_pos_fg_int32', 43 | 'inds_unique_obj_pos', 44 | 'inds_reverse_obj_pos'] 45 | blobs = {k: [] for k in output_blob_names} 46 | 47 | add_rel_blobs(blobs, im_scales, roidb) 48 | 49 | return blobs 50 | -------------------------------------------------------------------------------- /lib/utils_rel/cython_bbox_rel.pyx: -------------------------------------------------------------------------------- 1 | # Adapted by Ji Zhang for this project in 2019 2 | # 3 | # Original license text below: 4 | ############################################################################# 5 | # Copyright (c) 2017-present, Facebook, Inc. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | ############################################################################## 19 | # 20 | # Based on: 21 | # -------------------------------------------------------- 22 | # Fast R-CNN 23 | # Copyright (c) 2015 Microsoft 24 | # Licensed under The MIT License [see LICENSE for details] 25 | # Written by Sergey Karayev 26 | # -------------------------------------------------------- 27 | 28 | cimport cython 29 | import numpy as np 30 | cimport numpy as np 31 | 32 | DTYPE = np.float32 33 | ctypedef np.float32_t DTYPE_t 34 | 35 | 36 | @cython.boundscheck(False) 37 | def bbox_pair_overlaps( 38 | np.ndarray[DTYPE_t, ndim=2] boxes1, 39 | np.ndarray[DTYPE_t, ndim=2] boxes2): 40 | """ 41 | Parameters 42 | ---------- 43 | boxes1: (N, 4) ndarray of float 44 | boxes2: (N, 4) ndarray of float 45 | Returns 46 | ------- 47 | overlaps: (N,) ndarray of overlaps between each pair of boxes1 and boxes2 48 | """ 49 | assert boxes1.shape[0] == boxes2.shape[0] 50 | cdef unsigned int N = boxes1.shape[0] 51 | cdef np.ndarray[DTYPE_t, ndim=1] overlaps = np.zeros(N, dtype=DTYPE) 52 | cdef DTYPE_t iw, ih, box_area 53 | cdef DTYPE_t ua 54 | cdef unsigned int n 55 | with nogil: 56 | for n in range(N): 57 | box_area = ( 58 | (boxes2[n, 2] - boxes2[n, 0] + 1) * 59 | (boxes2[n, 3] - boxes2[n, 1] + 1) 60 | ) 61 | iw = ( 62 | min(boxes1[n, 2], boxes2[n, 2]) - 63 | max(boxes1[n, 0], boxes2[n, 0]) + 1 64 | ) 65 | if iw > 0: 66 | ih = ( 67 | min(boxes1[n, 3], boxes2[n, 3]) - 68 | max(boxes1[n, 1], boxes2[n, 1]) + 1 69 | ) 70 | if ih > 0: 71 | ua = float( 72 | (boxes1[n, 2] - boxes1[n, 0] + 1) * 73 | (boxes1[n, 3] - boxes1[n, 1] + 1) + 74 | box_area - iw * ih 75 | ) 76 | overlaps[n] = iw * ih / ua 77 | return overlaps 78 | -------------------------------------------------------------------------------- /lib/modeling_rel/VGG16.py: -------------------------------------------------------------------------------- 1 | # Written by Ji Zhang in 2019 2 | 3 | import os 4 | import numpy as np 5 | import logging 6 | from collections import OrderedDict 7 | 8 | import torch 9 | import torch.nn as nn 10 | import torch.nn.functional as F 11 | 12 | from core.config import cfg 13 | import nn as mynn 14 | import torchvision.models as models 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | # ---------------------------------------------------------------------------- # 20 | # VGG16 architecture 21 | # ---------------------------------------------------------------------------- # 22 | 23 | vgg = models.vgg16() 24 | if cfg.VGG16.IMAGENET_PRETRAINED_WEIGHTS != '': 25 | logger.info("Loading imagenet pretrained weights from %s", cfg.VGG16.IMAGENET_PRETRAINED_WEIGHTS) 26 | state_dict = torch.load(cfg.VGG16.IMAGENET_PRETRAINED_WEIGHTS) 27 | vgg.load_state_dict({k:v for k, v in state_dict.items() if k in vgg.state_dict()}) 28 | 29 | class VGG16_conv_body(nn.Module): 30 | def __init__(self): 31 | super().__init__() 32 | self.num_layers = 16 33 | self.spatial_scale = 1. / 16. # final feature scale wrt. 
original image scale 34 | self.dim_out = 512 35 | 36 | self._init_modules() 37 | 38 | def _init_modules(self): 39 | 40 | # not using the last maxpool layer 41 | self.convs = nn.Sequential(*list(vgg.features._modules.values())[:-1]) 42 | 43 | for layer in range(10): 44 | for p in self.convs[layer].parameters(): p.requires_grad = False 45 | 46 | def forward(self, x): 47 | 48 | return self.convs(x) 49 | 50 | 51 | class VGG16_roi_conv5_head(nn.Module): 52 | def __init__(self, dim_in, roi_xform_func, spatial_scale): 53 | super().__init__() 54 | self.roi_xform = roi_xform_func 55 | self.spatial_scale = spatial_scale 56 | 57 | self.dim_out = 4096 58 | self.dim_roi_out = dim_in # 512 59 | 60 | self._init_modules() 61 | 62 | def _init_modules(self): 63 | 64 | self.heads = nn.Sequential(*list(vgg.classifier._modules.values())[:-1]) 65 | 66 | def forward(self, x, rpn_ret, rois_name='rois', use_relu=True): 67 | x = self.roi_xform( 68 | x, rpn_ret, 69 | blob_rois=rois_name, 70 | method=cfg.FAST_RCNN.ROI_XFORM_METHOD, 71 | resolution=7, 72 | spatial_scale=self.spatial_scale, 73 | sampling_ratio=cfg.FAST_RCNN.ROI_XFORM_SAMPLING_RATIO 74 | ) 75 | 76 | feat = x.view(x.size(0), -1) 77 | 78 | if use_relu: 79 | for layer in list(self.heads.children()): 80 | feat = layer(feat) 81 | else: 82 | # not use the last Drop-out and ReLU in fc7 (keep it the same with Rawan's paper) 83 | for layer in list(self.heads.children())[:-2]: 84 | feat = layer(feat) 85 | 86 | return feat 87 | -------------------------------------------------------------------------------- /lib/utils_rel/net_rel.py: -------------------------------------------------------------------------------- 1 | # Adapted by Ji Zhang in 2019 2 | # 3 | # Based on Detectron.pytorch/lib/utils/net.py written by Roy Tseng 4 | 5 | import logging 6 | import os 7 | import numpy as np 8 | 9 | import torch 10 | import torch.nn.functional as F 11 | from torch.autograd import Variable 12 | 13 | from core.config import cfg 14 | from utils.net import _get_lr_change_ratio 15 | from utils.net import _CorrectMomentum 16 | 17 | logger = logging.getLogger(__name__) 18 | 19 | 20 | def update_learning_rate_att(optimizer, cur_lr, new_lr): 21 | """Update learning rate""" 22 | if cur_lr != new_lr: 23 | ratio = _get_lr_change_ratio(cur_lr, new_lr) 24 | if ratio > cfg.SOLVER.LOG_LR_CHANGE_THRESHOLD: 25 | logger.info('Changing learning rate %.6f -> %.6f', cur_lr, new_lr) 26 | # Update learning rate, note that different parameter may have different learning rate 27 | param_keys = [] 28 | for ind, param_group in enumerate(optimizer.param_groups): 29 | if (ind == 1 or ind == 3) and cfg.SOLVER.BIAS_DOUBLE_LR: # bias params 30 | param_group['lr'] = new_lr * 2 31 | else: 32 | param_group['lr'] = new_lr 33 | if ind <= 1: # backbone params 34 | param_group['lr'] = cfg.SOLVER.BACKBONE_LR_SCALAR * param_group['lr'] # 0.1 * param_group['lr'] 35 | param_keys += param_group['params'] 36 | if cfg.SOLVER.TYPE in ['SGD'] and cfg.SOLVER.SCALE_MOMENTUM and cur_lr > 1e-7 and \ 37 | ratio > cfg.SOLVER.SCALE_MOMENTUM_THRESHOLD: 38 | _CorrectMomentum(optimizer, param_keys, new_lr / cur_lr) 39 | 40 | 41 | def update_learning_rate_rel(optimizer, cur_lr, new_lr): 42 | """Update learning rate""" 43 | if cur_lr != new_lr: 44 | ratio = _get_lr_change_ratio(cur_lr, new_lr) 45 | if ratio > cfg.SOLVER.LOG_LR_CHANGE_THRESHOLD: 46 | logger.info('Changing learning rate %.6f -> %.6f', cur_lr, new_lr) 47 | # Update learning rate, note that different parameter may have different learning rate 48 | param_keys = [] 49 | for 
ind, param_group in enumerate(optimizer.param_groups): 50 | if (ind == 1 or ind == 3) and cfg.SOLVER.BIAS_DOUBLE_LR: # bias params 51 | param_group['lr'] = new_lr * 2 52 | else: 53 | param_group['lr'] = new_lr 54 | if ind <= 1: # backbone params 55 | param_group['lr'] = cfg.SOLVER.BACKBONE_LR_SCALAR * param_group['lr'] # 0.1 * param_group['lr'] 56 | param_keys += param_group['params'] 57 | if cfg.SOLVER.TYPE in ['SGD'] and cfg.SOLVER.SCALE_MOMENTUM and cur_lr > 1e-7 and \ 58 | ratio > cfg.SOLVER.SCALE_MOMENTUM_THRESHOLD: 59 | _CorrectMomentum(optimizer, param_keys, new_lr / cur_lr) 60 | 61 | 62 | def load_ckpt_rel(model, ckpt): 63 | """Load checkpoint""" 64 | 65 | model.load_state_dict(ckpt, strict=False) 66 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Unless otherwise stated, all files are released under the following license: 2 | 3 | * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions 7 | * are met: 8 | * * Redistributions of source code must retain the above copyright 9 | * notice, this list of conditions and the following disclaimer. 10 | * * Redistributions in binary form must reproduce the above copyright 11 | * notice, this list of conditions and the following disclaimer in the 12 | * documentation and/or other materials provided with the distribution. 13 | * * Neither the name of NVIDIA CORPORATION nor the names of its 14 | * contributors may be used to endorse or promote products derived 15 | * from this software without specific prior written permission. 16 | * 17 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 18 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 20 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 21 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 22 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 23 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 24 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 25 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | 29 | 30 | The license for files within Detectron_pytorch git submodule can be found in 31 | Detectron.pytorch/LICENSE. This license also applies to our source files derived from 32 | those in Detectron.pytorch and written by Roy Tseng. Additional information will 33 | be included in the header of all derived source files. 
34 | 35 | Additional code is derived from the Neural-Motifs repository by Rowan Zellers: 36 | https://github.com/rowanz/neural-motifs 37 | The license for this repository is reproduced below: 38 | 39 | MIT License 40 | 41 | Copyright (c) 2018 Rowan Zellers 42 | 43 | Permission is hereby granted, free of charge, to any person obtaining a copy 44 | of this software and associated documentation files (the "Software"), to deal 45 | in the Software without restriction, including without limitation the rights 46 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 47 | copies of the Software, and to permit persons to whom the Software is 48 | furnished to do so, subject to the following conditions: 49 | 50 | The above copyright notice and this permission notice shall be included in all 51 | copies or substantial portions of the Software. 52 | 53 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 54 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 55 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 56 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 57 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 58 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 59 | SOFTWARE. 60 | -------------------------------------------------------------------------------- /lib/modeling_rel/rel_pyramid_module.py: -------------------------------------------------------------------------------- 1 | # Written by Ji Zhang in 2019 2 | 3 | import collections 4 | import numpy as np 5 | import logging 6 | 7 | import torch 8 | import torch.nn as nn 9 | import torch.nn.functional as F 10 | from torch.nn import init 11 | 12 | from core.config import cfg 13 | import utils.net as net_utils 14 | import modeling.ResNet as ResNet 15 | from modeling.generate_anchors import generate_anchors 16 | from modeling.generate_proposals import GenerateProposalsOp 17 | from modeling.collect_and_distribute_fpn_rpn_proposals import CollectAndDistributeFpnRpnProposalsOp 18 | import nn as mynn 19 | 20 | logger = logging.getLogger(__name__) 21 | 22 | 23 | class rel_pyramid_module(nn.Module): 24 | def __init__(self, num_backbone_stages): 25 | super().__init__() 26 | 27 | fpn_dim = cfg.FPN.DIM 28 | self.num_backbone_stages = num_backbone_stages 29 | 30 | self.prd_conv_lateral = nn.ModuleList() 31 | for i in range(self.num_backbone_stages): 32 | if cfg.FPN.USE_GN: 33 | self.prd_conv_lateral.append(nn.Sequential( 34 | nn.Conv2d(fpn_dim, fpn_dim, 1, 1, 0, bias=False), 35 | nn.GroupNorm(net_utils.get_group_gn(fpn_dim), fpn_dim, 36 | eps=cfg.GROUP_NORM.EPSILON))) 37 | else: 38 | self.prd_conv_lateral.append(nn.Conv2d(fpn_dim, fpn_dim, 1, 1, 0)) 39 | 40 | self.posthoc_modules = nn.ModuleList() 41 | for i in range(self.num_backbone_stages): 42 | if cfg.FPN.USE_GN: 43 | self.posthoc_modules.append(nn.Sequential( 44 | nn.Conv2d(fpn_dim, fpn_dim, 3, 1, 1, bias=False), 45 | nn.GroupNorm(net_utils.get_group_gn(fpn_dim), fpn_dim, 46 | eps=cfg.GROUP_NORM.EPSILON))) 47 | else: 48 | self.posthoc_modules.append(nn.Conv2d(fpn_dim, fpn_dim, 3, 1, 1)) 49 | 50 | self._init_weights() 51 | 52 | def _init_weights(self): 53 | for m in self.modules(): 54 | if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear): 55 | mynn.init.XavierFill(m.weight) 56 | if m.bias is not None: 57 | nn.init.constant_(m.bias, 0) 58 | elif isinstance(m, nn.BatchNorm2d): 59 | nn.init.constant_(m.weight, 1) 60 | 
nn.init.constant_(m.bias, 0) 61 | 62 | def forward(self, blob_conv): 63 | # blob_conv is in the order (P5, P4, P3, P2) 64 | rel_lateral_inner_blob = None 65 | rel_lateral_output_blobs = [] 66 | for i in range(self.num_backbone_stages): 67 | if rel_lateral_inner_blob is not None: 68 | bu = F.max_pool2d(rel_lateral_inner_blob, 2, stride=2) 69 | rel_lateral_inner_blob = \ 70 | self.prd_conv_lateral[i](blob_conv[-1 - i]) + bu 71 | else: 72 | rel_lateral_inner_blob = \ 73 | self.prd_conv_lateral[i](blob_conv[-1 - i]) 74 | rel_lateral_output_blobs.append(self.posthoc_modules[i](rel_lateral_inner_blob)) 75 | 76 | # the output is in the order of (P2, P3, P4, P5), we need to recover it back to (P5, P4, P3, P2) 77 | rel_lateral_output_blobs.reverse() 78 | return rel_lateral_output_blobs 79 | -------------------------------------------------------------------------------- /lib/roi_data_rel/minibatch_rel.py: -------------------------------------------------------------------------------- 1 | # Adapted by Ji Zhang in 2019 2 | # 3 | # Based on Detectron.pytorch/lib/roi_data/minibatch.py written by Roy Tseng 4 | 5 | import numpy as np 6 | import cv2 7 | 8 | from core.config import cfg 9 | import utils.blob as blob_utils 10 | import roi_data.rpn 11 | 12 | 13 | def get_minibatch_blob_names(is_training=True): 14 | """Return blob names in the order in which they are read by the data loader. 15 | """ 16 | # data blob: holds a batch of N images, each with 3 channels 17 | blob_names = ['data'] 18 | if cfg.RPN.RPN_ON: 19 | # RPN-only or end-to-end Faster R-CNN 20 | blob_names += roi_data.rpn.get_rpn_blob_names(is_training=is_training) 21 | elif cfg.RETINANET.RETINANET_ON: 22 | raise NotImplementedError 23 | else: 24 | # Fast R-CNN like models trained on precomputed proposals 25 | blob_names += roi_data.fast_rcnn.get_fast_rcnn_blob_names( 26 | is_training=is_training 27 | ) 28 | return blob_names 29 | 30 | 31 | def get_minibatch(roidb): 32 | """Given a roidb, construct a minibatch sampled from it.""" 33 | # We collect blobs from each image onto a list and then concat them into a 34 | # single tensor, hence we initialize each blob to an empty list 35 | blobs = {k: [] for k in get_minibatch_blob_names()} 36 | 37 | # Get the input image blob 38 | im_blob, im_scales = _get_image_blob(roidb) 39 | blobs['data'] = im_blob 40 | if cfg.RPN.RPN_ON: 41 | # RPN-only or end-to-end Faster/Mask R-CNN 42 | valid = roi_data.rpn.add_rpn_blobs(blobs, im_scales, roidb) 43 | elif cfg.RETINANET.RETINANET_ON: 44 | raise NotImplementedError 45 | else: 46 | # Fast R-CNN like models trained on precomputed proposals 47 | valid = roi_data.fast_rcnn.add_fast_rcnn_blobs(blobs, im_scales, roidb) 48 | # add relpn blobs 49 | add_relpn_blobs(blobs, im_scales, roidb) 50 | return blobs, valid 51 | 52 | 53 | def add_relpn_blobs(blobs, im_scales, roidb): 54 | 55 | assert 'roidb' in blobs 56 | valid_keys = ['dataset_name', 57 | 'sbj_gt_boxes', 'sbj_gt_classes', 'obj_gt_boxes', 'obj_gt_classes', 'prd_gt_classes', 58 | 'sbj_gt_overlaps', 'obj_gt_overlaps', 'prd_gt_overlaps', 'pair_to_gt_ind_map', 59 | 'width', 'height'] 60 | for i, e in enumerate(roidb): 61 | for k in valid_keys: 62 | if k in e: 63 | blobs['roidb'][i][k] = e[k] 64 | 65 | # Always return valid=True, since RPN minibatches are valid by design 66 | return True 67 | 68 | 69 | def _get_image_blob(roidb): 70 | """Builds an input blob from the images in the roidb at the specified 71 | scales. 
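Returns a pair (blob, im_scales): blob is a single float array of shape
(num_images, 3, H, W) assembled by im_list_to_blob below, and im_scales holds
the resize factor applied to each image.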
72 | """ 73 | num_images = len(roidb) 74 | # Sample random scales to use for each image in this batch 75 | scale_inds = np.random.randint( 76 | 0, high=len(cfg.TRAIN.SCALES), size=num_images) 77 | processed_ims = [] 78 | im_scales = [] 79 | for i in range(num_images): 80 | im = cv2.imread(roidb[i]['image']) 81 | assert im is not None, \ 82 | 'Failed to read image \'{}\''.format(roidb[i]['image']) 83 | # If NOT using opencv to read in images, uncomment following lines 84 | # if len(im.shape) == 2: 85 | # im = im[:, :, np.newaxis] 86 | # im = np.concatenate((im, im, im), axis=2) 87 | # # flip the channel, since the original one using cv2 88 | # # rgb -> bgr 89 | # im = im[:, :, ::-1] 90 | if roidb[i]['flipped']: 91 | im = im[:, ::-1, :] 92 | target_size = cfg.TRAIN.SCALES[scale_inds[i]] 93 | im, im_scale = blob_utils.prep_im_for_blob( 94 | im, cfg.PIXEL_MEANS, [target_size], cfg.TRAIN.MAX_SIZE) 95 | im_scales.append(im_scale[0]) 96 | processed_ims.append(im[0]) 97 | 98 | # Create a blob to hold the input images [n, c, h, w] 99 | blob = blob_utils.im_list_to_blob(processed_ims) 100 | 101 | return blob, im_scales 102 | -------------------------------------------------------------------------------- /lib/utils_rel/logging_rel.py: -------------------------------------------------------------------------------- 1 | # Adapted by Ji Zhang for this project in 2019 2 | # Based on Detectron.pytorch/lib/utils/logging.py 3 | # Original license text below: 4 | # 5 | ############################################################################ 6 | # Copyright (c) 2017-present, Facebook, Inc. 7 | # 8 | # Licensed under the Apache License, Version 2.0 (the "License"); 9 | # you may not use this file except in compliance with the License. 10 | # You may obtain a copy of the License at 11 | # 12 | # http://www.apache.org/licenses/LICENSE-2.0 13 | # 14 | # Unless required by applicable law or agreed to in writing, software 15 | # distributed under the License is distributed on an "AS IS" BASIS, 16 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | # See the License for the specific language governing permissions and 18 | # limitations under the License. 19 | ############################################################################## 20 | 21 | """Utilities for logging.""" 22 | 23 | from __future__ import absolute_import 24 | from __future__ import division 25 | from __future__ import print_function 26 | from __future__ import unicode_literals 27 | 28 | from collections import deque 29 | from email.mime.text import MIMEText 30 | import json 31 | import logging 32 | import numpy as np 33 | import smtplib 34 | import sys 35 | 36 | from core.config import cfg 37 | 38 | # Print lower precision floating point values than default FLOAT_REPR 39 | # Note! 
Has no use for json encode with C speedups 40 | json.encoder.FLOAT_REPR = lambda o: format(o, '.6f') 41 | 42 | 43 | def log_json_stats(stats, sort_keys=True): 44 | print('json_stats: {:s}'.format(json.dumps(stats, sort_keys=sort_keys))) 45 | 46 | 47 | def log_stats(stats, misc_args): 48 | """Log training statistics to terminal""" 49 | if hasattr(misc_args, 'epoch'): 50 | lines = "[%s][%s][Epoch %d][Iter %d / %d]\n" % ( 51 | misc_args.run_name, misc_args.cfg_filename, 52 | misc_args.epoch, misc_args.step, misc_args.iters_per_epoch) 53 | else: 54 | lines = "[%s][%s][Step %d / %d]\n" % ( 55 | misc_args.run_name, misc_args.cfg_filename, stats['iter'], cfg.SOLVER.MAX_ITER) 56 | 57 | lines += "\t\tloss: %.6f, lr: %.6f backbone_lr: %.6f time: %.6f, eta: %s\n" % ( 58 | stats['loss'], stats['lr'], stats['backbone_lr'], stats['time'], stats['eta'] 59 | ) 60 | if stats['metrics']: 61 | lines += "\t\t" + ", ".join("%s: %.6f" % (k, v) for k, v in stats['metrics'].items()) + "\n" 62 | if stats['head_losses']: 63 | lines += "\t\t" + ", ".join("%s: %.6f" % (k, v) for k, v in stats['head_losses'].items()) + "\n" 64 | print(lines[:-1]) # remove last new line 65 | 66 | 67 | class SmoothedValue(object): 68 | """Track a series of values and provide access to smoothed values over a 69 | window or the global series average. 70 | """ 71 | 72 | def __init__(self, window_size): 73 | self.deque = deque(maxlen=window_size) 74 | self.series = [] 75 | self.total = 0.0 76 | self.count = 0 77 | 78 | def AddValue(self, value): 79 | self.deque.append(value) 80 | self.series.append(value) 81 | self.count += 1 82 | self.total += value 83 | 84 | def GetMedianValue(self): 85 | return np.median(self.deque) 86 | 87 | def GetAverageValue(self): 88 | return np.mean(self.deque) 89 | 90 | def GetGlobalAverageValue(self): 91 | return self.total / self.count 92 | 93 | 94 | def send_email(subject, body, to): 95 | s = smtplib.SMTP('localhost') 96 | mime = MIMEText(body) 97 | mime['Subject'] = subject 98 | mime['To'] = to 99 | s.sendmail('detectron', to, mime.as_string()) 100 | 101 | 102 | def setup_logging(name): 103 | FORMAT = '%(levelname)s %(filename)s:%(lineno)4d: %(message)s' 104 | # Manually clear root loggers to prevent any module that may have called 105 | # logging.basicConfig() from blocking our logging setup 106 | logging.root.handlers = [] 107 | logging.basicConfig(level=logging.INFO, format=FORMAT, stream=sys.stdout) 108 | logger = logging.getLogger(name) 109 | return logger 110 | -------------------------------------------------------------------------------- /lib/modeling_rel/get_dataset_counts_rel.py: -------------------------------------------------------------------------------- 1 | # Some functions are adapted from Rowan Zellers: 2 | # https://github.com/rowanz/neural-motifs 3 | # Get counts of all of the examples in the dataset. 
Used for creating the baseline 4 | # dictionary model 5 | 6 | from __future__ import absolute_import 7 | from __future__ import division 8 | from __future__ import print_function 9 | from __future__ import unicode_literals 10 | 11 | import numpy as np 12 | import json 13 | 14 | import utils.boxes as box_utils 15 | import utils_rel.boxes_rel as box_utils_rel 16 | from core.config import cfg 17 | 18 | from datasets_rel.dataset_catalog_rel import ANN_FN2 19 | from datasets_rel.dataset_catalog_rel import DATASETS 20 | 21 | 22 | # This function is adapted from Rowan Zellers: 23 | # https://github.com/rowanz/neural-motifs/blob/master/lib/get_dataset_counts.py 24 | # Modified for this project 25 | def get_rel_counts(ds_name, must_overlap=True): 26 | """ 27 | Get counts of all of the relations. Used for modeling directly P(rel | o1, o2) 28 | :param train_data: 29 | :param must_overlap: 30 | :return: 31 | """ 32 | 33 | if ds_name.find('vg') >= 0: 34 | with open(DATASETS['vg_train'][ANN_FN2]) as f: 35 | train_data = json.load(f) 36 | elif ds_name.find('oi') >= 0: 37 | with open(DATASETS['oi_rel_train'][ANN_FN2]) as f: 38 | train_data = json.load(f) 39 | elif ds_name.find('vrd') >= 0: 40 | with open(DATASETS['vrd_train'][ANN_FN2]) as f: 41 | train_data = json.load(f) 42 | else: 43 | raise NotImplementedError 44 | 45 | fg_matrix = np.zeros(( 46 | cfg.MODEL.NUM_CLASSES - 1, # not include background 47 | cfg.MODEL.NUM_CLASSES - 1, # not include background 48 | cfg.MODEL.NUM_PRD_CLASSES + 1, # include background 49 | ), dtype=np.int64) 50 | 51 | bg_matrix = np.zeros(( 52 | cfg.MODEL.NUM_CLASSES - 1, # not include background 53 | cfg.MODEL.NUM_CLASSES - 1, # not include background 54 | ), dtype=np.int64) 55 | 56 | for _, im_rels in train_data.items(): 57 | # get all object boxes 58 | gt_box_to_label = {} 59 | for i, rel in enumerate(im_rels): 60 | sbj_box = box_utils_rel.y1y2x1x2_to_x1y1x2y2(rel['subject']['bbox']) 61 | obj_box = box_utils_rel.y1y2x1x2_to_x1y1x2y2(rel['object']['bbox']) 62 | sbj_lbl = rel['subject']['category'] # not include background 63 | obj_lbl = rel['object']['category'] # not include background 64 | prd_lbl = rel['predicate'] # not include background 65 | if tuple(sbj_box) not in gt_box_to_label: 66 | gt_box_to_label[tuple(sbj_box)] = sbj_lbl 67 | if tuple(obj_box) not in gt_box_to_label: 68 | gt_box_to_label[tuple(obj_box)] = obj_lbl 69 | 70 | fg_matrix[sbj_lbl, obj_lbl, prd_lbl + 1] += 1 71 | 72 | if cfg.MODEL.USE_OVLP_FILTER: 73 | if len(gt_box_to_label): 74 | gt_boxes = np.array(list(gt_box_to_label.keys()), dtype=np.int32) 75 | gt_classes = np.array(list(gt_box_to_label.values()), dtype=np.int32) 76 | o1o2_total = gt_classes[np.array( 77 | box_filter(gt_boxes, must_overlap=must_overlap), dtype=int)] 78 | for (o1, o2) in o1o2_total: 79 | bg_matrix[o1, o2] += 1 80 | else: 81 | # consider all pairs of boxes, overlapped or non-overlapped 82 | for b1, l1 in gt_box_to_label.items(): 83 | for b2, l2 in gt_box_to_label.items(): 84 | if b1 == b2: 85 | continue 86 | bg_matrix[l1, l2] += 1 87 | 88 | return fg_matrix, bg_matrix 89 | 90 | 91 | # This function is adapted from Rowan Zellers: 92 | # https://github.com/rowanz/neural-motifs/blob/master/lib/get_dataset_counts.py 93 | # Modified for this project 94 | def box_filter(boxes, must_overlap=False): 95 | """ Only include boxes that overlap as possible relations. 
96 | If no overlapping boxes, use all of them.""" 97 | n_cands = boxes.shape[0] 98 | 99 | overlaps = box_utils.bbox_overlaps(boxes.astype(np.float32), boxes.astype(np.float32)) > 0 100 | np.fill_diagonal(overlaps, 0) 101 | 102 | all_possib = np.ones_like(overlaps, dtype=np.bool) 103 | np.fill_diagonal(all_possib, 0) 104 | 105 | if must_overlap: 106 | possible_boxes = np.column_stack(np.where(overlaps)) 107 | 108 | if possible_boxes.size == 0: 109 | possible_boxes = np.column_stack(np.where(all_possib)) 110 | else: 111 | possible_boxes = np.column_stack(np.where(all_possib)) 112 | return possible_boxes 113 | -------------------------------------------------------------------------------- /lib/datasets_rel/dataset_catalog_rel.py: -------------------------------------------------------------------------------- 1 | # Adapted from Detectron.pytorch/lib/datasets/dataset_catalog.py 2 | # for this project by Ji Zhang,2019 3 | #----------------------------------------------------------------------------- 4 | # Copyright (c) 2017-present, Facebook, Inc. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | ############################################################################## 18 | 19 | """Collection of available datasets.""" 20 | 21 | from __future__ import absolute_import 22 | from __future__ import division 23 | from __future__ import print_function 24 | from __future__ import unicode_literals 25 | 26 | import os 27 | 28 | from core.config import cfg 29 | 30 | # Path to data dir 31 | _DATA_DIR = cfg.DATA_DIR 32 | 33 | # Required dataset entry keys 34 | IM_DIR = 'image_directory' 35 | ANN_FN = 'annotation_file' 36 | ANN_FN2 = 'annotation_file2' 37 | ANN_FN3 = 'predicate_file' 38 | 39 | # Optional dataset entry keys 40 | IM_PREFIX = 'image_prefix' 41 | DEVKIT_DIR = 'devkit_directory' 42 | RAW_DIR = 'raw_dir' 43 | 44 | # Available datasets 45 | DATASETS = { 46 | # OpenImages_v4 rel dataset for relationship task 47 | 'oi_rel_train': { 48 | IM_DIR: 49 | _DATA_DIR + '/openimages_v4/train', 50 | ANN_FN: 51 | _DATA_DIR + '/openimages_v4/rel/detections_train.json', 52 | ANN_FN2: 53 | _DATA_DIR + '/openimages_v4/rel/rel_only_annotations_train.json', 54 | ANN_FN3: 55 | _DATA_DIR + '/openimages_v4/rel/rel_9_predicates.json', 56 | }, 57 | 'oi_rel_train_mini': { 58 | IM_DIR: 59 | _DATA_DIR + '/openimages_v4/train', 60 | ANN_FN: 61 | _DATA_DIR + '/openimages_v4/rel/detections_train.json', 62 | ANN_FN2: 63 | _DATA_DIR + '/openimages_v4/rel/rel_only_annotations_train_mini.json', 64 | ANN_FN3: 65 | _DATA_DIR + '/openimages_v4/rel/rel_9_predicates.json', 66 | }, 67 | 'oi_rel_val': { 68 | IM_DIR: 69 | _DATA_DIR + '/openimages_v4/train', 70 | ANN_FN: 71 | _DATA_DIR + '/openimages_v4/rel/detections_val.json', 72 | ANN_FN2: 73 | _DATA_DIR + '/openimages_v4/rel/rel_only_annotations_val.json', 74 | ANN_FN3: 75 | _DATA_DIR + '/openimages_v4/rel/rel_9_predicates.json', 76 | }, 77 | 'oi_rel_val_mini': { 78 | IM_DIR: 79 | _DATA_DIR + '/openimages_v4/train', 80 | ANN_FN: 81 | 
_DATA_DIR + '/openimages_v4/rel/detections_val.json', 82 | ANN_FN2: 83 | _DATA_DIR + '/openimages_v4/rel/rel_only_annotations_val_mini.json', 84 | ANN_FN3: 85 | _DATA_DIR + '/openimages_v4/rel/rel_9_predicates.json', 86 | }, 87 | # for Kaggle test 88 | 'oi_kaggle_rel_test': { 89 | IM_DIR: 90 | _DATA_DIR + '/openimages_v4/rel/kaggle_test_images/challenge2018_test', 91 | ANN_FN: # pseudo annotation 92 | _DATA_DIR + '/openimages_v4/rel/kaggle_test_images/detections_test.json', 93 | ANN_FN2: 94 | _DATA_DIR + '/openimages_v4/rel/kaggle_test_images/all_rel_only_annotations_test.json', 95 | ANN_FN3: 96 | _DATA_DIR + '/openimages_v4/rel/rel_9_predicates.json', 97 | }, 98 | # VG dataset 99 | 'vg_train': { 100 | IM_DIR: 101 | _DATA_DIR + '/vg/VG_100K', 102 | ANN_FN: 103 | _DATA_DIR + '/vg/detections_train.json', 104 | ANN_FN2: 105 | _DATA_DIR + '/vg/rel_annotations_train.json', 106 | ANN_FN3: 107 | _DATA_DIR + '/vg/predicates.json', 108 | }, 109 | 'vg_val': { 110 | IM_DIR: 111 | _DATA_DIR + '/vg/VG_100K', 112 | ANN_FN: 113 | _DATA_DIR + '/vg/detections_val.json', 114 | ANN_FN2: 115 | _DATA_DIR + '/vg/rel_annotations_val.json', 116 | ANN_FN3: 117 | _DATA_DIR + '/vg/predicates.json', 118 | }, 119 | # VRD dataset 120 | 'vrd_train': { 121 | IM_DIR: 122 | _DATA_DIR + '/vrd/train_images', 123 | ANN_FN: 124 | _DATA_DIR + '/vrd/detections_train.json', 125 | ANN_FN2: 126 | _DATA_DIR + '/vrd/new_annotations_train.json', 127 | ANN_FN3: 128 | _DATA_DIR + '/vrd/predicates.json', 129 | }, 130 | 'vrd_val': { 131 | IM_DIR: 132 | _DATA_DIR + '/vrd/val_images', 133 | ANN_FN: 134 | _DATA_DIR + '/vrd/detections_val.json', 135 | ANN_FN2: 136 | _DATA_DIR + '/vrd/new_annotations_val.json', 137 | ANN_FN3: 138 | _DATA_DIR + '/vrd/predicates.json', 139 | }, 140 | } 141 | -------------------------------------------------------------------------------- /lib/utils_rel/boxes_rel.py: -------------------------------------------------------------------------------- 1 | # Adapted by Ji Zhang in 2019 for this project 2 | # Based on Detectron.pytorch/lib/utils/boxes.py 3 | # 4 | # Original license text below: 5 | # 6 | ############################################################################# 7 | # Copyright (c) 2017-present, Facebook, Inc. 8 | # 9 | # Licensed under the Apache License, Version 2.0 (the "License"); 10 | # you may not use this file except in compliance with the License. 11 | # You may obtain a copy of the License at 12 | # 13 | # http://www.apache.org/licenses/LICENSE-2.0 14 | # 15 | # Unless required by applicable law or agreed to in writing, software 16 | # distributed under the License is distributed on an "AS IS" BASIS, 17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | # See the License for the specific language governing permissions and 19 | # limitations under the License. 20 | ############################################################################## 21 | # 22 | # Based on: 23 | # -------------------------------------------------------- 24 | # Fast/er R-CNN 25 | # Licensed under The MIT License [see LICENSE for details] 26 | # Written by Ross Girshick 27 | # -------------------------------------------------------- 28 | 29 | """Box manipulation functions. The internal Detectron box format is 30 | [x1, y1, x2, y2] where (x1, y1) specify the top-left box corner and (x2, y2) 31 | specify the bottom-right box corner. Boxes from external sources, e.g., 32 | datasets, may be in other formats (such as [x, y, w, h]) and require conversion. 
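For the relationship annotations handled in this project, ground-truth boxes are stored as
[y1, y2, x1, x2] and are converted with y1y2x1x2_to_x1y1x2y2() defined at the end of this
module, e.g. y1y2x1x2_to_x1y1x2y2([y1, y2, x1, x2]) returns [x1, y1, x2, y2].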
33 | 34 | This module uses a convention that may seem strange at first: the width of a box 35 | is computed as x2 - x1 + 1 (likewise for height). The "+ 1" dates back to old 36 | object detection days when the coordinates were integer pixel indices, rather 37 | than floating point coordinates in a subpixel coordinate frame. A box with x2 = 38 | x1 and y2 = y1 was taken to include a single pixel, having a width of 1, and 39 | hence requiring the "+ 1". Now, most datasets will likely provide boxes with 40 | floating point coordinates and the width should be more reasonably computed as 41 | x2 - x1. 42 | 43 | In practice, as long as a model is trained and tested with a consistent 44 | convention either decision seems to be ok (at least in our experience on COCO). 45 | Since we have a long history of training models with the "+ 1" convention, we 46 | are reluctant to change it even if our modern tastes prefer not to use it. 47 | """ 48 | 49 | from __future__ import absolute_import 50 | from __future__ import division 51 | from __future__ import print_function 52 | from __future__ import unicode_literals 53 | 54 | import warnings 55 | import numpy as np 56 | 57 | from core.config import cfg 58 | import utils_rel.cython_bbox_rel as cython_bbox_rel 59 | from utils.boxes import bbox_transform_inv 60 | 61 | 62 | bbox_pair_overlaps = cython_bbox_rel.bbox_pair_overlaps 63 | 64 | 65 | def get_spt_features(boxes1, boxes2, width, height): 66 | boxes_u = boxes_union(boxes1, boxes2) 67 | spt_feat_1 = get_box_feature(boxes1, width, height) 68 | spt_feat_2 = get_box_feature(boxes2, width, height) 69 | spt_feat_12 = get_pair_feature(boxes1, boxes2) 70 | spt_feat_1u = get_pair_feature(boxes1, boxes_u) 71 | spt_feat_u2 = get_pair_feature(boxes_u, boxes2) 72 | return np.hstack((spt_feat_12, spt_feat_1u, spt_feat_u2, spt_feat_1, spt_feat_2)) 73 | 74 | 75 | def get_pair_feature(boxes1, boxes2): 76 | delta_1 = bbox_transform_inv(boxes1, boxes2) 77 | delta_2 = bbox_transform_inv(boxes2, boxes1) 78 | spt_feat = np.hstack((delta_1, delta_2[:, :2])) 79 | return spt_feat 80 | 81 | 82 | def get_box_feature(boxes, width, height): 83 | f1 = boxes[:, 0] / width 84 | f2 = boxes[:, 1] / height 85 | f3 = boxes[:, 2] / width 86 | f4 = boxes[:, 3] / height 87 | f5 = (boxes[:, 2] - boxes[:, 0] + 1) * (boxes[:, 3] - boxes[:, 1] + 1) / (width * height) 88 | return np.vstack((f1, f2, f3, f4, f5)).transpose() 89 | 90 | 91 | def boxes_union(boxes1, boxes2): 92 | assert boxes1.shape == boxes2.shape 93 | xmin = np.minimum(boxes1[:, 0], boxes2[:, 0]) 94 | ymin = np.minimum(boxes1[:, 1], boxes2[:, 1]) 95 | xmax = np.maximum(boxes1[:, 2], boxes2[:, 2]) 96 | ymax = np.maximum(boxes1[:, 3], boxes2[:, 3]) 97 | return np.vstack((xmin, ymin, xmax, ymax)).transpose() 98 | 99 | 100 | def rois_union(rois1, rois2): 101 | assert (rois1[:, 0] == rois2[:, 0]).all() 102 | xmin = np.minimum(rois1[:, 1], rois2[:, 1]) 103 | ymin = np.minimum(rois1[:, 2], rois2[:, 2]) 104 | xmax = np.maximum(rois1[:, 3], rois2[:, 3]) 105 | ymax = np.maximum(rois1[:, 4], rois2[:, 4]) 106 | return np.vstack((rois1[:, 0], xmin, ymin, xmax, ymax)).transpose() 107 | 108 | 109 | def boxes_intersect(boxes1, boxes2): 110 | assert boxes1.shape == boxes2.shape 111 | xmin = np.maximum(boxes1[:, 0], boxes2[:, 0]) 112 | ymin = np.maximum(boxes1[:, 1], boxes2[:, 1]) 113 | xmax = np.minimum(boxes1[:, 2], boxes2[:, 2]) 114 | ymax = np.minimum(boxes1[:, 3], boxes2[:, 3]) 115 | return np.vstack((xmin, ymin, xmax, ymax)).transpose() 116 | 117 | 118 | def rois_intersect(rois1, rois2): 119 | 
assert (rois1[:, 0] == rois2[:, 0]).all() 120 | xmin = np.maximum(rois1[:, 1], rois2[:, 1]) 121 | ymin = np.maximum(rois1[:, 2], rois2[:, 2]) 122 | xmax = np.minimum(rois1[:, 3], rois2[:, 3]) 123 | ymax = np.minimum(rois1[:, 4], rois2[:, 4]) 124 | return np.vstack((rois1[:, 0], xmin, ymin, xmax, ymax)).transpose() 125 | 126 | 127 | def y1y2x1x2_to_x1y1x2y2(y1y2x1x2): 128 | x1 = y1y2x1x2[2] 129 | y1 = y1y2x1x2[0] 130 | x2 = y1y2x1x2[3] 131 | y2 = y1y2x1x2[1] 132 | return [x1, y1, x2, y2] 133 | -------------------------------------------------------------------------------- /lib/utils_rel/subprocess_rel.py: -------------------------------------------------------------------------------- 1 | # Adapted by Ji Zhang in 2019 2 | # Based on Detectron.pytorch/lib/utils/subprocess.py 3 | # Original license text below: 4 | # 5 | ############################################################################# 6 | # 7 | # Copyright (c) 2017-present, Facebook, Inc. 8 | # 9 | # Licensed under the Apache License, Version 2.0 (the "License"); 10 | # you may not use this file except in compliance with the License. 11 | # You may obtain a copy of the License at 12 | # 13 | # http://www.apache.org/licenses/LICENSE-2.0 14 | # 15 | # Unless required by applicable law or agreed to in writing, software 16 | # distributed under the License is distributed on an "AS IS" BASIS, 17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | # See the License for the specific language governing permissions and 19 | # limitations under the License. 20 | ############################################################################## 21 | 22 | """Primitives for running multiple single-GPU jobs in parallel over subranges of 23 | data. These are used for running multi-GPU inference. Subprocesses are used to 24 | avoid the GIL since inference may involve non-trivial amounts of Python code. 25 | """ 26 | 27 | from io import IOBase 28 | import logging 29 | import os 30 | import subprocess 31 | from six.moves import shlex_quote 32 | from six.moves import cPickle as pickle 33 | import yaml 34 | import numpy as np 35 | import torch 36 | 37 | from core.config import cfg 38 | 39 | logger = logging.getLogger(__name__) 40 | 41 | 42 | def process_in_parallel( 43 | tag, total_range_size, binary, output_dir, 44 | load_ckpt, load_detectron, opts=''): 45 | """Run the specified binary NUM_GPUS times in parallel, each time as a 46 | subprocess that uses one GPU. The binary must accept the command line 47 | arguments `--range {start} {end}` that specify a data processing range. 
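    For illustration only (the binary name, range and paths below are placeholders), each
    subprocess is pinned to a single GPU via CUDA_VISIBLE_DEVICES and launched with a
    command of roughly this shape:

        python3 tools/test_net_rel.py --range 0 2500 \
            --cfg <output_dir>/<tag>_range_config.yaml --set <opts> \
            --output_dir <output_dir> --load_ckpt <checkpoint.pth>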
48 | """ 49 | # Snapshot the current cfg state in order to pass to the inference 50 | # subprocesses 51 | cfg_file = os.path.join(output_dir, '{}_range_config.yaml'.format(tag)) 52 | with open(cfg_file, 'w') as f: 53 | yaml.dump(cfg, stream=f) 54 | subprocess_env = os.environ.copy() 55 | processes = [] 56 | NUM_GPUS = torch.cuda.device_count() 57 | subinds = np.array_split(range(total_range_size), NUM_GPUS) 58 | # Determine GPUs to use 59 | cuda_visible_devices = os.environ.get('CUDA_VISIBLE_DEVICES') 60 | if cuda_visible_devices: 61 | gpu_inds = list(map(int, cuda_visible_devices.split(','))) 62 | assert -1 not in gpu_inds, \ 63 | 'Hiding GPU indices using the \'-1\' index is not supported' 64 | else: 65 | gpu_inds = range(cfg.NUM_GPUS) 66 | gpu_inds = list(gpu_inds) 67 | # Run the binary in cfg.NUM_GPUS subprocesses 68 | for i, gpu_ind in enumerate(gpu_inds): 69 | start = subinds[i][0] 70 | end = subinds[i][-1] + 1 71 | subprocess_env['CUDA_VISIBLE_DEVICES'] = str(gpu_ind) 72 | cmd = ('python3 {binary} --range {start} {end} --cfg {cfg_file} --set {opts} ' 73 | '--output_dir {output_dir}') 74 | if load_ckpt is not None: 75 | cmd += ' --load_ckpt {load_ckpt}' 76 | elif load_detectron is not None: 77 | cmd += ' --load_detectron {load_detectron}' 78 | cmd = cmd.format( 79 | binary=shlex_quote(binary), 80 | start=int(start), 81 | end=int(end), 82 | cfg_file=shlex_quote(cfg_file), 83 | output_dir=output_dir, 84 | load_ckpt=load_ckpt, 85 | load_detectron=load_detectron, 86 | opts=' '.join([shlex_quote(opt) for opt in opts]) 87 | ) 88 | logger.info('{} range command {}: {}'.format(tag, i, cmd)) 89 | if i == 0: 90 | subprocess_stdout = subprocess.PIPE 91 | else: 92 | filename = os.path.join( 93 | output_dir, '%s_range_%s_%s.stdout' % (tag, start, end) 94 | ) 95 | subprocess_stdout = open(filename, 'w') 96 | p = subprocess.Popen( 97 | cmd, 98 | shell=True, 99 | env=subprocess_env, 100 | stdout=subprocess_stdout, 101 | stderr=subprocess.STDOUT, 102 | bufsize=1 103 | ) 104 | processes.append((i, p, start, end, subprocess_stdout)) 105 | # Log output from inference processes and collate their results 106 | outputs = [] 107 | for i, p, start, end, subprocess_stdout in processes: 108 | log_subprocess_output(i, p, output_dir, tag, start, end) 109 | if isinstance(subprocess_stdout, IOBase): 110 | subprocess_stdout.close() 111 | range_file = os.path.join( 112 | output_dir, '%s_range_%s_%s.pkl' % (tag, start, end) 113 | ) 114 | range_data = pickle.load(open(range_file, 'rb')) 115 | outputs.append(range_data) 116 | return outputs 117 | 118 | 119 | def log_subprocess_output(i, p, output_dir, tag, start, end): 120 | """Capture the output of each subprocess and log it in the parent process. 121 | The first subprocess's output is logged in realtime. The output from the 122 | other subprocesses is buffered and then printed all at once (in order) when 123 | subprocesses finish. 
124 | """ 125 | outfile = os.path.join( 126 | output_dir, '%s_range_%s_%s.stdout' % (tag, start, end) 127 | ) 128 | logger.info('# ' + '-' * 76 + ' #') 129 | logger.info( 130 | 'stdout of subprocess %s with range [%s, %s]' % (i, start + 1, end) 131 | ) 132 | logger.info('# ' + '-' * 76 + ' #') 133 | if i == 0: 134 | # Stream the piped stdout from the first subprocess in realtime 135 | with open(outfile, 'w') as f: 136 | for line in iter(p.stdout.readline, b''): 137 | print(line.rstrip().decode('ascii')) 138 | f.write(str(line, encoding='ascii')) 139 | p.stdout.close() 140 | ret = p.wait() 141 | else: 142 | # For subprocesses >= 1, wait and dump their log file 143 | ret = p.wait() 144 | with open(outfile, 'r') as f: 145 | print(''.join(f.readlines())) 146 | assert ret == 0, 'Range subprocess failed (exit code: {})'.format(ret) 147 | -------------------------------------------------------------------------------- /lib/modeling_rel/relpn_heads.py: -------------------------------------------------------------------------------- 1 | # Written by Ji Zhang in 2019 2 | 3 | import numpy as np 4 | from numpy import linalg as la 5 | import json 6 | import logging 7 | 8 | from torch import nn 9 | from torch.nn import init 10 | import torch.nn.functional as F 11 | 12 | from core.config import cfg 13 | from modeling_rel.generate_rel_proposal_labels import GenerateRelProposalLabelsOp 14 | import modeling.FPN as FPN 15 | import utils_rel.boxes_rel as box_utils_rel 16 | import utils.fpn as fpn_utils 17 | 18 | 19 | logger = logging.getLogger(__name__) 20 | 21 | 22 | def generic_relpn_outputs(): 23 | return single_scale_relpn_outputs() 24 | 25 | 26 | class single_scale_relpn_outputs(nn.Module): 27 | """Add RelPN outputs to a single scale model (i.e., no FPN).""" 28 | def __init__(self): 29 | super().__init__() 30 | 31 | self.RelPN_GenerateProposalLabels = GenerateRelProposalLabelsOp() 32 | ds_name = cfg.TRAIN.DATASETS[0] if len(cfg.TRAIN.DATASETS) else cfg.TEST.DATASETS[0] 33 | 34 | def get_roi_inds(self, det_labels, lbls): 35 | lbl_set = np.array(lbls) 36 | inds = np.where(np.isin(det_labels, lbl_set))[0] 37 | return inds 38 | 39 | def remove_self_pairs(self, det_size, sbj_inds, obj_inds): 40 | mask = np.ones(sbj_inds.shape[0], dtype=bool) 41 | for i in range(det_size): 42 | mask[i + det_size * i] = False 43 | keeps = np.where(mask)[0] 44 | sbj_inds = sbj_inds[keeps] 45 | obj_inds = obj_inds[keeps] 46 | return sbj_inds, obj_inds 47 | 48 | def forward(self, det_rois, det_labels, det_scores, im_info, dataset_name, roidb=None): 49 | """ 50 | det_rois: feature maps from the backbone network. 
(Variable) 51 | im_info: (CPU Variable) 52 | roidb: (list of ndarray) 53 | """ 54 | 55 | # Get pairwise proposals first 56 | if roidb is not None: 57 | # we always feed one image per batch during training 58 | assert len(roidb) == 1 59 | 60 | sbj_inds = np.repeat(np.arange(det_rois.shape[0]), det_rois.shape[0]) 61 | obj_inds = np.tile(np.arange(det_rois.shape[0]), det_rois.shape[0]) 62 | # remove self paired rois 63 | if det_rois.shape[0] > 1: # no pairs to remove when there is at most one detection 64 | sbj_inds, obj_inds = self.remove_self_pairs(det_rois.shape[0], sbj_inds, obj_inds) 65 | sbj_rois = det_rois[sbj_inds] 66 | obj_rois = det_rois[obj_inds] 67 | 68 | im_scale = im_info.data.numpy()[:, 2][0] 69 | sbj_boxes = sbj_rois[:, 1:] / im_scale 70 | obj_boxes = obj_rois[:, 1:] / im_scale 71 | # filters out those roi pairs whose boxes are not overlapping in the original scales 72 | if cfg.MODEL.USE_OVLP_FILTER: 73 | ovlp_so = box_utils_rel.bbox_pair_overlaps( 74 | sbj_boxes.astype(dtype=np.float32, copy=False), 75 | obj_boxes.astype(dtype=np.float32, copy=False)) 76 | ovlp_inds = np.where(ovlp_so > 0)[0] 77 | sbj_inds = sbj_inds[ovlp_inds] 78 | obj_inds = obj_inds[ovlp_inds] 79 | sbj_rois = sbj_rois[ovlp_inds] 80 | obj_rois = obj_rois[ovlp_inds] 81 | sbj_boxes = sbj_boxes[ovlp_inds] 82 | obj_boxes = obj_boxes[ovlp_inds] 83 | 84 | return_dict = {} 85 | if self.training: 86 | # Add binary relationships 87 | blobs_out = self.RelPN_GenerateProposalLabels(sbj_rois, obj_rois, det_rois, roidb, im_info) 88 | return_dict.update(blobs_out) 89 | else: 90 | sbj_labels = det_labels[sbj_inds] 91 | obj_labels = det_labels[obj_inds] 92 | sbj_scores = det_scores[sbj_inds] 93 | obj_scores = det_scores[obj_inds] 94 | rel_rois = box_utils_rel.rois_union(sbj_rois, obj_rois) 95 | return_dict['det_rois'] = det_rois 96 | return_dict['sbj_inds'] = sbj_inds 97 | return_dict['obj_inds'] = obj_inds 98 | return_dict['sbj_rois'] = sbj_rois 99 | return_dict['obj_rois'] = obj_rois 100 | return_dict['rel_rois'] = rel_rois 101 | return_dict['sbj_labels'] = sbj_labels 102 | return_dict['obj_labels'] = obj_labels 103 | return_dict['sbj_scores'] = sbj_scores 104 | return_dict['obj_scores'] = obj_scores 105 | return_dict['fg_size'] = np.array([sbj_rois.shape[0]], dtype=np.int32) 106 | 107 | im_scale = im_info.data.numpy()[:, 2][0] 108 | im_w = im_info.data.numpy()[:, 1][0] 109 | im_h = im_info.data.numpy()[:, 0][0] 110 | if cfg.MODEL.USE_SPATIAL_FEAT: 111 | spt_feat = box_utils_rel.get_spt_features(sbj_boxes, obj_boxes, im_w, im_h) 112 | return_dict['spt_feat'] = spt_feat 113 | if cfg.MODEL.USE_FREQ_BIAS or cfg.MODEL.RUN_BASELINE: 114 | return_dict['all_sbj_labels_int32'] = sbj_labels.astype(np.int32, copy=False) - 1 # det_labels start from 1 115 | return_dict['all_obj_labels_int32'] = obj_labels.astype(np.int32, copy=False) - 1 # det_labels start from 1 116 | if cfg.FPN.FPN_ON and cfg.FPN.MULTILEVEL_ROIS: 117 | lvl_min = cfg.FPN.ROI_MIN_LEVEL 118 | lvl_max = cfg.FPN.ROI_MAX_LEVEL 119 | # when use min_rel_area, the same sbj/obj area could be mapped to different feature levels 120 | # when they are associated with different relationships 121 | # Thus we cannot get det_rois features then gather sbj/obj features 122 | # The only way is gather sbj/obj per relationship, thus need to return sbj_rois/obj_rois 123 | rois_blob_names = ['det_rois', 'rel_rois'] 124 | for rois_blob_name in rois_blob_names: 125 | # Add per FPN level roi blobs named like: _fpn 126 | target_lvls = fpn_utils.map_rois_to_fpn_levels( 127 | 
return_dict[rois_blob_name][:, 1:5], lvl_min, lvl_max) 128 | fpn_utils.add_multilevel_roi_blobs( 129 | return_dict, rois_blob_name, return_dict[rois_blob_name], target_lvls, 130 | lvl_min, lvl_max) 131 | 132 | return return_dict 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | -------------------------------------------------------------------------------- /tools/test_net_rel.py: -------------------------------------------------------------------------------- 1 | # Adapted by Ji Zhang, 2019 2 | # 3 | # Based on Detectron.pytorch/tools/test_net.py Written by Roy Tseng 4 | 5 | """Perform inference on one or more datasets.""" 6 | 7 | import argparse 8 | import cv2 9 | import os 10 | import pprint 11 | import sys 12 | import time 13 | from six.moves import cPickle as pickle 14 | 15 | import torch 16 | 17 | import _init_paths # pylint: disable=unused-import 18 | from core.config import cfg, merge_cfg_from_file, merge_cfg_from_list, assert_and_infer_cfg 19 | from core.test_engine_rel import run_inference 20 | import utils.logging 21 | 22 | from datasets_rel import task_evaluation_sg as task_evaluation_sg 23 | from datasets_rel import task_evaluation_vg_and_vrd as task_evaluation_vg_and_vrd 24 | 25 | # OpenCL may be enabled by default in OpenCV3; disable it because it's not 26 | # thread safe and causes unwanted GPU memory allocations. 27 | cv2.ocl.setUseOpenCL(False) 28 | 29 | 30 | def parse_args(): 31 | """Parse in command line arguments""" 32 | parser = argparse.ArgumentParser(description='Test a Fast R-CNN network') 33 | parser.add_argument( 34 | '--dataset', 35 | help='training dataset') 36 | parser.add_argument( 37 | '--cfg', dest='cfg_file', required=True, 38 | help='optional config file') 39 | 40 | parser.add_argument( 41 | '--load_ckpt', help='path of checkpoint to load') 42 | parser.add_argument( 43 | '--load_detectron', help='path to the detectron weight pickle file') 44 | 45 | parser.add_argument( 46 | '--output_dir', 47 | help='output directory to save the testing results. If not provided, ' 48 | 'defaults to [args.load_ckpt|args.load_detectron]/../test.') 49 | 50 | parser.add_argument( 51 | '--set', dest='set_cfgs', 52 | help='set config keys, will overwrite config in the cfg_file.' 
53 | ' See lib/core/config.py for all options', 54 | default=[], nargs='*') 55 | 56 | parser.add_argument( 57 | '--range', 58 | help='start (inclusive) and end (exclusive) indices', 59 | type=int, nargs=2) 60 | parser.add_argument( 61 | '--multi-gpu-testing', help='using multiple gpus for inference', 62 | action='store_true') 63 | parser.add_argument( 64 | '--do_val', dest='do_val', help='do evaluation', action='store_true') 65 | parser.add_argument( 66 | '--do_vis', dest='do_vis', help='visualize the last layer of conv_body', action='store_true') 67 | parser.add_argument( 68 | '--do_special', dest='do_special', help='visualize the last layer of conv_body', action='store_true') 69 | parser.add_argument( 70 | '--use_gt_boxes', dest='use_gt_boxes', help='use gt boxes for sgcls/prdcls', action='store_true') 71 | parser.add_argument( 72 | '--use_gt_labels', dest='use_gt_labels', help='use gt boxes for sgcls/prdcls', action='store_true') 73 | 74 | return parser.parse_args() 75 | 76 | 77 | if __name__ == '__main__': 78 | 79 | if not torch.cuda.is_available(): 80 | sys.exit("Need a CUDA device to run the code.") 81 | 82 | logger = utils.logging.setup_logging(__name__) 83 | args = parse_args() 84 | logger.info('Called with args:') 85 | logger.info(args) 86 | 87 | assert (torch.cuda.device_count() == 1) ^ bool(args.multi_gpu_testing) 88 | 89 | if args.cfg_file is not None: 90 | merge_cfg_from_file(args.cfg_file) 91 | if args.set_cfgs is not None: 92 | merge_cfg_from_file(args.cfg_file) 93 | 94 | if args.dataset == "oi_rel": 95 | cfg.TEST.DATASETS = ('oi_rel_val',) 96 | cfg.MODEL.NUM_CLASSES = 58 97 | cfg.MODEL.NUM_PRD_CLASSES = 9 # rel, exclude background 98 | elif args.dataset == "oi_rel_mini": 99 | cfg.TEST.DATASETS = ('oi_rel_val_mini',) 100 | cfg.MODEL.NUM_CLASSES = 58 101 | cfg.MODEL.NUM_PRD_CLASSES = 9 # rel, exclude background 102 | elif args.dataset == "oi_all_rel_train": 103 | cfg.TEST.DATASETS = ('oi_all_rel_train',) 104 | cfg.MODEL.NUM_CLASSES = 58 105 | cfg.MODEL.NUM_PRD_CLASSES = 9 # rel, exclude background 106 | elif args.dataset == "oi_all_rel": 107 | cfg.TEST.DATASETS = ('oi_all_rel_val',) 108 | cfg.MODEL.NUM_CLASSES = 58 109 | cfg.MODEL.NUM_PRD_CLASSES = 9 # rel, exclude background 110 | elif args.dataset == "oi_kaggle": 111 | cfg.TEST.DATASETS = ('oi_kaggle_rel_test',) 112 | cfg.MODEL.NUM_CLASSES = 58 113 | cfg.MODEL.NUM_PRD_CLASSES = 9 # rel, exclude background 114 | elif args.dataset == "vg_mini": 115 | cfg.TEST.DATASETS = ('vg_val_mini',) 116 | cfg.MODEL.NUM_CLASSES = 151 117 | cfg.MODEL.NUM_PRD_CLASSES = 50 # exclude background 118 | elif args.dataset == "vg": 119 | cfg.TEST.DATASETS = ('vg_val',) 120 | cfg.MODEL.NUM_CLASSES = 151 121 | cfg.MODEL.NUM_PRD_CLASSES = 50 # exclude background 122 | elif args.dataset == "vrd_train": 123 | cfg.TEST.DATASETS = ('vrd_train',) 124 | cfg.MODEL.NUM_CLASSES = 101 125 | cfg.MODEL.NUM_PRD_CLASSES = 70 # exclude background 126 | elif args.dataset == "vrd": 127 | cfg.TEST.DATASETS = ('vrd_val',) 128 | cfg.MODEL.NUM_CLASSES = 101 129 | cfg.MODEL.NUM_PRD_CLASSES = 70 # exclude background 130 | else: # For subprocess call 131 | assert cfg.TEST.DATASETS, 'cfg.TEST.DATASETS shouldn\'t be empty' 132 | 133 | assert_and_infer_cfg() 134 | 135 | if not cfg.MODEL.RUN_BASELINE: 136 | assert bool(args.load_ckpt) ^ bool(args.load_detectron), \ 137 | 'Exactly one of --load_ckpt and --load_detectron should be specified.' 
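    # Illustration of the default applied below (hypothetical checkpoint path):
    #   --load_ckpt Outputs/vrd_vgg16/ckpt/model_step12345.pth
    #   => args.output_dir == Outputs/vrd_vgg16/test
    # i.e. a 'test' directory created alongside the checkpoint's parent folder.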
138 | if args.output_dir is None: 139 | ckpt_path = args.load_ckpt if args.load_ckpt else args.load_detectron 140 | args.output_dir = os.path.join( 141 | os.path.dirname(os.path.dirname(ckpt_path)), 'test') 142 | logger.info('Automatically set output directory to %s', args.output_dir) 143 | if not os.path.exists(args.output_dir): 144 | os.makedirs(args.output_dir) 145 | 146 | logger.info('Testing with config:') 147 | logger.info(pprint.pformat(cfg)) 148 | 149 | # For test_engine.multi_gpu_test_net_on_dataset 150 | args.test_net_file, _ = os.path.splitext(__file__) 151 | # manually set args.cuda 152 | args.cuda = True 153 | 154 | if args.use_gt_boxes: 155 | if args.use_gt_labels: 156 | det_file = os.path.join(args.output_dir, 'rel_detections_gt_boxes_prdcls.pkl') 157 | else: 158 | det_file = os.path.join(args.output_dir, 'rel_detections_gt_boxes_sgcls.pkl') 159 | else: 160 | det_file = os.path.join(args.output_dir, 'rel_detections.pkl') 161 | if os.path.exists(det_file): 162 | logger.info('Loading results from {}'.format(det_file)) 163 | with open(det_file, 'rb') as f: 164 | all_results = pickle.load(f) 165 | logger.info('Starting evaluation now...') 166 | if args.dataset.find('vg') >= 0 or args.dataset.find('vrd') >= 0: 167 | task_evaluation_vg_and_vrd.eval_rel_results(all_results, args.output_dir, args.do_val) 168 | else: 169 | task_evaluation_sg.eval_rel_results(all_results, args.output_dir, args.do_val, args.do_vis, args.do_special) 170 | else: 171 | run_inference( 172 | args, 173 | ind_range=args.range, 174 | multi_gpu_testing=args.multi_gpu_testing, 175 | check_expected_results=True) 176 | -------------------------------------------------------------------------------- /lib/datasets_rel/ap_eval_rel.py: -------------------------------------------------------------------------------- 1 | # Adapted from Detectron.pytorch/lib/datasets/voc_eval.py for 2 | # this project by Ji Zhang, 2019 3 | #----------------------------------------------------------------------------- 4 | # Copyright (c) 2017-present, Facebook, Inc. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | ############################################################################## 18 | # 19 | # Based on: 20 | # -------------------------------------------------------- 21 | # Fast/er R-CNN 22 | # Licensed under The MIT License [see LICENSE for details] 23 | # Written by Bharath Hariharan 24 | # -------------------------------------------------------- 25 | 26 | """relationship AP evaluation code.""" 27 | 28 | from six.moves import cPickle as pickle 29 | import logging 30 | import numpy as np 31 | import os 32 | from tqdm import tqdm 33 | 34 | from utils.boxes import bbox_overlaps 35 | from utils_rel.boxes_rel import boxes_union 36 | 37 | logger = logging.getLogger(__name__) 38 | 39 | 40 | def prepare_mAP_dets(topk_dets, cls_num): 41 | cls_image_ids = [[] for _ in range(cls_num)] 42 | cls_dets = [{'confidence': np.empty(0), 43 | 'BB_s': np.empty((0, 4)), 44 | 'BB_o': np.empty((0, 4)), 45 | 'BB_r': np.empty((0, 4)), 46 | 'LBL_s': np.empty(0), 47 | 'LBL_o': np.empty(0)} for _ in range(cls_num)] 48 | cls_gts = [{} for _ in range(cls_num)] 49 | npos = [0 for _ in range(cls_num)] 50 | for dets in tqdm(topk_dets): 51 | image_id = dets['image'].split('/')[-1].split('.')[0] 52 | sbj_boxes = dets['det_boxes_s_top'] 53 | obj_boxes = dets['det_boxes_o_top'] 54 | rel_boxes = boxes_union(sbj_boxes, obj_boxes) 55 | sbj_labels = dets['det_labels_s_top'] 56 | obj_labels = dets['det_labels_o_top'] 57 | prd_labels = dets['det_labels_p_top'] 58 | det_scores = dets['det_scores_top'] 59 | gt_boxes_sbj = dets['gt_boxes_sbj'] 60 | gt_boxes_obj = dets['gt_boxes_obj'] 61 | gt_boxes_rel = boxes_union(gt_boxes_sbj, gt_boxes_obj) 62 | gt_labels_sbj = dets['gt_labels_sbj'] 63 | gt_labels_prd = dets['gt_labels_prd'] 64 | gt_labels_obj = dets['gt_labels_obj'] 65 | for c in range(cls_num): 66 | cls_inds = np.where(prd_labels == c)[0] 67 | # logger.info(cls_inds) 68 | if len(cls_inds): 69 | cls_sbj_boxes = sbj_boxes[cls_inds] 70 | cls_obj_boxes = obj_boxes[cls_inds] 71 | cls_rel_boxes = rel_boxes[cls_inds] 72 | cls_sbj_labels = sbj_labels[cls_inds] 73 | cls_obj_labels = obj_labels[cls_inds] 74 | cls_det_scores = det_scores[cls_inds] 75 | cls_dets[c]['confidence'] = np.concatenate((cls_dets[c]['confidence'], cls_det_scores)) 76 | cls_dets[c]['BB_s'] = np.concatenate((cls_dets[c]['BB_s'], cls_sbj_boxes), 0) 77 | cls_dets[c]['BB_o'] = np.concatenate((cls_dets[c]['BB_o'], cls_obj_boxes), 0) 78 | cls_dets[c]['BB_r'] = np.concatenate((cls_dets[c]['BB_r'], cls_rel_boxes), 0) 79 | cls_dets[c]['LBL_s'] = np.concatenate((cls_dets[c]['LBL_s'], cls_sbj_labels)) 80 | cls_dets[c]['LBL_o'] = np.concatenate((cls_dets[c]['LBL_o'], cls_obj_labels)) 81 | cls_image_ids[c] += [image_id] * len(cls_inds) 82 | cls_gt_inds = np.where(gt_labels_prd == c)[0] 83 | cls_gt_boxes_sbj = gt_boxes_sbj[cls_gt_inds] 84 | cls_gt_boxes_obj = gt_boxes_obj[cls_gt_inds] 85 | cls_gt_boxes_rel = gt_boxes_rel[cls_gt_inds] 86 | cls_gt_labels_sbj = gt_labels_sbj[cls_gt_inds] 87 | cls_gt_labels_obj = gt_labels_obj[cls_gt_inds] 88 | cls_gt_num = len(cls_gt_inds) 89 | det = [False] * cls_gt_num 90 | npos[c] = npos[c] + cls_gt_num 91 | cls_gts[c][image_id] = {'gt_boxes_sbj': cls_gt_boxes_sbj, 92 | 'gt_boxes_obj': cls_gt_boxes_obj, 93 | 'gt_boxes_rel': cls_gt_boxes_rel, 94 | 'gt_labels_sbj': cls_gt_labels_sbj, 95 | 'gt_labels_obj': cls_gt_labels_obj, 96 | 'gt_num': cls_gt_num, 97 | 'det': det} 98 | return cls_image_ids, cls_dets, cls_gts, npos 99 | 100 | 101 | def get_ap(rec, prec): 102 | """Compute AP given precision and recall. 
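    Uses the all-point interpolation: the precision curve is first made monotonically
    non-increasing (the envelope), and AP is then the sum of (delta recall) * precision
    over the points where recall changes. Illustrative values only: rec = [0.5, 1.0] and
    prec = [1.0, 0.5] give ap = 0.5 * 1.0 + 0.5 * 0.5 = 0.75.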
103 | """ 104 | # correct AP calculation 105 | # first append sentinel values at the end 106 | mrec = np.concatenate(([0.], rec, [1.])) 107 | mpre = np.concatenate(([0.], prec, [0.])) 108 | 109 | # compute the precision envelope 110 | for i in range(mpre.size - 1, 0, -1): 111 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 112 | 113 | # to calculate area under PR curve, look for points 114 | # where X axis (recall) changes value 115 | i = np.where(mrec[1:] != mrec[:-1])[0] 116 | 117 | # and sum (\Delta recall) * prec 118 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 119 | return ap 120 | 121 | 122 | def ap_eval(image_ids, 123 | dets, 124 | gts, 125 | npos, 126 | rel_or_phr=True, 127 | ovthresh=0.5): 128 | """ 129 | Top level function that does the relationship AP evaluation. 130 | 131 | detpath: Path to detections 132 | detpath.format(classname) should produce the detection results file. 133 | classname: Category name (duh) 134 | [ovthresh]: Overlap threshold (default = 0.5) 135 | """ 136 | 137 | confidence = dets['confidence'] 138 | BB_s = dets['BB_s'] 139 | BB_o = dets['BB_o'] 140 | BB_r = dets['BB_r'] 141 | LBL_s = dets['LBL_s'] 142 | LBL_o = dets['LBL_o'] 143 | 144 | # sort by confidence 145 | sorted_ind = np.argsort(-confidence) 146 | BB_s = BB_s[sorted_ind, :] 147 | BB_o = BB_o[sorted_ind, :] 148 | BB_r = BB_r[sorted_ind, :] 149 | LBL_s = LBL_s[sorted_ind] 150 | LBL_o = LBL_o[sorted_ind] 151 | image_ids = [image_ids[x] for x in sorted_ind] 152 | 153 | # go down dets and mark TPs and FPs 154 | nd = len(image_ids) 155 | tp = np.zeros(nd) 156 | fp = np.zeros(nd) 157 | gts_visited = {k: [False] * v['gt_num'] for k, v in gts.items()} 158 | for d in range(nd): 159 | R = gts[image_ids[d]] 160 | visited = gts_visited[image_ids[d]] 161 | bb_s = BB_s[d, :].astype(float) 162 | bb_o = BB_o[d, :].astype(float) 163 | bb_r = BB_r[d, :].astype(float) 164 | lbl_s = LBL_s[d] 165 | lbl_o = LBL_o[d] 166 | ovmax = -np.inf 167 | BBGT_s = R['gt_boxes_sbj'].astype(float) 168 | BBGT_o = R['gt_boxes_obj'].astype(float) 169 | BBGT_r = R['gt_boxes_rel'].astype(float) 170 | LBLGT_s = R['gt_labels_sbj'] 171 | LBLGT_o = R['gt_labels_obj'] 172 | if BBGT_s.size > 0: 173 | valid_mask = np.logical_and(LBLGT_s == lbl_s, LBLGT_o == lbl_o) 174 | if valid_mask.any(): 175 | if rel_or_phr: # means it is evaluating relationships 176 | overlaps_s = bbox_overlaps( 177 | bb_s[None, :].astype(dtype=np.float32, copy=False), 178 | BBGT_s.astype(dtype=np.float32, copy=False))[0] 179 | overlaps_o = bbox_overlaps( 180 | bb_o[None, :].astype(dtype=np.float32, copy=False), 181 | BBGT_o.astype(dtype=np.float32, copy=False))[0] 182 | overlaps = np.minimum(overlaps_s, overlaps_o) 183 | else: 184 | overlaps = bbox_overlaps( 185 | bb_r[None, :].astype(dtype=np.float32, copy=False), 186 | BBGT_r.astype(dtype=np.float32, copy=False))[0] 187 | overlaps *= valid_mask 188 | ovmax = np.max(overlaps) 189 | jmax = np.argmax(overlaps) 190 | else: 191 | ovmax = 0. 192 | jmax = -1 193 | 194 | if ovmax > ovthresh: 195 | if not visited[jmax]: 196 | tp[d] = 1. 197 | visited[jmax] = 1 198 | else: 199 | fp[d] = 1. 200 | else: 201 | fp[d] = 1. 
202 | 203 | # compute precision recall 204 | fp = np.cumsum(fp) 205 | tp = np.cumsum(tp) 206 | rec = tp / (float(npos) + 1e-12) 207 | # ground truth 208 | prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) 209 | ap = get_ap(rec, prec) 210 | 211 | return rec, prec, ap 212 | -------------------------------------------------------------------------------- /lib/datasets_rel/task_evaluation_vg_and_vrd.py: -------------------------------------------------------------------------------- 1 | """ 2 | Written by Ji Zhang, 2019 3 | Some functions are adapted from Rowan Zellers 4 | Original source: 5 | https://github.com/rowanz/neural-motifs/blob/master/lib/evaluation/sg_eval.py 6 | """ 7 | 8 | import os 9 | import numpy as np 10 | import logging 11 | from six.moves import cPickle as pickle 12 | import json 13 | import csv 14 | from tqdm import tqdm 15 | 16 | from core.config import cfg 17 | from functools import reduce 18 | from utils.boxes import bbox_overlaps 19 | from utils_rel.boxes_rel import boxes_union 20 | 21 | from .pytorch_misc import intersect_2d, argsort_desc 22 | 23 | np.set_printoptions(precision=3) 24 | 25 | logger = logging.getLogger(__name__) 26 | 27 | 28 | topk = 100 29 | 30 | 31 | def eval_rel_results(all_results, output_dir, do_val): 32 | 33 | if cfg.TEST.DATASETS[0].find('vg') >= 0: 34 | prd_k_set = (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20) 35 | elif cfg.TEST.DATASETS[0].find('vrd') >= 0: 36 | prd_k_set = (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 70) 37 | else: 38 | prd_k_set = (1, 2, 3, 4, 5, 6, 7, 8, 9) 39 | 40 | if cfg.TEST.DATASETS[0].find('vg') >= 0: 41 | eval_sets = (False,) 42 | else: 43 | eval_sets = (False, True) 44 | 45 | for phrdet in eval_sets: 46 | eval_metric = 'phrdet' if phrdet else 'reldet' 47 | print('================== {} =================='.format(eval_metric)) 48 | 49 | for prd_k in prd_k_set: 50 | print('prd_k = {}:'.format(prd_k)) 51 | 52 | recalls = {20: 0, 50: 0, 100: 0} 53 | if do_val: 54 | all_gt_cnt = 0 55 | 56 | topk_dets = [] 57 | for im_i, res in enumerate(tqdm(all_results)): 58 | 59 | # in oi_all_rel some images have no dets 60 | if res['prd_scores'] is None: 61 | det_boxes_s_top = np.zeros((0, 4), dtype=np.float32) 62 | det_boxes_o_top = np.zeros((0, 4), dtype=np.float32) 63 | det_labels_s_top = np.zeros(0, dtype=np.int32) 64 | det_labels_p_top = np.zeros(0, dtype=np.int32) 65 | det_labels_o_top = np.zeros(0, dtype=np.int32) 66 | det_scores_top = np.zeros(0, dtype=np.float32) 67 | else: 68 | det_boxes_sbj = res['sbj_boxes'] # (#num_rel, 4) 69 | det_boxes_obj = res['obj_boxes'] # (#num_rel, 4) 70 | det_labels_sbj = res['sbj_labels'] # (#num_rel,) 71 | det_labels_obj = res['obj_labels'] # (#num_rel,) 72 | det_scores_sbj = res['sbj_scores'] # (#num_rel,) 73 | det_scores_obj = res['obj_scores'] # (#num_rel,) 74 | if 'prd_scores_ttl' in res: 75 | det_scores_prd = res['prd_scores_ttl'][:, 1:] 76 | else: 77 | det_scores_prd = res['prd_scores'][:, 1:] 78 | 79 | det_labels_prd = np.argsort(-det_scores_prd, axis=1) 80 | det_scores_prd = -np.sort(-det_scores_prd, axis=1) 81 | 82 | det_scores_so = det_scores_sbj * det_scores_obj 83 | det_scores_spo = det_scores_so[:, None] * det_scores_prd[:, :prd_k] 84 | det_scores_inds = argsort_desc(det_scores_spo)[:topk] 85 | det_scores_top = det_scores_spo[det_scores_inds[:, 0], det_scores_inds[:, 1]] 86 | det_boxes_so_top = np.hstack( 87 | (det_boxes_sbj[det_scores_inds[:, 0]], det_boxes_obj[det_scores_inds[:, 0]])) 88 | det_labels_p_top = det_labels_prd[det_scores_inds[:, 0], det_scores_inds[:, 1]] 89 | 
det_labels_spo_top = np.vstack( 90 | (det_labels_sbj[det_scores_inds[:, 0]], det_labels_p_top, det_labels_obj[det_scores_inds[:, 0]])).transpose() 91 | 92 | det_boxes_s_top = det_boxes_so_top[:, :4] 93 | det_boxes_o_top = det_boxes_so_top[:, 4:] 94 | det_labels_s_top = det_labels_spo_top[:, 0] 95 | det_labels_p_top = det_labels_spo_top[:, 1] 96 | det_labels_o_top = det_labels_spo_top[:, 2] 97 | 98 | topk_dets.append(dict(image=res['image'], 99 | det_boxes_s_top=det_boxes_s_top, 100 | det_boxes_o_top=det_boxes_o_top, 101 | det_labels_s_top=det_labels_s_top, 102 | det_labels_p_top=det_labels_p_top, 103 | det_labels_o_top=det_labels_o_top, 104 | det_scores_top=det_scores_top)) 105 | 106 | if do_val: 107 | gt_boxes_sbj = res['gt_sbj_boxes'] # (#num_gt, 4) 108 | gt_boxes_obj = res['gt_obj_boxes'] # (#num_gt, 4) 109 | gt_labels_sbj = res['gt_sbj_labels'] # (#num_gt,) 110 | gt_labels_obj = res['gt_obj_labels'] # (#num_gt,) 111 | gt_labels_prd = res['gt_prd_labels'] # (#num_gt,) 112 | gt_boxes_so = np.hstack((gt_boxes_sbj, gt_boxes_obj)) 113 | gt_labels_spo = np.vstack((gt_labels_sbj, gt_labels_prd, gt_labels_obj)).transpose() 114 | # Compute recall. It's most efficient to match once and then do recall after 115 | # det_boxes_so_top is (#num_rel, 8) 116 | # det_labels_spo_top is (#num_rel, 3) 117 | if phrdet: 118 | det_boxes_r_top = boxes_union(det_boxes_s_top, det_boxes_o_top) 119 | gt_boxes_r = boxes_union(gt_boxes_sbj, gt_boxes_obj) 120 | pred_to_gt = _compute_pred_matches( 121 | gt_labels_spo, det_labels_spo_top, 122 | gt_boxes_r, det_boxes_r_top, 123 | phrdet=phrdet) 124 | else: 125 | pred_to_gt = _compute_pred_matches( 126 | gt_labels_spo, det_labels_spo_top, 127 | gt_boxes_so, det_boxes_so_top, 128 | phrdet=phrdet) 129 | all_gt_cnt += gt_labels_spo.shape[0] 130 | for k in recalls: 131 | if len(pred_to_gt): 132 | match = reduce(np.union1d, pred_to_gt[:k]) 133 | else: 134 | match = [] 135 | recalls[k] += len(match) 136 | 137 | topk_dets[-1].update(dict(gt_boxes_sbj=gt_boxes_sbj, 138 | gt_boxes_obj=gt_boxes_obj, 139 | gt_labels_sbj=gt_labels_sbj, 140 | gt_labels_obj=gt_labels_obj, 141 | gt_labels_prd=gt_labels_prd)) 142 | 143 | if do_val: 144 | for k in recalls: 145 | recalls[k] = float(recalls[k]) / (float(all_gt_cnt) + 1e-12) 146 | print_stats(recalls) 147 | 148 | 149 | def print_stats(recalls): 150 | # print('====================== ' + 'sgdet' + ' ============================') 151 | for k, v in recalls.items(): 152 | print('R@%i: %.2f' % (k, 100 * v)) 153 | 154 | 155 | # This function is adapted from Rowan Zellers' code: 156 | # https://github.com/rowanz/neural-motifs/blob/master/lib/evaluation/sg_eval.py 157 | # Modified for this project to work with PyTorch v0.4 158 | def _compute_pred_matches(gt_triplets, pred_triplets, 159 | gt_boxes, pred_boxes, iou_thresh=0.5, phrdet=False): 160 | """ 161 | Given a set of predicted triplets, return the list of matching GT's for each of the 162 | given predictions 163 | :param gt_triplets: 164 | :param pred_triplets: 165 | :param gt_boxes: 166 | :param pred_boxes: 167 | :param iou_thresh: 168 | :return: 169 | """ 170 | # This performs a matrix multiplication-esque thing between the two arrays 171 | # Instead of summing, we want the equality, so we reduce in that way 172 | # The rows correspond to GT triplets, columns to pred triplets 173 | keeps = intersect_2d(gt_triplets, pred_triplets) 174 | gt_has_match = keeps.any(1) 175 | pred_to_gt = [[] for x in range(pred_boxes.shape[0])] 176 | for gt_ind, gt_box, keep_inds in zip(np.where(gt_has_match)[0], 
177 | gt_boxes[gt_has_match], 178 | keeps[gt_has_match], 179 | ): 180 | boxes = pred_boxes[keep_inds] 181 | if phrdet: 182 | gt_box = gt_box.astype(dtype=np.float32, copy=False) 183 | boxes = boxes.astype(dtype=np.float32, copy=False) 184 | rel_iou = bbox_overlaps(gt_box[None, :], boxes)[0] 185 | 186 | inds = rel_iou >= iou_thresh 187 | else: 188 | gt_box = gt_box.astype(dtype=np.float32, copy=False) 189 | boxes = boxes.astype(dtype=np.float32, copy=False) 190 | sub_iou = bbox_overlaps(gt_box[None,:4], boxes[:, :4])[0] 191 | obj_iou = bbox_overlaps(gt_box[None,4:], boxes[:, 4:])[0] 192 | 193 | inds = (sub_iou >= iou_thresh) & (obj_iou >= iou_thresh) 194 | 195 | for i in np.where(keep_inds)[0][inds]: 196 | pred_to_gt[i].append(int(gt_ind)) 197 | return pred_to_gt 198 | -------------------------------------------------------------------------------- /lib/core/test_rel.py: -------------------------------------------------------------------------------- 1 | # Adapted by Ji Zhang in 2019 2 | # From Detectron.pytorch/lib/core/test.py 3 | # Original license text below 4 | # -------------------------------------------------------- 5 | # Written by Roy Tseng 6 | # 7 | # Based on: 8 | # -------------------------------------------------------- 9 | # Copyright (c) 2017-present, Facebook, Inc. 10 | # 11 | # Licensed under the Apache License, Version 2.0 (the "License"); 12 | # you may not use this file except in compliance with the License. 13 | # You may obtain a copy of the License at 14 | # 15 | # http://www.apache.org/licenses/LICENSE-2.0 16 | # 17 | # Unless required by applicable law or agreed to in writing, software 18 | # distributed under the License is distributed on an "AS IS" BASIS, 19 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 20 | # See the License for the specific language governing permissions and 21 | # limitations under the License. 
22 | ############################################################################## 23 | # 24 | # Based on: 25 | # -------------------------------------------------------- 26 | # Fast R-CNN 27 | # Copyright (c) 2015 Microsoft 28 | # Licensed under The MIT License [see LICENSE for details] 29 | # Written by Ross Girshick 30 | # -------------------------------------------------------- 31 | 32 | from __future__ import absolute_import 33 | from __future__ import division 34 | from __future__ import print_function 35 | from __future__ import unicode_literals 36 | 37 | from collections import defaultdict 38 | from six.moves import cPickle as pickle 39 | import cv2 40 | import logging 41 | import numpy as np 42 | 43 | from torch.autograd import Variable 44 | import torch 45 | 46 | from core.config import cfg 47 | from utils.timer import Timer 48 | import utils.blob as blob_utils 49 | import utils.fpn as fpn_utils 50 | import utils.image as image_utils 51 | 52 | logger = logging.getLogger(__name__) 53 | 54 | 55 | def im_detect_rels(model, im, dataset_name, box_proposals, do_vis=False, timers=None, roidb=None, use_gt_labels=False): 56 | 57 | if timers is None: 58 | timers = defaultdict(Timer) 59 | 60 | timers['im_detect_rels'].tic() 61 | rel_results = im_get_det_rels(model, im, dataset_name, cfg.TEST.SCALE, cfg.TEST.MAX_SIZE, box_proposals, do_vis, roidb, use_gt_labels) 62 | timers['im_detect_rels'].toc() 63 | 64 | return rel_results 65 | 66 | 67 | def im_get_det_rels(model, im, dataset_name, target_scale, target_max_size, boxes=None, do_vis=False, roidb=None, use_gt_labels=False): 68 | """Prepare the bbox for testing""" 69 | 70 | inputs, im_scale = _get_blobs(im, boxes, target_scale, target_max_size) 71 | 72 | if cfg.DEDUP_BOXES > 0 and not cfg.MODEL.FASTER_RCNN: 73 | v = np.array([1, 1e3, 1e6, 1e9, 1e12]) 74 | hashes = np.round(inputs['rois'] * cfg.DEDUP_BOXES).dot(v) 75 | _, index, inv_index = np.unique( 76 | hashes, return_index=True, return_inverse=True 77 | ) 78 | inputs['rois'] = inputs['rois'][index, :] 79 | boxes = boxes[index, :] 80 | 81 | # Add multi-level rois for FPN 82 | if cfg.FPN.MULTILEVEL_ROIS and not cfg.MODEL.FASTER_RCNN: 83 | _add_multilevel_rois_for_test(inputs, 'rois') 84 | 85 | if cfg.PYTORCH_VERSION_LESS_THAN_040: 86 | inputs['data'] = [Variable(torch.from_numpy(inputs['data']), volatile=True)] 87 | inputs['im_info'] = [Variable(torch.from_numpy(inputs['im_info']), volatile=True)] 88 | else: 89 | inputs['data'] = [torch.from_numpy(inputs['data'])] 90 | inputs['im_info'] = [torch.from_numpy(inputs['im_info'])] 91 | if dataset_name is not None: 92 | inputs['dataset_name'] = [blob_utils.serialize(dataset_name)] 93 | 94 | inputs['do_vis'] = [do_vis] 95 | if roidb is not None: 96 | inputs['roidb'] = [roidb] 97 | if use_gt_labels: 98 | inputs['use_gt_labels'] = [use_gt_labels] 99 | 100 | return_dict = model(**inputs) 101 | 102 | return_dict2 = {} 103 | if return_dict['sbj_rois'] is not None: 104 | sbj_boxes = return_dict['sbj_rois'].data.cpu().numpy()[:, 1:5] / im_scale 105 | sbj_labels = return_dict['sbj_labels'].data.cpu().numpy() - 1 106 | sbj_scores = return_dict['sbj_scores'].data.cpu().numpy() 107 | obj_boxes = return_dict['obj_rois'].data.cpu().numpy()[:, 1:5] / im_scale 108 | obj_labels = return_dict['obj_labels'].data.cpu().numpy() - 1 109 | obj_scores = return_dict['obj_scores'].data.cpu().numpy() 110 | prd_scores = return_dict['prd_scores'].data.cpu().numpy() 111 | if cfg.MODEL.USE_FREQ_BIAS: 112 | prd_scores_bias = return_dict['prd_scores_bias'].data.cpu().numpy() 113 
| if cfg.MODEL.USE_SPATIAL_FEAT: 114 | prd_scores_spt = return_dict['prd_scores_spt'].data.cpu().numpy() 115 | if cfg.MODEL.ADD_SCORES_ALL: 116 | prd_scores_ttl = return_dict['prd_ttl_scores'].data.cpu().numpy() 117 | 118 | return_dict2 = dict(sbj_boxes=sbj_boxes, 119 | sbj_labels=sbj_labels.astype(np.int32, copy=False), 120 | sbj_scores=sbj_scores, 121 | obj_boxes=obj_boxes, 122 | obj_labels=obj_labels.astype(np.int32, copy=False), 123 | obj_scores=obj_scores, 124 | prd_scores=prd_scores) 125 | if cfg.MODEL.ADD_SCORES_ALL: 126 | return_dict2['prd_scores_ttl'] = prd_scores_ttl 127 | 128 | if cfg.MODEL.USE_FREQ_BIAS: 129 | return_dict2['prd_scores_bias'] = prd_scores_bias 130 | if cfg.MODEL.USE_SPATIAL_FEAT: 131 | return_dict2['prd_scores_spt'] = prd_scores_spt 132 | if do_vis: 133 | if isinstance(return_dict['blob_conv'], list): 134 | blob_conv = [b.data.cpu().numpy().squeeze() for b in return_dict['blob_conv']] 135 | blob_conv_prd = [b.data.cpu().numpy().squeeze() for b in return_dict['blob_conv_prd']] 136 | blob_conv = [b.mean(axis=0) for b in blob_conv] 137 | blob_conv_prd = [b.mean(axis=0) for b in blob_conv_prd] 138 | return_dict2['blob_conv'] = blob_conv 139 | return_dict2['blob_conv_prd'] = blob_conv_prd 140 | else: 141 | blob_conv = return_dict['blob_conv'].data.cpu().numpy().squeeze() 142 | blob_conv_prd = return_dict['blob_conv_prd'].data.cpu().numpy().squeeze() 143 | blob_conv = blob_conv.mean(axis=0) 144 | blob_conv_prd = blob_conv_prd.mean(axis=0) 145 | return_dict2['blob_conv'] = blob_conv 146 | return_dict2['blob_conv_prd'] = blob_conv_prd 147 | else: 148 | return_dict2 = dict(sbj_boxes=None, 149 | sbj_labels=None, 150 | sbj_scores=None, 151 | obj_boxes=None, 152 | obj_labels=None, 153 | obj_scores=None, 154 | prd_scores=None) 155 | 156 | return return_dict2 157 | 158 | 159 | def _get_rois_blob(im_rois, im_scale): 160 | """Converts RoIs into network inputs. 161 | 162 | Arguments: 163 | im_rois (ndarray): R x 4 matrix of RoIs in original image coordinates 164 | im_scale_factors (list): scale factors as returned by _get_image_blob 165 | 166 | Returns: 167 | blob (ndarray): R x 5 matrix of RoIs in the image pyramid with columns 168 | [level, x1, y1, x2, y2] 169 | """ 170 | rois, levels = _project_im_rois(im_rois, im_scale) 171 | rois_blob = np.hstack((levels, rois)) 172 | return rois_blob.astype(np.float32, copy=False) 173 | 174 | 175 | def _project_im_rois(im_rois, scales): 176 | """Project image RoIs into the image pyramid built by _get_image_blob. 177 | 178 | Arguments: 179 | im_rois (ndarray): R x 4 matrix of RoIs in original image coordinates 180 | scales (list): scale factors as returned by _get_image_blob 181 | 182 | Returns: 183 | rois (ndarray): R x 4 matrix of projected RoI coordinates 184 | levels (ndarray): image pyramid levels used by each projected RoI 185 | """ 186 | rois = im_rois.astype(np.float, copy=False) * scales 187 | levels = np.zeros((im_rois.shape[0], 1), dtype=np.int) 188 | return rois, levels 189 | 190 | 191 | def _add_multilevel_rois_for_test(blobs, name): 192 | """Distributes a set of RoIs across FPN pyramid levels by creating new level 193 | specific RoI blobs. 
194 | 195 | Arguments: 196 | blobs (dict): dictionary of blobs 197 | name (str): a key in 'blobs' identifying the source RoI blob 198 | 199 | Returns: 200 | [by ref] blobs (dict): new keys named by `name + 'fpn' + level` 201 | are added to dict each with a value that's an R_level x 5 ndarray of 202 | RoIs (see _get_rois_blob for format) 203 | """ 204 | lvl_min = cfg.FPN.ROI_MIN_LEVEL 205 | lvl_max = cfg.FPN.ROI_MAX_LEVEL 206 | lvls = fpn_utils.map_rois_to_fpn_levels(blobs[name][:, 1:5], lvl_min, lvl_max) 207 | fpn_utils.add_multilevel_roi_blobs( 208 | blobs, name, blobs[name], lvls, lvl_min, lvl_max 209 | ) 210 | 211 | 212 | def _get_blobs(im, rois, target_scale, target_max_size): 213 | """Convert an image and RoIs within that image into network inputs.""" 214 | blobs = {} 215 | blobs['data'], im_scale, blobs['im_info'] = \ 216 | blob_utils.get_image_blob(im, target_scale, target_max_size) 217 | if rois is not None: 218 | blobs['rois'] = _get_rois_blob(rois, im_scale) 219 | return blobs, im_scale 220 | -------------------------------------------------------------------------------- /lib/modeling_rel/fast_rcnn_heads.py: -------------------------------------------------------------------------------- 1 | # Adapted from Detectron.pytorch/lib/modeling/fast_rcnn_heads.py 2 | # for this project by Ji Zhang, 2019 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import torch.nn.init as init 8 | from torch.autograd import Variable 9 | 10 | from core.config import cfg 11 | import nn as mynn 12 | import utils.net as net_utils 13 | 14 | 15 | class fast_rcnn_outputs(nn.Module): 16 | def __init__(self, dim_in): 17 | super().__init__() 18 | self.cls_score = nn.Linear(dim_in, cfg.MODEL.NUM_CLASSES) 19 | if cfg.MODEL.CLS_AGNOSTIC_BBOX_REG: # bg and fg 20 | self.bbox_pred = nn.Linear(dim_in, 4 * 2) 21 | else: 22 | self.bbox_pred = nn.Linear(dim_in, 4 * cfg.MODEL.NUM_CLASSES) 23 | 24 | self._init_weights() 25 | 26 | def _init_weights(self): 27 | init.normal_(self.cls_score.weight, std=0.01) 28 | init.constant_(self.cls_score.bias, 0) 29 | init.normal_(self.bbox_pred.weight, std=0.001) 30 | init.constant_(self.bbox_pred.bias, 0) 31 | 32 | def detectron_weight_mapping(self): 33 | detectron_weight_mapping = { 34 | 'cls_score.weight': 'cls_score_w', 35 | 'cls_score.bias': 'cls_score_b', 36 | 'bbox_pred.weight': 'bbox_pred_w', 37 | 'bbox_pred.bias': 'bbox_pred_b' 38 | } 39 | orphan_in_detectron = [] 40 | return detectron_weight_mapping, orphan_in_detectron 41 | 42 | def forward(self, x): 43 | if x.dim() == 4: 44 | x = x.squeeze(3).squeeze(2) 45 | cls_score = self.cls_score(x) 46 | if not self.training: 47 | cls_score = F.softmax(cls_score, dim=1) 48 | bbox_pred = self.bbox_pred(x) 49 | 50 | return cls_score, bbox_pred 51 | 52 | 53 | def fast_rcnn_losses(cls_score, bbox_pred, label_int32, bbox_targets, 54 | bbox_inside_weights, bbox_outside_weights): 55 | device_id = cls_score.get_device() 56 | rois_label = Variable(torch.from_numpy(label_int32.astype('int64'))).cuda(device_id) 57 | loss_cls = F.cross_entropy(cls_score, rois_label) 58 | 59 | bbox_targets = Variable(torch.from_numpy(bbox_targets)).cuda(device_id) 60 | bbox_inside_weights = Variable(torch.from_numpy(bbox_inside_weights)).cuda(device_id) 61 | bbox_outside_weights = Variable(torch.from_numpy(bbox_outside_weights)).cuda(device_id) 62 | loss_bbox = net_utils.smooth_l1_loss( 63 | bbox_pred, bbox_targets, bbox_inside_weights, bbox_outside_weights) 64 | 65 | # class accuracy 66 | cls_preds = 
cls_score.max(dim=1)[1].type_as(rois_label) 67 | accuracy_cls = cls_preds.eq(rois_label).float().mean(dim=0) 68 | 69 | return loss_cls, loss_bbox, accuracy_cls 70 | 71 | 72 | # ---------------------------------------------------------------------------- # 73 | # Box heads 74 | # ---------------------------------------------------------------------------- # 75 | 76 | class roi_2mlp_head(nn.Module): 77 | """Add a ReLU MLP with two hidden layers.""" 78 | def __init__(self, dim_in, roi_xform_func, spatial_scale): 79 | super().__init__() 80 | self.dim_in = dim_in 81 | self.roi_xform = roi_xform_func 82 | self.spatial_scale = spatial_scale 83 | self.dim_out = hidden_dim = cfg.FAST_RCNN.MLP_HEAD_DIM 84 | 85 | roi_size = cfg.FAST_RCNN.ROI_XFORM_RESOLUTION 86 | self.fc1 = nn.Linear(dim_in * roi_size**2, hidden_dim) 87 | self.fc2 = nn.Linear(hidden_dim, hidden_dim) 88 | 89 | self._init_weights() 90 | 91 | def _init_weights(self): 92 | mynn.init.XavierFill(self.fc1.weight) 93 | init.constant_(self.fc1.bias, 0) 94 | mynn.init.XavierFill(self.fc2.weight) 95 | init.constant_(self.fc2.bias, 0) 96 | 97 | def detectron_weight_mapping(self): 98 | detectron_weight_mapping = { 99 | 'fc1.weight': 'fc6_w', 100 | 'fc1.bias': 'fc6_b', 101 | 'fc2.weight': 'fc7_w', 102 | 'fc2.bias': 'fc7_b' 103 | } 104 | return detectron_weight_mapping, [] 105 | 106 | def forward(self, x, rpn_ret, rois_name='rois', use_relu=True): 107 | x = self.roi_xform( 108 | x, rpn_ret, 109 | blob_rois=rois_name, 110 | method=cfg.FAST_RCNN.ROI_XFORM_METHOD, 111 | resolution=cfg.FAST_RCNN.ROI_XFORM_RESOLUTION, 112 | spatial_scale=self.spatial_scale, 113 | sampling_ratio=cfg.FAST_RCNN.ROI_XFORM_SAMPLING_RATIO 114 | ) 115 | batch_size = x.size(0) 116 | x = F.relu(self.fc1(x.view(batch_size, -1)), inplace=True) 117 | if use_relu: 118 | x = F.relu(self.fc2(x), inplace=True) 119 | else: 120 | x = self.fc2(x) 121 | 122 | return x 123 | 124 | 125 | class roi_Xconv1fc_head(nn.Module): 126 | """Add a X conv + 1fc head, as a reference if not using GroupNorm""" 127 | def __init__(self, dim_in, roi_xform_func, spatial_scale): 128 | super().__init__() 129 | self.dim_in = dim_in 130 | self.roi_xform = roi_xform_func 131 | self.spatial_scale = spatial_scale 132 | 133 | hidden_dim = cfg.FAST_RCNN.CONV_HEAD_DIM 134 | module_list = [] 135 | for i in range(cfg.FAST_RCNN.NUM_STACKED_CONVS): 136 | module_list.extend([ 137 | nn.Conv2d(dim_in, hidden_dim, 3, 1, 1), 138 | nn.ReLU(inplace=True) 139 | ]) 140 | dim_in = hidden_dim 141 | self.convs = nn.Sequential(*module_list) 142 | 143 | self.dim_out = fc_dim = cfg.FAST_RCNN.MLP_HEAD_DIM 144 | roi_size = cfg.FAST_RCNN.ROI_XFORM_RESOLUTION 145 | self.fc = nn.Linear(dim_in * roi_size * roi_size, fc_dim) 146 | 147 | self._init_weights() 148 | 149 | def _init_weights(self): 150 | def _init(m): 151 | if isinstance(m, nn.Conv2d): 152 | mynn.init.MSRAFill(m.weight) 153 | init.constant_(m.bias, 0) 154 | elif isinstance(m, nn.Linear): 155 | mynn.init.XavierFill(m.weight) 156 | init.constant_(m.bias, 0) 157 | self.apply(_init) 158 | 159 | def detectron_weight_mapping(self): 160 | mapping = {} 161 | for i in range(cfg.FAST_RCNN.NUM_STACKED_CONVS): 162 | mapping.update({ 163 | 'convs.%d.weight' % (i*2): 'head_conv%d_w' % (i+1), 164 | 'convs.%d.bias' % (i*2): 'head_conv%d_b' % (i+1) 165 | }) 166 | mapping.update({ 167 | 'fc.weight': 'fc6_w', 168 | 'fc.bias': 'fc6_b' 169 | }) 170 | return mapping, [] 171 | 172 | def forward(self, x, rpn_ret): 173 | x = self.roi_xform( 174 | x, rpn_ret, 175 | blob_rois='rois', 176 | 
method=cfg.FAST_RCNN.ROI_XFORM_METHOD, 177 | resolution=cfg.FAST_RCNN.ROI_XFORM_RESOLUTION, 178 | spatial_scale=self.spatial_scale, 179 | sampling_ratio=cfg.FAST_RCNN.ROI_XFORM_SAMPLING_RATIO 180 | ) 181 | batch_size = x.size(0) 182 | x = self.convs(x) 183 | x = F.relu(self.fc(x.view(batch_size, -1)), inplace=True) 184 | return x 185 | 186 | 187 | class roi_Xconv1fc_gn_head(nn.Module): 188 | """Add a X conv + 1fc head, with GroupNorm""" 189 | def __init__(self, dim_in, roi_xform_func, spatial_scale): 190 | super().__init__() 191 | self.dim_in = dim_in 192 | self.roi_xform = roi_xform_func 193 | self.spatial_scale = spatial_scale 194 | 195 | hidden_dim = cfg.FAST_RCNN.CONV_HEAD_DIM 196 | module_list = [] 197 | for i in range(cfg.FAST_RCNN.NUM_STACKED_CONVS): 198 | module_list.extend([ 199 | nn.Conv2d(dim_in, hidden_dim, 3, 1, 1, bias=False), 200 | nn.GroupNorm(net_utils.get_group_gn(hidden_dim), hidden_dim, 201 | eps=cfg.GROUP_NORM.EPSILON), 202 | nn.ReLU(inplace=True) 203 | ]) 204 | dim_in = hidden_dim 205 | self.convs = nn.Sequential(*module_list) 206 | 207 | self.dim_out = fc_dim = cfg.FAST_RCNN.MLP_HEAD_DIM 208 | roi_size = cfg.FAST_RCNN.ROI_XFORM_RESOLUTION 209 | self.fc = nn.Linear(dim_in * roi_size * roi_size, fc_dim) 210 | 211 | self._init_weights() 212 | 213 | def _init_weights(self): 214 | def _init(m): 215 | if isinstance(m, nn.Conv2d): 216 | mynn.init.MSRAFill(m.weight) 217 | elif isinstance(m, nn.Linear): 218 | mynn.init.XavierFill(m.weight) 219 | init.constant_(m.bias, 0) 220 | self.apply(_init) 221 | 222 | def detectron_weight_mapping(self): 223 | mapping = {} 224 | for i in range(cfg.FAST_RCNN.NUM_STACKED_CONVS): 225 | mapping.update({ 226 | 'convs.%d.weight' % (i*3): 'head_conv%d_w' % (i+1), 227 | 'convs.%d.weight' % (i*3+1): 'head_conv%d_gn_s' % (i+1), 228 | 'convs.%d.bias' % (i*3+1): 'head_conv%d_gn_b' % (i+1) 229 | }) 230 | mapping.update({ 231 | 'fc.weight': 'fc6_w', 232 | 'fc.bias': 'fc6_b' 233 | }) 234 | return mapping, [] 235 | 236 | def forward(self, x, rpn_ret): 237 | x = self.roi_xform( 238 | x, rpn_ret, 239 | blob_rois='rois', 240 | method=cfg.FAST_RCNN.ROI_XFORM_METHOD, 241 | resolution=cfg.FAST_RCNN.ROI_XFORM_RESOLUTION, 242 | spatial_scale=self.spatial_scale, 243 | sampling_ratio=cfg.FAST_RCNN.ROI_XFORM_SAMPLING_RATIO 244 | ) 245 | batch_size = x.size(0) 246 | x = self.convs(x) 247 | x = F.relu(self.fc(x.view(batch_size, -1)), inplace=True) 248 | return x 249 | -------------------------------------------------------------------------------- /lib/utils_rel/training_stats_rel.py: -------------------------------------------------------------------------------- 1 | # Adapted by Ji Zhang in 2019 for thsi project 2 | # Based on Detectron.pytorch/lib/utils/training_stats.py 3 | # Original license text below: 4 | # 5 | ############################################################################## 6 | # Copyright (c) 2017-present, Facebook, Inc. 7 | # 8 | # Licensed under the Apache License, Version 2.0 (the "License"); 9 | # you may not use this file except in compliance with the License. 10 | # You may obtain a copy of the License at 11 | # 12 | # http://www.apache.org/licenses/LICENSE-2.0 13 | # 14 | # Unless required by applicable law or agreed to in writing, software 15 | # distributed under the License is distributed on an "AS IS" BASIS, 16 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | # See the License for the specific language governing permissions and 18 | # limitations under the License. 
19 | ############################################################################## 20 | 21 | 22 | """Utilities for training.""" 23 | 24 | from __future__ import absolute_import 25 | from __future__ import division 26 | from __future__ import print_function 27 | from __future__ import unicode_literals 28 | 29 | from collections import defaultdict, OrderedDict 30 | import datetime 31 | import numpy as np 32 | 33 | from core.config import cfg 34 | from utils_rel.logging_rel import log_stats 35 | from utils_rel.logging_rel import SmoothedValue 36 | from utils.timer import Timer 37 | import utils.net as nu 38 | 39 | 40 | class TrainingStats(object): 41 | """Track vital training statistics.""" 42 | 43 | def __init__(self, misc_args, log_period=20, tensorboard_logger=None): 44 | # Output logging period in SGD iterations 45 | self.misc_args = misc_args 46 | self.LOG_PERIOD = log_period 47 | self.tblogger = tensorboard_logger 48 | self.tb_ignored_keys = ['iter', 'eta'] 49 | self.iter_timer = Timer() 50 | # Window size for smoothing tracked values (with median filtering) 51 | self.WIN_SZ = 20 52 | def create_smoothed_value(): 53 | return SmoothedValue(self.WIN_SZ) 54 | self.smoothed_losses = defaultdict(create_smoothed_value) 55 | self.smoothed_metrics = defaultdict(create_smoothed_value) 56 | self.smoothed_total_loss = SmoothedValue(self.WIN_SZ) 57 | # For the support of args.iter_size 58 | self.inner_total_loss = [] 59 | self.inner_losses = defaultdict(list) 60 | if cfg.FPN.FPN_ON: 61 | self.inner_loss_rpn_cls = [] 62 | self.inner_loss_rpn_bbox = [] 63 | self.inner_metrics = defaultdict(list) 64 | 65 | def IterTic(self): 66 | self.iter_timer.tic() 67 | 68 | def IterToc(self): 69 | return self.iter_timer.toc(average=False) 70 | 71 | def ResetIterTimer(self): 72 | self.iter_timer.reset() 73 | 74 | def UpdateIterStats(self, model_out, inner_iter=None): 75 | """Update tracked iteration statistics.""" 76 | if inner_iter is not None and self.misc_args.iter_size > 1: 77 | # For the case of using args.iter_size > 1 78 | return self._UpdateIterStats_inner(model_out, inner_iter) 79 | 80 | # Following code is saved for compatability of train_net.py and iter_size==1 81 | total_loss = 0 82 | if cfg.FPN.FPN_ON: 83 | loss_rpn_cls_data = 0 84 | loss_rpn_bbox_data = 0 85 | 86 | for k, loss in model_out['losses'].items(): 87 | assert loss.shape[0] == cfg.NUM_GPUS 88 | loss = loss.mean(dim=0, keepdim=True) 89 | total_loss += loss 90 | loss_data = loss.data[0] 91 | model_out['losses'][k] = loss 92 | if cfg.FPN.FPN_ON: 93 | if k.startswith('loss_rpn_cls_'): 94 | loss_rpn_cls_data += loss_data 95 | elif k.startswith('loss_rpn_bbox_'): 96 | loss_rpn_bbox_data += loss_data 97 | self.smoothed_losses[k].AddValue(loss_data) 98 | 99 | model_out['total_loss'] = total_loss # Add the total loss for back propagation 100 | self.smoothed_total_loss.AddValue(total_loss.data[0]) 101 | if cfg.FPN.FPN_ON: 102 | self.smoothed_losses['loss_rpn_cls'].AddValue(loss_rpn_cls_data) 103 | self.smoothed_losses['loss_rpn_bbox'].AddValue(loss_rpn_bbox_data) 104 | 105 | for k, metric in model_out['metrics'].items(): 106 | metric = metric.mean(dim=0, keepdim=True) 107 | self.smoothed_metrics[k].AddValue(metric.data[0]) 108 | 109 | def _UpdateIterStats_inner(self, model_out, inner_iter): 110 | """Update tracked iteration statistics for the case of iter_size > 1""" 111 | assert inner_iter < self.misc_args.iter_size 112 | 113 | total_loss = 0 114 | if cfg.FPN.FPN_ON: 115 | loss_rpn_cls_data = 0 116 | loss_rpn_bbox_data = 0 117 | 118 | if inner_iter 
== 0: 119 | self.inner_total_loss = [] 120 | for k in model_out['losses']: 121 | self.inner_losses[k] = [] 122 | if cfg.FPN.FPN_ON: 123 | self.inner_loss_rpn_cls = [] 124 | self.inner_loss_rpn_bbox = [] 125 | for k in model_out['metrics']: 126 | self.inner_metrics[k] = [] 127 | 128 | for k, loss in model_out['losses'].items(): 129 | assert loss.shape[0] == cfg.NUM_GPUS 130 | loss = loss.mean(dim=0, keepdim=True) 131 | total_loss += loss 132 | loss_data = loss.data[0] 133 | 134 | model_out['losses'][k] = loss 135 | if cfg.FPN.FPN_ON: 136 | if k.startswith('loss_rpn_cls_'): 137 | loss_rpn_cls_data += loss_data 138 | elif k.startswith('loss_rpn_bbox_'): 139 | loss_rpn_bbox_data += loss_data 140 | 141 | self.inner_losses[k].append(loss_data) 142 | if inner_iter == (self.misc_args.iter_size - 1): 143 | loss_data = self._mean_and_reset_inner_list('inner_losses', k) 144 | self.smoothed_losses[k].AddValue(loss_data) 145 | 146 | model_out['total_loss'] = total_loss # Add the total loss for back propagation 147 | total_loss_data = total_loss.data[0] 148 | self.inner_total_loss.append(total_loss_data) 149 | if cfg.FPN.FPN_ON: 150 | self.inner_loss_rpn_cls.append(loss_rpn_cls_data) 151 | self.inner_loss_rpn_bbox.append(loss_rpn_bbox_data) 152 | if inner_iter == (self.misc_args.iter_size - 1): 153 | total_loss_data = self._mean_and_reset_inner_list('inner_total_loss') 154 | self.smoothed_total_loss.AddValue(total_loss_data) 155 | if cfg.FPN.FPN_ON: 156 | loss_rpn_cls_data = self._mean_and_reset_inner_list('inner_loss_rpn_cls') 157 | loss_rpn_bbox_data = self._mean_and_reset_inner_list('inner_loss_rpn_bbox') 158 | self.smoothed_losses['loss_rpn_cls'].AddValue(loss_rpn_cls_data) 159 | self.smoothed_losses['loss_rpn_bbox'].AddValue(loss_rpn_bbox_data) 160 | 161 | for k, metric in model_out['metrics'].items(): 162 | metric = metric.mean(dim=0, keepdim=True) 163 | metric_data = metric.data[0] 164 | self.inner_metrics[k].append(metric_data) 165 | if inner_iter == (self.misc_args.iter_size - 1): 166 | metric_data = self._mean_and_reset_inner_list('inner_metrics', k) 167 | self.smoothed_metrics[k].AddValue(metric_data) 168 | 169 | def _mean_and_reset_inner_list(self, attr_name, key=None): 170 | """Take the mean and reset list empty""" 171 | if key: 172 | mean_val = sum(getattr(self, attr_name)[key]) / self.misc_args.iter_size 173 | getattr(self, attr_name)[key] = [] 174 | else: 175 | mean_val = sum(getattr(self, attr_name)) / self.misc_args.iter_size 176 | setattr(self, attr_name, []) 177 | return mean_val 178 | 179 | def LogIterStats(self, cur_iter, lr, backbone_lr): 180 | """Log the tracked statistics.""" 181 | if (cur_iter % self.LOG_PERIOD == 0 or 182 | cur_iter == cfg.SOLVER.MAX_ITER - 1): 183 | stats = self.GetStats(cur_iter, lr, backbone_lr) 184 | log_stats(stats, self.misc_args) 185 | if self.tblogger: 186 | self.tb_log_stats(stats, cur_iter) 187 | 188 | def tb_log_stats(self, stats, cur_iter): 189 | """Log the tracked statistics to tensorboard""" 190 | for k in stats: 191 | if k not in self.tb_ignored_keys: 192 | v = stats[k] 193 | if isinstance(v, dict): 194 | self.tb_log_stats(v, cur_iter) 195 | else: 196 | self.tblogger.add_scalar(k, v, cur_iter) 197 | 198 | def GetStats(self, cur_iter, lr, backbone_lr): 199 | eta_seconds = self.iter_timer.average_time * ( 200 | cfg.SOLVER.MAX_ITER - cur_iter 201 | ) 202 | eta = str(datetime.timedelta(seconds=int(eta_seconds))) 203 | stats = OrderedDict( 204 | iter=cur_iter + 1, # 1-indexed 205 | time=self.iter_timer.average_time, 206 | eta=eta, 207 | 
loss=self.smoothed_total_loss.GetMedianValue(), 208 | lr=lr, 209 | backbone_lr=backbone_lr 210 | ) 211 | stats['metrics'] = OrderedDict() 212 | for k in sorted(self.smoothed_metrics): 213 | stats['metrics'][k] = self.smoothed_metrics[k].GetMedianValue() 214 | 215 | head_losses = [] 216 | for k, v in self.smoothed_losses.items(): 217 | head_losses.append((k, v.GetMedianValue())) 218 | stats['head_losses'] = OrderedDict(head_losses) 219 | 220 | return stats 221 | -------------------------------------------------------------------------------- /lib/datasets_rel/roidb_rel.py: -------------------------------------------------------------------------------- 1 | # Adapted from Detectron.pytorch/lib/datasets/roidb.py 2 | # for this project by Ji Zhang, 2019 3 | # 4 | # -------------------------------------------------------- 5 | # Copyright (c) 2017-present, Facebook, Inc. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | ############################################################################## 19 | 20 | """Functions for common roidb manipulations.""" 21 | 22 | from __future__ import absolute_import 23 | from __future__ import division 24 | from __future__ import print_function 25 | from __future__ import unicode_literals 26 | 27 | import six 28 | import logging 29 | import numpy as np 30 | 31 | import utils.boxes as box_utils 32 | import utils.blob as blob_utils 33 | from core.config import cfg 34 | from .json_dataset_rel import JsonDatasetRel 35 | 36 | logger = logging.getLogger(__name__) 37 | 38 | 39 | def combined_roidb_for_training(dataset_names, proposal_files): 40 | """Load and concatenate roidbs for one or more datasets, along with optional 41 | object proposals. The roidb entries are then prepared for use in training, 42 | which involves caching certain types of metadata for each roidb entry. 
43 | """ 44 | def get_roidb(dataset_name, proposal_file): 45 | ds = JsonDatasetRel(dataset_name) 46 | roidb = ds.get_roidb( 47 | gt=True, 48 | proposal_file=proposal_file, 49 | crowd_filter_thresh=cfg.TRAIN.CROWD_FILTER_THRESH 50 | ) 51 | if cfg.TRAIN.USE_FLIPPED: 52 | logger.info('Appending horizontally-flipped training examples...') 53 | extend_with_flipped_entries(roidb, ds) 54 | logger.info('Loaded dataset: {:s}'.format(ds.name)) 55 | return roidb 56 | 57 | if isinstance(dataset_names, six.string_types): 58 | dataset_names = (dataset_names, ) 59 | if isinstance(proposal_files, six.string_types): 60 | proposal_files = (proposal_files, ) 61 | if len(proposal_files) == 0: 62 | proposal_files = (None, ) * len(dataset_names) 63 | assert len(dataset_names) == len(proposal_files) 64 | roidbs = [get_roidb(*args) for args in zip(dataset_names, proposal_files)] 65 | roidb = roidbs[0] 66 | for r in roidbs[1:]: 67 | roidb.extend(r) 68 | roidb = filter_for_training(roidb) 69 | 70 | if cfg.TRAIN.ASPECT_GROUPING or cfg.TRAIN.ASPECT_CROPPING: 71 | logger.info('Computing image aspect ratios and ordering the ratios...') 72 | ratio_list, ratio_index = rank_for_training(roidb) 73 | logger.info('done') 74 | else: 75 | ratio_list, ratio_index = None, None 76 | 77 | logger.info('Computing bounding-box regression targets...') 78 | add_bbox_regression_targets(roidb) 79 | logger.info('done') 80 | 81 | _compute_and_log_stats(roidb) 82 | 83 | return roidb, ratio_list, ratio_index 84 | 85 | 86 | def extend_with_flipped_entries(roidb, dataset): 87 | """Flip each entry in the given roidb and return a new roidb that is the 88 | concatenation of the original roidb and the flipped entries. 89 | 90 | "Flipping" an entry means that that image and associated metadata (e.g., 91 | ground truth boxes and object proposals) are horizontally flipped. 92 | """ 93 | flipped_roidb = [] 94 | for entry in roidb: 95 | width = entry['width'] 96 | boxes = entry['boxes'].copy() 97 | oldx1 = boxes[:, 0].copy() 98 | oldx2 = boxes[:, 2].copy() 99 | boxes[:, 0] = width - oldx2 - 1 100 | boxes[:, 2] = width - oldx1 - 1 101 | assert (boxes[:, 2] >= boxes[:, 0]).all() 102 | # sbj 103 | sbj_gt_boxes = entry['sbj_gt_boxes'].copy() 104 | oldx1 = sbj_gt_boxes[:, 0].copy() 105 | oldx2 = sbj_gt_boxes[:, 2].copy() 106 | sbj_gt_boxes[:, 0] = width - oldx2 - 1 107 | sbj_gt_boxes[:, 2] = width - oldx1 - 1 108 | assert (sbj_gt_boxes[:, 2] >= sbj_gt_boxes[:, 0]).all() 109 | # obj 110 | obj_gt_boxes = entry['obj_gt_boxes'].copy() 111 | oldx1 = obj_gt_boxes[:, 0].copy() 112 | oldx2 = obj_gt_boxes[:, 2].copy() 113 | obj_gt_boxes[:, 0] = width - oldx2 - 1 114 | obj_gt_boxes[:, 2] = width - oldx1 - 1 115 | assert (obj_gt_boxes[:, 2] >= obj_gt_boxes[:, 0]).all() 116 | # now flip 117 | flipped_entry = {} 118 | dont_copy = ('boxes', 'sbj_gt_boxes', 'obj_gt_boxes', 'segms', 'gt_keypoints', 'flipped') 119 | for k, v in entry.items(): 120 | if k not in dont_copy: 121 | flipped_entry[k] = v 122 | flipped_entry['boxes'] = boxes 123 | flipped_entry['sbj_gt_boxes'] = sbj_gt_boxes 124 | flipped_entry['obj_gt_boxes'] = obj_gt_boxes 125 | flipped_entry['flipped'] = True 126 | flipped_roidb.append(flipped_entry) 127 | roidb.extend(flipped_roidb) 128 | 129 | 130 | def filter_for_training(roidb): 131 | """Remove roidb entries that have no usable RoIs based on config settings. 
132 | """ 133 | def is_valid(entry): 134 | # Valid images have: 135 | # (1) At least one foreground RoI OR 136 | # (2) At least one background RoI 137 | overlaps = entry['max_overlaps'] 138 | # find boxes with sufficient overlap 139 | fg_inds = np.where(overlaps >= cfg.TRAIN.FG_THRESH)[0] 140 | # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI) 141 | bg_inds = np.where((overlaps < cfg.TRAIN.BG_THRESH_HI) & 142 | (overlaps >= cfg.TRAIN.BG_THRESH_LO))[0] 143 | # image is only valid if such boxes exist 144 | valid = len(fg_inds) > 0 or len(bg_inds) > 0 145 | if cfg.MODEL.KEYPOINTS_ON: 146 | # If we're training for keypoints, exclude images with no keypoints 147 | valid = valid and entry['has_visible_keypoints'] 148 | return valid 149 | 150 | num = len(roidb) 151 | filtered_roidb = [entry for entry in roidb if is_valid(entry)] 152 | num_after = len(filtered_roidb) 153 | logger.info('Filtered {} roidb entries: {} -> {}'. 154 | format(num - num_after, num, num_after)) 155 | return filtered_roidb 156 | 157 | 158 | def rank_for_training(roidb): 159 | """Rank the roidb entries according to image aspect ration and mark for cropping 160 | for efficient batching if image is too long. 161 | 162 | Returns: 163 | ratio_list: ndarray, list of aspect ratios from small to large 164 | ratio_index: ndarray, list of roidb entry indices correspond to the ratios 165 | """ 166 | RATIO_HI = cfg.TRAIN.ASPECT_HI # largest ratio to preserve. 167 | RATIO_LO = cfg.TRAIN.ASPECT_LO # smallest ratio to preserve. 168 | 169 | need_crop_cnt = 0 170 | 171 | ratio_list = [] 172 | for entry in roidb: 173 | width = entry['width'] 174 | height = entry['height'] 175 | ratio = width / float(height) 176 | 177 | if cfg.TRAIN.ASPECT_CROPPING: 178 | if ratio > RATIO_HI: 179 | entry['need_crop'] = True 180 | ratio = RATIO_HI 181 | need_crop_cnt += 1 182 | elif ratio < RATIO_LO: 183 | entry['need_crop'] = True 184 | ratio = RATIO_LO 185 | need_crop_cnt += 1 186 | else: 187 | entry['need_crop'] = False 188 | else: 189 | entry['need_crop'] = False 190 | 191 | ratio_list.append(ratio) 192 | 193 | if cfg.TRAIN.ASPECT_CROPPING: 194 | logging.info('Number of entries that need to be cropped: %d. 
Ratio bound: [%.2f, %.2f]', 195 | need_crop_cnt, RATIO_LO, RATIO_HI) 196 | ratio_list = np.array(ratio_list) 197 | ratio_index = np.argsort(ratio_list) 198 | return ratio_list[ratio_index], ratio_index 199 | 200 | def add_bbox_regression_targets(roidb): 201 | """Add information needed to train bounding-box regressors.""" 202 | for entry in roidb: 203 | entry['bbox_targets'] = _compute_targets(entry) 204 | 205 | 206 | def _compute_targets(entry): 207 | """Compute bounding-box regression targets for an image.""" 208 | # Indices of ground-truth ROIs 209 | rois = entry['boxes'] 210 | overlaps = entry['max_overlaps'] 211 | labels = entry['max_classes'] 212 | gt_inds = np.where((entry['gt_classes'] > 0) & (entry['is_crowd'] == 0))[0] 213 | # Targets has format (class, tx, ty, tw, th) 214 | targets = np.zeros((rois.shape[0], 5), dtype=np.float32) 215 | if len(gt_inds) == 0: 216 | # Bail if the image has no ground-truth ROIs 217 | return targets 218 | 219 | # Indices of examples for which we try to make predictions 220 | ex_inds = np.where(overlaps >= cfg.TRAIN.BBOX_THRESH)[0] 221 | 222 | # Get IoU overlap between each ex ROI and gt ROI 223 | ex_gt_overlaps = box_utils.bbox_overlaps( 224 | rois[ex_inds, :].astype(dtype=np.float32, copy=False), 225 | rois[gt_inds, :].astype(dtype=np.float32, copy=False)) 226 | 227 | # Find which gt ROI each ex ROI has max overlap with: 228 | # this will be the ex ROI's gt target 229 | gt_assignment = ex_gt_overlaps.argmax(axis=1) 230 | gt_rois = rois[gt_inds[gt_assignment], :] 231 | ex_rois = rois[ex_inds, :] 232 | # Use class "1" for all boxes if using class_agnostic_bbox_reg 233 | targets[ex_inds, 0] = ( 234 | 1 if cfg.MODEL.CLS_AGNOSTIC_BBOX_REG else labels[ex_inds]) 235 | targets[ex_inds, 1:] = box_utils.bbox_transform_inv( 236 | ex_rois, gt_rois, cfg.MODEL.BBOX_REG_WEIGHTS) 237 | return targets 238 | 239 | 240 | def _compute_and_log_stats(roidb): 241 | classes = roidb[0]['dataset'].classes 242 | char_len = np.max([len(c) for c in classes]) 243 | hist_bins = np.arange(len(classes) + 1) 244 | 245 | # Histogram of ground-truth objects 246 | gt_hist = np.zeros((len(classes)), dtype=np.int) 247 | for entry in roidb: 248 | gt_inds = np.where( 249 | (entry['gt_classes'] > 0) & (entry['is_crowd'] == 0))[0] 250 | gt_classes = entry['gt_classes'][gt_inds] 251 | gt_hist += np.histogram(gt_classes, bins=hist_bins)[0] 252 | logger.debug('Ground-truth class histogram:') 253 | for i, v in enumerate(gt_hist): 254 | logger.debug( 255 | '{:d}{:s}: {:d}'.format( 256 | i, classes[i].rjust(char_len), v)) 257 | logger.debug('-' * char_len) 258 | logger.debug( 259 | '{:s}: {:d}'.format( 260 | 'total'.rjust(char_len), np.sum(gt_hist))) 261 | -------------------------------------------------------------------------------- /lib/core/test_engine_rel.py: -------------------------------------------------------------------------------- 1 | # Adapted by Ji Zhang, 2019 2 | # from Detectron.pytorch/lib/core/test_engine.py 3 | # Original license text below 4 | # -------------------------------------------------------- 5 | # Copyright (c) 2017-present, Facebook, Inc. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 
9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | ############################################################################## 19 | 20 | """Test a Detectron network on an imdb (image database).""" 21 | 22 | from __future__ import absolute_import 23 | from __future__ import division 24 | from __future__ import print_function 25 | from __future__ import unicode_literals 26 | 27 | from collections import defaultdict 28 | import cv2 29 | import datetime 30 | import logging 31 | import numpy as np 32 | from numpy import linalg as la 33 | import os 34 | import yaml 35 | import json 36 | from six.moves import cPickle as pickle 37 | 38 | import torch 39 | import nn as mynn 40 | from torch.autograd import Variable 41 | 42 | from core.config import cfg 43 | from core.test_rel import im_detect_rels 44 | from datasets_rel import task_evaluation_sg as task_evaluation_sg 45 | from datasets_rel import task_evaluation_vg_and_vrd as task_evaluation_vg_and_vrd 46 | from datasets_rel.json_dataset_rel import JsonDatasetRel 47 | from modeling_rel import model_builder_rel 48 | from utils.detectron_weight_helper import load_detectron_weight 49 | import utils.env as envu 50 | import utils_rel.net_rel as net_utils_rel 51 | import utils_rel.subprocess_rel as subprocess_utils 52 | import utils.vis as vis_utils 53 | from utils.io import save_object 54 | from utils.timer import Timer 55 | 56 | logger = logging.getLogger(__name__) 57 | 58 | 59 | def get_eval_functions(): 60 | # Determine which parent or child function should handle inference 61 | # Generic case that handles all network types other than RPN-only nets 62 | # and RetinaNet 63 | child_func = test_net 64 | parent_func = test_net_on_dataset 65 | 66 | return parent_func, child_func 67 | 68 | 69 | def get_inference_dataset(index, is_parent=True): 70 | assert is_parent or len(cfg.TEST.DATASETS) == 1, \ 71 | 'The child inference process can only work on a single dataset' 72 | 73 | dataset_name = cfg.TEST.DATASETS[index] 74 | proposal_file = None 75 | 76 | return dataset_name, proposal_file 77 | 78 | 79 | def run_inference( 80 | args, ind_range=None, 81 | multi_gpu_testing=False, gpu_id=0, 82 | check_expected_results=False): 83 | parent_func, child_func = get_eval_functions() 84 | is_parent = ind_range is None 85 | 86 | def result_getter(): 87 | if is_parent: 88 | # Parent case: 89 | # In this case we're either running inference on the entire dataset in a 90 | # single process or (if multi_gpu_testing is True) using this process to 91 | # launch subprocesses that each run inference on a range of the dataset 92 | all_results = [] 93 | for i in range(len(cfg.TEST.DATASETS)): 94 | dataset_name, proposal_file = get_inference_dataset(i) 95 | output_dir = args.output_dir 96 | results = parent_func( 97 | args, 98 | dataset_name, 99 | proposal_file, 100 | output_dir, 101 | multi_gpu=multi_gpu_testing 102 | ) 103 | all_results.append(results) 104 | 105 | return all_results 106 | else: 107 | # Subprocess child case: 108 | # In this case test_net was called via subprocess.Popen to execute on a 109 | # range of inputs on a single dataset 110 | dataset_name, proposal_file = 
get_inference_dataset(0, is_parent=False) 111 | output_dir = args.output_dir 112 | return child_func( 113 | args, 114 | dataset_name, 115 | proposal_file, 116 | output_dir, 117 | ind_range=ind_range, 118 | gpu_id=gpu_id 119 | ) 120 | 121 | all_results = result_getter() 122 | 123 | return all_results 124 | 125 | 126 | def test_net_on_dataset( 127 | args, 128 | dataset_name, 129 | proposal_file, 130 | output_dir, 131 | multi_gpu=False, 132 | gpu_id=0): 133 | """Run inference on a dataset.""" 134 | dataset = JsonDatasetRel(dataset_name) 135 | test_timer = Timer() 136 | test_timer.tic() 137 | if multi_gpu: 138 | num_images = len(dataset.get_roidb(gt=args.do_val)) 139 | all_results = multi_gpu_test_net_on_dataset( 140 | args, dataset_name, proposal_file, num_images, output_dir 141 | ) 142 | else: 143 | all_results = test_net( 144 | args, dataset_name, proposal_file, output_dir, gpu_id=gpu_id 145 | ) 146 | test_timer.toc() 147 | logger.info('Total inference time: {:.3f}s'.format(test_timer.average_time)) 148 | 149 | logger.info('Starting evaluation now...') 150 | if dataset_name.find('vg') >= 0 or dataset_name.find('vrd') >= 0: 151 | task_evaluation_vg_and_vrd.eval_rel_results(all_results, output_dir, args.do_val) 152 | else: 153 | task_evaluation_sg.eval_rel_results(all_results, output_dir, args.do_val, args.do_vis, args.do_special) 154 | 155 | return all_results 156 | 157 | 158 | def multi_gpu_test_net_on_dataset( 159 | args, dataset_name, proposal_file, num_images, output_dir): 160 | """Multi-gpu inference on a dataset.""" 161 | binary_dir = envu.get_runtime_dir() 162 | binary_ext = envu.get_py_bin_ext() 163 | binary = os.path.join(binary_dir, args.test_net_file + binary_ext) 164 | assert os.path.exists(binary), 'Binary \'{}\' not found'.format(binary) 165 | 166 | # Pass the target dataset and proposal file (if any) via the command line 167 | opts = ['TEST.DATASETS', '("{}",)'.format(dataset_name)] 168 | if proposal_file: 169 | opts += ['TEST.PROPOSAL_FILES', '("{}",)'.format(proposal_file)] 170 | 171 | if args.do_val: 172 | opts += ['--do_val'] 173 | if args.do_vis: 174 | opts += ['--do_vis'] 175 | if args.do_special: 176 | opts += ['--do_special'] 177 | if args.use_gt_boxes: 178 | opts += ['--use_gt_boxes'] 179 | if args.use_gt_labels: 180 | opts += ['--use_gt_labels'] 181 | 182 | # Run inference in parallel in subprocesses 183 | # Outputs will be a list of outputs from each subprocess, where the output 184 | # of each subprocess is the dictionary saved by test_net(). 
185 | outputs = subprocess_utils.process_in_parallel( 186 | 'rel_detection', num_images, binary, output_dir, 187 | args.load_ckpt, args.load_detectron, opts 188 | ) 189 | 190 | # Collate the results from each subprocess 191 | all_results = [] 192 | for det_data in outputs: 193 | all_results += det_data 194 | 195 | if args.use_gt_boxes: 196 | if args.use_gt_labels: 197 | det_file = os.path.join(args.output_dir, 'rel_detections_gt_boxes_prdcls.pkl') 198 | else: 199 | det_file = os.path.join(args.output_dir, 'rel_detections_gt_boxes_sgcls.pkl') 200 | else: 201 | det_file = os.path.join(args.output_dir, 'rel_detections.pkl') 202 | save_object(all_results, det_file) 203 | logger.info('Wrote rel_detections to: {}'.format(os.path.abspath(det_file))) 204 | 205 | return all_results 206 | 207 | 208 | def test_net( 209 | args, 210 | dataset_name, 211 | proposal_file, 212 | output_dir, 213 | ind_range=None, 214 | gpu_id=0): 215 | """Run inference on all images in a dataset or over an index range of images 216 | in a dataset using a single GPU. 217 | """ 218 | assert not cfg.MODEL.RPN_ONLY, \ 219 | 'Use rpn_generate to generate proposals from RPN-only models' 220 | 221 | roidb, dataset, start_ind, end_ind, total_num_images = get_roidb_and_dataset( 222 | dataset_name, proposal_file, ind_range, args.do_val 223 | ) 224 | model = initialize_model_from_cfg(args, gpu_id=gpu_id) 225 | 226 | num_images = len(roidb) 227 | all_results = [None for _ in range(num_images)] 228 | timers = defaultdict(Timer) 229 | for i, entry in enumerate(roidb): 230 | box_proposals = None 231 | 232 | im = cv2.imread(entry['image']) 233 | if args.use_gt_boxes: 234 | im_results = im_detect_rels(model, im, dataset_name, box_proposals, args.do_vis, timers, entry, args.use_gt_labels) 235 | else: 236 | im_results = im_detect_rels(model, im, dataset_name, box_proposals, args.do_vis, timers) 237 | 238 | im_results.update(dict(image=entry['image'])) 239 | # add gt 240 | if args.do_val: 241 | im_results.update( 242 | dict(gt_sbj_boxes=entry['sbj_gt_boxes'], 243 | gt_sbj_labels=entry['sbj_gt_classes'], 244 | gt_obj_boxes=entry['obj_gt_boxes'], 245 | gt_obj_labels=entry['obj_gt_classes'], 246 | gt_prd_labels=entry['prd_gt_classes'])) 247 | 248 | all_results[i] = im_results 249 | 250 | if i % 10 == 0: # Reduce log file size 251 | ave_total_time = np.sum([t.average_time for t in timers.values()]) 252 | eta_seconds = ave_total_time * (num_images - i - 1) 253 | eta = str(datetime.timedelta(seconds=int(eta_seconds))) 254 | det_time = (timers['im_detect_rels'].average_time) 255 | logger.info(( 256 | 'im_detect: range [{:d}, {:d}] of {:d}: ' 257 | '{:d}/{:d} {:.3f}s (eta: {})').format( 258 | start_ind + 1, end_ind, total_num_images, start_ind + i + 1, 259 | start_ind + num_images, det_time, eta)) 260 | 261 | cfg_yaml = yaml.dump(cfg) 262 | if ind_range is not None: 263 | det_name = 'rel_detection_range_%s_%s.pkl' % tuple(ind_range) 264 | else: 265 | if args.use_gt_boxes: 266 | if args.use_gt_labels: 267 | det_name = 'rel_detections_gt_boxes_prdcls.pkl' 268 | else: 269 | det_name = 'rel_detections_gt_boxes_sgcls.pkl' 270 | else: 271 | det_name = 'rel_detections.pkl' 272 | det_file = os.path.join(output_dir, det_name) 273 | save_object(all_results, det_file) 274 | logger.info('Wrote rel_detections to: {}'.format(os.path.abspath(det_file))) 275 | return all_results 276 | 277 | 278 | def initialize_model_from_cfg(args, gpu_id=0): 279 | """Initialize a model from the global cfg. Loads test-time weights and 280 | set to evaluation mode. 
281 | """ 282 | model = model_builder_rel.Generalized_RCNN() 283 | model.eval() 284 | 285 | if args.cuda: 286 | model.cuda() 287 | 288 | if args.load_ckpt: 289 | load_name = args.load_ckpt 290 | logger.info("loading checkpoint %s", load_name) 291 | checkpoint = torch.load(load_name, map_location=lambda storage, loc: storage) 292 | net_utils_rel.load_ckpt_rel(model, checkpoint['model']) 293 | 294 | if args.load_detectron: 295 | logger.info("loading detectron weights %s", args.load_detectron) 296 | load_detectron_weight(model, args.load_detectron) 297 | 298 | model = mynn.DataParallel(model, cpu_keywords=['im_info', 'roidb'], minibatch=True) 299 | 300 | return model 301 | 302 | 303 | def get_roidb_and_dataset(dataset_name, proposal_file, ind_range, do_val=True): 304 | """Get the roidb for the dataset specified in the global cfg. Optionally 305 | restrict it to a range of indices if ind_range is a pair of integers. 306 | """ 307 | dataset = JsonDatasetRel(dataset_name) 308 | roidb = dataset.get_roidb(gt=do_val) 309 | 310 | if ind_range is not None: 311 | total_num_images = len(roidb) 312 | start, end = ind_range 313 | roidb = roidb[start:end] 314 | else: 315 | start = 0 316 | end = len(roidb) 317 | total_num_images = end 318 | 319 | return roidb, dataset, start, end, total_num_images 320 | -------------------------------------------------------------------------------- /lib/roi_data_rel/loader_rel.py: -------------------------------------------------------------------------------- 1 | # Adapted by Ji Zhang for this project in 2019 2 | # 3 | # Based on Detectron.Pytorch/lib/roi/loader.py by Roy Tseng 4 | 5 | import math 6 | import numpy as np 7 | import numpy.random as npr 8 | 9 | import torch 10 | import torch.utils.data as data 11 | import torch.utils.data.sampler as torch_sampler 12 | from torch.utils.data.dataloader import default_collate 13 | from torch._six import int_classes as _int_classes 14 | 15 | from core.config import cfg 16 | from roi_data_rel.minibatch_rel import get_minibatch 17 | import utils.blob as blob_utils 18 | 19 | 20 | class RoiDataLoader(data.Dataset): 21 | def __init__(self, roidb, num_classes, training=True): 22 | self._roidb = roidb 23 | self._num_classes = num_classes 24 | self.training = training 25 | self.DATA_SIZE = len(self._roidb) 26 | 27 | def __getitem__(self, index_tuple): 28 | index, ratio = index_tuple 29 | single_db = [self._roidb[index]] 30 | blobs, valid = get_minibatch(single_db) 31 | #TODO: Check if minibatch is valid ? If not, abandon it. 32 | # Need to change _worker_loop in torch.utils.data.dataloader.py. 
33 | 34 | # Squeeze batch dim 35 | for key in blobs: 36 | if key != 'roidb': 37 | blobs[key] = blobs[key].squeeze(axis=0) 38 | 39 | if self._roidb[index]['need_crop']: 40 | self.crop_data(blobs, ratio) 41 | # Check bounding box 42 | entry = blobs['roidb'][0] 43 | boxes = entry['boxes'] 44 | invalid = (boxes[:, 0] == boxes[:, 2]) | (boxes[:, 1] == boxes[:, 3]) 45 | valid_inds = np.nonzero(~ invalid)[0] 46 | if len(valid_inds) < len(boxes): 47 | for key in ['boxes', 'gt_classes', 'seg_areas', 'gt_overlaps', 'is_crowd', 48 | 'box_to_gt_ind_map', 'gt_keypoints']: 49 | if key in entry: 50 | entry[key] = entry[key][valid_inds] 51 | entry['segms'] = [entry['segms'][ind] for ind in valid_inds] 52 | # for rel sanity check 53 | sbj_gt_boxes = entry['sbj_gt_boxes'] 54 | obj_gt_boxes = entry['obj_gt_boxes'] 55 | sbj_invalid = (sbj_gt_boxes[:, 0] == sbj_gt_boxes[:, 2]) | (sbj_gt_boxes[:, 1] == sbj_gt_boxes[:, 3]) 56 | obj_invalid = (obj_gt_boxes[:, 0] == obj_gt_boxes[:, 2]) | (obj_gt_boxes[:, 1] == obj_gt_boxes[:, 3]) 57 | rel_invalid = sbj_invalid | obj_invalid 58 | rel_valid_inds = np.nonzero(~ rel_invalid)[0] 59 | if len(rel_valid_inds) < len(sbj_gt_boxes): 60 | for key in ['sbj_gt_boxes', 'sbj_gt_classes', 'obj_gt_boxes', 'obj_gt_classes', 'prd_gt_classes', 61 | 'sbj_gt_overlaps', 'obj_gt_overlaps', 'prd_gt_overlaps', 'pair_to_gt_ind_map', 62 | 'width', 'height']: 63 | if key in entry: 64 | entry[key] = entry[key][rel_valid_inds] 65 | 66 | blobs['roidb'] = blob_utils.serialize(blobs['roidb']) # CHECK: maybe we can serialize in collate_fn 67 | 68 | return blobs 69 | 70 | def crop_data(self, blobs, ratio): 71 | data_height, data_width = map(int, blobs['im_info'][:2]) 72 | boxes = blobs['roidb'][0]['boxes'] 73 | if ratio < 1: # width << height, crop height 74 | size_crop = math.ceil(data_width / ratio) # size after crop 75 | min_y = math.floor(np.min(boxes[:, 1])) 76 | max_y = math.floor(np.max(boxes[:, 3])) 77 | box_region = max_y - min_y + 1 78 | if min_y == 0: 79 | y_s = 0 80 | else: 81 | if (box_region - size_crop) < 0: 82 | y_s_min = max(max_y - size_crop, 0) 83 | y_s_max = min(min_y, data_height - size_crop) 84 | y_s = y_s_min if y_s_min == y_s_max else \ 85 | npr.choice(range(y_s_min, y_s_max + 1)) 86 | else: 87 | # CHECK: rethinking the mechanism for the case box_region > size_crop 88 | # Now, the crop is biased on the lower part of box_region caused by 89 | # // 2 for y_s_add 90 | y_s_add = (box_region - size_crop) // 2 91 | y_s = min_y if y_s_add == 0 else \ 92 | npr.choice(range(min_y, min_y + y_s_add + 1)) 93 | # Crop the image 94 | blobs['data'] = blobs['data'][:, y_s:(y_s + size_crop), :,] 95 | # Update im_info 96 | blobs['im_info'][0] = size_crop 97 | # Shift and clamp boxes ground truth 98 | boxes[:, 1] -= y_s 99 | boxes[:, 3] -= y_s 100 | np.clip(boxes[:, 1], 0, size_crop - 1, out=boxes[:, 1]) 101 | np.clip(boxes[:, 3], 0, size_crop - 1, out=boxes[:, 3]) 102 | blobs['roidb'][0]['boxes'] = boxes 103 | else: # width >> height, crop width 104 | size_crop = math.ceil(data_height * ratio) 105 | min_x = math.floor(np.min(boxes[:, 0])) 106 | max_x = math.floor(np.max(boxes[:, 2])) 107 | box_region = max_x - min_x + 1 108 | if min_x == 0: 109 | x_s = 0 110 | else: 111 | if (box_region - size_crop) < 0: 112 | x_s_min = max(max_x - size_crop, 0) 113 | x_s_max = min(min_x, data_width - size_crop) 114 | x_s = x_s_min if x_s_min == x_s_max else \ 115 | npr.choice(range(x_s_min, x_s_max + 1)) 116 | else: 117 | x_s_add = (box_region - size_crop) // 2 118 | x_s = min_x if x_s_add == 0 else \ 119 | 
npr.choice(range(min_x, min_x + x_s_add + 1)) 120 | # Crop the image 121 | blobs['data'] = blobs['data'][:, :, x_s:(x_s + size_crop)] 122 | # Update im_info 123 | blobs['im_info'][1] = size_crop 124 | # Shift and clamp boxes ground truth 125 | boxes[:, 0] -= x_s 126 | boxes[:, 2] -= x_s 127 | np.clip(boxes[:, 0], 0, size_crop - 1, out=boxes[:, 0]) 128 | np.clip(boxes[:, 2], 0, size_crop - 1, out=boxes[:, 2]) 129 | blobs['roidb'][0]['boxes'] = boxes 130 | 131 | def __len__(self): 132 | return self.DATA_SIZE 133 | 134 | 135 | def cal_minibatch_ratio(ratio_list): 136 | """Given the ratio_list, we want to make the RATIO same for each minibatch on each GPU. 137 | Note: this only work for 1) cfg.TRAIN.MAX_SIZE is ignored during `prep_im_for_blob` 138 | and 2) cfg.TRAIN.SCALES containing SINGLE scale. 139 | Since all prepared images will have same min side length of cfg.TRAIN.SCALES[0], we can 140 | pad and batch images base on that. 141 | """ 142 | DATA_SIZE = len(ratio_list) 143 | ratio_list_minibatch = np.empty((DATA_SIZE,)) 144 | num_minibatch = int(np.ceil(DATA_SIZE / cfg.TRAIN.IMS_PER_BATCH)) # Include leftovers 145 | for i in range(num_minibatch): 146 | left_idx = i * cfg.TRAIN.IMS_PER_BATCH 147 | right_idx = min((i+1) * cfg.TRAIN.IMS_PER_BATCH - 1, DATA_SIZE - 1) 148 | 149 | if ratio_list[right_idx] < 1: 150 | # for ratio < 1, we preserve the leftmost in each batch. 151 | target_ratio = ratio_list[left_idx] 152 | elif ratio_list[left_idx] > 1: 153 | # for ratio > 1, we preserve the rightmost in each batch. 154 | target_ratio = ratio_list[right_idx] 155 | else: 156 | # for ratio cross 1, we make it to be 1. 157 | target_ratio = 1 158 | 159 | ratio_list_minibatch[left_idx:(right_idx+1)] = target_ratio 160 | return ratio_list_minibatch 161 | 162 | 163 | class MinibatchSampler(torch_sampler.Sampler): 164 | def __init__(self, ratio_list, ratio_index): 165 | self.ratio_list = ratio_list 166 | self.ratio_index = ratio_index 167 | self.num_data = len(ratio_list) 168 | 169 | if cfg.TRAIN.ASPECT_GROUPING: 170 | # Given the ratio_list, we want to make the ratio same 171 | # for each minibatch on each GPU. 172 | self.ratio_list_minibatch = cal_minibatch_ratio(ratio_list) 173 | 174 | def __iter__(self): 175 | if cfg.TRAIN.ASPECT_GROUPING: 176 | # indices for aspect grouping awared permutation 177 | n, rem = divmod(self.num_data, cfg.TRAIN.IMS_PER_BATCH) 178 | round_num_data = n * cfg.TRAIN.IMS_PER_BATCH 179 | indices = np.arange(round_num_data) 180 | npr.shuffle(indices.reshape(-1, cfg.TRAIN.IMS_PER_BATCH)) # inplace shuffle 181 | if rem != 0: 182 | indices = np.append(indices, np.arange(round_num_data, round_num_data + rem)) 183 | ratio_index = self.ratio_index[indices] 184 | ratio_list_minibatch = self.ratio_list_minibatch[indices] 185 | else: 186 | rand_perm = npr.permutation(self.num_data) 187 | ratio_list = self.ratio_list[rand_perm] 188 | ratio_index = self.ratio_index[rand_perm] 189 | # re-calculate minibatch ratio list 190 | ratio_list_minibatch = cal_minibatch_ratio(ratio_list) 191 | 192 | return iter(zip(ratio_index.tolist(), ratio_list_minibatch.tolist())) 193 | 194 | def __len__(self): 195 | return self.num_data 196 | 197 | 198 | class BatchSampler(torch_sampler.BatchSampler): 199 | r"""Wraps another sampler to yield a mini-batch of indices. 200 | Args: 201 | sampler (Sampler): Base sampler. 202 | batch_size (int): Size of mini-batch. 
203 | drop_last (bool): If ``True``, the sampler will drop the last batch if 204 | its size would be less than ``batch_size`` 205 | Example: 206 | >>> list(BatchSampler(range(10), batch_size=3, drop_last=False)) 207 | [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]] 208 | >>> list(BatchSampler(range(10), batch_size=3, drop_last=True)) 209 | [[0, 1, 2], [3, 4, 5], [6, 7, 8]] 210 | """ 211 | 212 | def __init__(self, sampler, batch_size, drop_last): 213 | if not isinstance(sampler, torch_sampler.Sampler): 214 | raise ValueError("sampler should be an instance of " 215 | "torch.utils.data.Sampler, but got sampler={}" 216 | .format(sampler)) 217 | if not isinstance(batch_size, _int_classes) or isinstance(batch_size, bool) or \ 218 | batch_size <= 0: 219 | raise ValueError("batch_size should be a positive integer value, " 220 | "but got batch_size={}".format(batch_size)) 221 | if not isinstance(drop_last, bool): 222 | raise ValueError("drop_last should be a boolean value, but got " 223 | "drop_last={}".format(drop_last)) 224 | self.sampler = sampler 225 | self.batch_size = batch_size 226 | self.drop_last = drop_last 227 | 228 | def __iter__(self): 229 | batch = [] 230 | for idx in self.sampler: 231 | batch.append(idx) # Difference: batch.append(int(idx)) 232 | if len(batch) == self.batch_size: 233 | yield batch 234 | batch = [] 235 | if len(batch) > 0 and not self.drop_last: 236 | yield batch 237 | 238 | def __len__(self): 239 | if self.drop_last: 240 | return len(self.sampler) // self.batch_size 241 | else: 242 | return (len(self.sampler) + self.batch_size - 1) // self.batch_size 243 | 244 | 245 | 246 | def collate_minibatch(list_of_blobs): 247 | """Stack samples separately and return a list of minibatches 248 | A batch contains NUM_GPUS minibatches, and image sizes in different minibatches may be different. 249 | Hence, we need to stack samples from each minibatch separately. 250 | """ 251 | Batch = {key: [] for key in list_of_blobs[0]} 252 | # Because roidb consists of entries of variable length, it can't be batched into a tensor. 253 | # So we keep roidb in the type of "list of ndarray". 
254 | list_of_roidb = [blobs.pop('roidb') for blobs in list_of_blobs] 255 | for i in range(0, len(list_of_blobs), cfg.TRAIN.IMS_PER_BATCH): 256 | mini_list = list_of_blobs[i:(i + cfg.TRAIN.IMS_PER_BATCH)] 257 | # Pad image data 258 | mini_list = pad_image_data(mini_list) 259 | minibatch = default_collate(mini_list) 260 | minibatch['roidb'] = list_of_roidb[i:(i + cfg.TRAIN.IMS_PER_BATCH)] 261 | for key in minibatch: 262 | Batch[key].append(minibatch[key]) 263 | 264 | return Batch 265 | 266 | 267 | def pad_image_data(list_of_blobs): 268 | max_shape = blob_utils.get_max_shape([blobs['data'].shape[1:] for blobs in list_of_blobs]) 269 | output_list = [] 270 | for blobs in list_of_blobs: 271 | data_padded = np.zeros((3, max_shape[0], max_shape[1]), dtype=np.float32) 272 | _, h, w = blobs['data'].shape 273 | data_padded[:, :h, :w] = blobs['data'] 274 | blobs['data'] = data_padded 275 | output_list.append(blobs) 276 | return output_list 277 | -------------------------------------------------------------------------------- /lib/datasets_rel/task_evaluation_sg.py: -------------------------------------------------------------------------------- 1 | """ 2 | Written by Ji Zhang, 2019 3 | Some functions are adapted from Rowan Zellers 4 | Original source: 5 | https://github.com/rowanz/neural-motifs/blob/master/lib/evaluation/sg_eval.py 6 | """ 7 | import os 8 | import numpy as np 9 | import logging 10 | from six.moves import cPickle as pickle 11 | import json 12 | import csv 13 | from tqdm import tqdm 14 | 15 | from core.config import cfg 16 | from functools import reduce 17 | from utils.boxes import bbox_overlaps 18 | from datasets_rel.ap_eval_rel import ap_eval, prepare_mAP_dets 19 | 20 | from .pytorch_misc import intersect_2d, argsort_desc 21 | 22 | np.set_printoptions(precision=3) 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | 27 | def eval_rel_results(all_results, output_dir, do_val=True, do_vis=False, do_special=False): 28 | 29 | topk = 100 30 | 31 | if cfg.TEST.DATASETS[0].find('vg') >= 0: 32 | eval_per_img = True 33 | # eval_per_img = False 34 | prd_k = 1 35 | else: 36 | eval_per_img = False 37 | prd_k = 2 38 | 39 | if cfg.TEST.DATASETS[0].find('oi') >= 0: 40 | eval_ap = True 41 | else: 42 | eval_ap = False 43 | 44 | if eval_per_img: 45 | recalls = {1: [], 5: [], 10: [], 20: [], 50: [], 100: []} 46 | else: 47 | recalls = {1: 0, 5: 0, 10: 0, 20: 0, 50: 0, 100: 0} 48 | if do_val: 49 | all_gt_cnt = 0 50 | 51 | if do_special: 52 | special_img_f = open("/home/jiz/projects/100_img_special_set.txt", "r") 53 | special_imgs = special_img_f.readlines() 54 | special_imgs = [img[:-1] for img in special_imgs] 55 | special_img_set = set(special_imgs) 56 | logger.info('Special images len: {}'.format(len(special_img_set))) 57 | 58 | topk_dets = [] 59 | for im_i, res in enumerate(tqdm(all_results)): 60 | 61 | if do_special: 62 | img_id = res['image'].split('/')[-1].split('.')[0] 63 | if img_id not in special_img_set: 64 | continue 65 | 66 | # in oi_all_rel some images have no dets 67 | if res['prd_scores'] is None: 68 | det_boxes_s_top = np.zeros((0, 4), dtype=np.float32) 69 | det_boxes_o_top = np.zeros((0, 4), dtype=np.float32) 70 | det_labels_s_top = np.zeros(0, dtype=np.int32) 71 | det_labels_p_top = np.zeros(0, dtype=np.int32) 72 | det_labels_o_top = np.zeros(0, dtype=np.int32) 73 | det_scores_top = np.zeros(0, dtype=np.float32) 74 | 75 | det_scores_top_vis = np.zeros(0, dtype=np.float32) 76 | if 'prd_scores_bias' in res: 77 | det_scores_top_bias = np.zeros(0, dtype=np.float32) 78 | if 
'prd_scores_spt' in res: 79 | det_scores_top_spt = np.zeros(0, dtype=np.float32) 80 | else: 81 | det_boxes_sbj = res['sbj_boxes'] # (#num_rel, 4) 82 | det_boxes_obj = res['obj_boxes'] # (#num_rel, 4) 83 | det_labels_sbj = res['sbj_labels'] # (#num_rel,) 84 | det_labels_obj = res['obj_labels'] # (#num_rel,) 85 | det_scores_sbj = res['sbj_scores'] # (#num_rel,) 86 | det_scores_obj = res['obj_scores'] # (#num_rel,) 87 | if 'prd_scores_ttl' in res: 88 | det_scores_prd = res['prd_scores_ttl'][:, 1:] 89 | else: 90 | det_scores_prd = res['prd_scores'][:, 1:] 91 | 92 | det_labels_prd = np.argsort(-det_scores_prd, axis=1) 93 | det_scores_prd = -np.sort(-det_scores_prd, axis=1) 94 | 95 | det_scores_so = det_scores_sbj * det_scores_obj 96 | det_scores_spo = det_scores_so[:, None] * det_scores_prd[:, :prd_k] 97 | 98 | det_scores_inds = argsort_desc(det_scores_spo)[:topk] 99 | det_scores_top = det_scores_spo[det_scores_inds[:, 0], det_scores_inds[:, 1]] 100 | det_boxes_so_top = np.hstack( 101 | (det_boxes_sbj[det_scores_inds[:, 0]], det_boxes_obj[det_scores_inds[:, 0]])) 102 | det_labels_p_top = det_labels_prd[det_scores_inds[:, 0], det_scores_inds[:, 1]] 103 | det_labels_spo_top = np.vstack( 104 | (det_labels_sbj[det_scores_inds[:, 0]], det_labels_p_top, det_labels_obj[det_scores_inds[:, 0]])).transpose() 105 | 106 | # filter out bad relationships 107 | cand_inds = np.where(det_scores_top > cfg.TEST.SPO_SCORE_THRESH)[0] 108 | det_boxes_so_top = det_boxes_so_top[cand_inds] 109 | det_labels_spo_top = det_labels_spo_top[cand_inds] 110 | det_scores_top = det_scores_top[cand_inds] 111 | 112 | det_scores_vis = res['prd_scores'][:, 1:] 113 | for i in range(det_labels_prd.shape[0]): 114 | det_scores_vis[i] = det_scores_vis[i][det_labels_prd[i]] 115 | det_scores_vis = det_scores_vis[:, :prd_k] 116 | det_scores_top_vis = det_scores_vis[det_scores_inds[:, 0], det_scores_inds[:, 1]] 117 | det_scores_top_vis = det_scores_top_vis[cand_inds] 118 | if 'prd_scores_bias' in res: 119 | det_scores_bias = res['prd_scores_bias'][:, 1:] 120 | for i in range(det_labels_prd.shape[0]): 121 | det_scores_bias[i] = det_scores_bias[i][det_labels_prd[i]] 122 | det_scores_bias = det_scores_bias[:, :prd_k] 123 | det_scores_top_bias = det_scores_bias[det_scores_inds[:, 0], det_scores_inds[:, 1]] 124 | det_scores_top_bias = det_scores_top_bias[cand_inds] 125 | if 'prd_scores_spt' in res: 126 | det_scores_spt = res['prd_scores_spt'][:, 1:] 127 | for i in range(det_labels_prd.shape[0]): 128 | det_scores_spt[i] = det_scores_spt[i][det_labels_prd[i]] 129 | det_scores_spt = det_scores_spt[:, :prd_k] 130 | det_scores_top_spt = det_scores_spt[det_scores_inds[:, 0], det_scores_inds[:, 1]] 131 | det_scores_top_spt = det_scores_top_spt[cand_inds] 132 | 133 | det_boxes_s_top = det_boxes_so_top[:, :4] 134 | det_boxes_o_top = det_boxes_so_top[:, 4:] 135 | det_labels_s_top = det_labels_spo_top[:, 0] 136 | det_labels_p_top = det_labels_spo_top[:, 1] 137 | det_labels_o_top = det_labels_spo_top[:, 2] 138 | 139 | topk_dets.append(dict(image=res['image'], 140 | det_boxes_s_top=det_boxes_s_top, 141 | det_boxes_o_top=det_boxes_o_top, 142 | det_labels_s_top=det_labels_s_top, 143 | det_labels_p_top=det_labels_p_top, 144 | det_labels_o_top=det_labels_o_top, 145 | det_scores_top=det_scores_top)) 146 | topk_dets[-1]['det_scores_top_vis'] = det_scores_top_vis 147 | if 'prd_scores_bias' in res: 148 | topk_dets[-1]['det_scores_top_bias'] = det_scores_top_bias 149 | if 'prd_scores_spt' in res: 150 | topk_dets[-1]['det_scores_top_spt'] = det_scores_top_spt 151 | if 
do_vis: 152 | topk_dets[-1].update(dict(blob_conv=res['blob_conv'], 153 | blob_conv_prd=res['blob_conv_prd'])) 154 | 155 | if do_val: 156 | gt_boxes_sbj = res['gt_sbj_boxes'] # (#num_gt, 4) 157 | gt_boxes_obj = res['gt_obj_boxes'] # (#num_gt, 4) 158 | gt_labels_sbj = res['gt_sbj_labels'] # (#num_gt,) 159 | gt_labels_obj = res['gt_obj_labels'] # (#num_gt,) 160 | gt_labels_prd = res['gt_prd_labels'] # (#num_gt,) 161 | gt_boxes_so = np.hstack((gt_boxes_sbj, gt_boxes_obj)) 162 | gt_labels_spo = np.vstack((gt_labels_sbj, gt_labels_prd, gt_labels_obj)).transpose() 163 | # Compute recall. It's most efficient to match once and then do recall after 164 | # det_boxes_so_top is (#num_rel, 8) 165 | # det_labels_spo_top is (#num_rel, 3) 166 | pred_to_gt = _compute_pred_matches( 167 | gt_labels_spo, det_labels_spo_top, 168 | gt_boxes_so, det_boxes_so_top) 169 | if eval_per_img: 170 | for k in recalls: 171 | if len(pred_to_gt): 172 | match = reduce(np.union1d, pred_to_gt[:k]) 173 | else: 174 | match = [] 175 | rec_i = float(len(match)) / float(gt_labels_spo.shape[0] + 1e-12) # in case there is no gt 176 | recalls[k].append(rec_i) 177 | else: 178 | all_gt_cnt += gt_labels_spo.shape[0] 179 | for k in recalls: 180 | if len(pred_to_gt): 181 | match = reduce(np.union1d, pred_to_gt[:k]) 182 | else: 183 | match = [] 184 | recalls[k] += len(match) 185 | 186 | topk_dets[-1].update(dict(gt_boxes_sbj=gt_boxes_sbj, 187 | gt_boxes_obj=gt_boxes_obj, 188 | gt_labels_sbj=gt_labels_sbj, 189 | gt_labels_obj=gt_labels_obj, 190 | gt_labels_prd=gt_labels_prd)) 191 | 192 | if do_val: 193 | if eval_per_img: 194 | for k, v in recalls.items(): 195 | recalls[k] = np.mean(v) 196 | else: 197 | for k in recalls: 198 | recalls[k] = float(recalls[k]) / (float(all_gt_cnt) + 1e-12) 199 | excel_str = print_stats(recalls) 200 | if eval_ap: 201 | # prepare dets for each class 202 | logger.info('Preparing dets for mAP...') 203 | cls_image_ids, cls_dets, cls_gts, npos = prepare_mAP_dets(topk_dets, 9) 204 | all_npos = sum(npos) 205 | with open(cfg.DATA_DIR + '/openimages_v4/rel/rel_9_predicates.json') as f: 206 | rel_prd_cats = json.load(f) 207 | 208 | rel_mAP = 0. 209 | w_rel_mAP = 0. 210 | ap_str = '' 211 | for c in range(9): 212 | rec, prec, ap = ap_eval(cls_image_ids[c], cls_dets[c], cls_gts[c], npos[c], True) 213 | weighted_ap = ap * float(npos[c]) / float(all_npos) 214 | w_rel_mAP += weighted_ap 215 | rel_mAP += ap 216 | ap_str += '{:.2f}, '.format(100 * ap) 217 | print('rel AP for class {}: {:.2f} ({:.6f})'.format(rel_prd_cats[c], 100 * ap, float(npos[c]) / float(all_npos))) 218 | rel_mAP /= 9. 219 | print('weighted rel mAP: {:.2f}'.format(100 * w_rel_mAP)) 220 | excel_str += ap_str 221 | 222 | phr_mAP = 0. 223 | w_phr_mAP = 0. 224 | ap_str = '' 225 | for c in range(9): 226 | rec, prec, ap = ap_eval(cls_image_ids[c], cls_dets[c], cls_gts[c], npos[c], False) 227 | weighted_ap = ap * float(npos[c]) / float(all_npos) 228 | w_phr_mAP += weighted_ap 229 | phr_mAP += ap 230 | ap_str += '{:.2f}, '.format(100 * ap) 231 | print('phr AP for class {}: {:.2f} ({:.6f})'.format(rel_prd_cats[c], 100 * ap, float(npos[c]) / float(all_npos))) 232 | phr_mAP /= 9. 
233 | print('weighted phr mAP: {:.2f}'.format(100 * w_phr_mAP)) 234 | excel_str += ap_str 235 | 236 | # total: 0.4 x rel_mAP + 0.2 x R@50 + 0.4 x phr_mAP 237 | final_score = 0.4 * rel_mAP + 0.2 * recalls[50] + 0.4 * phr_mAP 238 | 239 | # total: 0.4 x w_rel_mAP + 0.2 x R@50 + 0.4 x w_phr_mAP 240 | w_final_score = 0.4 * w_rel_mAP + 0.2 * recalls[50] + 0.4 * w_phr_mAP 241 | print('weighted final_score: {:.2f}'.format(100 * w_final_score)) 242 | 243 | # get excel friendly string 244 | # excel_str = '{:.2f}, {:.2f}, {:.2f}, {:.2f}, '.format(100 * recalls[50], 100 * w_rel_mAP, 100 * w_phr_mAP, 100 * w_final_score) + excel_str 245 | # print('Excel-friendly format:') 246 | # print(excel_str.strip()[:-1]) 247 | 248 | # print('Saving topk dets...') 249 | # topk_dets_f = os.path.join(output_dir, 'rel_detections_topk.pkl') 250 | # with open(topk_dets_f, 'wb') as f: 251 | # pickle.dump(topk_dets, f, pickle.HIGHEST_PROTOCOL) 252 | # logger.info('topk_dets size: {}'.format(len(topk_dets))) 253 | print('Done.') 254 | 255 | 256 | def print_stats(recalls): 257 | # print('====================== ' + 'sgdet' + ' ============================') 258 | k_str = '' 259 | for k in recalls.keys(): 260 | if k == 50: 261 | continue 262 | k_str += '{}\t'.format(k) 263 | v_str = '' 264 | for k, v in recalls.items(): 265 | print('R@%i: %.2f' % (k, 100 * v)) 266 | if k == 50: 267 | continue 268 | v_str += '{:.2f}, '.format(100 * v) 269 | return v_str 270 | 271 | 272 | # This function is adapted from Rowan Zellers' code: 273 | # https://github.com/rowanz/neural-motifs/blob/master/lib/evaluation/sg_eval.py 274 | # Modified for this project to work with PyTorch v0.4 275 | def _compute_pred_matches(gt_triplets, pred_triplets, 276 | gt_boxes, pred_boxes, iou_thresh=0.5, phrdet=False): 277 | """ 278 | Given a set of predicted triplets, return the list of matching GT's for each of the 279 | given predictions 280 | :param gt_triplets: 281 | :param pred_triplets: 282 | :param gt_boxes: 283 | :param pred_boxes: 284 | :param iou_thresh: IoU threshold used for box matching 285 | :return: 286 | """ 287 | # This performs a matrix multiplication-esque thing between the two arrays 288 | # Instead of summing, we want the equality, so we reduce in that way 289 | # The rows correspond to GT triplets, columns to pred triplets 290 | keeps = intersect_2d(gt_triplets, pred_triplets) 291 | gt_has_match = keeps.any(1) 292 | pred_to_gt = [[] for x in range(pred_boxes.shape[0])] 293 | for gt_ind, gt_box, keep_inds in zip(np.where(gt_has_match)[0], 294 | gt_boxes[gt_has_match], 295 | keeps[gt_has_match], 296 | ): 297 | boxes = pred_boxes[keep_inds] 298 | if phrdet: 299 | # Evaluate where the union box IoU >= iou_thresh 300 | gt_box_union = gt_box.reshape((2, 4)) 301 | gt_box_union = np.concatenate((gt_box_union.min(0)[:2], gt_box_union.max(0)[2:]), 0) 302 | 303 | box_union = boxes.reshape((-1, 2, 4)) 304 | box_union = np.concatenate((box_union.min(1)[:,:2], box_union.max(1)[:,2:]), 1) 305 | 306 | gt_box_union = gt_box_union.astype(dtype=np.float32, copy=False) 307 | box_union = box_union.astype(dtype=np.float32, copy=False) 308 | inds = bbox_overlaps(gt_box_union[None], 309 | box_union)[0] >= iou_thresh 310 | 311 | else: 312 | gt_box = gt_box.astype(dtype=np.float32, copy=False) 313 | boxes = boxes.astype(dtype=np.float32, copy=False) 314 | sub_iou = bbox_overlaps(gt_box[None,:4], boxes[:, :4])[0] 315 | obj_iou = bbox_overlaps(gt_box[None,4:], boxes[:, 4:])[0] 316 | 317 | inds = (sub_iou >= iou_thresh) & (obj_iou >= iou_thresh) 318 | 319 | for i in np.where(keep_inds)[0][inds]: 320 
| pred_to_gt[i].append(int(gt_ind)) 321 | return pred_to_gt 322 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Graphical Contrastive Losses for Scene Graph Parsing 2 | 3 | ![alt text](https://github.com/NVIDIA/ContrastiveLosses4VRD/blob/master/Examples.PNG) 4 |

Example results from the OpenImages dataset.

5 | 6 | ![alt text](https://github.com/NVIDIA/ContrastiveLosses4VRD/blob/master/Loss_illustration.PNG) 7 | Example results of RelDN without and with our losses. "L0 only" means using only the original multi-class logistic loss (without our losses). The top row shows RelDN outputs and the bottom row visualizes the learned predicate CNN features of the two models. Red and green boxes highlight the wrong and right outputs (first row) or feature saliency (second row). 8 | 9 | This is a PyTorch implementation for [Graphical Contrastive Losses for Scene Graph Parsing, CVPR2019](https://arxiv.org/abs/1903.02728). It is an improved version of the code that won 1st place in the [Google AI Open Images Visual Relationship Detection Challenge](https://www.kaggle.com/c/google-ai-open-images-visual-relationship-track/leaderboard). 10 | 11 | ## News 12 | We have created a branch for a version supporting pytorch1.0! Just go to the [pytorch1_0](https://github.com/NVIDIA/ContrastiveLosses4VRD/tree/pytorch1_0) branch and check it out! 13 | 14 | ## Benchmarking on Visual Genome 15 | | Method | Backbone | SGDET@20 | SGDET@50 | SGDET@100 | 16 | | :--- | :----: | :----: | :----: | :----: | 17 | | Frequency \[1\] | VGG16 | 17.7 | 23.5 | 27.6 | 18 | | Frequency+Overlap \[1\] | VGG16 | 20.1 | 26.2 | 30.1 | 19 | | MotifNet \[1\] | VGG16 | 21.4 | 27.2 | 30.3 | 20 | | Graph-RCNN \[2\] | Res-101 | 19.4 | 25.0 | 28.5 | 21 | | RelDN, w/o contrastive losses | VGG16 | 20.8 | 28.1 | 32.5 | 22 | | RelDN, full | VGG16 | 21.1 | 28.3 | 32.7 | 23 | | RelDN, full | ResNext-101-FPN | 22.5 | 31.0 | 36.7 | 24 | 25 | \*"RelDN" is the relationship detection model we proposed in the paper. 26 | 27 | \*We use the frequency prior in our model by default. 28 | 29 | \*Results of "Graph-RCNN" are directly copied from [their repo](https://github.com/jwyang/graph-rcnn.pytorch). 30 | 31 | \[1\] [Zellers, Rowan, et al. "Neural motifs: Scene graph parsing with global context." Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition. 2018.](http://openaccess.thecvf.com/content_cvpr_2018/html/Zellers_Neural_Motifs_Scene_CVPR_2018_paper.html) 32 | 33 | \[2\] [Yang, Jianwei, et al. "Graph r-cnn for scene graph generation." Proceedings of the European Conference on Computer Vision (ECCV). 2018.](http://openaccess.thecvf.com/content_ECCV_2018/html/Jianwei_Yang_Graph_R-CNN_for_ECCV_2018_paper.html) 34 | 35 | ## Cloning 36 | ``` 37 | git clone https://github.com/NVIDIA/ContrastiveLosses4VRD.git --recurse-submodules 38 | 39 | ``` 40 | 41 | ## Requirements 42 | * Python 3 43 | * Python packages 44 | * pytorch 0.4.0 or 0.4.1.post2 (not guaranteed to work on newer versions) 45 | * cython 46 | * matplotlib 47 | * numpy 48 | * scipy 49 | * opencv 50 | * pyyaml 51 | * packaging 52 | * [pycocotools](https://github.com/cocodataset/cocoapi) 53 | * tensorboardX 54 | * tqdm 55 | * pillow 56 | * scikit-image 57 | * An NVIDIA GPU and CUDA 8.0 or higher. Some operations only have GPU implementations. 58 | 59 | An easy installation if you already have Anaconda Python 3 and CUDA 9.0: 60 | ``` 61 | conda install pytorch=0.4.1 62 | pip install cython 63 | pip install matplotlib numpy scipy pyyaml packaging pycocotools tensorboardX tqdm pillow scikit-image 64 | conda install opencv 65 | ``` 66 | 67 | * (Optional) A dockerfile with all necessary dependencies is included in docker/Dockerfile. 
Requires nvidia-docker. 68 | 69 | ``` 70 | # ROOT=path/to/cloned/repository 71 | cd $ROOT/docker 72 | # build the docker image and tag it 73 | docker build -t myname/mydockertag:1.0 . 74 | # launch an interactive session with this folder 75 | nvidia-docker run -v $ROOT:/workspace/visual-relationship-detection:rw -it myname/mydockertag:1.0 76 | # NOTE: you may need to mount other volumes depending on where your datasets are stored 77 | ``` 78 | 79 | ## Compilation 80 | Compile the CUDA code in the Detectron submodule and in the repo: 81 | ``` 82 | # ROOT=path/to/cloned/repository 83 | cd $ROOT/Detectron_pytorch/lib 84 | sh make.sh 85 | cd $ROOT/lib 86 | sh make.sh 87 | ``` 88 | 89 | ## Annotations 90 | 91 | Create a data folder at the top-level directory of the repository: 92 | ``` 93 | # ROOT=path/to/cloned/repository 94 | cd $ROOT 95 | mkdir data 96 | ``` 97 | If necessary, one may edit the `DATA_DIR` field in lib/core/config.py to change the expected path to the data directory. Be sure to update the paths in the VRD preprocessing scripts (mentioned below) if you do this. 98 | 99 | ### OpenImages/OpenImages_mini 100 | Download it [here](https://drive.google.com/open?id=1GeUEsiS9Z3eRYnH1GPUz99wjQwjcHl6n). Unzip it under the data folder. You should see an `openimages_v4` folder unzipped there. It contains .json annotation files for both OpenImages and OpenImages_mini; the latter is a subset of the former that we created, containing 4500 training and 1000 test images. The .json files are created based on the original .csv annotations. 101 | 102 | ### Visual Genome 103 | Download it [here](https://drive.google.com/open?id=1VDuba95vIPVhg5DiriPtwuVA6mleYGad). Unzip it under the data folder. You should see a `vg` folder unzipped there. It contains .json annotations that suit the dataloader used in this repo. 104 | 105 | ### Visual Relation Detection 106 | 107 | See [Images:VRD](#visual-relation-detection-1) 108 | 109 | ## Images 110 | 111 | ### OpenImages 112 | Create a folder `train/` for the training images: 113 | ``` 114 | # ROOT=path/to/cloned/repository 115 | cd $ROOT/data/openimages_v4 116 | mkdir train 117 | ``` 118 | Download OpenImages v4 training images from the [official page](https://storage.googleapis.com/openimages/web/download.html) (**Warning: this is a very large dataset**). **Note:** only training images are needed since our annotations will split them into a train and a validation set. Put all images in `train/`. 119 | 120 | ### Visual Genome 121 | Create a folder for all images: 122 | ``` 123 | # ROOT=path/to/cloned/repository 124 | cd $ROOT/data/vg 125 | mkdir VG_100K 126 | ``` 127 | Download Visual Genome images from the [official page](https://visualgenome.org/api/v0/api_home.html). Unzip all images (part 1 and part 2) into `VG_100K/`. There should be a total of 108,249 files. 128 | 129 | ### Visual Relation Detection 130 | Create the vrd folder under `data`: 131 | ``` 132 | # ROOT=path/to/cloned/repository 133 | mkdir -p $ROOT/data/vrd && cd $ROOT/data/vrd 134 | ``` 135 | Download the original annotation json files from [here](https://cs.stanford.edu/people/ranjaykrishna/vrd/) and unzip `json_dataset.zip` here. The images can be downloaded from [here](http://imagenet.stanford.edu/internal/jcjohns/scene_graphs/sg_dataset.zip). Unzip `sg_dataset.zip` to create an `sg_dataset` folder in `data/vrd`.
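Putting those steps together, a minimal sketch of the download/unzip sequence looks like the following (run from `$ROOT/data/vrd`; `json_dataset.zip` has to be fetched manually from the annotation page linked above):
```
# run from $ROOT/data/vrd
unzip json_dataset.zip    # original VRD relationship annotation .json files
wget http://imagenet.stanford.edu/internal/jcjohns/scene_graphs/sg_dataset.zip
unzip sg_dataset.zip      # creates data/vrd/sg_dataset/ with the images
```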
Next, run the preprocessing scripts: 136 | 137 | ``` 138 | cd $ROOT 139 | python tools/rename_vrd_with_numbers.py 140 | python tools/convert_vrd_anno_to_coco_format.py 141 | ``` 142 | `rename_vrd_with_numbers.py` converts all non-jpg images (some images are in png or gif) to jpg, and renames them in the {:012d}.jpg format (e.g., "000000000001.jpg"). It also creates new relationship annotation files, separate from the original ones, mostly to make things easier for the dataloader. The mapping from the original filenames is stored in `data/vrd/*_fname_mapping.json`, where "*" is either "train" or "val". 143 | 144 | `convert_vrd_anno_to_coco_format.py` creates object detection annotations from the new annotations generated above, which are required by the dataloader during training. 145 | 146 | ## Pre-trained Object Detection Models 147 | Download pre-trained object detection models [here](https://drive.google.com/open?id=1NrqOLbMa_RwHbG3KIXJFWLnlND2kiIpj). Unzip the file under the root directory. **Note:** We do not include code for training object detectors. Please refer to the "(Optional) Training Object Detection Models" section in [Large-Scale-VRD.pytorch](https://github.com/jz462/Large-Scale-VRD.pytorch) for this. 148 | 149 | ## Our Trained Relationship Detection Models 150 | Download our trained models [here](https://drive.google.com/open?id=15w0q3Nuye2ieu_aUNdTS_FNvoVzM4RMF). Unzip the file under the root folder and you should see a `trained_models` folder there. 151 | 152 | ## Directory Structure 153 | The final directories for data and detection models should look like: 154 | ``` 155 | |-- detection_models 156 | | |-- oi_rel 157 | | | |-- X-101-64x4d-FPN 158 | | | | |-- model_step599999.pth 159 | | |-- vg 160 | | | |-- VGG16 161 | | | | |-- model_step479999.pth 162 | | | |-- X-101-64x4d-FPN 163 | | | | |-- model_step119999.pth 164 | | |-- vrd 165 | | | |-- VGG16 166 | | | | |-- model_step4499.pth 167 | |-- data 168 | | |-- openimages_v4 169 | | | |-- train <-- (contains OpenImages_v4 training/validation images) 170 | | | |-- rel 171 | | | | |-- rel_only_annotations_train.json 172 | | | | |-- rel_only_annotations_val.json 173 | | | | |-- ... 174 | | |-- vg 175 | | | |-- VG_100K <-- (contains all Visual Genome images) 176 | | | |-- rel_annotations_train.json 177 | | | |-- rel_annotations_val.json 178 | | | |-- ... 179 | | |-- vrd 180 | | | |-- train_images <-- (contains Visual Relation Detection training images) 181 | | | |-- val_images <-- (contains Visual Relation Detection validation images) 182 | | | |-- new_annotations_train.json 183 | | | |-- new_annotations_val.json 184 | | | |-- ... 185 | |-- trained_models 186 | | |-- oi_mini_X-101-64x4d-FPN 187 | | | |-- model_step6749.pth 188 | | |-- oi_X-101-64x4d-FPN 189 | | | |-- model_step80929.pth 190 | | |-- vg_VGG16 191 | | | |-- model_step62722.pth 192 | | |-- vg_X-101-64x4d-FPN 193 | | | |-- model_step62722.pth 194 | | |-- vrd_VGG16_IN_pretrained 195 | | | |-- model_step7559.pth 196 | | |-- vrd_VGG16_COCO_pretrained 197 | | | |-- model_step7559.pth 198 | ``` 199 | 200 | ## Evaluating Pre-trained Relationship Detection Models 201 | 202 | DO NOT CHANGE anything in the provided config files (configs/xx/xxxx.yaml) even if you want to test with fewer or more than 8 GPUs. Use the environment variable `CUDA_VISIBLE_DEVICES` to control how many and which GPUs to use. Remove the 203 | `--multi-gpu-testing` flag for single-GPU inference.
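For example, to run the OpenImages_mini evaluation below on a single GPU, one would pin the process to one device and drop `--multi-gpu-testing`; this is only a sketch of the pattern, and the full commands are listed in the following sections:
```
# restrict the run to GPU 0 and omit --multi-gpu-testing
CUDA_VISIBLE_DEVICES=0 python ./tools/test_net_rel.py --dataset oi_rel_mini \
    --cfg configs/oi_rel_mini/e2e_faster_rcnn_X-101-64x4d-FPN_12_epochs_oi_rel_mini_default_node_contrastive_loss_w_so_p_aware_margin_point2_so_weight_point5.yaml \
    --load_ckpt trained_models/oi_mini_X-101-64x4d-FPN/model_step6749.pth \
    --output_dir Outputs/oi_mini_X-101-64x4d-FPN --do_val
```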
204 | 205 | ### OpenImages_mini 206 | To test a trained model using a ResNeXt-101-64x4d-FPN backbone, run 207 | ``` 208 | python ./tools/test_net_rel.py --dataset oi_rel_mini --cfg configs/oi_rel_mini/e2e_faster_rcnn_X-101-64x4d-FPN_12_epochs_oi_rel_mini_default_node_contrastive_loss_w_so_p_aware_margin_point2_so_weight_point5.yaml --load_ckpt trained_models/oi_mini_X-101-64x4d-FPN/model_step6749.pth --output_dir Outputs/oi_mini_X-101-64x4d-FPN --multi-gpu-testing --do_val 209 | ``` 210 | This should reproduce the numbers shown in the last line of Table 1 in the paper. 211 | 212 | ### OpenImages 213 | To test a trained model using a ResNeXt-101-64x4d-FPN backbone, run 214 | ``` 215 | python ./tools/test_net_rel.py --dataset oi_rel --cfg configs/oi_rel/e2e_faster_rcnn_X-101-64x4d-FPN_12_epochs_oi_rel_default_node_contrastive_loss_w_so_p_aware_margin_point2_so_weight_point5.yaml --load_ckpt trained_models/oi_X-101-64x4d-FPN/model_step80929.pth --output_dir Outputs/oi_X-101-64x4d-FPN --multi-gpu-testing --do_val 216 | ``` 217 | 218 | ### Visual Genome 219 | **NOTE:** Evaluating on the Visual Genome test set may require at least 64GB of RAM. 220 | 221 | We use three evaluation metrics for Visual Genome: 222 | 1. SGDET: predict subject, object and predicate labels as well as both boxes 223 | 1. SGCLS: predict subject, object and predicate labels given ground truth subject and object boxes 224 | 1. PRDCLS: predict predicate labels given ground truth subject and object boxes and labels 225 | 226 | To test a trained model using a VGG16 backbone with "SGDET", run 227 | ``` 228 | python ./tools/test_net_rel.py --dataset vg --cfg configs/vg/e2e_faster_rcnn_VGG16_8_epochs_vg_v3_default_node_contrastive_loss_w_so_p_aware_margin_point2_so_weight_point5_no_spt.yaml --load_ckpt trained_models/vg_VGG16/model_step62722.pth --output_dir Outputs/vg_VGG16 --multi-gpu-testing --do_val 229 | ``` 230 | Use the `--use_gt_boxes` option to test it with "SGCLS"; use the `--use_gt_boxes --use_gt_labels` options to test it with "PRDCLS". The results will vary slightly from those in the last line of Table 6 in the paper. 231 | 232 | To test a trained model using a ResNeXt-101-64x4d-FPN backbone with "SGDET", run 233 | ``` 234 | python ./tools/test_net_rel.py --dataset vg --cfg configs/vg/e2e_faster_rcnn_X-101-64x4d-FPN_8_epochs_vg_v3_default_node_contrastive_loss_w_so_p_aware_margin_point2_so_weight_point5.yaml --load_ckpt trained_models/vg_X-101-64x4d-FPN/model_step62722.pth --output_dir Outputs/vg_X-101-64x4d-FPN --multi-gpu-testing --do_val 235 | ``` 236 | Use the `--use_gt_boxes` option to test it with "SGCLS"; use the `--use_gt_boxes --use_gt_labels` options to test it with "PRDCLS". The results will vary slightly from those in the last line of Table 1 in the supplementary material. 237 | 238 | ### Visual Relation Detection 239 | To test a trained model initialized by an ImageNet pre-trained VGG16 model, run 240 | ``` 241 | python ./tools/test_net_rel.py --dataset vrd --cfg configs/vrd/e2e_faster_rcnn_VGG16_16_epochs_vrd_v3_default_node_contrastive_loss_w_so_p_aware_margin_point2_so_weight_point5_IN_pretrained.yaml --load_ckpt trained_models/vrd_VGG16_IN_pretrained/model_step7559.pth --output_dir Outputs/vrd_VGG16_IN_pretrained --multi-gpu-testing --do_val 242 | ``` 243 | The results differ slightly from those in the second-to-last line of Table 7.
244 | 245 | To test a trained model initialized by a COCO pre-trained VGG16 model, run 246 | ``` 247 | python ./tools/test_net_rel.py --dataset vrd --cfg configs/vrd/e2e_faster_rcnn_VGG16_16_epochs_vrd_v3_default_node_contrastive_loss_w_so_p_aware_margin_point2_so_weight_point5_COCO_pretrained.yaml --load_ckpt trained_models/vrd_VGG16_COCO_pretrained/model_step7559.pth --output_dir Outputs/vrd_VGG16_COCO_pretrained --multi-gpu-testing --do_val 248 | ``` 249 | The results differ slightly from those in the last line of Table 7. 250 | 251 | ## Training Relationship Detection Models 252 | 253 | This section provides the command-line arguments to train our relationship detection models given the pre-trained object detection models described above. **Note:** We do not train object detectors here. We only use trained object detectors (provided in `detection_models/`) to initialize our to-be-trained relationship models. 254 | 255 | DO NOT CHANGE anything in the provided config files (configs/xx/xxxx.yaml) even if you want to train with fewer or more than 8 GPUs. Use the environment variable `CUDA_VISIBLE_DEVICES` to control how many and which GPUs to use. 256 | 257 | With the following command lines, the training results (models and logs) should be in `$ROOT/Outputs/xxx/` where `xxx` is the .yaml file name used in the command without the ".yaml" extension. If you want to test with your trained models, simply run the test commands described above and set `--load_ckpt` to the path of your trained model. 258 | 259 | ### OpenImages_mini 260 | To train our relationship network using a ResNeXt-101-64x4d-FPN backbone, run 261 | ``` 262 | python tools/train_net_step_rel.py --dataset oi_rel_mini --cfg configs/oi_rel_mini/e2e_faster_rcnn_X-101-64x4d-FPN_12_epochs_oi_rel_mini_default_node_contrastive_loss_w_so_p_aware_margin_point2_so_weight_point5.yaml --nw 8 --use_tfboard 263 | ``` 264 | 265 | ### OpenImages 266 | To train our relationship network using a ResNeXt-101-64x4d-FPN backbone, run 267 | ``` 268 | python tools/train_net_step_rel.py --dataset oi_rel --cfg configs/oi_rel/e2e_faster_rcnn_X-101-64x4d-FPN_12_epochs_oi_rel_default_node_contrastive_loss_w_so_p_aware_margin_point2_so_weight_point5.yaml --nw 8 --use_tfboard 269 | ``` 270 | 271 | ### Visual Genome 272 | To train our relationship network using a VGG16 backbone, run 273 | ``` 274 | python tools/train_net_step_rel.py --dataset vg --cfg configs/vg/e2e_faster_rcnn_VGG16_8_epochs_vg_v3_default_node_contrastive_loss_w_so_p_aware_margin_point2_so_weight_point5_no_spt.yaml --nw 8 --use_tfboard 275 | ``` 276 | 277 | To train our relationship network using a ResNeXt-101-64x4d-FPN backbone, run 278 | ``` 279 | python tools/train_net_step_rel.py --dataset vg --cfg configs/vg/e2e_faster_rcnn_X-101-64x4d-FPN_8_epochs_vg_v3_default_node_contrastive_loss_w_so_p_aware_margin_point2_so_weight_point5.yaml --nw 8 --use_tfboard 280 | ``` 281 | 282 | ### Visual Relation Detection 283 | To train our relationship network initialized by an ImageNet pre-trained VGG16 model, run 284 | ``` 285 | python tools/train_net_step_rel.py --dataset vrd --cfg configs/vrd/e2e_faster_rcnn_VGG16_16_epochs_vrd_v3_default_node_contrastive_loss_w_so_p_aware_margin_point2_so_weight_point5_IN_pretrained.yaml --nw 8 --use_tfboard 286 | ``` 287 | 288 | To train our relationship network initialized by a COCO pre-trained VGG16 model, run 289 | ``` 290 | python tools/train_net_step_rel.py --dataset vrd --cfg
configs/vrd/e2e_faster_rcnn_VGG16_16_epochs_vrd_v3_default_node_contrastive_loss_w_so_p_aware_margin_point2_so_weight_point5_COCO_pretrained.yaml --nw 8 --use_tfboard 291 | ``` 292 | 293 | ## Acknowledgements 294 | This repository uses code based on the [Neural-Motifs](https://github.com/rowanz/neural-motifs) source code from Rowan Zellers, as well as 295 | code from the [Detectron.pytorch](https://github.com/roytseng-tw/Detectron.pytorch) repository by Roy Tseng. See LICENSES for additional details. 296 | 297 | ## Citing 298 | If you use this code in your research, please use the following BibTeX entry. 299 | ``` 300 | @conference{zhang2019vrd, 301 | title={Graphical Contrastive Losses for Scene Graph Parsing}, 302 | author={Zhang, Ji and Shih, Kevin J. and Elgammal, Ahmed and Tao, Andrew and Catanzaro, Bryan}, 303 | booktitle={CVPR}, 304 | year={2019} 305 | } 306 | -------------------------------------------------------------------------------- /lib/roi_data_rel/fast_rcnn_rel.py: -------------------------------------------------------------------------------- 1 | # Adapted by Ji Zhang, 2019 2 | # 3 | # Based on Detectron.pytorch/lib/roi_data/fast_rcnn.py 4 | # Original license text: 5 | # -------------------------------------------------------- 6 | # Copyright (c) 2017-present, Facebook, Inc. 7 | # 8 | # Licensed under the Apache License, Version 2.0 (the "License"); 9 | # you may not use this file except in compliance with the License. 10 | # You may obtain a copy of the License at 11 | # 12 | # http://www.apache.org/licenses/LICENSE-2.0 13 | # 14 | # Unless required by applicable law or agreed to in writing, software 15 | # distributed under the License is distributed on an "AS IS" BASIS, 16 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | # See the License for the specific language governing permissions and 18 | # limitations under the License. 19 | ############################################################################## 20 | 21 | """Construct minibatches for Fast R-CNN training. Handles the minibatch blobs 22 | that are specific to Fast R-CNN. Other blobs that are generic to RPN, etc. 23 | are handled by their respecitive roi_data modules. 24 | """ 25 | 26 | from __future__ import absolute_import 27 | from __future__ import division 28 | from __future__ import print_function 29 | from __future__ import unicode_literals 30 | 31 | import numpy as np 32 | import numpy.random as npr 33 | import logging 34 | 35 | from core.config import cfg 36 | import utils_rel.boxes_rel as box_utils_rel 37 | import utils.blob as blob_utils 38 | import utils.fpn as fpn_utils 39 | 40 | 41 | logger = logging.getLogger(__name__) 42 | 43 | 44 | def add_rel_blobs(blobs, im_scales, roidb): 45 | """Add blobs needed for training Fast R-CNN style models.""" 46 | # Sample training RoIs from each image and append them to the blob lists 47 | for im_i, entry in enumerate(roidb): 48 | frcn_blobs = _sample_pairs(entry, im_scales[im_i], im_i) 49 | for k, v in frcn_blobs.items(): 50 | blobs[k].append(v) 51 | # Concat the training blob lists into tensors 52 | for k, v in blobs.items(): 53 | if isinstance(v, list) and len(v) > 0: 54 | blobs[k] = np.concatenate(v) 55 | 56 | if cfg.FPN.FPN_ON and cfg.FPN.MULTILEVEL_ROIS: 57 | _add_rel_multilevel_rois(blobs) 58 | 59 | return True 60 | 61 | 62 | def _sample_pairs(roidb, im_scale, batch_idx): 63 | """Generate a random sample of RoIs comprising foreground and background 64 | examples. 
65 | """ 66 | fg_pairs_per_image = cfg.TRAIN.FG_REL_SIZE_PER_IM 67 | pairs_per_image = int(cfg.TRAIN.FG_REL_SIZE_PER_IM / cfg.TRAIN.FG_REL_FRACTION) # need much more pairs since it's quadratic 68 | max_pair_overlaps = roidb['max_pair_overlaps'] 69 | 70 | gt_pair_inds = np.where(max_pair_overlaps > 1.0 - 1e-4)[0] 71 | fg_pair_inds = np.where((max_pair_overlaps >= cfg.TRAIN.FG_THRESH) & 72 | (max_pair_overlaps <= 1.0 - 1e-4))[0] 73 | 74 | fg_pairs_per_this_image = np.minimum(fg_pairs_per_image, gt_pair_inds.size + fg_pair_inds.size) 75 | # Sample foreground regions without replacement 76 | if fg_pair_inds.size > 0: 77 | fg_pair_inds = npr.choice( 78 | fg_pair_inds, size=(fg_pairs_per_this_image - gt_pair_inds.size), replace=False) 79 | fg_pair_inds = np.append(fg_pair_inds, gt_pair_inds) 80 | 81 | # Label is the class each RoI has max overlap with 82 | fg_prd_labels = roidb['max_prd_classes'][fg_pair_inds] 83 | blob_dict = dict( 84 | fg_prd_labels_int32=fg_prd_labels.astype(np.int32, copy=False)) 85 | if cfg.MODEL.USE_BG: 86 | bg_pair_inds = np.where((max_pair_overlaps < cfg.TRAIN.BG_THRESH_HI))[0] 87 | 88 | # Compute number of background RoIs to take from this image (guarding 89 | # against there being fewer than desired) 90 | bg_pairs_per_this_image = pairs_per_image - fg_pairs_per_this_image 91 | bg_pairs_per_this_image = np.minimum(bg_pairs_per_this_image, bg_pair_inds.size) 92 | # Sample foreground regions without replacement 93 | if bg_pair_inds.size > 0: 94 | bg_pair_inds = npr.choice( 95 | bg_pair_inds, size=bg_pairs_per_this_image, replace=False) 96 | keep_pair_inds = np.append(fg_pair_inds, bg_pair_inds) 97 | all_prd_labels = np.zeros(keep_pair_inds.size, dtype=np.int32) 98 | all_prd_labels[:fg_pair_inds.size] = fg_prd_labels + 1 # class should start from 1 99 | else: 100 | keep_pair_inds = fg_pair_inds 101 | all_prd_labels = fg_prd_labels 102 | blob_dict['all_prd_labels_int32'] = all_prd_labels.astype(np.int32, copy=False) 103 | blob_dict['fg_size'] = np.array([fg_pair_inds.size], dtype=np.int32) # this is used to check if there is at least one fg to learn 104 | 105 | sampled_sbj_boxes = roidb['sbj_boxes'][keep_pair_inds] 106 | sampled_obj_boxes = roidb['obj_boxes'][keep_pair_inds] 107 | # Scale rois and format as (batch_idx, x1, y1, x2, y2) 108 | sampled_sbj_rois = sampled_sbj_boxes * im_scale 109 | sampled_obj_rois = sampled_obj_boxes * im_scale 110 | repeated_batch_idx = batch_idx * blob_utils.ones((keep_pair_inds.shape[0], 1)) 111 | sampled_sbj_rois = np.hstack((repeated_batch_idx, sampled_sbj_rois)) 112 | sampled_obj_rois = np.hstack((repeated_batch_idx, sampled_obj_rois)) 113 | blob_dict['sbj_rois'] = sampled_sbj_rois 114 | blob_dict['obj_rois'] = sampled_obj_rois 115 | sampled_rel_rois = box_utils_rel.rois_union(sampled_sbj_rois, sampled_obj_rois) 116 | blob_dict['rel_rois'] = sampled_rel_rois 117 | if cfg.MODEL.USE_SPATIAL_FEAT: 118 | sampled_spt_feat = box_utils_rel.get_spt_features( 119 | sampled_sbj_boxes, sampled_obj_boxes, roidb['width'], roidb['height']) 120 | blob_dict['spt_feat'] = sampled_spt_feat 121 | if cfg.MODEL.USE_FREQ_BIAS: 122 | sbj_labels = roidb['max_sbj_classes'][keep_pair_inds] 123 | obj_labels = roidb['max_obj_classes'][keep_pair_inds] 124 | blob_dict['all_sbj_labels_int32'] = sbj_labels.astype(np.int32, copy=False) 125 | blob_dict['all_obj_labels_int32'] = obj_labels.astype(np.int32, copy=False) 126 | if cfg.MODEL.USE_NODE_CONTRASTIVE_LOSS or cfg.MODEL.USE_NODE_CONTRASTIVE_SO_AWARE_LOSS or cfg.MODEL.USE_NODE_CONTRASTIVE_P_AWARE_LOSS: 127 | 
nodes_per_image = cfg.MODEL.NODE_SAMPLE_SIZE 128 | max_sbj_overlaps = roidb['max_sbj_overlaps'] 129 | max_obj_overlaps = roidb['max_obj_overlaps'] 130 | # sbj 131 | # Here a naturally existing assumption is, each positive sbj should have at least one positive obj 132 | sbj_pos_pair_pos_inds = np.where((max_pair_overlaps >= cfg.TRAIN.FG_THRESH))[0] 133 | sbj_pos_obj_pos_pair_neg_inds = np.where((max_sbj_overlaps >= cfg.TRAIN.FG_THRESH) & 134 | (max_obj_overlaps >= cfg.TRAIN.FG_THRESH) & 135 | (max_pair_overlaps < cfg.TRAIN.BG_THRESH_HI))[0] 136 | sbj_pos_obj_neg_pair_neg_inds = np.where((max_sbj_overlaps >= cfg.TRAIN.FG_THRESH) & 137 | (max_obj_overlaps < cfg.TRAIN.FG_THRESH) & 138 | (max_pair_overlaps < cfg.TRAIN.BG_THRESH_HI))[0] 139 | if sbj_pos_pair_pos_inds.size > 0: 140 | sbj_pos_pair_pos_inds = npr.choice( 141 | sbj_pos_pair_pos_inds, 142 | size=int(min(nodes_per_image, sbj_pos_pair_pos_inds.size)), 143 | replace=False) 144 | if sbj_pos_obj_pos_pair_neg_inds.size > 0: 145 | sbj_pos_obj_pos_pair_neg_inds = npr.choice( 146 | sbj_pos_obj_pos_pair_neg_inds, 147 | size=int(min(nodes_per_image, sbj_pos_obj_pos_pair_neg_inds.size)), 148 | replace=False) 149 | sbj_pos_pair_neg_inds = sbj_pos_obj_pos_pair_neg_inds 150 | if nodes_per_image - sbj_pos_obj_pos_pair_neg_inds.size > 0 and sbj_pos_obj_neg_pair_neg_inds.size > 0: 151 | sbj_pos_obj_neg_pair_neg_inds = npr.choice( 152 | sbj_pos_obj_neg_pair_neg_inds, 153 | size=int(min(nodes_per_image - sbj_pos_obj_pos_pair_neg_inds.size, sbj_pos_obj_neg_pair_neg_inds.size)), 154 | replace=False) 155 | sbj_pos_pair_neg_inds = np.append(sbj_pos_pair_neg_inds, sbj_pos_obj_neg_pair_neg_inds) 156 | sbj_pos_inds = np.append(sbj_pos_pair_pos_inds, sbj_pos_pair_neg_inds) 157 | binary_labels_sbj_pos = np.zeros(sbj_pos_inds.size, dtype=np.int32) 158 | binary_labels_sbj_pos[:sbj_pos_pair_pos_inds.size] = 1 159 | blob_dict['binary_labels_sbj_pos_int32'] = binary_labels_sbj_pos.astype(np.int32, copy=False) 160 | prd_pos_labels_sbj_pos = roidb['max_prd_classes'][sbj_pos_pair_pos_inds] 161 | prd_labels_sbj_pos = np.zeros(sbj_pos_inds.size, dtype=np.int32) 162 | prd_labels_sbj_pos[:sbj_pos_pair_pos_inds.size] = prd_pos_labels_sbj_pos + 1 163 | blob_dict['prd_labels_sbj_pos_int32'] = prd_labels_sbj_pos.astype(np.int32, copy=False) 164 | sbj_labels_sbj_pos = roidb['max_sbj_classes'][sbj_pos_inds] + 1 165 | # 1. set all obj labels > 0 166 | obj_labels_sbj_pos = roidb['max_obj_classes'][sbj_pos_inds] + 1 167 | # 2. 
find those negative obj 168 | max_obj_overlaps_sbj_pos = roidb['max_obj_overlaps'][sbj_pos_inds] 169 | obj_neg_inds_sbj_pos = np.where(max_obj_overlaps_sbj_pos < cfg.TRAIN.FG_THRESH)[0] 170 | obj_labels_sbj_pos[obj_neg_inds_sbj_pos] = 0 171 | blob_dict['sbj_labels_sbj_pos_int32'] = sbj_labels_sbj_pos.astype(np.int32, copy=False) 172 | blob_dict['obj_labels_sbj_pos_int32'] = obj_labels_sbj_pos.astype(np.int32, copy=False) 173 | # this is for freq bias in RelDN 174 | blob_dict['sbj_labels_sbj_pos_fg_int32'] = roidb['max_sbj_classes'][sbj_pos_inds].astype(np.int32, copy=False) 175 | blob_dict['obj_labels_sbj_pos_fg_int32'] = roidb['max_obj_classes'][sbj_pos_inds].astype(np.int32, copy=False) 176 | 177 | sampled_sbj_boxes_sbj_pos = roidb['sbj_boxes'][sbj_pos_inds] 178 | sampled_obj_boxes_sbj_pos = roidb['obj_boxes'][sbj_pos_inds] 179 | # Scale rois and format as (batch_idx, x1, y1, x2, y2) 180 | sampled_sbj_rois_sbj_pos = sampled_sbj_boxes_sbj_pos * im_scale 181 | sampled_obj_rois_sbj_pos = sampled_obj_boxes_sbj_pos * im_scale 182 | repeated_batch_idx = batch_idx * blob_utils.ones((sbj_pos_inds.shape[0], 1)) 183 | sampled_sbj_rois_sbj_pos = np.hstack((repeated_batch_idx, sampled_sbj_rois_sbj_pos)) 184 | sampled_obj_rois_sbj_pos = np.hstack((repeated_batch_idx, sampled_obj_rois_sbj_pos)) 185 | blob_dict['sbj_rois_sbj_pos'] = sampled_sbj_rois_sbj_pos 186 | blob_dict['obj_rois_sbj_pos'] = sampled_obj_rois_sbj_pos 187 | sampled_rel_rois_sbj_pos = box_utils_rel.rois_union(sampled_sbj_rois_sbj_pos, sampled_obj_rois_sbj_pos) 188 | blob_dict['rel_rois_sbj_pos'] = sampled_rel_rois_sbj_pos 189 | _, inds_unique_sbj_pos, inds_reverse_sbj_pos = np.unique( 190 | sampled_sbj_rois_sbj_pos, return_index=True, return_inverse=True, axis=0) 191 | assert inds_reverse_sbj_pos.shape[0] == sampled_sbj_rois_sbj_pos.shape[0] 192 | blob_dict['inds_unique_sbj_pos'] = inds_unique_sbj_pos 193 | blob_dict['inds_reverse_sbj_pos'] = inds_reverse_sbj_pos 194 | if cfg.MODEL.USE_SPATIAL_FEAT: 195 | sampled_spt_feat_sbj_pos = box_utils_rel.get_spt_features( 196 | sampled_sbj_boxes_sbj_pos, sampled_obj_boxes_sbj_pos, roidb['width'], roidb['height']) 197 | blob_dict['spt_feat_sbj_pos'] = sampled_spt_feat_sbj_pos 198 | # obj 199 | # Here a naturally existing assumption is, each positive obj should have at least one positive sbj 200 | obj_pos_pair_pos_inds = np.where((max_pair_overlaps >= cfg.TRAIN.FG_THRESH))[0] 201 | obj_pos_sbj_pos_pair_neg_inds = np.where((max_obj_overlaps >= cfg.TRAIN.FG_THRESH) & 202 | (max_sbj_overlaps >= cfg.TRAIN.FG_THRESH) & 203 | (max_pair_overlaps < cfg.TRAIN.BG_THRESH_HI))[0] 204 | obj_pos_sbj_neg_pair_neg_inds = np.where((max_obj_overlaps >= cfg.TRAIN.FG_THRESH) & 205 | (max_sbj_overlaps < cfg.TRAIN.FG_THRESH) & 206 | (max_pair_overlaps < cfg.TRAIN.BG_THRESH_HI))[0] 207 | if obj_pos_pair_pos_inds.size > 0: 208 | obj_pos_pair_pos_inds = npr.choice( 209 | obj_pos_pair_pos_inds, 210 | size=int(min(nodes_per_image, obj_pos_pair_pos_inds.size)), 211 | replace=False) 212 | if obj_pos_sbj_pos_pair_neg_inds.size > 0: 213 | obj_pos_sbj_pos_pair_neg_inds = npr.choice( 214 | obj_pos_sbj_pos_pair_neg_inds, 215 | size=int(min(nodes_per_image, obj_pos_sbj_pos_pair_neg_inds.size)), 216 | replace=False) 217 | obj_pos_pair_neg_inds = obj_pos_sbj_pos_pair_neg_inds 218 | if nodes_per_image - obj_pos_sbj_pos_pair_neg_inds.size > 0 and obj_pos_sbj_neg_pair_neg_inds.size: 219 | obj_pos_sbj_neg_pair_neg_inds = npr.choice( 220 | obj_pos_sbj_neg_pair_neg_inds, 221 | size=int(min(nodes_per_image - 
obj_pos_sbj_pos_pair_neg_inds.size, obj_pos_sbj_neg_pair_neg_inds.size)), 222 | replace=False) 223 | obj_pos_pair_neg_inds = np.append(obj_pos_pair_neg_inds, obj_pos_sbj_neg_pair_neg_inds) 224 | obj_pos_inds = np.append(obj_pos_pair_pos_inds, obj_pos_pair_neg_inds) 225 | binary_labels_obj_pos = np.zeros(obj_pos_inds.size, dtype=np.int32) 226 | binary_labels_obj_pos[:obj_pos_pair_pos_inds.size] = 1 227 | blob_dict['binary_labels_obj_pos_int32'] = binary_labels_obj_pos.astype(np.int32, copy=False) 228 | prd_pos_labels_obj_pos = roidb['max_prd_classes'][obj_pos_pair_pos_inds] 229 | prd_labels_obj_pos = np.zeros(obj_pos_inds.size, dtype=np.int32) 230 | prd_labels_obj_pos[:obj_pos_pair_pos_inds.size] = prd_pos_labels_obj_pos + 1 231 | blob_dict['prd_labels_obj_pos_int32'] = prd_labels_obj_pos.astype(np.int32, copy=False) 232 | obj_labels_obj_pos = roidb['max_obj_classes'][obj_pos_inds] + 1 233 | # 1. set all sbj labels > 0 234 | sbj_labels_obj_pos = roidb['max_sbj_classes'][obj_pos_inds] + 1 235 | # 2. find those negative sbj 236 | max_sbj_overlaps_obj_pos = roidb['max_sbj_overlaps'][obj_pos_inds] 237 | sbj_neg_inds_obj_pos = np.where(max_sbj_overlaps_obj_pos < cfg.TRAIN.FG_THRESH)[0] 238 | sbj_labels_obj_pos[sbj_neg_inds_obj_pos] = 0 239 | blob_dict['sbj_labels_obj_pos_int32'] = sbj_labels_obj_pos.astype(np.int32, copy=False) 240 | blob_dict['obj_labels_obj_pos_int32'] = obj_labels_obj_pos.astype(np.int32, copy=False) 241 | # this is for freq bias in RelDN 242 | blob_dict['sbj_labels_obj_pos_fg_int32'] = roidb['max_sbj_classes'][obj_pos_inds].astype(np.int32, copy=False) 243 | blob_dict['obj_labels_obj_pos_fg_int32'] = roidb['max_obj_classes'][obj_pos_inds].astype(np.int32, copy=False) 244 | 245 | sampled_sbj_boxes_obj_pos = roidb['sbj_boxes'][obj_pos_inds] 246 | sampled_obj_boxes_obj_pos = roidb['obj_boxes'][obj_pos_inds] 247 | # Scale rois and format as (batch_idx, x1, y1, x2, y2) 248 | sampled_sbj_rois_obj_pos = sampled_sbj_boxes_obj_pos * im_scale 249 | sampled_obj_rois_obj_pos = sampled_obj_boxes_obj_pos * im_scale 250 | repeated_batch_idx = batch_idx * blob_utils.ones((obj_pos_inds.shape[0], 1)) 251 | sampled_sbj_rois_obj_pos = np.hstack((repeated_batch_idx, sampled_sbj_rois_obj_pos)) 252 | sampled_obj_rois_obj_pos = np.hstack((repeated_batch_idx, sampled_obj_rois_obj_pos)) 253 | blob_dict['sbj_rois_obj_pos'] = sampled_sbj_rois_obj_pos 254 | blob_dict['obj_rois_obj_pos'] = sampled_obj_rois_obj_pos 255 | sampled_rel_rois_obj_pos = box_utils_rel.rois_union(sampled_sbj_rois_obj_pos, sampled_obj_rois_obj_pos) 256 | blob_dict['rel_rois_obj_pos'] = sampled_rel_rois_obj_pos 257 | _, inds_unique_obj_pos, inds_reverse_obj_pos = np.unique( 258 | sampled_obj_rois_obj_pos, return_index=True, return_inverse=True, axis=0) 259 | assert inds_reverse_obj_pos.shape[0] == sampled_obj_rois_obj_pos.shape[0] 260 | blob_dict['inds_unique_obj_pos'] = inds_unique_obj_pos 261 | blob_dict['inds_reverse_obj_pos'] = inds_reverse_obj_pos 262 | if cfg.MODEL.USE_SPATIAL_FEAT: 263 | sampled_spt_feat_obj_pos = box_utils_rel.get_spt_features( 264 | sampled_sbj_boxes_obj_pos, sampled_obj_boxes_obj_pos, roidb['width'], roidb['height']) 265 | blob_dict['spt_feat_obj_pos'] = sampled_spt_feat_obj_pos 266 | 267 | return blob_dict 268 | 269 | 270 | def _add_rel_multilevel_rois(blobs): 271 | """By default training RoIs are added for a single feature map level only. 272 | When using FPN, the RoIs must be distributed over different FPN levels 273 | according the level assignment heuristic (see: modeling.FPN. 
274 | map_rois_to_fpn_levels). 275 | """ 276 | lvl_min = cfg.FPN.ROI_MIN_LEVEL 277 | lvl_max = cfg.FPN.ROI_MAX_LEVEL 278 | 279 | def _distribute_rois_over_fpn_levels(rois_blob_names): 280 | """Distribute rois over the different FPN levels.""" 281 | # Get target level for each roi 282 | # Recall blob rois are in (batch_idx, x1, y1, x2, y2) format, hence take 283 | # the box coordinates from columns 1:5 284 | lowest_target_lvls = None 285 | for rois_blob_name in rois_blob_names: 286 | target_lvls = fpn_utils.map_rois_to_fpn_levels( 287 | blobs[rois_blob_name][:, 1:5], lvl_min, lvl_max) 288 | if lowest_target_lvls is None: 289 | lowest_target_lvls = target_lvls 290 | else: 291 | lowest_target_lvls = np.minimum(lowest_target_lvls, target_lvls) 292 | for rois_blob_name in rois_blob_names: 293 | # Add per FPN level roi blobs named like: <rois_blob_name>_fpn<lvl> 294 | fpn_utils.add_multilevel_roi_blobs( 295 | blobs, rois_blob_name, blobs[rois_blob_name], lowest_target_lvls, lvl_min, 296 | lvl_max) 297 | 298 | _distribute_rois_over_fpn_levels(['sbj_rois']) 299 | _distribute_rois_over_fpn_levels(['obj_rois']) 300 | _distribute_rois_over_fpn_levels(['rel_rois']) 301 | if cfg.MODEL.USE_NODE_CONTRASTIVE_LOSS or cfg.MODEL.USE_NODE_CONTRASTIVE_SO_AWARE_LOSS or cfg.MODEL.USE_NODE_CONTRASTIVE_P_AWARE_LOSS: 302 | _distribute_rois_over_fpn_levels(['sbj_rois_sbj_pos']) 303 | _distribute_rois_over_fpn_levels(['obj_rois_sbj_pos']) 304 | _distribute_rois_over_fpn_levels(['rel_rois_sbj_pos']) 305 | _distribute_rois_over_fpn_levels(['sbj_rois_obj_pos']) 306 | _distribute_rois_over_fpn_levels(['obj_rois_obj_pos']) 307 | _distribute_rois_over_fpn_levels(['rel_rois_obj_pos']) 308 | --------------------------------------------------------------------------------
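To tie the pieces of the README above together, here is a compact end-to-end sketch (clone, build the CUDA ops, then evaluate one of the released VRD checkpoints); every command is taken from the sections above and assumes the data and trained_models folders are laid out as shown in the Directory Structure section:
```
git clone https://github.com/NVIDIA/ContrastiveLosses4VRD.git --recurse-submodules
cd ContrastiveLosses4VRD
(cd Detectron_pytorch/lib && sh make.sh)
(cd lib && sh make.sh)
python ./tools/test_net_rel.py --dataset vrd --cfg configs/vrd/e2e_faster_rcnn_VGG16_16_epochs_vrd_v3_default_node_contrastive_loss_w_so_p_aware_margin_point2_so_weight_point5_IN_pretrained.yaml --load_ckpt trained_models/vrd_VGG16_IN_pretrained/model_step7559.pth --output_dir Outputs/vrd_VGG16_IN_pretrained --multi-gpu-testing --do_val
```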