├── .gitignore ├── .idea ├── Relation_Networks-pytorch.iml ├── inspectionProfiles │ └── profiles_settings.xml ├── misc.xml ├── modules.xml ├── vcs.xml └── workspace.xml ├── README.md ├── Weights └── README.md ├── config.py ├── data ├── __init__.py ├── dataset.py ├── util.py └── voc_dataset.py ├── demo.ipynb ├── demo ├── demo.jpg └── demo_output.png ├── evaluate.py ├── experiments.py ├── lib ├── array_tool.py ├── bbox_tools.py ├── creator_tool.py ├── eval_tool.py ├── nms │ ├── __init__.py │ ├── _nms_gpu_post.c │ ├── _nms_gpu_post.pyx │ ├── _nms_gpu_post_py.py │ ├── build.py │ └── non_maximum_suppression.py ├── relation_tool.py ├── roi_cupy.py └── vis_tool.py ├── losses.py ├── model.py ├── train.py ├── train_history.txt └── trainer.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | wheels/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | MANIFEST 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *.cover 46 | .hypothesis/ 47 | .pytest_cache/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | db.sqlite3 57 | 58 | # Flask stuff: 59 | instance/ 60 | .webassets-cache 61 | 62 | # Scrapy stuff: 63 | .scrapy 64 | 65 | # Sphinx documentation 66 | docs/_build/ 67 | 68 | # PyBuilder 69 | target/ 70 | 71 | # Jupyter Notebook 72 | .ipynb_checkpoints 73 | 74 | # pyenv 75 | .python-version 76 | 77 | # celery beat schedule file 78 | celerybeat-schedule 79 | 80 | # SageMath parsed files 81 | *.sage.py 82 | 83 | # Environments 84 | .env 85 | .venv 86 | env/ 87 | venv/ 88 | ENV/ 89 | env.bak/ 90 | venv.bak/ 91 | 92 | # Spyder project settings 93 | .spyderproject 94 | .spyproject 95 | 96 | # Rope project settings 97 | .ropeproject 98 | 99 | # mkdocs documentation 100 | /site 101 | 102 | # mypy 103 | .mypy_cache/ 104 | 105 | *.pth 106 | 107 | *.pt 108 | 109 | train_history\.txt 110 | -------------------------------------------------------------------------------- /.idea/Relation_Networks-pytorch.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 7 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- 
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | 
4 | 
5 | 
6 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Relation_Networks-pytorch
2 | Relation Networks for object detection, based on PyTorch
3 | 
4 | ### Progress
5 | 
6 | - [x] add relation module
7 | - [x] add NMS using the relation module
8 | - [x] end-to-end training of ResNet-based Faster R-CNN on VOC2007
9 | - [x] evaluate mAP
10 | - [ ] modify learn_nms
11 | - [ ] add OHEM
12 | - [ ] improve performance
13 | - [ ] train and forward on COCO
14 | 
15 | ## Requirements
16 | 1. Python 3.5
17 | 2. The following Python packages:
18 | ```
19 | Cython
20 | cupy
21 | numpy
22 | pytorch
23 | opencv-python
24 | collections
25 | ```
26 | ## Acknowledgement
27 | This work builds on many excellent works, which include:
28 | 
29 | - [Yusuke Niitani's ChainerCV](https://github.com/chainer/chainercv) (mainly)
30 | - [Ruotian Luo's pytorch-faster-rcnn](https://github.com/ruotianluo/pytorch-faster-rcnn), which is based on [Xinlei Chen's tf-faster-rcnn](https://github.com/endernewton/tf-faster-rcnn)
31 | - [simple-faster-rcnn-pytorch by chenyuntc](https://github.com/chenyuntc/simple-faster-rcnn-pytorch)
32 | - [faster-rcnn.pytorch by Jianwei Yang and Jiasen Lu](https://github.com/jwyang/faster-rcnn.pytorch). It mainly refers to [longcw's faster_rcnn_pytorch](https://github.com/longcw/faster_rcnn_pytorch)
33 | - All of the above repositories refer to [py-faster-rcnn by Ross Girshick and Sean Bell](https://github.com/rbgirshick/py-faster-rcnn) either directly or indirectly.
34 | - [yhenon's pytorch-retinanet](https://github.com/yhenon/pytorch-retinanet)
35 | - [msracver's Relation-Networks-for-Object-Detection](https://github.com/msracver/Relation-Networks-for-Object-Detection)
--------------------------------------------------------------------------------
/Weights/README.md:
--------------------------------------------------------------------------------
1 | This folder stores weights saved during training.
2 | 
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
1 | 
2 | class Config:
3 |     # data
4 |     voc_data_dir = '/media/heecheol/새 볼륨/DataSet/VOC2007/'
5 |     min_size = 600   # image resize
6 |     max_size = 1000  # image resize
7 |     num_workers = 8
8 | 
9 |     # sigma for l1_smooth_loss
10 |     rpn_sigma = 3.
11 |     roi_sigma = 1.
12 | 
13 |     # params for optimizer
14 |     # 0.0005 in the original paper but 0.0001 in tf-faster-rcnn
15 |     weight_decay = 0.0001
16 |     lr = 1e-4
17 | 
18 | 
19 |     # training
20 |     trainset = 'trainval'
21 |     testset = 'test'
22 |     epoch = 15
23 |     isLearnNMS = False
24 |     use_adam = True  # You need to set a very low lr for Adam
25 |     # The batch size can still only be one.
26 | batch_size=1 27 | 28 | model_name='squeeze' 29 | 30 | features_dim = 512 31 | 32 | 33 | 34 | opt = Config() 35 | -------------------------------------------------------------------------------- /data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/heefe92/Relation_Networks-pytorch/33e645ca38691f9e1988d28fcd5cf5b3c0fcc641/data/__init__.py -------------------------------------------------------------------------------- /data/dataset.py: -------------------------------------------------------------------------------- 1 | import torch as t 2 | from .voc_dataset import VOCBboxDataset 3 | from skimage import transform as sktsf 4 | from torchvision import transforms as tvtsf 5 | from . import util 6 | import numpy as np 7 | 8 | 9 | 10 | def preprocess(img, min_size=600, max_size=1000): 11 | """Preprocess an image for feature extraction. 12 | 13 | The length of the shorter edge is scaled to :obj:`self.min_size`. 14 | After the scaling, if the length of the longer edge is longer than 15 | :param min_size: 16 | :obj:`self.max_size`, the image is scaled to fit the longer edge 17 | to :obj:`self.max_size`. 18 | 19 | After resizing the image, the image is subtracted by a mean image value 20 | :obj:`self.mean`. 21 | 22 | Args: 23 | img (~numpy.ndarray): An image. This is in CHW and RGB format. 24 | The range of its value is :math:`[0, 255]`. 25 | 26 | Returns: 27 | ~numpy.ndarray: A preprocessed image. 28 | 29 | """ 30 | C, H, W = img.shape 31 | scale1 = min_size / min(H, W) 32 | scale2 = max_size / max(H, W) 33 | scale = min(scale1, scale2) 34 | img = img.astype(np.float32)/255.0 35 | img = sktsf.resize(img, (C, (H * scale)//32 * 32, (W * scale)//32 * 32), mode='reflect') 36 | # both the longer and shorter should be less than 37 | # max_size and min_size 38 | 39 | return img 40 | 41 | def normalize(img): 42 | """ 43 | https://github.com/pytorch/vision/issues/223 44 | return appr -1~1 RGB 45 | """ 46 | normalize = tvtsf.Normalize(mean=[0.485, 0.456, 0.406], 47 | std=[0.229, 0.224, 0.225]) 48 | img = normalize(t.from_numpy(img)) 49 | return img.numpy() 50 | def VGGTestPreprocess(img): 51 | """Preprocess an image for feature extraction. 52 | The length of the shorter edge is scaled to :obj:`self.min_size`. 53 | After the scaling, if the length of the longer edge is longer than 54 | :param min_size: 55 | :obj:`self.max_size`, the image is scaled to fit the longer edge 56 | to :obj:`self.max_size`. 57 | After resizing the image, the image is subtracted by a mean image value 58 | :obj:`self.mean`. 59 | Args: 60 | img (~numpy.ndarray): An image. This is in CHW and RGB format. 61 | The range of its value is :math:`[0, 255]`. 62 | Returns: 63 | ~numpy.ndarray: A preprocessed image. 64 | """ 65 | img = img / 255. 66 | # both the longer and shorter should be less than 67 | # max_size and min_size 68 | 69 | return normalize(img) 70 | def VGGpreprocess(img, min_size=600, max_size=1000): 71 | """Preprocess an image for feature extraction. 72 | The length of the shorter edge is scaled to :obj:`self.min_size`. 73 | After the scaling, if the length of the longer edge is longer than 74 | :param min_size: 75 | :obj:`self.max_size`, the image is scaled to fit the longer edge 76 | to :obj:`self.max_size`. 77 | After resizing the image, the image is subtracted by a mean image value 78 | :obj:`self.mean`. 79 | Args: 80 | img (~numpy.ndarray): An image. This is in CHW and RGB format. 81 | The range of its value is :math:`[0, 255]`. 
82 | Returns: 83 | ~numpy.ndarray: A preprocessed image. 84 | """ 85 | C, H, W = img.shape 86 | scale1 = min_size / min(H, W) 87 | scale2 = max_size / max(H, W) 88 | scale = min(scale1, scale2) 89 | img = img / 255. 90 | img = sktsf.resize(img, (C, H * scale, W * scale), mode='reflect',anti_aliasing=False) 91 | # both the longer and shorter should be less than 92 | # max_size and min_size 93 | 94 | return normalize(img) 95 | class VGGTransform(object): 96 | def __init__(self, min_size=600, max_size=1000): 97 | self.min_size = min_size 98 | self.max_size = max_size 99 | 100 | def __call__(self, in_data): 101 | img, bbox, label = in_data 102 | _, H, W = img.shape 103 | img = VGGpreprocess(img, self.min_size, self.max_size) 104 | _, o_H, o_W = img.shape 105 | scale = o_H / H 106 | bbox = util.resize_bbox(bbox, (H, W), (o_H, o_W)) 107 | 108 | # horizontally flip 109 | img, params = util.random_flip( 110 | img, x_random=True, return_param=True) 111 | bbox = util.flip_bbox( 112 | bbox, (o_H, o_W), x_flip=params['x_flip']) 113 | 114 | return img, bbox, label, scale 115 | 116 | class Transform(object): 117 | 118 | def __init__(self, min_size, max_size): 119 | self.min_size = min_size 120 | self.max_size = max_size 121 | 122 | def __call__(self, in_data): 123 | img, bbox, label = in_data 124 | _, H, W = img.shape 125 | img = preprocess(img, self.min_size, self.max_size) 126 | _, o_H, o_W = img.shape 127 | 128 | bbox = util.resize_bbox(bbox, (H, W), (o_H, o_W)) 129 | 130 | # horizontally flip 131 | img, params = util.random_flip( 132 | img, x_random=True, return_param=True) 133 | bbox = util.flip_bbox( 134 | bbox, (o_H, o_W), x_flip=params['x_flip']) 135 | 136 | return img, bbox, label, 1.0 137 | 138 | class VGGDataset: 139 | def __init__(self, opt): 140 | self.opt = opt 141 | self.db = VOCBboxDataset(opt.voc_data_dir, split=opt.trainset) 142 | self.tsf = VGGTransform(opt.min_size, opt.max_size) 143 | 144 | def __getitem__(self, idx): 145 | ori_img, bbox, label, difficult = self.db.get_example(idx) 146 | 147 | img, bbox, label, scale = self.tsf((ori_img, bbox, label)) 148 | # TODO: check whose stride is negative to fix this instead copy all 149 | # some of the strides of a given numpy array are negative. 150 | return img.copy(), bbox.copy(), label.copy(), scale 151 | 152 | def __len__(self): 153 | return len(self.db) 154 | class VGGTestDataset: 155 | def __init__(self, opt, use_difficult=True): 156 | self.opt = opt 157 | self.db = VOCBboxDataset(opt.voc_data_dir, split=opt.testset, use_difficult=use_difficult) 158 | def __getitem__(self, idx): 159 | ori_img, bbox, label, difficult = self.db.get_example(idx) 160 | img = VGGTestPreprocess(ori_img) 161 | return img.copy(), img.shape[1:], bbox.copy(), label.copy(), difficult.copy() 162 | 163 | def __len__(self): 164 | return len(self.db) 165 | 166 | class Dataset: 167 | def __init__(self, opt): 168 | self.VOCBboxDataset = VOCBboxDataset(opt.voc_data_dir,split='trainval') 169 | self.Transform = Transform(opt.min_size, opt.max_size) 170 | 171 | def __getitem__(self, idx): 172 | ori_img, bbox, label, difficult = self.VOCBboxDataset.get_example(idx) 173 | 174 | img, bbox, label, scale = self.Transform((ori_img, bbox, label)) 175 | # TODO: check whose stride is negative to fix this instead copy all 176 | # some of the strides of a given numpy array are negative. 
177 | return img.copy(), bbox.copy(), label.copy(), scale 178 | 179 | def __len__(self): 180 | return len(self.VOCBboxDataset) 181 | 182 | class TestDataset: 183 | def __init__(self, opt, split='test', use_difficult=True): 184 | self.opt = opt 185 | self.db = VOCBboxDataset(opt.voc_data_dir, split=split, use_difficult=use_difficult) 186 | self.Transform = Transform(opt.min_size, opt.min_size) 187 | def __getitem__(self, idx): 188 | ori_img, bbox, label, difficult = self.db.get_example(idx) 189 | img, bbox, label, scale = self.Transform((ori_img, bbox, label)) 190 | return img.copy(), img.shape[1:], bbox.copy(), label.copy(), difficult.copy() 191 | 192 | def __len__(self): 193 | return len(self.db) 194 | -------------------------------------------------------------------------------- /data/util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from PIL import Image 3 | import random 4 | 5 | 6 | def read_image(path, dtype=np.float32, color=True): 7 | """Read an image from a file. 8 | 9 | This function reads an image from given file. The image is CHW format and 10 | the range of its value is :math:`[0, 255]`. If :obj:`color = True`, the 11 | order of the channels is RGB. 12 | 13 | Args: 14 | path (str): A path of image file. 15 | dtype: The type of array. The default value is :obj:`~numpy.float32`. 16 | color (bool): This option determines the number of channels. 17 | If :obj:`True`, the number of channels is three. In this case, 18 | the order of the channels is RGB. This is the default behaviour. 19 | If :obj:`False`, this function returns a grayscale image. 20 | 21 | Returns: 22 | ~numpy.ndarray: An image. 23 | """ 24 | 25 | f = Image.open(path) 26 | try: 27 | if color: 28 | img = f.convert('RGB') 29 | else: 30 | img = f.convert('P') 31 | img = np.asarray(img, dtype=dtype) 32 | finally: 33 | if hasattr(f, 'close'): 34 | f.close() 35 | 36 | if img.ndim == 2: 37 | # reshape (H, W) -> (1, H, W) 38 | return img[np.newaxis] 39 | else: 40 | # transpose (H, W, C) -> (C, H, W) 41 | return img.transpose((2, 0, 1)) 42 | 43 | 44 | def resize_bbox(bbox, in_size, out_size): 45 | """Resize bounding boxes according to image resize. 46 | 47 | The bounding boxes are expected to be packed into a two dimensional 48 | tensor of shape :math:`(R, 4)`, where :math:`R` is the number of 49 | bounding boxes in the image. The second axis represents attributes of 50 | the bounding box. They are :math:`(y_{min}, x_{min}, y_{max}, x_{max})`, 51 | where the four attributes are coordinates of the top left and the 52 | bottom right vertices. 53 | 54 | Args: 55 | bbox (~numpy.ndarray): An array whose shape is :math:`(R, 4)`. 56 | :math:`R` is the number of bounding boxes. 57 | in_size (tuple): A tuple of length 2. The height and the width 58 | of the image before resized. 59 | out_size (tuple): A tuple of length 2. The height and the width 60 | of the image after resized. 61 | 62 | Returns: 63 | ~numpy.ndarray: 64 | Bounding boxes rescaled according to the given image shapes. 65 | 66 | """ 67 | bbox = bbox.copy() 68 | y_scale = float(out_size[0]) / in_size[0] 69 | x_scale = float(out_size[1]) / in_size[1] 70 | bbox[:, 0] = y_scale * bbox[:, 0] 71 | bbox[:, 2] = y_scale * bbox[:, 2] 72 | bbox[:, 1] = x_scale * bbox[:, 1] 73 | bbox[:, 3] = x_scale * bbox[:, 3] 74 | return bbox 75 | 76 | 77 | def flip_bbox(bbox, size, y_flip=False, x_flip=False): 78 | """Flip bounding boxes accordingly. 
79 | 80 | The bounding boxes are expected to be packed into a two dimensional 81 | tensor of shape :math:`(R, 4)`, where :math:`R` is the number of 82 | bounding boxes in the image. The second axis represents attributes of 83 | the bounding box. They are :math:`(y_{min}, x_{min}, y_{max}, x_{max})`, 84 | where the four attributes are coordinates of the top left and the 85 | bottom right vertices. 86 | 87 | Args: 88 | bbox (~numpy.ndarray): An array whose shape is :math:`(R, 4)`. 89 | :math:`R` is the number of bounding boxes. 90 | size (tuple): A tuple of length 2. The height and the width 91 | of the image before resized. 92 | y_flip (bool): Flip bounding box according to a vertical flip of 93 | an image. 94 | x_flip (bool): Flip bounding box according to a horizontal flip of 95 | an image. 96 | 97 | Returns: 98 | ~numpy.ndarray: 99 | Bounding boxes flipped according to the given flips. 100 | 101 | """ 102 | H, W = size 103 | bbox = bbox.copy() 104 | if y_flip: 105 | y_max = H - bbox[:, 0] 106 | y_min = H - bbox[:, 2] 107 | bbox[:, 0] = y_min 108 | bbox[:, 2] = y_max 109 | if x_flip: 110 | x_max = W - bbox[:, 1] 111 | x_min = W - bbox[:, 3] 112 | bbox[:, 1] = x_min 113 | bbox[:, 3] = x_max 114 | return bbox 115 | 116 | 117 | def crop_bbox( 118 | bbox, y_slice=None, x_slice=None, 119 | allow_outside_center=True, return_param=False): 120 | """Translate bounding boxes to fit within the cropped area of an image. 121 | 122 | This method is mainly used together with image cropping. 123 | This method translates the coordinates of bounding boxes like 124 | :func:`data.util.translate_bbox`. In addition, 125 | this function truncates the bounding boxes to fit within the cropped area. 126 | If a bounding box does not overlap with the cropped area, 127 | this bounding box will be removed. 128 | 129 | The bounding boxes are expected to be packed into a two dimensional 130 | tensor of shape :math:`(R, 4)`, where :math:`R` is the number of 131 | bounding boxes in the image. The second axis represents attributes of 132 | the bounding box. They are :math:`(y_{min}, x_{min}, y_{max}, x_{max})`, 133 | where the four attributes are coordinates of the top left and the 134 | bottom right vertices. 135 | 136 | Args: 137 | bbox (~numpy.ndarray): Bounding boxes to be transformed. The shape is 138 | :math:`(R, 4)`. :math:`R` is the number of bounding boxes. 139 | y_slice (slice): The slice of y axis. 140 | x_slice (slice): The slice of x axis. 141 | allow_outside_center (bool): If this argument is :obj:`False`, 142 | bounding boxes whose centers are outside of the cropped area 143 | are removed. The default value is :obj:`True`. 144 | return_param (bool): If :obj:`True`, this function returns 145 | indices of kept bounding boxes. 146 | 147 | Returns: 148 | ~numpy.ndarray or (~numpy.ndarray, dict): 149 | 150 | If :obj:`return_param = False`, returns an array :obj:`bbox`. 151 | 152 | If :obj:`return_param = True`, 153 | returns a tuple whose elements are :obj:`bbox, param`. 154 | :obj:`param` is a dictionary of intermediate parameters whose 155 | contents are listed below with key, value-type and the description 156 | of the value. 157 | 158 | * **index** (*numpy.ndarray*): An array holding indices of used \ 159 | bounding boxes. 
160 | 161 | """ 162 | 163 | t, b = _slice_to_bounds(y_slice) 164 | l, r = _slice_to_bounds(x_slice) 165 | crop_bb = np.array((t, l, b, r)) 166 | 167 | if allow_outside_center: 168 | mask = np.ones(bbox.shape[0], dtype=bool) 169 | else: 170 | center = (bbox[:, :2] + bbox[:, 2:]) / 2 171 | mask = np.logical_and(crop_bb[:2] <= center, center < crop_bb[2:]) \ 172 | .all(axis=1) 173 | 174 | bbox = bbox.copy() 175 | bbox[:, :2] = np.maximum(bbox[:, :2], crop_bb[:2]) 176 | bbox[:, 2:] = np.minimum(bbox[:, 2:], crop_bb[2:]) 177 | bbox[:, :2] -= crop_bb[:2] 178 | bbox[:, 2:] -= crop_bb[:2] 179 | 180 | mask = np.logical_and(mask, (bbox[:, :2] < bbox[:, 2:]).all(axis=1)) 181 | bbox = bbox[mask] 182 | 183 | if return_param: 184 | return bbox, {'index': np.flatnonzero(mask)} 185 | else: 186 | return bbox 187 | 188 | 189 | def _slice_to_bounds(slice_): 190 | if slice_ is None: 191 | return 0, np.inf 192 | 193 | if slice_.start is None: 194 | l = 0 195 | else: 196 | l = slice_.start 197 | 198 | if slice_.stop is None: 199 | u = np.inf 200 | else: 201 | u = slice_.stop 202 | 203 | return l, u 204 | 205 | 206 | def translate_bbox(bbox, y_offset=0, x_offset=0): 207 | """Translate bounding boxes. 208 | 209 | This method is mainly used together with image transforms, such as padding 210 | and cropping, which translates the left top point of the image from 211 | coordinate :math:`(0, 0)` to coordinate 212 | :math:`(y, x) = (y_{offset}, x_{offset})`. 213 | 214 | The bounding boxes are expected to be packed into a two dimensional 215 | tensor of shape :math:`(R, 4)`, where :math:`R` is the number of 216 | bounding boxes in the image. The second axis represents attributes of 217 | the bounding box. They are :math:`(y_{min}, x_{min}, y_{max}, x_{max})`, 218 | where the four attributes are coordinates of the top left and the 219 | bottom right vertices. 220 | 221 | Args: 222 | bbox (~numpy.ndarray): Bounding boxes to be transformed. The shape is 223 | :math:`(R, 4)`. :math:`R` is the number of bounding boxes. 224 | y_offset (int or float): The offset along y axis. 225 | x_offset (int or float): The offset along x axis. 226 | 227 | Returns: 228 | ~numpy.ndarray: 229 | Bounding boxes translated according to the given offsets. 230 | 231 | """ 232 | 233 | out_bbox = bbox.copy() 234 | out_bbox[:, :2] += (y_offset, x_offset) 235 | out_bbox[:, 2:] += (y_offset, x_offset) 236 | 237 | return out_bbox 238 | 239 | 240 | def random_flip(img, y_random=False, x_random=False, 241 | return_param=False, copy=False): 242 | """Randomly flip an image in vertical or horizontal direction. 243 | 244 | Args: 245 | img (~numpy.ndarray): An array that gets flipped. This is in 246 | CHW format. 247 | y_random (bool): Randomly flip in vertical direction. 248 | x_random (bool): Randomly flip in horizontal direction. 249 | return_param (bool): Returns information of flip. 250 | copy (bool): If False, a view of :obj:`img` will be returned. 251 | 252 | Returns: 253 | ~numpy.ndarray or (~numpy.ndarray, dict): 254 | 255 | If :obj:`return_param = False`, 256 | returns an array :obj:`out_img` that is the result of flipping. 257 | 258 | If :obj:`return_param = True`, 259 | returns a tuple whose elements are :obj:`out_img, param`. 260 | :obj:`param` is a dictionary of intermediate parameters whose 261 | contents are listed below with key, value-type and the description 262 | of the value. 263 | 264 | * **y_flip** (*bool*): Whether the image was flipped in the\ 265 | vertical direction or not. 
266 | * **x_flip** (*bool*): Whether the image was flipped in the\ 267 | horizontal direction or not. 268 | 269 | """ 270 | y_flip, x_flip = False, False 271 | if y_random: 272 | y_flip = random.choice([True, False]) 273 | if x_random: 274 | x_flip = random.choice([True, False]) 275 | 276 | if y_flip: 277 | img = img[:, ::-1, :] 278 | if x_flip: 279 | img = img[:, :, ::-1] 280 | 281 | if copy: 282 | img = img.copy() 283 | 284 | if return_param: 285 | return img, {'y_flip': y_flip, 'x_flip': x_flip} 286 | else: 287 | return img 288 | -------------------------------------------------------------------------------- /data/voc_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import xml.etree.ElementTree as ET 3 | 4 | import numpy as np 5 | import skimage.io 6 | import skimage.transform 7 | import skimage.color 8 | import skimage 9 | 10 | from .util import read_image 11 | 12 | 13 | class VOCBboxDataset: 14 | """Bounding box dataset for PASCAL `VOC`_. 15 | 16 | .. _`VOC`: http://host.robots.ox.ac.uk/pascal/VOC/voc2012/ 17 | 18 | The index corresponds to each image. 19 | 20 | When queried by an index, if :obj:`return_difficult == False`, 21 | this dataset returns a corresponding 22 | :obj:`img, bbox, label`, a tuple of an image, bounding boxes and labels. 23 | This is the default behaviour. 24 | If :obj:`return_difficult == True`, this dataset returns corresponding 25 | :obj:`img, bbox, label, difficult`. :obj:`difficult` is a boolean array 26 | that indicates whether bounding boxes are labeled as difficult or not. 27 | 28 | The bounding boxes are packed into a two dimensional tensor of shape 29 | :math:`(R, 4)`, where :math:`R` is the number of bounding boxes in 30 | the image. The second axis represents attributes of the bounding box. 31 | They are :math:`(y_{min}, x_{min}, y_{max}, x_{max})`, where the 32 | four attributes are coordinates of the top left and the bottom right 33 | vertices. 34 | 35 | The labels are packed into a one dimensional tensor of shape :math:`(R,)`. 36 | :math:`R` is the number of bounding boxes in the image. 37 | The class name of the label :math:`l` is :math:`l` th element of 38 | :obj:`VOC_BBOX_LABEL_NAMES`. 39 | 40 | The array :obj:`difficult` is a one dimensional boolean array of shape 41 | :math:`(R,)`. :math:`R` is the number of bounding boxes in the image. 42 | If :obj:`use_difficult` is :obj:`False`, this array is 43 | a boolean array with all :obj:`False`. 44 | 45 | The type of the image, the bounding boxes and the labels are as follows. 46 | 47 | * :obj:`img.dtype == numpy.float32` 48 | * :obj:`bbox.dtype == numpy.float32` 49 | * :obj:`label.dtype == numpy.int32` 50 | * :obj:`difficult.dtype == numpy.bool` 51 | 52 | Args: 53 | data_dir (string): Path to the root of the training data. 54 | i.e. "/data/image/voc/VOCdevkit/VOC2007/" 55 | split ({'train', 'val', 'trainval', 'test'}): Select a split of the 56 | dataset. :obj:`test` split is only available for 57 | 2007 dataset. 58 | year ({'2007', '2012'}): Use a dataset prepared for a challenge 59 | held in :obj:`year`. 60 | use_difficult (bool): If :obj:`True`, use images that are labeled as 61 | difficult in the original annotation. 62 | return_difficult (bool): If :obj:`True`, this dataset returns 63 | a boolean array 64 | that indicates whether bounding boxes are labeled as difficult 65 | or not. The default value is :obj:`False`. 
66 | 
67 |     """
68 | 
69 |     def __init__(self, data_dir, split='train',
70 |                  use_difficult=False, return_difficult=False,
71 |                  ):
72 | 
73 |         # if split not in ['train', 'trainval', 'val']:
74 |         #     if not (split == 'test' and year == '2007'):
75 |         #         warnings.warn(
76 |         #             'please pick split from \'train\', \'trainval\', \'val\''
77 |         #             'for 2012 dataset. For 2007 dataset, you can pick \'test\''
78 |         #             ' in addition to the above mentioned splits.'
79 |         #         )
80 |         id_list_file = os.path.join(
81 |             data_dir, 'ImageSets/Main/{0}.txt'.format(split))
82 | 
83 |         self.ids = [id_.strip() for id_ in open(id_list_file)]
84 |         self.data_dir = data_dir
85 |         self.use_difficult = use_difficult
86 |         self.return_difficult = return_difficult
87 |         self.label_names = VOC_BBOX_LABEL_NAMES
88 | 
89 |     def __len__(self):
90 |         return len(self.ids)
91 | 
92 |     def get_example(self, i):
93 |         """Returns the i-th example.
94 | 
95 |         Returns a color image and bounding boxes. The image is in CHW format.
96 |         The returned image is RGB.
97 | 
98 |         Args:
99 |             i (int): The index of the example.
100 | 
101 |         Returns:
102 |             tuple of an image and bounding boxes
103 | 
104 |         """
105 |         id_ = self.ids[i]
106 |         anno = ET.parse(
107 |             os.path.join(self.data_dir, 'Annotations', id_ + '.xml'))
108 |         bbox = list()
109 |         label = list()
110 |         difficult = list()
111 |         for obj in anno.findall('object'):
112 |             # When not using the difficult split and the object is
113 |             # labeled as difficult, skip it.
114 |             if not self.use_difficult and int(obj.find('difficult').text) == 1:
115 |                 continue
116 | 
117 |             difficult.append(int(obj.find('difficult').text))
118 |             bndbox_anno = obj.find('bndbox')
119 |             # subtract 1 to make pixel indexes 0-based
120 |             bbox.append([
121 |                 int(bndbox_anno.find(tag).text) - 1
122 |                 for tag in ('ymin', 'xmin', 'ymax', 'xmax')])
123 |             name = obj.find('name').text.lower().strip()
124 |             label.append(VOC_BBOX_LABEL_NAMES.index(name))
125 |         bbox = np.stack(bbox).astype(np.float32)
126 |         label = np.stack(label).astype(np.int32)
127 |         # When `use_difficult==False`, all elements in `difficult` are False.
128 | difficult = np.array(difficult, dtype=np.bool).astype(np.uint8) # PyTorch don't support np.bool 129 | 130 | # Load a image 131 | img_file = os.path.join(self.data_dir, 'JPEGImages', id_ + '.jpg') 132 | img = skimage.io.imread(img_file) 133 | if len(img.shape) == 2: 134 | img = skimage.color.gray2rgb(img) 135 | img = img.transpose((2, 0, 1)) 136 | 137 | #img = read_image(img_file, color=True) 138 | 139 | # if self.return_difficult: 140 | # return img, bbox, label, difficult 141 | return img, bbox, label, difficult 142 | 143 | __getitem__ = get_example 144 | 145 | 146 | VOC_BBOX_LABEL_NAMES = ( 147 | 'aeroplane',#0 148 | 'bicycle',#1 149 | 'bird',#2 150 | 'boat',#3 151 | 'bottle',#4 152 | 'bus',#5 153 | 'car',#6 154 | 'cat',#7 155 | 'chair',#8 156 | 'cow',#9 157 | 'diningtable',#10 158 | 'dog',#11 159 | 'horse',#12 160 | 'motorbike',#13 161 | 'person',#14 162 | 'pottedplant', 163 | 'sheep', 164 | 'sofa', 165 | 'train', 166 | 'tvmonitor') 167 | -------------------------------------------------------------------------------- /demo/demo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/heefe92/Relation_Networks-pytorch/33e645ca38691f9e1988d28fcd5cf5b3c0fcc641/demo/demo.jpg -------------------------------------------------------------------------------- /demo/demo_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/heefe92/Relation_Networks-pytorch/33e645ca38691f9e1988d28fcd5cf5b3c0fcc641/demo/demo_output.png -------------------------------------------------------------------------------- /evaluate.py: -------------------------------------------------------------------------------- 1 | from torch.utils import data as data_ 2 | import model 3 | 4 | import torch 5 | from lib.eval_tool import eval_detection_voc 6 | from data.dataset import TestDataset 7 | from config import opt 8 | import cv2,time 9 | import numpy as np 10 | from lib.array_tool import tonumpy 11 | 12 | # def eval(dataloader, model, test_num=10000): 13 | # pred_bboxes, pred_labels, pred_scores = list(), list(), list() 14 | # gt_bboxes, gt_labels, gt_difficults = list(), list(), list() 15 | # for ii, data in enumerate(dataloader): 16 | # (imgs, sizes, gt_bboxes_, gt_labels_, gt_difficults_) = data 17 | # 18 | # nms_scores, sorted_labels, sorted_cls_bboxes = model.predict( 19 | # imgs.cuda().float()) 20 | # if not ( nms_scores is None): 21 | # test = np.reshape(np.argwhere(nms_scores>0.7),-1) 22 | # nms_scores = nms_scores[test] 23 | # sorted_labels = sorted_labels[test] 24 | # sorted_cls_bboxes = sorted_cls_bboxes[test] 25 | # 26 | # pred_bboxes.append(np.reshape(tonumpy(sorted_cls_bboxes),(-1,4)).copy()) 27 | # pred_labels.append(np.reshape(tonumpy(sorted_labels),(-1)).copy()) 28 | # pred_scores.append(np.reshape(tonumpy(nms_scores),(-1)).copy()) 29 | # else: 30 | # pred_bboxes.append(np.array([])) 31 | # pred_labels.append(np.array([])) 32 | # pred_scores.append(np.array([])) 33 | # gt_bboxes += list(gt_bboxes_.numpy()) 34 | # gt_labels += list(gt_labels_.numpy()) 35 | # gt_difficults += list(gt_difficults_.numpy()) 36 | # if ii == test_num: break 37 | # result = eval_detection_voc( 38 | # pred_bboxes, pred_labels, pred_scores, 39 | # gt_bboxes, gt_labels, gt_difficults, 40 | # use_07_metric=True) 41 | # return result 42 | 43 | 44 | def eval(dataloader, model, test_num=10000): 45 | pred_bboxes, pred_labels, pred_scores = list(), list(), list() 46 | gt_bboxes, gt_labels, gt_difficults 
= list(), list(), list() 47 | for ii, data in enumerate(dataloader): 48 | (imgs, sizes, gt_bboxes_, gt_labels_, gt_difficults_) = data 49 | sizes = [sizes[0][0], sizes[1][0]] 50 | pred_bboxes_, pred_labels_, pred_scores_ = model.predict(imgs, [sizes]) 51 | gt_bboxes += list(gt_bboxes_.numpy()) 52 | gt_labels += list(gt_labels_.numpy()) 53 | gt_difficults += list(gt_difficults_.numpy()) 54 | pred_bboxes += pred_bboxes_ 55 | pred_labels += pred_labels_ 56 | pred_scores += pred_scores_ 57 | if ii == test_num: break 58 | 59 | result = eval_detection_voc( 60 | pred_bboxes, pred_labels, pred_scores, 61 | gt_bboxes, gt_labels, gt_difficults, 62 | use_07_metric=True) 63 | return result 64 | 65 | def run_evaluate(): 66 | testset = TestDataset(opt) 67 | test_dataloader = data_.DataLoader(testset, 68 | batch_size=opt.batch_size, 69 | num_workers=opt.num_workers, 70 | shuffle=False#, \ 71 | #pin_memory=True 72 | ) 73 | 74 | resnet = model.resnet101(20,True) 75 | resnet = torch.nn.DataParallel(resnet).cuda() 76 | 77 | resnet.load_state_dict(torch.load('Weights/resnet101_relation_47.pt')) 78 | resnet.module.use_preset(isTraining=False,preset='evaluate') 79 | resnet.eval() 80 | 81 | for child in resnet.module.children(): 82 | for param in child.parameters(): 83 | param.requires_grad = False 84 | 85 | print(eval(test_dataloader,resnet,10000)) 86 | 87 | if __name__ == "__main__": 88 | run_evaluate() -------------------------------------------------------------------------------- /experiments.py: -------------------------------------------------------------------------------- 1 | import collections 2 | 3 | import numpy as np 4 | from torch.utils import data as data_ 5 | import model 6 | 7 | from trainer import Trainer 8 | import torch 9 | import torch.optim as optim 10 | from data.dataset import VGGDataset, VGGTestDataset 11 | from config import opt 12 | import cv2,time 13 | 14 | def run_train(train_verbose=False): 15 | dataset = VGGDataset(opt) 16 | dataloader = data_.DataLoader(dataset, \ 17 | batch_size=opt.batch_size, \ 18 | shuffle=True, \ 19 | # pin_memory=True, 20 | num_workers=opt.num_workers) 21 | 22 | testset = VGGTestDataset(opt) 23 | test_dataloader = data_.DataLoader(testset, 24 | batch_size=opt.batch_size, 25 | num_workers=opt.num_workers, 26 | shuffle=False, 27 | pin_memory=True 28 | ) 29 | 30 | my_model = model.SqueezeFRCN(20).cuda() 31 | 32 | optimizer = my_model.get_optimizer() 33 | 34 | loss_hist = collections.deque(maxlen=500) 35 | epoch_loss_hist = [] 36 | my_trainer = Trainer(my_model,optimizer,model_name=opt.model_name) 37 | #my_trainer.model_load(3) 38 | 39 | freeze_num = -1 #pretrain model 40 | best_map = 0 41 | best_map_epoch_num = -1 42 | 43 | for epoch_num in range(opt.epoch): 44 | my_trainer.train_mode(freeze_num) 45 | train_start_time = time.time() 46 | train_epoch_loss = [] 47 | start = time.time() 48 | for iter_num, data in enumerate(dataloader): 49 | curr_loss = my_trainer.train_step(data) 50 | loss_hist.append(float(curr_loss)) 51 | train_epoch_loss.append(float(curr_loss)) 52 | 53 | if (train_verbose): 54 | print('Epoch: {} | Iteration: {} | loss: {:1.5f} | Running loss: {:1.5f} | Iter time: {:1.5f} | Train' 55 | ' time: {:1.5f}'.format(epoch_num, iter_num, float(curr_loss), np.mean(loss_hist), 56 | time.time()-start, time.time()-train_start_time)) 57 | start = time.time() 58 | 59 | del curr_loss 60 | print('train epoch time :', time.time() - train_start_time) 61 | print('Epoch: {} | epoch train loss: {:1.5f}'.format( 62 | epoch_num, np.mean(train_epoch_loss))) 63 | 64 | 
vali_start_time = time.time() 65 | 66 | 67 | vali_eval_result = my_trainer.run_eval(test_dataloader) 68 | print(vali_eval_result) 69 | vali_map = vali_eval_result['map'] 70 | print('vali epoch time :', time.time() - vali_start_time) 71 | 72 | 73 | if(best_map < vali_map): 74 | best_map = vali_map 75 | best_map_epoch_num = epoch_num 76 | my_trainer.model_save(epoch_num) 77 | if (epoch_num==9): 78 | my_trainer.model_load(best_map_epoch_num) 79 | my_trainer.reduce_lr(factor=0.1, verbose=True) 80 | 81 | print('best epoch num', best_map_epoch_num) 82 | print('----------------------------------------') 83 | 84 | print(epoch_loss_hist) 85 | 86 | 87 | if __name__ == "__main__": 88 | run_train(train_verbose = True) 89 | #my_model = model.SqueezeFRCN(20) -------------------------------------------------------------------------------- /lib/array_tool.py: -------------------------------------------------------------------------------- 1 | """ 2 | tools to convert specified type 3 | """ 4 | import torch as t 5 | import numpy as np 6 | 7 | def tonumpy(data): 8 | if isinstance(data, np.ndarray): 9 | return data 10 | if isinstance(data, t._C._TensorBase): 11 | return data.cpu().numpy() 12 | if isinstance(data, t.autograd.Variable): 13 | return tonumpy(data.data) 14 | 15 | 16 | def totensor(data, cuda=True): 17 | if isinstance(data, np.ndarray): 18 | tensor = t.from_numpy(data) 19 | if isinstance(data, t._C._TensorBase): 20 | tensor = data 21 | if isinstance(data, t.autograd.Variable): 22 | tensor = data.data 23 | if cuda: 24 | tensor = tensor.cuda() 25 | return tensor 26 | 27 | 28 | def tovariable(data): 29 | if isinstance(data, np.ndarray): 30 | return tovariable(totensor(data)) 31 | if isinstance(data, t._C._TensorBase): 32 | return t.autograd.Variable(data) 33 | if isinstance(data, t.autograd.Variable): 34 | return data 35 | else: 36 | raise ValueError("UnKnow data type: %s, input should be {np.ndarray,Tensor,Variable}" %type(data)) 37 | 38 | 39 | def scalar(data): 40 | if isinstance(data, np.ndarray): 41 | return data.reshape(1)[0] 42 | if isinstance(data, t._C._TensorBase): 43 | return data.view(1)[0] 44 | if isinstance(data, t.autograd.Variable): 45 | return data.data.view(1)[0] 46 | -------------------------------------------------------------------------------- /lib/bbox_tools.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import six 4 | 5 | 6 | def loc2bbox(src_bbox, loc): 7 | """Decode bounding boxes from bounding box offsets and scales. 8 | 9 | Given bounding box offsets and scales computed by 10 | :meth:`bbox2loc`, this function decodes the representation to 11 | coordinates in 2D image coordinates. 12 | 13 | Given scales and offsets :math:`t_y, t_x, t_h, t_w` and a bounding 14 | box whose center is :math:`(y, x) = p_y, p_x` and size :math:`p_h, p_w`, 15 | the decoded bounding box's center :math:`\\hat{g}_y`, :math:`\\hat{g}_x` 16 | and size :math:`\\hat{g}_h`, :math:`\\hat{g}_w` are calculated 17 | by the following formulas. 18 | 19 | * :math:`\\hat{g}_y = p_h t_y + p_y` 20 | * :math:`\\hat{g}_x = p_w t_x + p_x` 21 | * :math:`\\hat{g}_h = p_h \\exp(t_h)` 22 | * :math:`\\hat{g}_w = p_w \\exp(t_w)` 23 | 24 | The decoding formulas are used in works such as R-CNN [#]_. 25 | 26 | The output is same type as the type of the inputs. 27 | 28 | .. [#] Ross Girshick, Jeff Donahue, Trevor Darrell, Jitendra Malik. \ 29 | Rich feature hierarchies for accurate object detection and semantic \ 30 | segmentation. CVPR 2014. 
31 | 32 | Args: 33 | src_bbox (array): A coordinates of bounding boxes. 34 | Its shape is :math:`(R, 4)`. These coordinates are 35 | :math:`p_{ymin}, p_{xmin}, p_{ymax}, p_{xmax}`. 36 | loc (array): An array with offsets and scales. 37 | The shapes of :obj:`src_bbox` and :obj:`loc` should be same. 38 | This contains values :math:`t_y, t_x, t_h, t_w`. 39 | 40 | Returns: 41 | array: 42 | Decoded bounding box coordinates. Its shape is :math:`(R, 4)`. \ 43 | The second axis contains four values \ 44 | :math:`\\hat{g}_{ymin}, \\hat{g}_{xmin}, 45 | \\hat{g}_{ymax}, \\hat{g}_{xmax}`. 46 | 47 | """ 48 | 49 | if src_bbox.shape[0] == 0: 50 | return np.zeros((0, 4), dtype=loc.dtype) 51 | 52 | src_bbox = src_bbox.astype(src_bbox.dtype, copy=False) 53 | 54 | src_height = src_bbox[:, 2] - src_bbox[:, 0] 55 | src_width = src_bbox[:, 3] - src_bbox[:, 1] 56 | src_ctr_y = src_bbox[:, 0] + 0.5 * src_height 57 | src_ctr_x = src_bbox[:, 1] + 0.5 * src_width 58 | dy = loc[:, 0::4] 59 | dx = loc[:, 1::4] 60 | dh = loc[:, 2::4] 61 | dw = loc[:, 3::4] 62 | ctr_y = dy * src_height[:, np.newaxis] + src_ctr_y[:, np.newaxis] 63 | ctr_x = dx * src_width[:, np.newaxis] + src_ctr_x[:, np.newaxis] 64 | h = np.exp(dh) * src_height[:, np.newaxis] 65 | w = np.exp(dw) * src_width[:, np.newaxis] 66 | 67 | dst_bbox = np.zeros(loc.shape, dtype=loc.dtype) 68 | dst_bbox[:, 0::4] = ctr_y - 0.5 * h 69 | dst_bbox[:, 1::4] = ctr_x - 0.5 * w 70 | dst_bbox[:, 2::4] = ctr_y + 0.5 * h 71 | dst_bbox[:, 3::4] = ctr_x + 0.5 * w 72 | 73 | return dst_bbox 74 | 75 | 76 | def bbox2loc(src_bbox, dst_bbox): 77 | """Encodes the source and the destination bounding boxes to "loc". 78 | 79 | Given bounding boxes, this function computes offsets and scales 80 | to match the source bounding boxes to the target bounding boxes. 81 | Mathematcially, given a bounding box whose center is 82 | :math:`(y, x) = p_y, p_x` and 83 | size :math:`p_h, p_w` and the target bounding box whose center is 84 | :math:`g_y, g_x` and size :math:`g_h, g_w`, the offsets and scales 85 | :math:`t_y, t_x, t_h, t_w` can be computed by the following formulas. 86 | 87 | * :math:`t_y = \\frac{(g_y - p_y)} {p_h}` 88 | * :math:`t_x = \\frac{(g_x - p_x)} {p_w}` 89 | * :math:`t_h = \\log(\\frac{g_h} {p_h})` 90 | * :math:`t_w = \\log(\\frac{g_w} {p_w})` 91 | 92 | The output is same type as the type of the inputs. 93 | The encoding formulas are used in works such as R-CNN [#]_. 94 | 95 | .. [#] Ross Girshick, Jeff Donahue, Trevor Darrell, Jitendra Malik. \ 96 | Rich feature hierarchies for accurate object detection and semantic \ 97 | segmentation. CVPR 2014. 98 | 99 | Args: 100 | src_bbox (array): An image coordinate array whose shape is 101 | :math:`(R, 4)`. :math:`R` is the number of bounding boxes. 102 | These coordinates are 103 | :math:`p_{ymin}, p_{xmin}, p_{ymax}, p_{xmax}`. 104 | dst_bbox (array): An image coordinate array whose shape is 105 | :math:`(R, 4)`. 106 | These coordinates are 107 | :math:`g_{ymin}, g_{xmin}, g_{ymax}, g_{xmax}`. 108 | 109 | Returns: 110 | array: 111 | Bounding box offsets and scales from :obj:`src_bbox` \ 112 | to :obj:`dst_bbox`. \ 113 | This has shape :math:`(R, 4)`. 114 | The second axis contains four values :math:`t_y, t_x, t_h, t_w`. 
115 | 116 | """ 117 | 118 | height = src_bbox[:, 2] - src_bbox[:, 0] 119 | width = src_bbox[:, 3] - src_bbox[:, 1] 120 | ctr_y = src_bbox[:, 0] + 0.5 * height 121 | ctr_x = src_bbox[:, 1] + 0.5 * width 122 | 123 | base_height = dst_bbox[:, 2] - dst_bbox[:, 0] 124 | base_width = dst_bbox[:, 3] - dst_bbox[:, 1] 125 | base_ctr_y = dst_bbox[:, 0] + 0.5 * base_height 126 | base_ctr_x = dst_bbox[:, 1] + 0.5 * base_width 127 | 128 | eps = np.finfo(height.dtype).eps 129 | height = np.maximum(height, eps) 130 | width = np.maximum(width, eps) 131 | 132 | dy = (base_ctr_y - ctr_y) / height 133 | dx = (base_ctr_x - ctr_x) / width 134 | dh = np.log(base_height / height) 135 | dw = np.log(base_width / width) 136 | 137 | loc = np.vstack((dy, dx, dh, dw)).transpose() 138 | return loc 139 | 140 | 141 | def bbox_iou(bbox_a, bbox_b): 142 | """Calculate the Intersection of Unions (IoUs) between bounding boxes. 143 | 144 | IoU is calculated as a ratio of area of the intersection 145 | and area of the union. 146 | 147 | This function accepts both :obj:`numpy.ndarray` and :obj:`cupy.ndarray` as 148 | inputs. Please note that both :obj:`bbox_a` and :obj:`bbox_b` need to be 149 | same type. 150 | The output is same type as the type of the inputs. 151 | 152 | Args: 153 | bbox_a (array): An array whose shape is :math:`(N, 4)`. 154 | :math:`N` is the number of bounding boxes. 155 | The dtype should be :obj:`numpy.float32`. 156 | bbox_b (array): An array similar to :obj:`bbox_a`, 157 | whose shape is :math:`(K, 4)`. 158 | The dtype should be :obj:`numpy.float32`. 159 | 160 | Returns: 161 | array: 162 | An array whose shape is :math:`(N, K)`. \ 163 | An element at index :math:`(n, k)` contains IoUs between \ 164 | :math:`n` th bounding box in :obj:`bbox_a` and :math:`k` th bounding \ 165 | box in :obj:`bbox_b`. 166 | 167 | """ 168 | if bbox_a.shape[1] != 4 or bbox_b.shape[1] != 4: 169 | raise IndexError 170 | 171 | # top left 172 | tl = np.maximum(bbox_a[:, None, :2], bbox_b[:, :2]) 173 | # bottom right 174 | br = np.minimum(bbox_a[:, None, 2:], bbox_b[:, 2:]) 175 | 176 | area_i = np.prod(br - tl, axis=2) * (tl < br).all(axis=2) 177 | area_a = np.prod(bbox_a[:, 2:] - bbox_a[:, :2], axis=1) 178 | area_b = np.prod(bbox_b[:, 2:] - bbox_b[:, :2], axis=1) 179 | return area_i / (area_a[:, None] + area_b - area_i) 180 | 181 | 182 | def __test(): 183 | pass 184 | 185 | 186 | if __name__ == '__main__': 187 | __test() 188 | 189 | 190 | def generate_anchor_base(base_size=16, ratios=[0.5, 1, 2], 191 | anchor_scales=[8, 16, 32]): 192 | """Generate anchor base windows by enumerating aspect ratio and scales. 193 | 194 | Generate anchors that are scaled and modified to the given aspect ratios. 195 | Area of a scaled anchor is preserved when modifying to the given aspect 196 | ratio. 197 | 198 | :obj:`R = len(ratios) * len(anchor_scales)` anchors are generated by this 199 | function. 200 | The :obj:`i * len(anchor_scales) + j` th anchor corresponds to an anchor 201 | generated by :obj:`ratios[i]` and :obj:`anchor_scales[j]`. 202 | 203 | For example, if the scale is :math:`8` and the ratio is :math:`0.25`, 204 | the width and the height of the base window will be stretched by :math:`8`. 205 | For modifying the anchor to the given aspect ratio, 206 | the height is halved and the width is doubled. 207 | 208 | Args: 209 | base_size (number): The width and the height of the reference window. 210 | ratios (list of floats): This is ratios of width to height of 211 | the anchors. 212 | anchor_scales (list of numbers): This is areas of anchors. 
213 |             Those areas will be the product of the square of an element in
214 |             :obj:`anchor_scales` and the original area of the reference
215 |             window.
216 | 
217 |     Returns:
218 |         ~numpy.ndarray:
219 |         An array of shape :math:`(R, 4)`.
220 |         Each element is a set of coordinates of a bounding box.
221 |         The second axis corresponds to
222 |         :math:`(y_{min}, x_{min}, y_{max}, x_{max})` of a bounding box.
223 | 
224 |     """
225 |     py = base_size / 2.
226 |     px = base_size / 2.
227 | 
228 |     anchor_base = np.zeros((len(ratios) * len(anchor_scales), 4),
229 |                            dtype=np.float32)
230 |     for i in six.moves.range(len(ratios)):
231 |         for j in six.moves.range(len(anchor_scales)):
232 |             h = base_size * anchor_scales[j] * np.sqrt(ratios[i])
233 |             w = base_size * anchor_scales[j] * np.sqrt(1. / ratios[i])
234 | 
235 |             index = i * len(anchor_scales) + j
236 |             anchor_base[index, 0] = py - h / 2.
237 |             anchor_base[index, 1] = px - w / 2.
238 |             anchor_base[index, 2] = py + h / 2.
239 |             anchor_base[index, 3] = px + w / 2.
240 |     return anchor_base
241 | 
--------------------------------------------------------------------------------
/lib/creator_tool.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import cupy as cp
3 | 
4 | from lib.bbox_tools import bbox2loc, bbox_iou, loc2bbox
5 | from lib.nms import non_maximum_suppression
6 | 
7 | 
8 | class ProposalTargetCreator(object):
9 |     """Assign ground truth bounding boxes to given RoIs.
10 | 
11 |     The :meth:`__call__` of this class generates training targets
12 |     for each object proposal.
13 |     This is used to train Faster RCNN [#]_.
14 | 
15 |     .. [#] Shaoqing Ren, Kaiming He, Ross Girshick, Jian Sun. \
16 |        Faster R-CNN: Towards Real-Time Object Detection with \
17 |        Region Proposal Networks. NIPS 2015.
18 | 
19 |     Args:
20 |         n_sample (int): The number of sampled regions.
21 |         pos_ratio (float): Fraction of regions that is labeled as a
22 |             foreground.
23 |         pos_iou_thresh (float): IoU threshold for a RoI to be considered as a
24 |             foreground.
25 |         neg_iou_thresh_hi (float): RoI is considered to be the background
26 |             if IoU is in
27 |             [:obj:`neg_iou_thresh_lo`, :obj:`neg_iou_thresh_hi`).
28 |         neg_iou_thresh_lo (float): See above.
29 | 
30 |     """
31 | 
32 |     def __init__(self,
33 |                  n_sample=128,
34 |                  pos_ratio=0.25, pos_iou_thresh=0.5,
35 |                  neg_iou_thresh_hi=0.5, neg_iou_thresh_lo=0.0
36 |                  ):
37 |         self.n_sample = n_sample
38 |         self.pos_ratio = pos_ratio
39 |         self.pos_iou_thresh = pos_iou_thresh
40 |         self.neg_iou_thresh_hi = neg_iou_thresh_hi
41 |         self.neg_iou_thresh_lo = neg_iou_thresh_lo  # NOTE: the default value in py-faster-rcnn is 0.1
42 | 
43 |     def __call__(self, roi, bbox, label,
44 |                  loc_normalize_mean=(0., 0., 0., 0.),
45 |                  loc_normalize_std=(0.1, 0.1, 0.2, 0.2)):
46 |         """Assigns ground truth to sampled proposals.
47 | 
48 |         This function samples a total of :obj:`self.n_sample` RoIs
49 |         from the combination of :obj:`roi` and :obj:`bbox`.
50 |         The RoIs are assigned with the ground truth class labels as well as
51 |         bounding box offsets and scales to match the ground truth bounding
52 |         boxes. As many as :obj:`pos_ratio * self.n_sample` RoIs are
53 |         sampled as foregrounds.
54 | 
55 |         Offsets and scales of bounding boxes are calculated using
56 |         :func:`model.utils.bbox_tools.bbox2loc`.
57 |         Also, the types of the input arrays and the output arrays are the same.
58 | 
59 |         Here are notations.
60 | 
61 |         * :math:`S` is the total number of sampled RoIs, which equals \
62 |             :obj:`self.n_sample`.
63 | * :math:`L` is number of object classes possibly including the \ 64 | background. 65 | 66 | Args: 67 | roi (array): Region of Interests (RoIs) from which we sample. 68 | Its shape is :math:`(R, 4)` 69 | bbox (array): The coordinates of ground truth bounding boxes. 70 | Its shape is :math:`(R', 4)`. 71 | label (array): Ground truth bounding box labels. Its shape 72 | is :math:`(R',)`. Its range is :math:`[0, L - 1]`, where 73 | :math:`L` is the number of foreground classes. 74 | loc_normalize_mean (tuple of four floats): Mean values to normalize 75 | coordinates of bouding boxes. 76 | loc_normalize_std (tupler of four floats): Standard deviation of 77 | the coordinates of bounding boxes. 78 | 79 | Returns: 80 | (array, array, array): 81 | 82 | * **sample_roi**: Regions of interests that are sampled. \ 83 | Its shape is :math:`(S, 4)`. 84 | * **gt_roi_loc**: Offsets and scales to match \ 85 | the sampled RoIs to the ground truth bounding boxes. \ 86 | Its shape is :math:`(S, 4)`. 87 | * **gt_roi_label**: Labels assigned to sampled RoIs. Its shape is \ 88 | :math:`(S,)`. Its range is :math:`[0, L]`. The label with \ 89 | value 0 is the background. 90 | 91 | """ 92 | n_bbox, _ = bbox.shape 93 | 94 | roi = np.concatenate((roi, bbox), axis=0) 95 | 96 | pos_roi_per_image = np.round(self.n_sample * self.pos_ratio) 97 | iou = bbox_iou(roi, bbox) 98 | gt_assignment = iou.argmax(axis=1) 99 | max_iou = iou.max(axis=1) 100 | # Offset range of classes from [0, n_fg_class - 1] to [1, n_fg_class]. 101 | # The label with value 0 is the background. 102 | gt_roi_label = label[gt_assignment] + 1 103 | 104 | 105 | # Select foreground RoIs as those with >= pos_iou_thresh IoU. 106 | pos_index = np.where(max_iou >= self.pos_iou_thresh)[0] 107 | pos_roi_per_this_image = int(min(pos_roi_per_image, pos_index.size)) 108 | if pos_index.size > 0: 109 | pos_index = np.random.choice( 110 | pos_index, size=pos_roi_per_this_image, replace=False) 111 | 112 | # Select background RoIs as those within 113 | # [neg_iou_thresh_lo, neg_iou_thresh_hi). 114 | neg_index = np.where((max_iou < self.neg_iou_thresh_hi) & 115 | (max_iou >= self.neg_iou_thresh_lo))[0] 116 | neg_roi_per_this_image = self.n_sample - pos_roi_per_this_image 117 | neg_roi_per_this_image = int(min(neg_roi_per_this_image, 118 | neg_index.size)) 119 | if neg_index.size > 0: 120 | neg_index = np.random.choice( 121 | neg_index, size=neg_roi_per_this_image, replace=False) 122 | 123 | # The indices that we're selecting (both positive and negative). 124 | keep_index = np.append(pos_index, neg_index) 125 | gt_roi_label = gt_roi_label[keep_index] 126 | gt_roi_label[pos_roi_per_this_image:] = 0 # negative labels --> 0 127 | sample_roi = roi[keep_index] 128 | 129 | # Compute offsets and scales to match sampled RoIs to the GTs. 130 | gt_roi_loc = bbox2loc(sample_roi, bbox[gt_assignment[keep_index]]) 131 | gt_roi_loc = ((gt_roi_loc - np.array(loc_normalize_mean, np.float32) 132 | ) / np.array(loc_normalize_std, np.float32)) 133 | 134 | return sample_roi, gt_roi_loc, gt_roi_label 135 | 136 | 137 | class AnchorTargetCreator(object): 138 | """Assign the ground truth bounding boxes to anchors. 139 | 140 | Assigns the ground truth bounding boxes to anchors for training Region 141 | Proposal Networks introduced in Faster R-CNN [#]_. 142 | 143 | Offsets and scales to match anchors to the ground truth are 144 | calculated using the encoding scheme of 145 | :func:`model.utils.bbox_tools.bbox2loc`. 146 | 147 | .. [#] Shaoqing Ren, Kaiming He, Ross Girshick, Jian Sun. 
\ 148 | Faster R-CNN: Towards Real-Time Object Detection with \ 149 | Region Proposal Networks. NIPS 2015. 150 | 151 | Args: 152 | n_sample (int): The number of regions to produce. 153 | pos_iou_thresh (float): Anchors with IoU above this 154 | threshold will be assigned as positive. 155 | neg_iou_thresh (float): Anchors with IoU below this 156 | threshold will be assigned as negative. 157 | pos_ratio (float): Ratio of positive regions in the 158 | sampled regions. 159 | 160 | """ 161 | 162 | def __init__(self, 163 | n_sample=256, 164 | pos_iou_thresh=0.7, neg_iou_thresh=0.3, 165 | pos_ratio=0.5): 166 | self.n_sample = n_sample 167 | self.pos_iou_thresh = pos_iou_thresh 168 | self.neg_iou_thresh = neg_iou_thresh 169 | self.pos_ratio = pos_ratio 170 | 171 | def __call__(self, bbox, anchor, img_size): 172 | """Assign ground truth supervision to sampled subset of anchors. 173 | 174 | Types of input arrays and output arrays are same. 175 | 176 | Here are notations. 177 | 178 | * :math:`S` is the number of anchors. 179 | * :math:`R` is the number of bounding boxes. 180 | 181 | Args: 182 | bbox (array): Coordinates of bounding boxes. Its shape is 183 | :math:`(R, 4)`. 184 | anchor (array): Coordinates of anchors. Its shape is 185 | :math:`(S, 4)`. 186 | img_size (tuple of ints): A tuple :obj:`H, W`, which 187 | is a tuple of height and width of an image. 188 | 189 | Returns: 190 | (array, array): 191 | 192 | #NOTE: it's scale not only offset 193 | * **loc**: Offsets and scales to match the anchors to \ 194 | the ground truth bounding boxes. Its shape is :math:`(S, 4)`. 195 | * **label**: Labels of anchors with values \ 196 | :obj:`(1=positive, 0=negative, -1=ignore)`. Its shape \ 197 | is :math:`(S,)`. 198 | 199 | """ 200 | 201 | img_H, img_W = img_size 202 | 203 | n_anchor = len(anchor) 204 | inside_index = _get_inside_index(anchor, img_H, img_W) 205 | anchor = anchor[inside_index] 206 | argmax_ious, label = self._create_label( 207 | inside_index, anchor, bbox) 208 | 209 | # compute bounding box regression targets 210 | loc = bbox2loc(anchor, bbox[argmax_ious]) 211 | 212 | # map up to original set of anchors 213 | label = _unmap(label, n_anchor, inside_index, fill=-1) 214 | loc = _unmap(loc, n_anchor, inside_index, fill=0) 215 | 216 | return loc, label 217 | 218 | def _create_label(self, inside_index, anchor, bbox): 219 | # label: 1 is positive, 0 is negative, -1 is dont care 220 | label = np.empty((len(inside_index),), dtype=np.int32) 221 | label.fill(-1) 222 | 223 | argmax_ious, max_ious, gt_argmax_ious = \ 224 | self._calc_ious(anchor, bbox, inside_index) 225 | 226 | # assign negative labels first so that positive labels can clobber them 227 | label[max_ious < self.neg_iou_thresh] = 0 228 | 229 | # positive label: for each gt, anchor with highest iou 230 | label[gt_argmax_ious] = 1 231 | 232 | # positive label: above threshold IOU 233 | label[max_ious >= self.pos_iou_thresh] = 1 234 | 235 | # subsample positive labels if we have too many 236 | n_pos = int(self.pos_ratio * self.n_sample) 237 | pos_index = np.where(label == 1)[0] 238 | if len(pos_index) > n_pos: 239 | disable_index = np.random.choice( 240 | pos_index, size=(len(pos_index) - n_pos), replace=False) 241 | label[disable_index] = -1 242 | 243 | # subsample negative labels if we have too many 244 | n_neg = self.n_sample - np.sum(label == 1) 245 | neg_index = np.where(label == 0)[0] 246 | if len(neg_index) > n_neg: 247 | disable_index = np.random.choice( 248 | neg_index, size=(len(neg_index) - n_neg), replace=False) 249 | 
label[disable_index] = -1 250 | 251 | return argmax_ious, label 252 | 253 | def _calc_ious(self, anchor, bbox, inside_index): 254 | # ious between the anchors and the gt boxes 255 | ious = bbox_iou(anchor, bbox) 256 | argmax_ious = ious.argmax(axis=1) 257 | max_ious = ious[np.arange(len(inside_index)), argmax_ious] 258 | gt_argmax_ious = ious.argmax(axis=0) 259 | gt_max_ious = ious[gt_argmax_ious, np.arange(ious.shape[1])] 260 | gt_argmax_ious = np.where(ious == gt_max_ious)[0] 261 | 262 | return argmax_ious, max_ious, gt_argmax_ious 263 | 264 | 265 | def _unmap(data, count, index, fill=0): 266 | # Unmap a subset of item (data) back to the original set of items (of 267 | # size count) 268 | 269 | if len(data.shape) == 1: 270 | ret = np.empty((count,), dtype=data.dtype) 271 | ret.fill(fill) 272 | ret[index] = data 273 | else: 274 | ret = np.empty((count,) + data.shape[1:], dtype=data.dtype) 275 | ret.fill(fill) 276 | ret[index, :] = data 277 | return ret 278 | 279 | 280 | def _get_inside_index(anchor, H, W): 281 | # Calc indicies of anchors which are located completely inside of the image 282 | # whose size is speficied. 283 | index_inside = np.where( 284 | (anchor[:, 0] >= 0) & 285 | (anchor[:, 1] >= 0) & 286 | (anchor[:, 2] <= H) & 287 | (anchor[:, 3] <= W) 288 | )[0] 289 | return index_inside 290 | 291 | 292 | class ProposalCreator: 293 | # unNOTE: I'll make it undifferential 294 | # unTODO: make sure it's ok 295 | # It's ok 296 | """Proposal regions are generated by calling this object. 297 | 298 | The :meth:`__call__` of this object outputs object detection proposals by 299 | applying estimated bounding box offsets 300 | to a set of anchors. 301 | 302 | This class takes parameters to control number of bounding boxes to 303 | pass to NMS and keep after NMS. 304 | If the paramters are negative, it uses all the bounding boxes supplied 305 | or keep all the bounding boxes returned by NMS. 306 | 307 | This class is used for Region Proposal Networks introduced in 308 | Faster R-CNN [#]_. 309 | 310 | .. [#] Shaoqing Ren, Kaiming He, Ross Girshick, Jian Sun. \ 311 | Faster R-CNN: Towards Real-Time Object Detection with \ 312 | Region Proposal Networks. NIPS 2015. 313 | 314 | Args: 315 | nms_thresh (float): Threshold value used when calling NMS. 316 | n_train_pre_nms (int): Number of top scored bounding boxes 317 | to keep before passing to NMS in train mode. 318 | n_train_post_nms (int): Number of top scored bounding boxes 319 | to keep after passing to NMS in train mode. 320 | n_test_pre_nms (int): Number of top scored bounding boxes 321 | to keep before passing to NMS in test mode. 322 | n_test_post_nms (int): Number of top scored bounding boxes 323 | to keep after passing to NMS in test mode. 324 | force_cpu_nms (bool): If this is :obj:`True`, 325 | always use NMS in CPU mode. If :obj:`False`, 326 | the NMS mode is selected based on the type of inputs. 327 | min_size (int): A paramter to determine the threshold on 328 | discarding bounding boxes based on their sizes. 
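        A minimal usage sketch (names are illustrative; :obj:`faster_rcnn`
        stands for whatever detector owns this creator, and the only thing
        :meth:`__call__` reads from it is its :obj:`training` flag, which
        selects the train/test pre- and post-NMS budgets):

            proposal_creator = ProposalCreator(parent_model=faster_rcnn)
            # loc: (R, 4) predicted offsets, score: (R,) foreground scores,
            # anchor: (R, 4) anchor boxes for an image of size (H, W)
            rois = proposal_creator(loc, score, anchor, img_size=(600, 800), scale=1.)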
329 | 330 | """ 331 | 332 | def __init__(self, 333 | parent_model, 334 | nms_thresh=0.7, 335 | n_train_pre_nms=12000, 336 | n_train_post_nms=2000, 337 | n_test_pre_nms=6000, 338 | n_test_post_nms=300, 339 | min_size=16 340 | ): 341 | self.parent_model = parent_model 342 | self.nms_thresh = nms_thresh 343 | self.n_train_pre_nms = n_train_pre_nms 344 | self.n_train_post_nms = n_train_post_nms 345 | self.n_test_pre_nms = n_test_pre_nms 346 | self.n_test_post_nms = n_test_post_nms 347 | self.min_size = min_size 348 | 349 | def __call__(self, loc, score, 350 | anchor, img_size, scale=1.): 351 | """input should be ndarray 352 | Propose RoIs. 353 | 354 | Inputs :obj:`loc, score, anchor` refer to the same anchor when indexed 355 | by the same index. 356 | 357 | On notations, :math:`R` is the total number of anchors. This is equal 358 | to product of the height and the width of an image and the number of 359 | anchor bases per pixel. 360 | 361 | Type of the output is same as the inputs. 362 | 363 | Args: 364 | loc (array): Predicted offsets and scaling to anchors. 365 | Its shape is :math:`(R, 4)`. 366 | score (array): Predicted foreground probability for anchors. 367 | Its shape is :math:`(R,)`. 368 | anchor (array): Coordinates of anchors. Its shape is 369 | :math:`(R, 4)`. 370 | img_size (tuple of ints): A tuple :obj:`height, width`, 371 | which contains image size after scaling. 372 | scale (float): The scaling factor used to scale an image after 373 | reading it from a file. 374 | 375 | Returns: 376 | array: 377 | An array of coordinates of proposal boxes. 378 | Its shape is :math:`(S, 4)`. :math:`S` is less than 379 | :obj:`self.n_test_post_nms` in test time and less than 380 | :obj:`self.n_train_post_nms` in train time. :math:`S` depends on 381 | the size of the predicted bounding boxes and the number of 382 | bounding boxes discarded by NMS. 383 | 384 | """ 385 | # NOTE: when test, remember 386 | # faster_rcnn.eval() 387 | # to set self.traing = False 388 | if self.parent_model.training: 389 | n_pre_nms = self.n_train_pre_nms 390 | n_post_nms = self.n_train_post_nms 391 | else: 392 | n_pre_nms = self.n_test_pre_nms 393 | n_post_nms = self.n_test_post_nms 394 | 395 | # Convert anchors into proposal via bbox transformations. 396 | # roi = loc2bbox(anchor, loc) 397 | roi = loc2bbox(anchor, loc) 398 | 399 | # Clip predicted boxes to image. 400 | roi[:, slice(0, 4, 2)] = np.clip( 401 | roi[:, slice(0, 4, 2)], 0, img_size[0]) 402 | roi[:, slice(1, 4, 2)] = np.clip( 403 | roi[:, slice(1, 4, 2)], 0, img_size[1]) 404 | 405 | # Remove predicted boxes with either height or width < threshold. 406 | min_size = self.min_size * scale 407 | hs = roi[:, 2] - roi[:, 0] 408 | ws = roi[:, 3] - roi[:, 1] 409 | keep = np.where((hs >= min_size) & (ws >= min_size))[0] 410 | roi = roi[keep, :] 411 | score = score[keep] 412 | 413 | # Sort all (proposal, score) pairs by score from highest to lowest. 414 | # Take top pre_nms_topN (e.g. 6000). 415 | order = score.ravel().argsort()[::-1] 416 | if n_pre_nms > 0: 417 | order = order[:n_pre_nms] 418 | roi = roi[order, :] 419 | 420 | # Apply nms (e.g. threshold = 0.7). 421 | # Take after_nms_topN (e.g. 300). 422 | 423 | # unNOTE: somthing is wrong here! 
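        # At this point `roi` is a NumPy array; it is copied to the GPU with
        # `cp.asarray` below only so the CUDA NMS kernel can run, and the kept
        # indices come back as a NumPy array. The TODO refers to removing this
        # host-to-device round trip.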
424 | # TODO: remove cuda.to_gpu 425 | keep = non_maximum_suppression( 426 | cp.ascontiguousarray(cp.asarray(roi)), 427 | thresh=self.nms_thresh) 428 | if n_post_nms > 0: 429 | keep = keep[:n_post_nms] 430 | roi = roi[keep] 431 | return roi 432 | -------------------------------------------------------------------------------- /lib/eval_tool.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | from collections import defaultdict 4 | import itertools 5 | import numpy as np 6 | import six 7 | 8 | from lib.bbox_tools import bbox_iou 9 | 10 | 11 | def eval_detection_voc( 12 | pred_bboxes, pred_labels, pred_scores, gt_bboxes, gt_labels, 13 | gt_difficults=None, 14 | iou_thresh=0.5, use_07_metric=False): 15 | """Calculate average precisions based on evaluation code of PASCAL VOC. 16 | 17 | This function evaluates predicted bounding boxes obtained from a dataset 18 | which has :math:`N` images by using average precision for each class. 19 | The code is based on the evaluation code used in PASCAL VOC Challenge. 20 | 21 | Args: 22 | pred_bboxes (iterable of numpy.ndarray): An iterable of :math:`N` 23 | sets of bounding boxes. 24 | Its index corresponds to an index for the base dataset. 25 | Each element of :obj:`pred_bboxes` is a set of coordinates 26 | of bounding boxes. This is an array whose shape is :math:`(R, 4)`, 27 | where :math:`R` corresponds 28 | to the number of bounding boxes, which may vary among boxes. 29 | The second axis corresponds to 30 | :math:`y_{min}, x_{min}, y_{max}, x_{max}` of a bounding box. 31 | pred_labels (iterable of numpy.ndarray): An iterable of labels. 32 | Similar to :obj:`pred_bboxes`, its index corresponds to an 33 | index for the base dataset. Its length is :math:`N`. 34 | pred_scores (iterable of numpy.ndarray): An iterable of confidence 35 | scores for predicted bounding boxes. Similar to :obj:`pred_bboxes`, 36 | its index corresponds to an index for the base dataset. 37 | Its length is :math:`N`. 38 | gt_bboxes (iterable of numpy.ndarray): An iterable of ground truth 39 | bounding boxes 40 | whose length is :math:`N`. An element of :obj:`gt_bboxes` is a 41 | bounding box whose shape is :math:`(R, 4)`. Note that the number of 42 | bounding boxes in each image does not need to be same as the number 43 | of corresponding predicted boxes. 44 | gt_labels (iterable of numpy.ndarray): An iterable of ground truth 45 | labels which are organized similarly to :obj:`gt_bboxes`. 46 | gt_difficults (iterable of numpy.ndarray): An iterable of boolean 47 | arrays which is organized similarly to :obj:`gt_bboxes`. 48 | This tells whether the 49 | corresponding ground truth bounding box is difficult or not. 50 | By default, this is :obj:`None`. In that case, this function 51 | considers all bounding boxes to be not difficult. 52 | iou_thresh (float): A prediction is correct if its Intersection over 53 | Union with the ground truth is above this value. 54 | use_07_metric (bool): Whether to use PASCAL VOC 2007 evaluation metric 55 | for calculating average precision. The default value is 56 | :obj:`False`. 57 | 58 | Returns: 59 | dict: 60 | 61 | The keys, value-types and the description of the values are listed 62 | below. 63 | 64 | * **ap** (*numpy.ndarray*): An array of average precisions. \ 65 | The :math:`l`-th value corresponds to the average precision \ 66 | for class :math:`l`. 
If class :math:`l` does not exist in \ 67 | either :obj:`pred_labels` or :obj:`gt_labels`, the corresponding \ 68 | value is set to :obj:`numpy.nan`. 69 | * **map** (*float*): The average of Average Precisions over classes. 70 | 71 | """ 72 | 73 | prec, rec = calc_detection_voc_prec_rec( 74 | pred_bboxes, pred_labels, pred_scores, 75 | gt_bboxes, gt_labels, gt_difficults, 76 | iou_thresh=iou_thresh) 77 | 78 | ap = calc_detection_voc_ap(prec, rec, use_07_metric=use_07_metric) 79 | 80 | return {'ap': ap, 'map': np.nanmean(ap)} 81 | 82 | 83 | def calc_detection_voc_prec_rec( 84 | pred_bboxes, pred_labels, pred_scores, gt_bboxes, gt_labels, 85 | gt_difficults=None, 86 | iou_thresh=0.5): 87 | """Calculate precision and recall based on evaluation code of PASCAL VOC. 88 | 89 | This function calculates precision and recall of 90 | predicted bounding boxes obtained from a dataset which has :math:`N` 91 | images. 92 | The code is based on the evaluation code used in PASCAL VOC Challenge. 93 | 94 | Args: 95 | pred_bboxes (iterable of numpy.ndarray): An iterable of :math:`N` 96 | sets of bounding boxes. 97 | Its index corresponds to an index for the base dataset. 98 | Each element of :obj:`pred_bboxes` is a set of coordinates 99 | of bounding boxes. This is an array whose shape is :math:`(R, 4)`, 100 | where :math:`R` corresponds 101 | to the number of bounding boxes, which may vary among boxes. 102 | The second axis corresponds to 103 | :math:`y_{min}, x_{min}, y_{max}, x_{max}` of a bounding box. 104 | pred_labels (iterable of numpy.ndarray): An iterable of labels. 105 | Similar to :obj:`pred_bboxes`, its index corresponds to an 106 | index for the base dataset. Its length is :math:`N`. 107 | pred_scores (iterable of numpy.ndarray): An iterable of confidence 108 | scores for predicted bounding boxes. Similar to :obj:`pred_bboxes`, 109 | its index corresponds to an index for the base dataset. 110 | Its length is :math:`N`. 111 | gt_bboxes (iterable of numpy.ndarray): An iterable of ground truth 112 | bounding boxes 113 | whose length is :math:`N`. An element of :obj:`gt_bboxes` is a 114 | bounding box whose shape is :math:`(R, 4)`. Note that the number of 115 | bounding boxes in each image does not need to be same as the number 116 | of corresponding predicted boxes. 117 | gt_labels (iterable of numpy.ndarray): An iterable of ground truth 118 | labels which are organized similarly to :obj:`gt_bboxes`. 119 | gt_difficults (iterable of numpy.ndarray): An iterable of boolean 120 | arrays which is organized similarly to :obj:`gt_bboxes`. 121 | This tells whether the 122 | corresponding ground truth bounding box is difficult or not. 123 | By default, this is :obj:`None`. In that case, this function 124 | considers all bounding boxes to be not difficult. 125 | iou_thresh (float): A prediction is correct if its Intersection over 126 | Union with the ground truth is above this value.. 127 | 128 | Returns: 129 | tuple of two lists: 130 | This function returns two lists: :obj:`prec` and :obj:`rec`. 131 | 132 | * :obj:`prec`: A list of arrays. :obj:`prec[l]` is precision \ 133 | for class :math:`l`. If class :math:`l` does not exist in \ 134 | either :obj:`pred_labels` or :obj:`gt_labels`, :obj:`prec[l]` is \ 135 | set to :obj:`None`. 136 | * :obj:`rec`: A list of arrays. :obj:`rec[l]` is recall \ 137 | for class :math:`l`. If class :math:`l` that is not marked as \ 138 | difficult does not exist in \ 139 | :obj:`gt_labels`, :obj:`rec[l]` is \ 140 | set to :obj:`None`. 
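        A short usage sketch (per-image :obj:`numpy.ndarray` iterables collected
        from an evaluation loop; the variable names are illustrative):

            prec, rec = calc_detection_voc_prec_rec(
                pred_bboxes, pred_labels, pred_scores,
                gt_bboxes, gt_labels, gt_difficults)
            ap = calc_detection_voc_ap(prec, rec, use_07_metric=True)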
141 | 142 | """ 143 | 144 | pred_bboxes = iter(pred_bboxes) 145 | pred_labels = iter(pred_labels) 146 | pred_scores = iter(pred_scores) 147 | gt_bboxes = iter(gt_bboxes) 148 | gt_labels = iter(gt_labels) 149 | if gt_difficults is None: 150 | gt_difficults = itertools.repeat(None) 151 | else: 152 | gt_difficults = iter(gt_difficults) 153 | 154 | n_pos = defaultdict(int) 155 | score = defaultdict(list) 156 | match = defaultdict(list) 157 | 158 | for pred_bbox, pred_label, pred_score, gt_bbox, gt_label, gt_difficult in \ 159 | six.moves.zip( 160 | pred_bboxes, pred_labels, pred_scores, 161 | gt_bboxes, gt_labels, gt_difficults): 162 | 163 | if gt_difficult is None: 164 | gt_difficult = np.zeros(gt_bbox.shape[0], dtype=bool) 165 | 166 | for l in np.unique(np.concatenate((pred_label, gt_label)).astype(int)): 167 | pred_mask_l = pred_label == l 168 | pred_bbox_l = pred_bbox[pred_mask_l] 169 | pred_score_l = pred_score[pred_mask_l] 170 | # sort by score 171 | order = pred_score_l.argsort()[::-1] 172 | pred_bbox_l = pred_bbox_l[order] 173 | pred_score_l = pred_score_l[order] 174 | 175 | gt_mask_l = gt_label == l 176 | gt_bbox_l = gt_bbox[gt_mask_l] 177 | gt_difficult_l = gt_difficult[gt_mask_l] 178 | 179 | n_pos[l] += np.logical_not(gt_difficult_l).sum() 180 | score[l].extend(pred_score_l) 181 | 182 | if len(pred_bbox_l) == 0: 183 | continue 184 | if len(gt_bbox_l) == 0: 185 | match[l].extend((0,) * pred_bbox_l.shape[0]) 186 | continue 187 | 188 | # VOC evaluation follows integer typed bounding boxes. 189 | pred_bbox_l = pred_bbox_l.copy() 190 | pred_bbox_l[:, 2:] += 1 191 | gt_bbox_l = gt_bbox_l.copy() 192 | gt_bbox_l[:, 2:] += 1 193 | 194 | iou = bbox_iou(pred_bbox_l, gt_bbox_l) 195 | gt_index = iou.argmax(axis=1) 196 | # set -1 if there is no matching ground truth 197 | gt_index[iou.max(axis=1) < iou_thresh] = -1 198 | del iou 199 | 200 | selec = np.zeros(gt_bbox_l.shape[0], dtype=bool) 201 | for gt_idx in gt_index: 202 | if gt_idx >= 0: 203 | if gt_difficult_l[gt_idx]: 204 | match[l].append(-1) 205 | else: 206 | if not selec[gt_idx]: 207 | match[l].append(1) 208 | else: 209 | match[l].append(0) 210 | selec[gt_idx] = True 211 | else: 212 | match[l].append(0) 213 | 214 | for iter_ in ( 215 | pred_bboxes, pred_labels, pred_scores, 216 | gt_bboxes, gt_labels, gt_difficults): 217 | if next(iter_, None) is not None: 218 | raise ValueError('Length of input iterables need to be same.') 219 | 220 | n_fg_class = max(n_pos.keys()) + 1 221 | prec = [None] * n_fg_class 222 | rec = [None] * n_fg_class 223 | 224 | for l in n_pos.keys(): 225 | score_l = np.array(score[l]) 226 | match_l = np.array(match[l], dtype=np.int8) 227 | 228 | order = score_l.argsort()[::-1] 229 | match_l = match_l[order] 230 | 231 | tp = np.cumsum(match_l == 1) 232 | fp = np.cumsum(match_l == 0) 233 | 234 | # If an element of fp + tp is 0, 235 | # the corresponding element of prec[l] is nan. 236 | prec[l] = tp / (fp + tp) 237 | # If n_pos[l] is 0, rec[l] is None. 238 | if n_pos[l] > 0: 239 | rec[l] = tp / n_pos[l] 240 | 241 | return prec, rec 242 | 243 | 244 | def calc_detection_voc_ap(prec, rec, use_07_metric=False): 245 | """Calculate average precisions based on evaluation code of PASCAL VOC. 246 | 247 | This function calculates average precisions 248 | from given precisions and recalls. 249 | The code is based on the evaluation code used in PASCAL VOC Challenge. 250 | 251 | Args: 252 | prec (list of numpy.array): A list of arrays. 253 | :obj:`prec[l]` indicates precision for class :math:`l`. 
254 | If :obj:`prec[l]` is :obj:`None`, this function returns 255 | :obj:`numpy.nan` for class :math:`l`. 256 | rec (list of numpy.array): A list of arrays. 257 | :obj:`rec[l]` indicates recall for class :math:`l`. 258 | If :obj:`rec[l]` is :obj:`None`, this function returns 259 | :obj:`numpy.nan` for class :math:`l`. 260 | use_07_metric (bool): Whether to use PASCAL VOC 2007 evaluation metric 261 | for calculating average precision. The default value is 262 | :obj:`False`. 263 | 264 | Returns: 265 | ~numpy.ndarray: 266 | This function returns an array of average precisions. 267 | The :math:`l`-th value corresponds to the average precision 268 | for class :math:`l`. If :obj:`prec[l]` or :obj:`rec[l]` is 269 | :obj:`None`, the corresponding value is set to :obj:`numpy.nan`. 270 | 271 | """ 272 | 273 | n_fg_class = len(prec) 274 | ap = np.empty(n_fg_class) 275 | for l in six.moves.range(n_fg_class): 276 | if prec[l] is None or rec[l] is None: 277 | ap[l] = np.nan 278 | continue 279 | 280 | if use_07_metric: 281 | # 11 point metric 282 | ap[l] = 0 283 | for t in np.arange(0., 1.1, 0.1): 284 | if np.sum(rec[l] >= t) == 0: 285 | p = 0 286 | else: 287 | p = np.max(np.nan_to_num(prec[l])[rec[l] >= t]) 288 | ap[l] += p / 11 289 | else: 290 | # correct AP calculation 291 | # first append sentinel values at the end 292 | mpre = np.concatenate(([0], np.nan_to_num(prec[l]), [0])) 293 | mrec = np.concatenate(([0], rec[l], [1])) 294 | 295 | mpre = np.maximum.accumulate(mpre[::-1])[::-1] 296 | 297 | # to calculate area under PR curve, look for points 298 | # where X axis (recall) changes value 299 | i = np.where(mrec[1:] != mrec[:-1])[0] 300 | 301 | # and sum (\Delta recall) * prec 302 | ap[l] = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 303 | 304 | return ap 305 | -------------------------------------------------------------------------------- /lib/nms/__init__.py: -------------------------------------------------------------------------------- 1 | from lib.nms.non_maximum_suppression import non_maximum_suppression -------------------------------------------------------------------------------- /lib/nms/_nms_gpu_post.pyx: -------------------------------------------------------------------------------- 1 | cimport numpy as np 2 | from libc.stdint cimport uint64_t 3 | 4 | import numpy as np 5 | 6 | def _nms_gpu_post(np.ndarray[np.uint64_t, ndim=1] mask, 7 | int n_bbox, 8 | int threads_per_block, 9 | int col_blocks 10 | ): 11 | cdef: 12 | int i, j, nblock, index 13 | uint64_t inblock 14 | int n_selection = 0 15 | uint64_t one_ull = 1 16 | np.ndarray[np.int32_t, ndim=1] selection 17 | np.ndarray[np.uint64_t, ndim=1] remv 18 | 19 | selection = np.zeros((n_bbox,), dtype=np.int32) 20 | remv = np.zeros((col_blocks,), dtype=np.uint64) 21 | 22 | for i in range(n_bbox): 23 | nblock = i // threads_per_block 24 | inblock = i % threads_per_block 25 | 26 | if not (remv[nblock] & one_ull << inblock): 27 | selection[n_selection] = i 28 | n_selection += 1 29 | 30 | index = i * col_blocks 31 | for j in range(nblock, col_blocks): 32 | remv[j] |= mask[index + j] 33 | return selection, n_selection 34 | -------------------------------------------------------------------------------- /lib/nms/_nms_gpu_post_py.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | 4 | def _nms_gpu_post( mask, 5 | n_bbox, 6 | threads_per_block, 7 | col_blocks 8 | ): 9 | n_selection = 0 10 | one_ull = np.array([1],dtype=np.uint64) 11 | selection = np.zeros((n_bbox,), dtype=np.int32) 
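    # `mask` is the bit matrix written by the CUDA kernel: the uint64 at
    # mask[i * col_blocks + j] has bit b set when box (j * threads_per_block + b)
    # overlaps box i with IoU above the threshold. `remv` (allocated below)
    # accumulates those bits, so box i is kept only if no previously kept box
    # has already marked it as suppressed.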
12 | remv = np.zeros((col_blocks,), dtype=np.uint64) 13 | 14 | for i in range(n_bbox): 15 | nblock = i // threads_per_block 16 | inblock = i % threads_per_block 17 | 18 | if not (remv[nblock] & one_ull << inblock): 19 | selection[n_selection] = i 20 | n_selection += 1 21 | 22 | index = i * col_blocks 23 | for j in range(nblock, col_blocks): 24 | remv[j] |= mask[index + j] 25 | return selection, n_selection 26 | -------------------------------------------------------------------------------- /lib/nms/build.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | from distutils.extension import Extension 3 | from Cython.Distutils import build_ext 4 | 5 | ext_modules = [Extension("_nms_gpu_post", ["_nms_gpu_post.pyx"])] 6 | setup( 7 | name="Hello pyx", 8 | cmdclass={'build_ext': build_ext}, 9 | ext_modules=ext_modules 10 | ) 11 | -------------------------------------------------------------------------------- /lib/nms/non_maximum_suppression.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import numpy as np 3 | import cupy as cp 4 | try: 5 | from ._nms_gpu_post import _nms_gpu_post 6 | except: 7 | import warnings 8 | warnings.warn(''' 9 | the python code for non_maximum_suppression is about 2x slow 10 | It is strongly recommended to build cython code: 11 | `cd lib/nms/; python build.py build_ext --inplace''') 12 | from ._nms_gpu_post_py import _nms_gpu_post 13 | 14 | 15 | @cp.util.memoize(for_each_device=True) 16 | def _load_kernel(kernel_name, code, options=()): 17 | cp.cuda.runtime.free(0) 18 | assert isinstance(options, tuple) 19 | kernel_code = cp.cuda.compile_with_cache(code, options=options) 20 | return kernel_code.get_function(kernel_name) 21 | 22 | 23 | def non_maximum_suppression(bbox, thresh, score=None, 24 | limit=None): 25 | """Suppress bounding boxes according to their IoUs. 26 | 27 | This method checks each bounding box sequentially and selects the bounding 28 | box if the Intersection over Unions (IoUs) between the bounding box and the 29 | previously selected bounding boxes is less than :obj:`thresh`. This method 30 | is mainly used as postprocessing of object detection. 31 | The bounding boxes are selected from ones with higher scores. 32 | If :obj:`score` is not provided as an argument, the bounding box 33 | is ordered by its index in ascending order. 34 | 35 | The bounding boxes are expected to be packed into a two dimensional 36 | tensor of shape :math:`(R, 4)`, where :math:`R` is the number of 37 | bounding boxes in the image. The second axis represents attributes of 38 | the bounding box. They are :math:`(y_{min}, x_{min}, y_{max}, x_{max})`, 39 | where the four attributes are coordinates of the top left and the 40 | bottom right vertices. 41 | 42 | :obj:`score` is a float array of shape :math:`(R,)`. Each score indicates 43 | confidence of prediction. 44 | 45 | This function accepts both :obj:`numpy.ndarray` and :obj:`cupy.ndarray` as 46 | an input. Please note that both :obj:`bbox` and :obj:`score` need to be 47 | the same type. 48 | The type of the output is the same as the input. 49 | 50 | Args: 51 | bbox (array): Bounding boxes to be transformed. The shape is 52 | :math:`(R, 4)`. :math:`R` is the number of bounding boxes. 53 | thresh (float): Threshold of IoUs. 54 | score (array): An array of confidences whose shape is :math:`(R,)`. 55 | limit (int): The upper bound of the number of the output bounding 56 | boxes. 
If it is not specified, this method selects as many 57 | bounding boxes as possible. 58 | 59 | Returns: 60 | array: 61 | An array with indices of bounding boxes that are selected. \ 62 | They are sorted by the scores of bounding boxes in descending \ 63 | order. \ 64 | The shape of this array is :math:`(K,)` and its dtype is\ 65 | :obj:`numpy.int32`. Note that :math:`K \\leq R`. 66 | 67 | """ 68 | 69 | return _non_maximum_suppression_gpu(bbox, thresh, score, limit) 70 | 71 | 72 | def _non_maximum_suppression_gpu(bbox, thresh, score=None, limit=None): 73 | if len(bbox) == 0: 74 | return cp.zeros((0,), dtype=np.int32) 75 | 76 | n_bbox = bbox.shape[0] 77 | 78 | if score is not None: 79 | order = score.argsort()[::-1].astype(np.int32) 80 | else: 81 | order = cp.arange(n_bbox, dtype=np.int32) 82 | 83 | sorted_bbox = bbox[order, :] 84 | selec, n_selec = _call_nms_kernel( 85 | sorted_bbox, thresh) 86 | selec = selec[:n_selec] 87 | selec = order[selec] 88 | if limit is not None: 89 | selec = selec[:limit] 90 | return cp.asnumpy(selec) 91 | 92 | 93 | _nms_gpu_code = ''' 94 | #define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0)) 95 | int const threadsPerBlock = sizeof(unsigned long long) * 8; 96 | 97 | __device__ 98 | inline float devIoU(float const *const bbox_a, float const *const bbox_b) { 99 | float top = max(bbox_a[0], bbox_b[0]); 100 | float bottom = min(bbox_a[2], bbox_b[2]); 101 | float left = max(bbox_a[1], bbox_b[1]); 102 | float right = min(bbox_a[3], bbox_b[3]); 103 | float height = max(bottom - top, 0.f); 104 | float width = max(right - left, 0.f); 105 | float area_i = height * width; 106 | float area_a = (bbox_a[2] - bbox_a[0]) * (bbox_a[3] - bbox_a[1]); 107 | float area_b = (bbox_b[2] - bbox_b[0]) * (bbox_b[3] - bbox_b[1]); 108 | return area_i / (area_a + area_b - area_i); 109 | } 110 | 111 | extern "C" 112 | __global__ 113 | void nms_kernel(const int n_bbox, const float thresh, 114 | const float *dev_bbox, 115 | unsigned long long *dev_mask) { 116 | const int row_start = blockIdx.y; 117 | const int col_start = blockIdx.x; 118 | 119 | const int row_size = 120 | min(n_bbox - row_start * threadsPerBlock, threadsPerBlock); 121 | const int col_size = 122 | min(n_bbox - col_start * threadsPerBlock, threadsPerBlock); 123 | 124 | __shared__ float block_bbox[threadsPerBlock * 4]; 125 | if (threadIdx.x < col_size) { 126 | block_bbox[threadIdx.x * 4 + 0] = 127 | dev_bbox[(threadsPerBlock * col_start + threadIdx.x) * 4 + 0]; 128 | block_bbox[threadIdx.x * 4 + 1] = 129 | dev_bbox[(threadsPerBlock * col_start + threadIdx.x) * 4 + 1]; 130 | block_bbox[threadIdx.x * 4 + 2] = 131 | dev_bbox[(threadsPerBlock * col_start + threadIdx.x) * 4 + 2]; 132 | block_bbox[threadIdx.x * 4 + 3] = 133 | dev_bbox[(threadsPerBlock * col_start + threadIdx.x) * 4 + 3]; 134 | } 135 | __syncthreads(); 136 | 137 | if (threadIdx.x < row_size) { 138 | const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; 139 | const float *cur_box = dev_bbox + cur_box_idx * 4; 140 | int i = 0; 141 | unsigned long long t = 0; 142 | int start = 0; 143 | if (row_start == col_start) { 144 | start = threadIdx.x + 1; 145 | } 146 | for (i = start; i < col_size; i++) { 147 | if (devIoU(cur_box, block_bbox + i * 4) >= thresh) { 148 | t |= 1ULL << i; 149 | } 150 | } 151 | const int col_blocks = DIVUP(n_bbox, threadsPerBlock); 152 | dev_mask[cur_box_idx * col_blocks + col_start] = t; 153 | } 154 | } 155 | ''' 156 | 157 | 158 | def _call_nms_kernel(bbox, thresh): 159 | # PyTorch does not support unsigned long Tensor. 
160 | # Doesn't matter,since it returns ndarray finally. 161 | # So I'll keep it unmodified. 162 | n_bbox = bbox.shape[0] 163 | threads_per_block = 64 164 | col_blocks = np.ceil(n_bbox / threads_per_block).astype(np.int32) 165 | blocks = (col_blocks, col_blocks, 1) 166 | threads = (threads_per_block, 1, 1) 167 | 168 | mask_dev = cp.zeros((n_bbox * col_blocks,), dtype=np.uint64) 169 | bbox = cp.ascontiguousarray(bbox, dtype=np.float32) # NOTE: 变成连续的 170 | kern = _load_kernel('nms_kernel', _nms_gpu_code) 171 | kern(blocks, threads, args=(cp.int32(n_bbox), cp.float32(thresh), 172 | bbox, mask_dev)) 173 | 174 | mask_host = mask_dev.get() 175 | selection, n_selec = _nms_gpu_post( 176 | mask_host, n_bbox, threads_per_block, col_blocks) 177 | return selection, n_selec 178 | -------------------------------------------------------------------------------- /lib/relation_tool.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | def RankEmbedding(rank_dim=128,feat_dim=1024,wave_len=1000): 4 | rank_range = torch.arange(0, rank_dim).cuda().float() 5 | 6 | feat_range = torch.arange(feat_dim / 2).cuda() 7 | dim_mat = feat_range / (feat_dim / 2) 8 | dim_mat = 1. / (torch.pow(wave_len, dim_mat)) 9 | 10 | dim_mat = dim_mat.view(1, -1) 11 | rank_mat = rank_range.view(-1, 1) 12 | 13 | mul_mat = rank_mat * dim_mat 14 | sin_mat = torch.sin(mul_mat) 15 | cos_mat = torch.cos(mul_mat) 16 | embedding = torch.cat((sin_mat, cos_mat), -1) 17 | 18 | return embedding 19 | def PositionalEmbedding( f_g, dim_g=64, wave_len=1000): 20 | x_min, y_min, x_max, y_max = torch.chunk(f_g, 4, dim=1) 21 | 22 | cx = (x_min + x_max) * 0.5 23 | cy = (y_min + y_max) * 0.5 24 | w = (x_max - x_min) + 1. 25 | h = (y_max - y_min) + 1. 26 | 27 | delta_x = cx - cx.view(1, -1) 28 | delta_x = torch.clamp(torch.abs(delta_x / w), min=1e-3) 29 | delta_x = torch.log(delta_x) 30 | 31 | delta_y = cy - cy.view(1, -1) 32 | delta_y = torch.clamp(torch.abs(delta_y / h), min=1e-3) 33 | delta_y = torch.log(delta_y) 34 | 35 | delta_w = torch.log(w / w.view(1, -1)) 36 | delta_h = torch.log(h / h.view(1, -1)) 37 | size = delta_h.size() 38 | 39 | delta_x = delta_x.view(size[0], size[1], 1) 40 | delta_y = delta_y.view(size[0], size[1], 1) 41 | delta_w = delta_w.view(size[0], size[1], 1) 42 | delta_h = delta_h.view(size[0], size[1], 1) 43 | 44 | position_mat = torch.cat((delta_x, delta_y, delta_w, delta_h), -1) 45 | 46 | feat_range = torch.arange(dim_g / 8).cuda() 47 | dim_mat = feat_range / (dim_g / 8) 48 | dim_mat = 1. / (torch.pow(wave_len, dim_mat)) 49 | 50 | dim_mat = dim_mat.view(1, 1, 1, -1) 51 | position_mat = position_mat.view(size[0], size[1], 4, -1) 52 | position_mat = 100. 
* position_mat 53 | 54 | mul_mat = position_mat * dim_mat 55 | mul_mat = mul_mat.view(size[0], size[1], -1) 56 | sin_mat = torch.sin(mul_mat) 57 | cos_mat = torch.cos(mul_mat) 58 | embedding = torch.cat((sin_mat, cos_mat), -1) 59 | 60 | return embedding -------------------------------------------------------------------------------- /lib/roi_cupy.py: -------------------------------------------------------------------------------- 1 | kernel_forward = ''' 2 | extern "C" 3 | __global__ void roi_forward(const float* const bottom_data,const float* const bottom_rois, 4 | float* top_data, int* argmax_data, 5 | const double spatial_scale,const int channels,const int height, 6 | const int width, const int pooled_height, 7 | const int pooled_width,const int NN 8 | ){ 9 | 10 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 11 | if(idx>=NN) 12 | return; 13 | const int pw = idx % pooled_width; 14 | const int ph = (idx / pooled_width) % pooled_height; 15 | const int c = (idx / pooled_width / pooled_height) % channels; 16 | int num = idx / pooled_width / pooled_height / channels; 17 | const int roi_batch_ind = bottom_rois[num * 5 + 0]; 18 | const int roi_start_w = round(bottom_rois[num * 5 + 1] * spatial_scale); 19 | const int roi_start_h = round(bottom_rois[num * 5 + 2] * spatial_scale); 20 | const int roi_end_w = round(bottom_rois[num * 5 + 3] * spatial_scale); 21 | const int roi_end_h = round(bottom_rois[num * 5 + 4] * spatial_scale); 22 | // Force malformed ROIs to be 1x1 23 | const int roi_width = max(roi_end_w - roi_start_w + 1, 1); 24 | const int roi_height = max(roi_end_h - roi_start_h + 1, 1); 25 | const float bin_size_h = static_cast(roi_height) 26 | / static_cast(pooled_height); 27 | const float bin_size_w = static_cast(roi_width) 28 | / static_cast(pooled_width); 29 | 30 | int hstart = static_cast(floor(static_cast(ph) 31 | * bin_size_h)); 32 | int wstart = static_cast(floor(static_cast(pw) 33 | * bin_size_w)); 34 | int hend = static_cast(ceil(static_cast(ph + 1) 35 | * bin_size_h)); 36 | int wend = static_cast(ceil(static_cast(pw + 1) 37 | * bin_size_w)); 38 | 39 | // Add roi offsets and clip to input boundaries 40 | hstart = min(max(hstart + roi_start_h, 0), height); 41 | hend = min(max(hend + roi_start_h, 0), height); 42 | wstart = min(max(wstart + roi_start_w, 0), width); 43 | wend = min(max(wend + roi_start_w, 0), width); 44 | bool is_empty = (hend <= hstart) || (wend <= wstart); 45 | 46 | // Define an empty pooling region to be zero 47 | float maxval = is_empty ? 
0 : -1E+37; 48 | // If nothing is pooled, argmax=-1 causes nothing to be backprop'd 49 | int maxidx = -1; 50 | const int data_offset = (roi_batch_ind * channels + c) * height * width; 51 | for (int h = hstart; h < hend; ++h) { 52 | for (int w = wstart; w < wend; ++w) { 53 | int bottom_index = h * width + w; 54 | if (bottom_data[data_offset + bottom_index] > maxval) { 55 | maxval = bottom_data[data_offset + bottom_index]; 56 | maxidx = bottom_index; 57 | } 58 | } 59 | } 60 | top_data[idx]=maxval; 61 | argmax_data[idx]=maxidx; 62 | } 63 | ''' 64 | kernel_backward = ''' 65 | extern "C" 66 | __global__ void roi_backward(const float* const top_diff, 67 | const int* const argmax_data,const float* const bottom_rois, 68 | float* bottom_diff, const int num_rois, 69 | const double spatial_scale, int channels, 70 | int height, int width, int pooled_height, 71 | int pooled_width,const int NN) 72 | { 73 | 74 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 75 | ////Importtan >= instead of > 76 | if(idx>=NN) 77 | return; 78 | int w = idx % width; 79 | int h = (idx / width) % height; 80 | int c = (idx/ (width * height)) % channels; 81 | int num = idx / (width * height * channels); 82 | 83 | float gradient = 0; 84 | // Accumulate gradient over all ROIs that pooled this element 85 | for (int roi_n = 0; roi_n < num_rois; ++roi_n) { 86 | // Skip if ROI's batch index doesn't match num 87 | if (num != static_cast(bottom_rois[roi_n * 5])) { 88 | continue; 89 | } 90 | 91 | int roi_start_w = round(bottom_rois[roi_n * 5 + 1] 92 | * spatial_scale); 93 | int roi_start_h = round(bottom_rois[roi_n * 5 + 2] 94 | * spatial_scale); 95 | int roi_end_w = round(bottom_rois[roi_n * 5 + 3] 96 | * spatial_scale); 97 | int roi_end_h = round(bottom_rois[roi_n * 5 + 4] 98 | * spatial_scale); 99 | 100 | // Skip if ROI doesn't include (h, w) 101 | const bool in_roi = (w >= roi_start_w && w <= roi_end_w && 102 | h >= roi_start_h && h <= roi_end_h); 103 | if (!in_roi) { 104 | continue; 105 | } 106 | 107 | int offset = (roi_n * channels + c) * pooled_height 108 | * pooled_width; 109 | 110 | // Compute feasible set of pooled units that could have pooled 111 | // this bottom unit 112 | 113 | // Force malformed ROIs to be 1x1 114 | int roi_width = max(roi_end_w - roi_start_w + 1, 1); 115 | int roi_height = max(roi_end_h - roi_start_h + 1, 1); 116 | 117 | float bin_size_h = static_cast(roi_height) 118 | / static_cast(pooled_height); 119 | float bin_size_w = static_cast(roi_width) 120 | / static_cast(pooled_width); 121 | 122 | int phstart = floor(static_cast(h - roi_start_h) 123 | / bin_size_h); 124 | int phend = ceil(static_cast(h - roi_start_h + 1) 125 | / bin_size_h); 126 | int pwstart = floor(static_cast(w - roi_start_w) 127 | / bin_size_w); 128 | int pwend = ceil(static_cast(w - roi_start_w + 1) 129 | / bin_size_w); 130 | 131 | phstart = min(max(phstart, 0), pooled_height); 132 | phend = min(max(phend, 0), pooled_height); 133 | pwstart = min(max(pwstart, 0), pooled_width); 134 | pwend = min(max(pwend, 0), pooled_width); 135 | for (int ph = phstart; ph < phend; ++ph) { 136 | for (int pw = pwstart; pw < pwend; ++pw) { 137 | int index_ = ph * pooled_width + pw + offset; 138 | if (argmax_data[index_] == (h * width + w)) { 139 | gradient += top_diff[index_]; 140 | } 141 | } 142 | } 143 | } 144 | bottom_diff[idx] = gradient; 145 | } 146 | ''' 147 | -------------------------------------------------------------------------------- /lib/vis_tool.py: -------------------------------------------------------------------------------- 1 | import 
time 2 | 3 | import numpy as np 4 | import matplotlib 5 | import torch as t 6 | import visdom 7 | 8 | matplotlib.use('Agg') 9 | from matplotlib import pyplot as plot 10 | 11 | # from data.voc_dataset import VOC_BBOX_LABEL_NAMES 12 | 13 | 14 | VOC_BBOX_LABEL_NAMES = ( 15 | 'fly', 16 | 'bike', 17 | 'bird', 18 | 'boat', 19 | 'pin', 20 | 'bus', 21 | 'c', 22 | 'cat', 23 | 'chair', 24 | 'cow', 25 | 'table', 26 | 'dog', 27 | 'horse', 28 | 'moto', 29 | 'p', 30 | 'plant', 31 | 'shep', 32 | 'sofa', 33 | 'train', 34 | 'tv', 35 | ) 36 | 37 | 38 | def vis_image(img, ax=None): 39 | """Visualize a color image. 40 | 41 | Args: 42 | img (~numpy.ndarray): An array of shape :math:`(3, height, width)`. 43 | This is in RGB format and the range of its value is 44 | :math:`[0, 255]`. 45 | ax (matplotlib.axes.Axis): The visualization is displayed on this 46 | axis. If this is :obj:`None` (default), a new axis is created. 47 | 48 | Returns: 49 | ~matploblib.axes.Axes: 50 | Returns the Axes object with the plot for further tweaking. 51 | 52 | """ 53 | 54 | if ax is None: 55 | fig = plot.figure() 56 | ax = fig.add_subplot(1, 1, 1) 57 | # CHW -> HWC 58 | img = img.transpose((1, 2, 0)) 59 | 60 | ax.imshow(img.astype(np.uint8)) 61 | return ax 62 | 63 | 64 | def vis_bbox(img, bbox, label=None, score=None, ax=None): 65 | """Visualize bounding boxes inside image. 66 | 67 | Args: 68 | img (~numpy.ndarray): An array of shape :math:`(3, height, width)`. 69 | This is in RGB format and the range of its value is 70 | :math:`[0, 255]`. 71 | bbox (~numpy.ndarray): An array of shape :math:`(R, 4)`, where 72 | :math:`R` is the number of bounding boxes in the image. 73 | Each element is organized 74 | by :math:`(y_{min}, x_{min}, y_{max}, x_{max})` in the second axis. 75 | label (~numpy.ndarray): An integer array of shape :math:`(R,)`. 76 | The values correspond to id for label names stored in 77 | :obj:`label_names`. This is optional. 78 | score (~numpy.ndarray): A float array of shape :math:`(R,)`. 79 | Each value indicates how confident the prediction is. 80 | This is optional. 81 | label_names (iterable of strings): Name of labels ordered according 82 | to label ids. If this is :obj:`None`, labels will be skipped. 83 | ax (matplotlib.axes.Axis): The visualization is displayed on this 84 | axis. If this is :obj:`None` (default), a new axis is created. 85 | 86 | Returns: 87 | ~matploblib.axes.Axes: 88 | Returns the Axes object with the plot for further tweaking. 89 | 90 | """ 91 | 92 | label_names = list(VOC_BBOX_LABEL_NAMES) + ['bg'] 93 | # add for index `-1` 94 | if label is not None and not len(bbox) == len(label): 95 | raise ValueError('The length of label must be same as that of bbox') 96 | if score is not None and not len(bbox) == len(score): 97 | raise ValueError('The length of score must be same as that of bbox') 98 | 99 | # Returns newly instantiated matplotlib.axes.Axes object if ax is None 100 | ax = vis_image(img, ax=ax) 101 | 102 | # If there is no bounding box to display, visualize the image and exit. 
103 | if len(bbox) == 0: 104 | return ax 105 | 106 | for i, bb in enumerate(bbox): 107 | xy = (bb[1], bb[0]) 108 | height = bb[2] - bb[0] 109 | width = bb[3] - bb[1] 110 | ax.add_patch(plot.Rectangle( 111 | xy, width, height, fill=False, edgecolor='red', linewidth=2)) 112 | 113 | caption = list() 114 | 115 | if label is not None and label_names is not None: 116 | lb = label[i] 117 | if not (-1 <= lb < len(label_names)): # modfy here to add backgroud 118 | raise ValueError('No corresponding name is given') 119 | caption.append(label_names[lb]) 120 | if score is not None: 121 | sc = score[i] 122 | caption.append('{:.2f}'.format(sc)) 123 | 124 | if len(caption) > 0: 125 | ax.text(bb[1], bb[0], 126 | ': '.join(caption), 127 | style='italic', 128 | bbox={'facecolor': 'white', 'alpha': 0.5, 'pad': 0}) 129 | return ax 130 | 131 | 132 | def fig2data(fig): 133 | """ 134 | brief Convert a Matplotlib figure to a 4D numpy array with RGBA 135 | channels and return it 136 | 137 | @param fig: a matplotlib figure 138 | @return a numpy 3D array of RGBA values 139 | """ 140 | # draw the renderer 141 | fig.canvas.draw() 142 | 143 | # Get the RGBA buffer from the figure 144 | w, h = fig.canvas.get_width_height() 145 | buf = np.fromstring(fig.canvas.tostring_argb(), dtype=np.uint8) 146 | buf.shape = (w, h, 4) 147 | 148 | # canvas.tostring_argb give pixmap in ARGB mode. Roll the ALPHA channel to have it in RGBA mode 149 | buf = np.roll(buf, 3, axis=2) 150 | return buf.reshape(h, w, 4) 151 | 152 | 153 | def fig4vis(fig): 154 | """ 155 | convert figure to ndarray 156 | """ 157 | ax = fig.get_figure() 158 | img_data = fig2data(ax).astype(np.int32) 159 | plot.close() 160 | # HWC->CHW 161 | return img_data[:, :, :3].transpose((2, 0, 1)) / 255. 162 | 163 | 164 | def visdom_bbox(*args, **kwargs): 165 | fig = vis_bbox(*args, **kwargs) 166 | data = fig4vis(fig) 167 | return data 168 | 169 | 170 | class Visualizer(object): 171 | """ 172 | wrapper for visdom 173 | you can still access naive visdom function by 174 | self.line, self.scater,self._send,etc. 175 | due to the implementation of `__getattr__` 176 | """ 177 | 178 | def __init__(self, env='default', **kwargs): 179 | self.vis = visdom.Visdom(env=env, **kwargs) 180 | self._vis_kw = kwargs 181 | 182 | # e.g.(’loss',23) the 23th value of loss 183 | self.index = {} 184 | self.log_text = '' 185 | 186 | def reinit(self, env='default', **kwargs): 187 | """ 188 | change the config of visdom 189 | """ 190 | self.vis = visdom.Visdom(env=env, **kwargs) 191 | return self 192 | 193 | def plot_many(self, d): 194 | """ 195 | plot multi values 196 | @params d: dict (name,value) i.e. ('loss',0.11) 197 | """ 198 | for k, v in d.items(): 199 | if v is not None: 200 | self.plot(k, v) 201 | 202 | def img_many(self, d): 203 | for k, v in d.items(): 204 | self.img(k, v) 205 | 206 | def plot(self, name, y, **kwargs): 207 | """ 208 | self.plot('loss',1.00) 209 | """ 210 | x = self.index.get(name, 0) 211 | self.vis.line(Y=np.array([y]), X=np.array([x]), 212 | win=name, 213 | opts=dict(title=name), 214 | update=None if x == 0 else 'append', 215 | **kwargs 216 | ) 217 | self.index[name] = x + 1 218 | 219 | def img(self, name, img_, **kwargs): 220 | """ 221 | self.img('input_img',t.Tensor(64,64)) 222 | self.img('input_imgs',t.Tensor(3,64,64)) 223 | self.img('input_imgs',t.Tensor(100,1,64,64)) 224 | self.img('input_imgs',t.Tensor(100,3,64,64),nrows=10) 225 | !!!don‘t ~~self.img('input_imgs',t.Tensor(100,64,64),nrows=10)~~!!! 
226 | """ 227 | self.vis.images(t.Tensor(img_).cpu().numpy(), 228 | win=name, 229 | opts=dict(title=name), 230 | **kwargs 231 | ) 232 | 233 | def log(self, info, win='log_text'): 234 | """ 235 | self.log({'loss':1,'lr':0.0001}) 236 | """ 237 | self.log_text += ('[{time}] {info}
'.format( 238 | time=time.strftime('%m%d_%H%M%S'), \ 239 | info=info)) 240 | self.vis.text(self.log_text, win) 241 | 242 | def __getattr__(self, name): 243 | return getattr(self.vis, name) 244 | 245 | def state_dict(self): 246 | return { 247 | 'index': self.index, 248 | 'vis_kw': self._vis_kw, 249 | 'log_text': self.log_text, 250 | 'env': self.vis.env 251 | } 252 | 253 | def load_state_dict(self, d): 254 | self.vis = visdom.Visdom(env=d.get('env', self.vis.env), **(self.d.get('vis_kw'))) 255 | self.log_text = d.get('log_text', '') 256 | self.index = d.get('index', dict()) 257 | return self 258 | -------------------------------------------------------------------------------- /losses.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from torch import nn 4 | import torch as t 5 | from torch.autograd import Variable 6 | import lib.array_tool as at 7 | from torch.nn import functional as F 8 | from config import opt 9 | from lib.bbox_tools import bbox_iou 10 | from lib.array_tool import tonumpy 11 | def _smooth_l1_loss(x, t, in_weight, sigma): 12 | sigma2 = sigma ** 2 13 | diff = in_weight * (x - t) 14 | abs_diff = diff.abs() 15 | flag = (abs_diff.data < (1. / sigma2)).float() 16 | flag = Variable(flag) 17 | y = (flag * (sigma2 / 2.) * (diff ** 2) + 18 | (1 - flag) * (abs_diff - 0.5 / sigma2)) 19 | return y.sum() 20 | 21 | 22 | def _fast_rcnn_loc_loss(pred_loc, gt_loc, gt_label, sigma): 23 | in_weight = t.zeros(gt_loc.shape).cuda() 24 | # Localization loss is calculated only for positive rois. 25 | # NOTE: unlike origin implementation, 26 | # we don't need inside_weight and outside_weight, they can calculate by gt_label 27 | in_weight[(gt_label > 0).view(-1, 1).expand_as(in_weight).cuda()] = 1 28 | loc_loss = _smooth_l1_loss(pred_loc, gt_loc, Variable(in_weight), sigma) 29 | # Normalize by total number of negtive and positive rois. 30 | loc_loss /= (gt_label >= 0).sum().float() # ignore gt_label==-1 for rpn_loss 31 | return loc_loss 32 | 33 | class RPNLoss(nn.Module): 34 | def __init__(self): 35 | super(RPNLoss, self).__init__() 36 | self.rpn_sigma = opt.rpn_sigma 37 | 38 | def forward(self, gt_rpn_loc,gt_rpn_label, rpn_locs, rpn_scores): 39 | # Since batch size is one, convert variables to singular form 40 | rpn_score = rpn_scores[0] 41 | rpn_loc = rpn_locs[0] 42 | 43 | 44 | # ------------------ RPN losses -------------------# 45 | 46 | gt_rpn_label = at.tovariable(gt_rpn_label).long() 47 | gt_rpn_loc = at.tovariable(gt_rpn_loc) 48 | rpn_loc_loss = _fast_rcnn_loc_loss( 49 | rpn_loc, 50 | gt_rpn_loc, 51 | gt_rpn_label.data, 52 | self.rpn_sigma) 53 | 54 | # NOTE: default value of ignore_index is -100 ... 
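        # Anchors that AnchorTargetCreator marked with -1 ("ignore") must not
        # contribute to the classification loss, so ignore_index is overridden
        # from its default of -100 to -1 in the call below.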
55 | rpn_cls_loss = F.cross_entropy(rpn_score, gt_rpn_label.cuda(), ignore_index=-1) 56 | return [rpn_loc_loss, rpn_cls_loss] 57 | 58 | class ROILoss(nn.Module): 59 | def __init__(self): 60 | super(ROILoss, self).__init__() 61 | self.roi_sigma = opt.roi_sigma 62 | def forward(self,gt_roi_loc, gt_roi_label,roi_cls_loc, roi_score): 63 | n_sample = roi_cls_loc.shape[0] 64 | roi_cls_loc = roi_cls_loc.view(n_sample, -1, 4) 65 | gt_roi_label = at.tovariable(gt_roi_label).long() 66 | gt_roi_loc = at.tovariable(gt_roi_loc) 67 | roi_loc = roi_cls_loc[t.arange(0, n_sample).long().cuda(), at.totensor(gt_roi_label).long()] 68 | 69 | roi_loc_loss = _fast_rcnn_loc_loss( 70 | roi_loc.contiguous(), 71 | gt_roi_loc, 72 | gt_roi_label.data, 73 | self.roi_sigma) 74 | 75 | roi_cls_loss = nn.CrossEntropyLoss()(roi_score, gt_roi_label.cuda()) 76 | return [roi_loc_loss,roi_cls_loss] 77 | 78 | class RelationNetworksLoss(nn.Module): 79 | def __init__(self): 80 | super(RelationNetworksLoss, self).__init__() 81 | 82 | def forward(self, gt_bboxes, gt_labels, nms_scores, sorted_labels, sorted_cls_bboxes): 83 | if nms_scores is None: 84 | return [1.] 85 | sorted_score, prob_argsort = t.sort(nms_scores, descending=True) 86 | sorted_cls_bboxes = sorted_cls_bboxes[prob_argsort] 87 | sorted_labels = sorted_labels[prob_argsort] 88 | sorted_labels = tonumpy(sorted_labels) 89 | gt_labels = tonumpy(gt_labels) 90 | 91 | nms_gt = t.zeros_like(sorted_score) 92 | 93 | eps = 1e-8 94 | 95 | iou = bbox_iou(tonumpy(gt_bboxes[0]), tonumpy(sorted_cls_bboxes)) 96 | for gt_idx in range(len(iou)): 97 | accept_iou = np.reshape(np.argwhere(iou[gt_idx] > 0.5),-1) 98 | accept_label = np.reshape(np.argwhere(sorted_labels[accept_iou] == gt_labels[0][gt_idx]),-1) 99 | 100 | if not(len(accept_label)==0): 101 | nms_gt[accept_iou[accept_label[0]]] = 1. 
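        # The expression below is a binary cross-entropy between the learned-NMS
        # scores and the 0/1 targets in `nms_gt` (eps guards against log(0));
        # only the highest-scoring detection matched to each ground-truth box
        # was given a target of 1 above.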
102 | 103 | loss = nms_gt * (sorted_score+ eps).log() + (1 - nms_gt) * (1-sorted_score + eps).log() 104 | loss = -loss.mean() 105 | return [loss] 106 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | import time 2 | import math 3 | import torch.utils.model_zoo as model_zoo 4 | import six 5 | 6 | from torch.nn import functional as F 7 | from losses import ROILoss, RPNLoss, RelationNetworksLoss 8 | from lib.nms import non_maximum_suppression 9 | from collections import namedtuple 10 | from string import Template 11 | import lib.array_tool as at 12 | from config import opt 13 | from data.dataset import preprocess, VGGpreprocess 14 | from lib.bbox_tools import loc2bbox 15 | 16 | import torch as t 17 | from torch.autograd import Function 18 | 19 | from lib.roi_cupy import kernel_backward, kernel_forward 20 | from lib.creator_tool import ProposalCreator, ProposalTargetCreator, AnchorTargetCreator 21 | from lib.relation_tool import PositionalEmbedding, RankEmbedding 22 | from torchvision.models import vgg16_bn,squeezenet1_1 23 | 24 | 25 | 26 | 27 | import torch 28 | import torch.nn as nn 29 | import numpy as np 30 | import cupy as cp 31 | 32 | 33 | Stream = namedtuple('Stream', ['ptr']) 34 | 35 | @cp.util.memoize(for_each_device=True) 36 | def load_kernel(kernel_name, code, **kwargs): 37 | cp.cuda.runtime.free(0) 38 | code = Template(code).substitute(**kwargs) 39 | kernel_code = cp.cuda.compile_with_cache(code) 40 | return kernel_code.get_function(kernel_name) 41 | 42 | CUDA_NUM_THREADS = 1024 43 | 44 | def GET_BLOCKS(N, K=CUDA_NUM_THREADS): 45 | return (N + K - 1) // K 46 | 47 | class SqueezeFRCN(nn.Module): 48 | feat_stride = 16 # downsample 16x for output of convolution squeeze 49 | def __init__(self, num_classes): 50 | super(SqueezeFRCN, self).__init__() 51 | self.loc_normalize_mean = (0., 0., 0., 0.) 52 | self.loc_normalize_std = (0.1, 0.1, 0.2, 0.2) 53 | self.n_class = num_classes +1 54 | self.training = False 55 | 56 | model = squeezenet1_1(pretrained=True) 57 | self.feature_extractor = model.features 58 | 59 | # freeze 60 | for layer in self.feature_extractor[:5]: 61 | for p in layer.parameters(): 62 | p.requires_grad = False 63 | 64 | self.rpn = RegionProposalNetwork(in_channels=512, mid_channels=512, feat_stride=self.feat_stride) 65 | self.roi_head = RoIHead(n_class=self.n_class, roi_size=7, spatial_scale=(1. 
/ self.feat_stride), 66 | in_channels=512, fc_features=512, n_relations=0) 67 | 68 | self.proposal_target_creator = ProposalTargetCreator() 69 | self.anchor_target_creator = AnchorTargetCreator() 70 | 71 | self.roiLoss = ROILoss() 72 | self.rpnLoss = RPNLoss() 73 | 74 | def forward(self,inputs,scale = 1.): 75 | if self.training: 76 | img_batch, bboxes, labels, _ = inputs 77 | else: 78 | img_batch = inputs 79 | 80 | _, _, H, W = img_batch.shape 81 | img_size = (H, W) 82 | start = time.time() 83 | features = self.feature_extractor(img_batch) 84 | rpn_locs, rpn_scores, rois, roi_indices, anchor = self.rpn(features, img_size, scale) 85 | if self.training: 86 | gt_rpn_loc, gt_rpn_label = self.anchor_target_creator( 87 | at.tonumpy(bboxes[0]), 88 | anchor, 89 | img_size) 90 | sample_roi, gt_roi_loc, gt_roi_label = self.proposal_target_creator( 91 | rois, 92 | at.tonumpy(bboxes[0]), 93 | at.tonumpy(labels[0]), 94 | self.loc_normalize_mean, 95 | self.loc_normalize_std) 96 | sample_roi_index = t.zeros(len(sample_roi)) 97 | 98 | roi_cls_loc, roi_score, appearance_features = self.roi_head(features, sample_roi, sample_roi_index) 99 | 100 | return gt_rpn_loc, gt_rpn_label, gt_roi_loc, gt_roi_label, roi_cls_loc, roi_score, rpn_locs, rpn_scores, \ 101 | sample_roi, roi_cls_loc, roi_score, appearance_features, img_size, labels, bboxes 102 | else: 103 | roi_cls_loc, roi_score, appearance_features = self.roi_head(features, rois, roi_indices) 104 | 105 | return roi_cls_loc, roi_score, rois, roi_indices, appearance_features, img_size 106 | def get_loss(self,inputs,isLearnNMS): 107 | gt_rpn_loc, gt_rpn_label, gt_roi_loc, gt_roi_label, roi_cls_loc, roi_score, rpn_locs, rpn_scores, \ 108 | sample_roi, roi_cls_loc, roi_score, appearance_features, img_size, labels, bboxes = self(inputs) 109 | if(isLearnNMS): 110 | rpn_loss = self.rpnLoss(gt_rpn_loc,gt_rpn_label, rpn_locs, rpn_scores) 111 | roi_loss = self.roiLoss(gt_roi_loc, gt_roi_label,roi_cls_loc, roi_score) 112 | nms_scores, sorted_labels, sorted_cls_bboxes = self.duplicate_remover(sample_roi, roi_cls_loc, roi_score, 113 | appearance_features, img_size) 114 | nms_loss = self.nmsLoss(bboxes, labels,nms_scores, sorted_labels, sorted_cls_bboxes) 115 | losses = rpn_loss+roi_loss+nms_loss 116 | losses = [sum(losses)]+losses 117 | return losses 118 | else: 119 | rpn_loss = self.rpnLoss(gt_rpn_loc, gt_rpn_label, rpn_locs, rpn_scores) 120 | roi_loss = self.roiLoss(gt_roi_loc, gt_roi_label, roi_cls_loc, roi_score) 121 | losses = rpn_loss + roi_loss 122 | losses = [sum(losses)]+losses 123 | return losses 124 | def predict(self, imgs, sizes=None, visualize=False): 125 | if visualize: 126 | self.use_preset(isTraining=False, preset='visualize') 127 | prepared_imgs = list() 128 | for img in imgs: 129 | size = img.shape[1:] 130 | img = VGGpreprocess(at.tonumpy(img)) 131 | prepared_imgs.append(img) 132 | else: 133 | self.use_preset(isTraining=False, preset='evaluate') 134 | prepared_imgs = imgs 135 | 136 | bboxes = list() 137 | labels = list() 138 | scores = list() 139 | for img in prepared_imgs: 140 | img = t.autograd.Variable(at.totensor(img).float()[None], volatile=True) 141 | size = img.shape[2:] 142 | scale = np.array(1.) 143 | roi_cls_loc, roi_scores, rois, _,_ ,_ = self(img, scale=scale) 144 | # We are assuming that batch size is 1. 145 | roi_score = roi_scores.data 146 | roi_cls_loc = roi_cls_loc.data 147 | 148 | roi = at.totensor(rois) 149 | 150 | # Convert predictions to bounding boxes in image coordinates. 
151 | # Bounding boxes are scaled to the scale of the input images. 152 | mean = t.Tensor(self.loc_normalize_mean).cuda(). \ 153 | repeat(self.n_class)[None] 154 | std = t.Tensor(self.loc_normalize_std).cuda(). \ 155 | repeat(self.n_class)[None] 156 | 157 | roi_cls_loc = (roi_cls_loc * std + mean) 158 | roi_cls_loc = roi_cls_loc.view(-1, self.n_class, 4) 159 | roi = roi.view(-1, 1, 4).expand_as(roi_cls_loc) 160 | cls_bbox = loc2bbox(at.tonumpy(roi).reshape((-1, 4)), 161 | at.tonumpy(roi_cls_loc).reshape((-1, 4))) 162 | cls_bbox = at.totensor(cls_bbox) 163 | cls_bbox = cls_bbox.view(-1, self.n_class * 4) 164 | # clip bounding box 165 | cls_bbox[:, 0::2] = (cls_bbox[:, 0::2]).clamp(min=0, max=size[0]) 166 | cls_bbox[:, 1::2] = (cls_bbox[:, 1::2]).clamp(min=0, max=size[1]) 167 | 168 | prob = at.tonumpy(F.softmax(at.tovariable(roi_score), dim=1)) 169 | 170 | raw_cls_bbox = at.tonumpy(cls_bbox) 171 | raw_prob = at.tonumpy(prob) 172 | 173 | bbox, label, score = self._suppress(raw_cls_bbox, raw_prob) 174 | bboxes.append(bbox) 175 | labels.append(label) 176 | scores.append(score) 177 | 178 | return bboxes, labels, scores 179 | 180 | def _suppress(self, raw_cls_bbox, raw_prob): 181 | bbox = list() 182 | label = list() 183 | score = list() 184 | # skip cls_id = 0 because it is the background class 185 | for l in range(1, self.n_class): 186 | cls_bbox_l = raw_cls_bbox.reshape((-1, self.n_class, 4))[:, l, :] 187 | prob_l = raw_prob[:, l] 188 | mask = prob_l > self.score_thresh 189 | cls_bbox_l = cls_bbox_l[mask] 190 | prob_l = prob_l[mask] 191 | keep = non_maximum_suppression( 192 | cp.array(cls_bbox_l), self.nms_thresh, prob_l) 193 | keep = cp.asnumpy(keep) 194 | bbox.append(cls_bbox_l[keep]) 195 | # The labels are in [0, self.n_class - 2]. 196 | label.append((l - 1) * np.ones((len(keep),))) 197 | score.append(prob_l[keep]) 198 | bbox = np.concatenate(bbox, axis=0).astype(np.float32) 199 | label = np.concatenate(label, axis=0).astype(np.int32) 200 | score = np.concatenate(score, axis=0).astype(np.float32) 201 | return bbox, label, score 202 | def freeze_bn(self): 203 | '''Freeze BatchNorm layers.''' 204 | for layer in self.modules(): 205 | if isinstance(layer, nn.BatchNorm2d): 206 | layer.eval() 207 | def use_preset(self,isTraining,preset='visualize'): 208 | if preset == 'visualize': 209 | self.nms_thresh = 0.3 210 | self.score_thresh = 0.7 211 | elif preset == 'evaluate': 212 | self.nms_thresh = 0.3 213 | self.score_thresh = 0.05 214 | self.training=isTraining 215 | def get_optimizer(self): 216 | """ 217 | return optimizer, It could be overwriten if you want to specify 218 | special optimizer 219 | """ 220 | lr = opt.lr 221 | params = [] 222 | for key, value in dict(self.named_parameters()).items(): 223 | if value.requires_grad: 224 | if 'bias' in key: 225 | params += [{'params': [value], 'lr': lr * 2, 'weight_decay': 0}] 226 | else: 227 | params += [{'params': [value], 'lr': lr, 'weight_decay': opt.weight_decay}] 228 | if(opt.use_adam): 229 | optimizer = t.optim.Adam(params) 230 | else: 231 | optimizer = t.optim.SGD(params,momentum = 0.9) 232 | return optimizer 233 | class VGGFRCN(nn.Module): 234 | feat_stride = 16 # downsample 16x for output of conv5 in vgg16 235 | def __init__(self, num_classes): 236 | super(VGGFRCN, self).__init__() 237 | self.loc_normalize_mean = (0., 0., 0., 0.) 
238 | self.loc_normalize_std = (0.1, 0.1, 0.2, 0.2) 239 | self.n_class = num_classes+1 240 | self.training = False 241 | model = vgg16_bn(pretrained=True) 242 | self.feature_extractor = model.features[:43] 243 | # freeze top4 conv 244 | for layer in self.feature_extractor[:14]: 245 | for p in layer.parameters(): 246 | p.requires_grad = False 247 | 248 | classifier = model.classifier 249 | del classifier[6] 250 | del classifier[5] 251 | del classifier[2] 252 | classifier = nn.Sequential(*classifier) 253 | 254 | self.rpn = RegionProposalNetwork(in_channels=512, mid_channels=512, feat_stride=self.feat_stride) 255 | 256 | self.roi_head = RoIHead(n_class=self.n_class, roi_size=7, spatial_scale=(1. / self.feat_stride), n_relations=0, 257 | in_channels=512, fc_features=4096, classifier = classifier) 258 | 259 | self.proposal_target_creator = ProposalTargetCreator() 260 | self.anchor_target_creator = AnchorTargetCreator() 261 | 262 | self.roiLoss = ROILoss() 263 | self.rpnLoss = RPNLoss() 264 | self.freeze_bn() 265 | def forward(self,inputs, scale=1.): 266 | if self.training: 267 | img_batch, bboxes, labels, _ = inputs 268 | else: 269 | img_batch = inputs 270 | 271 | _, _, H, W = img_batch.shape 272 | img_size = (H, W) 273 | 274 | features = self.feature_extractor(img_batch) 275 | rpn_locs, rpn_scores, rois, roi_indices, anchor = self.rpn(features, img_size, scale) 276 | 277 | if self.training: 278 | gt_rpn_loc, gt_rpn_label = self.anchor_target_creator( 279 | at.tonumpy(bboxes[0]), 280 | anchor, 281 | img_size) 282 | sample_roi, gt_roi_loc, gt_roi_label = self.proposal_target_creator( 283 | rois, 284 | at.tonumpy(bboxes[0]), 285 | at.tonumpy(labels[0]), 286 | self.loc_normalize_mean, 287 | self.loc_normalize_std) 288 | sample_roi_index = t.zeros(len(sample_roi)) 289 | 290 | roi_cls_loc, roi_score, appearance_features = self.roi_head(features, sample_roi, sample_roi_index) 291 | 292 | return gt_rpn_loc, gt_rpn_label, gt_roi_loc, gt_roi_label, roi_cls_loc, roi_score, rpn_locs, rpn_scores, \ 293 | sample_roi, roi_cls_loc, roi_score, appearance_features, img_size, labels, bboxes 294 | 295 | else: 296 | roi_cls_loc, roi_score, appearance_features = self.roi_head(features, rois, roi_indices) 297 | 298 | return roi_cls_loc, roi_score, rois, roi_indices, appearance_features, img_size 299 | 300 | def get_loss(self,inputs,isLearnNMS): 301 | gt_rpn_loc, gt_rpn_label, gt_roi_loc, gt_roi_label, roi_cls_loc, roi_score, rpn_locs, rpn_scores, \ 302 | sample_roi, roi_cls_loc, roi_score, appearance_features, img_size, labels, bboxes = self(inputs) 303 | if(isLearnNMS): 304 | rpn_loss = self.rpnLoss(gt_rpn_loc,gt_rpn_label, rpn_locs, rpn_scores) 305 | roi_loss = self.roiLoss(gt_roi_loc, gt_roi_label,roi_cls_loc, roi_score) 306 | nms_scores, sorted_labels, sorted_cls_bboxes = self.duplicate_remover(sample_roi, roi_cls_loc, roi_score, 307 | appearance_features, img_size) 308 | nms_loss = self.nmsLoss(bboxes, labels,nms_scores, sorted_labels, sorted_cls_bboxes) 309 | losses = rpn_loss+roi_loss+nms_loss 310 | losses = [sum(losses)]+losses 311 | return losses 312 | else: 313 | rpn_loss = self.rpnLoss(gt_rpn_loc, gt_rpn_label, rpn_locs, rpn_scores) 314 | roi_loss = self.roiLoss(gt_roi_loc, gt_roi_label, roi_cls_loc, roi_score) 315 | losses = rpn_loss + roi_loss 316 | losses = [sum(losses)]+losses 317 | return losses 318 | def predict(self, imgs, sizes=None, visualize=False): 319 | if visualize: 320 | self.use_preset(isTraining=False, preset='visualize') 321 | prepared_imgs = list() 322 | for img in imgs: 323 | size = 
img.shape[1:] 324 | img = VGGpreprocess(at.tonumpy(img)) 325 | prepared_imgs.append(img) 326 | else: 327 | self.use_preset(isTraining=False, preset='evaluate') 328 | prepared_imgs = imgs 329 | 330 | bboxes = list() 331 | labels = list() 332 | scores = list() 333 | for img in prepared_imgs: 334 | img = t.autograd.Variable(at.totensor(img).float()[None], volatile=True) 335 | size = img.shape[2:] 336 | scale = np.array(1.) 337 | roi_cls_loc, roi_scores, rois, _,_ ,_ = self(img, scale=scale) 338 | # We are assuming that batch size is 1. 339 | roi_score = roi_scores.data 340 | roi_cls_loc = roi_cls_loc.data 341 | 342 | roi = at.totensor(rois) 343 | 344 | # Convert predictions to bounding boxes in image coordinates. 345 | # Bounding boxes are scaled to the scale of the input images. 346 | mean = t.Tensor(self.loc_normalize_mean).cuda(). \ 347 | repeat(self.n_class)[None] 348 | std = t.Tensor(self.loc_normalize_std).cuda(). \ 349 | repeat(self.n_class)[None] 350 | 351 | roi_cls_loc = (roi_cls_loc * std + mean) 352 | roi_cls_loc = roi_cls_loc.view(-1, self.n_class, 4) 353 | roi = roi.view(-1, 1, 4).expand_as(roi_cls_loc) 354 | cls_bbox = loc2bbox(at.tonumpy(roi).reshape((-1, 4)), 355 | at.tonumpy(roi_cls_loc).reshape((-1, 4))) 356 | cls_bbox = at.totensor(cls_bbox) 357 | cls_bbox = cls_bbox.view(-1, self.n_class * 4) 358 | # clip bounding box 359 | cls_bbox[:, 0::2] = (cls_bbox[:, 0::2]).clamp(min=0, max=size[0]) 360 | cls_bbox[:, 1::2] = (cls_bbox[:, 1::2]).clamp(min=0, max=size[1]) 361 | 362 | prob = at.tonumpy(F.softmax(at.tovariable(roi_score), dim=1)) 363 | 364 | raw_cls_bbox = at.tonumpy(cls_bbox) 365 | raw_prob = at.tonumpy(prob) 366 | 367 | bbox, label, score = self._suppress(raw_cls_bbox, raw_prob) 368 | bboxes.append(bbox) 369 | labels.append(label) 370 | scores.append(score) 371 | 372 | return bboxes, labels, scores 373 | 374 | def _suppress(self, raw_cls_bbox, raw_prob): 375 | bbox = list() 376 | label = list() 377 | score = list() 378 | # skip cls_id = 0 because it is the background class 379 | for l in range(1, self.n_class): 380 | cls_bbox_l = raw_cls_bbox.reshape((-1, self.n_class, 4))[:, l, :] 381 | prob_l = raw_prob[:, l] 382 | mask = prob_l > self.score_thresh 383 | cls_bbox_l = cls_bbox_l[mask] 384 | prob_l = prob_l[mask] 385 | keep = non_maximum_suppression( 386 | cp.array(cls_bbox_l), self.nms_thresh, prob_l) 387 | keep = cp.asnumpy(keep) 388 | bbox.append(cls_bbox_l[keep]) 389 | # The labels are in [0, self.n_class - 2]. 
390 | label.append((l - 1) * np.ones((len(keep),))) 391 | score.append(prob_l[keep]) 392 | bbox = np.concatenate(bbox, axis=0).astype(np.float32) 393 | label = np.concatenate(label, axis=0).astype(np.int32) 394 | score = np.concatenate(score, axis=0).astype(np.float32) 395 | return bbox, label, score 396 | def freeze_bn(self): 397 | '''Freeze BatchNorm layers.''' 398 | for layer in self.modules(): 399 | if isinstance(layer, nn.BatchNorm2d): 400 | layer.eval() 401 | def use_preset(self,isTraining,preset='visualize'): 402 | if preset == 'visualize': 403 | self.nms_thresh = 0.3 404 | self.score_thresh = 0.7 405 | elif preset == 'evaluate': 406 | self.nms_thresh = 0.3 407 | self.score_thresh = 0.05 408 | self.training=isTraining 409 | def get_optimizer(self): 410 | """ 411 | return optimizer, It could be overwriten if you want to specify 412 | special optimizer 413 | """ 414 | lr = opt.lr 415 | params = [] 416 | for key, value in dict(self.named_parameters()).items(): 417 | if value.requires_grad: 418 | if 'bias' in key: 419 | params += [{'params': [value], 'lr': lr * 2, 'weight_decay': 0}] 420 | else: 421 | params += [{'params': [value], 'lr': lr, 'weight_decay': opt.weight_decay}] 422 | if(opt.use_adam): 423 | optimizer = t.optim.Adam(params) 424 | else: 425 | optimizer = t.optim.SGD(params,momentum = 0.9) 426 | return optimizer 427 | 428 | # class ResFRCN(nn.Module): 429 | # feat_stride = 16 # downsample 32x for output of convolution resnet 430 | # def __init__(self, num_classes, block, layers): 431 | # self.training=False 432 | # self.inplanes = 64 433 | # self.loc_normalize_mean = (0., 0., 0., 0.) 434 | # self.loc_normalize_std = (0.1, 0.1, 0.2, 0.2) 435 | # self.n_class = num_classes+1 436 | # 437 | # super(ResFRCN, self).__init__() 438 | # self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False) 439 | # self.bn1 = nn.BatchNorm2d(64) 440 | # self.relu = nn.ReLU(inplace=True) 441 | # self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 442 | # self.layer1 = self._make_layer(block, 64, layers[0]) 443 | # self.layer2 = self._make_layer(block, 128, layers[1], stride=2) 444 | # self.layer3 = self._make_layer(block, 256, layers[2], stride=2) 445 | # self.layer4 = self._make_layer(block, 512, layers[3], stride=2) 446 | # 447 | # if block == BasicBlock: 448 | # fpn_sizes = [self.layer2[layers[1]-1].conv2.out_channels, self.layer3[layers[2]-1].conv2.out_channels, 449 | # self.layer4[layers[3]-1].conv2.out_channels] 450 | # self.conv2 = nn.Conv2d(self.layer4[layers[3]-1].conv2.out_channels, 512, kernel_size=1, stride=1, bias=False) 451 | # elif block == Bottleneck: 452 | # fpn_sizes = [self.layer2[layers[1]-1].conv3.out_channels, self.layer3[layers[2]-1].conv3.out_channels, 453 | # self.layer4[layers[3]-1].conv3.out_channels] 454 | # self.conv2 = nn.Conv2d(self.layer4[layers[3]-1].conv3.out_channels, 512, kernel_size=1, stride=1, bias=False) 455 | # 456 | # #self.fpn = PyramidFeatures(fpn_sizes[0], fpn_sizes[1], fpn_sizes[2],feature_size = 512) 457 | # 458 | # self.rpn = RegionProposalNetwork(in_channels=512,mid_channels=512,feat_stride = self.feat_stride) 459 | # self.roi_head = RoIHead(n_class = num_classes+1,roi_size=7,spatial_scale=(1. 
/ self.feat_stride), 460 | # in_channels=512,fc_features = 1024, n_relations= 16) 461 | # self.duplicate_remover = DuplicationRemovalNetwork(n_relations=16,appearance_feature_dim=1024, 462 | # num_classes=num_classes) 463 | # self.proposal_target_creator = ProposalTargetCreator() 464 | # self.anchor_target_creator = AnchorTargetCreator() 465 | # 466 | # self.roiLoss = ROILoss() 467 | # self.rpnLoss = RPNLoss() 468 | # self.nmsLoss = RelationNetworksLoss() 469 | # for m in self.modules(): 470 | # if isinstance(m, nn.Conv2d): 471 | # n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 472 | # m.weight.data.normal_(0, math.sqrt(2. / n)) 473 | # elif isinstance(m, nn.BatchNorm2d): 474 | # m.weight.data.fill_(1) 475 | # m.bias.data.zero_() 476 | # 477 | # 478 | # self.freeze_bn() 479 | # def use_preset(self,isTraining,preset='visualize'): 480 | # if preset == 'visualize': 481 | # self.nms_thresh = 0.3 482 | # self.score_thresh = 0.7 483 | # elif preset == 'evaluate': 484 | # self.nms_thresh = 0.3 485 | # self.score_thresh = 0.5 486 | # self.training=isTraining 487 | # def _make_layer(self, block, planes, blocks, stride=1): 488 | # downsample = None 489 | # if stride != 1 or self.inplanes != planes * block.expansion: 490 | # downsample = nn.Sequential( 491 | # nn.Conv2d(self.inplanes, planes * block.expansion, 492 | # kernel_size=1, stride=stride, bias=False), 493 | # nn.BatchNorm2d(planes * block.expansion), 494 | # ) 495 | # layers = [] 496 | # layers.append(block(self.inplanes, planes, stride, downsample)) 497 | # self.inplanes = planes * block.expansion 498 | # for i in range(1, blocks): 499 | # layers.append(block(self.inplanes, planes)) 500 | # return nn.Sequential(*layers) 501 | # def freeze_bn(self): 502 | # '''Freeze BatchNorm layers.''' 503 | # for layer in self.modules(): 504 | # if isinstance(layer, nn.BatchNorm2d): 505 | # layer.eval() 506 | # def forward(self, inputs, scale=1.): 507 | # if self.training: 508 | # img_batch, bboxes, labels, _ = inputs 509 | # else: 510 | # img_batch = inputs 511 | # 512 | # _, _, H, W = img_batch.shape 513 | # img_size = (H, W) 514 | # x = self.conv1(img_batch) 515 | # x = self.bn1(x) 516 | # x = self.relu(x) 517 | # x = self.maxpool(x) 518 | # x1 = self.layer1(x) 519 | # x2 = self.layer2(x1) 520 | # x3 = self.layer3(x2) 521 | # x4 = self.layer4(x3) 522 | # 523 | # #features = self.fpn([x2, x3, x4]) 524 | # features = self.conv2(x4) 525 | # rpn_locs, rpn_scores, rois, roi_indices, anchor = self.rpn(features,img_size,scale) 526 | # 527 | # if self.training: 528 | # gt_rpn_loc, gt_rpn_label = self.anchor_target_creator( 529 | # at.tonumpy(bboxes[0]), 530 | # anchor, 531 | # img_size) 532 | # sample_roi, gt_roi_loc, gt_roi_label = self.proposal_target_creator( 533 | # rois, 534 | # at.tonumpy(bboxes[0]), 535 | # at.tonumpy(labels[0]), 536 | # self.loc_normalize_mean, 537 | # self.loc_normalize_std) 538 | # sample_roi_index = t.zeros(len(sample_roi)) 539 | # 540 | # roi_cls_loc, roi_score, appearance_features = self.roi_head(features, sample_roi, sample_roi_index) 541 | # 542 | # return gt_rpn_loc, gt_rpn_label, gt_roi_loc, gt_roi_label, roi_cls_loc, roi_score, rpn_locs, rpn_scores,\ 543 | # sample_roi, roi_cls_loc, roi_score, appearance_features, img_size,labels, bboxes 544 | # 545 | # else: 546 | # roi_cls_loc, roi_score, appearance_features = self.roi_head(features, rois, roi_indices) 547 | # 548 | # return roi_cls_loc,roi_score, rois, roi_indices, appearance_features, img_size 549 | # 550 | # def _suppress(self, raw_cls_bbox, raw_prob): 551 | # 
bbox = list() 552 | # label = list() 553 | # score = list() 554 | # # skip cls_id = 0 because it is the background class 555 | # for l in range(1, self.n_class): 556 | # cls_bbox_l = raw_cls_bbox.reshape((-1, self.n_class, 4))[:, l, :] 557 | # prob_l = raw_prob[:, l] 558 | # mask = prob_l > self.score_thresh 559 | # cls_bbox_l = cls_bbox_l[mask] 560 | # prob_l = prob_l[mask] 561 | # keep = non_maximum_suppression( 562 | # cp.array(cls_bbox_l), self.nms_thresh, prob_l) 563 | # keep = cp.asnumpy(keep) 564 | # bbox.append(cls_bbox_l[keep]) 565 | # # The labels are in [0, self.n_class - 2]. 566 | # label.append((l - 1) * np.ones((len(keep),))) 567 | # score.append(prob_l[keep]) 568 | # bbox = np.concatenate(bbox, axis=0).astype(np.float32) 569 | # label = np.concatenate(label, axis=0).astype(np.int32) 570 | # score = np.concatenate(score, axis=0).astype(np.float32) 571 | # return bbox, label, score 572 | # def predict(self, imgs, sizes=None, visualize=False): 573 | # if visualize: 574 | # self.use_preset(isTraining=False, preset='visualize') 575 | # prepared_imgs = list() 576 | # sizes = list() 577 | # for img in imgs: 578 | # size = img.shape[1:] 579 | # img = preprocess(at.tonumpy(img)) 580 | # prepared_imgs.append(img) 581 | # sizes.append(size) 582 | # else: 583 | # self.use_preset(isTraining=False, preset='evaluate') 584 | # prepared_imgs = imgs 585 | # bboxes = list() 586 | # labels = list() 587 | # scores = list() 588 | # for img, size in zip(prepared_imgs, sizes): 589 | # img = t.autograd.Variable(at.totensor(img).float()[None], volatile=True) 590 | # scale = img.shape[3] / size[1] 591 | # roi_cls_loc, roi_scores, rois, _ = self(img, scale=scale) 592 | # # We are assuming that batch size is 1. 593 | # roi_score = roi_scores.data 594 | # roi_cls_loc = roi_cls_loc.data 595 | # if visualize: 596 | # roi = at.totensor(rois) / scale 597 | # else: 598 | # roi = at.totensor(rois) / scale.cuda().float() 599 | # 600 | # # Convert predictions to bounding boxes in image coordinates. 601 | # # Bounding boxes are scaled to the scale of the input images. 602 | # mean = t.Tensor(self.loc_normalize_mean).cuda(). \ 603 | # repeat(self.n_class)[None] 604 | # std = t.Tensor(self.loc_normalize_std).cuda(). 
\ 605 | # repeat(self.n_class)[None] 606 | # 607 | # roi_cls_loc = (roi_cls_loc * std + mean) 608 | # roi_cls_loc = roi_cls_loc.view(-1, self.n_class, 4) 609 | # roi = roi.view(-1, 1, 4).expand_as(roi_cls_loc) 610 | # cls_bbox = loc2bbox(at.tonumpy(roi).reshape((-1, 4)), 611 | # at.tonumpy(roi_cls_loc).reshape((-1, 4))) 612 | # cls_bbox = at.totensor(cls_bbox) 613 | # cls_bbox = cls_bbox.view(-1, self.n_class * 4) 614 | # # clip bounding box 615 | # cls_bbox[:, 0::2] = (cls_bbox[:, 0::2]).clamp(min=0, max=size[0]) 616 | # cls_bbox[:, 1::2] = (cls_bbox[:, 1::2]).clamp(min=0, max=size[1]) 617 | # 618 | # prob = at.tonumpy(F.softmax(at.tovariable(roi_score), dim=1)) 619 | # 620 | # raw_cls_bbox = at.tonumpy(cls_bbox) 621 | # raw_prob = at.tonumpy(prob) 622 | # 623 | # bbox, label, score = self._suppress(raw_cls_bbox, raw_prob) 624 | # bboxes.append(bbox) 625 | # labels.append(label) 626 | # scores.append(score) 627 | # 628 | # # self.use_preset('evaluate') 629 | # # self.train() 630 | # return bboxes, labels, scores 631 | # 632 | # def get_loss(self,inputs,isLearnNMS): 633 | # gt_rpn_loc, gt_rpn_label, gt_roi_loc, gt_roi_label, roi_cls_loc, roi_score, rpn_locs, rpn_scores, \ 634 | # sample_roi, roi_cls_loc, roi_score, appearance_features, img_size, labels, bboxes = self(inputs) 635 | # if(isLearnNMS): 636 | # rpn_loss = self.rpnLoss(gt_rpn_loc,gt_rpn_label, rpn_locs, rpn_scores) 637 | # roi_loss = self.roiLoss(gt_roi_loc, gt_roi_label,roi_cls_loc, roi_score) 638 | # nms_scores, sorted_labels, sorted_cls_bboxes = self.duplicate_remover(sample_roi, roi_cls_loc, roi_score, 639 | # appearance_features, img_size) 640 | # nms_loss = self.nmsLoss(bboxes, labels,nms_scores, sorted_labels, sorted_cls_bboxes) 641 | # losses = rpn_loss+roi_loss+nms_loss 642 | # losses = [sum(losses)]+losses 643 | # return losses 644 | # else: 645 | # rpn_loss = self.rpnLoss(gt_rpn_loc, gt_rpn_label, rpn_locs, rpn_scores) 646 | # roi_loss = self.roiLoss(gt_roi_loc, gt_roi_label, roi_cls_loc, roi_score) 647 | # losses = rpn_loss + roi_loss 648 | # losses = [sum(losses)]+losses 649 | # return losses 650 | 651 | class RegionProposalNetwork(nn.Module): 652 | """Region Proposal Network introduced in Faster R-CNN. 653 | 654 | This is Region Proposal Network introduced in Faster R-CNN [#]_. 655 | This takes features extracted from images and propose 656 | class agnostic bounding boxes around "objects". 657 | 658 | .. [#] Shaoqing Ren, Kaiming He, Ross Girshick, Jian Sun. \ 659 | Faster R-CNN: Towards Real-Time Object Detection with \ 660 | Region Proposal Networks. NIPS 2015. 661 | 662 | Args: 663 | in_channels (int): The channel size of input. 664 | mid_channels (int): The channel size of the intermediate tensor. 665 | ratios (list of floats): This is ratios of width to height of 666 | the anchors. 667 | anchor_scales (list of numbers): This is areas of anchors. 668 | Those areas will be the product of the square of an element in 669 | :obj:`anchor_scales` and the original area of the reference 670 | window. 671 | feat_stride (int): Stride size after extracting features from an 672 | image. 673 | initialW (callable): Initial weight value. If :obj:`None` then this 674 | function uses Gaussian distribution scaled by 0.1 to 675 | initialize weight. 676 | May also be a callable that takes an array and edits its values. 677 | proposal_creator_params (dict): Key valued paramters for 678 | :class:`model.utils.creator_tools.ProposalCreator`. 679 | 680 | .. 
seealso:: 681 | :class:`~model.utils.creator_tools.ProposalCreator` 682 | 683 | """ 684 | 685 | def __init__( 686 | self, in_channels=256, mid_channels=256, ratios=[0.5, 1, 2], 687 | anchor_scales=[8, 16, 32], feat_stride=32, 688 | proposal_creator_params=dict(), 689 | ): 690 | super(RegionProposalNetwork, self).__init__() 691 | self.anchor_base = self.generate_anchor_base( 692 | anchor_scales=anchor_scales, ratios=ratios) 693 | self.feat_stride = feat_stride 694 | self.proposal_layer = ProposalCreator(self, **proposal_creator_params) 695 | n_anchor = self.anchor_base.shape[0] 696 | 697 | self.conv1 = nn.Conv2d(in_channels, mid_channels, 3, 1, 1) 698 | self.score = nn.Conv2d(mid_channels, n_anchor * 2, 1, 1, 0) 699 | self.loc = nn.Conv2d(mid_channels, n_anchor * 4, 1, 1, 0) 700 | 701 | def forward(self, x, img_size, scale=1.): 702 | """Forward Region Proposal Network. 703 | 704 | The following notation is used. 705 | 706 | * :math:`N` is the batch size. 707 | * :math:`C` is the channel size of the input. 708 | * :math:`H` and :math:`W` are the height and width of the input feature. 709 | * :math:`A` is the number of anchors assigned to each pixel. 710 | 711 | Args: 712 | x (~torch.autograd.Variable): The features extracted from images. 713 | Its shape is :math:`(N, C, H, W)`. 714 | img_size (tuple of ints): A tuple :obj:`height, width`, 715 | which contains image size after scaling. 716 | scale (float): The amount of scaling done to the input images after 717 | reading them from files. 718 | 719 | Returns: 720 | (~torch.autograd.Variable, ~torch.autograd.Variable, array, array, array): 721 | 722 | This is a tuple of the following five values. 723 | 724 | * **rpn_locs**: Predicted bounding box offsets and scales for \ 725 | anchors. Its shape is :math:`(N, H W A, 4)`. 726 | * **rpn_scores**: Predicted foreground scores for \ 727 | anchors. Its shape is :math:`(N, H W A, 2)`. 728 | * **rois**: A bounding box array containing coordinates of \ 729 | proposal boxes. This is a concatenation of bounding box \ 730 | arrays from multiple images in the batch. \ 731 | Its shape is :math:`(R', 4)`. Given :math:`R_i` predicted \ 732 | bounding boxes from the :math:`i` th image, \ 733 | :math:`R' = \\sum _{i=1} ^ N R_i`. 734 | * **roi_indices**: An array containing indices of the images to \ 735 | which the RoIs correspond. Its shape is :math:`(R',)`. 736 | * **anchor**: Coordinates of enumerated shifted anchors. \ 737 | Its shape is :math:`(H W A, 4)`.
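For example, with the default three ratios and three scales (:math:`A = 9`) and :obj:`feat_stride = 16` (the value :class:`VGGFRCN` passes in), a :math:`600 \\times 800` input gives a :math:`37 \\times 50` feature map and therefore :math:`37 \\times 50 \\times 9 = 16650` anchors.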
738 | 739 | """ 740 | n, _, hh, ww = x.shape 741 | anchor = self._enumerate_shifted_anchor_torch( 742 | np.array(self.anchor_base), 743 | self.feat_stride, hh, ww) 744 | n_anchor = anchor.shape[0] // (hh * ww) 745 | h = F.relu(self.conv1(x)) 746 | 747 | rpn_locs = self.loc(h) 748 | # UNNOTE: check whether need contiguous 749 | # A: Yes 750 | rpn_locs = rpn_locs.permute(0, 2, 3, 1).contiguous().view(n, -1, 4) 751 | 752 | rpn_scores = self.score(h) 753 | rpn_scores = rpn_scores.permute(0, 2, 3, 1).contiguous() 754 | rpn_fg_scores = \ 755 | rpn_scores.view(n, hh, ww, n_anchor, 2)[:, :, :, :, 1].contiguous() 756 | rpn_fg_scores = rpn_fg_scores.view(n, -1) 757 | rpn_scores = rpn_scores.view(n, -1, 2) 758 | 759 | rois = list() 760 | roi_indices = list() 761 | 762 | for i in range(n): 763 | roi = self.proposal_layer( 764 | rpn_locs[i].cpu().data.numpy(), 765 | rpn_fg_scores[i].cpu().data.numpy(), 766 | anchor, img_size, 767 | scale=scale) 768 | batch_index = i * np.ones((len(roi),), dtype=np.int32) 769 | rois.append(roi) 770 | roi_indices.append(batch_index) 771 | 772 | rois = np.concatenate(rois, axis=0) 773 | roi_indices = np.concatenate(roi_indices, axis=0) 774 | return rpn_locs, rpn_scores, rois, roi_indices, anchor 775 | 776 | def generate_anchor_base(self,base_size=16, ratios=[0.5, 1, 2], 777 | anchor_scales=[8, 16, 32]): 778 | """Generate anchor base windows by enumerating aspect ratio and scales. 779 | 780 | Generate anchors that are scaled and modified to the given aspect ratios. 781 | Area of a scaled anchor is preserved when modifying to the given aspect 782 | ratio. 783 | 784 | :obj:`R = len(ratios) * len(anchor_scales)` anchors are generated by this 785 | function. 786 | The :obj:`i * len(anchor_scales) + j` th anchor corresponds to an anchor 787 | generated by :obj:`ratios[i]` and :obj:`anchor_scales[j]`. 788 | 789 | For example, if the scale is :math:`8` and the ratio is :math:`0.25`, 790 | the width and the height of the base window will be stretched by :math:`8`. 791 | For modifying the anchor to the given aspect ratio, 792 | the height is halved and the width is doubled. 793 | 794 | Args: 795 | base_size (number): The width and the height of the reference window. 796 | ratios (list of floats): This is ratios of width to height of 797 | the anchors. 798 | anchor_scales (list of numbers): This is areas of anchors. 799 | Those areas will be the product of the square of an element in 800 | :obj:`anchor_scales` and the original area of the reference 801 | window. 802 | 803 | Returns: 804 | ~numpy.ndarray: 805 | An array of shape :math:`(R, 4)`. 806 | Each element is a set of coordinates of a bounding box. 807 | The second axis corresponds to 808 | :math:`(y_{min}, x_{min}, y_{max}, x_{max})` of a bounding box. 809 | 810 | """ 811 | py = base_size / 2. 812 | px = base_size / 2. 813 | 814 | anchor_base = np.zeros((len(ratios) * len(anchor_scales), 4), 815 | dtype=np.float32) 816 | for i in six.moves.range(len(ratios)): 817 | for j in six.moves.range(len(anchor_scales)): 818 | h = base_size * anchor_scales[j] * np.sqrt(ratios[i]) 819 | w = base_size * anchor_scales[j] * np.sqrt(1. / ratios[i]) 820 | 821 | index = i * len(anchor_scales) + j 822 | anchor_base[index, 0] = py - h / 2. 823 | anchor_base[index, 1] = px - w / 2. 824 | anchor_base[index, 2] = py + h / 2. 825 | anchor_base[index, 3] = px + w / 2. 
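# Note: h * w = (base_size * anchor_scales[j]) ** 2 and h / w = ratios[i], so each anchor
# preserves the scaled reference area while taking the requested aspect ratio. For example,
# with base_size=16, ratio=0.5 and scale=8 the first row of anchor_base is roughly
# (y_min, x_min, y_max, x_max) = (-37.25, -82.51, 53.25, 98.51).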
826 | return anchor_base 827 | 828 | def _enumerate_shifted_anchor_torch(self,anchor_base, feat_stride, height, width): 829 | # Enumerate all shifted anchors: 830 | # 831 | # add A anchors (1, A, 4) to 832 | # cell K shifts (K, 1, 4) to get 833 | # shift anchors (K, A, 4) 834 | # reshape to (K*A, 4) shifted anchors 835 | # return (K*A, 4) 836 | 837 | # !TODO: add support for torch.CudaTensor 838 | # xp = cuda.get_array_module(anchor_base) 839 | shift_y = t.arange(0, height * feat_stride, feat_stride) 840 | shift_x = t.arange(0, width * feat_stride, feat_stride) 841 | shift_x, shift_y = np.meshgrid(shift_x, shift_y) 842 | shift = np.stack((shift_y.ravel(), shift_x.ravel(), 843 | shift_y.ravel(), shift_x.ravel()), axis=1) 844 | 845 | A = anchor_base.shape[0] 846 | K = shift.shape[0] 847 | anchor = anchor_base.reshape((1, A, 4)) + \ 848 | shift.reshape((1, K, 4)).transpose((1, 0, 2)) 849 | anchor = anchor.reshape((K * A, 4)).astype(np.float32) 850 | return anchor 851 | 852 | class RoI(Function): 853 | """ 854 | NOTE:only CUDA-compatible 855 | """ 856 | 857 | def __init__(self, outh, outw, spatial_scale): 858 | self.forward_fn = load_kernel('roi_forward', kernel_forward) 859 | self.backward_fn = load_kernel('roi_backward', kernel_backward) 860 | self.outh, self.outw, self.spatial_scale = outh, outw, spatial_scale 861 | 862 | def forward(self, x, rois): 863 | # NOTE: MAKE SURE input is contiguous too 864 | x = x.contiguous() 865 | rois = rois.contiguous() 866 | self.in_size = B, C, H, W = x.size() ## 1, 128, heights/32, width/32 867 | self.N = N = rois.size(0) ## 128 868 | output = t.zeros(N, C, self.outh, self.outw).cuda() ## 128,128,7,7 869 | self.argmax_data = t.zeros(N, C, self.outh, self.outw).int().cuda() 870 | self.rois = rois 871 | args = [x.data_ptr(), rois.data_ptr(), 872 | output.data_ptr(), 873 | self.argmax_data.data_ptr(), 874 | self.spatial_scale, C, H, W, 875 | self.outh, self.outw, 876 | output.numel()] 877 | stream = Stream(ptr=torch.cuda.current_stream().cuda_stream) 878 | self.forward_fn(args=args, 879 | block=(CUDA_NUM_THREADS, 1, 1), 880 | grid=(GET_BLOCKS(output.numel()), 1, 1), 881 | stream=stream) 882 | return output 883 | 884 | def backward(self, grad_output): 885 | ##NOTE: IMPORTANT CONTIGUOUS 886 | # TODO: input 887 | grad_output = grad_output.contiguous() 888 | B, C, H, W = self.in_size 889 | grad_input = t.zeros(self.in_size).cuda() 890 | stream = Stream(ptr=torch.cuda.current_stream().cuda_stream) 891 | args = [grad_output.data_ptr(), 892 | self.argmax_data.data_ptr(), 893 | self.rois.data_ptr(), 894 | grad_input.data_ptr(), 895 | self.N, self.spatial_scale, C, H, W, self.outh, self.outw, 896 | grad_input.numel()] 897 | self.backward_fn(args=args, 898 | block=(CUDA_NUM_THREADS, 1, 1), 899 | grid=(GET_BLOCKS(grad_input.numel()), 1, 1), 900 | stream=stream 901 | ) 902 | return grad_input, None 903 | class RoIPooling2D(nn.Module): 904 | 905 | def __init__(self, outh, outw, spatial_scale): 906 | super(RoIPooling2D, self).__init__() 907 | self.RoI = RoI(outh, outw, spatial_scale) 908 | 909 | def forward(self, x, rois): 910 | return self.RoI(x, rois) 911 | 912 | class DuplicationRemovalNetwork(nn.Module): 913 | def __init__(self,n_relations = 16, appearance_feature_dim=1024,num_classes=20,d_f=128): 914 | super(DuplicationRemovalNetwork, self).__init__() 915 | self.loc_normalize_mean = (0., 0., 0., 0.) 
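# Learned duplicate removal ("learn_nms") head in the spirit of Relation Networks for Object
# Detection: forward() decodes the boxes, drops background, sorts the remaining boxes by their
# best class score, adds a rank embedding to the embedded appearance features, runs a relation
# module over them and predicts a per-box keep probability s1; the returned score is
# s1 * classification score.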
916 | self.loc_normalize_std = (0.1, 0.1, 0.2, 0.2) 917 | self.key_feature_dim = int(appearance_feature_dim/n_relations) 918 | self.geo_feature_dim = int(appearance_feature_dim/n_relations) 919 | self.appearance_feature_dim=appearance_feature_dim 920 | self.n_class = num_classes+1 921 | 922 | self.nms_rank_fc = nn.Linear(appearance_feature_dim, d_f, bias=True) 923 | self.roi_feat_embedding_fc = nn.Linear(appearance_feature_dim,d_f,bias=True) 924 | self.relation_module = RelationModule(n_relations=n_relations,appearance_feature_dim=d_f, 925 | key_feature_dim=64, 926 | geo_feature_dim=64,isDuplication=True) 927 | 928 | self.nms_logit_fc = nn.Linear(appearance_feature_dim,1,bias=True) 929 | self.sigmoid = nn.Sigmoid() 930 | def forward(self,sample_roi,roi_cls_loc, roi_score, appearance_features,size): 931 | N = sample_roi.shape[0] 932 | roi_score = roi_score.data 933 | roi_cls_loc = roi_cls_loc.data 934 | roi = at.totensor(sample_roi) 935 | 936 | 937 | mean = t.Tensor(self.loc_normalize_mean).cuda(). \ 938 | repeat(self.n_class)[None] 939 | std = t.Tensor(self.loc_normalize_std).cuda(). \ 940 | repeat(self.n_class)[None] 941 | 942 | roi_cls_loc = (roi_cls_loc * std + mean) 943 | roi_cls_loc = roi_cls_loc.view(-1, self.n_class, 4) 944 | roi = roi.view(-1, 1, 4).expand_as(roi_cls_loc) 945 | cls_bbox = loc2bbox(at.tonumpy(roi).reshape((-1, 4)), 946 | at.tonumpy(roi_cls_loc).reshape((-1, 4))) 947 | cls_bbox = at.totensor(cls_bbox) 948 | cls_bbox = cls_bbox.view(-1, self.n_class , 4) 949 | # clip bounding box 950 | cls_bbox[:, 0::2] = (cls_bbox[:, 0::2]).clamp(min=0, max=size[0]) 951 | cls_bbox[:, 1::2] = (cls_bbox[:, 1::2]).clamp(min=0, max=size[1]) 952 | 953 | prob = F.softmax(at.tovariable(roi_score), dim=1) 954 | 955 | prob,prob_argmax = torch.max(prob,dim=-1) 956 | cls_bbox = cls_bbox[np.arange(start=0,stop=N),prob_argmax] 957 | 958 | nonzero_idx=torch.nonzero(prob_argmax) 959 | 960 | if(nonzero_idx.size()[0]==0): 961 | return None,None,None 962 | else: 963 | nonzero_idx = nonzero_idx[:, 0] 964 | prob_argmax = prob_argmax[nonzero_idx] 965 | prob = prob[nonzero_idx] 966 | cls_bbox = cls_bbox[nonzero_idx] 967 | appearance_features_nobg = appearance_features[nonzero_idx] 968 | sorted_score,prob_argsort = torch.sort(prob,descending=True) 969 | 970 | sorted_prob = prob[prob_argsort] 971 | sorted_cls_bboxes = cls_bbox[prob_argsort] 972 | sorted_labels = prob_argmax[prob_argsort] 973 | sorted_features = appearance_features_nobg[prob_argsort] 974 | 975 | nms_rank_embedding = RankEmbedding(sorted_prob.size()[0],self.appearance_feature_dim) 976 | nms_rank = self.nms_rank_fc(nms_rank_embedding) 977 | roi_feat_embedding = self.roi_feat_embedding_fc(sorted_features) 978 | nms_embedding_feat = nms_rank + roi_feat_embedding 979 | position_embedding = PositionalEmbedding(sorted_cls_bboxes,dim_g = self.geo_feature_dim) 980 | nms_logit = self.relation_module([sorted_features, nms_embedding_feat,position_embedding]) 981 | nms_logit = self.nms_logit_fc(nms_logit) 982 | s1 = self.sigmoid(nms_logit).view(-1) 983 | nms_scores = s1 * sorted_prob 984 | 985 | return nms_scores, sorted_labels-1, sorted_cls_bboxes 986 | class RelationModule(nn.Module): 987 | def __init__(self,n_relations = 16, appearance_feature_dim=1024,key_feature_dim = 64, geo_feature_dim = 64, isDuplication = False): 988 | super(RelationModule, self).__init__() 989 | self.isDuplication=isDuplication 990 | self.Nr = n_relations 991 | self.dim_g = geo_feature_dim 992 | self.relation = nn.ModuleList() 993 | for N in range(self.Nr): 994 | 
self.relation.append(RelationUnit(appearance_feature_dim, key_feature_dim, geo_feature_dim)) 995 | def forward(self, input_data ): 996 | if(self.isDuplication): 997 | f_a, embedding_f_a, position_embedding =input_data 998 | else: 999 | f_a, position_embedding = input_data 1000 | isFirst=True 1001 | for N in range(self.Nr): 1002 | if(isFirst): 1003 | if(self.isDuplication): 1004 | concat = self.relation[N](embedding_f_a,position_embedding) 1005 | else: 1006 | concat = self.relation[N](f_a,position_embedding) 1007 | isFirst=False 1008 | else: 1009 | if(self.isDuplication): 1010 | concat = torch.cat((concat, self.relation[N](embedding_f_a, position_embedding)), -1) 1011 | else: 1012 | concat = torch.cat((concat, self.relation[N](f_a, position_embedding)), -1) 1013 | return concat+f_a 1014 | class RelationUnit(nn.Module): 1015 | def __init__(self, appearance_feature_dim=1024,key_feature_dim = 64, geo_feature_dim = 64): 1016 | super(RelationUnit, self).__init__() 1017 | self.dim_g = geo_feature_dim 1018 | self.dim_k = key_feature_dim 1019 | self.WG = nn.Linear(geo_feature_dim, 1, bias=True) 1020 | self.WK = nn.Linear(appearance_feature_dim, key_feature_dim, bias=True) 1021 | self.WQ = nn.Linear(appearance_feature_dim, key_feature_dim, bias=True) 1022 | self.WV = nn.Linear(appearance_feature_dim, key_feature_dim, bias=True) 1023 | self.relu = nn.ReLU(inplace=True) 1024 | 1025 | 1026 | def forward(self, f_a, position_embedding): 1027 | N,_ = f_a.size() 1028 | 1029 | position_embedding = position_embedding.view(-1,self.dim_g) 1030 | 1031 | w_g = self.relu(self.WG(position_embedding)) 1032 | w_k = self.WK(f_a) 1033 | w_k = w_k.view(N,1,self.dim_k) 1034 | 1035 | w_q = self.WQ(f_a) 1036 | w_q = w_q.view(1,N,self.dim_k) 1037 | 1038 | scaled_dot = torch.sum((w_k*w_q),-1 ) 1039 | scaled_dot = scaled_dot / np.sqrt(self.dim_k) 1040 | 1041 | w_g = w_g.view(N,N) 1042 | w_a = scaled_dot.view(N,N) 1043 | 1044 | w_mn = torch.log(torch.clamp(w_g, min = 1e-6)) + w_a 1045 | w_mn = torch.nn.Softmax(dim=1)(w_mn) 1046 | 1047 | w_v = self.WV(f_a) 1048 | 1049 | w_mn = w_mn.view(N,N,1) 1050 | w_v = w_v.view(N,1,-1) 1051 | 1052 | output = w_mn*w_v 1053 | 1054 | output = torch.sum(output,-2) 1055 | return output 1056 | 1057 | class RoIHead(nn.Module): 1058 | """Faster R-CNN Head for VGG-16 based implementation. 1059 | This class is used as a head for Faster R-CNN. 1060 | This outputs class-wise localizations and classification based on feature 1061 | maps in the given RoIs. 1062 | 1063 | Args: 1064 | n_class (int): The number of classes possibly including the background. 1065 | roi_size (int): Height and width of the feature maps after RoI-pooling. 1066 | spatial_scale (float): Scale of the roi is resized. 
1067 | classifier (nn.Module): Two layer Linear ported from vgg16 1068 | 1069 | """ 1070 | 1071 | def __init__(self, n_class, roi_size, spatial_scale, 1072 | in_channels = 128,fc_features = 1024, n_relations = 0 , classifier = None): 1073 | # n_class includes the background 1074 | super(RoIHead, self).__init__() 1075 | if classifier is None: 1076 | self.n_relations=n_relations 1077 | fully_connected1 = nn.Linear(7*7*in_channels, fc_features) 1078 | relu1 = nn.ReLU(inplace=True) 1079 | 1080 | fully_connected2 = nn.Linear(fc_features, fc_features) 1081 | relu2 = nn.ReLU(inplace=True) 1082 | if(n_relations>0): 1083 | self.dim_g = int(fc_features/n_relations) 1084 | relation1= RelationModule(n_relations = n_relations, appearance_feature_dim=fc_features, 1085 | key_feature_dim = self.dim_g, geo_feature_dim = self.dim_g) 1086 | 1087 | relation2 = RelationModule(n_relations=n_relations, appearance_feature_dim=fc_features, 1088 | key_feature_dim=self.dim_g, geo_feature_dim=self.dim_g) 1089 | self.classifier = nn.Sequential(fully_connected1, relu1, relation1, 1090 | fully_connected2, relu2, relation2) 1091 | else: 1092 | self.classifier = nn.Sequential(fully_connected1, relu1, 1093 | fully_connected2, relu2) 1094 | else : 1095 | self.classifier = classifier 1096 | 1097 | self.cls_loc = nn.Linear(fc_features, n_class * 4) 1098 | self.score = nn.Linear(fc_features, n_class) 1099 | normal_init(self.cls_loc, 0, 0.001) 1100 | normal_init(self.score, 0, 0.01) 1101 | self.n_class = n_class 1102 | self.roi_size = roi_size 1103 | self.spatial_scale = spatial_scale 1104 | self.roi = RoIPooling2D(self.roi_size, self.roi_size, self.spatial_scale) 1105 | 1106 | def forward(self, x, rois, roi_indices): 1107 | """Forward the chain. 1108 | 1109 | We assume that there are :math:`N` batches. 1110 | 1111 | Args: 1112 | x (Variable): 4D image variable. 1113 | rois (Tensor): A bounding box array containing coordinates of 1114 | proposal boxes. This is a concatenation of bounding box 1115 | arrays from multiple images in the batch. 1116 | Its shape is :math:`(R', 4)`. Given :math:`R_i` proposed 1117 | RoIs from the :math:`i` th image, 1118 | :math:`R' = \\sum _{i=1} ^ N R_i`. 1119 | roi_indices (Tensor): An array containing indices of images to 1120 | which bounding boxes correspond to. Its shape is :math:`(R',)`. 1121 | 1122 | """ 1123 | # in case roi_indices is ndarray 1124 | roi_indices = at.totensor(roi_indices).float() 1125 | rois = at.totensor(rois).float() 1126 | indices_and_rois = t.cat([roi_indices[:, None], rois], dim=1) 1127 | # NOTE: important: yx->xy 1128 | xy_indices_and_rois = indices_and_rois[:, [0, 2, 1, 4, 3]] 1129 | indices_and_rois = t.autograd.Variable(xy_indices_and_rois.contiguous()) 1130 | if(self.n_relations>0): 1131 | position_embedding = PositionalEmbedding(indices_and_rois[:, 1:],dim_g = self.dim_g) 1132 | 1133 | pool = self.roi(x, indices_and_rois) 1134 | 1135 | pool = pool.view(pool.size(0), -1) 1136 | 1137 | fc7 = self.classifier(pool) 1138 | roi_cls_locs = self.cls_loc(fc7) 1139 | roi_scores = self.score(fc7) 1140 | return roi_cls_locs, roi_scores, fc7 1141 | 1142 | class VGG16RoIHead(nn.Module): 1143 | """Faster R-CNN Head for VGG-16 based implementation. 1144 | This class is used as a head for Faster R-CNN. 1145 | This outputs class-wise localizations and classification based on feature 1146 | maps in the given RoIs. 1147 | 1148 | Args: 1149 | n_class (int): The number of classes possibly including the background. 
1150 | roi_size (int): Height and width of the feature maps after RoI-pooling. 1151 | spatial_scale (float): Scale by which RoI coordinates are resized to the feature map (typically 1 / feat_stride). 1152 | classifier (nn.Module): Two-layer fully connected classifier ported from VGG-16. 1153 | """ 1154 | 1155 | def __init__(self, n_class, roi_size, spatial_scale, 1156 | classifier): 1157 | # n_class includes the background 1158 | super(VGG16RoIHead, self).__init__() 1159 | 1160 | self.classifier = classifier 1161 | self.cls_loc = nn.Linear(4096, n_class * 4) 1162 | self.score = nn.Linear(4096, n_class) 1163 | 1164 | normal_init(self.cls_loc, 0, 0.001) 1165 | normal_init(self.score, 0, 0.01) 1166 | 1167 | self.n_class = n_class 1168 | self.roi_size = roi_size 1169 | self.spatial_scale = spatial_scale 1170 | self.roi = RoIPooling2D(self.roi_size, self.roi_size, self.spatial_scale) 1171 | 1172 | def forward(self, x, rois, roi_indices): 1173 | """Forward the chain. 1174 | 1175 | We assume that the batch contains :math:`N` images. 1176 | 1177 | Args: 1178 | x (Variable): 4D image variable. 1179 | rois (Tensor): A bounding box array containing coordinates of 1180 | proposal boxes. This is a concatenation of bounding box 1181 | arrays from multiple images in the batch. 1182 | Its shape is :math:`(R', 4)`. Given :math:`R_i` proposed 1183 | RoIs from the :math:`i` th image, 1184 | :math:`R' = \\sum _{i=1} ^ N R_i`. 1185 | roi_indices (Tensor): An array containing indices of the images to 1186 | which the bounding boxes correspond. Its shape is :math:`(R',)`. 1187 | 1188 | """ 1189 | # in case roi_indices is ndarray 1190 | roi_indices = at.totensor(roi_indices).float() 1191 | rois = at.totensor(rois).float() 1192 | indices_and_rois = t.cat([roi_indices[:, None], rois], dim=1) 1193 | # NOTE: important: yx->xy 1194 | xy_indices_and_rois = indices_and_rois[:, [0, 2, 1, 4, 3]] 1195 | indices_and_rois = xy_indices_and_rois.contiguous() 1196 | 1197 | pool = self.roi(x, indices_and_rois) 1198 | pool = pool.view(pool.size(0), -1) 1199 | fc7 = self.classifier(pool) 1200 | roi_cls_locs = self.cls_loc(fc7) 1201 | roi_scores = self.score(fc7) 1202 | return roi_cls_locs, roi_scores, fc7 1203 | 1204 | def normal_init(m, mean, stddev, truncated=False): 1205 | """ 1206 | weight initializer: truncated normal and random normal.
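When :obj:`truncated` is True, weights are drawn from a standard normal, folded into (-2, 2) with :obj:`fmod_(2)`, then scaled by :obj:`stddev` and shifted by :obj:`mean`; as the comment below notes, this only approximates a truncated normal.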
1207 | """ 1208 | # x is a parameter 1209 | if truncated: 1210 | m.weight.data.normal_().fmod_(2).mul_(stddev).add_(mean) # not a perfect approximation 1211 | else: 1212 | m.weight.data.normal_(mean, stddev) 1213 | m.bias.data.zero_() -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import collections 2 | 3 | import numpy as np 4 | from torch.utils import data as data_ 5 | import model 6 | 7 | from trainer import Trainer 8 | import torch 9 | import torch.optim as optim 10 | from data.dataset import Dataset, TestDataset 11 | from config import opt 12 | import cv2,time 13 | 14 | def run_train(train_verbose=False): 15 | dataset = Dataset(opt) 16 | dataloader = data_.DataLoader(dataset, \ 17 | batch_size=opt.batch_size, \ 18 | shuffle=True, \ 19 | # pin_memory=True, 20 | num_workers=opt.num_workers) 21 | 22 | testset = TestDataset(opt) 23 | test_dataloader = data_.DataLoader(testset, 24 | batch_size=opt.batch_size, 25 | num_workers=opt.num_workers, 26 | shuffle=False#, \ 27 | #pin_memory=True 28 | ) 29 | 30 | resnet = model.resnet101(20,True).cuda() 31 | 32 | optimizer = optim.Adam(resnet.parameters(), lr=opt.lr) 33 | 34 | loss_hist = collections.deque(maxlen=500) 35 | epoch_loss_hist = [] 36 | resnet_trainer = Trainer(resnet,optimizer,model_name=opt.model_name) 37 | 38 | freeze_num = 8 #pretrain model 39 | best_loss = 10 40 | best_loss_epoch_num = -1 41 | num_bad_epochs = 0 42 | max_bad_epochs = 5 43 | resnet_trainer.model_freeze(freeze_num=freeze_num) 44 | 45 | for epoch_num in range(opt.epoch): 46 | resnet_trainer.train_mode(freeze_num) 47 | train_start_time = time.time() 48 | train_epoch_loss = [] 49 | start = time.time() 50 | for iter_num, data in enumerate(dataloader): 51 | curr_loss = resnet_trainer.train_step(data) 52 | loss_hist.append(float(curr_loss)) 53 | train_epoch_loss.append(float(curr_loss)) 54 | 55 | if (train_verbose): 56 | print('Epoch: {} | Iteration: {} | loss: {:1.5f} | Running loss: {:1.5f} | Iter time: {:1.5f} | Train' 57 | ' time: {:1.5f}'.format(epoch_num, iter_num, float(curr_loss), np.mean(loss_hist), 58 | time.time()-start, time.time()-train_start_time)) 59 | start = time.time() 60 | 61 | del curr_loss 62 | print('train epoch time :', time.time() - train_start_time) 63 | print('Epoch: {} | epoch train loss: {:1.5f}'.format( 64 | epoch_num, np.mean(train_epoch_loss))) 65 | 66 | vali_start_time = time.time() 67 | 68 | #resnet_trainer.eval_mode() 69 | vali_epoch_loss = [] 70 | for iter_num, data in enumerate(test_dataloader): 71 | curr_loss = resnet_trainer.get_loss(data) 72 | vali_epoch_loss.append(float(curr_loss)) 73 | 74 | del curr_loss 75 | 76 | epoch_loss_hist.append(np.mean(vali_epoch_loss)) 77 | 78 | # vali_eval_result = resnet_trainer.run_eval(test_dataloader) 79 | # print(vali_eval_result) 80 | print('vali epoch time :', time.time() - vali_start_time) 81 | print('Epoch: {} | epoch vali loss: {:1.5f}'.format( 82 | epoch_num, np.mean(vali_epoch_loss))) 83 | 84 | if (best_loss < np.mean(vali_epoch_loss)): 85 | num_bad_epochs += 1 86 | else: 87 | best_loss = np.mean(vali_epoch_loss) 88 | best_loss_epoch_num = epoch_num 89 | num_bad_epochs = 0 90 | resnet_trainer.model_save(epoch_num) 91 | if (num_bad_epochs > max_bad_epochs): 92 | freeze_num=11 93 | num_bad_epochs = 0 94 | resnet_trainer.model_load(best_loss_epoch_num) 95 | resnet_trainer.reduce_lr(factor=0.1, verbose=True) 96 | resnet_trainer.model_freeze(freeze_num=freeze_num) 97 | 
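# Plateau handling (block above): once validation loss has failed to improve for more than
# max_bad_epochs consecutive epochs, the best checkpoint is reloaded, the learning rate is
# multiplied by 0.1, and the first 11 children of the model are frozen (up from the initial 8)
# before training continues.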
98 | print('best epoch num', best_loss_epoch_num) 99 | print('----------------------------------------') 100 | 101 | print(epoch_loss_hist) 102 | 103 | 104 | if __name__ == "__main__": 105 | run_train(train_verbose = True) -------------------------------------------------------------------------------- /train_history.txt: -------------------------------------------------------------------------------- 1 | resnet18_pyramid_no_relation 2 | Epoch: 36 | epoch train loss: 0.44957 3 | Epoch: 36 | epoch vali loss: 0.79966 4 | best epoch num 36 5 | 6 | ----------------------------------------------- 7 | resnet101_pyramid_no_relation 8 | Epoch: 53 | epoch train loss: 0.29215 9 | Epoch: 53 | epoch vali loss: 0.76078 10 | best epoch num 53 11 | 12 | -------------------------------------------- 13 | resnet101_pyramid_no_relation_e2e 14 | Epoch: 14 | epoch train loss: 3.83095 15 | Epoch: 14 | epoch vali loss: 3.70913 16 | best epoch num 14 17 | 18 | --------------------------------------------- 19 | resnet101_pyramid_no_relation_e2e 20 | Epoch: 6 | epoch train loss: 0.49311 21 | Epoch: 6 | epoch vali loss: 0.98503 22 | best epoch num 6 23 | 24 | -------------------------------------------- 25 | resnet101_pyramid_no_relation_e2e 26 | Epoch: 7 | epoch train loss: 0.52024 27 | Epoch: 7 | epoch vali loss: 0.94327 28 | best epoch num 7 29 | -------------------------------------------------------------------------------- /trainer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from evaluate import eval 3 | from config import opt 4 | 5 | class Trainer(): 6 | def __init__(self, my_model, optimizer, model_name): 7 | self.my_model=my_model 8 | self.optimizer=optimizer 9 | self.model_name = model_name 10 | self.my_model.train() 11 | self.my_model.use_preset(isTraining=True) 12 | self.my_model.freeze_bn() 13 | 14 | def train_mode(self,freeze_num): 15 | self.my_model.train() 16 | self.my_model.use_preset(isTraining=True) 17 | self.my_model.freeze_bn() 18 | self.model_freeze(freeze_num) 19 | 20 | def eval_mode(self): 21 | self.my_model.eval() 22 | self.my_model.use_preset(isTraining=False, preset='evaluate') 23 | for child in self.my_model.children(): 24 | for param in child.parameters(): 25 | param.requires_grad = False 26 | def train_step(self, data): 27 | self.optimizer.zero_grad() 28 | losses = self.my_model.get_loss( 29 | [data[0].cuda().float(), data[1].cuda().float(), data[2].cuda().float(), data[3].cuda().float()],opt.isLearnNMS) 30 | if losses[0]==0.: 31 | return 1. 32 | losses[0].backward() 33 | torch.nn.utils.clip_grad_norm_(self.my_model.parameters(), 0.1) 34 | 35 | self.optimizer.step() 36 | 37 | curr_loss = losses[0].item() 38 | return curr_loss 39 | def get_loss(self, data): 40 | losses = self.my_model.get_loss( 41 | [data[0].cuda().float(), data[2].cuda().float(), data[3].cuda().float(), data[4].cuda().float()],opt.isLearnNMS) 42 | if losses[0]==0.: 43 | return 1. 
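# Unlike train_step, get_loss above indexes data[0], data[2], data[3], data[4]; presumably the
# TestDataset yields (img, original_size, bbox, label, difficult) as in simple-faster-rcnn-pytorch,
# so the original-size entry at index 1 is skipped when computing the validation loss.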
44 | curr_loss = losses[0].item() 45 | return curr_loss 46 | 47 | def model_save(self,epoch_num): 48 | torch.save(self.my_model.state_dict(), 'Weights/'+self.model_name+'_{}.pt'.format(epoch_num)) 49 | 50 | def model_load(self,epoch_num): 51 | self.my_model.load_state_dict(torch.load('Weights/'+self.model_name+'_{}.pt'.format(epoch_num)),False) 52 | 53 | def reduce_lr(self,factor=0.1,verbose=True): 54 | for i, param_group in enumerate(self.optimizer.param_groups): 55 | old_lr = float(param_group['lr']) 56 | new_lr = old_lr * factor 57 | param_group['lr'] = new_lr 58 | if verbose: 59 | print('reducing learning rate' 60 | ' of group {} to {:.4e}.'.format( i, new_lr)) 61 | 62 | def model_freeze(self,freeze_num): 63 | if(freeze_num == -1): 64 | return 65 | child_count = 0 66 | for child in self.my_model.children(): 67 | if(child_count < freeze_num): 68 | for param in child.parameters(): 69 | param.requires_grad = False 70 | else: 71 | for param in child.parameters(): 72 | param.requires_grad = True 73 | child_count+=1 74 | self.my_model.freeze_bn() 75 | 76 | def run_eval(self, data_loader,test_num=1000000): 77 | return eval(data_loader, self.my_model, test_num) --------------------------------------------------------------------------------