├── LICENSE ├── README.md ├── configs ├── config.py ├── hsd_res101_coco_512.yaml ├── hsd_resx101_coco_512.yaml ├── hsd_vgg_coco_320.yaml └── hsd_vgg_coco_512.yaml ├── data ├── __init__.py ├── coco.py ├── data_augment.py ├── scripts │ ├── VOC2007.sh │ └── VOC2012.sh ├── voc0712.py └── voc_eval.py ├── demo.py ├── eval.py ├── hsd.jpg ├── layers ├── __init__.py ├── functions │ ├── __init__.py │ ├── detection.py │ ├── prior_box.py │ └── prior_layer.py └── modules │ ├── __init__.py │ ├── focal_loss_sigmoid.py │ ├── focal_loss_softmax.py │ ├── hsd_multibox_loss.py │ ├── multibox_loss.py │ ├── weight_smooth_l1_loss.py │ └── weight_softmax_loss.py ├── make.sh ├── models ├── attention.py ├── deform │ ├── .gitignore │ ├── README.md │ ├── build.py │ ├── functions │ │ ├── __init__.py │ │ └── deform_conv.py │ ├── make.sh │ ├── modules │ │ ├── __init__.py │ │ └── deform_conv.py │ └── src │ │ ├── deform_conv.c │ │ ├── deform_conv.h │ │ ├── deform_conv_cuda.c │ │ ├── deform_conv_cuda.h │ │ ├── deform_conv_cuda_kernel.cu │ │ └── deform_conv_cuda_kernel.h ├── hsd_res.py ├── hsd_resx.py ├── hsd_vgg.py ├── model_builder.py └── model_helper.py ├── train.py └── utils ├── __init__.py ├── augmentations.py ├── averageMeter.py ├── box_utils.py ├── build.py ├── collections.py ├── get_class_map.py ├── nms ├── __init__.py ├── cpu_nms.c ├── cpu_nms.pyx ├── gpu_nms.cpp ├── gpu_nms.hpp ├── gpu_nms.pyx ├── nms_kernel.cu └── py_cpu_nms.py ├── nms_wrapper.py └── timer.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 JialeCao001 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # HSD 2 | 3 | This code is a official implementation of "[*Hierarchical Shot Detector (ICCV2019)*](https://openaccess.thecvf.com/content_ICCV_2019/papers/Cao_Hierarchical_Shot_Detector_ICCV_2019_paper.pdf)" on COCO object detection with Pytorch. 4 | 5 | ## Introduction 6 | We propose a novel pipeline for accurate object detection (called ROC). Instead of simultaneous classification and regression, ROC firstly conducts box regression, secondly predicts the feature sampling locations for box classification, and finally classifies regressed boxes with the features of offset locations. 
To achieve better detection accuracy, a hierarchical shot detector (HSD) is proposed by stacking two ROC modules. Contextual information is also incorporated to enrich the features of the second ROC module. 7 | 8 |
9 | ![HSD detection pipeline](hsd.jpg) 10 | *HSD detection pipeline.* 11 |
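For intuition, the following is a minimal, self-contained sketch of one ROC step as described above: boxes are regressed first, a feature-sampling shift is then derived from the regression output, and classification finally runs on the re-sampled features. The names here (`ROCSketch`, the single per-location shift applied with `grid_sample`) are illustrative assumptions and do not mirror the actual heads in `models/hsd_vgg.py` / `models/hsd_res.py`, which use the compiled deformable convolution; the snippet is also written against a recent PyTorch rather than the 0.4 API this repository requires.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class ROCSketch(nn.Module):
    """Toy ROC step: regress boxes -> derive sampling offsets -> classify on shifted features."""

    def __init__(self, in_ch=256, num_anchors=3, num_classes=81):
        super().__init__()
        self.reg = nn.Conv2d(in_ch, num_anchors * 4, 3, padding=1)            # 1) box regression comes first
        self.offset = nn.Conv2d(num_anchors * 4, 2, 1)                        # 2) sampling shift predicted from the regression
        self.cls = nn.Conv2d(in_ch, num_anchors * num_classes, 3, padding=1)  # 3) classification on re-sampled features

    def forward(self, feat):
        loc = self.reg(feat)                                     # (N, A*4, H, W)
        off = self.offset(loc)                                   # (N, 2, H, W): per-location (dx, dy) in pixels
        _, _, h, w = feat.shape
        xs = torch.arange(w, device=feat.device).float().view(1, w).expand(h, w)
        ys = torch.arange(h, device=feat.device).float().view(h, 1).expand(h, w)
        grid = torch.stack((xs, ys), dim=0).unsqueeze(0) + off   # shifted pixel coordinates, (N, 2, H, W)
        gx = grid[:, 0] / max(w - 1, 1) * 2 - 1                  # normalize to [-1, 1] for grid_sample
        gy = grid[:, 1] / max(h - 1, 1) * 2 - 1
        shifted = F.grid_sample(feat, torch.stack((gx, gy), dim=-1), align_corners=True)
        conf = self.cls(shifted)                                 # classify the regressed boxes
        return loc, conf


# e.g. loc, conf = ROCSketch()(torch.randn(1, 256, 40, 40))
```

HSD stacks two such steps and feeds context-enriched features to the second one.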
12 | 13 | ## Results 14 | | name | backbone | input size | minival AP | download | 15 | | :-------------: | :-----: | :-----: | :------: | :-----------------: | 16 | | HSD320 | VGG16 | 320x320 | 33.9 | [model](https://drive.google.com/open?id=1SQ3PIdc9WD_Dj4X9MHhqeX9nq5aZ9r9e) | 17 | | HSD512 | VGG16 | 512x512 | 38.6 | [model](https://drive.google.com/open?id=1Jvpv5Exhtsnbo8XXcJwc0mACYfGugjav) | 18 | | HSD512 | ResNet101 | 512x512 | 39.7 | [model](https://drive.google.com/open?id=1FmOwuat0yfqu_B499O95_EbNAJnbHzgH) | 19 | | HSD512 | ResNext101 | 512x512 | 41.4 | model | 20 | 21 | ## Installation 22 | - Install PyTorch 0.4.0 and Python 3+. 23 | - Clone this repository. 24 | - Compile the NMS code and install the COCO tools: 25 | ```shell 26 | cd HSD 27 | pip install Cython pycocotools opencv-python matplotlib PyYAML 28 | ./make.sh 29 | ``` 30 | - Compile the deformable convolution: 31 | ```shell 32 | cd HSD/models/deform 33 | sh make.sh 34 | CC=g++ python build.py 35 | ``` 36 | - Download the MS COCO dataset and put it at $path/data/coco: 37 | ``` 38 | coco 39 | |_ images 40 | | |_ train2014 41 | | |_ .jpg 42 | | |_ ... 43 | | |_ val2014 44 | | |_ ... 45 | | |_ test2015 46 | | |_ ... 47 | |_ annotations 48 | | |_ instances_train2014.json 49 | | |_ ... 50 | |_ cache 51 | ``` 52 | ## Train and Inference 53 | - Download the pre-trained backbone models (VGG16, ResNet101, and ResNeXt101) and put them in the `HSD/weights/pretrained_models/` dir: 54 | ```shell 55 | VGG16: wget https://s3.amazonaws.com/amdegroot-models/vgg16_reducedfc.pth 56 | ResNet101: wget https://download.pytorch.org/models/resnet101-5d3b4d8f.pth 57 | ResNeXt101: wget https://s3.ap-northeast-2.amazonaws.com/open-mmlab/pretrain/third_party/resnext101_32x4d-a5af3160.pth 58 | ``` 59 | - Use `train.py` to train the detector: 60 | ```shell 61 | e.g., python train.py --cfg ./configs/hsd_vgg_coco_320.yaml 62 | ``` 63 | - To evaluate a trained network: 64 | 65 | ```shell 66 | e.g., python eval.py --cfg ./configs/hsd_vgg_coco_320.yaml --weights ./weights/hsd-vgg320-coco/hsd_vgg_epoch_160_300.pth 67 | ``` 68 | 69 | ## Citation 70 | If this project helps your research, please cite the following paper. 71 | 72 | ``` 73 | @article{Cao_HSD_ICCV_2019, 74 | author = {Jiale Cao and Yanwei Pang and Jungong Han and Xuelong Li}, 75 | title = {Hierarchical Shot Detector}, 76 | journal = {Proc. International Conference on Computer Vision}, 77 | year = {2019} 78 | } 79 | ``` 80 | ## Acknowledgement 81 | Many thanks to the open source code of [SSD_Pytorch](https://github.com/yqyao/SSD_Pytorch), [deformable-convolution-pytorch](https://github.com/1zb/deformable-convolution-pytorch), [mmdetection](https://github.com/open-mmlab/mmdetection), and [DANet](https://github.com/junfu1115/DANet). 
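The `--cfg` YAML files used above are merged into the global `cfg` object defined in `configs/config.py` (shown further below). A minimal usage sketch, assuming it is run from the repository root so that `configs.config` and `utils` are importable; the commented values are the ones set by `hsd_vgg_coco_320.yaml`:

```python
# Load a YAML config into the global cfg defined in configs/config.py.
from configs.config import cfg, merge_cfg_from_file

merge_cfg_from_file('./configs/hsd_vgg_coco_320.yaml')  # overrides the defaults in cfg in place
print(cfg.MODEL.CONV_BODY, cfg.MODEL.NUM_CLASSES)       # hsd_vgg.hsd_vgg 81
print(cfg.TRAIN.BATCH_SIZE, cfg.SOLVER.BASE_LR)         # 32 0.004
```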
82 | 83 | -------------------------------------------------------------------------------- /configs/config.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | from __future__ import unicode_literals 5 | 6 | from utils.collections import AttrDict 7 | import six 8 | import yaml 9 | import torch 10 | import torch.nn as nn 11 | from torch.nn import init 12 | import numpy as np 13 | import copy 14 | from ast import literal_eval 15 | 16 | __C = AttrDict() 17 | cfg = __C 18 | 19 | __C.MODEL = AttrDict() 20 | 21 | __C.MODEL.NUM_CLASSES = -1 22 | __C.MODEL.TYPE = '' 23 | __C.MODEL.SIZE = '300' 24 | __C.MODEL.CONV_BODY = '' 25 | __C.MODEL.CASCADE = True 26 | __C.MODEL.LOAD_PRETRAINED_WEIGHTS = False 27 | __C.MODEL.PRETRAIN_WEIGHTS = '' 28 | __C.MODEL.OBJECT_SCORE = 0.01 29 | 30 | __C.TRAIN = AttrDict() 31 | __C.TRAIN.OVERLAP = 0.5 32 | __C.TRAIN.OHEM = True 33 | __C.TRAIN.NEG_RATIO = 3 34 | __C.TRAIN.FOCAL_LOSS = False 35 | __C.TRAIN.FOCAL_LOSS_TYPE = 'SOFTMAX' 36 | __C.TRAIN.BGR_MEAN = [104, 117, 123] 37 | __C.TRAIN.BATCH_SIZE = 1 38 | __C.TRAIN.CHANNEL_SIZE = '48' 39 | __C.TRAIN.WARMUP = True 40 | __C.TRAIN.WARMUP_EPOCH = 2 41 | __C.TRAIN.DEVICE_IDS = [0] 42 | __C.TRAIN.TRAIN_ON = True 43 | 44 | __C.SMALL = AttrDict() 45 | 46 | __C.SMALL.FEATURE_MAPS = [[38, 38], [19, 19], [10, 10], [5, 5], [3, 3], [1, 1]] 47 | __C.SMALL.ARM_CHANNELS = [512, 1024, 512, 256, 256, 256] 48 | __C.SMALL.ODM_CHANNELS = [256, 256, 256, 256] 49 | __C.SMALL.NUM_ANCHORS = [4, 6, 6, 6, 4, 4] 50 | __C.SMALL.STEPS = [[8, 8], [16, 16], [32, 32], [64, 64], [100, 100], 51 | [300, 300]] 52 | __C.SMALL.MIN_SIZES = [30, 60, 111, 162, 213, 264] 53 | __C.SMALL.MAX_SIZES = [60, 111, 162, 213, 264, 315] 54 | __C.SMALL.ASPECT_RATIOS = [[2, 0.5], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], 55 | [2, 3, 0.5, 0.333], [2, 0.5], [2, 0.5]] 56 | __C.SMALL.VARIANCE = [0.1, 0.2] 57 | __C.SMALL.CLIP = True 58 | __C.SMALL.IMG_WH = [300, 300] 59 | __C.SMALL.INPUT_FIXED = True 60 | __C.SMALL.USE_MAX_SIZE = True 61 | 62 | __C.BIG = AttrDict() 63 | __C.BIG.FEATURE_MAPS = [[64, 64], [32, 32], [16, 16], [8, 8], [4, 4], [2, 2], 64 | [1, 1]] 65 | __C.BIG.ARM_CHANNELS = [512, 1024, 512, 256, 256, 256, 256] 66 | __C.BIG.ODM_CHANNELS = [256, 256, 256, 256] 67 | __C.BIG.NUM_ANCHORS = [4, 6, 6, 6, 6, 4, 4] 68 | __C.BIG.STEPS = [[8, 8], [16, 16], [32, 32], [64, 64], [128, 128], [256, 256], 69 | [512, 512]] 70 | __C.BIG.MIN_SIZES = [35.84, 76.8, 153.6, 230.4, 307.2, 384.0, 460.8] 71 | __C.BIG.MAX_SIZES = [76.8, 153.6, 230.4, 307.2, 384.0, 460.8, 537.6] 72 | __C.BIG.ASPECT_RATIOS = [[2, 0.5], [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], 73 | [2, 3, 0.5, 0.333], [2, 3, 0.5, 0.333], [2, 0.5], 74 | [2, 0.5]] 75 | __C.BIG.VARIANCE = [0.1, 0.2] 76 | __C.BIG.CLIP = True 77 | __C.BIG.IMG_WH = [512, 512] 78 | __C.BIG.INPUT_FIXED = True 79 | __C.BIG.USE_MAX_SIZE = True 80 | 81 | __C.SOLVER = AttrDict() 82 | 83 | __C.SOLVER.WEIGHT_DECAY = 0.0005 84 | __C.SOLVER.BASE_LR = 0.001 85 | __C.SOLVER.GAMMA = 0.1 86 | __C.SOLVER.MOMENTUM = 0.9 87 | __C.SOLVER.EPOCH_STEPS = [] 88 | __C.SOLVER.LR = [] 89 | __C.SOLVER.END_EPOCH = 1 90 | __C.SOLVER.START_EPOCH = 0 91 | 92 | __C.DATASETS = AttrDict() 93 | 94 | VOCROOT = 'data/datasets/VOCdevkit0712/' 95 | COCOROOT = 'data/datasets/coco2015' 96 | 97 | __C.DATASETS.TRAIN_TYPE = [] 98 | __C.DATASETS.VAL_TYPE = [] 99 | __C.DATASETS.DATAROOT = VOCROOT 100 | __C.DATASETS.DATA_TYPE = '' 101 | 102 | 
__C.DATASETS.SETS = AttrDict() 103 | __C.DATASETS.SETS.VOC = [['0712', '0712_trainval']] 104 | __C.DATASETS.SETS.VOC0712PLUS = [['0712', '0712_trainval_test']] 105 | __C.DATASETS.SETS.VOC0712 = [['2012', '2012_trainval']] 106 | __C.DATASETS.SETS.VOC2007 = [['0712', "2007_test"]] 107 | __C.DATASETS.SETS.COCO = [['2014', 'train'], ['2014', 'valminusminival']] 108 | __C.DATASETS.SETS.COCOval = [['2014', 'minival']] 109 | __C.DATASETS.SETS.VOCROOT = VOCROOT 110 | __C.DATASETS.SETS.COCOROOT = COCOROOT 111 | 112 | __C.TEST = AttrDict() 113 | __C.TEST.INPUT_WH = [300, 300] 114 | __C.TEST.CONFIDENCE_THRESH = 0.01 115 | __C.TEST.NMS_TYPE = 'NMS' 116 | __C.TEST.NMS_OVERLAP = 0.45 117 | __C.TEST.BATCH_SIZE = 16 118 | 119 | VOC_CLASSES = ( 120 | '__background__', # always index 0 121 | 'aeroplane', 122 | 'bicycle', 123 | 'bird', 124 | 'boat', 125 | 'bottle', 126 | 'bus', 127 | 'car', 128 | 'cat', 129 | 'chair', 130 | 'cow', 131 | 'diningtable', 132 | 'dog', 133 | 'horse', 134 | 'motorbike', 135 | 'person', 136 | 'pottedplant', 137 | 'sheep', 138 | 'sofa', 139 | 'train', 140 | 'tvmonitor') 141 | 142 | COCO_CLASSES = ('__background__', 'person', 'bicycle', 'car', 'motorbike', 143 | 'aeroplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 144 | 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 145 | 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 146 | 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 147 | 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 148 | 'kite', 'baseball bat', 'baseball glove', 'skateboard', 149 | 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 150 | 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 151 | 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 152 | 'donut', 'cake', 'chair', 'sofa', 'pottedplant', 'bed', 153 | 'diningtable', 'toilet', 'tvmonitor', 'laptop', 'mouse', 154 | 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 155 | 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 156 | 'scissors', 'teddy bear', 'hair drier', 'toothbrush') 157 | 158 | 159 | def merge_cfg_from_file(cfg_filename): 160 | """Load a yaml config file and merge it into the global config.""" 161 | with open(cfg_filename, 'r') as f: 162 | yaml_cfg = AttrDict(yaml.load(f)) 163 | _merge_a_into_b(yaml_cfg, __C) 164 | 165 | 166 | cfg_from_file = merge_cfg_from_file 167 | 168 | 169 | def merge_cfg_from_cfg(cfg_other): 170 | """Merge `cfg_other` into the global config.""" 171 | _merge_a_into_b(cfg_other, __C) 172 | 173 | 174 | def _merge_a_into_b(a, b, stack=None): 175 | """Merge config dictionary a into config dictionary b, clobbering the 176 | options in b whenever they are also specified in a. 177 | """ 178 | assert isinstance(a, AttrDict), 'Argument `a` must be an AttrDict' 179 | assert isinstance(b, AttrDict), 'Argument `b` must be an AttrDict' 180 | 181 | for k, v_ in a.items(): 182 | full_key = '.'.join(stack) + '.' 
+ k if stack is not None else k 183 | # a must specify keys that are in b 184 | if k not in b: 185 | raise KeyError('Non-existent config key: {}'.format(full_key)) 186 | 187 | v = copy.deepcopy(v_) 188 | v = _decode_cfg_value(v) 189 | v = _check_and_coerce_cfg_value_type(v, b[k], k, full_key) 190 | 191 | # Recursively merge dicts 192 | if isinstance(v, AttrDict): 193 | try: 194 | stack_push = [k] if stack is None else stack + [k] 195 | _merge_a_into_b(v, b[k], stack=stack_push) 196 | except BaseException: 197 | raise 198 | else: 199 | b[k] = v 200 | 201 | 202 | def _decode_cfg_value(v): 203 | """Decodes a raw config value (e.g., from a yaml config files or command 204 | line argument) into a Python object. 205 | """ 206 | # Configs parsed from raw yaml will contain dictionary keys that need to be 207 | # converted to AttrDict objects 208 | if isinstance(v, dict): 209 | return AttrDict(v) 210 | # All remaining processing is only applied to strings 211 | if not isinstance(v, six.string_types): 212 | return v 213 | # Try to interpret `v` as a: 214 | # string, number, tuple, list, dict, boolean, or None 215 | try: 216 | v = literal_eval(v) 217 | # The following two excepts allow v to pass through when it represents a 218 | # string. 219 | # 220 | # Longer explanation: 221 | # The type of v is always a string (before calling literal_eval), but 222 | # sometimes it *represents* a string and other times a data structure, like 223 | # a list. In the case that v represents a string, what we got back from the 224 | # yaml parser is 'foo' *without quotes* (so, not '"foo"'). literal_eval is 225 | # ok with '"foo"', but will raise a ValueError if given 'foo'. In other 226 | # cases, like paths (v = 'foo/bar' and not v = '"foo/bar"'), literal_eval 227 | # will raise a SyntaxError. 228 | except ValueError: 229 | pass 230 | except SyntaxError: 231 | pass 232 | return v 233 | 234 | 235 | def _check_and_coerce_cfg_value_type(value_a, value_b, key, full_key): 236 | """Checks that `value_a`, which is intended to replace `value_b` is of the 237 | right type. The type is correct if it matches exactly or is one of a few 238 | cases in which the type can be easily coerced. 239 | """ 240 | # The types must match (with some exceptions) 241 | type_b = type(value_b) 242 | type_a = type(value_a) 243 | if type_a is type_b: 244 | return value_a 245 | 246 | # Exceptions: numpy arrays, strings, tuple<->list 247 | if isinstance(value_b, np.ndarray): 248 | value_a = np.array(value_a, dtype=value_b.dtype) 249 | elif isinstance(value_b, six.string_types): 250 | value_a = str(value_a) 251 | elif isinstance(value_a, tuple) and isinstance(value_b, list): 252 | value_a = list(value_a) 253 | elif isinstance(value_a, list) and isinstance(value_b, tuple): 254 | value_a = tuple(value_a) 255 | else: 256 | raise ValueError( 257 | 'Type mismatch ({} vs. {}) with values ({} vs. 
{}) for config ' 258 | 'key: {}'.format(type_b, type_a, value_b, value_a, full_key)) 259 | return value_a -------------------------------------------------------------------------------- /configs/hsd_res101_coco_512.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | TYPE: hsd_res 3 | SIZE: '512' 4 | CASCADE: True 5 | CONV_BODY: hsd_res.HSDResnet101 6 | NUM_CLASSES: 81 7 | LOAD_PRETRAINED_WEIGHTS: True 8 | PRETRAIN_WEIGHTS: './weights/pretrained_models/resnet101-5d3b4d8f.pth' 9 | 10 | TRAIN: 11 | OVERLAP: 0.5 12 | BGR_MEAN: [104, 117, 123] 13 | BATCH_SIZE: 32 14 | OHEM: True 15 | NEG_RATIO: 3 16 | WARMUP: True 17 | WARMUP_EPOCH: 2 18 | TRAIN_ON: True 19 | 20 | 21 | 22 | BIG: 23 | FEATURE_MAPS: [[64, 64], [32, 32], [16, 16], [8, 8]] 24 | ARM_CHANNELS: [256, 256, 256, 256] 25 | ODM_CHANNELS: [256, 256, 256, 256] 26 | NUM_ANCHORS: [3, 3, 3, 3] 27 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64]] 28 | MIN_SIZES: [30, 64, 128, 256] 29 | MAX_SIZES: [64, 128, 256, 315] 30 | ASPECT_RATIOS : [[2, 0.5], [2, 0.5], [2, 0.5], [2, 0.5]] 31 | CLIP: True 32 | IMG_WH: [512, 512] 33 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 34 | USE_MAX_SIZE: False 35 | 36 | SOLVER: 37 | WEIGHT_DECAY: 0.0005 38 | BASE_LR: 0.004 39 | GAMMA: 0.1 40 | MOMENTUM: 0.9 41 | EPOCH_STEPS: [0, 90, 140] 42 | END_EPOCH: 160 43 | START_EPOCH: 0 44 | 45 | 46 | DATASETS: 47 | TRAIN_TYPE: [['2014', 'train'], ['2014', 'valminusminival']] 48 | VAL_TYPE: [['2014', 'minival']] 49 | DATAROOT: 'data/COCO/' 50 | DATA_TYPE: 'COCO' 51 | 52 | TEST: 53 | INPUT_WH: [512, 512] 54 | CONFIDENCE_THRESH: 0.01 55 | NMS_OVERLAP: 0.45 56 | BATCH_SIZE: 16 57 | 58 | 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /configs/hsd_resx101_coco_512.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | TYPE: hsd_resx 3 | SIZE: '512' 4 | CASCADE: True 5 | CONV_BODY: hsd_resx.HSDResnet101 6 | NUM_CLASSES: 81 7 | LOAD_PRETRAINED_WEIGHTS: True 8 | PRETRAIN_WEIGHTS: './weights/pretrained_models/resnext101_32x4d-a5af3160.pth' 9 | 10 | TRAIN: 11 | OVERLAP: 0.5 12 | BGR_MEAN: [104, 117, 123] 13 | BATCH_SIZE: 32 14 | OHEM: True 15 | NEG_RATIO: 3 16 | WARMUP: True 17 | WARMUP_EPOCH: 2 18 | TRAIN_ON: True 19 | 20 | 21 | 22 | BIG: 23 | FEATURE_MAPS: [[64, 64], [32, 32], [16, 16], [8, 8]] 24 | ARM_CHANNELS: [256, 256, 256, 256] 25 | ODM_CHANNELS: [256, 256, 256, 256] 26 | NUM_ANCHORS: [3, 3, 3, 3] 27 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64]] 28 | MIN_SIZES: [30, 64, 128, 256] 29 | MAX_SIZES: [64, 128, 256, 315] 30 | ASPECT_RATIOS : [[2, 0.5], [2, 0.5], [2, 0.5], [2, 0.5]] 31 | CLIP: True 32 | IMG_WH: [512, 512] 33 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 
34 | USE_MAX_SIZE: False 35 | 36 | SOLVER: 37 | WEIGHT_DECAY: 0.0005 38 | BASE_LR: 0.004 39 | GAMMA: 0.1 40 | MOMENTUM: 0.9 41 | EPOCH_STEPS: [0, 90, 140] 42 | END_EPOCH: 160 43 | START_EPOCH: 0 44 | 45 | 46 | DATASETS: 47 | TRAIN_TYPE: [['2014', 'train'], ['2014', 'valminusminival']] 48 | VAL_TYPE: [['2014', 'minival']] 49 | DATAROOT: 'data/COCO/' 50 | DATA_TYPE: 'COCO' 51 | 52 | TEST: 53 | INPUT_WH: [512, 512] 54 | CONFIDENCE_THRESH: 0.01 55 | NMS_OVERLAP: 0.5 56 | BATCH_SIZE: 16 57 | 58 | 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /configs/hsd_vgg_coco_320.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | TYPE: hsd_vgg 3 | SIZE: '300' 4 | CASCADE: True 5 | CONV_BODY: hsd_vgg.hsd_vgg 6 | NUM_CLASSES: 81 7 | LOAD_PRETRAINED_WEIGHTS: True 8 | PRETRAIN_WEIGHTS: './weights/pretrained_models/vgg16_reducedfc.pth' 9 | 10 | TRAIN: 11 | OVERLAP: 0.5 12 | BGR_MEAN: [104, 117, 123] 13 | BATCH_SIZE: 32 14 | OHEM: True 15 | NEG_RATIO: 3 16 | WARMUP: True 17 | WARMUP_EPOCH: 2 18 | TRAIN_ON: True 19 | 20 | SMALL: 21 | FEATURE_MAPS: [[40, 40], [20, 20], [10, 10], [5, 5]] 22 | ARM_CHANNELS: [256, 256, 256, 256] 23 | ODM_CHANNELS: [256, 256, 256, 256] 24 | NUM_ANCHORS: [3, 3, 3, 3] 25 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64]] 26 | MIN_SIZES: [20, 45, 112, 256] 27 | MAX_SIZES: [45, 112, 256, 315] 28 | ASPECT_RATIOS : [[2, 0.5], [2, 0.5], [2, 0.5], [2, 0.5]] 29 | VARIANCE : [0.1, 0.2] 30 | CLIP: True 31 | IMG_WH: [320, 320] 32 | INPUT_FIXED: True 33 | USE_MAX_SIZE: False 34 | 35 | SOLVER: 36 | WEIGHT_DECAY: 0.0005 37 | BASE_LR: 0.004 38 | GAMMA: 0.1 39 | MOMENTUM: 0.9 40 | EPOCH_STEPS: [0, 90, 140] 41 | END_EPOCH: 160 42 | START_EPOCH: 0 43 | 44 | DATASETS: 45 | TRAIN_TYPE: [['2014', 'train'], ['2014', 'valminusminival']] 46 | VAL_TYPE: [['2014', 'minival']] 47 | DATAROOT: 'data/COCO/' 48 | DATA_TYPE: 'COCO' 49 | 50 | TEST: 51 | INPUT_WH: [320, 320] 52 | CONFIDENCE_THRESH: 0.01 53 | NMS_OVERLAP: 0.45 54 | BATCH_SIZE: 16 55 | 56 | 57 | 58 | 59 | 60 | -------------------------------------------------------------------------------- /configs/hsd_vgg_coco_512.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | TYPE: hsd_vgg 3 | SIZE: '512' 4 | CASCADE: True 5 | CONV_BODY: hsd_vgg.hsd_vgg 6 | NUM_CLASSES: 81 7 | LOAD_PRETRAINED_WEIGHTS: True 8 | PRETRAIN_WEIGHTS: './weights/pretrained_models/vgg16_reducedfc.pth' 9 | 10 | TRAIN: 11 | OVERLAP: 0.5 12 | BGR_MEAN: [104, 117, 123] 13 | BATCH_SIZE: 32 14 | OHEM: True 15 | NEG_RATIO: 3 16 | WARMUP: True 17 | WARMUP_EPOCH: 2 18 | TRAIN_ON: True 19 | 20 | 21 | SMALL: 22 | FEATURE_MAPS: [[40, 40], [20, 20], [10, 10], [5, 5]] 23 | ARM_CHANNELS: [256, 256, 256, 256] 24 | ODM_CHANNELS: [256, 256, 256, 256] 25 | NUM_ANCHORS: [3, 3, 3, 3] 26 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64]] 27 | MIN_SIZES: [20, 45, 112, 256] 28 | MAX_SIZES: [45, 112, 256, 315] 29 | ASPECT_RATIOS : [[2, 0.5], [2, 0.5], [2, 0.5], [2, 0.5]] 30 | VARIANCE : [0.1, 0.2] 31 | CLIP: True 32 | IMG_WH: [320, 320] 33 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 
34 | USE_MAX_SIZE: False 35 | 36 | BIG: 37 | FEATURE_MAPS: [[64, 64], [32, 32], [16, 16], [8, 8]] 38 | ARM_CHANNELS: [256, 256, 256, 256] 39 | ODM_CHANNELS: [256, 256, 256, 256] 40 | NUM_ANCHORS: [3, 3, 3, 3] 41 | STEPS: [[8, 8], [16, 16], [32, 32], [64, 64]] 42 | MIN_SIZES: [30, 64, 128, 256] 43 | MAX_SIZES: [64, 128, 256, 315] 44 | ASPECT_RATIOS : [[2, 0.5], [2, 0.5], [2, 0.5], [2, 0.5]] 45 | CLIP: True 46 | IMG_WH: [512, 512] 47 | INPUT_FIXED: True # if you want to input different size, you need to set this False. 48 | USE_MAX_SIZE: False 49 | 50 | SOLVER: 51 | WEIGHT_DECAY: 0.0005 52 | BASE_LR: 0.004 53 | GAMMA: 0.1 54 | MOMENTUM: 0.9 55 | EPOCH_STEPS: [0, 90, 140] 56 | END_EPOCH: 160 57 | START_EPOCH: 0 58 | 59 | DATASETS: 60 | TRAIN_TYPE: [['2014', 'train'], ['2014', 'valminusminival']] 61 | VAL_TYPE: [['2014', 'minival']] 62 | DATAROOT: 'data/COCO/' 63 | DATA_TYPE: 'COCO' 64 | 65 | 66 | TEST: 67 | INPUT_WH: [320, 320] 68 | CONFIDENCE_THRESH: 0.01 69 | NMS_OVERLAP: 0.45 70 | BATCH_SIZE: 16 71 | 72 | 73 | 74 | 75 | 76 | -------------------------------------------------------------------------------- /data/__init__.py: -------------------------------------------------------------------------------- 1 | from .voc0712 import VOCDetection, detection_collate 2 | from .coco import * 3 | from .data_augment import * 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /data/coco.py: -------------------------------------------------------------------------------- 1 | """VOC Dataset Classes 2 | 3 | Original author: Francisco Massa 4 | https://github.com/fmassa/vision/blob/voc_dataset/torchvision/datasets/voc.py 5 | 6 | Updated by: Ellis Brown, Max deGroot 7 | """ 8 | 9 | import os 10 | import pickle 11 | import os.path 12 | import sys 13 | import torch 14 | import torch.utils.data as data 15 | import torchvision.transforms as transforms 16 | import cv2 17 | import numpy as np 18 | import json 19 | import uuid 20 | 21 | from pycocotools.coco import COCO 22 | from pycocotools.cocoeval import COCOeval 23 | from pycocotools import mask as COCOmask 24 | 25 | 26 | # no use 27 | class COCOAnnotationTransform(object): 28 | """Transforms a VOC annotation into a Tensor of bbox coords and label index 29 | Initilized with a dictionary lookup of classnames to indexes 30 | 31 | Arguments: 32 | class_to_ind (dict, optional): dictionary lookup of classnames -> indexes 33 | (default: alphabetic indexing of VOC's 20 classes) 34 | keep_difficult (bool, optional): keep difficult instances or not 35 | (default: False) 36 | height (int): height 37 | width (int): width 38 | """ 39 | 40 | def __init__(self): 41 | pass 42 | 43 | def __call__(self, target, width, height): 44 | """ 45 | Arguments: 46 | target (annotation) : the target annotation to be made usable 47 | will be not normlized 48 | Returns: 49 | a list containing lists of bounding boxes [bbox coords, class name] 50 | """ 51 | 52 | boxes = target[:, :-1].copy() 53 | labels = target[:, -1].copy() 54 | boxes[:, 0::2] /= width 55 | boxes[:, 1::2] /= height 56 | b_w = (boxes[:, 2] - boxes[:, 0]) * 1. 57 | b_h = (boxes[:, 3] - boxes[:, 1]) * 1. 58 | mask_b = np.minimum(b_w, b_h) > 0.01 59 | boxes_t = boxes[mask_b] 60 | labels_t = labels[mask_b].copy() 61 | 62 | return boxes_t, labels_t 63 | 64 | 65 | class COCODetection(data.Dataset): 66 | """VOC Detection Dataset Object 67 | 68 | input is image, target is annotation 69 | 70 | Arguments: 71 | root (string): filepath to VOCdevkit folder. 
72 | image_set (string): imageset to use (eg. 'train', 'val', 'test') 73 | transform (callable, optional): transformation to perform on the 74 | input image 75 | target_transform (callable, optional): transformation to perform on the 76 | target `annotation` 77 | (eg: take in caption string, return tensor of word indices) 78 | dataset_name (string, optional): which dataset to load 79 | (default: 'VOC2007') 80 | """ 81 | 82 | def __init__(self, root, image_sets, transform=None, dataset_name='COCO'): 83 | self.root = root 84 | self.cache_path = os.path.join(self.root, 'cache') 85 | self.image_set = image_sets 86 | self.transform = transform 87 | self.name = dataset_name 88 | self.ids = list() 89 | self.annotations = list() 90 | self._view_map = { 91 | 'minival2014': 'val2014', # 5k val2014 subset 92 | 'valminusminival2014': 'val2014', # val2014 \setminus minival2014 93 | 'test-dev2015': 'test2015', 94 | } 95 | 96 | for (year, image_set) in image_sets: 97 | coco_name = image_set + year 98 | data_name = (self._view_map[coco_name] 99 | if coco_name in self._view_map else coco_name) 100 | annofile = self._get_ann_file(coco_name) 101 | _COCO = COCO(annofile) 102 | self._COCO = _COCO 103 | self.coco_name = coco_name 104 | cats = _COCO.loadCats(_COCO.getCatIds()) 105 | self._classes = tuple(['__background__'] + 106 | [c['name'] for c in cats]) 107 | self.num_classes = len(self._classes) 108 | self._class_to_ind = dict( 109 | zip(self._classes, range(self.num_classes))) 110 | self._class_to_coco_cat_id = dict( 111 | zip([c['name'] for c in cats], _COCO.getCatIds())) 112 | indexes = _COCO.getImgIds() 113 | self.image_indexes = indexes 114 | self.ids.extend([ 115 | self.image_path_from_index(data_name, index) 116 | for index in indexes 117 | ]) 118 | if image_set.find('test') != -1: 119 | print('test set will not load annotations!') 120 | else: 121 | self.annotations.extend( 122 | self._load_coco_annotations(coco_name, indexes, _COCO)) 123 | 124 | def image_path_from_index(self, name, index): 125 | """ 126 | Construct an image path from the image's "index" identifier. 127 | """ 128 | # Example image path for index=119993: 129 | # images/train2014/COCO_train2014_000000119993.jpg 130 | file_name = ('COCO_' + name + '_' + str(index).zfill(12) + '.jpg') 131 | image_path = os.path.join(self.root, 'images', name, file_name) 132 | assert os.path.exists(image_path), \ 133 | 'Path does not exist: {}'.format(image_path) 134 | return image_path 135 | 136 | def _get_ann_file(self, name): 137 | prefix = 'instances' if name.find('test') == -1 \ 138 | else 'image_info' 139 | return os.path.join(self.root, 'annotations', 140 | prefix + '_' + name + '.json') 141 | 142 | def _load_coco_annotations(self, coco_name, indexes, _COCO): 143 | cache_file = os.path.join(self.cache_path, coco_name + '_gt_roidb.pkl') 144 | if os.path.exists(cache_file): 145 | with open(cache_file, 'rb') as fid: 146 | roidb = pickle.load(fid) 147 | print('{} gt roidb loaded from {}'.format(coco_name, cache_file)) 148 | return roidb 149 | 150 | gt_roidb = [ 151 | self._annotation_from_index(index, _COCO) for index in indexes 152 | ] 153 | with open(cache_file, 'wb') as fid: 154 | pickle.dump(gt_roidb, fid, pickle.HIGHEST_PROTOCOL) 155 | print('wrote gt roidb to {}'.format(cache_file)) 156 | return gt_roidb 157 | 158 | def _annotation_from_index(self, index, _COCO): 159 | """ 160 | Loads COCO bounding-box instance annotations. Crowd instances are 161 | handled by marking their overlaps (with all categories) to -1. 
This 162 | overlap value means that crowd "instances" are excluded from training. 163 | """ 164 | im_ann = _COCO.loadImgs(index)[0] 165 | width = im_ann['width'] 166 | height = im_ann['height'] 167 | 168 | annIds = _COCO.getAnnIds(imgIds=index, iscrowd=None) 169 | objs = _COCO.loadAnns(annIds) 170 | # Sanitize bboxes -- some are invalid 171 | valid_objs = [] 172 | for obj in objs: 173 | x1 = np.max((0, obj['bbox'][0])) 174 | y1 = np.max((0, obj['bbox'][1])) 175 | x2 = np.min((width - 1, x1 + np.max((0, obj['bbox'][2] - 1)))) 176 | y2 = np.min((height - 1, y1 + np.max((0, obj['bbox'][3] - 1)))) 177 | if obj['area'] > 0 and x2 >= x1 and y2 >= y1: 178 | obj['clean_bbox'] = [x1, y1, x2, y2] 179 | valid_objs.append(obj) 180 | objs = valid_objs 181 | num_objs = len(objs) 182 | 183 | res = np.zeros((num_objs, 5)) 184 | 185 | # Lookup table to map from COCO category ids to our internal class 186 | # indices 187 | coco_cat_id_to_class_ind = dict([(self._class_to_coco_cat_id[cls], 188 | self._class_to_ind[cls]) 189 | for cls in self._classes[1:]]) 190 | 191 | for ix, obj in enumerate(objs): 192 | cls = coco_cat_id_to_class_ind[obj['category_id']] 193 | res[ix, 0:4] = obj['clean_bbox'] 194 | res[ix, 4] = cls 195 | 196 | return res 197 | 198 | def __getitem__(self, index): 199 | img_id = self.ids[index] 200 | target = self.annotations[index] if self.coco_name.find('test') == -1 else 1 201 | img = cv2.imread(img_id, cv2.IMREAD_COLOR) 202 | # img0 = img[:, ::-1, :] 203 | height, width, _ = img.shape 204 | img_info = [width, height] 205 | # if self.target_transform is not None: 206 | # target = self.target_transform(target) 207 | 208 | if self.transform is not None: 209 | img, target = self.transform(img, target) 210 | # img0, target = self.transform(img0, target) 211 | # img = torch.cat([img, img0], dim=0) 212 | 213 | return img, target, img_info 214 | 215 | def __len__(self): 216 | return len(self.ids) 217 | 218 | def pull_image(self, index): 219 | '''Returns the original image object at index in PIL form 220 | 221 | Note: not using self.__getitem__(), as any transformations passed in 222 | could mess up this functionality. 223 | 224 | Argument: 225 | index (int): index of img to show 226 | Return: 227 | PIL img 228 | ''' 229 | img_id = self.ids[index] 230 | return cv2.imread(img_id, cv2.IMREAD_COLOR) 231 | 232 | def pull_tensor(self, index): 233 | '''Returns the original image at an index in tensor form 234 | 235 | Note: not using self.__getitem__(), as any transformations passed in 236 | could mess up this functionality. 
237 | 238 | Argument: 239 | index (int): index of img to show 240 | Return: 241 | tensorized version of img, squeezed 242 | ''' 243 | to_tensor = transforms.ToTensor() 244 | return torch.Tensor(self.pull_image(index)).unsqueeze_(0) 245 | 246 | def _print_detection_eval_metrics(self, coco_eval): 247 | IoU_lo_thresh = 0.5 248 | IoU_hi_thresh = 0.95 249 | 250 | def _get_thr_ind(coco_eval, thr): 251 | ind = np.where((coco_eval.params.iouThrs > thr - 1e-5) & 252 | (coco_eval.params.iouThrs < thr + 1e-5))[0][0] 253 | iou_thr = coco_eval.params.iouThrs[ind] 254 | assert np.isclose(iou_thr, thr) 255 | return ind 256 | 257 | ind_lo = _get_thr_ind(coco_eval, IoU_lo_thresh) 258 | ind_hi = _get_thr_ind(coco_eval, IoU_hi_thresh) 259 | # precision has dims (iou, recall, cls, area range, max dets) 260 | # area range index 0: all area ranges 261 | # max dets index 2: 100 per image 262 | precision = \ 263 | coco_eval.eval['precision'][ind_lo:(ind_hi + 1), :, :, 0, 2] 264 | ap_default = np.mean(precision[precision > -1]) 265 | print('~~~~ Mean and per-category AP @ IoU=[{:.2f},{:.2f}] ' 266 | '~~~~'.format(IoU_lo_thresh, IoU_hi_thresh)) 267 | print('{:.1f}'.format(100 * ap_default)) 268 | for cls_ind, cls in enumerate(self._classes): 269 | if cls == '__background__': 270 | continue 271 | # minus 1 because of __background__ 272 | precision = coco_eval.eval['precision'][ind_lo:( 273 | ind_hi + 1), :, cls_ind - 1, 0, 2] 274 | ap = np.mean(precision[precision > -1]) 275 | print('{:.1f}'.format(100 * ap)) 276 | 277 | print('~~~~ Summary metrics ~~~~') 278 | coco_eval.summarize() 279 | 280 | def _do_detection_eval(self, res_file, output_dir): 281 | ann_type = 'bbox' 282 | coco_dt = self._COCO.loadRes(res_file) 283 | coco_eval = COCOeval(self._COCO, coco_dt) 284 | coco_eval.params.useSegm = (ann_type == 'segm') 285 | coco_eval.evaluate() 286 | coco_eval.accumulate() 287 | self._print_detection_eval_metrics(coco_eval) 288 | eval_file = os.path.join(output_dir, 'detection_results.pkl') 289 | with open(eval_file, 'wb') as fid: 290 | pickle.dump(coco_eval, fid, pickle.HIGHEST_PROTOCOL) 291 | print('Wrote COCO eval results to: {}'.format(eval_file)) 292 | 293 | def _coco_results_one_category(self, boxes, cat_id): 294 | results = [] 295 | for im_ind, index in enumerate(self.image_indexes): 296 | dets = boxes[im_ind].astype(np.float) 297 | if dets == []: 298 | continue 299 | scores = dets[:, -1] 300 | xs = dets[:, 0] 301 | ys = dets[:, 1] 302 | ws = dets[:, 2] - xs + 1 303 | hs = dets[:, 3] - ys + 1 304 | results.extend([{ 305 | 'image_id': index, 306 | 'category_id': cat_id, 307 | 'bbox': [xs[k], ys[k], ws[k], hs[k]], 308 | 'score': scores[k] 309 | } for k in range(dets.shape[0])]) 310 | return results 311 | 312 | def _write_coco_results_file(self, all_boxes, res_file): 313 | # [{"image_id": 42, 314 | # "category_id": 18, 315 | # "bbox": [258.15,41.29,348.26,243.78], 316 | # "score": 0.236}, ...] 
317 | results = [] 318 | for cls_ind, cls in enumerate(self._classes): 319 | if cls == '__background__': 320 | continue 321 | print('Collecting {} results ({:d}/{:d})'.format( 322 | cls, cls_ind, self.num_classes)) 323 | coco_cat_id = self._class_to_coco_cat_id[cls] 324 | results.extend( 325 | self._coco_results_one_category(all_boxes[cls_ind], 326 | coco_cat_id)) 327 | ''' 328 | if cls_ind ==30: 329 | res_f = res_file+ '_1.json' 330 | print('Writing results json to {}'.format(res_f)) 331 | with open(res_f, 'w') as fid: 332 | json.dump(results, fid) 333 | results = [] 334 | ''' 335 | #res_f2 = res_file+'_2.json' 336 | print('Writing results json to {}'.format(res_file)) 337 | with open(res_file, 'w') as fid: 338 | json.dump(results, fid) 339 | 340 | def evaluate_detections(self, all_boxes, output_dir): 341 | res_file = os.path.join(output_dir, 342 | ('detections_' + self.coco_name + '_results')) 343 | res_file += '.json' 344 | self._write_coco_results_file(all_boxes, res_file) 345 | # Only do evaluation on non-test sets 346 | if self.coco_name.find('test') == -1: 347 | self._do_detection_eval(res_file, output_dir) 348 | # Optionally cleanup results json file 349 | -------------------------------------------------------------------------------- /data/data_augment.py: -------------------------------------------------------------------------------- 1 | """Data augmentation functionality. Passed as callable transformations to 2 | Dataset classes. 3 | 4 | The data augmentation procedures were interpreted from @weiliu89's SSD paper 5 | http://arxiv.org/abs/1512.02325 6 | 7 | TODO: implement data_augment for training 8 | 9 | Ellis Brown, Max deGroot 10 | """ 11 | 12 | import torch 13 | from torchvision import transforms 14 | import cv2 15 | import numpy as np 16 | import random 17 | import math 18 | from utils.box_utils import matrix_iou 19 | 20 | 21 | def _crop(image, boxes, labels): 22 | height, width, _ = image.shape 23 | 24 | if len(boxes) == 0: 25 | return image, boxes, labels 26 | 27 | while True: 28 | mode = random.choice(( 29 | None, 30 | (0.1, None), 31 | (0.3, None), 32 | (0.5, None), 33 | (0.7, None), 34 | (0.9, None), 35 | (None, None), 36 | )) 37 | 38 | if mode is None: 39 | return image, boxes, labels 40 | 41 | min_iou, max_iou = mode 42 | if min_iou is None: 43 | min_iou = float('-inf') 44 | if max_iou is None: 45 | max_iou = float('inf') 46 | 47 | for _ in range(50): 48 | scale = random.uniform(0.3, 1.) 49 | min_ratio = max(0.5, scale * scale) 50 | max_ratio = min(2, 1. 
/ scale / scale) 51 | ratio = math.sqrt(random.uniform(min_ratio, max_ratio)) 52 | w = int(scale * ratio * width) 53 | h = int((scale / ratio) * height) 54 | 55 | l = random.randrange(width - w) 56 | t = random.randrange(height - h) 57 | roi = np.array((l, t, l + w, t + h)) 58 | 59 | iou = matrix_iou(boxes, roi[np.newaxis]) 60 | 61 | if not (min_iou <= iou.min() and iou.max() <= max_iou): 62 | continue 63 | 64 | image_t = image[roi[1]:roi[3], roi[0]:roi[2]] 65 | 66 | centers = (boxes[:, :2] + boxes[:, 2:]) / 2 67 | mask = np.logical_and(roi[:2] < centers, centers < roi[2:]) \ 68 | .all(axis=1) 69 | boxes_t = boxes[mask].copy() 70 | labels_t = labels[mask].copy() 71 | if len(boxes_t) == 0: 72 | continue 73 | 74 | boxes_t[:, :2] = np.maximum(boxes_t[:, :2], roi[:2]) 75 | boxes_t[:, :2] -= roi[:2] 76 | boxes_t[:, 2:] = np.minimum(boxes_t[:, 2:], roi[2:]) 77 | boxes_t[:, 2:] -= roi[:2] 78 | 79 | return image_t, boxes_t, labels_t 80 | 81 | 82 | def _distort(image): 83 | def _convert(image, alpha=1, beta=0): 84 | tmp = image.astype(float) * alpha + beta 85 | tmp[tmp < 0] = 0 86 | tmp[tmp > 255] = 255 87 | image[:] = tmp 88 | 89 | image = image.copy() 90 | 91 | if random.randrange(2): 92 | _convert(image, beta=random.uniform(-32, 32)) 93 | 94 | if random.randrange(2): 95 | _convert(image, alpha=random.uniform(0.5, 1.5)) 96 | 97 | image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV) 98 | 99 | if random.randrange(2): 100 | tmp = image[:, :, 0].astype(int) + random.randint(-18, 18) 101 | tmp %= 180 102 | image[:, :, 0] = tmp 103 | 104 | if random.randrange(2): 105 | _convert(image[:, :, 1], alpha=random.uniform(0.5, 1.5)) 106 | 107 | image = cv2.cvtColor(image, cv2.COLOR_HSV2BGR) 108 | 109 | return image 110 | 111 | 112 | def _expand(image, boxes, fill, p): 113 | if random.random() > p: 114 | return image, boxes 115 | 116 | height, width, depth = image.shape 117 | for _ in range(50): 118 | scale = random.uniform(1, 4) 119 | 120 | min_ratio = max(0.5, 1. 
/ scale / scale) 121 | max_ratio = min(2, scale * scale) 122 | ratio = math.sqrt(random.uniform(min_ratio, max_ratio)) 123 | ws = scale * ratio 124 | hs = scale / ratio 125 | if ws < 1 or hs < 1: 126 | continue 127 | w = int(ws * width) 128 | h = int(hs * height) 129 | 130 | left = random.randint(0, w - width) 131 | top = random.randint(0, h - height) 132 | 133 | boxes_t = boxes.copy() 134 | boxes_t[:, :2] += (left, top) 135 | boxes_t[:, 2:] += (left, top) 136 | 137 | expand_image = np.empty((h, w, depth), dtype=image.dtype) 138 | expand_image[:, :] = fill 139 | expand_image[top:top + height, left:left + width] = image 140 | image = expand_image 141 | 142 | return image, boxes_t 143 | 144 | 145 | def _mirror(image, boxes): 146 | _, width, _ = image.shape 147 | if random.randrange(2): 148 | image = image[:, ::-1] 149 | boxes = boxes.copy() 150 | boxes[:, 0::2] = width - boxes[:, 2::-2] 151 | return image, boxes 152 | 153 | 154 | def preproc_for_test(image, resize_wh, mean): 155 | interp_methods = [ 156 | cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_AREA, cv2.INTER_NEAREST, 157 | cv2.INTER_LANCZOS4 158 | ] 159 | interp_method = interp_methods[random.randrange(5)] 160 | # interp_method = interp_methods[0] 161 | image = cv2.resize( 162 | image, (resize_wh[0], resize_wh[1]), interpolation=interp_method) 163 | image = image.astype(np.float32) 164 | image -= mean 165 | # to rgb 166 | # image = image[:, :, (2, 1, 0)] 167 | return image.transpose(2, 0, 1) 168 | 169 | 170 | class preproc(object): 171 | def __init__(self, resize_wh, rgb_means, p): 172 | self.means = rgb_means 173 | self.resize_wh = resize_wh 174 | self.p = p 175 | 176 | def __call__(self, image, targets): 177 | boxes = targets[:, :-1].copy() 178 | labels = targets[:, -1].copy() 179 | if len(boxes) == 0: 180 | #boxes = np.empty((0, 4)) 181 | targets = np.zeros((1, 5)) 182 | image = preproc_for_test(image, self.resize_wh, self.means) 183 | return torch.from_numpy(image), targets 184 | 185 | image_o = image.copy() 186 | targets_o = targets.copy() 187 | height_o, width_o, _ = image_o.shape 188 | boxes_o = targets_o[:, :-1] 189 | labels_o = targets_o[:, -1] 190 | boxes_o[:, 0::2] /= width_o 191 | boxes_o[:, 1::2] /= height_o 192 | labels_o = np.expand_dims(labels_o, 1) 193 | targets_o = np.hstack((boxes_o, labels_o)) 194 | 195 | image_t, boxes, labels = _crop(image, boxes, labels) 196 | image_t = _distort(image_t) 197 | image_t, boxes = _expand(image_t, boxes, self.means, self.p) 198 | image_t, boxes = _mirror(image_t, boxes) 199 | #image_t, boxes = _mirror(image, boxes) 200 | 201 | height, width, _ = image_t.shape 202 | image_t = preproc_for_test(image_t, self.resize_wh, self.means) 203 | boxes = boxes.copy() 204 | boxes[:, 0::2] /= width 205 | boxes[:, 1::2] /= height 206 | b_w = (boxes[:, 2] - boxes[:, 0]) * 1. 207 | b_h = (boxes[:, 3] - boxes[:, 1]) * 1. 
208 | mask_b = np.minimum(b_w, b_h) > 0.01 209 | boxes_t = boxes[mask_b] 210 | labels_t = labels[mask_b].copy() 211 | 212 | if len(boxes_t) == 0: 213 | image = preproc_for_test(image_o, self.resize_wh, self.means) 214 | return torch.from_numpy(image), targets_o 215 | 216 | labels_t = np.expand_dims(labels_t, 1) 217 | targets_t = np.hstack((boxes_t, labels_t)) 218 | 219 | return torch.from_numpy(image_t), targets_t 220 | 221 | 222 | class BaseTransform(object): 223 | """Defines the transformations that should be applied to test PIL image 224 | for input into the network 225 | 226 | dimension -> tensorize -> color adj 227 | 228 | Arguments: 229 | resize (int): input dimension to SSD 230 | rgb_means ((int,int,int)): average RGB of the dataset 231 | (104,117,123) 232 | swap ((int,int,int)): final order of channels 233 | Returns: 234 | transform (transform) : callable transform to be applied to test/val 235 | data 236 | """ 237 | 238 | def __init__(self, resize_wh, rgb_means, swap=(2, 0, 1)): 239 | self.means = rgb_means 240 | self.resize_wh = resize_wh 241 | self.swap = swap 242 | 243 | # assume input is cv2 img for now 244 | def __call__(self, img, target=None): 245 | 246 | interp_methods = [ 247 | cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_AREA, 248 | cv2.INTER_NEAREST, cv2.INTER_LANCZOS4 249 | ] 250 | interp_method = interp_methods[0] 251 | img = cv2.resize( 252 | np.array(img), (self.resize_wh[0], self.resize_wh[1]), 253 | interpolation=interp_method).astype(np.float32) 254 | img -= self.means 255 | img = img.transpose(self.swap) 256 | return torch.from_numpy(img), target 257 | -------------------------------------------------------------------------------- /data/scripts/VOC2007.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Ellis Brown 3 | 4 | start=`date +%s` 5 | 6 | # handle optional download dir 7 | if [ -z "$1" ] 8 | then 9 | # navigate to ~/data 10 | echo "navigating to ~/data/ ..." 11 | mkdir -p ~/data 12 | cd ~/data/ 13 | else 14 | # check if is valid directory 15 | if [ ! -d $1 ]; then 16 | echo $1 "is not a valid directory" 17 | exit 0 18 | fi 19 | echo "navigating to" $1 "..." 20 | cd $1 21 | fi 22 | 23 | echo "Downloading VOC2007 trainval ..." 24 | # Download the data. 25 | curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar 26 | echo "Downloading VOC2007 test data ..." 27 | curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar 28 | echo "Done downloading." 29 | 30 | # Extract data 31 | echo "Extracting trainval ..." 32 | tar -xvf VOCtrainval_06-Nov-2007.tar 33 | echo "Extracting test ..." 34 | tar -xvf VOCtest_06-Nov-2007.tar 35 | echo "removing tars ..." 36 | rm VOCtrainval_06-Nov-2007.tar 37 | rm VOCtest_06-Nov-2007.tar 38 | 39 | end=`date +%s` 40 | runtime=$((end-start)) 41 | 42 | echo "Completed in" $runtime "seconds" -------------------------------------------------------------------------------- /data/scripts/VOC2012.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Ellis Brown 3 | 4 | start=`date +%s` 5 | 6 | # handle optional download dir 7 | if [ -z "$1" ] 8 | then 9 | # navigate to ~/data 10 | echo "navigating to ~/data/ ..." 11 | mkdir -p ~/data 12 | cd ~/data/ 13 | else 14 | # check if is valid directory 15 | if [ ! -d $1 ]; then 16 | echo $1 "is not a valid directory" 17 | exit 0 18 | fi 19 | echo "navigating to" $1 "..." 20 | cd $1 21 | fi 22 | 23 | echo "Downloading VOC2012 trainval ..." 
24 | # Download the data. 25 | curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar 26 | echo "Done downloading." 27 | 28 | 29 | # Extract data 30 | echo "Extracting trainval ..." 31 | tar -xvf VOCtrainval_11-May-2012.tar 32 | echo "removing tar ..." 33 | rm VOCtrainval_11-May-2012.tar 34 | 35 | end=`date +%s` 36 | runtime=$((end-start)) 37 | 38 | echo "Completed in" $runtime "seconds" -------------------------------------------------------------------------------- /data/voc0712.py: -------------------------------------------------------------------------------- 1 | """VOC Dataset Classes 2 | 3 | Original author: Francisco Massa 4 | https://github.com/fmassa/vision/blob/voc_dataset/torchvision/datasets/voc.py 5 | 6 | Updated by: Ellis Brown, Max deGroot 7 | """ 8 | 9 | import os 10 | import os.path 11 | import pickle 12 | import sys 13 | import torch 14 | import torch.utils.data as data 15 | from PIL import Image, ImageDraw, ImageFont 16 | import cv2 17 | import numpy as np 18 | from .voc_eval import voc_eval 19 | if sys.version_info[0] == 2: 20 | import xml.etree.cElementTree as ET 21 | else: 22 | import xml.etree.ElementTree as ET 23 | 24 | VOC_CLASSES = ( 25 | '__background__', # always index 0 26 | 'aeroplane', 27 | 'bicycle', 28 | 'bird', 29 | 'boat', 30 | 'bottle', 31 | 'bus', 32 | 'car', 33 | 'cat', 34 | 'chair', 35 | 'cow', 36 | 'diningtable', 37 | 'dog', 38 | 'horse', 39 | 'motorbike', 40 | 'person', 41 | 'pottedplant', 42 | 'sheep', 43 | 'sofa', 44 | 'train', 45 | 'tvmonitor') 46 | 47 | # for making bounding boxes pretty 48 | COLORS = ((255, 0, 0, 128), (0, 255, 0, 128), (0, 0, 255, 128), 49 | (0, 255, 255, 128), (255, 0, 255, 128), (255, 255, 0, 128)) 50 | 51 | 52 | class AnnotationTransform(object): 53 | """Transforms a VOC annotation into a Tensor of bbox coords and label index 54 | Initilized with a dictionary lookup of classnames to indexes 55 | 56 | Arguments: 57 | class_to_ind (dict, optional): dictionary lookup of classnames -> indexes 58 | (default: alphabetic indexing of VOC's 20 classes) 59 | keep_difficult (bool, optional): keep difficult instances or not 60 | (default: False) 61 | height (int): height 62 | width (int): width 63 | """ 64 | 65 | def __init__(self, class_to_ind=None, keep_difficult=False): 66 | self.class_to_ind = class_to_ind or dict( 67 | zip(VOC_CLASSES, range(len(VOC_CLASSES)))) 68 | self.keep_difficult = keep_difficult 69 | 70 | def __call__(self, target, width, height): 71 | """ 72 | Arguments: 73 | target (annotation) : the target annotation to be made usable 74 | will be an ET.Element 75 | Returns: 76 | a list containing lists of bounding boxes [bbox coords, class name] 77 | """ 78 | res = np.empty((0, 5)) 79 | for obj in target.iter('object'): 80 | difficult = int(obj.find('difficult').text) == 1 81 | if not self.keep_difficult and difficult: 82 | continue 83 | name = obj.find('name').text.lower().strip() 84 | bbox = obj.find('bndbox') 85 | 86 | pts = ['xmin', 'ymin', 'xmax', 'ymax'] 87 | bndbox = [] 88 | for i, pt in enumerate(pts): 89 | cur_pt = int(bbox.find(pt).text) - 1 90 | # scale height or width 91 | # cur_pt = cur_pt / width if i % 2 == 0 else cur_pt / height 92 | bndbox.append(cur_pt) 93 | label_idx = self.class_to_ind[name] 94 | bndbox.append(label_idx) 95 | # res += [bndbox] # [xmin, ymin, xmax, ymax, label_ind] 96 | res = np.vstack((res, bndbox)) 97 | # img_id = target.find('filename').text[:-4] 98 | if len(res) == 0: 99 | np.vstack((res, [0, 0, 0, 0, 0])) 100 | return res # [[xmin, ymin, xmax, ymax, 
label_ind], ... ] 101 | 102 | 103 | class VOCDetection(data.Dataset): 104 | """VOC Detection Dataset Object 105 | 106 | input is image, target is annotation 107 | 108 | Arguments: 109 | root (string): filepath to VOCdevkit folder. 110 | image_set (string): imageset to use (eg. 'train', 'val', 'test') 111 | transform (callable, optional): transformation to perform on the 112 | input image 113 | target_transform (callable, optional): transformation to perform on the 114 | target `annotation` 115 | (eg: take in caption string, return tensor of word indices) 116 | dataset_name (string, optional): which dataset to load 117 | (default: 'VOC2007') 118 | """ 119 | 120 | def __init__(self, 121 | root, 122 | image_sets, 123 | transform=None, 124 | dataset_name='VOC0712'): 125 | self.root = root 126 | self.image_set = image_sets 127 | self.transform = transform 128 | self.target_transform = AnnotationTransform() 129 | self.name = dataset_name 130 | self._annopath = os.path.join('%s', 'Annotations', '%s.xml') 131 | self._imgpath = os.path.join('%s', 'JPEGImages', '%s.jpg') 132 | self.ids = list() 133 | for (year, name) in image_sets: 134 | self._year = year 135 | rootpath = os.path.join(self.root, 'VOC' + year) 136 | for line in open( 137 | os.path.join(rootpath, 'ImageSets', 'Main', 138 | name + '.txt')): 139 | self.ids.append((rootpath, line.strip())) 140 | 141 | def __getitem__(self, index): 142 | im, gt, img_info = self.pull_item(index) 143 | return im, gt, img_info 144 | 145 | def __len__(self): 146 | return len(self.ids) 147 | 148 | def pull_item(self, index): 149 | img_id = self.ids[index] 150 | 151 | if self.name != 'test': 152 | target = ET.parse(self._annopath % img_id).getroot() 153 | else: 154 | target = np.zeros((1, 5)) 155 | img = cv2.imread(self._imgpath % img_id) 156 | im_h, im_w, channels = img.shape 157 | img_info = [im_w, im_h] 158 | if self.target_transform is not None: 159 | target = self.target_transform(target, im_w, im_h) 160 | 161 | if self.name != 'test': 162 | if self.transform is not None: 163 | img, target = self.transform(img, target) 164 | else: 165 | if self.transform is not None: 166 | img = self.transform(img) 167 | 168 | return img, target, img_info 169 | 170 | def pull_image(self, index): 171 | '''Returns the original image object at index in PIL form 172 | 173 | Note: not using self.__getitem__(), as any transformations passed in 174 | could mess up this functionality. 175 | 176 | Argument: 177 | index (int): index of img to show 178 | Return: 179 | PIL img 180 | ''' 181 | img_id = self.ids[index] 182 | return cv2.imread(self._imgpath % img_id, cv2.IMREAD_COLOR) 183 | 184 | def pull_anno(self, index): 185 | '''Returns the original annotation of image at index 186 | 187 | Note: not using self.__getitem__(), as any transformations passed in 188 | could mess up this functionality. 189 | 190 | Argument: 191 | index (int): index of img to get annotation of 192 | Return: 193 | list: [img_id, [(label, bbox coords),...]] 194 | eg: ('001718', [('dog', (96, 13, 438, 332))]) 195 | ''' 196 | img_id = self.ids[index] 197 | anno = ET.parse(self._annopath % img_id).getroot() 198 | gt = self.target_transform(anno, 1, 1) 199 | return img_id[1], gt 200 | 201 | def pull_tensor(self, index): 202 | '''Returns the original image at an index in tensor form 203 | 204 | Note: not using self.__getitem__(), as any transformations passed in 205 | could mess up this functionality. 
206 | 207 | Argument: 208 | index (int): index of img to show 209 | Return: 210 | tensorized version of img, squeezed 211 | ''' 212 | return torch.Tensor(self.pull_image(index)).unsqueeze_(0) 213 | 214 | def evaluate_detections(self, all_boxes, output_dir=None): 215 | """ 216 | all_boxes is a list of length number-of-classes. 217 | Each list element is a list of length number-of-images. 218 | Each of those list elements is either an empty list [] 219 | or a numpy array of detection. 220 | 221 | all_boxes[class][image] = [] or np.array of shape #dets x 5 222 | """ 223 | self._write_voc_results_file(all_boxes) 224 | self._do_python_eval(output_dir) 225 | 226 | def _get_voc_results_file_template(self): 227 | filename = 'comp3_det_test' + '_{:s}.txt' 228 | filedir = os.path.join(self.root, 'results', 'VOC' + self._year, 229 | 'Main') 230 | if not os.path.exists(filedir): 231 | os.makedirs(filedir) 232 | path = os.path.join(filedir, filename) 233 | return path 234 | 235 | def _write_voc_results_file(self, all_boxes): 236 | for cls_ind, cls in enumerate(VOC_CLASSES): 237 | if cls == '__background__': 238 | continue 239 | print('Writing {} VOC results file'.format(cls)) 240 | filename = self._get_voc_results_file_template().format(cls) 241 | # print(filename) 242 | with open(filename, 'wt') as f: 243 | for im_ind, index in enumerate(self.ids): 244 | index = index[1] 245 | dets = all_boxes[cls_ind][im_ind] 246 | if dets == []: 247 | continue 248 | for k in range(dets.shape[0]): 249 | f.write( 250 | '{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n'.format( 251 | index, dets[k, -1], dets[k, 0] + 1, 252 | dets[k, 1] + 1, dets[k, 2] + 1, 253 | dets[k, 3] + 1)) 254 | 255 | def _do_python_eval(self, output_dir='output'): 256 | rootpath = os.path.join(self.root, 'VOC' + self._year) 257 | name = self.image_set[0][1] 258 | annopath = os.path.join(rootpath, 'Annotations', '{:s}.xml') 259 | imagesetfile = os.path.join(rootpath, 'ImageSets', 'Main', 260 | name + '.txt') 261 | cachedir = os.path.join(self.root, 'annotations_cache') 262 | aps = [] 263 | # The PASCAL VOC metric changed in 2010 264 | use_07_metric = True if int(self._year) < 2010 else False 265 | print('VOC07 metric? 
' + ('Yes' if use_07_metric else 'No')) 266 | if output_dir is not None and not os.path.isdir(output_dir): 267 | os.mkdir(output_dir) 268 | for i, cls in enumerate(VOC_CLASSES): 269 | if cls == '__background__': 270 | continue 271 | 272 | filename = self._get_voc_results_file_template().format(cls) 273 | rec, prec, ap = voc_eval( 274 | filename, 275 | annopath, 276 | imagesetfile, 277 | cls, 278 | cachedir, 279 | ovthresh=0.5, 280 | use_07_metric=use_07_metric) 281 | aps += [ap] 282 | print('AP for {} = {:.4f}'.format(cls, ap)) 283 | if output_dir is not None: 284 | with open(os.path.join(output_dir, cls + '_pr.pkl'), 285 | 'wb') as f: 286 | pickle.dump({'rec': rec, 'prec': prec, 'ap': ap}, f) 287 | print('Mean AP = {:.4f}'.format(np.mean(aps))) 288 | print('~~~~~~~~') 289 | print('Results:') 290 | for ap in aps: 291 | print('{:.3f}'.format(ap)) 292 | print('{:.3f}'.format(np.mean(aps))) 293 | print('~~~~~~~~') 294 | print('') 295 | print('--------------------------------------------------------------') 296 | print('Results computed with the **unofficial** Python eval code.') 297 | print('Results should be very close to the official MATLAB eval code.') 298 | print('Recompute with `./tools/reval.py --matlab ...` for your paper.') 299 | print('-- Thanks, The Management') 300 | print('--------------------------------------------------------------') 301 | 302 | 303 | def detection_collate(batch): 304 | """Custom collate fn for dealing with batches of images that have a different 305 | number of associated object annotations (bounding boxes). 306 | 307 | Arguments: 308 | batch: (tuple) A tuple of tensor images and lists of annotations 309 | 310 | Return: 311 | A tuple containing: 312 | 1) (tensor) batch of images stacked on their 0 dim 313 | 2) (list of tensors) annotations for a given image are stacked on 0 dim 314 | """ 315 | targets = [] 316 | imgs = [] 317 | img_info = [] 318 | for sample in batch: 319 | imgs.append(sample[0]) 320 | targets.append(torch.FloatTensor(sample[1])) 321 | img_info.append(torch.FloatTensor(sample[2])) 322 | return torch.stack(imgs, 0), targets, img_info 323 | -------------------------------------------------------------------------------- /data/voc_eval.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast/er R-CNN 3 | # Licensed under The MIT License [see LICENSE for details] 4 | # Written by Bharath Hariharan 5 | # -------------------------------------------------------- 6 | 7 | import xml.etree.ElementTree as ET 8 | import os 9 | import pickle 10 | import numpy as np 11 | import pdb 12 | import matplotlib 13 | matplotlib.use('Agg') 14 | import matplotlib.pyplot as plt 15 | 16 | 17 | def parse_rec(filename): 18 | """ Parse a PASCAL VOC xml file """ 19 | tree = ET.parse(filename) 20 | objects = [] 21 | for obj in tree.findall('object'): 22 | obj_struct = {} 23 | obj_struct['name'] = obj.find('name').text 24 | obj_struct['pose'] = obj.find('pose').text 25 | obj_struct['truncated'] = int(obj.find('truncated').text) 26 | obj_struct['difficult'] = int(obj.find('difficult').text) 27 | bbox = obj.find('bndbox') 28 | obj_struct['bbox'] = [ 29 | int(bbox.find('xmin').text), 30 | int(bbox.find('ymin').text), 31 | int(bbox.find('xmax').text), 32 | int(bbox.find('ymax').text) 33 | ] 34 | objects.append(obj_struct) 35 | 36 | return objects 37 | 38 | 39 | def voc_ap(rec, prec, use_07_metric=False): 40 | """ ap = voc_ap(rec, prec, [use_07_metric]) 41 | Compute VOC AP given 
precision and recall. 42 | If use_07_metric is true, uses the 43 | VOC 07 11 point method (default:False). 44 | """ 45 | if use_07_metric: 46 | # 11 point metric 47 | ap = 0. 48 | for t in np.arange(0., 1.1, 0.1): 49 | if np.sum(rec >= t) == 0: 50 | p = 0 51 | else: 52 | p = np.max(prec[rec >= t]) 53 | ap = ap + p / 11. 54 | else: 55 | # correct AP calculation 56 | # first append sentinel values at the end 57 | mrec = np.concatenate(([0.], rec, [1.])) 58 | mpre = np.concatenate(([0.], prec, [0.])) 59 | 60 | # compute the precision envelope 61 | for i in range(mpre.size - 1, 0, -1): 62 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 63 | 64 | # to calculate area under PR curve, look for points 65 | # where X axis (recall) changes value 66 | i = np.where(mrec[1:] != mrec[:-1])[0] 67 | 68 | # and sum (\Delta recall) * prec 69 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 70 | return ap 71 | 72 | 73 | def voc_eval(detpath, 74 | annopath, 75 | imagesetfile, 76 | classname, 77 | cachedir, 78 | ovthresh=0.5, 79 | use_07_metric=False): 80 | """rec, prec, ap = voc_eval(detpath, 81 | annopath, 82 | imagesetfile, 83 | classname, 84 | [ovthresh], 85 | [use_07_metric]) 86 | 87 | Top level function that does the PASCAL VOC evaluation. 88 | 89 | detpath: Path to detections 90 | detpath.format(classname) should produce the detection results file. 91 | annopath: Path to annotations 92 | annopath.format(imagename) should be the xml annotations file. 93 | imagesetfile: Text file containing the list of images, one image per line. 94 | classname: Category name (duh) 95 | cachedir: Directory for caching the annotations 96 | [ovthresh]: Overlap threshold (default = 0.5) 97 | [use_07_metric]: Whether to use VOC07's 11 point AP computation 98 | (default False) 99 | """ 100 | # assumes detections are in detpath.format(classname) 101 | # assumes annotations are in annopath.format(imagename) 102 | # assumes imagesetfile is a text file with each line an image name 103 | # cachedir caches the annotations in a pickle file 104 | 105 | # first load gt 106 | if not os.path.isdir(cachedir): 107 | os.mkdir(cachedir) 108 | cachefile = os.path.join(cachedir, 'annots.pkl') 109 | # read list of images 110 | with open(imagesetfile, 'r') as f: 111 | lines = f.readlines() 112 | imagenames = [x.strip() for x in lines] 113 | 114 | if not os.path.isfile(cachefile): 115 | # load annots 116 | recs = {} 117 | for i, imagename in enumerate(imagenames): 118 | recs[imagename] = parse_rec(annopath.format(imagename)) 119 | if i % 100 == 0: 120 | print('Reading annotation for {:d}/{:d}'.format( 121 | i + 1, len(imagenames))) 122 | # save 123 | print('Saving cached annotations to {:s}'.format(cachefile)) 124 | with open(cachefile, 'wb') as f: 125 | pickle.dump(recs, f) 126 | else: 127 | # load 128 | with open(cachefile, 'rb') as f: 129 | recs = pickle.load(f) 130 | 131 | # extract gt objects for this class 132 | class_recs = {} 133 | npos = 0 134 | for imagename in imagenames: 135 | R = [obj for obj in recs[imagename] if obj['name'] == classname] 136 | bbox = np.array([x['bbox'] for x in R]) 137 | difficult = np.array([x['difficult'] for x in R]).astype(np.bool) 138 | det = [False] * len(R) 139 | npos = npos + sum(~difficult) 140 | class_recs[imagename] = { 141 | 'bbox': bbox, 142 | 'difficult': difficult, 143 | 'det': det 144 | } 145 | 146 | # read dets 147 | detfile = detpath.format(classname) 148 | with open(detfile, 'r') as f: 149 | lines = f.readlines() 150 | 151 | splitlines = [x.strip().split(' ') for x in lines] 152 | image_ids 
= [x[0] for x in splitlines] 153 | confidence = np.array([float(x[1]) for x in splitlines]) 154 | BB = np.array([[float(z) for z in x[2:]] for x in splitlines]) 155 | # sort by confidence 156 | sorted_ind = np.argsort(-confidence) 157 | sorted_scores = np.sort(-confidence) 158 | BB = BB[sorted_ind, :] 159 | image_ids = [image_ids[x] for x in sorted_ind] 160 | 161 | # go down dets and mark TPs and FPs 162 | nd = len(image_ids) 163 | tp = np.zeros(nd) 164 | fp = np.zeros(nd) 165 | for d in range(nd): 166 | R = class_recs[image_ids[d]] 167 | bb = BB[d, :].astype(float) 168 | ovmax = -np.inf 169 | BBGT = R['bbox'].astype(float) 170 | 171 | if BBGT.size > 0: 172 | # compute overlaps 173 | # intersection 174 | ixmin = np.maximum(BBGT[:, 0], bb[0]) 175 | iymin = np.maximum(BBGT[:, 1], bb[1]) 176 | ixmax = np.minimum(BBGT[:, 2], bb[2]) 177 | iymax = np.minimum(BBGT[:, 3], bb[3]) 178 | iw = np.maximum(ixmax - ixmin + 1., 0.) 179 | ih = np.maximum(iymax - iymin + 1., 0.) 180 | inters = iw * ih 181 | 182 | # union 183 | uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) + 184 | (BBGT[:, 2] - BBGT[:, 0] + 1.) * 185 | (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters) 186 | 187 | overlaps = inters / uni 188 | ovmax = np.max(overlaps) 189 | jmax = np.argmax(overlaps) 190 | 191 | if ovmax > ovthresh: 192 | if not R['difficult'][jmax]: 193 | if not R['det'][jmax]: 194 | tp[d] = 1. 195 | R['det'][jmax] = 1 196 | else: 197 | fp[d] = 1. 198 | else: 199 | fp[d] = 1. 200 | 201 | # compute precision recall 202 | fp = np.cumsum(fp) 203 | tp = np.cumsum(tp) 204 | rec = tp / float(npos) 205 | # avoid divide by zero in case the first detection matches a difficult 206 | # ground truth 207 | prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) 208 | # if classname == 'person': 209 | final_rec = round(rec[-1], 4) 210 | final_prec = round(prec[-1], 4) 211 | plt_save_path = os.path.join(".", "eval", "pr") 212 | if not os.path.exists(plt_save_path): 213 | os.makedirs(plt_save_path) 214 | plt.plot(rec, prec, 'r') 215 | pr_curl = os.path.join( 216 | plt_save_path, '{}_{}_{}pr.jpg'.format(classname, str(final_prec), 217 | str(final_rec))) 218 | plt.savefig(pr_curl) 219 | plt.close() 220 | ap = voc_ap(rec, prec, use_07_metric) 221 | 222 | return rec, prec, ap 223 | -------------------------------------------------------------------------------- /demo.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | import torch.optim as optim 5 | import torch.backends.cudnn as cudnn 6 | import torch.nn.init as init 7 | import argparse 8 | from torch.autograd import Variable 9 | import torch.utils.data as data 10 | from data import COCODetection, VOCDetection, detection_collate, BaseTransform, preproc 11 | from layers.modules import MultiBoxLoss, HSDMultiBoxLoss 12 | from layers.functions import Detect 13 | from utils.nms_wrapper import nms, soft_nms 14 | from configs.config import cfg, cfg_from_file, VOC_CLASSES, COCO_CLASSES 15 | from utils.box_utils import draw_rects 16 | import numpy as np 17 | import time 18 | import os 19 | import sys 20 | import pickle 21 | import datetime 22 | from models.model_builder import SSD 23 | import yaml 24 | import cv2 25 | 26 | 27 | def arg_parse(): 28 | parser = argparse.ArgumentParser( 29 | description='Single Shot MultiBox Detection') 30 | parser.add_argument( 31 | "--images", 32 | dest='images', 33 | help="Image / Directory containing images to perform detection upon", 34 | default="images", 35 | type=str) 36 | 
parser.add_argument( 37 | '--weights', 38 | default='weights/ssd_darknet_300.pth', 39 | type=str, 40 | help='Trained state_dict file path to open') 41 | parser.add_argument( 42 | '--cfg', 43 | dest='cfg_file', 44 | required=True, 45 | help='Config file for training (and optionally testing)') 46 | parser.add_argument( 47 | '--save_folder', 48 | default='eval/', 49 | type=str, 50 | help='File path to save results') 51 | parser.add_argument( 52 | '--num_workers', 53 | default=8, 54 | type=int, 55 | help='Number of workers used in dataloading') 56 | parser.add_argument( 57 | '--retest', default=False, type=bool, help='test cache results') 58 | args = parser.parse_args() 59 | return args 60 | 61 | 62 | def im_detect(img, net, detector, transform, thresh=0.01): 63 | with torch.no_grad(): 64 | t0 = time.time() 65 | w, h = img.shape[1], img.shape[0] 66 | x = transform(img)[0].unsqueeze(0) 67 | x = x.cuda() 68 | t1 = time.time() 69 | output = net(x) 70 | boxes, scores = detector.forward(output) 71 | t2 = time.time() 72 | max_conf, max_id = scores[0].topk(1, 1, True, True) 73 | pos = max_id > 0 74 | if len(pos) == 0: 75 | return np.empty((0, 6)) 76 | boxes = boxes[0][pos.view(-1, 1).expand(len(pos), 4)].view(-1, 4) 77 | scores = max_conf[pos].view(-1, 1) 78 | max_id = max_id[pos].view(-1, 1) 79 | inds = scores > thresh 80 | if len(inds) == 0: 81 | return np.empty((0, 6)) 82 | boxes = boxes[inds.view(-1, 1).expand(len(inds), 4)].view(-1, 4) 83 | scores = scores[inds].view(-1, 1) 84 | max_id = max_id[inds].view(-1, 1) 85 | c_dets = torch.cat((boxes, scores, max_id.float()), 1).cpu().numpy() 86 | img_classes = np.unique(c_dets[:, -1]) 87 | output = None 88 | flag = False 89 | for cls in img_classes: 90 | cls_mask = np.where(c_dets[:, -1] == cls)[0] 91 | image_pred_class = c_dets[cls_mask, :] 92 | keep = nms(image_pred_class, cfg.TEST.NMS_OVERLAP, force_cpu=True) 93 | keep = keep[:50] 94 | image_pred_class = image_pred_class[keep, :] 95 | if not flag: 96 | output = image_pred_class 97 | flag = True 98 | else: 99 | output = np.concatenate((output, image_pred_class), axis=0) 100 | output[:, 0:2][output[:, 0:2] < 0] = 0 101 | output[:, 2:4][output[:, 2:4] > 1] = 1 102 | scale = np.array([w, h, w, h]) 103 | output[:, :4] = output[:, :4] * scale 104 | t3 = time.time() 105 | print("transform_t:", round(t1 - t0, 3), "detect_time:", 106 | round(t2 - t1, 3), "nms_time:", round(t3 - t2, 3)) 107 | return output 108 | 109 | 110 | def main(): 111 | global args 112 | args = arg_parse() 113 | cfg_from_file(args.cfg_file) 114 | bgr_means = cfg.TRAIN.BGR_MEAN 115 | dataset_name = cfg.DATASETS.DATA_TYPE 116 | batch_size = cfg.TEST.BATCH_SIZE 117 | num_workers = args.num_workers 118 | if cfg.DATASETS.DATA_TYPE == 'VOC': 119 | trainvalDataset = VOCDetection 120 | classes = VOC_CLASSES 121 | top_k = 200 122 | else: 123 | trainvalDataset = COCODetection 124 | classes = COCO_CLASSES 125 | top_k = 300 126 | valSet = cfg.DATASETS.VAL_TYPE 127 | num_classes = cfg.MODEL.NUM_CLASSES 128 | save_folder = args.save_folder 129 | if not os.path.exists(save_folder): 130 | os.mkdir(save_folder) 131 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 132 | cfg.TRAIN.TRAIN_ON = False 133 | net = SSD(cfg) 134 | 135 | checkpoint = torch.load(args.weights) 136 | state_dict = checkpoint['model'] 137 | from collections import OrderedDict 138 | new_state_dict = OrderedDict() 139 | for k, v in state_dict.items(): 140 | head = k[:7] 141 | if head == 'module.': 142 | name = k[7:] # remove `module.` 143 | else: 144 | name = k 145 | 
new_state_dict[name] = v 146 | net.load_state_dict(new_state_dict) 147 | 148 | detector = Detect(cfg) 149 | img_wh = cfg.TEST.INPUT_WH 150 | ValTransform = BaseTransform(img_wh, bgr_means, (2, 0, 1)) 151 | input_folder = args.images 152 | thresh = cfg.TEST.CONFIDENCE_THRESH 153 | for item in os.listdir(input_folder): 154 | img_path = os.path.join(input_folder, item) 155 | print(img_path) 156 | img = cv2.imread(img_path) 157 | dets = im_detect(img, net, detector, ValTransform, thresh) 158 | draw_img = draw_rects(img, dets, classes) 159 | out_img_name = "output_" + item[:-4] + '_hsd'+item[-4:] 160 | save_path = os.path.join(save_folder, out_img_name) 161 | cv2.imwrite(save_path, img) 162 | 163 | 164 | if __name__ == '__main__': 165 | st = time.time() 166 | main() 167 | print("final time", time.time() - st) 168 | -------------------------------------------------------------------------------- /eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | #os.environ["CUDA_VISIBLE_DEVICES"] = "1,0" 3 | import torch 4 | import torch.nn as nn 5 | import torch.optim as optim 6 | import torch.backends.cudnn as cudnn 7 | import torch.nn.init as init 8 | import argparse 9 | from torch.autograd import Variable 10 | import torch.utils.data as data 11 | from data import COCODetection, VOCDetection, detection_collate, BaseTransform, preproc 12 | from layers.modules import MultiBoxLoss, HSDMultiBoxLoss 13 | from layers.functions import Detect 14 | from utils.nms_wrapper import nms, soft_nms 15 | from configs.config import cfg, cfg_from_file 16 | import numpy as np 17 | import time 18 | import os 19 | import sys 20 | import pickle 21 | import datetime 22 | from models.model_builder import SSD 23 | import yaml 24 | 25 | 26 | def arg_parse(): 27 | parser = argparse.ArgumentParser( 28 | description='Hierachical shot detection') 29 | parser.add_argument( 30 | '--weights', 31 | default='weights/hsd_vgg312_coco.pth', 32 | type=str, 33 | help='Trained state_dict file path to open') 34 | parser.add_argument( 35 | '--cfg', 36 | dest='cfg_file', 37 | required=True, 38 | help='Config file for training (and optionally testing)') 39 | parser.add_argument( 40 | '--save_folder', 41 | default='eval/', 42 | type=str, 43 | help='File path to save results') 44 | parser.add_argument( 45 | '--num_workers', 46 | default=8, 47 | type=int, 48 | help='Number of workers used in dataloading') 49 | parser.add_argument( 50 | '--retest', default=False, type=bool, help='test cache results') 51 | args = parser.parse_args() 52 | return args 53 | 54 | 55 | def eval_net(val_dataset, 56 | val_loader, 57 | net, 58 | detector, 59 | cfg, 60 | transform, 61 | max_per_image=300, 62 | thresh=0.01, 63 | batch_size=1): 64 | net.eval() 65 | num_images = len(val_dataset) 66 | num_classes = cfg.MODEL.NUM_CLASSES 67 | eval_save_folder = "./eval/" 68 | if not os.path.exists(eval_save_folder): 69 | os.mkdir(eval_save_folder) 70 | all_boxes = [[[] for _ in range(num_images)] for _ in range(num_classes)] 71 | det_file = os.path.join(eval_save_folder, 'detections.pkl') 72 | 73 | if args.retest: 74 | f = open(det_file, 'rb') 75 | all_boxes = pickle.load(f) 76 | print('Evaluating detections') 77 | val_dataset.evaluate_detections(all_boxes, eval_save_folder) 78 | return 79 | 80 | for idx, (imgs, _, img_info) in enumerate(val_loader): 81 | with torch.no_grad(): 82 | x = imgs 83 | x = x.cuda() 84 | torch.cuda.synchronize() 85 | t1 = time.time() 86 | output = net(x) 87 | torch.cuda.synchronize() 88 | t4 = time.time() 
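# NOTE: Detect.forward() below returns the decoded boxes, shape (batch, num_priors, 4)
# in normalized image coordinates, together with the softmax class scores, shape
# (batch, num_priors, num_classes). The per-image loop that follows rescales the boxes
# with the stored (width, height), then applies per-class score thresholding and NMS,
# keeping at most 50 detections per class.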
89 | boxes, scores = detector.forward(output) 90 | torch.cuda.synchronize() 91 | t2 = time.time() 92 | for k in range(boxes.size(0)): 93 | i = idx * batch_size + k 94 | boxes_ = boxes[k] 95 | scores_ = scores[k] 96 | boxes_ = boxes_.cpu().numpy() 97 | scores_ = scores_.cpu().numpy() 98 | img_wh = img_info[k] 99 | scale = np.array([img_wh[0], img_wh[1], img_wh[0], img_wh[1]]) 100 | boxes_ *= scale 101 | for j in range(1, num_classes): 102 | inds = np.where(scores_[:, j] > thresh)[0] 103 | if len(inds) == 0: 104 | all_boxes[j][i] = np.empty([0, 5], dtype=np.float32) 105 | continue 106 | c_bboxes = boxes_[inds] 107 | c_scores = scores_[inds, j] 108 | c_dets = np.hstack((c_bboxes, 109 | c_scores[:, np.newaxis])).astype( 110 | np.float32, copy=False) 111 | keep = nms(c_dets, cfg.TEST.NMS_OVERLAP, force_cpu=True) 112 | keep = keep[:50] 113 | c_dets = c_dets[keep, :] 114 | all_boxes[j][i] = c_dets 115 | torch.cuda.synchronize() 116 | t3 = time.time() 117 | detect_time = t2 - t1 118 | nms_time = t3 - t2 119 | forward_time = t4 - t1 120 | if idx % 10 == 0: 121 | print('im_detect: {:d}/{:d} {:.3f}s {:.3f}s {:.3f}s'.format( 122 | i + 1, num_images, forward_time, detect_time, nms_time)) 123 | 124 | with open(det_file, 'wb') as f: 125 | pickle.dump(all_boxes, f, pickle.HIGHEST_PROTOCOL) 126 | print('Evaluating detections') 127 | val_dataset.evaluate_detections(all_boxes, eval_save_folder) 128 | print("detect time: ", time.time() - st) 129 | 130 | 131 | def main(): 132 | global args 133 | args = arg_parse() 134 | cfg_from_file(args.cfg_file) 135 | bgr_means = cfg.TRAIN.BGR_MEAN 136 | dataset_name = cfg.DATASETS.DATA_TYPE 137 | batch_size = cfg.TEST.BATCH_SIZE 138 | num_workers = args.num_workers 139 | if cfg.DATASETS.DATA_TYPE == 'VOC': 140 | trainvalDataset = VOCDetection 141 | top_k = 200 142 | else: 143 | trainvalDataset = COCODetection 144 | top_k = 300 145 | dataroot = cfg.DATASETS.DATAROOT 146 | if cfg.MODEL.SIZE == '300': 147 | size_cfg = cfg.SMALL 148 | else: 149 | size_cfg = cfg.BIG 150 | valSet = cfg.DATASETS.VAL_TYPE 151 | num_classes = cfg.MODEL.NUM_CLASSES 152 | save_folder = args.save_folder 153 | if not os.path.exists(save_folder): 154 | os.mkdir(save_folder) 155 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 156 | cfg.TRAIN.TRAIN_ON = False 157 | net = SSD(cfg) 158 | 159 | checkpoint = torch.load(args.weights) 160 | state_dict = checkpoint['model'] 161 | from collections import OrderedDict 162 | new_state_dict = OrderedDict() 163 | for k, v in state_dict.items(): 164 | head = k[:7] 165 | if head == 'module.': 166 | name = k[7:] # remove `module.` 167 | else: 168 | name = k 169 | new_state_dict[name] = v 170 | net.load_state_dict(new_state_dict) 171 | detector = Detect(cfg) 172 | ValTransform = BaseTransform(size_cfg.IMG_WH, bgr_means, (2, 0, 1)) 173 | val_dataset = trainvalDataset(dataroot, valSet, ValTransform, "val") 174 | val_loader = data.DataLoader( 175 | val_dataset, 176 | batch_size, 177 | shuffle=False, 178 | num_workers=num_workers, 179 | collate_fn=detection_collate) 180 | top_k = 300 181 | thresh = cfg.TEST.CONFIDENCE_THRESH 182 | eval_net( 183 | val_dataset, 184 | val_loader, 185 | net, 186 | detector, 187 | cfg, 188 | ValTransform, 189 | top_k, 190 | thresh=thresh, 191 | batch_size=batch_size) 192 | 193 | 194 | if __name__ == '__main__': 195 | st = time.time() 196 | main() 197 | print("final time", time.time() - st) 198 | -------------------------------------------------------------------------------- /hsd.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/JialeCao001/HSD/8abcf78db5f313266a3bb3f85b9424927fe59a2d/hsd.jpg -------------------------------------------------------------------------------- /layers/__init__.py: -------------------------------------------------------------------------------- 1 | from .functions import * 2 | from .modules import * 3 | -------------------------------------------------------------------------------- /layers/functions/__init__.py: -------------------------------------------------------------------------------- 1 | from .detection import Detect 2 | from .prior_box import PriorBox 3 | # from .refine_prior_box import RefinePriorBox 4 | 5 | 6 | __all__ = ['Detect', 'PriorBox'] 7 | -------------------------------------------------------------------------------- /layers/functions/detection.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.backends.cudnn as cudnn 4 | from torch.autograd import Function 5 | from torch.autograd import Variable 6 | import torch.nn.functional as F 7 | from utils.box_utils import decode, center_size 8 | 9 | 10 | class Detect(Function): 11 | """At test time, Detect is the final layer of SSD. Decode location preds, 12 | apply non-maximum suppression to location predictions based on conf 13 | scores and threshold to a top_k number of output predictions for both 14 | confidence score and locations. 15 | """ 16 | 17 | def __init__(self, cfg): 18 | self.cfg = cfg 19 | self.num_classes = cfg.MODEL.NUM_CLASSES 20 | #self.thresh = thresh 21 | self.size = cfg.MODEL.SIZE 22 | if self.size == '300': 23 | size_cfg = cfg.SMALL 24 | else: 25 | size_cfg = cfg.BIG 26 | # Parameters used in nms. 
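# NOTE: `variance` decodes the first regression stage against the prior boxes, while
# `variance1` (half the value) decodes the second, cascaded stage against the boxes
# already refined by the first stage; `object_score` is the objectness threshold below
# which first-stage boxes have their class scores zeroed in forward(). Assuming
# utils.box_utils.decode follows the usual SSD center-offset parameterization, the
# decoding is roughly:
#     cx = p_cx + l_cx * var[0] * p_w        w = p_w * exp(l_w * var[1])
# (and analogously for cy and h).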
27 | self.variance = size_cfg.VARIANCE 28 | self.variance1 = [size_cfg.VARIANCE[0]/2, size_cfg.VARIANCE[1]/2] 29 | self.object_score = cfg.MODEL.OBJECT_SCORE 30 | 31 | def forward(self, predictions): 32 | """ 33 | Args: 34 | loc_data: (tensor) Loc preds from loc layers 35 | Shape: [batch,num_priors*4] 36 | conf_data: (tensor) Shape: Conf preds from conf layers 37 | Shape: [batch*num_priors,num_classes] 38 | prior_data: (tensor) Prior boxes and variances from priorbox layers 39 | Shape: [1,num_priors,4] 40 | """ 41 | # loc, conf, priors = predictions 42 | if self.cfg.MODEL.CASCADE: 43 | arm_loc, arm_conf, loc, conf, priors = predictions 44 | arm_conf = F.softmax(arm_conf.view(-1, 2), 1) 45 | conf = F.softmax(conf.view(-1, self.num_classes), 1) 46 | arm_loc_data = arm_loc.data 47 | arm_conf_data = arm_conf.data 48 | arm_object_conf = arm_conf_data[:, 1:] 49 | no_object_index = arm_object_conf <= self.object_score 50 | # print(torch.sum(no_object_index) / loc.data.size(0), loc.data.size(1)) 51 | conf.data[no_object_index.expand_as(conf.data)] = 0 52 | else: 53 | loc, conf, priors = predictions 54 | conf = F.softmax(conf.view(-1, self.num_classes), 1) 55 | loc_data = loc.data 56 | conf_data = conf.data 57 | # prior_data = priors.data 58 | prior_data = priors[:loc_data.size(1), :] 59 | 60 | num = loc_data.size(0) # batch size 61 | 62 | self.num_priors = prior_data.size(0) 63 | 64 | self.boxes = torch.zeros(num, self.num_priors, 4) 65 | self.scores = torch.zeros(num, self.num_priors, self.num_classes) 66 | conf_preds = conf_data.view(num, self.num_priors, self.num_classes) 67 | batch_prior = prior_data.view(-1, self.num_priors, 4).expand( 68 | (num, self.num_priors, 4)) 69 | batch_prior = batch_prior.contiguous().view(-1, 4) 70 | if self.cfg.MODEL.CASCADE: 71 | default = decode( 72 | arm_loc_data.view(-1, 4), batch_prior, self.variance) 73 | default = center_size(default) 74 | decoded_boxes = decode( 75 | loc_data.view(-1, 4), default, self.variance1) 76 | 77 | else: 78 | decoded_boxes = decode( 79 | loc_data.view(-1, 4), batch_prior, self.variance) 80 | 81 | self.scores = conf_preds.view(num, self.num_priors, self.num_classes) 82 | self.boxes = decoded_boxes.view(num, self.num_priors, 4) 83 | return self.boxes, self.scores 84 | -------------------------------------------------------------------------------- /layers/functions/prior_box.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from math import sqrt as sqrt 3 | from itertools import product as product 4 | 5 | 6 | class PriorBox(object): 7 | """Compute priorbox coordinates in center-offset form for each source 8 | feature map. 9 | Note: 10 | This 'layer' has changed between versions of the original SSD 11 | paper, so we include both versions, but note v2 is the most tested and most 12 | recent version of the paper. 
13 | 14 | """ 15 | 16 | def __init__(self, cfg): 17 | super(PriorBox, self).__init__() 18 | self.size = cfg.MODEL.SIZE 19 | if self.size == '300': 20 | size_cfg = cfg.SMALL 21 | else: 22 | size_cfg = cfg.BIG 23 | self.img_wh = size_cfg.IMG_WH 24 | self.num_priors = len(size_cfg.ASPECT_RATIOS) 25 | self.feature_maps = size_cfg.FEATURE_MAPS 26 | self.variance = size_cfg.VARIANCE or [0.1] 27 | self.min_sizes = size_cfg.MIN_SIZES 28 | self.use_max_sizes = size_cfg.USE_MAX_SIZE 29 | if self.use_max_sizes: 30 | self.max_sizes = size_cfg.MAX_SIZES 31 | self.steps = size_cfg.STEPS 32 | self.aspect_ratios = size_cfg.ASPECT_RATIOS 33 | self.clip = size_cfg.CLIP 34 | for v in self.variance: 35 | if v <= 0: 36 | raise ValueError('Variances must be greater than 0') 37 | 38 | def forward(self): 39 | mean = [] 40 | for k, f in enumerate(self.feature_maps): 41 | grid_h, grid_w = f[1], f[0] 42 | for i in range(grid_h): 43 | for j in range(grid_w): 44 | f_k_h = self.img_wh[1] / self.steps[k][1] 45 | f_k_w = self.img_wh[0] / self.steps[k][0] 46 | # unit center x,y 47 | cx = (j + 0.5) / f_k_w 48 | cy = (i + 0.5) / f_k_h 49 | 50 | # aspect_ratio: 1 51 | # rel size: min_size 52 | s_k_h = self.min_sizes[k] / self.img_wh[1] 53 | s_k_w = self.min_sizes[k] / self.img_wh[0] 54 | mean += [cx, cy, s_k_w, s_k_h] 55 | 56 | # aspect_ratio: 1 57 | # rel size: sqrt(s_k * s_(k+1)) 58 | if self.use_max_sizes: 59 | s_k_prime_w = sqrt( 60 | s_k_w * (self.max_sizes[k] / self.img_wh[0])) 61 | s_k_prime_h = sqrt( 62 | s_k_h * (self.max_sizes[k] / self.img_wh[1])) 63 | mean += [cx, cy, s_k_prime_w, s_k_prime_h] 64 | 65 | for ar in self.aspect_ratios[k]: 66 | mean += [cx, cy, s_k_w * sqrt(ar), s_k_h / sqrt(ar)] 67 | 68 | # back to torch land 69 | output = torch.Tensor(mean).view(-1, 4) 70 | if self.clip: 71 | output.clamp_(max=1, min=0) 72 | # print(output.size()) 73 | return output 74 | -------------------------------------------------------------------------------- /layers/functions/prior_layer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from math import sqrt as sqrt 3 | from math import ceil 4 | import torch.nn as nn 5 | from itertools import product as product 6 | 7 | 8 | class PriorLayer(nn.Module): 9 | def __init__(self, cfg): 10 | super(PriorLayer, self).__init__() 11 | self.size = cfg.MODEL.SIZE 12 | if self.size == '300': 13 | size_cfg = cfg.SMALL 14 | else: 15 | size_cfg = cfg.BIG 16 | self.img_wh = size_cfg.IMG_WH 17 | self.num_priors = len(size_cfg.ASPECT_RATIOS) 18 | self.feature_maps = size_cfg.FEATURE_MAPS 19 | self.variance = size_cfg.VARIANCE or [0.1] 20 | self.min_sizes = size_cfg.MIN_SIZES 21 | self.use_max_sizes = size_cfg.USE_MAX_SIZE 22 | if self.use_max_sizes: 23 | self.max_sizes = size_cfg.MAX_SIZES 24 | self.steps = size_cfg.STEPS 25 | self.aspect_ratios = size_cfg.ASPECT_RATIOS 26 | self.clip = size_cfg.CLIP 27 | for v in self.variance: 28 | if v <= 0: 29 | raise ValueError('Variances must be greater than 0') 30 | 31 | def forward(self, img_wh, feature_maps_wh): 32 | self.img_wh = img_wh 33 | self.feature_maps_wh = feature_maps_wh 34 | mean = [] 35 | for k, f in enumerate(self.feature_maps_wh): 36 | grid_h, grid_w = f[1], f[0] 37 | for i in range(grid_h): 38 | for j in range(grid_w): 39 | f_k_h = self.img_wh[1] / self.steps[k][1] 40 | f_k_w = self.img_wh[0] / self.steps[k][0] 41 | # unit center x,y 42 | cx = (j + 0.5) / f_k_w 43 | cy = (i + 0.5) / f_k_h 44 | 45 | # aspect_ratio: 1 46 | # rel size: min_size 47 | s_k_h = self.min_sizes[k] / 
self.img_wh[1] 48 | s_k_w = self.min_sizes[k] / self.img_wh[0] 49 | mean += [cx, cy, s_k_w, s_k_h] 50 | 51 | # aspect_ratio: 1 52 | # rel size: sqrt(s_k * s_(k+1)) 53 | if self.use_max_sizes: 54 | s_k_prime_w = sqrt( 55 | s_k_w * (self.max_sizes[k] / self.img_wh[0])) 56 | s_k_prime_h = sqrt( 57 | s_k_h * (self.max_sizes[k] / self.img_wh[1])) 58 | mean += [cx, cy, s_k_prime_w, s_k_prime_h] 59 | 60 | for ar in self.aspect_ratios[k]: 61 | mean += [cx, cy, s_k_w * sqrt(ar), s_k_h / sqrt(ar)] 62 | 63 | output = torch.Tensor(mean).view(-1, 4) 64 | if self.clip: 65 | output.clamp_(max=1, min=0) 66 | return output 67 | -------------------------------------------------------------------------------- /layers/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .weight_smooth_l1_loss import WeightSmoothL1Loss 2 | from .weight_softmax_loss import WeightSoftmaxLoss 3 | from .multibox_loss import MultiBoxLoss 4 | from .hsd_multibox_loss import HSDMultiBoxLoss 5 | from .focal_loss_sigmoid import FocalLossSigmoid 6 | from .focal_loss_softmax import FocalLossSoftmax 7 | 8 | __all__ = ['MultiBoxLoss', 'WeightSoftmaxLoss', ] 9 | -------------------------------------------------------------------------------- /layers/modules/focal_loss_sigmoid.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Written by yq_yao 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | 9 | 10 | class FocalLossSigmoid(nn.Module): 11 | ''' 12 | sigmoid version focal loss 13 | ''' 14 | 15 | def __init__(self, alpha=0.25, gamma=2, size_average=False): 16 | super(FocalLossSigmoid, self).__init__() 17 | self.alpha = alpha 18 | self.gamma = gamma 19 | self.size_average = size_average 20 | 21 | def forward(self, inputs, targets): 22 | N = inputs.size(0) 23 | C = inputs.size(1) 24 | P = torch.sigmoid(inputs) 25 | alpha_mask = self.alpha * targets 26 | loss_pos = -1. * torch.pow( 27 | 1 - P, self.gamma) * torch.log(P) * targets * alpha_mask 28 | loss_neg = -1. 
* torch.pow(1 - P, self.gamma) * torch.log(1 - P) * ( 29 | 1 - targets) * (1 - alpha_mask) 30 | batch_loss = loss_neg + loss_pos 31 | if self.size_average: 32 | loss = batch_loss.mean() 33 | else: 34 | loss = batch_loss.sum() 35 | return loss 36 | -------------------------------------------------------------------------------- /layers/modules/focal_loss_softmax.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Written by yq_yao 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | 9 | 10 | class FocalLossSoftmax(nn.Module): 11 | ''' 12 | softmax version focal loss 13 | ''' 14 | 15 | def __init__(self, class_num, alpha=None, gamma=2, size_average=True): 16 | super(FocalLossSoftmax, self).__init__() 17 | if alpha is None: 18 | self.alpha = Variable(torch.ones(class_num, 1)) 19 | else: 20 | if isinstance(alpha, Variable): 21 | self.alpha = alpha 22 | else: 23 | self.alpha = Variable(alpha) 24 | self.gamma = gamma 25 | self.class_num = class_num 26 | self.size_average = size_average 27 | 28 | def forward(self, inputs, targets): 29 | N = inputs.size(0) 30 | C = inputs.size(1) 31 | P = F.softmax(inputs) 32 | 33 | class_mask = inputs.data.new(N, C).fill_(0) 34 | class_mask = Variable(class_mask) 35 | ids = targets.view(-1, 1) 36 | class_mask.scatter_(1, ids.data, 1.) 37 | 38 | if inputs.is_cuda and not self.alpha.is_cuda: 39 | self.alpha = self.alpha.cuda() 40 | alpha = self.alpha[ids.data.view(-1)] 41 | probs = (P * class_mask).sum(1).view(-1, 1) 42 | log_p = probs.log() 43 | batch_loss = -alpha * (torch.pow((1 - probs), self.gamma)) * log_p 44 | 45 | if self.size_average: 46 | loss = batch_loss.mean() 47 | else: 48 | loss = batch_loss.sum() 49 | return loss -------------------------------------------------------------------------------- /layers/modules/hsd_multibox_loss.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Written by yq_yao 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | import numpy as np 8 | from torch.autograd import Variable 9 | from utils.box_utils import match, log_sum_exp, roc_match, hsd_match 10 | from layers.modules import WeightSoftmaxLoss, WeightSmoothL1Loss 11 | GPU = False 12 | if torch.cuda.is_available(): 13 | GPU = True 14 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 15 | 16 | 17 | class HSDMultiBoxLoss(nn.Module): 18 | """SSD Weighted Loss Function 19 | Compute Targets: 20 | 1) Produce Confidence Target Indices by matching ground truth boxes 21 | with (default) 'priorboxes' that have jaccard index > threshold parameter 22 | (default threshold: 0.5). 23 | 2) Produce localization target by 'encoding' variance into offsets of ground 24 | truth boxes and their matched 'priorboxes'. 25 | 3) Hard negative mining to filter the excessive number of negative examples 26 | that comes with using a large number of default bounding boxes. 27 | (default negative:positive ratio 3:1) 28 | Objective Loss: 29 | L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N 30 | Where, Lconf is the CrossEntropy Loss and Lloc is the SmoothL1 Loss 31 | weighted by α which is set to 1 by cross val. 32 | Args: 33 | c: class confidences, 34 | l: predicted boxes, 35 | g: ground truth boxes 36 | N: number of matched default boxes 37 | See: https://arxiv.org/pdf/1512.02325.pdf for more details. 
38 | """ 39 | 40 | def __init__(self, cfg, num_classes): 41 | super(HSDMultiBoxLoss, self).__init__() 42 | self.cfg = cfg 43 | self.size = cfg.MODEL.SIZE 44 | if self.size == '300': 45 | size_cfg = cfg.SMALL 46 | else: 47 | size_cfg = cfg.BIG 48 | self.variance = size_cfg.VARIANCE 49 | self.num_classes = num_classes 50 | self.threshold = cfg.TRAIN.OVERLAP 51 | self.OHEM = cfg.TRAIN.OHEM 52 | self.negpos_ratio = cfg.TRAIN.NEG_RATIO 53 | self.object_score = cfg.MODEL.OBJECT_SCORE 54 | self.variance = size_cfg.VARIANCE 55 | if cfg.TRAIN.FOCAL_LOSS: 56 | if cfg.TRAIN.FOCAL_LOSS_TYPE == 'SOFTMAX': 57 | self.focaloss = FocalLossSoftmax( 58 | self.num_classes, gamma=2, size_average=False) 59 | else: 60 | self.focaloss = FocalLossSigmoid() 61 | 62 | def forward(self, 63 | predictions, 64 | targets, 65 | use_arm=False, 66 | filter_object=False, 67 | debug=False): 68 | """Multibox Loss 69 | Args: 70 | predictions (tuple): A tuple containing loc preds, conf preds, 71 | and prior boxes from SSD net. 72 | conf shape: torch.size(batch_size,num_priors,num_classes) 73 | loc shape: torch.size(batch_size,num_priors,4) 74 | priors shape: torch.size(num_priors,4) 75 | 76 | ground_truth (tensor): Ground truth boxes and labels for a batch, 77 | shape: [batch_size,num_objs,5] (last idx is the label). 78 | """ 79 | # arm_loc_data, arm_conf_data, loc_data, conf_data, priors = predictions 80 | if use_arm: 81 | arm_loc_data, arm_conf_data, loc_data, conf_data, priors = predictions 82 | else: 83 | loc_data, conf_data, _, _, priors = predictions 84 | num = loc_data.size(0) 85 | priors = priors[:loc_data.size(1), :] 86 | num_priors = (priors.size(0)) 87 | num_classes = self.num_classes 88 | 89 | # match priors (default boxes) and ground truth boxes 90 | loc_t = torch.Tensor(num, num_priors, 4) 91 | conf_t = torch.LongTensor(num, num_priors) 92 | conf_t0 = torch.LongTensor(num, num_priors) 93 | defaults = priors.data 94 | for idx in range(num): 95 | truths = targets[idx][:, :-1].data 96 | labels = targets[idx][:, -1].data 97 | if self.num_classes == 2: 98 | labels = labels > 0 99 | if use_arm: 100 | hsd_match(0.5, truths, defaults, self.variance, labels, loc_t, conf_t0, conf_t, idx, 101 | arm_loc_data[idx].data, loc_data[idx].data, use_weight=False) 102 | else: 103 | roc_match(self.threshold, truths, defaults, self.variance, labels, 104 | loc_data[idx].data, loc_t, conf_t0, conf_t, idx) 105 | 106 | loc_t = loc_t.cuda() 107 | conf_t = conf_t.cuda() 108 | conf_t0 = conf_t0.cuda() 109 | 110 | # wrap targets 111 | loc_t = Variable(loc_t, requires_grad=False) 112 | conf_t = Variable(conf_t, requires_grad=False) 113 | conf_t0 = Variable(conf_t0, requires_grad=False) 114 | 115 | # print(self.threshold) 116 | if use_arm and filter_object: 117 | P = F.softmax(arm_conf_data, 2) 118 | arm_conf_data_temp = P[:, :, 1] 119 | object_score_index = arm_conf_data_temp <= self.object_score 120 | pos = conf_t > 0 121 | pos[object_score_index.detach()] = 0 122 | 123 | pos0 = conf_t0 > 0 124 | pos0[object_score_index.detach()] = 0 125 | else: 126 | pos = conf_t > 0 127 | pos0 = conf_t0 > 0 128 | num_pos = pos.sum(1, keepdim=True) 129 | num_pos0 = pos0.sum(1, keepdim=True) 130 | 131 | if self.OHEM: 132 | # Compute max conf across batch for hard negative mining 133 | batch_conf = conf_data.view(-1, self.num_classes) 134 | 135 | loss_c = log_sum_exp(batch_conf) - batch_conf.gather( 136 | 1, conf_t.view(-1, 1)) 137 | 138 | # Hard Negative Mining 139 | loss_c[pos.view(-1, 1)] = 0 # filter out pos boxes for now 140 | loss_c = loss_c.view(num, -1) 
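# NOTE: the two sorts below implement per-image hard negative mining. The first sort
# orders priors by their negative-example loss; sorting the resulting indices again
# gives each prior's rank, so `idx_rank < num_neg` keeps exactly the
# `negpos_ratio * num_pos` hardest negatives. For example, with per-prior losses
# [0.1, 0.9, 0.4] the ranks come out as [2, 0, 1], and num_neg = 2 keeps the two
# highest-loss priors (indices 1 and 2).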
141 | _, loss_idx = loss_c.sort(1, descending=True) 142 | _, idx_rank = loss_idx.sort(1) 143 | num_pos = pos.long().sum(1, keepdim=True) 144 | 145 | if num_pos.data.sum() > 0: 146 | num_neg = torch.clamp( 147 | self.negpos_ratio * num_pos, max=pos.size(1) - 1) 148 | else: 149 | fake_num_pos = torch.ones(32, 1).long() * 15 150 | num_neg = torch.clamp( 151 | self.negpos_ratio * fake_num_pos, max=pos.size(1) - 1) 152 | neg = idx_rank < num_neg.expand_as(idx_rank) 153 | 154 | # Confidence Loss Including Positive and Negative Examples 155 | pos_idx = pos.unsqueeze(2).expand_as(conf_data) 156 | neg_idx = neg.unsqueeze(2).expand_as(conf_data) 157 | conf_p = conf_data[(pos_idx + neg_idx).gt(0)].view( 158 | -1, self.num_classes) 159 | 160 | targets_weighted = conf_t[(pos + neg).gt(0)] 161 | loss_c = F.cross_entropy( 162 | conf_p, targets_weighted, size_average=False) 163 | else: 164 | loss_c = F.cross_entropy(conf_p, conf_t, size_average=False) 165 | 166 | 167 | # Localization Loss (Smooth L1) 168 | # Shape: [batch,num_priors,4] 169 | if num_pos0.data.sum() > 0: 170 | pos_idx = pos0.unsqueeze(pos0.dim()).expand_as(loc_data) 171 | loc_p = loc_data[pos_idx].view(-1, 4) 172 | loc_t = loc_t[pos_idx].view(-1, 4) 173 | loss_l = F.smooth_l1_loss(loc_p, loc_t, size_average=False) 174 | N0 = num_pos0.data.sum() 175 | else: 176 | loss_l = torch.zeros(1) 177 | N0 = 1.0 178 | if num_pos.data.sum() > 0: 179 | N = num_pos.data.sum() 180 | else: 181 | N = 1.0 182 | loss_l /= float(N0) 183 | loss_c /= float(N) 184 | return loss_l, loss_c 185 | -------------------------------------------------------------------------------- /layers/modules/multibox_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | from torch.autograd import Variable 6 | from utils.box_utils import match, roc_match, log_sum_exp 7 | from .focal_loss_softmax import FocalLossSoftmax 8 | from .focal_loss_sigmoid import FocalLossSigmoid 9 | 10 | GPU = False 11 | if torch.cuda.is_available(): 12 | GPU = True 13 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 14 | 15 | 16 | class MultiBoxLoss(nn.Module): 17 | """SSD Weighted Loss Function 18 | Compute Targets: 19 | 1) Produce Confidence Target Indices by matching ground truth boxes 20 | with (default) 'priorboxes' that have jaccard index > threshold parameter 21 | (default threshold: 0.5). 22 | 2) Produce localization target by 'encoding' variance into offsets of ground 23 | truth boxes and their matched 'priorboxes'. 24 | 3) Hard negative mining to filter the excessive number of negative examples 25 | that comes with using a large number of default bounding boxes. 26 | (default negative:positive ratio 3:1) 27 | Objective Loss: 28 | L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N 29 | Where, Lconf is the CrossEntropy Loss and Lloc is the SmoothL1 Loss 30 | weighted by α which is set to 1 by cross val. 31 | Args: 32 | c: class confidences, 33 | l: predicted boxes, 34 | g: ground truth boxes 35 | N: number of matched default boxes 36 | See: https://arxiv.org/pdf/1512.02325.pdf for more details. 
37 | """ 38 | 39 | def __init__(self, cfg): 40 | super(MultiBoxLoss, self).__init__() 41 | self.cfg = cfg 42 | self.size = cfg.MODEL.SIZE 43 | if self.size == '300': 44 | size_cfg = cfg.SMALL 45 | else: 46 | size_cfg = cfg.BIG 47 | self.variance = size_cfg.VARIANCE 48 | self.num_classes = cfg.MODEL.NUM_CLASSES 49 | self.threshold = cfg.TRAIN.OVERLAP 50 | self.OHEM = cfg.TRAIN.OHEM 51 | self.negpos_ratio = cfg.TRAIN.NEG_RATIO 52 | self.variance = size_cfg.VARIANCE 53 | if cfg.TRAIN.FOCAL_LOSS: 54 | if cfg.TRAIN.FOCAL_LOSS_TYPE == 'SOFTMAX': 55 | self.focaloss = FocalLossSoftmax( 56 | self.num_classes, gamma=2, size_average=False) 57 | else: 58 | self.focaloss = FocalLossSigmoid() 59 | 60 | def forward(self, predictions, targets): 61 | """Multibox Loss 62 | Args: 63 | predictions (tuple): A tuple containing loc preds, conf preds, 64 | and prior boxes from SSD net. 65 | conf shape: torch.size(batch_size,num_priors,num_classes) 66 | loc shape: torch.size(batch_size,num_priors,4) 67 | priors shape: torch.size(num_priors,4) 68 | 69 | ground_truth (tensor): Ground truth boxes and labels for a batch, 70 | shape: [batch_size,num_objs,5] (last idx is the label). 71 | """ 72 | loc_data, conf_data, priors = predictions 73 | num = loc_data.size(0) 74 | priors = priors[:loc_data.size(1), :] 75 | num_priors = (priors.size(0)) 76 | num_classes = self.num_classes 77 | loc_t = torch.Tensor(num, num_priors, 4) 78 | conf_t = torch.LongTensor(num, num_priors) 79 | conf_t0 = torch.LongTensor(num, num_priors) 80 | 81 | for idx in range(num): 82 | truths = targets[idx][:, :-1].data 83 | labels = targets[idx][:, -1].data 84 | if self.num_classes == 2: 85 | labels = labels > 0 86 | defaults = priors.data 87 | roc_match(self.threshold, truths, defaults, self.variance, labels, 88 | loc_data[idx].data, loc_t, conf_t0, conf_t, idx) 89 | 90 | loc_t = loc_t.cuda() 91 | conf_t = conf_t.cuda() 92 | conf_t0 = conf_t0.cuda() 93 | 94 | pos = conf_t > 0 95 | num_pos = pos.sum(1, keepdim=True) 96 | 97 | pos0 = conf_t0 > 0 98 | num_pos0 = pos0.sum(1, keepdim=True) 99 | 100 | if self.OHEM: 101 | # Compute max conf across batch for hard negative mining 102 | batch_conf = conf_data.view(-1, self.num_classes) 103 | 104 | loss_hard = log_sum_exp(batch_conf) - batch_conf.gather( 105 | 1, conf_t.view(-1, 1)) 106 | # Hard Negative Mining 107 | loss_hard[pos.view(-1, 1)] = 0 # filter out pos boxes for now 108 | loss_hard = loss_hard.view(num, -1) 109 | _, loss_idx = loss_hard.sort(1, descending=True) 110 | _, idx_rank = loss_idx.sort(1) 111 | num_pos = pos.long().sum(1, keepdim=True) 112 | if num_pos.data.sum() > 0: 113 | num_neg = torch.clamp( 114 | self.negpos_ratio * num_pos, max=pos.size(1) - 1) 115 | else: 116 | fake_num_pos = torch.ones(32, 1).long() * 15 117 | num_neg = torch.clamp( 118 | self.negpos_ratio * fake_num_pos, max=pos.size(1) - 1) 119 | neg = idx_rank < num_neg.expand_as(idx_rank) 120 | 121 | # Confidence Loss Including Positive and Negative Examples 122 | pos_idx = pos.unsqueeze(2).expand_as(conf_data) 123 | neg_idx = neg.unsqueeze(2).expand_as(conf_data) 124 | conf_p = conf_data[(pos_idx + neg_idx).gt(0)].view( 125 | -1, self.num_classes) 126 | targets_weighted = conf_t[(pos + neg).gt(0)] 127 | loss_c = F.cross_entropy( 128 | conf_p, targets_weighted, size_average=False) 129 | else: 130 | loss_c = F.cross_entropy(conf_p, conf_t, size_average=False) 131 | # Localization Loss (Smooth L1) 132 | # Shape: [batch,num_priors,4] 133 | if num_pos0.data.sum() > 0: 134 | pos_idx = pos0.unsqueeze(pos0.dim()).expand_as(loc_data) 
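# NOTE: roc_match() fills two assignments: conf_t0 / pos0 mark anchors that are positive
# against the original default boxes, while conf_t / pos appear to mark positives against
# the refined (regressed) boxes -- the "Default vs refined" print further down reports
# exactly these two counts, N0 and N. The localization loss below is therefore computed
# and normalized over the default matches, whereas the confidence loss above uses the
# refined matches. Also note that the non-OHEM branch above references `conf_p` before it
# is defined, so cfg.TRAIN.OHEM is effectively assumed to be enabled.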
135 | loc_p = loc_data[pos_idx].view(-1, 4) 136 | loc_t = loc_t[pos_idx].view(-1, 4) 137 | loss_l = F.smooth_l1_loss(loc_p, loc_t, size_average=False) 138 | N = num_pos.data.sum() 139 | N0 = num_pos0.data.sum() 140 | if num_pos0.data.sum() <= 0: 141 | N = 1.0 142 | else: 143 | loss_l = torch.zeros(1) 144 | N = 1.0 145 | N0 = 1.0 146 | 147 | print("Default vs refined: ", str(N0), str(N)) 148 | loss_l /= float(N0) 149 | loss_c /= float(N) 150 | 151 | return loss_l, loss_c -------------------------------------------------------------------------------- /layers/modules/weight_smooth_l1_loss.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Written by yq_yao 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | 9 | 10 | class WeightSmoothL1Loss(nn.Module): 11 | def __init__(self, class_num, size_average=False): 12 | super(WeightSmoothL1Loss, self).__init__() 13 | self.class_num = class_num 14 | self.size_average = size_average 15 | 16 | def forward(self, inputs, targets, weights): 17 | N = inputs.size(0) 18 | loc_num = inputs.size(1) 19 | abs_out = torch.abs(inputs - targets) 20 | 21 | if inputs.is_cuda and not weights.is_cuda: 22 | weights = weights.cuda() 23 | 24 | weights = weights.view(-1, 1) 25 | 26 | weights = torch.cat((weights, weights, weights, weights), 1) 27 | mask_big = abs_out >= 1. 28 | mask_small = abs_out < 1. 29 | loss_big = weights[mask_big] * (abs_out[mask_big] - 0.5) 30 | loss_small = weights[mask_small] * 0.5 * torch.pow( 31 | abs_out[mask_small], 2) 32 | loss_sum = loss_big.sum() + loss_small.sum() 33 | 34 | if self.size_average: 35 | loss = loss_sum / N * loc_num 36 | else: 37 | loss = loss_sum 38 | return loss 39 | -------------------------------------------------------------------------------- /layers/modules/weight_softmax_loss.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Written by yq_yao 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | 9 | 10 | class WeightSoftmaxLoss(nn.Module): 11 | def __init__(self, class_num, gamma=2, size_average=True): 12 | super(WeightSoftmaxLoss, self).__init__() 13 | # if isinstance(weights, Variable): 14 | # self.weights = weights 15 | # else: 16 | # self.weights = Variable(weights) 17 | 18 | self.class_num = class_num 19 | self.gamma = gamma 20 | self.size_average = size_average 21 | 22 | def forward(self, inputs, targets, weights): 23 | N = inputs.size(0) 24 | C = inputs.size(1) 25 | P = F.softmax(inputs) 26 | 27 | class_mask = inputs.data.new(N, C).fill_(0) 28 | class_mask = Variable(class_mask) 29 | ids = targets.view(-1, 1) 30 | class_mask.scatter_(1, ids.data, 1.) 
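# NOTE: class_mask is now a one-hot encoding of the targets, so (P * class_mask).sum(1)
# below extracts the predicted probability of the ground-truth class for each sample;
# the result is a per-sample weighted negative log-likelihood, i.e.
#     loss_i = -w_i * log P_i[y_i]
# summed (or averaged, if size_average is set) over the batch.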
31 | if inputs.is_cuda and not weights.is_cuda: 32 | weights = weights.cuda() 33 | probs = (P * class_mask).sum(1).view(-1, 1) 34 | 35 | log_p = probs.log() 36 | weights = weights.view(-1, 1) 37 | batch_loss = -weights * log_p 38 | 39 | if self.size_average: 40 | loss = batch_loss.mean() 41 | else: 42 | loss = batch_loss.sum() 43 | return loss -------------------------------------------------------------------------------- /make.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | cd ./utils/ 3 | 4 | CUDA_PATH=/usr/local/cuda/ 5 | 6 | python build.py build_ext --inplace 7 | # if you use anaconda3 maybe you need add this 8 | # change code like https://github.com/rbgirshick/py-faster-rcnn/issues/706 9 | #mv nms/cpu_nms.cpython-36m-x86_64-linux-gnu.so nms/cpu_nms.so 10 | #mv nms/gpu_nms.cpython-36m-x86_64-linux-gnu.so nms/gpu_nms.so 11 | cd .. 12 | -------------------------------------------------------------------------------- /models/attention.py: -------------------------------------------------------------------------------- 1 | ########################################################################### 2 | # Created by: CASIA IVA 3 | # Email: jliu@nlpr.ia.ac.cn 4 | # Copyright (c) 2018 5 | ########################################################################### 6 | 7 | import numpy as np 8 | import torch 9 | import math 10 | from torch.nn import Module, Sequential, Conv2d, ReLU,AdaptiveMaxPool2d, AdaptiveAvgPool2d, \ 11 | NLLLoss, BCELoss, CrossEntropyLoss, AvgPool2d, MaxPool2d, Parameter, Linear, Sigmoid, Softmax, Dropout, Embedding 12 | from torch.nn import functional as F 13 | from torch.autograd import Variable 14 | torch_ver = torch.__version__[:3] 15 | 16 | __all__ = ['PAM_Module', 'CAM_Module'] 17 | 18 | 19 | class PAM_Module(Module): 20 | """ Position attention module""" 21 | #Ref from SAGAN 22 | def __init__(self, in_dim): 23 | super(PAM_Module, self).__init__() 24 | self.chanel_in = in_dim 25 | 26 | self.query_conv = Conv2d(in_channels=in_dim, out_channels=in_dim//4, kernel_size=1) 27 | self.key_conv = Conv2d(in_channels=in_dim, out_channels=in_dim//4, kernel_size=1) 28 | self.value_conv = Conv2d(in_channels=in_dim, out_channels=in_dim, kernel_size=1) 29 | self.gamma = Parameter(torch.zeros(1)) 30 | 31 | self.softmax = Softmax(dim=-1) 32 | def forward(self, x): 33 | """ 34 | inputs : 35 | x : input feature maps( B X C X H X W) 36 | returns : 37 | out : attention value + input feature 38 | attention: B X (HxW) X (HxW) 39 | """ 40 | m_batchsize, C, height, width = x.size() 41 | proj_query = self.query_conv(x).view(m_batchsize, -1, width*height).permute(0, 2, 1) 42 | proj_key = self.key_conv(x).view(m_batchsize, -1, width*height) 43 | energy = torch.bmm(proj_query, proj_key) 44 | attention = self.softmax(energy) 45 | proj_value = self.value_conv(x).view(m_batchsize, -1, width*height) 46 | 47 | out = torch.bmm(proj_value, attention.permute(0, 2, 1)) 48 | out = out.view(m_batchsize, C, height, width) 49 | 50 | 51 | out = self.gamma*out + x 52 | return out -------------------------------------------------------------------------------- /models/deform/.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | **/*.pyc 3 | **/_ext 4 | **/build 5 | **/dist 6 | **/*.egg-info 7 | **/.eggs 8 | .clang_complete 9 | *.o 10 | -------------------------------------------------------------------------------- /models/deform/README.md: 
-------------------------------------------------------------------------------- 1 | # Deformable Convolutional Networks in PyTorch 2 | This repo is an implementation of [Deformable Convolution](https://arxiv.org/abs/1703.06211). 3 | Ported from author's MXNet [implementation](https://github.com/msracver/Deformable-ConvNets). 4 | 5 | # Build 6 | 7 | ``` 8 | sh make.sh 9 | CC=g++ python build.py 10 | ``` 11 | 12 | See `test.py` for example usage. 13 | 14 | ### Notice 15 | Only `torch.cuda.FloatTensor` is supported. 16 | -------------------------------------------------------------------------------- /models/deform/build.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from torch.utils.ffi import create_extension 4 | 5 | this_file = os.path.dirname(__file__) 6 | 7 | sources = ['src/deform_conv.c'] 8 | headers = ['src/deform_conv.h'] 9 | defines = [] 10 | with_cuda = False 11 | 12 | if torch.cuda.is_available(): 13 | print('Including CUDA code.') 14 | sources += ['src/deform_conv_cuda.c'] 15 | headers += ['src/deform_conv_cuda.h'] 16 | defines += [('WITH_CUDA', None)] 17 | with_cuda = True 18 | 19 | this_file = os.path.dirname(os.path.realpath(__file__)) 20 | print(this_file) 21 | extra_objects = ['src/deform_conv_cuda_kernel.cu.o'] 22 | extra_objects = [os.path.join(this_file, fname) for fname in extra_objects] 23 | 24 | ffi = create_extension( 25 | '_ext.deform_conv', 26 | headers=headers, 27 | sources=sources, 28 | define_macros=defines, 29 | relative_to=__file__, 30 | with_cuda=with_cuda, 31 | extra_objects=extra_objects 32 | ) 33 | 34 | if __name__ == '__main__': 35 | assert torch.cuda.is_available(), 'Please install CUDA for GPU support.' 36 | ffi.build() 37 | -------------------------------------------------------------------------------- /models/deform/functions/__init__.py: -------------------------------------------------------------------------------- 1 | from .deform_conv import conv_offset2d 2 | -------------------------------------------------------------------------------- /models/deform/functions/deform_conv.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Function 3 | from torch.nn.modules.utils import _pair 4 | 5 | from .._ext import deform_conv 6 | 7 | 8 | def conv_offset2d(input, 9 | offset, 10 | weight, 11 | stride=1, 12 | padding=0, 13 | dilation=1, 14 | deform_groups=1): 15 | 16 | if input is not None and input.dim() != 4: 17 | raise ValueError( 18 | "Expected 4D tensor as input, got {}D tensor instead.".format( 19 | input.dim())) 20 | 21 | f = ConvOffset2dFunction( 22 | _pair(stride), _pair(padding), _pair(dilation), deform_groups) 23 | return f(input, offset, weight) 24 | 25 | 26 | class ConvOffset2dFunction(Function): 27 | def __init__(self, stride, padding, dilation, deformable_groups=1): 28 | super(ConvOffset2dFunction, self).__init__() 29 | self.stride = stride 30 | self.padding = padding 31 | self.dilation = dilation 32 | self.deformable_groups = deformable_groups 33 | 34 | def forward(self, input, offset, weight): 35 | self.save_for_backward(input, offset, weight) 36 | 37 | output = input.new(*self._output_size(input, weight)) 38 | 39 | self.bufs_ = [input.new(), input.new()] # columns, ones 40 | 41 | if not input.is_cuda: 42 | raise NotImplementedError 43 | else: 44 | if isinstance(input, torch.autograd.Variable): 45 | if not isinstance(input.data, torch.cuda.FloatTensor): 46 | raise NotImplementedError 47 | 
else: 48 | if not isinstance(input, torch.cuda.FloatTensor): 49 | raise NotImplementedError 50 | deform_conv.deform_conv_forward_cuda( 51 | input, weight, offset, output, self.bufs_[0], self.bufs_[1], 52 | weight.size(3), weight.size(2), self.stride[1], self.stride[0], 53 | self.padding[1], self.padding[0], self.dilation[1], 54 | self.dilation[0], self.deformable_groups) 55 | return output 56 | 57 | def backward(self, grad_output): 58 | input, offset, weight = self.saved_tensors 59 | 60 | grad_input = grad_offset = grad_weight = None 61 | 62 | if not grad_output.is_cuda: 63 | raise NotImplementedError 64 | else: 65 | if isinstance(grad_output, torch.autograd.Variable): 66 | if not isinstance(grad_output.data, torch.cuda.FloatTensor): 67 | raise NotImplementedError 68 | else: 69 | if not isinstance(grad_output, torch.cuda.FloatTensor): 70 | raise NotImplementedError 71 | if self.needs_input_grad[0] or self.needs_input_grad[1]: 72 | grad_input = input.new(*input.size()).zero_() 73 | grad_offset = offset.new(*offset.size()).zero_() 74 | deform_conv.deform_conv_backward_input_cuda( 75 | input, offset, grad_output, grad_input, 76 | grad_offset, weight, self.bufs_[0], weight.size(3), 77 | weight.size(2), self.stride[1], self.stride[0], 78 | self.padding[1], self.padding[0], self.dilation[1], 79 | self.dilation[0], self.deformable_groups) 80 | 81 | if self.needs_input_grad[2]: 82 | grad_weight = weight.new(*weight.size()).zero_() 83 | deform_conv.deform_conv_backward_parameters_cuda( 84 | input, offset, grad_output, 85 | grad_weight, self.bufs_[0], self.bufs_[1], weight.size(3), 86 | weight.size(2), self.stride[1], self.stride[0], 87 | self.padding[1], self.padding[0], self.dilation[1], 88 | self.dilation[0], self.deformable_groups, 1) 89 | 90 | return grad_input, grad_offset, grad_weight 91 | 92 | def _output_size(self, input, weight): 93 | channels = weight.size(0) 94 | 95 | output_size = (input.size(0), channels) 96 | for d in range(input.dim() - 2): 97 | in_size = input.size(d + 2) 98 | pad = self.padding[d] 99 | kernel = self.dilation[d] * (weight.size(d + 2) - 1) + 1 100 | stride = self.stride[d] 101 | output_size += ((in_size + (2 * pad) - kernel) // stride + 1, ) 102 | if not all(map(lambda s: s > 0, output_size)): 103 | raise ValueError( 104 | "convolution input is too small (output would be {})".format( 105 | 'x'.join(map(str, output_size)))) 106 | return output_size 107 | -------------------------------------------------------------------------------- /models/deform/make.sh: -------------------------------------------------------------------------------- 1 | cd src 2 | nvcc -c -o deform_conv_cuda_kernel.cu.o deform_conv_cuda_kernel.cu -x cu -Xcompiler -fPIC -std=c++11 3 | -------------------------------------------------------------------------------- /models/deform/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .deform_conv import ConvOffset2d 2 | -------------------------------------------------------------------------------- /models/deform/modules/deform_conv.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | import torch.nn as nn 5 | from torch.nn.modules.module import Module 6 | from torch.nn.modules.utils import _pair 7 | from ..functions import conv_offset2d 8 | 9 | 10 | class ConvOffset2d(Module): 11 | def __init__(self, 12 | in_channels, 13 | out_channels, 14 | kernel_size, 15 | stride=1, 16 | padding=0, 17 | dilation=1, 18 | 
num_deformable_groups=1): 19 | super(ConvOffset2d, self).__init__() 20 | self.in_channels = in_channels 21 | self.out_channels = out_channels 22 | self.kernel_size = _pair(kernel_size) 23 | self.stride = _pair(stride) 24 | self.padding = _pair(padding) 25 | self.dilation = _pair(dilation) 26 | self.num_deformable_groups = num_deformable_groups 27 | 28 | self.weight = nn.Parameter( 29 | torch.Tensor(out_channels, in_channels, *self.kernel_size)) 30 | 31 | self.reset_parameters() 32 | 33 | def reset_parameters(self): 34 | n = self.in_channels 35 | for k in self.kernel_size: 36 | n *= k 37 | stdv = 1. / math.sqrt(n) 38 | self.weight.data.uniform_(-stdv, stdv) 39 | 40 | def forward(self, input, offset): 41 | return conv_offset2d(input, offset, self.weight, self.stride, 42 | self.padding, self.dilation, 43 | self.num_deformable_groups) 44 | -------------------------------------------------------------------------------- /models/deform/src/deform_conv.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | int deform_conv_forward(THFloatTensor *input, THFloatTensor *offset, 4 | THFloatTensor *output) 5 | { 6 | // if (!THFloatTensor_isSameSizeAs(input1, input2)) 7 | // return 0; 8 | // THFloatTensor_resizeAs(output, input); 9 | // THFloatTensor_cadd(output, input1, 1.0, input2); 10 | return 1; 11 | } 12 | 13 | int deform_conv_backward(THFloatTensor *grad_output, THFloatTensor *grad_input, 14 | THFloatTensor *grad_offset) 15 | { 16 | // THFloatTensor_resizeAs(grad_input, grad_output); 17 | // THFloatTensor_fill(grad_input, 1); 18 | return 1; 19 | } 20 | -------------------------------------------------------------------------------- /models/deform/src/deform_conv.h: -------------------------------------------------------------------------------- 1 | int deform_conv_forward(THFloatTensor *input, THFloatTensor *offset, 2 | THFloatTensor *output); 3 | int deform_conv_backward(THFloatTensor *grad_output, THFloatTensor *grad_input, 4 | THFloatTensor *grad_offset); 5 | -------------------------------------------------------------------------------- /models/deform/src/deform_conv_cuda.h: -------------------------------------------------------------------------------- 1 | int deform_conv_forward_cuda(THCudaTensor *input, 2 | THCudaTensor *weight, /*THCudaTensor * bias, */ 3 | THCudaTensor *offset, THCudaTensor *output, 4 | THCudaTensor *columns, THCudaTensor *ones, int kW, 5 | int kH, int dW, int dH, int padW, int padH, 6 | int dilationH, int dilationW, 7 | int deformable_group); 8 | 9 | int deform_conv_backward_input_cuda( 10 | THCudaTensor *input, THCudaTensor *offset, THCudaTensor *gradOutput, 11 | THCudaTensor *gradInput, THCudaTensor *gradOffset, THCudaTensor *weight, 12 | THCudaTensor *columns, int kW, int kH, int dW, int dH, int padW, int padH, 13 | int dilationH, int dilationW, int deformable_group); 14 | 15 | int deform_conv_backward_parameters_cuda( 16 | THCudaTensor *input, THCudaTensor *offset, THCudaTensor *gradOutput, 17 | THCudaTensor *gradWeight, /*THCudaTensor *gradBias, */ 18 | THCudaTensor *columns, THCudaTensor *ones, int kW, int kH, int dW, int dH, 19 | int padW, int padH, int dilationH, int dilationW, int deformable_group, 20 | float scale); 21 | -------------------------------------------------------------------------------- /models/deform/src/deform_conv_cuda_kernel.h: -------------------------------------------------------------------------------- 1 | template 2 | void deformable_im2col(cudaStream_t stream, const DType *data_im, 3 | 
const DType *data_offset, const int channels, 4 | const int height, const int width, const int ksize_h, 5 | const int ksize_w, const int pad_h, const int pad_w, 6 | const int stride_h, const int stride_w, 7 | const int dilation_h, const int dilation_w, 8 | const int deformable_group, DType *data_col); 9 | 10 | template 11 | void deformable_col2im(cudaStream_t stream, const DType *data_col, 12 | const DType *data_offset, const int channels, 13 | const int height, const int width, const int ksize_h, 14 | const int ksize_w, const int pad_h, const int pad_w, 15 | const int stride_h, const int stride_w, 16 | const int dilation_h, const int dilation_w, 17 | const int deformable_group, DType *grad_im); 18 | 19 | template 20 | void deformable_col2im_coord(cudaStream_t stream, const DType *data_col, 21 | const DType *data_im, const DType *data_offset, 22 | const int channels, const int height, 23 | const int width, const int ksize_h, 24 | const int ksize_w, const int pad_h, 25 | const int pad_w, const int stride_h, 26 | const int stride_w, const int dilation_h, 27 | const int dilation_w, const int deformable_group, 28 | DType *grad_offset); 29 | -------------------------------------------------------------------------------- /models/hsd_res.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Written by yq_yao 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | from models.attention import PAM_Module 9 | from models.model_helper import weights_init 10 | 11 | 12 | def conv3x3(in_planes, out_planes, stride=1): 13 | "3x3 convolution with padding" 14 | return nn.Conv2d( 15 | in_planes, 16 | out_planes, 17 | kernel_size=3, 18 | stride=stride, 19 | padding=1, 20 | bias=False) 21 | 22 | 23 | class BasicBlock(nn.Module): 24 | expansion = 1 25 | 26 | def __init__(self, inplanes, planes, stride=1, downsample=None): 27 | super(BasicBlock, self).__init__() 28 | self.conv1 = conv3x3(inplanes, planes, stride) 29 | self.bn1 = nn.BatchNorm2d(planes) 30 | self.relu = nn.ReLU(inplace=True) 31 | self.conv2 = conv3x3(planes, planes) 32 | self.bn2 = nn.BatchNorm2d(planes) 33 | self.downsample = downsample 34 | self.stride = stride 35 | 36 | def forward(self, x): 37 | residual = x 38 | 39 | out = self.conv1(x) 40 | out = self.bn1(out) 41 | out = self.relu(out) 42 | 43 | out = self.conv2(out) 44 | out = self.bn2(out) 45 | 46 | if self.downsample is not None: 47 | residual = self.downsample(x) 48 | out += residual 49 | out = self.relu(out) 50 | 51 | return out 52 | 53 | 54 | class Bottleneck(nn.Module): 55 | expansion = 4 56 | 57 | def __init__(self, inplanes, planes, stride=1, downsample=None): 58 | super(Bottleneck, self).__init__() 59 | self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) 60 | self.bn1 = nn.BatchNorm2d(planes) 61 | self.conv2 = nn.Conv2d( 62 | planes, 63 | planes, 64 | kernel_size=3, 65 | stride=stride, 66 | padding=1, 67 | bias=False) 68 | self.bn2 = nn.BatchNorm2d(planes) 69 | self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) 70 | self.bn3 = nn.BatchNorm2d(planes * 4) 71 | self.relu = nn.ReLU(inplace=True) 72 | self.downsample = downsample 73 | self.stride = stride 74 | 75 | def forward(self, x): 76 | residual = x 77 | 78 | out = self.conv1(x) 79 | out = self.bn1(out) 80 | out = self.relu(out) 81 | 82 | out = self.conv2(out) 83 | out = self.bn2(out) 84 | out = self.relu(out) 85 | 86 | out = self.conv3(out) 87 | out = 
self.bn3(out) 88 | 89 | if self.downsample is not None: 90 | residual = self.downsample(x) 91 | 92 | out += residual 93 | out = self.relu(out) 94 | 95 | return out 96 | class BasicConv(nn.Module): 97 | 98 | def __init__(self, in_planes, out_planes, kernel_size, stride=1, padding=0, dilation=1, groups=1, relu=True, bn=True, bias=False): 99 | super(BasicConv, self).__init__() 100 | self.out_channels = out_planes 101 | self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, groups=groups, bias=bias) 102 | self.bn = nn.BatchNorm2d(out_planes,eps=1e-5, momentum=0.01, affine=True) if bn else None 103 | self.relu = nn.ReLU(inplace=True) if relu else None 104 | 105 | def forward(self, x): 106 | x = self.conv(x) 107 | if self.bn is not None: 108 | x = self.bn(x) 109 | if self.relu is not None: 110 | x = self.relu(x) 111 | return x 112 | class FEModule(nn.Module): 113 | def __init__(self, in_channels, out_channels, stride=1, norm_layer=nn.BatchNorm2d): 114 | super(FEModule, self).__init__() 115 | self.out_channels = out_channels 116 | inter_channels = in_channels // 4 117 | self.brancha = nn.Sequential(nn.Conv2d(in_channels, inter_channels, 3, padding=1, bias=False), 118 | norm_layer(inter_channels), 119 | nn.ReLU()) 120 | self.sa = PAM_Module(inter_channels) 121 | self.brancha1 = nn.Sequential(nn.Conv2d(inter_channels, inter_channels, 3, padding=1, bias=False), 122 | norm_layer(inter_channels), 123 | nn.ReLU()) 124 | 125 | self.sl = nn.Sequential( 126 | BasicConv(in_channels, inter_channels, kernel_size=1, stride=1), 127 | BasicConv(inter_channels, inter_channels, kernel_size=3, stride=1, padding=1) 128 | ) 129 | # self.sl = BasicConv(in_channels, inter_channels+inter_channels, kernel_size=3, padding=1, stride=1) 130 | self.sn = nn.Sequential( 131 | BasicConv(in_channels, inter_channels, kernel_size=1, stride=1), 132 | BasicConv(inter_channels, inter_channels, kernel_size=3, stride=1, padding=3, dilation=3) 133 | ) 134 | self.fuse = nn.Sequential(nn.Dropout2d(0.1, False), 135 | nn.Conv2d(inter_channels + inter_channels + inter_channels, out_channels, 136 | kernel_size=3, stride=stride, padding=1, bias=False), 137 | norm_layer(out_channels), 138 | nn.ReLU()) 139 | 140 | 141 | 142 | def forward(self, x): 143 | sa_feat = self.sa(self.brancha(x)) 144 | sa_conv = self.brancha1(sa_feat) 145 | sl_output = self.sl(x) 146 | sn_output = self.sn(x) 147 | feat_cat = torch.cat([sa_conv, sl_output, sn_output], dim=1) 148 | output = self.fuse(feat_cat) 149 | return output 150 | # 151 | def trans_head(): 152 | arm_trans = [] 153 | arm_trans += [BasicConv(512, 256, kernel_size=3, stride=1, padding=1)] 154 | arm_trans += [BasicConv(1024, 256, kernel_size=3, stride=1, padding=1)] 155 | arm_trans += [BasicConv(2048, 256, kernel_size=3, stride=1, padding=1)] 156 | arm_trans += [BasicConv(2048, 256, kernel_size=3, stride=1, padding=1)] 157 | 158 | orm_trans = [] 159 | orm_trans += [BasicConv(256, 256, kernel_size=3, stride=1, padding=1)] 160 | orm_trans += [BasicConv(256, 256, kernel_size=3, stride=1, padding=1)] 161 | orm_trans += [BasicConv(256, 256, kernel_size=3, stride=1, padding=1)] 162 | orm_trans += [BasicConv(256, 256, kernel_size=3, stride=1, padding=1)] 163 | 164 | return arm_trans, orm_trans 165 | 166 | class HSDResnet(nn.Module): 167 | def __init__(self, block, num_blocks, size): 168 | super(HSDResnet, self).__init__() 169 | self.inplanes = 64 170 | 171 | self.conv1 = nn.Conv2d( 172 | 3, 64, kernel_size=7, stride=2, padding=3, bias=False) 173 
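FEModule above fuses three parallel views of its input: a PAM_Module self-attention branch, a local branch (sl), and a dilation-3 context branch (sn), each reduced to in_channels // 4 before a 3x3 fuse conv. A simplified, self-contained sketch of the channel bookkeeping (the attention branch is replaced by a plain conv here, and sl is collapsed to a single 3x3, so the snippet runs without models/attention.py):

import torch
import torch.nn as nn

in_ch, out_ch = 512, 256
inter = in_ch // 4                                       # 128 per branch
brancha = nn.Conv2d(in_ch, inter, 3, padding=1)          # stand-in for the PAM_Module branch
sl = nn.Conv2d(in_ch, inter, 3, padding=1)               # local branch (1x1 then 3x3 in the repo)
sn = nn.Conv2d(in_ch, inter, 3, padding=3, dilation=3)   # dilated context branch
fuse = nn.Conv2d(3 * inter, out_ch, 3, padding=1)

x = torch.randn(1, in_ch, 40, 40)                        # e.g. upsampled c4 concatenated with c3
y = fuse(torch.cat([brancha(x), sl(x), sn(x)], dim=1))
print(y.shape)                                           # torch.Size([1, 256, 40, 40])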
| self.bn1 = nn.BatchNorm2d(64) 174 | # Bottom-up layers 175 | self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1) 176 | self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2) 177 | self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2) 178 | self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2) 179 | self.extras0 = self._make_layer(block, 512, 2, stride=2) 180 | 181 | self.fe1 = FEModule(512,256) 182 | self.fe2 = FEModule(512,256) 183 | self.fe3 = FEModule(512,256) 184 | self.arm_trans = nn.ModuleList(trans_head()[0]) 185 | self.orm_trans = nn.ModuleList(trans_head()[1]) 186 | 187 | self._init_modules() 188 | 189 | def _make_layer(self, block, planes, blocks, stride=1): 190 | downsample = None 191 | if stride != 1 or self.inplanes != planes * block.expansion: 192 | downsample = nn.Sequential( 193 | nn.Conv2d( 194 | self.inplanes, 195 | planes * block.expansion, 196 | kernel_size=1, 197 | stride=stride, 198 | bias=False), 199 | nn.BatchNorm2d(planes * block.expansion), 200 | ) 201 | 202 | layers = [] 203 | layers.append(block(self.inplanes, planes, stride, downsample)) 204 | self.inplanes = planes * block.expansion 205 | for i in range(1, blocks): 206 | layers.append(block(self.inplanes, planes)) 207 | 208 | return nn.Sequential(*layers) 209 | 210 | def _init_modules(self): 211 | self.extras0.apply(weights_init) 212 | self.arm_trans.apply(weights_init) 213 | self.orm_trans.apply(weights_init) 214 | self.fe1.apply(weights_init) 215 | self.fe2.apply(weights_init) 216 | self.fe3.apply(weights_init) 217 | 218 | def forward(self, x): 219 | c1 = F.relu(self.bn1(self.conv1(x))) 220 | c1 = F.max_pool2d(c1, kernel_size=3, stride=2, padding=1) 221 | c2 = self.layer1(c1) 222 | c3 = self.layer2(c2) 223 | c4 = self.layer3(c3) 224 | c5 = self.layer4(c4) 225 | c6 = self.extras0(c5) 226 | 227 | c3_0 = self.arm_trans[0](c3) 228 | c4_0 = self.arm_trans[1](c4) 229 | c5_0 = self.arm_trans[2](c5) 230 | c6_0 = self.arm_trans[3](c6) 231 | 232 | arm_sources = [c3_0, c4_0, c5_0, c6_0] 233 | 234 | odm_sources = [] 235 | up = F.upsample(arm_sources[1], size=arm_sources[0].size()[2:], mode='bilinear') 236 | odm_sources.append(self.fe1(torch.cat([up, arm_sources[0]], dim = 1))) 237 | up = F.upsample(arm_sources[2], size=arm_sources[1].size()[2:], mode='bilinear') 238 | odm_sources.append(self.fe2(torch.cat([up, arm_sources[1]], dim=1))) 239 | up = F.upsample(arm_sources[3], size=arm_sources[2].size()[2:], mode='bilinear') 240 | odm_sources.append(self.fe3(torch.cat([up, arm_sources[2]], dim=1))) 241 | odm_sources.append(self.orm_trans[3](arm_sources[3])) 242 | 243 | return arm_sources, odm_sources 244 | 245 | 246 | def HSDResnet50(size): 247 | return HSDResnet(Bottleneck, [3, 4, 6, 3], size) 248 | 249 | 250 | def HSDResnet101(size): 251 | return HSDResnet(Bottleneck, [3, 4, 23, 3], size) 252 | 253 | 254 | def HSDResnet152(size): 255 | return HSDResnet(Bottleneck, [3, 8, 36, 3], size) 256 | -------------------------------------------------------------------------------- /models/hsd_vgg.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Written by yq_yao 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | import torch.nn.init as init 9 | from models.model_helper import weights_init 10 | from models.attention import PAM_Module 11 | 12 | 13 | class L2Norm(nn.Module): 14 | def __init__(self, n_channels, scale): 15 | 
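HSDResnet.forward above builds the second-shot (odm) features top-down: each coarser arm feature is upsampled to the size of the finer one, concatenated with it (256 + 256 = 512 channels), and passed through an FEModule back down to 256 channels. F.upsample is the PyTorch 0.4-era call; on current PyTorch the equivalent is F.interpolate. A small sketch of one fusion step:

import torch
import torch.nn.functional as F

c3 = torch.randn(1, 256, 64, 64)     # finer arm feature
c4 = torch.randn(1, 256, 32, 32)     # coarser arm feature
up = F.interpolate(c4, size=c3.shape[2:], mode='bilinear', align_corners=False)
fused_in = torch.cat([up, c3], dim=1)      # 512 channels, the input FEModule(512, 256) expects
print(fused_in.shape)                      # torch.Size([1, 512, 64, 64])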
super(L2Norm, self).__init__() 16 | self.n_channels = n_channels 17 | self.gamma = scale or None 18 | self.eps = 1e-10 19 | self.weight = nn.Parameter(torch.Tensor(self.n_channels)) 20 | self.reset_parameters() 21 | 22 | def reset_parameters(self): 23 | init.constant_(self.weight, self.gamma) 24 | 25 | def forward(self, x): 26 | norm = x.pow(2).sum(dim=1, keepdim=True).sqrt() + self.eps 27 | x = x / norm 28 | out = self.weight.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as( 29 | x) * x 30 | return out 31 | 32 | class BasicBlock(nn.Module): 33 | def __init__(self, in_planes, out_planes, stride=1): 34 | super(BasicBlock, self).__init__() 35 | self.out_channels = out_planes 36 | inter_planes = in_planes // 4 37 | self.single_branch = nn.Sequential( 38 | BasicConv(in_planes, inter_planes, kernel_size=(3, 3), stride=stride, padding=(1, 1)), 39 | BasicConv(inter_planes, inter_planes, kernel_size=3, stride=1, padding=2, dilation=2), 40 | BasicConv(inter_planes, out_planes, kernel_size=(3, 3), stride=1, padding=(1, 1)) 41 | ) 42 | 43 | def forward(self, x): 44 | out = self.single_branch(x) 45 | return out 46 | # This function is derived from torchvision VGG make_layers() 47 | # https://github.com/pytorch/vision/blob/master/torchvision/models/vgg.py 48 | 49 | class BasicConv(nn.Module): 50 | 51 | def __init__(self, in_planes, out_planes, kernel_size, stride=1, padding=0, dilation=1, groups=1, relu=True, bn=True, bias=False): 52 | super(BasicConv, self).__init__() 53 | self.out_channels = out_planes 54 | self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation, groups=groups, bias=bias) 55 | self.bn = nn.BatchNorm2d(out_planes,eps=1e-5, momentum=0.01, affine=True) if bn else None 56 | self.relu = nn.ReLU(inplace=True) if relu else None 57 | 58 | def forward(self, x): 59 | x = self.conv(x) 60 | if self.bn is not None: 61 | x = self.bn(x) 62 | if self.relu is not None: 63 | x = self.relu(x) 64 | return x 65 | 66 | 67 | 68 | def vgg(cfg, i, batch_norm=False): 69 | layers = [] 70 | in_channels = i 71 | for v in cfg: 72 | if v == 'M': 73 | layers += [nn.MaxPool2d(kernel_size=2, stride=2)] 74 | elif v == 'C': 75 | layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)] 76 | else: 77 | conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1) 78 | if batch_norm: 79 | layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)] 80 | else: 81 | layers += [conv2d, nn.ReLU(inplace=True)] 82 | in_channels = v 83 | pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1) 84 | conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=3, dilation=3) 85 | conv7 = nn.Conv2d(1024, 1024, kernel_size=1) 86 | layers += [ 87 | pool5, conv6, 88 | nn.ReLU(inplace=True), conv7, 89 | nn.ReLU(inplace=True) 90 | ] 91 | return layers 92 | 93 | 94 | base = { 95 | '300': [ 96 | 64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M', 97 | 512, 512, 512 98 | ], 99 | '512': [ 100 | 64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M', 101 | 512, 512, 512 102 | ], 103 | } 104 | 105 | 106 | def add_extras(size): 107 | layers = [] 108 | layers += [BasicBlock(1024, 256, stride=2)] 109 | layers += [BasicBlock(256, 256, stride=2)] 110 | return layers 111 | 112 | # 113 | 114 | class SELayer(nn.Module): 115 | def __init__(self, channel, reduction=8): 116 | super(SELayer, self).__init__() 117 | self.avg_pool = nn.AdaptiveAvgPool2d(1) 118 | self.fc = nn.Sequential( 119 | nn.Linear(channel, channel // reduction), 120 | nn.ReLU(inplace=True), 121 | 
nn.Linear(channel // reduction, channel), 122 | nn.Sigmoid() 123 | ) 124 | 125 | def forward(self, x): 126 | b, c, _, _ = x.size() 127 | y = self.avg_pool(x).view(b, c) 128 | y = self.fc(y).view(b, c, 1, 1) 129 | return x * (1+y) 130 | 131 | class FEModule(nn.Module): 132 | def __init__(self, in_channels, out_channels, stride=1, norm_layer=nn.BatchNorm2d): 133 | super(FEModule, self).__init__() 134 | self.out_channels = out_channels 135 | inter_channels = in_channels // 4 136 | self.brancha = nn.Sequential(nn.Conv2d(in_channels, inter_channels, 3, padding=1, bias=False), 137 | norm_layer(inter_channels), 138 | nn.ReLU()) 139 | 140 | self.sa = PAM_Module(inter_channels) 141 | self.brancha1 = nn.Sequential(nn.Conv2d(inter_channels, inter_channels, 3, padding=1, bias=False), 142 | norm_layer(inter_channels), 143 | nn.ReLU()) 144 | 145 | # aspp 146 | self.sl = nn.Sequential( 147 | BasicConv(in_channels, inter_channels, kernel_size=1, stride=1), 148 | BasicConv(inter_channels, inter_channels, kernel_size=3, stride=1, padding=1) 149 | ) 150 | self.sn = nn.Sequential( 151 | BasicConv(in_channels, inter_channels, kernel_size=1, stride=1), 152 | BasicConv(inter_channels, inter_channels, kernel_size=3, stride=1, padding=3, dilation=3) 153 | ) 154 | 155 | self.fuse = nn.Sequential(nn.Dropout2d(0.1, False), 156 | nn.Conv2d(inter_channels + inter_channels + inter_channels, out_channels, 157 | kernel_size=3, stride=stride, padding=1, bias=False), 158 | norm_layer(out_channels), 159 | nn.ReLU()) 160 | 161 | 162 | 163 | def forward(self, x): 164 | sa_feat = self.sa(self.brancha(x)) 165 | sa_conv = self.brancha1(sa_feat) 166 | 167 | sl_output = self.sl(x) 168 | sn_output = self.sn(x) 169 | 170 | feat_cat = torch.cat([sa_conv, sl_output, sn_output], dim=1) 171 | sasc_output = self.fuse(feat_cat) 172 | 173 | return sasc_output 174 | 175 | def trans_head(): 176 | arm_trans = [] 177 | arm_trans += [BasicConv(512, 256, kernel_size=3, stride=1, padding=1)] 178 | arm_trans += [BasicConv(1024, 256, kernel_size=3, stride=1, padding=1)] 179 | arm_trans += [BasicConv(256, 256, kernel_size=3, stride=1, padding=1)] 180 | arm_trans += [BasicConv(256, 256, kernel_size=3, stride=1, padding=1)] 181 | 182 | orm_trans = [] 183 | orm_trans += [BasicConv(256, 512, kernel_size=3, stride=1, padding=1)] 184 | orm_trans += [BasicConv(256, 512, kernel_size=3, stride=1, padding=1)] 185 | orm_trans += [BasicConv(256, 512, kernel_size=3, stride=1, padding=1)] 186 | orm_trans += [BasicConv(256, 256, kernel_size=3, stride=1, padding=1)] 187 | 188 | return arm_trans, orm_trans 189 | 190 | class VGG16Extractor(nn.Module): 191 | def __init__(self, size, channel_size='48'): 192 | super(VGG16Extractor, self).__init__() 193 | self.vgg = nn.ModuleList(vgg(base[str(size)], 3)) 194 | self.extras = nn.ModuleList(add_extras(str(size))) 195 | 196 | self.fe1 = FEModule(512,256) 197 | self.fe2 = FEModule(512,256) 198 | self.fe3 = FEModule(512,256) 199 | self.arm_trans = nn.ModuleList(trans_head()[0]) 200 | self.orm_trans = nn.ModuleList(trans_head()[1]) 201 | 202 | self._init_modules() 203 | 204 | def _init_modules(self): 205 | self.extras.apply(weights_init) 206 | self.orm_trans.apply(weights_init) 207 | self.arm_trans.apply(weights_init) 208 | self.fe1.apply(weights_init) 209 | self.fe2.apply(weights_init) 210 | self.fe3.apply(weights_init) 211 | 212 | 213 | def forward(self, x): 214 | """Applies network layers and ops on input image(s) x. 215 | Args: 216 | x: input image or batch of images. Shape: [batch,3*batch,300,300]. 
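For orientation, the forward pass below taps this VGG trunk at two depths, conv4_3 (vgg[0:23], 512 channels) and the dilated conv7 (1024 channels), before the two BasicBlock extras add the coarser scales. A quick check of those tap points, assuming the repo root is on PYTHONPATH:

import torch
import torch.nn as nn
from models.hsd_vgg import vgg, base

trunk = nn.ModuleList(vgg(base['512'], 3))
x = torch.randn(1, 3, 512, 512)
for i in range(23):
    x = trunk[i](x)
print(x.shape)                    # conv4_3 output: [1, 512, 64, 64]
for i in range(23, len(trunk)):
    x = trunk[i](x)
print(x.shape)                    # conv7 output:   [1, 1024, 32, 32]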
217 | Return: 218 | Depending on phase: 219 | test: 220 | Variable(tensor) of output class label predictions, 221 | confidence score, and corresponding location predictions for 222 | each object detected. Shape: [batch,topk,7] 223 | train: 224 | list of concat outputs from: 225 | 1: confidence layers, Shape: [batch*num_priors,num_classes] 226 | 2: localization layers, Shape: [batch,num_priors*4] 227 | 3: priorbox layers, Shape: [2,num_priors*4] 228 | """ 229 | arm_sources = list() 230 | 231 | for i in range(23): 232 | x = self.vgg[i](x) 233 | #38x38 234 | c2 = x 235 | c2 = self.arm_trans[0](c2) 236 | arm_sources.append(c2) 237 | 238 | for k in range(23, len(self.vgg)): 239 | x = self.vgg[k](x) 240 | #19x19 241 | c3 = x 242 | c3 = self.arm_trans[1](c3) 243 | arm_sources.append(c3) 244 | 245 | # 10x10 246 | x = self.extras[0](x) 247 | 248 | # c4 = x 249 | c4 = self.arm_trans[2](x) 250 | arm_sources.append(c4) 251 | 252 | # 5x5 253 | x = self.extras[1](x) 254 | # c5 = x 255 | c5 = self.arm_trans[3](x) 256 | arm_sources.append(c5) 257 | 258 | odm_sources = [] 259 | up = F.upsample(arm_sources[1], size=arm_sources[0].size()[2:], mode='bilinear') 260 | odm_sources.append(self.fe1(torch.cat([up, arm_sources[0]], dim = 1))) 261 | up = F.upsample(arm_sources[2], size=arm_sources[1].size()[2:], mode='bilinear') 262 | odm_sources.append(self.fe2(torch.cat([up, arm_sources[1]], dim=1))) 263 | up = F.upsample(arm_sources[3], size=arm_sources[2].size()[2:], mode='bilinear') 264 | odm_sources.append(self.fe3(torch.cat([up, arm_sources[2]], dim=1))) 265 | odm_sources.append(self.orm_trans[3](arm_sources[3])) 266 | 267 | 268 | return arm_sources, odm_sources 269 | 270 | 271 | def hsd_vgg(size): 272 | return VGG16Extractor(size) -------------------------------------------------------------------------------- /models/model_builder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | from layers import * 6 | import os 7 | from models.model_helper import weights_init, weights_init1 8 | import importlib 9 | from layers.functions.prior_layer import PriorLayer 10 | from models.deform.modules import ConvOffset2d 11 | 12 | def get_func(func_name): 13 | """Helper to return a function object by name. func_name must identify a 14 | function in this module or the path to a function relative to the base 15 | 'modeling' module. 16 | """ 17 | if func_name == '': 18 | return None 19 | try: 20 | parts = func_name.split('.') 21 | # Refers to a function in this module 22 | if len(parts) == 1: 23 | return globals()[parts[0]] 24 | # Otherwise, assume we're referencing a module under modeling 25 | module_name = 'models.' + '.'.join(parts[:-1]) 26 | module = importlib.import_module(module_name) 27 | return getattr(module, parts[-1]) 28 | except Exception: 29 | print('Failed to find function: %s', func_name) 30 | raise 31 | 32 | 33 | class SSD(nn.Module): 34 | """Single Shot Multibox Architecture 35 | The network is composed of a base VGG network followed by the 36 | added multibox conv layers. Each multibox layer branches into 37 | 1) conv2d for class conf scores 38 | 2) conv2d for localization predictions 39 | 3) associated priorbox layer to produce default bounding 40 | boxes specific to the layer's feature map size. 41 | See: https://arxiv.org/pdf/1512.02325.pdf for more details. 
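get_func above is how model_builder resolves the cfg.MODEL.CONV_BODY string from the yaml configs into a backbone constructor under models/. A hedged illustration (the exact config string is not shown here, and importing model_builder assumes the deform extension has been built):

from models.model_builder import get_func

# a dotted name of the form '<module>.<constructor>' under models/, e.g.:
build_backbone = get_func('hsd_vgg.hsd_vgg')
backbone = build_backbone('512')      # -> VGG16Extractor for the 512-input setting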
42 | 43 | Args: 44 | phase: (string) Can be "test" or "train" 45 | base: VGG16 layers for input, size of either 300 or 500 46 | extras: extra layers that feed to multibox loc and conf layers 47 | head: "multibox head" consists of loc and conf conv layers 48 | """ 49 | 50 | def _init_modules(self): 51 | self.arm_loc.apply(weights_init) 52 | self.arm_conf.apply(weights_init) 53 | self.deform_conv_c.apply(weights_init) 54 | self.deform_conv_l1.apply(weights_init) 55 | self.deform_conv_c1.apply(weights_init) 56 | self.offset.apply(weights_init1) 57 | self.offset1.apply(weights_init1) 58 | self.offlat.apply(weights_init) 59 | self.offlat1.apply(weights_init) 60 | 61 | if self.cfg.MODEL.LOAD_PRETRAINED_WEIGHTS: 62 | weights = torch.load(self.cfg.MODEL.PRETRAIN_WEIGHTS) 63 | print("load pretrain model {}".format( 64 | self.cfg.MODEL.PRETRAIN_WEIGHTS)) 65 | if self.cfg.MODEL.TYPE.split('_')[-1] == 'vgg': 66 | self.extractor.vgg.load_state_dict(weights) 67 | elif self.cfg.MODEL.TYPE.split('_')[-1] == 'res': 68 | self.extractor.load_state_dict(weights, strict=False) 69 | else: 70 | self.extractor.load_state_dict(weights['state_dict'], strict=False) 71 | print(weights['state_dict']) 72 | def __init__(self, cfg): 73 | super(SSD, self).__init__() 74 | self.cfg = cfg 75 | self.size = cfg.MODEL.SIZE 76 | if self.size == '300': 77 | size_cfg = cfg.SMALL 78 | else: 79 | size_cfg = cfg.BIG 80 | self.num_classes = cfg.MODEL.NUM_CLASSES 81 | self.prior_layer = PriorLayer(cfg) 82 | self.priorbox = PriorBox(cfg) 83 | self.priors = self.priorbox.forward() 84 | self.extractor = get_func(cfg.MODEL.CONV_BODY)(self.size) 85 | if cfg.MODEL.CASCADE: 86 | self.odm_channels = size_cfg.ODM_CHANNELS 87 | self.arm_num_classes = 2 88 | 89 | self.arm_loc = nn.ModuleList() 90 | self.arm_conf = nn.ModuleList() 91 | self.arm_channels = size_cfg.ARM_CHANNELS 92 | self.num_anchors = size_cfg.NUM_ANCHORS 93 | self.input_fixed = size_cfg.INPUT_FIXED 94 | self.arm_loc = nn.ModuleList() 95 | self.arm_conf = nn.ModuleList() 96 | self.deform_conv_c = nn.ModuleList() 97 | self.deform_conv_l1 = nn.ModuleList() 98 | self.deform_conv_c1 = nn.ModuleList() 99 | self.offset = nn.ModuleList() 100 | self.offset1 = nn.ModuleList() 101 | self.offlat1 = nn.ModuleList() 102 | self.offlat = nn.ModuleList() 103 | 104 | for i in range(len(self.arm_channels)): 105 | if cfg.MODEL.CASCADE: 106 | self.arm_loc += [nn.Conv2d(self.arm_channels[i], self.num_anchors[i] * 4, kernel_size=3, padding=1)] 107 | self.arm_conf += [nn.Conv2d(self.arm_channels[i], self.num_anchors[i] * self.arm_num_classes, kernel_size=3, padding=1)] 108 | 109 | self.deform_conv_c += [ConvOffset2d(256*3, self.num_anchors[i] * self.arm_num_classes, 3, stride=1, padding=1, num_deformable_groups=3)] 110 | self.deform_conv_c1 += [ConvOffset2d(256*3, self.num_anchors[i] * self.num_classes, 3, stride=1, padding=1, num_deformable_groups=3)] 111 | self.deform_conv_l1 += [ConvOffset2d(256*3, self.num_anchors[i] * 4, 3, stride=1, padding=1, num_deformable_groups=3)] 112 | self.offset += [nn.Conv2d(36, 2 * 3 * 3 * 3, kernel_size=1, stride=1, padding=0, bias=False, groups = 3)] 113 | self.offset1 += [nn.Conv2d(36, 2 * 3 * 3 * 3, kernel_size=1, stride=1, padding=0, bias=False, groups=3)] 114 | self.offlat += [nn.Sequential(nn.Conv2d(12, 36, kernel_size=1, stride=1, padding=0, groups = 3), nn.ReLU())] 115 | self.offlat1 += [nn.Sequential(nn.Conv2d(12, 36, kernel_size=1, stride=1, padding=0, groups = 3), nn.ReLU())] 116 | 117 | else: 118 | self.arm_loc += [nn.Conv2d(self.arm_channels[i], 
self.num_anchors[i] * 4, kernel_size=3, padding=1)] 119 | self.arm_conf += [nn.Conv2d(self.arm_channels[i], self.num_anchors[i] * self.num_classes, kernel_size=3, padding=1)] 120 | self.deform_conv_c += [ConvOffset2d(256 * 3, self.num_anchors[i] * self.num_classes, 3, stride=1, padding=1, num_deformable_groups=3)] 121 | self.offset += [nn.Conv2d(36, 2 * 3 * 3 * 3, kernel_size=1, stride=1, padding=0, bias=False, groups=3)] 122 | self.offlat += [nn.Sequential(nn.Conv2d(12, 36, kernel_size=1, stride=1, padding=0, groups=3), nn.ReLU())] 123 | 124 | if cfg.TRAIN.TRAIN_ON: 125 | self._init_modules() 126 | 127 | def forward(self, x): 128 | 129 | arm_loc = list() 130 | arm_conf = list() 131 | if self.cfg.MODEL.CASCADE: 132 | odm_loc = list() 133 | odm_conf = list() 134 | arm_xs, odm_xs = self.extractor(x) 135 | 136 | for (x, x0, l0, c0, deform_conv_c, deform_conv_l1, deform_conv_c1, offset, offset1, offlat, offlat1) in \ 137 | zip(odm_xs, arm_xs, self.arm_loc, self.arm_conf, 138 | self.deform_conv_c, self.deform_conv_l1, self.deform_conv_c1, 139 | self.offset, self.offset1, self.offlat, self.offlat1): 140 | 141 | # s1 142 | x_l = l0(x0) 143 | arm_loc.append(x_l.permute(0, 2, 3, 1).contiguous()) 144 | x_offset = offlat(x_l.detach()) 145 | x_offset = offset(x_offset) 146 | x03 = torch.cat([x0, x0, x0], dim=1) 147 | x_c = deform_conv_c(x03, x_offset) 148 | arm_conf.append(x_c.permute(0, 2, 3, 1).contiguous()) 149 | # s2 150 | x3 = torch.cat([x, x, x], dim=1) 151 | xl = deform_conv_l1(x3, x_offset) 152 | x_offset1 = offlat1(xl.detach()) 153 | x_offset1 = x_offset+offset1(x_offset1) 154 | xc = deform_conv_c1(x3, x_offset1) 155 | 156 | odm_loc.append(xl.permute(0, 2, 3, 1).contiguous()) 157 | odm_conf.append(xc.permute(0, 2, 3, 1).contiguous()) 158 | 159 | arm_loc = torch.cat([o.view(o.size(0), -1) for o in arm_loc], 1) 160 | arm_conf = torch.cat([o.view(o.size(0), -1) for o in arm_conf], 1) 161 | odm_loc = torch.cat([o.view(o.size(0), -1) for o in odm_loc], 1) 162 | odm_conf = torch.cat([o.view(o.size(0), -1) for o in odm_conf], 1) 163 | else: 164 | arm_xs, arm_xs1 = self.extractor(x) 165 | 166 | for (x0, x0_c, l0, c0, deform_conv_c, offset, offlat) in \ 167 | zip(arm_xs, arm_xs1, self.arm_loc, self.arm_conf, self.deform_conv_c, self.offset, self.offlat): 168 | 169 | x_l = l0(x0) 170 | arm_loc.append(x_l.permute(0, 2, 3, 1).contiguous()) 171 | 172 | x_offset = offlat(x_l.detach()) 173 | x_offset = offset(x_offset) 174 | x03 = torch.cat([x0_c, x0_c, x0_c], dim=1) 175 | x_c = deform_conv_c(x03, x_offset) 176 | arm_conf.append(x_c.permute(0, 2, 3, 1).contiguous()) 177 | 178 | 179 | arm_loc = torch.cat([o.view(o.size(0), -1) for o in arm_loc], 1) 180 | arm_conf = torch.cat([o.view(o.size(0), -1) for o in arm_conf], 1) 181 | 182 | img_wh = (x.size(3), x.size(2)) 183 | feature_maps_wh = [(t.size(3), t.size(2)) for t in arm_xs] 184 | 185 | if self.cfg.MODEL.CASCADE: 186 | output = (arm_loc.view(arm_loc.size(0), -1, 4), 187 | arm_conf.view( 188 | arm_conf.size(0), -1, self.arm_num_classes), 189 | odm_loc.view(odm_loc.size(0), -1, 4), 190 | odm_conf.view(odm_conf.size(0), -1, self.num_classes), 191 | self.priors if self.input_fixed else self.prior_layer( 192 | img_wh, feature_maps_wh)) 193 | else: 194 | output = (arm_loc.view(arm_loc.size(0), -1, 4), 195 | arm_conf.view(arm_conf.size(0), -1, self.num_classes), 196 | self.priors if self.input_fixed else self.prior_layer( 197 | img_wh, feature_maps_wh)) 198 | return output 199 | -------------------------------------------------------------------------------- 
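The cascade branch of SSD.forward above is the two stacked ROC shots: the first shot regresses boxes from the arm feature (x_l), converts the detached regression map into deformable-convolution offsets (offlat then offset), and classifies with ConvOffset2d on the shifted sampling locations; the second shot repeats this on the enriched odm feature, accumulating the offsets (x_offset + offset1(...)). A shape-level sketch of the offset bookkeeping, using the same numbers as the head construction (the 12-channel offlat input implies 3 anchors per cell; 3 deformable groups; 3x3 kernels):

import torch
import torch.nn as nn

num_anchors, groups, k = 3, 3, 3
x_l = torch.randn(1, num_anchors * 4, 40, 40)            # first-shot regression map (12 channels)
offlat = nn.Sequential(nn.Conv2d(12, 36, 1, groups=groups), nn.ReLU())
offset = nn.Conv2d(36, 2 * groups * k * k, 1, groups=groups, bias=False)
x_offset = offset(offlat(x_l.detach()))                   # 54 channels: one (dy, dx) per kernel tap and group
print(x_offset.shape)                                     # torch.Size([1, 54, 40, 40])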
/models/model_helper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Written by yq_yao 3 | 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | import torch.nn.init as init 9 | 10 | 11 | def xavier(param): 12 | init.xavier_uniform_(param) 13 | 14 | 15 | # def weights_init(m): 16 | # if isinstance(m, nn.Conv2d): 17 | # xavier(m.weight.data) 18 | # m.bias.data.zero_() 19 | 20 | 21 | def weights_init(m): 22 | for key in m.state_dict(): 23 | if key.split('.')[-1] == 'weight': 24 | if 'conv' in key: 25 | init.kaiming_normal(m.state_dict()[key], mode='fan_out') 26 | if 'offset' in key: 27 | #init.kaiming_normal(m.state_dict()[key], mode='fan_out') 28 | m.state_dict()[key][...] = 0 29 | 30 | if 'bn' in key: 31 | m.state_dict()[key][...] = 1 32 | elif key.split('.')[-1] == 'bias': 33 | m.state_dict()[key][...] = 0 34 | def weights_init1(m): 35 | for key in m.state_dict(): 36 | print(key) 37 | if key.split('.')[-1] == 'weight': 38 | init.constant_(m.state_dict()[key], 0.0) 39 | elif key.split('.')[-1] == 'bias': 40 | m.state_dict()[key][...] = 0 41 | 42 | 43 | 44 | def trans_layers(block, fpn_num): 45 | layers = list() 46 | for i in range(fpn_num): 47 | layers += [ 48 | nn.Sequential( 49 | nn.Conv2d(block[i], 256, kernel_size=3, stride=1, padding=1), 50 | nn.ReLU(inplace=True), 51 | nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)) 52 | ] 53 | 54 | return layers 55 | 56 | 57 | def trans_layers_2(raw_channels, inner_channels): 58 | layers = list() 59 | fpn_num = len(raw_channels) 60 | for i in range(fpn_num): 61 | layers += [ 62 | nn.Sequential( 63 | nn.Conv2d( 64 | raw_channels[i], 65 | inner_channels[i], 66 | kernel_size=3, 67 | stride=1, 68 | padding=1), nn.ReLU(inplace=True), 69 | nn.Conv2d( 70 | inner_channels[i], 71 | inner_channels[i], 72 | kernel_size=3, 73 | stride=1, 74 | padding=1)) 75 | ] 76 | 77 | return layers 78 | 79 | 80 | def latent_layers(fpn_num): 81 | layers = [] 82 | for i in range(fpn_num): 83 | layers += [nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)] 84 | return layers 85 | 86 | 87 | def up_layers(fpn_num): 88 | layers = [] 89 | for i in range(fpn_num - 1): 90 | layers += [nn.Upsample(scale_factor=2, mode='bilinear')] 91 | return layers 92 | 93 | 94 | class FpnAdapter(nn.Module): 95 | def __init__(self, block, fpn_num): 96 | super(FpnAdapter, self).__init__() 97 | self.trans_layers = nn.ModuleList(trans_layers(block, fpn_num)) 98 | self.up_layers = nn.ModuleList(up_layers(fpn_num)) 99 | self.latent_layers = nn.ModuleList(latent_layers(fpn_num)) 100 | self._init_modules() 101 | 102 | def _init_modules(self): 103 | self.trans_layers.apply(weights_init) 104 | self.latent_layers.apply(weights_init) 105 | 106 | def forward(self, x): 107 | trans_layers_list = list() 108 | fpn_out = list() 109 | for (p, t) in zip(x, self.trans_layers): 110 | trans_layers_list.append(t(p)) 111 | last = F.relu( 112 | self.latent_layers[-1](trans_layers_list[-1]), inplace=True) 113 | # last layer 114 | fpn_out.append(last) 115 | _up = self.up_layers[-1](last) 116 | for i in range(len(trans_layers_list) - 2, -1, -1): 117 | q = F.relu(trans_layers_list[i] + _up, inplace=True) 118 | q = F.relu(self.latent_layers[i](q), inplace=True) 119 | fpn_out.append(q) 120 | if i > 0: 121 | _up = self.up_layers[i - 1](q) 122 | fpn_out = fpn_out[::-1] 123 | return fpn_out 124 | 125 | 126 | class ConvPool(nn.Module): 127 | def __init__(self, inplane, plane): 128 | 
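weights_init above keys off parameter names: weights whose name contains 'conv' get a Kaiming init (init.kaiming_normal is the older spelling; current PyTorch names it kaiming_normal_), weights containing 'offset' are zeroed so the deformable sampling grid starts out regular, 'bn' weights are set to 1, and all biases to 0. A quick check of the offset convention:

import torch.nn as nn
from models.model_helper import weights_init

head = nn.Sequential()
head.add_module('loc', nn.Conv2d(256, 12, 3, padding=1))
head.add_module('offset', nn.Conv2d(12, 54, 1, bias=False))
head.apply(weights_init)
print(float(head.offset.weight.abs().sum()))   # 0.0 -- offsets start at the regular grid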
super(ConvPool, self).__init__() 129 | self.conv = nn.Conv2d(inplane, plane, kernel_size=1, stride=1) 130 | self.pool = nn.MaxPool2d(kernel_size=2, stride=2) 131 | self._init_modules() 132 | 133 | def _init_modules(self): 134 | self.conv.apply(weights_init) 135 | 136 | def forward(self, x): 137 | out = self.conv(x) 138 | out = self.pool(out) 139 | return x, out 140 | 141 | 142 | class ConvUpsample(nn.Module): 143 | def __init__(self, inplace, plane): 144 | super(ConvUpsample, self).__init__() 145 | self.conv = nn.Conv2d(inplace, plane, kernel_size=1, stride=1) 146 | self.up_sample = nn.Upsample(scale_factor=2, mode='bilinear') 147 | self.smooth_conv = nn.Conv2d(plane, plane, kernel_size=1, stride=1) 148 | self._init_modules() 149 | 150 | def _init_modules(self): 151 | self.conv.apply(weights_init) 152 | self.smooth_conv.apply(weights_init) 153 | 154 | def forward(self, x): 155 | out = self.conv(x) 156 | out = self.up_sample(out) 157 | out = self.smooth_conv(out) 158 | return x, out 159 | 160 | 161 | class ConvPoolUpsample(nn.Module): 162 | def __init__(self, inplace, plane): 163 | super(ConvPoolUpsample, self).__init__() 164 | self.up_conv = nn.Conv2d(inplace, plane, kernel_size=1, stride=1) 165 | self.pool_conv = nn.Conv2d(inplace, plane, kernel_size=1, stride=1) 166 | self.up_sample = nn.Upsample(scale_factor=2, mode='bilinear') 167 | self.smooth_conv = nn.Conv2d(plane, plane, kernel_size=1, stride=1) 168 | self.pool = nn.MaxPool2d(kernel_size=2, stride=2) 169 | 170 | self._init_modules() 171 | 172 | def _init_modules(self): 173 | self.up_conv.apply(weights_init) 174 | self.smooth_conv.apply(weights_init) 175 | self.pool_conv.apply(weights_init) 176 | 177 | def forward(self, x): 178 | up_out = self.up_conv(x) 179 | pool_out = self.pool_conv(x) 180 | up_out = self.up_sample(up_out) 181 | up_out = self.smooth_conv(up_out) 182 | pool_out = self.pool(pool_out) 183 | return x, pool_out, up_out 184 | 185 | 186 | def weave_layers(block, weave_num): 187 | layers = list() 188 | add_channel = 32 189 | for i in range(weave_num): 190 | if i == 0: 191 | layers += [ConvPool(block[i], add_channel)] 192 | elif i == weave_num - 1: 193 | layers += [ConvUpsample(block[i], add_channel)] 194 | else: 195 | layers += [ConvPoolUpsample(block[i], add_channel)] 196 | return layers 197 | 198 | 199 | class WeaveBlock(nn.Module): 200 | def __init__(self, raw_channel, weave_add_channel, dense_num): 201 | super(WeaveBlock, self).__init__() 202 | layers = list() 203 | for j in range(dense_num): 204 | layers += [ 205 | nn.Conv2d( 206 | raw_channel, weave_add_channel[j], kernel_size=1, stride=1) 207 | ] 208 | self.weave_layers = nn.ModuleList(layers) 209 | self._init_modules() 210 | 211 | def _init_modules(self): 212 | self.weave_layers.apply(weights_init) 213 | 214 | def forward(self, x): 215 | out = list() 216 | out.append(x) 217 | for i in range(len(self.weave_layers)): 218 | out.append(self.weave_layers[i](x)) 219 | return out 220 | 221 | 222 | def weave_layers_2(raw_channels, weave_add_channels): 223 | layers = list() 224 | num = 2 225 | weave_num = len(raw_channels) 226 | for i in range(weave_num): 227 | if i == 0 or i == weave_num - 1: 228 | layers += [ 229 | WeaveBlock(raw_channels[i], weave_add_channels[i], num - 1) 230 | ] 231 | else: 232 | layers += [WeaveBlock(raw_channels[i], weave_add_channels[i], num)] 233 | return layers 234 | 235 | 236 | def weave_concat_layers_2(raw_channels, weave_add_channels, weave_channels): 237 | layers = list() 238 | weave_num = len(raw_channels) 239 | for i in range(weave_num): 
240 | if i == 0: 241 | add_channel = weave_add_channels[i + 1][0] 242 | elif i == weave_num - 1: 243 | add_channel = weave_add_channels[i - 1][1] 244 | else: 245 | add_channel = weave_add_channels[i - 1][1] + weave_add_channels[ 246 | i + 1][0] 247 | layers += [ 248 | nn.Conv2d( 249 | raw_channels[i] + add_channel, 250 | weave_channels[i], 251 | kernel_size=1, 252 | stride=1) 253 | ] 254 | return layers 255 | 256 | 257 | def weave_concat_layers(block, weave_num, channel): 258 | layers = list() 259 | for i in range(weave_num): 260 | if i == 0 or i == weave_num - 1: 261 | add_channel = channel 262 | else: 263 | add_channel = channel * 2 264 | layers += [ 265 | nn.Conv2d(block[i] + add_channel, 256, kernel_size=1, stride=1) 266 | ] 267 | return layers 268 | 269 | 270 | def adaptive_upsample(x, size): 271 | return F.upsample(x, size, mode='bilinear') 272 | 273 | 274 | def adaptive_pool(x, size): 275 | return F.adaptive_max_pool2d(x, size) 276 | 277 | 278 | class WeaveAdapter2(nn.Module): 279 | def __init__(self, raw_channels, weave_add_channels, weave_channels): 280 | super(WeaveAdapter2, self).__init__() 281 | self.trans_layers = nn.ModuleList( 282 | trans_layers_2(raw_channels, weave_channels)) 283 | self.weave_layers = nn.ModuleList( 284 | weave_layers_2(weave_channels, weave_add_channels)) 285 | self.weave_concat_layers = nn.ModuleList( 286 | weave_concat_layers_2(weave_channels, weave_add_channels, 287 | weave_channels)) 288 | self.weave_num = len(raw_channels) 289 | self._init_modules() 290 | 291 | def _init_modules(self): 292 | self.trans_layers.apply(weights_init) 293 | self.weave_concat_layers.apply(weights_init) 294 | 295 | def forward(self, x): 296 | trans_layers_list = list() 297 | weave_out = list() 298 | for (p, t) in zip(x, self.trans_layers): 299 | trans_layers_list.append(t(p)) 300 | weave_list = list() 301 | for (t, w) in zip(trans_layers_list, self.weave_layers): 302 | weave_list.append(w(t)) 303 | 304 | for i in range(self.weave_num): 305 | b, c, h, w = weave_list[i][0].size() 306 | if i == 0: 307 | up = adaptive_upsample(weave_list[i + 1][1], (h, w)) 308 | weave = torch.cat((up, weave_list[i][0]), 1) 309 | elif i == self.weave_num - 1: 310 | pool = adaptive_pool(weave_list[i - 1][-1], (h, w)) 311 | weave = torch.cat((pool, weave_list[i][0]), 1) 312 | else: 313 | up = adaptive_upsample(weave_list[i + 1][1], (h, w)) 314 | pool = adaptive_pool(weave_list[i - 1][-1], (h, w)) 315 | weave = torch.cat((up, pool, weave_list[i][0]), 1) 316 | weave = F.relu(self.weave_concat_layers[i](weave), inplace=True) 317 | weave_out.append(weave) 318 | return weave_out 319 | 320 | 321 | class WeaveAdapter(nn.Module): 322 | def __init__(self, block, weave_num): 323 | super(WeaveAdapter, self).__init__() 324 | self.trans_layers = nn.ModuleList(trans_layers(block, weave_num)) 325 | self.weave_layers = nn.ModuleList( 326 | weave_layers([256, 256, 256, 256], weave_num)) 327 | self.weave_concat_layers = nn.ModuleList( 328 | weave_concat_layers([256, 256, 256, 256], weave_num, 48)) 329 | self.weave_num = weave_num 330 | self._init_modules() 331 | 332 | def _init_modules(self): 333 | self.trans_layers.apply(weights_init) 334 | self.weave_concat_layers.apply(weights_init) 335 | 336 | def forward(self, x): 337 | trans_layers_list = list() 338 | weave_out = list() 339 | for (p, t) in zip(x, self.trans_layers): 340 | trans_layers_list.append(t(p)) 341 | weave_list = list() 342 | for (t, w) in zip(trans_layers_list, self.weave_layers): 343 | weave_list.append(w(t)) 344 | 345 | for i in 
range(self.weave_num): 346 | if i == 0: 347 | weave = torch.cat((weave_list[i][0], weave_list[i + 1][-1]), 1) 348 | elif i == self.weave_num - 1: 349 | weave = torch.cat((weave_list[i][0], weave_list[i - 1][1]), 1) 350 | else: 351 | weave = torch.cat((weave_list[i][0], weave_list[i - 1][1], 352 | weave_list[i + 1][-1]), 1) 353 | weave = F.relu(self.weave_concat_layers[i](weave), inplace=True) 354 | weave_out.append(weave) 355 | return weave_out 356 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | torch.set_num_threads(2) 4 | import torch.nn as nn 5 | import torch.optim as optim 6 | import torch.backends.cudnn as cudnn 7 | import torch.nn.init as init 8 | import argparse 9 | from torch.autograd import Variable 10 | import torch.utils.data as data 11 | from data import COCODetection, VOCDetection, BaseTransform, preproc 12 | from layers.modules import MultiBoxLoss, HSDMultiBoxLoss 13 | from layers.functions import Detect 14 | from utils.nms_wrapper import nms, soft_nms 15 | from configs.config import cfg, cfg_from_file 16 | import numpy as np 17 | import time 18 | import os 19 | import sys 20 | import pickle 21 | import datetime 22 | from models.model_builder import SSD 23 | import yaml 24 | 25 | 26 | def arg_parse(): 27 | parser = argparse.ArgumentParser(description='HSD Training') 28 | parser.add_argument( 29 | '--cfg', 30 | dest='cfg_file', 31 | required=True, 32 | help='Config file for training (and optionally testing)') 33 | parser.add_argument( 34 | '--num_workers', 35 | default=8, 36 | type=int, 37 | help='Number of workers used in dataloading') 38 | parser.add_argument('--ngpu', default=2, type=int, help='gpus') 39 | parser.add_argument( 40 | '--resume_net', default=None, help='resume net for retraining') 41 | parser.add_argument( 42 | '--resume_epoch', 43 | default=0, 44 | type=int, 45 | help='resume iter for retraining') 46 | 47 | parser.add_argument( 48 | '--save_folder', 49 | default='./weights/hsd', 50 | help='Location to save checkpoint models') 51 | args = parser.parse_args() 52 | return args 53 | 54 | def detection_collate(batch): 55 | """Custom collate fn for dealing with batches of images that have a different 56 | number of associated object annotations (bounding boxes). 
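With the argument parser above, a typical COCO training launch looks roughly like the following (the yaml under configs/ picks the backbone and input size; --resume_net and --resume_epoch restart from a saved checkpoint):

    python train.py --cfg configs/hsd_vgg_coco_512.yaml --ngpu 2 --save_folder ./weights/hsd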
57 | 58 | Arguments: 59 | batch: (tuple) A tuple of tensor images and lists of annotations 60 | 61 | Return: 62 | A tuple containing: 63 | 1) (tensor) batch of images stacked on their 0 dim 64 | 2) (list of tensors) annotations for a given image are stacked on 0 dim 65 | """ 66 | targets = [] 67 | imgs = [] 68 | img_info = [] 69 | for sample in batch: 70 | imgs.append(sample[0]) 71 | targets.append(torch.FloatTensor(sample[1])) 72 | img_info.append(torch.FloatTensor(sample[2])) 73 | return torch.stack(imgs, 0), targets, img_info 74 | 75 | 76 | def adjust_learning_rate(optimizer, epoch, step_epoch, gamma, epoch_size, 77 | iteration): 78 | """Sets the learning rate 79 | # Adapted from PyTorch Imagenet example: 80 | # https://github.com/pytorch/examples/blob/master/imagenet/main.py 81 | """ 82 | ## warmup 83 | if epoch <= cfg.TRAIN.WARMUP_EPOCH: 84 | if cfg.TRAIN.WARMUP: 85 | iteration += (epoch_size * (epoch - 1)) 86 | lr = 1e-6 + (cfg.SOLVER.BASE_LR - 1e-6) * iteration / ( 87 | epoch_size * cfg.TRAIN.WARMUP_EPOCH) 88 | else: 89 | lr = cfg.SOLVER.BASE_LR 90 | else: 91 | div = 0 92 | if epoch > step_epoch[-1]: 93 | div = len(step_epoch) - 1 94 | else: 95 | for idx, v in enumerate(step_epoch): 96 | if epoch > step_epoch[idx] and epoch <= step_epoch[idx + 1]: 97 | div = idx 98 | break 99 | lr = cfg.SOLVER.BASE_LR * (gamma**div) 100 | 101 | for param_group in optimizer.param_groups: 102 | param_group['lr'] = lr 103 | return lr 104 | 105 | def train(train_loader, net, criterion, optimizer, epoch, epoch_step, gamma, 106 | end_epoch, cfg): 107 | net.train() 108 | begin = time.time() 109 | epoch_size = len(train_loader) 110 | for iteration, (imgs, targets, _) in enumerate(train_loader): 111 | t0 = time.time() 112 | lr = adjust_learning_rate(optimizer, epoch, epoch_step, gamma, 113 | epoch_size, iteration) 114 | imgs = imgs.cuda() 115 | imgs.requires_grad_() 116 | with torch.no_grad(): 117 | targets = [anno.cuda() for anno in targets] 118 | output = net(imgs) 119 | optimizer.zero_grad() 120 | if not cfg.MODEL.CASCADE: 121 | ssd_criterion = criterion[0] 122 | loss_l, loss_c = ssd_criterion(output, targets) 123 | loss = loss_l + loss_c 124 | else: 125 | arm_criterion = criterion[0] 126 | odm_criterion = criterion[1] 127 | arm_loss_l, arm_loss_c = arm_criterion(output, targets) 128 | odm_loss_l, odm_loss_c = odm_criterion( 129 | output, targets, use_arm=True, filter_object=True) 130 | loss = arm_loss_l + arm_loss_c + odm_loss_l + odm_loss_c 131 | loss.backward() 132 | optimizer.step() 133 | t1 = time.time() 134 | iteration_time = t1 - t0 135 | all_time = ((end_epoch - epoch) * epoch_size + 136 | (epoch_size - iteration)) * iteration_time 137 | eta = str(datetime.timedelta(seconds=int(all_time))) 138 | if iteration % 10 == 0: 139 | if not cfg.MODEL.CASCADE: 140 | print('Epoch:' + repr(epoch) + ' || epochiter: ' + 141 | repr(iteration % epoch_size) + '/' + repr(epoch_size) + 142 | ' || L: %.4f C: %.4f||' % 143 | (loss_l.item(), loss_c.item()) + 144 | 'iteration time: %.4f sec. ||' % (t1 - t0) + 145 | 'LR: %.5f' % (lr) + ' || eta time: {}'.format(eta)) 146 | else: 147 | print('Epoch:' + repr(epoch) + ' || epochiter: ' + 148 | repr(iteration % epoch_size) + '/' + repr(epoch_size) + 149 | '|| 1st_L: %.4f 1st_C: %.4f||' % 150 | (arm_loss_l.item(), arm_loss_c.item()) + 151 | ' 2rd_L: %.4f 2rd_C: %.4f||' % 152 | (odm_loss_l.item(), odm_loss_c.item()) + 153 | ' loss: %.4f||' % (loss.item()) + 154 | 'iteration time: %.4f sec. 
||' % (t1 - t0) + 155 | 'LR: %.5f' % (lr) + ' || eta time: {}'.format(eta)) 156 | 157 | 158 | def save_checkpoint(net, epoch, size, optimizer): 159 | save_name = os.path.join( 160 | args.save_folder, 161 | cfg.MODEL.TYPE + "_epoch_{}_{}".format(str(epoch), str(size)) + '.pth') 162 | torch.save({ 163 | 'epoch': epoch, 164 | 'size': size, 165 | 'batch_size': cfg.TRAIN.BATCH_SIZE, 166 | 'model': net.state_dict(), 167 | 'optimizer': optimizer.state_dict() 168 | }, save_name) 169 | 170 | 171 | def eval_net(val_dataset, 172 | val_loader, 173 | net, 174 | detector, 175 | cfg, 176 | transform, 177 | max_per_image=300, 178 | thresh=0.01, 179 | batch_size=1): 180 | net.eval() 181 | num_images = len(val_dataset) 182 | num_classes = cfg.MODEL.NUM_CLASSES 183 | eval_save_folder = "./eval/" 184 | if not os.path.exists(eval_save_folder): 185 | os.mkdir(eval_save_folder) 186 | all_boxes = [[[] for _ in range(num_images)] for _ in range(num_classes)] 187 | det_file = os.path.join(eval_save_folder, 'detections.pkl') 188 | st = time.time() 189 | for idx, (imgs, _, img_info) in enumerate(val_loader): 190 | with torch.no_grad(): 191 | t1 = time.time() 192 | x = imgs 193 | x = x.cuda() 194 | output = net(x) 195 | t4 = time.time() 196 | boxes, scores = detector.forward(output) 197 | t2 = time.time() 198 | for k in range(boxes.size(0)): 199 | i = idx * batch_size + k 200 | boxes_ = boxes[k] 201 | scores_ = scores[k] 202 | boxes_ = boxes_.cpu().numpy() 203 | scores_ = scores_.cpu().numpy() 204 | img_wh = img_info[k] 205 | scale = np.array([img_wh[0], img_wh[1], img_wh[0], img_wh[1]]) 206 | boxes_ *= scale 207 | for j in range(1, num_classes): 208 | inds = np.where(scores_[:, j] > thresh)[0] 209 | if len(inds) == 0: 210 | all_boxes[j][i] = np.empty([0, 5], dtype=np.float32) 211 | continue 212 | c_bboxes = boxes_[inds] 213 | c_scores = scores_[inds, j] 214 | c_dets = np.hstack((c_bboxes, 215 | c_scores[:, np.newaxis])).astype( 216 | np.float32, copy=False) 217 | keep = nms(c_dets, cfg.TEST.NMS_OVERLAP, force_cpu=True) 218 | keep = keep[:50] 219 | c_dets = c_dets[keep, :] 220 | all_boxes[j][i] = c_dets 221 | t3 = time.time() 222 | detect_time = t2 - t1 223 | nms_time = t3 - t2 224 | forward_time = t4 - t1 225 | if idx % 10 == 0: 226 | print('im_detect: {:d}/{:d} {:.3f}s {:.3f}s {:.3f}s'.format( 227 | i + 1, num_images, forward_time, detect_time, nms_time)) 228 | print("detect time: ", time.time() - st) 229 | with open(det_file, 'wb') as f: 230 | pickle.dump(all_boxes, f, pickle.HIGHEST_PROTOCOL) 231 | print('Evaluating detections') 232 | val_dataset.evaluate_detections(all_boxes, eval_save_folder) 233 | 234 | 235 | 236 | def main(): 237 | global args 238 | args = arg_parse() 239 | cfg_from_file(args.cfg_file) 240 | save_folder = args.save_folder 241 | batch_size = cfg.TRAIN.BATCH_SIZE 242 | bgr_means = cfg.TRAIN.BGR_MEAN 243 | p = 0.6 244 | gamma = cfg.SOLVER.GAMMA 245 | momentum = cfg.SOLVER.MOMENTUM 246 | weight_decay = cfg.SOLVER.WEIGHT_DECAY 247 | size = cfg.MODEL.SIZE 248 | thresh = cfg.TEST.CONFIDENCE_THRESH 249 | if cfg.DATASETS.DATA_TYPE == 'VOC': 250 | trainvalDataset = VOCDetection 251 | top_k = 200 252 | else: 253 | trainvalDataset = COCODetection 254 | top_k = 300 255 | dataset_name = cfg.DATASETS.DATA_TYPE 256 | dataroot = cfg.DATASETS.DATAROOT 257 | trainSet = cfg.DATASETS.TRAIN_TYPE 258 | valSet = cfg.DATASETS.VAL_TYPE 259 | num_classes = cfg.MODEL.NUM_CLASSES 260 | start_epoch = args.resume_epoch 261 | epoch_step = cfg.SOLVER.EPOCH_STEPS 262 | end_epoch = cfg.SOLVER.END_EPOCH 263 | if not 
os.path.exists(save_folder): 264 | os.mkdir(save_folder) 265 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 266 | net = SSD(cfg) 267 | print(net) 268 | if cfg.MODEL.SIZE == '300': 269 | size_cfg = cfg.SMALL 270 | else: 271 | size_cfg = cfg.BIG 272 | optimizer = optim.SGD( 273 | net.parameters(), 274 | lr=cfg.SOLVER.BASE_LR, 275 | momentum=momentum, 276 | weight_decay=weight_decay) 277 | if args.resume_net != None: 278 | checkpoint = torch.load(args.resume_net) 279 | state_dict = checkpoint['model'] 280 | from collections import OrderedDict 281 | new_state_dict = OrderedDict() 282 | for k, v in state_dict.items(): 283 | head = k[:7] 284 | if head == 'module.': 285 | name = k[7:] # remove `module.` 286 | else: 287 | name = k 288 | new_state_dict[name] = v 289 | net.load_state_dict(new_state_dict) 290 | optimizer.load_state_dict(checkpoint['optimizer']) 291 | print('Loading resume network...') 292 | if args.ngpu > 1: 293 | net = torch.nn.DataParallel(net) 294 | net.cuda() 295 | cudnn.benchmark = True 296 | 297 | criterion = list() 298 | if cfg.MODEL.CASCADE: 299 | detector = Detect(cfg) 300 | arm_criterion = HSDMultiBoxLoss(cfg, 2) 301 | odm_criterion = HSDMultiBoxLoss(cfg, cfg.MODEL.NUM_CLASSES) 302 | criterion.append(arm_criterion) 303 | criterion.append(odm_criterion) 304 | else: 305 | detector = Detect(cfg) 306 | ssd_criterion = MultiBoxLoss(cfg) 307 | criterion.append(ssd_criterion) 308 | 309 | TrainTransform = preproc(size_cfg.IMG_WH, bgr_means, p) 310 | ValTransform = BaseTransform(size_cfg.IMG_WH, bgr_means, (2, 0, 1)) 311 | 312 | val_dataset = trainvalDataset(dataroot, valSet, ValTransform, dataset_name) 313 | val_loader = data.DataLoader( 314 | val_dataset, 315 | batch_size, 316 | shuffle=False, 317 | num_workers=args.num_workers, 318 | collate_fn=detection_collate) 319 | 320 | for epoch in range(start_epoch + 1, end_epoch + 1): 321 | train_dataset = trainvalDataset(dataroot, trainSet, TrainTransform, 322 | dataset_name) 323 | epoch_size = len(train_dataset) 324 | train_loader = data.DataLoader( 325 | train_dataset, 326 | batch_size, 327 | shuffle=True, 328 | num_workers=args.num_workers, 329 | collate_fn=detection_collate) 330 | train(train_loader, net, criterion, optimizer, epoch, epoch_step, 331 | gamma, end_epoch, cfg) 332 | if (epoch % 10 == 0) or (epoch % 10 == 0 and epoch >= 110): 333 | save_checkpoint(net, epoch, size, optimizer) 334 | if (epoch >= 50 and epoch % 10 == 0): 335 | eval_net( 336 | val_dataset, 337 | val_loader, 338 | net, 339 | detector, 340 | cfg, 341 | ValTransform, 342 | top_k, 343 | thresh=thresh, 344 | batch_size=batch_size) 345 | save_checkpoint(net, end_epoch, size, optimizer) 346 | 347 | 348 | if __name__ == '__main__': 349 | main() 350 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JialeCao001/HSD/8abcf78db5f313266a3bb3f85b9424927fe59a2d/utils/__init__.py -------------------------------------------------------------------------------- /utils/averageMeter.py: -------------------------------------------------------------------------------- 1 | class AverageMeter(object): 2 | """Computes and stores the average and current value""" 3 | 4 | def __init__(self): 5 | self.reset() 6 | 7 | def reset(self): 8 | self.val = 0 9 | self.avg = 0 10 | self.sum = 0 11 | self.count = 0 12 | 13 | def update(self, val, n=1): 14 | self.val = val 15 | self.sum += val * n 16 | self.count += n 17 | 
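The resume branch in main() above strips the 'module.' prefix that nn.DataParallel prepends to parameter names before calling load_state_dict, and restores the optimizer state saved by save_checkpoint. The same round trip in isolation (the checkpoint path here is hypothetical):

import torch
from collections import OrderedDict

checkpoint = torch.load('weights/hsd/checkpoint_epoch_160.pth', map_location='cpu')  # hypothetical path
state_dict = OrderedDict((k[7:] if k.startswith('module.') else k, v)
                         for k, v in checkpoint['model'].items())
# net.load_state_dict(state_dict); optimizer.load_state_dict(checkpoint['optimizer'])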
self.avg = self.sum / self.count -------------------------------------------------------------------------------- /utils/build.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import os 9 | from os.path import join as pjoin 10 | import numpy as np 11 | from distutils.core import setup 12 | from distutils.extension import Extension 13 | from Cython.Distutils import build_ext 14 | 15 | 16 | def find_in_path(name, path): 17 | "Find a file in a search path" 18 | # adapted fom http://code.activestate.com/recipes/52224-find-a-file-given-a-search-path/ 19 | for dir in path.split(os.pathsep): 20 | binpath = pjoin(dir, name) 21 | if os.path.exists(binpath): 22 | return os.path.abspath(binpath) 23 | return None 24 | 25 | 26 | def locate_cuda(): 27 | """Locate the CUDA environment on the system 28 | 29 | Returns a dict with keys 'home', 'nvcc', 'include', and 'lib64' 30 | and values giving the absolute path to each directory. 31 | 32 | Starts by looking for the CUDAHOME env variable. If not found, everything 33 | is based on finding 'nvcc' in the PATH. 34 | """ 35 | 36 | # first check if the CUDAHOME env variable is in use 37 | if 'CUDAHOME' in os.environ: 38 | home = os.environ['CUDAHOME'] 39 | nvcc = pjoin(home, 'bin', 'nvcc') 40 | else: 41 | # otherwise, search the PATH for NVCC 42 | default_path = pjoin(os.sep, 'usr', 'local', 'cuda', 'bin') 43 | nvcc = find_in_path('nvcc', 44 | os.environ['PATH'] + os.pathsep + default_path) 45 | if nvcc is None: 46 | raise EnvironmentError( 47 | 'The nvcc binary could not be ' 48 | 'located in your $PATH. Either add it to your path, or set $CUDAHOME' 49 | ) 50 | home = os.path.dirname(os.path.dirname(nvcc)) 51 | 52 | cudaconfig = { 53 | 'home': home, 54 | 'nvcc': nvcc, 55 | 'include': pjoin(home, 'include'), 56 | 'lib64': pjoin(home, 'lib64') 57 | } 58 | for k, v in cudaconfig.items(): 59 | if not os.path.exists(v): 60 | raise EnvironmentError( 61 | 'The CUDA %s path could not be located in %s' % (k, v)) 62 | 63 | return cudaconfig 64 | 65 | 66 | CUDA = locate_cuda() 67 | 68 | # Obtain the numpy include directory. This logic works across numpy versions. 69 | try: 70 | numpy_include = np.get_include() 71 | except AttributeError: 72 | numpy_include = np.get_numpy_include() 73 | 74 | 75 | def customize_compiler_for_nvcc(self): 76 | """inject deep into distutils to customize how the dispatch 77 | to gcc/nvcc works. 78 | 79 | If you subclass UnixCCompiler, it's not trivial to get your subclass 80 | injected in, and still have the right customizations (i.e. 81 | distutils.sysconfig.customize_compiler) run on it. So instead of going 82 | the OO route, I have this. Note, it's kindof like a wierd functional 83 | subclassing going on.""" 84 | 85 | # tell the compiler it can processes .cu 86 | self.src_extensions.append('.cu') 87 | 88 | # save references to the default compiler_so and _comple methods 89 | default_compiler_so = self.compiler_so 90 | super = self._compile 91 | 92 | # now redefine the _compile method. This gets executed for each 93 | # object but distutils doesn't have the ability to change compilers 94 | # based on source extension: we add it. 
95 | def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts): 96 | print(extra_postargs) 97 | if os.path.splitext(src)[1] == '.cu': 98 | # use the cuda for .cu files 99 | self.set_executable('compiler_so', CUDA['nvcc']) 100 | # use only a subset of the extra_postargs, which are 1-1 translated 101 | # from the extra_compile_args in the Extension class 102 | postargs = extra_postargs['nvcc'] 103 | else: 104 | postargs = extra_postargs['gcc'] 105 | 106 | super(obj, src, ext, cc_args, postargs, pp_opts) 107 | # reset the default compiler_so, which we might have changed for cuda 108 | self.compiler_so = default_compiler_so 109 | 110 | # inject our redefined _compile method into the class 111 | self._compile = _compile 112 | 113 | 114 | # run the customize_compiler 115 | class custom_build_ext(build_ext): 116 | def build_extensions(self): 117 | customize_compiler_for_nvcc(self.compiler) 118 | build_ext.build_extensions(self) 119 | 120 | 121 | ext_modules = [ 122 | Extension( 123 | "nms.cpu_nms", ["nms/cpu_nms.pyx"], 124 | extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]}, 125 | include_dirs=[numpy_include]), 126 | Extension( 127 | 'nms.gpu_nms', 128 | ['nms/nms_kernel.cu', 'nms/gpu_nms.pyx'], 129 | library_dirs=[CUDA['lib64']], 130 | libraries=['cudart'], 131 | language='c++', 132 | runtime_library_dirs=[CUDA['lib64']], 133 | # this syntax is specific to this build system 134 | # we're only going to use certain compiler args with nvcc and not with gcc 135 | # the implementation of this trick is in customize_compiler() below 136 | extra_compile_args={ 137 | 'gcc': ["-Wno-unused-function"], 138 | 'nvcc': [ 139 | '-arch=sm_61', '--ptxas-options=-v', '-c', 140 | '--compiler-options', "'-fPIC'" 141 | ] 142 | }, 143 | include_dirs=[numpy_include, CUDA['include']]) 144 | ] 145 | 146 | setup( 147 | name='mot_utils', 148 | ext_modules=ext_modules, 149 | # inject our custom trigger 150 | cmdclass={'build_ext': custom_build_ext}, 151 | ) 152 | -------------------------------------------------------------------------------- /utils/collections.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
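utils/build.py above is the Fast R-CNN-style Cython/CUDA build for the NMS extensions: locate_cuda() honours a CUDAHOME environment variable (falling back to nvcc on PATH), and customize_compiler_for_nvcc routes .cu files through nvcc using the 'nvcc' entry of extra_compile_args. Note the hard-coded '-arch=sm_61' targets Pascal GPUs and may need adjusting for other hardware. The usual invocation (the repo's make.sh should wrap the exact commands) is roughly:

    cd utils && python build.py build_ext --inplace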
14 | ############################################################################## 15 | """A simple attribute dictionary used for representing configuration options.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | from __future__ import unicode_literals 21 | 22 | 23 | class AttrDict(dict): 24 | 25 | IMMUTABLE = '__immutable__' 26 | 27 | def __init__(self, *args, **kwargs): 28 | super(AttrDict, self).__init__(*args, **kwargs) 29 | self.__dict__[AttrDict.IMMUTABLE] = False 30 | 31 | def __getattr__(self, name): 32 | if name in self.__dict__: 33 | return self.__dict__[name] 34 | elif name in self: 35 | return self[name] 36 | else: 37 | raise AttributeError(name) 38 | 39 | def __setattr__(self, name, value): 40 | if not self.__dict__[AttrDict.IMMUTABLE]: 41 | if name in self.__dict__: 42 | self.__dict__[name] = value 43 | else: 44 | self[name] = value 45 | else: 46 | raise AttributeError( 47 | 'Attempted to set "{}" to "{}", but AttrDict is immutable'. 48 | format(name, value)) 49 | 50 | def immutable(self, is_immutable): 51 | """Set immutability to is_immutable and recursively apply the setting 52 | to all nested AttrDicts. 53 | """ 54 | self.__dict__[AttrDict.IMMUTABLE] = is_immutable 55 | # Recursively set immutable state 56 | for v in self.__dict__.values(): 57 | if isinstance(v, AttrDict): 58 | v.immutable(is_immutable) 59 | for v in self.values(): 60 | if isinstance(v, AttrDict): 61 | v.immutable(is_immutable) 62 | 63 | def is_immutable(self): 64 | return self.__dict__[AttrDict.IMMUTABLE] 65 | -------------------------------------------------------------------------------- /utils/get_class_map.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import argparse 4 | import os.path as osp 5 | 6 | 7 | def check_size(submit_file): 8 | max_size = 60 * 1024 * 1024 9 | if osp.getsize(submit_file) > max_size: 10 | raise ( 11 | IOError, 12 | "File size exceeds the specified maximum size, which is 60M for the server." 
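A quick usage sketch for the AttrDict defined in utils/collections.py above (the values below are made up for illustration): entries can be read and written either as keys or as attributes, and immutable(True) freezes the whole nested tree, which is the usual way such option trees are locked after a config file has been merged in.

# Hypothetical AttrDict usage; assumes the repo root is on PYTHONPATH.
from utils.collections import AttrDict

cfg = AttrDict()
cfg.TRAIN = AttrDict()
cfg.TRAIN.BATCH_SIZE = 32           # attribute-style write creates a normal dict entry
print(cfg['TRAIN']['BATCH_SIZE'])   # 32: dict-style access sees the same value

cfg.immutable(True)                 # recursively freeze the tree
try:
    cfg.TRAIN.BATCH_SIZE = 64
except AttributeError as exc:
    print(exc)                      # "Attempted to set ... but AttrDict is immutable"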
13 | ) 14 | 15 | 16 | def parse_submission(submit_file): 17 | with open(submit_file, 'r') as f: 18 | lines = f.readlines() 19 | submit_dict = dict() 20 | final_dict = dict() 21 | splitlines = [x.strip().split(' ') for x in lines] 22 | for idx, val in enumerate(splitlines): 23 | cls = str(int(float(val[1]))) 24 | if cls not in submit_dict: 25 | submit_dict[cls] = list() 26 | final_dict[cls] = dict() 27 | submit_dict[cls].append( 28 | [val[0], val[2], val[3], val[4], val[5], val[6]]) 29 | for k, v in submit_dict.items(): 30 | image_ids = [x[0] for x in v] 31 | confidence = np.array([float(x[1]) for x in v]) 32 | BB = np.array([[float(z) for z in x[2:]] for x in v]) 33 | sorted_ind = np.argsort(-confidence) 34 | sorted_scores = np.sort(-confidence) 35 | BB = BB[sorted_ind, :] 36 | image_ids = [image_ids[x] for x in sorted_ind] 37 | final_dict[k]["image_ids"] = image_ids 38 | final_dict[k]["BB"] = np.array(BB) 39 | return final_dict 40 | 41 | 42 | def parse_gt_annotation(gt_file): 43 | with open(gt_file, 'r') as f: 44 | lines = f.readlines() 45 | info = [x.strip().split() for x in lines] 46 | gt = {} 47 | for item in info: 48 | img_id = item[0] 49 | obj_struct = {} 50 | obj_struct['class'] = item[1] 51 | obj_struct['bbox'] = [ 52 | int(item[2]), 53 | int(item[3]), 54 | int(item[4]), 55 | int(item[5]) 56 | ] 57 | if img_id not in gt: 58 | gt[img_id] = list() 59 | gt[img_id].append(obj_struct) 60 | return gt 61 | 62 | 63 | def get_class_recs(recs, classname): 64 | npos = 0 65 | class_recs = {} 66 | for key in recs.keys(): 67 | R = [obj for obj in recs[key] if obj['class'] == classname] 68 | bbox = np.array([x['bbox'] for x in R]) 69 | det = [False] * len(R) 70 | npos += len(R) 71 | class_recs[key] = {'bbox': bbox, 'det': det} 72 | return class_recs, npos 73 | 74 | 75 | def compute_ap(rec, prec): 76 | mrec = np.concatenate(([0.], rec, [1.])) 77 | mpre = np.concatenate(([0.], prec, [0.])) 78 | for i in range(mpre.size - 1, 0, -1): 79 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 80 | i = np.where(mrec[1:] != mrec[:-1])[0] 81 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 82 | return ap 83 | 84 | 85 | def eval(submit_file, gt_file, ovthresh, classname): 86 | recs = parse_gt_annotation(gt_file) 87 | submit_result = parse_submission(submit_file) 88 | # get one class result 89 | class_recs, npos = get_class_recs(recs, classname) 90 | image_ids = submit_result[classname]["image_ids"] 91 | BB = submit_result[classname]["BB"] 92 | nd = len(image_ids) 93 | tp = np.zeros(nd) 94 | fp = np.zeros(nd) 95 | for d in range(nd): 96 | if image_ids[d] not in recs.keys(): 97 | raise KeyError( 98 | "Can not find image {} in the groundtruth file, did you submit the result file for the right dataset?" 99 | .format(image_ids[d])) 100 | for d in range(nd): 101 | R = class_recs[image_ids[d]] 102 | bb = BB[d, :].astype(float) 103 | ovmax = -np.inf 104 | BBGT = R['bbox'].astype(float) 105 | if BBGT.size > 0: 106 | ixmin = np.maximum(BBGT[:, 0], bb[0]) 107 | iymin = np.maximum(BBGT[:, 1], bb[1]) 108 | ixmax = np.minimum(BBGT[:, 2], bb[2]) 109 | iymax = np.minimum(BBGT[:, 3], bb[3]) 110 | iw = np.maximum(ixmax - ixmin + 1., 0.) 111 | ih = np.maximum(iymax - iymin + 1., 0.) 112 | inters = iw * ih 113 | uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) + 114 | (BBGT[:, 2] - BBGT[:, 0] + 1.) * 115 | (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters) 116 | overlaps = inters / uni 117 | ovmax = np.max(overlaps) 118 | jmax = np.argmax(overlaps) 119 | if ovmax > ovthresh: 120 | if not R['det'][jmax]: 121 | tp[d] = 1. 
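compute_ap in utils/get_class_map.py above implements all-point interpolated average precision: pad the recall and precision curves with sentinels, take the running maximum of precision from right to left, then sum precision over every step where recall increases. A small worked example with toy numbers, mirroring the code above:

# Toy check of the all-point interpolation used by compute_ap; the rec/prec values are illustrative.
import numpy as np

rec = np.array([0.25, 0.5, 0.75, 1.0])
prec = np.array([1.0, 0.5, 0.66, 0.5])

mrec = np.concatenate(([0.], rec, [1.]))
mpre = np.concatenate(([0.], prec, [0.]))
for i in range(mpre.size - 1, 0, -1):       # precision envelope, monotone when read from the right
    mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])
i = np.where(mrec[1:] != mrec[:-1])[0]      # indices where recall actually increases
ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])
print(round(ap, 3))                         # 0.705 for these toy values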
122 | R['det'][jmax] = 1 123 | else: 124 | fp[d] = 1. 125 | else: 126 | fp[d] = 1. 127 | fp = np.cumsum(fp) 128 | tp = np.cumsum(tp) 129 | rec = tp / float(npos) 130 | prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) 131 | ap = compute_ap(rec, prec) 132 | return ap 133 | 134 | 135 | def result_eval(submit_file, gt, class_list): 136 | ove_aap = [] 137 | for ove in np.arange(0.5, 1.0, 0.05): 138 | cls_aap = [] 139 | for cls in class_list: 140 | ap = eval(submit_file, gt, ove, cls) 141 | cls_aap.append(ap) 142 | cls_mAP = np.average(cls_aap) 143 | print("thresh", round(ove, 3), "map", round(cls_mAP * 100, 3)) 144 | ove_aap.append(cls_mAP) 145 | mAP = np.average(ove_aap) * 100 146 | return round(mAP, 3) 147 | 148 | 149 | if __name__ == '__main__': 150 | ''' 151 | submit_file: image_id, class, score, xmin, ymin, xmax, ymax 152 | gt_file: image_id, class, xmin, ymin, xmax, ymax 153 | ''' 154 | class_list = [] 155 | for i in range(1, 61): 156 | class_list.append(str(i)) 157 | submit_file = "./results/fpn_dcn_result.csv" 158 | gt_file = "./results/val_label.txt" 159 | check_size(submit_file) 160 | mAP = result_eval(submit_file, gt_file, class_list) 161 | out = {'Average AP': str(round(mAP, 3))} 162 | print(out) -------------------------------------------------------------------------------- /utils/nms/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JialeCao001/HSD/8abcf78db5f313266a3bb3f85b9424927fe59a2d/utils/nms/__init__.py -------------------------------------------------------------------------------- /utils/nms/cpu_nms.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | cimport numpy as np 10 | 11 | cdef inline np.float32_t max(np.float32_t a, np.float32_t b): 12 | return a if a >= b else b 13 | 14 | cdef inline np.float32_t min(np.float32_t a, np.float32_t b): 15 | return a if a <= b else b 16 | 17 | cdef inline np.float32_t abs(np.float32_t a, np.float32_t b): 18 | return a - b if a >= b else b - a 19 | 20 | def get_iou_weights(np.ndarray[np.float32_t, ndim=1] ious, np.float threshold, float init_weight): 21 | 22 | cdef: 23 | int num = ious.shape[0] 24 | # np.ndarray[np.float32_t, ndim=1] out = np.zeros(num, dtype=np.float) 25 | int idx 26 | float iou 27 | float weight 28 | 29 | for idx, iou in enumerate(ious): 30 | weight = init_weight 31 | if iou > 0.0: 32 | if iou > threshold + 0.1: 33 | weight += 1.0 34 | elif iou < threshold - 0.1: 35 | weight += 1.0 36 | else: 37 | weight += 0.0 38 | ious[idx] = weight 39 | return ious 40 | 41 | def get_mask(np.ndarray[np.float32_t, ndim=1] ious, np.float threshold): 42 | cdef: 43 | int num = ious.shape[0] 44 | int idx = 0 45 | float distance 46 | float iou 47 | np.ndarray[np.int64_t, ndim=1] out = np.zeros((num), dtype=np.int64) 48 | for idx, iou in enumerate(ious): 49 | # if iou >= threshold: 50 | # distance = iou - threshold 51 | # if distance < 0.1: 52 | # out[idx] = 0 53 | # elif distance < 0.2: 54 | # out[idx] = 1 55 | # else: 56 | # out[idx] = 2 57 | # else: 58 | # distance = threshold - iou 59 | # if distance < 0.1: 60 | # out[idx] = 2 61 | # elif distance < 0.2: 62 | # out[idx] = 1 63 | # else: 64 | # out[idx] = 0 65 | 
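The overlap test inside eval() above (and the NMS code further down) uses the VOC-style convention that a box [x1, y1, x2, y2] covers (x2 - x1 + 1) by (y2 - y1 + 1) pixels, and a detection only counts as a true positive when its best IoU with a not-yet-matched ground-truth box clears the threshold. A small worked IoU example with made-up coordinates:

# Worked IoU example using the +1 pixel convention from eval(); the boxes are made up.
import numpy as np

bb = np.array([10., 10., 49., 49.])        # detection: 40 x 40 = 1600 pixels
BBGT = np.array([[20., 20., 59., 59.]])    # ground truth: also 1600 pixels

ixmin = np.maximum(BBGT[:, 0], bb[0])
iymin = np.maximum(BBGT[:, 1], bb[1])
ixmax = np.minimum(BBGT[:, 2], bb[2])
iymax = np.minimum(BBGT[:, 3], bb[3])
iw = np.maximum(ixmax - ixmin + 1., 0.)
ih = np.maximum(iymax - iymin + 1., 0.)
inters = iw * ih                            # 30 * 30 = 900
uni = 1600. + 1600. - inters                # 2300
print(inters / uni)                         # [0.3913...]: below 0.5, so this detection would be a false positive

result_eval() then repeats the per-class AP at IoU thresholds from 0.5 to 0.95 in steps of 0.05 and averages them, which is the COCO-style mAP printed by the __main__ block.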
distance = abs(iou, threshold) 66 | if distance < 0.1: 67 | # out[:,2] = 1 68 | out[idx] = 2 69 | elif distance < 0.2: 70 | # out[:,1] = 1 71 | out[idx] = 1 72 | else: 73 | # out[:,0] = 0 74 | out[idx] = 0 75 | return out 76 | 77 | def cpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh): 78 | cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:, 0] 79 | cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:, 1] 80 | cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:, 2] 81 | cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:, 3] 82 | cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4] 83 | 84 | cdef np.ndarray[np.float32_t, ndim=1] areas = (x2 - x1 + 1) * (y2 - y1 + 1) 85 | cdef np.ndarray[np.int_t, ndim=1] order = scores.argsort()[::-1] 86 | 87 | cdef int ndets = dets.shape[0] 88 | cdef np.ndarray[np.int_t, ndim=1] suppressed = \ 89 | np.zeros((ndets), dtype=np.int) 90 | 91 | # nominal indices 92 | cdef int _i, _j 93 | # sorted indices 94 | cdef int i, j 95 | # temp variables for box i's (the box currently under consideration) 96 | cdef np.float32_t ix1, iy1, ix2, iy2, iarea 97 | # variables for computing overlap with box j (lower scoring box) 98 | cdef np.float32_t xx1, yy1, xx2, yy2 99 | cdef np.float32_t w, h 100 | cdef np.float32_t inter, ovr 101 | 102 | keep = [] 103 | for _i in range(ndets): 104 | i = order[_i] 105 | if suppressed[i] == 1: 106 | continue 107 | keep.append(i) 108 | ix1 = x1[i] 109 | iy1 = y1[i] 110 | ix2 = x2[i] 111 | iy2 = y2[i] 112 | iarea = areas[i] 113 | for _j in range(_i + 1, ndets): 114 | j = order[_j] 115 | if suppressed[j] == 1: 116 | continue 117 | xx1 = max(ix1, x1[j]) 118 | yy1 = max(iy1, y1[j]) 119 | xx2 = min(ix2, x2[j]) 120 | yy2 = min(iy2, y2[j]) 121 | w = max(0.0, xx2 - xx1 + 1) 122 | h = max(0.0, yy2 - yy1 + 1) 123 | inter = w * h 124 | ovr = inter / (iarea + areas[j] - inter) 125 | if ovr >= thresh: 126 | suppressed[j] = 1 127 | 128 | return keep 129 | 130 | def cpu_soft_nms(np.ndarray[float, ndim=2] boxes, float sigma=0.5, float Nt=0.3, float threshold=0.001, unsigned int method=0): 131 | cdef unsigned int N = boxes.shape[0] 132 | cdef float iw, ih, box_area 133 | cdef float ua 134 | cdef int pos = 0 135 | cdef float maxscore = 0 136 | cdef int maxpos = 0 137 | cdef float x1,x2,y1,y2,tx1,tx2,ty1,ty2,ts,area,weight,ov 138 | 139 | for i in range(N): 140 | maxscore = boxes[i, 4] 141 | maxpos = i 142 | 143 | tx1 = boxes[i,0] 144 | ty1 = boxes[i,1] 145 | tx2 = boxes[i,2] 146 | ty2 = boxes[i,3] 147 | ts = boxes[i,4] 148 | 149 | pos = i + 1 150 | # get max box 151 | while pos < N: 152 | if maxscore < boxes[pos, 4]: 153 | maxscore = boxes[pos, 4] 154 | maxpos = pos 155 | pos = pos + 1 156 | 157 | # add max box as a detection 158 | boxes[i,0] = boxes[maxpos,0] 159 | boxes[i,1] = boxes[maxpos,1] 160 | boxes[i,2] = boxes[maxpos,2] 161 | boxes[i,3] = boxes[maxpos,3] 162 | boxes[i,4] = boxes[maxpos,4] 163 | 164 | # swap ith box with position of max box 165 | boxes[maxpos,0] = tx1 166 | boxes[maxpos,1] = ty1 167 | boxes[maxpos,2] = tx2 168 | boxes[maxpos,3] = ty2 169 | boxes[maxpos,4] = ts 170 | 171 | tx1 = boxes[i,0] 172 | ty1 = boxes[i,1] 173 | tx2 = boxes[i,2] 174 | ty2 = boxes[i,3] 175 | ts = boxes[i,4] 176 | 177 | pos = i + 1 178 | # NMS iterations, note that N changes if detection boxes fall below threshold 179 | while pos < N: 180 | x1 = boxes[pos, 0] 181 | y1 = boxes[pos, 1] 182 | x2 = boxes[pos, 2] 183 | y2 = boxes[pos, 3] 184 | s = boxes[pos, 4] 185 | 186 | area = (x2 - x1 + 1) * (y2 - y1 + 1) 187 | iw = (min(tx2, x2) - max(tx1, x1) + 1) 188 | 
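With the Cython cpu_nms above compiled (see utils/build.py and make.sh), greedy hard NMS can be exercised on a toy set of boxes; the import path below assumes the repo root is on PYTHONPATH.

# Hedged usage sketch of the compiled cpu_nms on three toy boxes (float32 [x1, y1, x2, y2, score] rows).
import numpy as np
from utils.nms.cpu_nms import cpu_nms

dets = np.array([
    [10., 10., 50., 50., 0.9],      # highest score, kept first
    [12., 12., 52., 52., 0.8],      # IoU with the first box is about 0.83, so it is suppressed
    [100., 100., 150., 150., 0.7],  # no overlap with the first box, kept
], dtype=np.float32)

print(cpu_nms(dets, 0.5))           # [0, 2]: indices of the surviving boxes in score order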
if iw > 0: 189 | ih = (min(ty2, y2) - max(ty1, y1) + 1) 190 | if ih > 0: 191 | ua = float((tx2 - tx1 + 1) * (ty2 - ty1 + 1) + area - iw * ih) 192 | ov = iw * ih / ua #iou between max box and detection box 193 | 194 | if method == 1: # linear 195 | if ov > Nt: 196 | weight = 1 - ov 197 | else: 198 | weight = 1 199 | elif method == 2: # gaussian 200 | weight = np.exp(-(ov * ov)/sigma) 201 | else: # original NMS 202 | if ov > Nt: 203 | weight = 0 204 | else: 205 | weight = 1 206 | 207 | boxes[pos, 4] = weight*boxes[pos, 4] 208 | 209 | # if box score falls below threshold, discard the box by swapping with last box 210 | # update N 211 | if boxes[pos, 4] < threshold: 212 | boxes[pos,0] = boxes[N-1, 0] 213 | boxes[pos,1] = boxes[N-1, 1] 214 | boxes[pos,2] = boxes[N-1, 2] 215 | boxes[pos,3] = boxes[N-1, 3] 216 | boxes[pos,4] = boxes[N-1, 4] 217 | N = N - 1 218 | pos = pos - 1 219 | 220 | pos = pos + 1 221 | 222 | keep = [i for i in range(N)] 223 | return keep 224 | -------------------------------------------------------------------------------- /utils/nms/gpu_nms.hpp: -------------------------------------------------------------------------------- 1 | void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, 2 | int boxes_dim, float nms_overlap_thresh, int device_id); 3 | -------------------------------------------------------------------------------- /utils/nms/gpu_nms.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Faster R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | cimport numpy as np 10 | 11 | assert sizeof(int) == sizeof(np.int32_t) 12 | 13 | cdef extern from "gpu_nms.hpp": 14 | void _nms(np.int32_t*, int*, np.float32_t*, int, int, float, int) 15 | 16 | def gpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh, 17 | np.int32_t device_id=0): 18 | cdef int boxes_num = dets.shape[0] 19 | cdef int boxes_dim = dets.shape[1] 20 | cdef int num_out 21 | cdef np.ndarray[np.int32_t, ndim=1] \ 22 | keep = np.zeros(boxes_num, dtype=np.int32) 23 | cdef np.ndarray[np.float32_t, ndim=1] \ 24 | scores = dets[:, 4] 25 | cdef np.ndarray[np.int_t, ndim=1] \ 26 | order = scores.argsort()[::-1] 27 | cdef np.ndarray[np.float32_t, ndim=2] \ 28 | sorted_dets = dets[order, :] 29 | _nms(&keep[0], &num_out, &sorted_dets[0, 0], boxes_num, boxes_dim, thresh, device_id) 30 | keep = keep[:num_out] 31 | return list(order[keep]) 32 | -------------------------------------------------------------------------------- /utils/nms/nms_kernel.cu: -------------------------------------------------------------------------------- 1 | // ------------------------------------------------------------------ 2 | // Faster R-CNN 3 | // Copyright (c) 2015 Microsoft 4 | // Licensed under The MIT License [see fast-rcnn/LICENSE for details] 5 | // Written by Shaoqing Ren 6 | // ------------------------------------------------------------------ 7 | 8 | #include "gpu_nms.hpp" 9 | #include <vector> 10 | #include <iostream> 11 | 12 | #define CUDA_CHECK(condition) \ 13 | /* Code block avoids redefinition of cudaError_t error */ \ 14 | do { \ 15 | cudaError_t error = condition; \ 16 | if (error != cudaSuccess) { \ 17 | std::cout << cudaGetErrorString(error) << std::endl; \ 18 | } \ 19 | } while (0) 20 | 21 | #define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
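cpu_soft_nms above rescales scores in place rather than dropping boxes outright: the overlap ov of each remaining box with the current highest-scoring box yields a weight that decays its score (linearly above Nt for method 1, with a Gaussian of width sigma for method 2, or the classic hard cut otherwise), and boxes whose score falls below threshold are swapped to the tail and discarded. A short self-contained illustration of the three rescoring rules with made-up numbers:

# Illustration of the soft-NMS rescoring rules above; the score and overlap values are made up.
import numpy as np

def rescore(score, ov, Nt=0.3, sigma=0.5, method=2):
    if method == 1:                       # linear decay once overlap exceeds Nt
        weight = 1.0 - ov if ov > Nt else 1.0
    elif method == 2:                     # Gaussian decay, applied to every overlap
        weight = np.exp(-(ov * ov) / sigma)
    else:                                 # classic NMS: zero out boxes above Nt
        weight = 0.0 if ov > Nt else 1.0
    return weight * score

print(round(rescore(0.9, 0.6, method=1), 3))  # 0.36
print(round(rescore(0.9, 0.6, method=2), 3))  # about 0.438
print(round(rescore(0.9, 0.6, method=0), 3))  # 0.0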
22 | int const threadsPerBlock = sizeof(unsigned long long) * 8; 23 | 24 | __device__ inline float devIoU(float const * const a, float const * const b) { 25 | float left = max(a[0], b[0]), right = min(a[2], b[2]); 26 | float top = max(a[1], b[1]), bottom = min(a[3], b[3]); 27 | float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); 28 | float interS = width * height; 29 | float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); 30 | float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1); 31 | return interS / (Sa + Sb - interS); 32 | } 33 | 34 | __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh, 35 | const float *dev_boxes, unsigned long long *dev_mask) { 36 | const int row_start = blockIdx.y; 37 | const int col_start = blockIdx.x; 38 | 39 | // if (row_start > col_start) return; 40 | 41 | const int row_size = 42 | min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); 43 | const int col_size = 44 | min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); 45 | 46 | __shared__ float block_boxes[threadsPerBlock * 5]; 47 | if (threadIdx.x < col_size) { 48 | block_boxes[threadIdx.x * 5 + 0] = 49 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0]; 50 | block_boxes[threadIdx.x * 5 + 1] = 51 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1]; 52 | block_boxes[threadIdx.x * 5 + 2] = 53 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2]; 54 | block_boxes[threadIdx.x * 5 + 3] = 55 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3]; 56 | block_boxes[threadIdx.x * 5 + 4] = 57 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4]; 58 | } 59 | __syncthreads(); 60 | 61 | if (threadIdx.x < row_size) { 62 | const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; 63 | const float *cur_box = dev_boxes + cur_box_idx * 5; 64 | int i = 0; 65 | unsigned long long t = 0; 66 | int start = 0; 67 | if (row_start == col_start) { 68 | start = threadIdx.x + 1; 69 | } 70 | for (i = start; i < col_size; i++) { 71 | if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { 72 | t |= 1ULL << i; 73 | } 74 | } 75 | const int col_blocks = DIVUP(n_boxes, threadsPerBlock); 76 | dev_mask[cur_box_idx * col_blocks + col_start] = t; 77 | } 78 | } 79 | 80 | void _set_device(int device_id) { 81 | int current_device; 82 | CUDA_CHECK(cudaGetDevice(&current_device)); 83 | if (current_device == device_id) { 84 | return; 85 | } 86 | // The call to cudaSetDevice must come before any calls to Get, which 87 | // may perform initialization using the GPU.
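On nms_kernel above: boxes are tiled into groups of threadsPerBlock = 64 (one bit per box in an unsigned long long), the launch grid is col_blocks x col_blocks thread blocks, and each thread writes one 64-bit word recording which boxes of its column group its own box suppresses. A tiny sketch of that geometry, with an arbitrary box count:

# Sketch of the nms_kernel launch geometry; 1000 boxes is an arbitrary example value.
def divup(m, n):                    # same rounding-up division as the DIVUP macro
    return m // n + (m % n > 0)

n_boxes, threads_per_block = 1000, 64
col_blocks = divup(n_boxes, threads_per_block)
print(col_blocks)                   # 16
print((col_blocks, col_blocks))     # grid of thread blocks
print(n_boxes * col_blocks)         # 16000 64-bit mask words copied back to the host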
88 | CUDA_CHECK(cudaSetDevice(device_id)); 89 | } 90 | 91 | void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, 92 | int boxes_dim, float nms_overlap_thresh, int device_id) { 93 | _set_device(device_id); 94 | 95 | float* boxes_dev = NULL; 96 | unsigned long long* mask_dev = NULL; 97 | 98 | const int col_blocks = DIVUP(boxes_num, threadsPerBlock); 99 | 100 | CUDA_CHECK(cudaMalloc(&boxes_dev, 101 | boxes_num * boxes_dim * sizeof(float))); 102 | CUDA_CHECK(cudaMemcpy(boxes_dev, 103 | boxes_host, 104 | boxes_num * boxes_dim * sizeof(float), 105 | cudaMemcpyHostToDevice)); 106 | 107 | CUDA_CHECK(cudaMalloc(&mask_dev, 108 | boxes_num * col_blocks * sizeof(unsigned long long))); 109 | 110 | dim3 blocks(DIVUP(boxes_num, threadsPerBlock), 111 | DIVUP(boxes_num, threadsPerBlock)); 112 | dim3 threads(threadsPerBlock); 113 | nms_kernel<<<blocks, threads>>>(boxes_num, 114 | nms_overlap_thresh, 115 | boxes_dev, 116 | mask_dev); 117 | 118 | std::vector<unsigned long long> mask_host(boxes_num * col_blocks); 119 | CUDA_CHECK(cudaMemcpy(&mask_host[0], 120 | mask_dev, 121 | sizeof(unsigned long long) * boxes_num * col_blocks, 122 | cudaMemcpyDeviceToHost)); 123 | 124 | std::vector<unsigned long long> remv(col_blocks); 125 | memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); 126 | 127 | int num_to_keep = 0; 128 | for (int i = 0; i < boxes_num; i++) { 129 | int nblock = i / threadsPerBlock; 130 | int inblock = i % threadsPerBlock; 131 | 132 | if (!(remv[nblock] & (1ULL << inblock))) { 133 | keep_out[num_to_keep++] = i; 134 | unsigned long long *p = &mask_host[0] + i * col_blocks; 135 | for (int j = nblock; j < col_blocks; j++) { 136 | remv[j] |= p[j]; 137 | } 138 | } 139 | } 140 | *num_out = num_to_keep; 141 | 142 | CUDA_CHECK(cudaFree(boxes_dev)); 143 | CUDA_CHECK(cudaFree(mask_dev)); 144 | } 145 | -------------------------------------------------------------------------------- /utils/nms/py_cpu_nms.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | 10 | def py_cpu_nms(dets, thresh): 11 | """Pure Python NMS baseline.""" 12 | x1 = dets[:, 0] 13 | y1 = dets[:, 1] 14 | x2 = dets[:, 2] 15 | y2 = dets[:, 3] 16 | scores = dets[:, 4] 17 | 18 | areas = (x2 - x1 + 1) * (y2 - y1 + 1) 19 | order = scores.argsort()[::-1] 20 | 21 | keep = [] 22 | while order.size > 0: 23 | i = order[0] 24 | keep.append(i) 25 | xx1 = np.maximum(x1[i], x1[order[1:]]) 26 | yy1 = np.maximum(y1[i], y1[order[1:]]) 27 | xx2 = np.minimum(x2[i], x2[order[1:]]) 28 | yy2 = np.minimum(y2[i], y2[order[1:]]) 29 | 30 | w = np.maximum(0.0, xx2 - xx1 + 1) 31 | h = np.maximum(0.0, yy2 - yy1 + 1) 32 | inter = w * h 33 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 34 | 35 | inds = np.where(ovr <= thresh)[0] 36 | order = order[inds + 1] 37 | 38 | return keep 39 | -------------------------------------------------------------------------------- /utils/nms_wrapper.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | from .nms.cpu_nms
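The tail of _nms above reduces those per-block masks on the CPU: boxes are visited in score order (gpu_nms.pyx passes them pre-sorted), each box that no earlier kept box has flagged is kept, and its mask row is OR-ed into remv so that everything it suppresses is skipped later. A pure-Python emulation of that loop with a toy mask, for illustration only:

# Pure-Python emulation of the host-side mask reduction in _nms; the toy mask values are made up.
def reduce_masks(mask_host, boxes_num, threads_per_block=64):
    col_blocks = (boxes_num + threads_per_block - 1) // threads_per_block
    remv = [0] * col_blocks
    keep = []
    for i in range(boxes_num):
        nblock, inblock = divmod(i, threads_per_block)
        if not (remv[nblock] >> inblock) & 1:          # not suppressed by any earlier kept box
            keep.append(i)
            row = mask_host[i * col_blocks:(i + 1) * col_blocks]
            for j in range(nblock, col_blocks):        # merge this box's suppression mask
                remv[j] |= row[j]
    return keep

# Box 0 suppresses box 1 (bit 1 set in its mask word); box 2 survives.
print(reduce_masks([0b010, 0b000, 0b000], 3))          # [0, 2]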
import cpu_nms, cpu_soft_nms 9 | from .nms.gpu_nms import gpu_nms 10 | 11 | # def nms(dets, thresh, force_cpu=False): 12 | # """Dispatch to either CPU or GPU NMS implementations.""" 13 | 14 | # if dets.shape[0] == 0: 15 | # return [] 16 | # if cfg.USE_GPU_NMS and not force_cpu: 17 | # return gpu_nms(dets, thresh, device_id=cfg.GPU_ID) 18 | # else: 19 | # return cpu_nms(dets, thresh) 20 | 21 | 22 | def nms(dets, thresh, force_cpu=False): 23 | """Dispatch to either CPU or GPU NMS implementations.""" 24 | 25 | if dets.shape[0] == 0: 26 | return [] 27 | if force_cpu: 28 | #return cpu_soft_nms(dets, thresh, method = 0) 29 | return cpu_nms(dets, thresh) 30 | return gpu_nms(dets, thresh) 31 | 32 | 33 | def soft_nms(dets, Nt=0.3, sigma=0.5, thresh=0.001, method=1): 34 | """Dispatch to either CPU or GPU NMS implementations.""" 35 | 36 | if dets.shape[0] == 0: 37 | return [] 38 | return cpu_soft_nms(dets, sigma, Nt, thresh, method) -------------------------------------------------------------------------------- /utils/timer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import time 9 | 10 | 11 | class Timer(object): 12 | """A simple timer.""" 13 | 14 | def __init__(self): 15 | self.total_time = 0. 16 | self.calls = 0 17 | self.start_time = 0. 18 | self.diff = 0. 19 | self.average_time = 0. 20 | 21 | def tic(self): 22 | # using time.time instead of time.clock because time time.clock 23 | # does not normalize for multithreading 24 | self.start_time = time.time() 25 | 26 | def toc(self, average=True): 27 | self.diff = time.time() - self.start_time 28 | self.total_time += self.diff 29 | self.calls += 1 30 | self.average_time = self.total_time / self.calls 31 | if average: 32 | return self.average_time 33 | else: 34 | return self.diff 35 | 36 | def clear(self): 37 | self.total_time = 0. 38 | self.calls = 0 39 | self.start_time = 0. 40 | self.diff = 0. 41 | self.average_time = 0. 42 | --------------------------------------------------------------------------------
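Two closing usage notes. In utils/nms_wrapper.py above, nms() dispatches to the compiled GPU kernel unless force_cpu=True, while soft_nms() always runs the Cython cpu_soft_nms with the chosen decay method. The Timer in utils/timer.py is a simple tic/toc helper typically wrapped around per-image work; toc(average=True) returns the running mean over all calls and toc(average=False) returns only the last interval. A hedged usage sketch, with sleep standing in for real work:

# Hedged usage sketch of utils/timer.Timer; assumes the repo root is on PYTHONPATH.
import time
from utils.timer import Timer

timer = Timer()
for _ in range(3):
    timer.tic()
    time.sleep(0.01)                       # stand-in for one forward pass or one NMS call
    last = timer.toc(average=False)        # duration of this call only
print(timer.calls, round(last, 4), round(timer.average_time, 4))  # 3, last interval, running mean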