├── mmdet
│   └── models
│       ├── utils
│       │   ├── __init__.py
│       │   └── effdet_utils.py
│       ├── necks
│       │   ├── __init__.py
│       │   └── bifpn.py
│       ├── backbones
│       │   ├── __init__.py
│       │   └── efficientnet.py
│       └── anchor_heads
│           ├── __init__.py
│           └── retina_sepconv_head.py
├── LICENSE
├── .gitignore
├── README.md
└── configs
    └── efficientdet
        ├── efficientdet_d1_8gpu.py
        ├── efficientdet_d2_8gpu.py
        ├── efficientdet_d3_8gpu.py
        ├── efficientdet_d4_8gpu.py
        ├── efficientdet_d0_4gpu.py
        └── efficientdet_d5_8gpu.py
--------------------------------------------------------------------------------
/mmdet/models/utils/__init__.py:
--------------------------------------------------------------------------------
from .weight_init import bias_init_with_prob
from .effdet_utils import MemoryEfficientSwish, Swish, SeparableConv2d

__all__ = ['bias_init_with_prob', 'MemoryEfficientSwish', 'SeparableConv2d', 'Swish']
--------------------------------------------------------------------------------
/mmdet/models/necks/__init__.py:
--------------------------------------------------------------------------------
from .bfp import BFP
from .fpn import FPN
from .fpn_carafe import FPN_CARAFE
from .hrfpn import HRFPN
from .nas_fpn import NASFPN
from .bifpn import BiFPN

__all__ = ['FPN', 'BFP', 'HRFPN', 'NASFPN', 'FPN_CARAFE', 'BiFPN']
--------------------------------------------------------------------------------
/mmdet/models/backbones/__init__.py:
--------------------------------------------------------------------------------
from .hrnet import HRNet
from .resnet import ResNet, make_res_layer
from .resnext import ResNeXt
from .ssd_vgg import SSDVGG
from .efficientnet import EfficientNet

__all__ = ['ResNet', 'make_res_layer', 'ResNeXt', 'SSDVGG', 'HRNet', 'EfficientNet']
--------------------------------------------------------------------------------
/mmdet/models/anchor_heads/__init__.py:
--------------------------------------------------------------------------------
from .anchor_head import AnchorHead
from .atss_head import ATSSHead
from .fcos_head import FCOSHead
from .fovea_head import FoveaHead
from .free_anchor_retina_head import FreeAnchorRetinaHead
from .ga_retina_head import GARetinaHead
from .ga_rpn_head import GARPNHead
from .guided_anchor_head import FeatureAdaption, GuidedAnchorHead
from .reppoints_head import RepPointsHead
from .retina_head import RetinaHead
from .retina_sepbn_head import RetinaSepBNHead
from .rpn_head import RPNHead
from .ssd_head import SSDHead
from .retina_sepconv_head import RetinaSepConvHead

__all__ = [
    'AnchorHead', 'GuidedAnchorHead', 'FeatureAdaption', 'RPNHead',
    'GARPNHead', 'RetinaHead', 'RetinaSepBNHead', 'GARetinaHead', 'SSDHead',
    'FCOSHead', 'RepPointsHead', 'FoveaHead', 'FreeAnchorRetinaHead', 'RetinaSepConvHead',
    'ATSSHead'
]
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2020 lucifer443

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following
conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /mmdet/models/utils/effdet_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from mmcv.cnn import xavier_init 4 | from mmdet.ops import ConvModule 5 | 6 | 7 | class SwishImplementation(torch.autograd.Function): 8 | @staticmethod 9 | def forward(ctx, i): 10 | result = i * torch.sigmoid(i) 11 | ctx.save_for_backward(i) 12 | return result 13 | 14 | @staticmethod 15 | def backward(ctx, grad_output): 16 | i = ctx.saved_variables[0] 17 | sigmoid_i = torch.sigmoid(i) 18 | return grad_output * (sigmoid_i * (1 + i * (1 - sigmoid_i))) 19 | 20 | 21 | class MemoryEfficientSwish(nn.Module): 22 | def forward(self, x): 23 | return SwishImplementation.apply(x) 24 | 25 | 26 | class Swish(nn.Module): 27 | def forward(self, x): 28 | return x * torch.sigmoid(x) 29 | 30 | 31 | class SeparableConv2d(nn.Module): 32 | def __init__(self, in_channels, 33 | out_channels, 34 | kernel_size=1, 35 | stride=1, 36 | padding=0, 37 | dilation=1, 38 | norm_cfg=dict(type='BN', momentum=0.003, eps=1e-4, requires_grad=True), 39 | activation=None, 40 | bias=False): 41 | super(SeparableConv2d, self).__init__() 42 | self.depthwise = nn.Conv2d(in_channels, in_channels, kernel_size, 43 | stride, padding, dilation, groups=in_channels, bias=False) 44 | self.pointwise = ConvModule(in_channels, out_channels, 1, norm_cfg=norm_cfg, act_cfg=None, bias=bias, inplace=False) 45 | if activation == "ReLU": 46 | self.act = nn.ReLU() 47 | elif activation == "Swish": 48 | self.act = MemoryEfficientSwish() 49 | else: 50 | self.act = None 51 | 52 | def init_weights(self): 53 | xavier_init(self.depthwise, distribution='uniform') 54 | xavier_init(self.pointwise.conv, distribution='uniform') 55 | 56 | def forward(self, x): 57 | x = self.depthwise(x) 58 | x = self.pointwise(x) 59 | if self.act: 60 | x = self.act(x) 61 | return x 62 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# EfficientDet-Pytorch
This project is an implementation of EfficientDet built on mmdetection.

It is based on:

* the paper [EfficientDet: Scalable and Efficient Object Detection](https://arxiv.org/abs/1911.09070)
* the [official TensorFlow implementation](https://github.com/google/automl)
* the [PyTorch implementation of EfficientNet](https://github.com/lukemelas/EfficientNet-PyTorch)

## Models

| Variant | mAP (val2017) | Params | FLOPs   | mAP (val2017) in paper | Params in paper | FLOPs in paper |
| ------- | ------------- | ------ | ------- | ---------------------- | --------------- | -------------- |
| D0      | 32.02         | 3.87M  | 2.55B   | 33.5                   | 3.9M            | 2.5B           |
| D1      | 37.78         | 6.62M  | 6.12B   | 39.1                   | 6.6M            | 6.1B           |
| D2      | ——            | 8.09M  | 11B     | 42.5                   | 8.1M            | 11B            |
| D3      | ——            | 12.02M | 24.88B  | 45.9                   | 12M             | 25B            |
| D4      | ——            | 20.7M  | 55.13B  | 49.0                   | 21M             | 55B            |
| D5      | ——            | 33.63M | 135.31B | 50.5                   | 34M             | 135B           |
| D6      | ——            | ——     | ——      | 51.3                   | 52M             | 226B           |

## Usage

1. **Install mmdetection**

   This implementation is based on [mmdetection](https://github.com/open-mmlab/mmdetection) (v1.1.0+8732ed9). Please refer to [INSTALL.md](docs/INSTALL.md) for installation and dataset preparation.

2. **Copy the code into the mmdetection directory**

   ```shell
   cp -r mmdet/ ${MMDETECTION_PATH}/
   cp -r configs/ ${MMDETECTION_PATH}/
   ```

3. **Prepare data**

   The directories should be arranged like this:

   > mmdetection
   > ├── mmdet
   > ├── tools
   > ├── configs
   > ├── data
   > │   ├── coco
   > │   │   ├── annotations
   > │   │   ├── train2017
   > │   │   ├── val2017
   > │   │   ├── test2017

4. **Train D0 with 4 GPUs**

   ```shell
   CONFIG_FILE=configs/efficientdet/efficientdet_d0_4gpu.py
   ./tools/dist_train.sh ${CONFIG_FILE} 4
   ```

5. **Calculate parameters and FLOPs**

   ```shell
   python tools/get_flops.py ${CONFIG_FILE} --shape $SIZE $SIZE
   ```

6. **Test**

   ```shell
   python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} --out ${OUTPUT_FILE} --eval bbox
   ```

For more usage, refer to the [mmdetection documentation](https://mmdetection.readthedocs.io/en/latest/GETTING_STARTED.html#inference-with-pretrained-models).

## Update log

- [2020-04-27] Updated results and added SyncBN in the backbone.
- [2020-04-20] Fixed some bugs in BiFPN and switched to separate BN in the head.
- [2020-04-17] Added the efficientdet-d0 training config.
- [2020-04-16] Added efficientnet.py and retina_sepconv_head.py.
- [2020-04-06] Created this repository.

## Notice

1. For some reasons I can't release the trained models, but you can reproduce the results easily using the config files provided here.
2. I find the training procedure of EfficientDet unstable: there is a small chance that results come out up to 3% mAP lower.
3. The number of BiFPN layers in the latest version of the paper differs slightly from the first version, although the parameters and FLOPs are the same. This implementation follows the latest version of the paper.
4. Training from scratch is time-consuming. For example, it took me 4 days to train D0 from scratch on four TITAN V GPUs.
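## Inference example

Once a checkpoint has been trained, single-image inference can also be run from Python through mmdetection's high-level APIs. The snippet below is a minimal sketch against the mmdet v1.x API; the checkpoint path and test image are placeholders for whatever your own training run produced.

```python
from mmdet.apis import init_detector, inference_detector, show_result

config_file = 'configs/efficientdet/efficientdet_d0_4gpu.py'
# hypothetical path: work_dir is './efficientdet_d0' in the config, so the
# latest checkpoint normally lands there
checkpoint_file = 'efficientdet_d0/latest.pth'

# build the detector from the config and load the trained weights
model = init_detector(config_file, checkpoint_file, device='cuda:0')

# result is a list with one array of [x1, y1, x2, y2, score] rows per class
result = inference_detector(model, 'demo/demo.jpg')
show_result('demo/demo.jpg', result, model.CLASSES,
            score_thr=0.3, out_file='demo_out.jpg')
```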
86 | 87 | 88 | 89 | -------------------------------------------------------------------------------- /configs/efficientdet/efficientdet_d1_8gpu.py: -------------------------------------------------------------------------------- 1 | cudnn_benchmark = True 2 | # model settings 3 | norm_cfg = dict(type='BN', momentum=0.01, eps=1e-3, requires_grad=True) # using SyncBN during training 4 | model = dict( 5 | type='RetinaNet', 6 | pretrained='pretrained/efficientnet-b1-f1951068.pth', 7 | backbone=dict( 8 | type='EfficientNet', 9 | arch='efficientnet-b1', 10 | out_indices=[4, 6, 8], 11 | norm_cfg=norm_cfg, 12 | norm_eval=False), 13 | neck=dict( 14 | type='BiFPN', 15 | in_channels=[40, 112, 320], 16 | target_size_list=[80, 40, 20, 10, 5], 17 | out_channels=88, 18 | stack=4, 19 | start_level=0, 20 | norm_cfg=norm_cfg, 21 | num_outs=5), 22 | bbox_head=dict( 23 | type='RetinaSepConvHead', 24 | num_classes=81, 25 | num_ins=5, 26 | in_channels=88, 27 | stacked_convs=3, 28 | feat_channels=88, 29 | octave_base_scale=4, 30 | scales_per_octave=3, 31 | anchor_ratios=[0.5, 1.0, 2.0], 32 | anchor_strides=[8, 16, 32, 64, 128], 33 | target_means=[.0, .0, .0, .0], 34 | target_stds=[1.0, 1.0, 1.0, 1.0], 35 | norm_cfg=norm_cfg, 36 | loss_cls=dict( 37 | type='FocalLoss', 38 | use_sigmoid=True, 39 | gamma=1.5, 40 | alpha=0.25, 41 | loss_weight=1.0), 42 | loss_bbox=dict(type='SmoothL1Loss', beta=0.11, loss_weight=1.0))) 43 | # training and testing settings 44 | train_cfg = dict( 45 | assigner=dict( 46 | type='MaxIoUAssigner', 47 | pos_iou_thr=0.5, 48 | neg_iou_thr=0.5, 49 | min_pos_iou=0, 50 | ignore_iof_thr=-1), 51 | allowed_border=-1, 52 | pos_weight=-1, 53 | debug=False) 54 | test_cfg = dict( 55 | nms_pre=1000, 56 | min_bbox_size=0, 57 | score_thr=0.05, 58 | nms=dict(type='nms', iou_thr=0.5), 59 | max_per_img=100) 60 | # dataset settings 61 | dataset_type = 'CocoDataset' 62 | data_root = 'data/coco/' 63 | img_norm_cfg = dict( 64 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 65 | img_size = 640 66 | train_pipeline = [ 67 | dict(type='LoadImageFromFile'), 68 | dict(type='LoadAnnotations', with_bbox=True), 69 | dict( 70 | type='Resize', 71 | img_scale=(img_size, img_size), 72 | ratio_range=(0.1, 2.0), 73 | keep_ratio=True), 74 | dict(type='RandomCrop', crop_size=(img_size, img_size)), 75 | dict(type='RandomFlip', flip_ratio=0.5), 76 | dict(type='Normalize', **img_norm_cfg), 77 | dict(type='Pad', size=(img_size, img_size)), 78 | dict(type='DefaultFormatBundle'), 79 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), 80 | ] 81 | test_pipeline = [ 82 | dict(type='LoadImageFromFile'), 83 | dict( 84 | type='MultiScaleFlipAug', 85 | img_scale=(img_size, img_size), 86 | flip=False, 87 | transforms=[ 88 | dict(type='Resize', keep_ratio=True), 89 | dict(type='RandomFlip'), 90 | dict(type='Normalize', **img_norm_cfg), 91 | dict(type='Pad', size=(img_size, img_size)), 92 | dict(type='ImageToTensor', keys=['img']), 93 | dict(type='Collect', keys=['img']), 94 | ]) 95 | ] 96 | data = dict( 97 | imgs_per_gpu=8, 98 | workers_per_gpu=4, 99 | train=dict( 100 | type=dataset_type, 101 | ann_file=data_root + 'annotations/instances_train2017.json', 102 | img_prefix=data_root + 'train2017/', 103 | pipeline=train_pipeline), 104 | val=dict( 105 | type=dataset_type, 106 | ann_file=data_root + 'annotations/instances_val2017.json', 107 | img_prefix=data_root + 'val2017/', 108 | pipeline=test_pipeline), 109 | test=dict( 110 | type=dataset_type, 111 | ann_file=data_root + 
'annotations/instances_val2017.json', 112 | img_prefix=data_root + 'val2017/', 113 | pipeline=test_pipeline)) 114 | evaluation = dict(interval=1, metric='bbox') 115 | # optimizer 116 | optimizer = dict( 117 | type='SGD', 118 | lr=0.08, 119 | momentum=0.9, 120 | weight_decay=4e-5, 121 | paramwise_options=dict(norm_decay_mult=0)) 122 | optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2)) 123 | # learning policy 124 | lr_config = dict( 125 | policy='cosine', 126 | warmup='linear', 127 | warmup_iters=2000, 128 | warmup_ratio=0.008) 129 | checkpoint_config = dict(interval=10) 130 | # yapf:disable 131 | log_config = dict( 132 | interval=50, 133 | hooks=[ 134 | dict(type='TextLoggerHook'), 135 | # dict(type='TensorboardLoggerHook') 136 | ]) 137 | # yapf:enable 138 | # runtime settings 139 | total_epochs = 300 140 | dist_params = dict(backend='nccl') 141 | log_level = 'INFO' 142 | work_dir = './efficientdet_d1' 143 | load_from = None 144 | resume_from = None 145 | workflow = [('train', 1)] 146 | -------------------------------------------------------------------------------- /configs/efficientdet/efficientdet_d2_8gpu.py: -------------------------------------------------------------------------------- 1 | cudnn_benchmark = True 2 | # model settings 3 | norm_cfg = dict(type='BN', momentum=0.01, eps=1e-3, requires_grad=True) # using SyncBN during training 4 | model = dict( 5 | type='RetinaNet', 6 | pretrained='pretrained/efficientnet-b2-8bb5946d.pth', 7 | backbone=dict( 8 | type='EfficientNet', 9 | arch='efficientnet-b2', 10 | out_indices=[4, 6, 8], 11 | norm_cfg=norm_cfg, 12 | norm_eval=False), 13 | neck=dict( 14 | type='BiFPN', 15 | in_channels=[40, 112, 320], 16 | target_size_list=[96, 48, 24, 12, 6], 17 | out_channels=112, 18 | stack=5, 19 | start_level=0, 20 | norm_cfg=norm_cfg, 21 | num_outs=5), 22 | bbox_head=dict( 23 | type='RetinaSepConvHead', 24 | num_classes=81, 25 | num_ins=5, 26 | in_channels=112, 27 | stacked_convs=3, 28 | feat_channels=112, 29 | octave_base_scale=4, 30 | scales_per_octave=3, 31 | anchor_ratios=[0.5, 1.0, 2.0], 32 | anchor_strides=[8, 16, 32, 64, 128], 33 | target_means=[.0, .0, .0, .0], 34 | target_stds=[1.0, 1.0, 1.0, 1.0], 35 | norm_cfg=norm_cfg, 36 | loss_cls=dict( 37 | type='FocalLoss', 38 | use_sigmoid=True, 39 | gamma=1.5, 40 | alpha=0.25, 41 | loss_weight=1.0), 42 | loss_bbox=dict(type='SmoothL1Loss', beta=0.11, loss_weight=1.0))) 43 | # training and testing settings 44 | train_cfg = dict( 45 | assigner=dict( 46 | type='MaxIoUAssigner', 47 | pos_iou_thr=0.5, 48 | neg_iou_thr=0.5, 49 | min_pos_iou=0, 50 | ignore_iof_thr=-1), 51 | allowed_border=-1, 52 | pos_weight=-1, 53 | debug=False) 54 | test_cfg = dict( 55 | nms_pre=1000, 56 | min_bbox_size=0, 57 | score_thr=0.05, 58 | nms=dict(type='nms', iou_thr=0.5), 59 | max_per_img=100) 60 | # dataset settings 61 | dataset_type = 'CocoDataset' 62 | data_root = 'data/coco/' 63 | img_norm_cfg = dict( 64 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 65 | img_size = 768 66 | train_pipeline = [ 67 | dict(type='LoadImageFromFile'), 68 | dict(type='LoadAnnotations', with_bbox=True), 69 | dict( 70 | type='Resize', 71 | img_scale=(img_size, img_size), 72 | ratio_range=(0.1, 2.0), 73 | keep_ratio=True), 74 | dict(type='RandomCrop', crop_size=(img_size, img_size)), 75 | dict(type='RandomFlip', flip_ratio=0.5), 76 | dict(type='Normalize', **img_norm_cfg), 77 | dict(type='Pad', size=(img_size, img_size)), 78 | dict(type='DefaultFormatBundle'), 79 | dict(type='Collect', keys=['img', 
'gt_bboxes', 'gt_labels']), 80 | ] 81 | test_pipeline = [ 82 | dict(type='LoadImageFromFile'), 83 | dict( 84 | type='MultiScaleFlipAug', 85 | img_scale=(img_size, img_size), 86 | flip=False, 87 | transforms=[ 88 | dict(type='Resize', keep_ratio=True), 89 | dict(type='RandomFlip'), 90 | dict(type='Normalize', **img_norm_cfg), 91 | dict(type='Pad', size=(img_size, img_size)), 92 | dict(type='ImageToTensor', keys=['img']), 93 | dict(type='Collect', keys=['img']), 94 | ]) 95 | ] 96 | data = dict( 97 | imgs_per_gpu=8, 98 | workers_per_gpu=4, 99 | train=dict( 100 | type=dataset_type, 101 | ann_file=data_root + 'annotations/instances_train2017.json', 102 | img_prefix=data_root + 'train2017/', 103 | pipeline=train_pipeline), 104 | val=dict( 105 | type=dataset_type, 106 | ann_file=data_root + 'annotations/instances_val2017.json', 107 | img_prefix=data_root + 'val2017/', 108 | pipeline=test_pipeline), 109 | test=dict( 110 | type=dataset_type, 111 | ann_file=data_root + 'annotations/instances_val2017.json', 112 | img_prefix=data_root + 'val2017/', 113 | pipeline=test_pipeline)) 114 | evaluation = dict(interval=1, metric='bbox') 115 | # optimizer 116 | optimizer = dict( 117 | type='SGD', 118 | lr=0.08, 119 | momentum=0.9, 120 | weight_decay=4e-5, 121 | paramwise_options=dict(norm_decay_mult=0)) 122 | optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2)) 123 | # learning policy 124 | lr_config = dict( 125 | policy='cosine', 126 | warmup='linear', 127 | warmup_iters=2000, 128 | warmup_ratio=0.008) 129 | checkpoint_config = dict(interval=10) 130 | # yapf:disable 131 | log_config = dict( 132 | interval=50, 133 | hooks=[ 134 | dict(type='TextLoggerHook'), 135 | # dict(type='TensorboardLoggerHook') 136 | ]) 137 | # yapf:enable 138 | # runtime settings 139 | total_epochs = 300 140 | dist_params = dict(backend='nccl') 141 | log_level = 'INFO' 142 | work_dir = './efficientdet_d2' 143 | load_from = None 144 | resume_from = None 145 | workflow = [('train', 1)] 146 | -------------------------------------------------------------------------------- /configs/efficientdet/efficientdet_d3_8gpu.py: -------------------------------------------------------------------------------- 1 | cudnn_benchmark = True 2 | # model settings 3 | norm_cfg = dict(type='BN', momentum=0.01, eps=1e-3, requires_grad=True) # using SyncBN during training 4 | model = dict( 5 | type='RetinaNet', 6 | pretrained='pretrained/efficientnet-b3-5fb5a3c3.pth', 7 | backbone=dict( 8 | type='EfficientNet', 9 | arch='efficientnet-b3', 10 | out_indices=[4, 6, 8], 11 | norm_cfg=norm_cfg, 12 | norm_eval=False), 13 | neck=dict( 14 | type='BiFPN', 15 | in_channels=[48, 136, 384], 16 | target_size_list=[112, 56, 28, 14, 7], 17 | out_channels=160, 18 | stack=6, 19 | start_level=0, 20 | norm_cfg=norm_cfg, 21 | num_outs=5), 22 | bbox_head=dict( 23 | type='RetinaSepConvHead', 24 | num_classes=81, 25 | num_ins=5, 26 | in_channels=160, 27 | stacked_convs=4, 28 | feat_channels=160, 29 | octave_base_scale=4, 30 | scales_per_octave=3, 31 | anchor_ratios=[0.5, 1.0, 2.0], 32 | anchor_strides=[8, 16, 32, 64, 128], 33 | target_means=[.0, .0, .0, .0], 34 | target_stds=[1.0, 1.0, 1.0, 1.0], 35 | norm_cfg=norm_cfg, 36 | loss_cls=dict( 37 | type='FocalLoss', 38 | use_sigmoid=True, 39 | gamma=1.5, 40 | alpha=0.25, 41 | loss_weight=1.0), 42 | loss_bbox=dict(type='SmoothL1Loss', beta=0.11, loss_weight=1.0))) 43 | # training and testing settings 44 | train_cfg = dict( 45 | assigner=dict( 46 | type='MaxIoUAssigner', 47 | pos_iou_thr=0.5, 48 | neg_iou_thr=0.5, 49 | 
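        # pos_iou_thr == neg_iou_thr == 0.5 leaves no "ignore" interval:
        # anchors with IoU >= 0.5 become positives, everything below is negative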
min_pos_iou=0, 50 | ignore_iof_thr=-1), 51 | allowed_border=-1, 52 | pos_weight=-1, 53 | debug=False) 54 | test_cfg = dict( 55 | nms_pre=1000, 56 | min_bbox_size=0, 57 | score_thr=0.05, 58 | nms=dict(type='nms', iou_thr=0.5), 59 | max_per_img=100) 60 | # dataset settings 61 | dataset_type = 'CocoDataset' 62 | data_root = 'data/coco/' 63 | img_norm_cfg = dict( 64 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 65 | img_size = 896 66 | train_pipeline = [ 67 | dict(type='LoadImageFromFile'), 68 | dict(type='LoadAnnotations', with_bbox=True), 69 | dict( 70 | type='Resize', 71 | img_scale=(img_size, img_size), 72 | ratio_range=(0.1, 2.0), 73 | keep_ratio=True), 74 | dict(type='RandomCrop', crop_size=(img_size, img_size)), 75 | dict(type='RandomFlip', flip_ratio=0.5), 76 | dict(type='Normalize', **img_norm_cfg), 77 | dict(type='Pad', size=(img_size, img_size)), 78 | dict(type='DefaultFormatBundle'), 79 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), 80 | ] 81 | test_pipeline = [ 82 | dict(type='LoadImageFromFile'), 83 | dict( 84 | type='MultiScaleFlipAug', 85 | img_scale=(img_size, img_size), 86 | flip=False, 87 | transforms=[ 88 | dict(type='Resize', keep_ratio=True), 89 | dict(type='RandomFlip'), 90 | dict(type='Normalize', **img_norm_cfg), 91 | dict(type='Pad', size=(img_size, img_size)), 92 | dict(type='ImageToTensor', keys=['img']), 93 | dict(type='Collect', keys=['img']), 94 | ]) 95 | ] 96 | data = dict( 97 | imgs_per_gpu=8, 98 | workers_per_gpu=4, 99 | train=dict( 100 | type=dataset_type, 101 | ann_file=data_root + 'annotations/instances_train2017.json', 102 | img_prefix=data_root + 'train2017/', 103 | pipeline=train_pipeline), 104 | val=dict( 105 | type=dataset_type, 106 | ann_file=data_root + 'annotations/instances_val2017.json', 107 | img_prefix=data_root + 'val2017/', 108 | pipeline=test_pipeline), 109 | test=dict( 110 | type=dataset_type, 111 | ann_file=data_root + 'annotations/instances_val2017.json', 112 | img_prefix=data_root + 'val2017/', 113 | pipeline=test_pipeline)) 114 | evaluation = dict(interval=1, metric='bbox') 115 | # optimizer 116 | optimizer = dict( 117 | type='SGD', 118 | lr=0.08, 119 | momentum=0.9, 120 | weight_decay=4e-5, 121 | paramwise_options=dict(norm_decay_mult=0)) 122 | optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2)) 123 | # learning policy 124 | lr_config = dict( 125 | policy='cosine', 126 | warmup='linear', 127 | warmup_iters=2000, 128 | warmup_ratio=0.008) 129 | checkpoint_config = dict(interval=10) 130 | # yapf:disable 131 | log_config = dict( 132 | interval=50, 133 | hooks=[ 134 | dict(type='TextLoggerHook'), 135 | # dict(type='TensorboardLoggerHook') 136 | ]) 137 | # yapf:enable 138 | # runtime settings 139 | total_epochs = 300 140 | dist_params = dict(backend='nccl') 141 | log_level = 'INFO' 142 | work_dir = './efficientdet_d3' 143 | load_from = None 144 | resume_from = None 145 | workflow = [('train', 1)] 146 | -------------------------------------------------------------------------------- /configs/efficientdet/efficientdet_d4_8gpu.py: -------------------------------------------------------------------------------- 1 | cudnn_benchmark = True 2 | # model settings 3 | norm_cfg = dict(type='BN', momentum=0.01, eps=1e-3, requires_grad=True) # using SyncBN during training 4 | model = dict( 5 | type='RetinaNet', 6 | pretrained='pretrained/efficientnet-b4-6ed6700e.pth', 7 | backbone=dict( 8 | type='EfficientNet', 9 | arch='efficientnet-b4', 10 | out_indices=[4, 6, 8], 11 | norm_cfg=norm_cfg, 12 | 
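        # norm_eval=False keeps the backbone BN layers in train mode, so their
        # running statistics keep updating while the detector trains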
norm_eval=False), 13 | neck=dict( 14 | type='BiFPN', 15 | in_channels=[56, 160, 448], 16 | target_size_list=[128, 64, 32, 16, 8], 17 | out_channels=224, 18 | stack=7, 19 | start_level=0, 20 | norm_cfg=norm_cfg, 21 | num_outs=5), 22 | bbox_head=dict( 23 | type='RetinaSepConvHead', 24 | num_classes=81, 25 | num_ins=5, 26 | in_channels=224, 27 | stacked_convs=4, 28 | feat_channels=224, 29 | octave_base_scale=4, 30 | scales_per_octave=3, 31 | anchor_ratios=[0.5, 1.0, 2.0], 32 | anchor_strides=[8, 16, 32, 64, 128], 33 | target_means=[.0, .0, .0, .0], 34 | target_stds=[1.0, 1.0, 1.0, 1.0], 35 | norm_cfg=norm_cfg, 36 | loss_cls=dict( 37 | type='FocalLoss', 38 | use_sigmoid=True, 39 | gamma=1.5, 40 | alpha=0.25, 41 | loss_weight=1.0), 42 | loss_bbox=dict(type='SmoothL1Loss', beta=0.11, loss_weight=1.0))) 43 | # training and testing settings 44 | train_cfg = dict( 45 | assigner=dict( 46 | type='MaxIoUAssigner', 47 | pos_iou_thr=0.5, 48 | neg_iou_thr=0.5, 49 | min_pos_iou=0, 50 | ignore_iof_thr=-1), 51 | allowed_border=-1, 52 | pos_weight=-1, 53 | debug=False) 54 | test_cfg = dict( 55 | nms_pre=1000, 56 | min_bbox_size=0, 57 | score_thr=0.05, 58 | nms=dict(type='nms', iou_thr=0.5), 59 | max_per_img=100) 60 | # dataset settings 61 | dataset_type = 'CocoDataset' 62 | data_root = 'data/coco/' 63 | img_norm_cfg = dict( 64 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 65 | img_size = 1024 66 | train_pipeline = [ 67 | dict(type='LoadImageFromFile'), 68 | dict(type='LoadAnnotations', with_bbox=True), 69 | dict( 70 | type='Resize', 71 | img_scale=(img_size, img_size), 72 | ratio_range=(0.1, 2.0), 73 | keep_ratio=True), 74 | dict(type='RandomCrop', crop_size=(img_size, img_size)), 75 | dict(type='RandomFlip', flip_ratio=0.5), 76 | dict(type='Normalize', **img_norm_cfg), 77 | dict(type='Pad', size=(img_size, img_size)), 78 | dict(type='DefaultFormatBundle'), 79 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), 80 | ] 81 | test_pipeline = [ 82 | dict(type='LoadImageFromFile'), 83 | dict( 84 | type='MultiScaleFlipAug', 85 | img_scale=(img_size, img_size), 86 | flip=False, 87 | transforms=[ 88 | dict(type='Resize', keep_ratio=True), 89 | dict(type='RandomFlip'), 90 | dict(type='Normalize', **img_norm_cfg), 91 | dict(type='Pad', size=(img_size, img_size)), 92 | dict(type='ImageToTensor', keys=['img']), 93 | dict(type='Collect', keys=['img']), 94 | ]) 95 | ] 96 | data = dict( 97 | imgs_per_gpu=8, 98 | workers_per_gpu=4, 99 | train=dict( 100 | type=dataset_type, 101 | ann_file=data_root + 'annotations/instances_train2017.json', 102 | img_prefix=data_root + 'train2017/', 103 | pipeline=train_pipeline), 104 | val=dict( 105 | type=dataset_type, 106 | ann_file=data_root + 'annotations/instances_val2017.json', 107 | img_prefix=data_root + 'val2017/', 108 | pipeline=test_pipeline), 109 | test=dict( 110 | type=dataset_type, 111 | ann_file=data_root + 'annotations/instances_val2017.json', 112 | img_prefix=data_root + 'val2017/', 113 | pipeline=test_pipeline)) 114 | evaluation = dict(interval=1, metric='bbox') 115 | # optimizer 116 | optimizer = dict( 117 | type='SGD', 118 | lr=0.08, 119 | momentum=0.9, 120 | weight_decay=4e-5, 121 | paramwise_options=dict(norm_decay_mult=0)) 122 | optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2)) 123 | # learning policy 124 | lr_config = dict( 125 | policy='cosine', 126 | warmup='linear', 127 | warmup_iters=2000, 128 | warmup_ratio=0.008) 129 | checkpoint_config = dict(interval=10) 130 | # yapf:disable 131 | log_config = dict( 132 
| interval=50, 133 | hooks=[ 134 | dict(type='TextLoggerHook'), 135 | # dict(type='TensorboardLoggerHook') 136 | ]) 137 | # yapf:enable 138 | # runtime settings 139 | total_epochs = 300 140 | dist_params = dict(backend='nccl') 141 | log_level = 'INFO' 142 | work_dir = './efficientdet_d4' 143 | load_from = None 144 | resume_from = None 145 | workflow = [('train', 1)] 146 | -------------------------------------------------------------------------------- /configs/efficientdet/efficientdet_d0_4gpu.py: -------------------------------------------------------------------------------- 1 | cudnn_benchmark = True 2 | # model settings 3 | norm_cfg = dict(type='SyncBN', momentum=0.01, eps=1e-3, requires_grad=True) # using SyncBN during training 4 | model = dict( 5 | type='RetinaNet', 6 | pretrained='pretrained/adv-efficientnet-b0-b64d5a18.pth', 7 | backbone=dict( 8 | type='EfficientNet', 9 | arch='efficientnet-b0', 10 | out_indices=[4, 6, 8], 11 | norm_cfg=norm_cfg, 12 | norm_eval=False), 13 | neck=dict( 14 | type='BiFPN', 15 | in_channels=[40, 112, 320], 16 | target_size_list=[64, 32, 16, 8, 4], 17 | out_channels=64, 18 | stack=3, 19 | start_level=0, 20 | norm_cfg=norm_cfg, 21 | num_outs=5), 22 | bbox_head=dict( 23 | type='RetinaSepConvHead', 24 | num_classes=81, 25 | num_ins=5, 26 | in_channels=64, 27 | stacked_convs=3, 28 | feat_channels=64, 29 | octave_base_scale=4, 30 | scales_per_octave=3, 31 | anchor_ratios=[0.5, 1.0, 2.0], 32 | anchor_strides=[8, 16, 32, 64, 128], 33 | target_means=[.0, .0, .0, .0], 34 | target_stds=[1.0, 1.0, 1.0, 1.0], 35 | norm_cfg=norm_cfg, 36 | loss_cls=dict( 37 | type='FocalLoss', 38 | use_sigmoid=True, 39 | gamma=1.5, 40 | alpha=0.25, 41 | loss_weight=1.0), 42 | loss_bbox=dict(type='SmoothL1Loss', beta=0.11, loss_weight=1.0))) 43 | # training and testing settings 44 | train_cfg = dict( 45 | assigner=dict( 46 | type='MaxIoUAssigner', 47 | pos_iou_thr=0.5, 48 | neg_iou_thr=0.5, 49 | min_pos_iou=0, 50 | ignore_iof_thr=-1), 51 | allowed_border=-1, 52 | pos_weight=-1, 53 | debug=False) 54 | test_cfg = dict( 55 | nms_pre=1000, 56 | min_bbox_size=0, 57 | score_thr=0.05, 58 | nms=dict(type='nms', iou_thr=0.5), 59 | max_per_img=100) 60 | # dataset settings 61 | dataset_type = 'CocoDataset' 62 | data_root = 'data/coco/' 63 | img_norm_cfg = dict( 64 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 65 | img_size = 512 66 | train_pipeline = [ 67 | dict(type='LoadImageFromFile'), 68 | dict(type='LoadAnnotations', with_bbox=True), 69 | dict( 70 | type='Resize', 71 | img_scale=(img_size, img_size), 72 | ratio_range=(0.1, 2.0), 73 | keep_ratio=True), 74 | dict(type='RandomCrop', crop_size=(img_size, img_size)), 75 | dict(type='RandomFlip', flip_ratio=0.5), 76 | dict(type='Normalize', **img_norm_cfg), 77 | dict(type='Pad', size=(img_size, img_size)), 78 | dict(type='DefaultFormatBundle'), 79 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), 80 | ] 81 | test_pipeline = [ 82 | dict(type='LoadImageFromFile'), 83 | dict( 84 | type='MultiScaleFlipAug', 85 | img_scale=(img_size, img_size), 86 | flip=False, 87 | transforms=[ 88 | dict(type='Resize', keep_ratio=True), 89 | dict(type='RandomFlip'), 90 | dict(type='Normalize', **img_norm_cfg), 91 | dict(type='Pad', size=(img_size, img_size)), 92 | dict(type='ImageToTensor', keys=['img']), 93 | dict(type='Collect', keys=['img']), 94 | ]) 95 | ] 96 | data = dict( 97 | imgs_per_gpu=16, 98 | workers_per_gpu=4, 99 | train=dict( 100 | type=dataset_type, 101 | ann_file=data_root + 
'annotations/instances_train2017.json', 102 | img_prefix=data_root + 'train2017/', 103 | pipeline=train_pipeline), 104 | val=dict( 105 | type=dataset_type, 106 | ann_file=data_root + 'annotations/instances_val2017.json', 107 | img_prefix=data_root + 'val2017/', 108 | pipeline=test_pipeline), 109 | test=dict( 110 | type=dataset_type, 111 | ann_file=data_root + 'annotations/instances_val2017.json', 112 | img_prefix=data_root + 'val2017/', 113 | pipeline=test_pipeline)) 114 | evaluation = dict(interval=1, metric='bbox') 115 | # optimizer 116 | optimizer = dict( 117 | type='SGD', 118 | lr=0.08, 119 | momentum=0.9, 120 | weight_decay=4e-5, 121 | paramwise_options=dict(norm_decay_mult=0)) 122 | optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2)) 123 | # learning policy 124 | lr_config = dict( 125 | policy='cosine', 126 | warmup='linear', 127 | warmup_iters=2000, 128 | warmup_ratio=0.008) 129 | checkpoint_config = dict(interval=10) 130 | # yapf:disable 131 | log_config = dict( 132 | interval=50, 133 | hooks=[ 134 | dict(type='TextLoggerHook'), 135 | # dict(type='TensorboardLoggerHook') 136 | ]) 137 | # yapf:enable 138 | # runtime settings 139 | total_epochs = 300 140 | dist_params = dict(backend='nccl') 141 | log_level = 'INFO' 142 | work_dir = './efficientdet_d0' 143 | load_from = None 144 | resume_from = None 145 | workflow = [('train', 1)] 146 | -------------------------------------------------------------------------------- /configs/efficientdet/efficientdet_d5_8gpu.py: -------------------------------------------------------------------------------- 1 | cudnn_benchmark = True 2 | # model settings 3 | norm_cfg = dict(type='BN', momentum=0.01, eps=1e-3, requires_grad=True) # using SyncBN during training 4 | model = dict( 5 | type='RetinaNet', 6 | pretrained='pretrained/efficientnet-b5-b6417697.pth', 7 | backbone=dict( 8 | type='EfficientNet', 9 | arch='efficientnet-b5', 10 | out_indices=[4, 6, 8], 11 | norm_cfg=norm_cfg, 12 | norm_eval=False), 13 | neck=dict( 14 | type='BiFPN', 15 | in_channels=[64, 176, 512], 16 | target_size_list=[160, 80, 40, 20, 10], 17 | out_channels=288, 18 | stack=7, 19 | start_level=0, 20 | norm_cfg=norm_cfg, 21 | num_outs=5), 22 | bbox_head=dict( 23 | type='RetinaSepConvHead', 24 | num_classes=81, 25 | num_ins=5, 26 | in_channels=288, 27 | stacked_convs=4, 28 | feat_channels=288, 29 | octave_base_scale=4, 30 | scales_per_octave=3, 31 | anchor_ratios=[0.5, 1.0, 2.0], 32 | anchor_strides=[8, 16, 32, 64, 128], 33 | target_means=[.0, .0, .0, .0], 34 | target_stds=[1.0, 1.0, 1.0, 1.0], 35 | norm_cfg=norm_cfg, 36 | loss_cls=dict( 37 | type='FocalLoss', 38 | use_sigmoid=True, 39 | gamma=1.5, 40 | alpha=0.25, 41 | loss_weight=1.0), 42 | loss_bbox=dict(type='SmoothL1Loss', beta=0.11, loss_weight=1.0))) 43 | # training and testing settings 44 | train_cfg = dict( 45 | assigner=dict( 46 | type='MaxIoUAssigner', 47 | pos_iou_thr=0.5, 48 | neg_iou_thr=0.5, 49 | min_pos_iou=0, 50 | ignore_iof_thr=-1), 51 | allowed_border=-1, 52 | pos_weight=-1, 53 | debug=False) 54 | test_cfg = dict( 55 | nms_pre=1000, 56 | min_bbox_size=0, 57 | score_thr=0.05, 58 | nms=dict(type='nms', iou_thr=0.5), 59 | max_per_img=100) 60 | # dataset settings 61 | dataset_type = 'CocoDataset' 62 | data_root = 'data/coco/' 63 | img_norm_cfg = dict( 64 | mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) 65 | img_size = 1280 66 | train_pipeline = [ 67 | dict(type='LoadImageFromFile'), 68 | dict(type='LoadAnnotations', with_bbox=True), 69 | dict( 70 | type='Resize', 71 | 
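        # scale jittering: the image is resized toward img_scale and then
        # randomly rescaled by a factor in ratio_range before RandomCrop below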
img_scale=(img_size, img_size), 72 | ratio_range=(0.1, 2.0), 73 | keep_ratio=True), 74 | dict(type='RandomCrop', crop_size=(img_size, img_size)), 75 | dict(type='RandomFlip', flip_ratio=0.5), 76 | dict(type='Normalize', **img_norm_cfg), 77 | dict(type='Pad', size=(img_size, img_size)), 78 | dict(type='DefaultFormatBundle'), 79 | dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']), 80 | ] 81 | test_pipeline = [ 82 | dict(type='LoadImageFromFile'), 83 | dict( 84 | type='MultiScaleFlipAug', 85 | img_scale=(img_size, img_size), 86 | flip=False, 87 | transforms=[ 88 | dict(type='Resize', keep_ratio=True), 89 | dict(type='RandomFlip'), 90 | dict(type='Normalize', **img_norm_cfg), 91 | dict(type='Pad', size=(img_size, img_size)), 92 | dict(type='ImageToTensor', keys=['img']), 93 | dict(type='Collect', keys=['img']), 94 | ]) 95 | ] 96 | data = dict( 97 | imgs_per_gpu=8, 98 | workers_per_gpu=4, 99 | train=dict( 100 | type=dataset_type, 101 | ann_file=data_root + 'annotations/instances_train2017.json', 102 | img_prefix=data_root + 'train2017/', 103 | pipeline=train_pipeline), 104 | val=dict( 105 | type=dataset_type, 106 | ann_file=data_root + 'annotations/instances_val2017.json', 107 | img_prefix=data_root + 'val2017/', 108 | pipeline=test_pipeline), 109 | test=dict( 110 | type=dataset_type, 111 | ann_file=data_root + 'annotations/instances_val2017.json', 112 | img_prefix=data_root + 'val2017/', 113 | pipeline=test_pipeline)) 114 | evaluation = dict(interval=1, metric='bbox') 115 | # optimizer 116 | optimizer = dict( 117 | type='SGD', 118 | lr=0.08, 119 | momentum=0.9, 120 | weight_decay=4e-5, 121 | paramwise_options=dict(norm_decay_mult=0)) 122 | optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2)) 123 | # learning policy 124 | lr_config = dict( 125 | policy='cosine', 126 | warmup='linear', 127 | warmup_iters=2000, 128 | warmup_ratio=0.008) 129 | checkpoint_config = dict(interval=10) 130 | # yapf:disable 131 | log_config = dict( 132 | interval=50, 133 | hooks=[ 134 | dict(type='TextLoggerHook'), 135 | # dict(type='TensorboardLoggerHook') 136 | ]) 137 | # yapf:enable 138 | # runtime settings 139 | total_epochs = 300 140 | dist_params = dict(backend='nccl') 141 | log_level = 'INFO' 142 | work_dir = './efficientdet_d5' 143 | load_from = None 144 | resume_from = None 145 | workflow = [('train', 1)] 146 | -------------------------------------------------------------------------------- /mmdet/models/anchor_heads/retina_sepconv_head.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch.nn as nn 3 | from mmcv.cnn import kaiming_init 4 | 5 | from ..registry import HEADS 6 | from ..utils import bias_init_with_prob, SeparableConv2d 7 | from .anchor_head import AnchorHead 8 | 9 | 10 | @HEADS.register_module 11 | class RetinaSepConvHead(AnchorHead): 12 | """"RetinaHead with separate BN and separable conv. 13 | 14 | In RetinaHead, conv/norm layers are shared across different FPN levels, 15 | while in RetinaSepBNHead, conv layers are shared across different FPN 16 | levels, but BN layers are separated. 17 | 18 | In EfficientDet, using separable conv as conv module. 
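    Depthwise and pointwise conv weights are shared across FPN levels (they
    are tied together in _init_layers), while each level keeps its own BN
    statistics.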
19 | """ 20 | 21 | def __init__(self, 22 | num_classes, 23 | num_ins, 24 | in_channels, 25 | stacked_convs=4, 26 | octave_base_scale=4, 27 | scales_per_octave=3, 28 | conv_cfg=None, 29 | norm_cfg=None, 30 | **kwargs): 31 | self.stacked_convs = stacked_convs 32 | self.octave_base_scale = octave_base_scale 33 | self.scales_per_octave = scales_per_octave 34 | self.conv_cfg = conv_cfg 35 | self.norm_cfg = norm_cfg 36 | self.num_ins = num_ins 37 | octave_scales = np.array( 38 | [2 ** (i / scales_per_octave) for i in range(scales_per_octave)]) 39 | anchor_scales = octave_scales * octave_base_scale 40 | super(RetinaSepConvHead, self).__init__( 41 | num_classes, in_channels, anchor_scales=anchor_scales, **kwargs) 42 | 43 | def _init_layers(self): 44 | self.relu = nn.ReLU(inplace=True) 45 | self.cls_convs = nn.ModuleList() 46 | self.reg_convs = nn.ModuleList() 47 | for i in range(self.num_ins): 48 | cls_convs = nn.ModuleList() 49 | reg_convs = nn.ModuleList() 50 | for i in range(self.stacked_convs): 51 | chn = self.in_channels if i == 0 else self.feat_channels 52 | cls_convs.append( 53 | SeparableConv2d( 54 | chn, 55 | self.feat_channels, 56 | 3, 57 | stride=1, 58 | padding=1, 59 | activation="Swish", 60 | bias=True, 61 | norm_cfg=self.norm_cfg)) 62 | reg_convs.append( 63 | SeparableConv2d( 64 | chn, 65 | self.feat_channels, 66 | 3, 67 | stride=1, 68 | padding=1, 69 | activation="Swish", 70 | bias=True, 71 | norm_cfg=self.norm_cfg)) 72 | self.cls_convs.append(cls_convs) 73 | self.reg_convs.append(reg_convs) 74 | for i in range(self.stacked_convs): 75 | for j in range(1, self.num_ins): 76 | self.cls_convs[j][i].depthwise = self.cls_convs[0][i].depthwise 77 | self.cls_convs[j][i].pointwise.conv = self.cls_convs[0][i].pointwise.conv 78 | self.reg_convs[j][i].depthwise = self.reg_convs[0][i].depthwise 79 | self.reg_convs[j][i].pointwise.conv = self.reg_convs[0][i].pointwise.conv 80 | self.retina_cls = SeparableConv2d( 81 | self.feat_channels, 82 | self.num_anchors * self.cls_out_channels, 83 | 3, 84 | padding=1, 85 | bias=True, 86 | norm_cfg=None) 87 | self.retina_reg = SeparableConv2d( 88 | self.feat_channels, self.num_anchors * 4, 3, padding=1, bias=True, norm_cfg=None) 89 | 90 | def init_weights(self): 91 | for m in self.cls_convs[0]: 92 | kaiming_init(m.depthwise, mode='fan_in') 93 | kaiming_init(m.pointwise.conv, mode='fan_in') 94 | for m in self.reg_convs[0]: 95 | kaiming_init(m.depthwise, mode='fan_in') 96 | kaiming_init(m.pointwise.conv, mode='fan_in') 97 | bias_cls = bias_init_with_prob(0.01) 98 | kaiming_init(self.retina_cls.depthwise, mode='fan_in') 99 | kaiming_init(self.retina_cls.pointwise.conv, mode='fan_in', bias=bias_cls) 100 | kaiming_init(self.retina_reg.depthwise, mode='fan_in') 101 | kaiming_init(self.retina_reg.pointwise.conv, mode='fan_in') 102 | 103 | def forward(self, feats): 104 | cls_scores = [] 105 | bbox_preds = [] 106 | for i, x in enumerate(feats): 107 | cls_feat = feats[i] 108 | reg_feat = feats[i] 109 | for cls_conv in self.cls_convs[i]: 110 | cls_feat = cls_conv(cls_feat) 111 | for reg_conv in self.reg_convs[i]: 112 | reg_feat = reg_conv(reg_feat) 113 | cls_score = self.retina_cls(cls_feat) 114 | bbox_pred = self.retina_reg(reg_feat) 115 | cls_scores.append(cls_score) 116 | bbox_preds.append(bbox_pred) 117 | return cls_scores, bbox_preds 118 | -------------------------------------------------------------------------------- /mmdet/models/necks/bifpn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn 
as nn 3 | import torch.nn.functional as F 4 | from mmcv.cnn import xavier_init 5 | 6 | from mmdet.core import auto_fp16 7 | from ..registry import NECKS 8 | from mmdet.ops import ConvModule 9 | from ..utils import SeparableConv2d, MemoryEfficientSwish 10 | 11 | 12 | class WeightedMerge(nn.Module): 13 | def __init__(self, in_channels, out_channels, target_size, norm_cfg, apply_bn=False, eps=0.0001): 14 | super(WeightedMerge, self).__init__() 15 | self.conv = SeparableConv2d(out_channels, out_channels, 3, padding=1, norm_cfg=norm_cfg, bias=True) 16 | self.eps = eps 17 | self.num_ins = len(in_channels) 18 | self.weight = nn.Parameter(torch.Tensor(self.num_ins).fill_(1)) 19 | self.relu = nn.ReLU(inplace=False) 20 | self.swish = MemoryEfficientSwish() 21 | self.resample_ops = nn.ModuleList() 22 | for in_c in in_channels: 23 | self.resample_ops.append(Resample(in_c, out_channels, target_size, norm_cfg, apply_bn)) 24 | 25 | def forward(self, inputs): 26 | assert isinstance(inputs, list) 27 | assert len(inputs) == self.num_ins 28 | w = self.relu(self.weight) 29 | w /= (w.sum() + self.eps) 30 | x = 0 31 | for i in range(self.num_ins): 32 | x += w[i] * self.resample_ops[i](inputs[i]) 33 | output = self.conv(self.swish(x)) 34 | return output 35 | 36 | 37 | class Resample(nn.Module): 38 | def __init__(self, in_channels, out_channels, target_size, norm_cfg, apply_bn=False): 39 | super(Resample, self).__init__() 40 | self.target_size = torch.Size([target_size, target_size]) 41 | self.is_conv = in_channels != out_channels 42 | if self.is_conv: 43 | self.conv = ConvModule(in_channels, 44 | out_channels, 45 | 1, 46 | norm_cfg=norm_cfg if apply_bn else None, 47 | bias=True, 48 | act_cfg=None, 49 | inplace=False) 50 | 51 | def _resize(self, x, size): 52 | if x.shape[-2:] == size: 53 | return x 54 | elif x.shape[-2:] < size: 55 | return F.interpolate(x, size=size, mode='nearest') 56 | else: 57 | assert x.shape[-2] % size[-2] == 0 and x.shape[-1] % size[-1] == 0 58 | kernel_size = x.shape[-1] // size[-1] 59 | x = F.max_pool2d(x, kernel_size=kernel_size+1, stride=kernel_size, padding=1) 60 | return x 61 | 62 | def forward(self, inputs): 63 | if self.is_conv: 64 | inputs = self.conv(inputs) 65 | return self._resize(inputs, self.target_size) 66 | 67 | 68 | class bifpn_layer(nn.Module): 69 | def __init__(self, 70 | in_channels, 71 | out_channels, 72 | target_size_list, 73 | num_outs=5, 74 | conv_cfg=None, 75 | norm_cfg=None, 76 | activation=None): 77 | super(bifpn_layer, self).__init__() 78 | assert num_outs >= 2 79 | self.out_channels = out_channels 80 | self.conv_cfg = conv_cfg 81 | self.norm_cfg = norm_cfg 82 | self.activation = activation 83 | self.num_outs = num_outs 84 | 85 | self.top_down_merge = nn.ModuleList() 86 | for i in range(self.num_outs - 1, 0, -1): 87 | in_channels_list = [out_channels, in_channels[i-1]] if i < self.num_outs - 1 else [in_channels[i], in_channels[i-1]] 88 | merge_op = WeightedMerge(in_channels_list, out_channels, target_size_list[i-1], norm_cfg, apply_bn=True) 89 | self.top_down_merge.append(merge_op) 90 | 91 | self.bottom_up_merge = nn.ModuleList() 92 | for i in range(0, self.num_outs - 1): 93 | in_channels_list = [out_channels, in_channels[i+1], out_channels] if i < self.num_outs - 2 else [in_channels[-1], out_channels] 94 | merge_op = WeightedMerge(in_channels_list, out_channels, target_size_list[i+1], norm_cfg, apply_bn=True) 95 | self.bottom_up_merge.append(merge_op) 96 | 97 | def forward(self, inputs): 98 | assert len(inputs) == self.num_outs 99 | 100 | # top down merge 101 | 
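        # Walk from the coarsest level to the finest: each step resamples the
        # previously merged map to the next finer resolution and fuses it with
        # the corresponding input via a normalized weighted sum (WeightedMerge)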
md_x = [] 102 | for i in range(self.num_outs - 1, 0, -1): 103 | inputs_list = [md_x[-1], inputs[i-1]] if i < self.num_outs - 1 else [inputs[i], inputs[i-1]] 104 | x = self.top_down_merge[self.num_outs-i-1](inputs_list) 105 | md_x.append(x) 106 | 107 | # bottom up merge 108 | outputs = md_x[::-1] 109 | for i in range(1, self.num_outs - 1): 110 | outputs[i] = self.bottom_up_merge[i-1]([outputs[i], inputs[i], outputs[i-1]]) 111 | outputs.append(self.bottom_up_merge[-1]([inputs[-1], outputs[-1]])) 112 | return outputs 113 | 114 | 115 | @NECKS.register_module 116 | class BiFPN(nn.Module): 117 | def __init__(self, 118 | in_channels, 119 | out_channels, 120 | target_size_list, 121 | num_outs, 122 | start_level=0, 123 | end_level=-1, 124 | stack=1, 125 | conv_cfg=None, 126 | norm_cfg=dict(type='BN', momentum=0.003, eps=1e-4, requires_grad=True)): 127 | super(BiFPN, self).__init__() 128 | assert len(in_channels) >= 3 129 | self.in_channels = in_channels 130 | self.out_channels = out_channels 131 | self.num_ins = len(in_channels) 132 | self.stack = stack 133 | self.num_outs = num_outs 134 | self.fp16_enabled = False 135 | 136 | if end_level == -1: 137 | self.backbone_end_level = self.num_ins 138 | assert num_outs >= self.num_ins - start_level 139 | else: 140 | # if end_level < inputs, no extra level is allowed 141 | self.backbone_end_level = end_level 142 | assert end_level <= len(in_channels) 143 | assert num_outs == end_level - start_level 144 | self.start_level = start_level 145 | self.end_level = end_level 146 | 147 | # add extra feature layers using resampling 148 | self.extra_ops = nn.ModuleList() 149 | for i in range(self.backbone_end_level, self.num_outs): 150 | in_c = in_channels[-1] 151 | self.extra_ops.append( 152 | Resample(in_c, out_channels, target_size_list[i] , norm_cfg, apply_bn=True) 153 | ) 154 | in_channels.append(out_channels) 155 | 156 | self.stack_bifpns = nn.ModuleList() 157 | for _ in range(stack): 158 | self.stack_bifpns.append( 159 | bifpn_layer(in_channels, 160 | out_channels, 161 | target_size_list, 162 | num_outs=self.num_outs, 163 | conv_cfg=conv_cfg, 164 | norm_cfg=norm_cfg)) 165 | in_channels = [out_channels] * self.num_outs 166 | 167 | def init_weights(self): 168 | for m in self.modules(): 169 | if isinstance(m, SeparableConv2d): 170 | m.init_weights() 171 | 172 | @auto_fp16() 173 | def forward(self, inputs): 174 | outs = list(inputs) 175 | for _, extra_op in enumerate(self.extra_ops): 176 | outs.append(extra_op(outs[-1])) 177 | 178 | for _, stack_bifpn in enumerate(self.stack_bifpns): 179 | outs = stack_bifpn(outs) 180 | 181 | return tuple(outs[:self.num_outs]) 182 | 183 | -------------------------------------------------------------------------------- /mmdet/models/backbones/efficientnet.py: -------------------------------------------------------------------------------- 1 | # This EfficientNet code is copy from https://github.com/lukemelas/EfficientNet-PyTorch 2 | # Thanks to the authors of this repository! 
3 | # ===================================================================================== 4 | 5 | import logging 6 | import math 7 | import collections 8 | import re 9 | 10 | import torch 11 | import torch.nn as nn 12 | from torch.nn import functional as F 13 | from torch.utils import model_zoo 14 | 15 | from functools import partial 16 | 17 | from mmcv.runner import load_checkpoint 18 | from ..utils import MemoryEfficientSwish, Swish 19 | from mmdet.ops import build_norm_layer 20 | 21 | from ..registry import BACKBONES 22 | 23 | 24 | def round_filters(filters, global_params): 25 | """ Calculate and round number of filters based on depth multiplier. """ 26 | multiplier = global_params.width_coefficient 27 | if not multiplier: 28 | return filters 29 | divisor = global_params.depth_divisor 30 | min_depth = global_params.min_depth 31 | filters *= multiplier 32 | min_depth = min_depth or divisor 33 | new_filters = max(min_depth, int(filters + divisor / 2) // divisor * divisor) 34 | if new_filters < 0.9 * filters: # prevent rounding by more than 10% 35 | new_filters += divisor 36 | return int(new_filters) 37 | 38 | 39 | def round_repeats(repeats, global_params): 40 | """ Round number of filters based on depth multiplier. """ 41 | multiplier = global_params.depth_coefficient 42 | if not multiplier: 43 | return repeats 44 | return int(math.ceil(multiplier * repeats)) 45 | 46 | def drop_connect(inputs, p, training): 47 | """ Drop connect. """ 48 | if not training: return inputs 49 | batch_size = inputs.shape[0] 50 | keep_prob = 1 - p 51 | random_tensor = keep_prob 52 | random_tensor += torch.rand([batch_size, 1, 1, 1], dtype=inputs.dtype, device=inputs.device) 53 | binary_tensor = torch.floor(random_tensor) 54 | output = inputs / keep_prob * binary_tensor 55 | return output 56 | 57 | def get_same_padding_conv2d(image_size=None): 58 | """ Chooses static padding if you have specified an image size, and dynamic padding otherwise. 59 | Static padding is necessary for ONNX exporting of models. 
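        TensorFlow 'SAME' padding pads each spatial dim by a total of
        max((ceil(i / s) - 1) * s + (k - 1) * d + 1 - i, 0), so the output size
        is ceil(input / stride); the pad is split as evenly as possible.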
""" 60 | if image_size is None: 61 | return Conv2dDynamicSamePadding 62 | else: 63 | return partial(Conv2dStaticSamePadding, image_size=image_size) 64 | 65 | class Conv2dDynamicSamePadding(nn.Conv2d): 66 | """ 2D Convolutions like TensorFlow, for a dynamic image size """ 67 | 68 | def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, groups=1, bias=True): 69 | super().__init__(in_channels, out_channels, kernel_size, stride, 0, dilation, groups, bias) 70 | self.stride = self.stride if len(self.stride) == 2 else [self.stride[0]] * 2 71 | 72 | def forward(self, x): 73 | ih, iw = x.size()[-2:] 74 | kh, kw = self.weight.size()[-2:] 75 | sh, sw = self.stride 76 | oh, ow = math.ceil(ih / sh), math.ceil(iw / sw) 77 | pad_h = max((oh - 1) * self.stride[0] + (kh - 1) * self.dilation[0] + 1 - ih, 0) 78 | pad_w = max((ow - 1) * self.stride[1] + (kw - 1) * self.dilation[1] + 1 - iw, 0) 79 | if pad_h > 0 or pad_w > 0: 80 | x = F.pad(x, [pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2]) 81 | return F.conv2d(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups) 82 | 83 | class Conv2dStaticSamePadding(nn.Conv2d): 84 | """ 2D Convolutions like TensorFlow, for a fixed image size""" 85 | 86 | def __init__(self, in_channels, out_channels, kernel_size, image_size=None, **kwargs): 87 | super().__init__(in_channels, out_channels, kernel_size, **kwargs) 88 | self.stride = self.stride if len(self.stride) == 2 else [self.stride[0]] * 2 89 | 90 | # Calculate padding based on image size and save it 91 | assert image_size is not None 92 | ih, iw = image_size if type(image_size) == list else [image_size, image_size] 93 | kh, kw = self.weight.size()[-2:] 94 | sh, sw = self.stride 95 | oh, ow = math.ceil(ih / sh), math.ceil(iw / sw) 96 | pad_h = max((oh - 1) * self.stride[0] + (kh - 1) * self.dilation[0] + 1 - ih, 0) 97 | pad_w = max((ow - 1) * self.stride[1] + (kw - 1) * self.dilation[1] + 1 - iw, 0) 98 | if pad_h > 0 or pad_w > 0: 99 | self.static_padding = nn.ZeroPad2d((pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2)) 100 | else: 101 | self.static_padding = Identity() 102 | 103 | def forward(self, x): 104 | x = self.static_padding(x) 105 | x = F.conv2d(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups) 106 | return x 107 | 108 | class Identity(nn.Module): 109 | def __init__(self, ): 110 | super(Identity, self).__init__() 111 | 112 | def forward(self, input): 113 | return input 114 | 115 | 116 | class MBConvBlock(nn.Module): 117 | """ 118 | Mobile Inverted Residual Bottleneck Block 119 | 120 | Args: 121 | block_args (namedtuple): BlockArgs, see above 122 | global_params (namedtuple): GlobalParam, see above 123 | 124 | Attributes: 125 | has_se (bool): Whether the block contains a Squeeze and Excitation layer. 
126 | """ 127 | 128 | def __init__(self, block_args, global_params, norm_cfg): 129 | super().__init__() 130 | self._block_args = block_args 131 | self._bn_mom = 1 - global_params.batch_norm_momentum 132 | self._bn_eps = global_params.batch_norm_epsilon 133 | self.has_se = (self._block_args.se_ratio is not None) and (0 < self._block_args.se_ratio <= 1) 134 | self.id_skip = block_args.id_skip # skip connection and drop connect 135 | 136 | # Get static or dynamic convolution depending on image size 137 | Conv2d = get_same_padding_conv2d(image_size=global_params.image_size) 138 | 139 | # Expansion phase 140 | inp = self._block_args.input_filters # number of input channels 141 | oup = self._block_args.input_filters * self._block_args.expand_ratio # number of output channels 142 | if self._block_args.expand_ratio != 1: 143 | self._expand_conv = Conv2d(in_channels=inp, out_channels=oup, kernel_size=1, bias=False) 144 | self._bn0 = build_norm_layer(norm_cfg, num_features=oup, postfix=0)[1] 145 | 146 | # Depthwise convolution phase 147 | k = self._block_args.kernel_size 148 | s = self._block_args.stride 149 | self._depthwise_conv = Conv2d( 150 | in_channels=oup, out_channels=oup, groups=oup, # groups makes it depthwise 151 | kernel_size=k, stride=s, bias=False) 152 | self._bn1 = build_norm_layer(norm_cfg, num_features=oup, postfix=1)[1] 153 | 154 | # Squeeze and Excitation layer, if desired 155 | if self.has_se: 156 | num_squeezed_channels = max(1, int(self._block_args.input_filters * self._block_args.se_ratio)) 157 | self._se_reduce = Conv2d(in_channels=oup, out_channels=num_squeezed_channels, kernel_size=1) 158 | self._se_expand = Conv2d(in_channels=num_squeezed_channels, out_channels=oup, kernel_size=1) 159 | 160 | # Output phase 161 | final_oup = self._block_args.output_filters 162 | self._project_conv = Conv2d(in_channels=oup, out_channels=final_oup, kernel_size=1, bias=False) 163 | self._bn2 = build_norm_layer(norm_cfg, num_features=final_oup, postfix=2)[1] 164 | self._swish = MemoryEfficientSwish() 165 | 166 | def forward(self, inputs, drop_connect_rate=None): 167 | """ 168 | :param inputs: input tensor 169 | :param drop_connect_rate: drop connect rate (float, between 0 and 1) 170 | :return: output of block 171 | """ 172 | 173 | # Expansion and Depthwise Convolution 174 | x = inputs 175 | if self._block_args.expand_ratio != 1: 176 | x = self._swish(self._bn0(self._expand_conv(inputs))) 177 | x = self._swish(self._bn1(self._depthwise_conv(x))) 178 | 179 | # Squeeze and Excitation 180 | if self.has_se: 181 | x_squeezed = F.adaptive_avg_pool2d(x, 1) 182 | x_squeezed = self._se_expand(self._swish(self._se_reduce(x_squeezed))) 183 | x = torch.sigmoid(x_squeezed) * x 184 | 185 | x = self._bn2(self._project_conv(x)) 186 | 187 | # Skip connection and drop connect 188 | input_filters, output_filters = self._block_args.input_filters, self._block_args.output_filters 189 | if self.id_skip and self._block_args.stride == 1 and input_filters == output_filters: 190 | if drop_connect_rate: 191 | x = drop_connect(x, p=drop_connect_rate, training=self.training) 192 | x = x + inputs # skip connection 193 | return x 194 | 195 | def set_swish(self, memory_efficient=True): 196 | """Sets swish function as memory efficient (for training) or standard (for export)""" 197 | self._swish = MemoryEfficientSwish() if memory_efficient else Swish() 198 | 199 | 200 | def efficientnet_params(model_name): 201 | """ Map EfficientNet model name to parameter coefficients. 
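    Returns a (width_coefficient, depth_coefficient, resolution, dropout_rate)
    tuple for the given variant.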
""" 202 | params_dict = { 203 | # Coefficients: width,depth,res,dropout 204 | 'efficientnet-b0': (1.0, 1.0, 224, 0.2), 205 | 'efficientnet-b1': (1.0, 1.1, 240, 0.2), 206 | 'efficientnet-b2': (1.1, 1.2, 260, 0.3), 207 | 'efficientnet-b3': (1.2, 1.4, 300, 0.3), 208 | 'efficientnet-b4': (1.4, 1.8, 380, 0.4), 209 | 'efficientnet-b5': (1.6, 2.2, 456, 0.4), 210 | 'efficientnet-b6': (1.8, 2.6, 528, 0.5), 211 | 'efficientnet-b7': (2.0, 3.1, 600, 0.5), 212 | } 213 | return params_dict[model_name] 214 | 215 | class BlockDecoder(object): 216 | """ Block Decoder for readability, straight from the official TensorFlow repository """ 217 | 218 | @staticmethod 219 | def _decode_block_string(block_string): 220 | """ Gets a block through a string notation of arguments. """ 221 | assert isinstance(block_string, str) 222 | 223 | ops = block_string.split('_') 224 | options = {} 225 | for op in ops: 226 | splits = re.split(r'(\d.*)', op) 227 | if len(splits) >= 2: 228 | key, value = splits[:2] 229 | options[key] = value 230 | 231 | # Check stride 232 | assert (('s' in options and len(options['s']) == 1) or 233 | (len(options['s']) == 2 and options['s'][0] == options['s'][1])) 234 | 235 | return BlockArgs( 236 | kernel_size=int(options['k']), 237 | num_repeat=int(options['r']), 238 | input_filters=int(options['i']), 239 | output_filters=int(options['o']), 240 | expand_ratio=int(options['e']), 241 | id_skip=('noskip' not in block_string), 242 | se_ratio=float(options['se']) if 'se' in options else None, 243 | stride=[int(options['s'][0])]) 244 | 245 | @staticmethod 246 | def _encode_block_string(block): 247 | """Encodes a block to a string.""" 248 | args = [ 249 | 'r%d' % block.num_repeat, 250 | 'k%d' % block.kernel_size, 251 | 's%d%d' % (block.strides[0], block.strides[1]), 252 | 'e%s' % block.expand_ratio, 253 | 'i%d' % block.input_filters, 254 | 'o%d' % block.output_filters 255 | ] 256 | if 0 < block.se_ratio <= 1: 257 | args.append('se%s' % block.se_ratio) 258 | if block.id_skip is False: 259 | args.append('noskip') 260 | return '_'.join(args) 261 | 262 | @staticmethod 263 | def decode(string_list): 264 | """ 265 | Decodes a list of string notations to specify blocks inside the network. 266 | 267 | :param string_list: a list of strings, each string is a notation of block 268 | :return: a list of BlockArgs namedtuples of block args 269 | """ 270 | assert isinstance(string_list, list) 271 | blocks_args = [] 272 | for block_string in string_list: 273 | blocks_args.append(BlockDecoder._decode_block_string(block_string)) 274 | return blocks_args 275 | 276 | @staticmethod 277 | def encode(blocks_args): 278 | """ 279 | Encodes a list of BlockArgs to a list of strings. 280 | 281 | :param blocks_args: a list of BlockArgs namedtuples of block args 282 | :return: a list of strings, each string is a notation of block 283 | """ 284 | block_strings = [] 285 | for block in blocks_args: 286 | block_strings.append(BlockDecoder._encode_block_string(block)) 287 | return block_strings 288 | 289 | def efficientnet(width_coefficient=None, depth_coefficient=None, dropout_rate=0.2, 290 | drop_connect_rate=0.2, image_size=None, num_classes=1000): 291 | """ Creates a efficientnet model. 
""" 292 | 293 | blocks_args = [ 294 | 'r1_k3_s11_e1_i32_o16_se0.25', 'r2_k3_s22_e6_i16_o24_se0.25', 295 | 'r2_k5_s22_e6_i24_o40_se0.25', 'r3_k3_s22_e6_i40_o80_se0.25', 296 | 'r3_k5_s11_e6_i80_o112_se0.25', 'r4_k5_s22_e6_i112_o192_se0.25', 297 | 'r1_k3_s11_e6_i192_o320_se0.25', 298 | ] 299 | blocks_args = BlockDecoder.decode(blocks_args) 300 | 301 | global_params = GlobalParams( 302 | batch_norm_momentum=0.99, #0.99 -> 0.997 303 | batch_norm_epsilon=1e-3, #1e-3 -> 1e-4 304 | dropout_rate=dropout_rate, 305 | drop_connect_rate=drop_connect_rate, 306 | # data_format='channels_last', # removed, this is always true in PyTorch 307 | num_classes=num_classes, 308 | width_coefficient=width_coefficient, 309 | depth_coefficient=depth_coefficient, 310 | depth_divisor=8, 311 | min_depth=None, 312 | image_size=image_size, 313 | ) 314 | 315 | return blocks_args, global_params 316 | 317 | 318 | # Parameters for the entire model (stem, all blocks, and head) 319 | GlobalParams = collections.namedtuple('GlobalParams', [ 320 | 'batch_norm_momentum', 'batch_norm_epsilon', 'dropout_rate', 321 | 'num_classes', 'width_coefficient', 'depth_coefficient', 322 | 'depth_divisor', 'min_depth', 'drop_connect_rate', 'image_size']) 323 | 324 | # Parameters for an individual model block 325 | BlockArgs = collections.namedtuple('BlockArgs', [ 326 | 'kernel_size', 'num_repeat', 'input_filters', 'output_filters', 327 | 'expand_ratio', 'id_skip', 'stride', 'se_ratio']) 328 | 329 | # Change namedtuple defaults 330 | GlobalParams.__new__.__defaults__ = (None,) * len(GlobalParams._fields) 331 | BlockArgs.__new__.__defaults__ = (None,) * len(BlockArgs._fields) 332 | 333 | def get_model_params(model_name, override_params): 334 | """ Get the block args and global params for a given model """ 335 | if model_name.startswith('efficientnet'): 336 | w, d, s, p = efficientnet_params(model_name) 337 | # note: all models have drop connect rate = 0.2 338 | blocks_args, global_params = efficientnet( 339 | width_coefficient=w, depth_coefficient=d, dropout_rate=p, image_size=s) 340 | else: 341 | raise NotImplementedError('model name is not pre-defined: %s' % model_name) 342 | if override_params: 343 | # ValueError will be raised here if override_params has fields not included in global_params. 344 | global_params = global_params._replace(**override_params) 345 | return blocks_args, global_params 346 | 347 | ################################################################################################################## 348 | @BACKBONES.register_module 349 | class EfficientNet(nn.Module): 350 | """ 351 | An EfficientNet model. 
352 | 
353 |     Args: 
354 |         arch (str): EfficientNet variant to build, e.g. 'efficientnet-b0' 
355 |         out_indices (tuple): network stages whose feature maps are returned (the stem is stage 1) 
356 | 
357 |     Example: 
358 |         model = EfficientNet.from_pretrained('efficientnet-b0') 
359 | 
360 |     """ 
361 | 
362 |     def __init__(self, 
363 |                  arch='efficientnet-b0', 
364 |                  out_indices=(4, 5, 6, 7, 8),  # tuple rather than a mutable default list 
365 |                  norm_cfg=dict(type="BN"), 
366 |                  norm_eval=True, 
367 |                  override_params=None): 
368 |         super(EfficientNet, self).__init__() 
369 |         self._check_model_name_is_valid(arch) 
370 |         blocks_args, global_params = get_model_params(arch, override_params) 
371 |         assert isinstance(blocks_args, list), 'blocks_args should be a list' 
372 |         assert len(blocks_args) > 0, 'blocks_args must not be empty' 
373 |         self._global_params = global_params 
374 |         self._blocks_args = blocks_args 
375 |         self.out_indices = out_indices 
376 |         self.norm_eval = norm_eval 
377 | 
378 |         # Get static or dynamic convolution depending on image size 
379 |         Conv2d = get_same_padding_conv2d(image_size=global_params.image_size) 
380 | 
381 |         # Batch norm parameters (kept for reference; the layers below are built from norm_cfg) 
382 |         bn_mom = 1 - self._global_params.batch_norm_momentum 
383 |         bn_eps = self._global_params.batch_norm_epsilon 
384 | 
385 |         # Stem 
386 |         in_channels = 3  # rgb 
387 |         out_channels = round_filters(32, self._global_params)  # number of output channels 
388 |         self._conv_stem = Conv2d(in_channels, out_channels, kernel_size=3, stride=2, bias=False) 
389 |         self._bn0 = build_norm_layer(norm_cfg, num_features=out_channels, postfix=0)[1] 
390 | 
391 |         # Build blocks 
392 |         self._blocks = nn.ModuleList([]) 
393 |         self.per_last_stage_idx = [] 
394 |         cum_idx = 0 
395 |         for block_args in self._blocks_args: 
396 |             # Update block input and output filters based on depth multiplier. 
397 |             block_args = block_args._replace( 
398 |                 input_filters=round_filters(block_args.input_filters, self._global_params), 
399 |                 output_filters=round_filters(block_args.output_filters, self._global_params), 
400 |                 num_repeat=round_repeats(block_args.num_repeat, self._global_params) 
401 |             ) 
402 | 
403 |             # The first block needs to handle the stride and the filter size increase (see the sketch below). 
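            # A sketch for b0's stage 3 string 'r2_k3_s22_e6_i16_o24_se0.25': the 
            # first MBConvBlock runs with stride 2 and 16->24 filters, then the 
            # single remaining repeat runs with stride 1 and 24->24 filters. 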
404 |             self._blocks.append(MBConvBlock(block_args, self._global_params, norm_cfg)) 
405 |             if block_args.num_repeat > 1: 
406 |                 block_args = block_args._replace(input_filters=block_args.output_filters, stride=1) 
407 |             for _ in range(block_args.num_repeat - 1): 
408 |                 self._blocks.append(MBConvBlock(block_args, self._global_params, norm_cfg)) 
409 |             # record the cumulative index of the last block in each stage 
410 |             cum_idx += block_args.num_repeat 
411 |             self.per_last_stage_idx.append(cum_idx) 
412 | 
413 |         # map each requested stage to the index of its last block (stage 1 is the stem, stages 2-8 are the block groups) 
414 |         self.out_stage_idx = [] 
415 |         for stage in self.out_indices: 
416 |             self.out_stage_idx.append(self.per_last_stage_idx[stage - 2] - 1) 
417 | 
418 | 
419 |         # Head (disabled: this module is used as a detection backbone) 
420 |         # in_channels = block_args.output_filters  # output of final block 
421 |         # out_channels = round_filters(1280, self._global_params) 
422 |         # self._conv_head = Conv2d(in_channels, out_channels, kernel_size=1, bias=False) 
423 |         # self._bn1 = nn.BatchNorm2d(num_features=out_channels, momentum=bn_mom, eps=bn_eps) 
424 | 
425 |         # Final linear layer (disabled for the same reason) 
426 |         # self._avg_pooling = nn.AdaptiveAvgPool2d(1) 
427 |         # self._dropout = nn.Dropout(self._global_params.dropout_rate) 
428 |         # self._fc = nn.Linear(out_channels, self._global_params.num_classes) 
429 |         self._swish = MemoryEfficientSwish() 
430 | 
431 |     def set_swish(self, memory_efficient=True): 
432 |         """ Sets swish function as memory efficient (for training) or standard (for export) """ 
433 |         self._swish = MemoryEfficientSwish() if memory_efficient else Swish() 
434 |         for block in self._blocks: 
435 |             block.set_swish(memory_efficient) 
436 | 
437 |     def get_last_stage_idx(self): 
438 |         return self.per_last_stage_idx  # cumulative index of the last block in each stage 
439 | 
440 |     def extract_features(self, inputs): 
441 |         """ Returns the feature maps of the stages listed in out_indices """ 
442 | 
443 |         # Stem 
444 |         x = self._swish(self._bn0(self._conv_stem(inputs))) 
445 | 
446 |         # Blocks 
447 |         outs = [] 
448 |         for idx, block in enumerate(self._blocks): 
449 |             drop_connect_rate = self._global_params.drop_connect_rate 
450 |             if drop_connect_rate: 
451 |                 drop_connect_rate *= float(idx) / len(self._blocks)  # scale drop connect linearly with depth 
452 |             x = block(x, drop_connect_rate=drop_connect_rate) 
453 |             # emit the feature map if this is the last block of a requested stage 
454 |             if idx in self.out_stage_idx: 
455 |                 outs.append(x) 
456 |         # Head 
457 |         # x = self._swish(self._bn1(self._conv_head(x))) 
458 | 
459 |         return tuple(outs) 
460 | 
461 |     def init_weights(self, pretrained=None): 
462 |         if isinstance(pretrained, str): 
463 |             logger = logging.getLogger() 
464 |             load_checkpoint(self, pretrained, strict=False, logger=logger) 
465 |         elif pretrained is None: 
466 |             # nothing to load; modules keep their default initialization 
467 |             return 
468 |         else: 
469 |             raise TypeError('pretrained must be a str or None') 
470 | 
471 | 
472 |     def forward(self, inputs): 
473 |         """ Extracts and returns the multi-scale backbone features; the classification head has been removed. """ 
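        # A sketch of the expected result (derived from the b0 blocks_args above): with 
        # the default out_indices=(4, 5, 6, 7, 8), the returned tuple holds five maps 
        # with strides 8, 16, 16, 32, 32 and 40, 80, 112, 192, 320 channels respectively. 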
""" 474 | bs = inputs.size(0) 475 | # Convolution layers 476 | x = self.extract_features(inputs) 477 | 478 | # # Pooling and final linear layer 479 | # x = self._avg_pooling(x) 480 | # x = x.view(bs, -1) 481 | # x = self._dropout(x) 482 | # x = self._fc(x) 483 | return x 484 | 485 | @classmethod 486 | def from_name(cls, model_name, override_params=None): 487 | cls._check_model_name_is_valid(model_name) 488 | blocks_args, global_params = get_model_params(model_name, override_params) 489 | return cls(blocks_args, global_params) 490 | 491 | @classmethod 492 | def from_pretrained(cls, model_name, num_classes=1000, in_channels = 3): 493 | model = cls.from_name(model_name, override_params={'num_classes': num_classes}) 494 | load_pretrained_weights(model, model_name, load_fc=(num_classes == 1000)) 495 | if in_channels != 3: 496 | Conv2d = get_same_padding_conv2d(image_size = model._global_params.image_size) 497 | out_channels = round_filters(32, model._global_params) 498 | model._conv_stem = Conv2d(in_channels, out_channels, kernel_size=3, stride=2, bias=False) 499 | return model 500 | 501 | @classmethod 502 | def from_pretrained(cls, model_name, num_classes=1000): 503 | model = cls.from_name(model_name, override_params={'num_classes': num_classes}) 504 | load_pretrained_weights(model, model_name, load_fc=(num_classes == 1000)) 505 | 506 | return model 507 | 508 | @classmethod 509 | def get_image_size(cls, model_name): 510 | cls._check_model_name_is_valid(model_name) 511 | _, _, res, _ = efficientnet_params(model_name) 512 | return res 513 | 514 | @classmethod 515 | def _check_model_name_is_valid(cls, model_name, also_need_pretrained_weights=False): 516 | """ Validates model name. None that pretrained weights are only available for 517 | the first four models (efficientnet-b{i} for i in 0,1,2,3) at the moment. """ 518 | num_models = 4 if also_need_pretrained_weights else 8 519 | valid_models = ['efficientnet-b'+str(i) for i in range(num_models)] 520 | if model_name not in valid_models: 521 | raise ValueError('model_name should be one of: ' + ', '.join(valid_models)) 522 | 523 | def train(self, mode=True): 524 | super(EfficientNet, self).train(mode) 525 | # self._freeze_stages() 526 | if mode and self.norm_eval: 527 | for m in self.modules(): 528 | # trick: eval have effect on BatchNorm only 529 | if isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.SyncBatchNorm): 530 | m.eval() 531 | 532 | 533 | 534 | url_map = { 535 | 'efficientnet-b0': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b0-355c32eb.pth', 536 | 'efficientnet-b1': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b1-f1951068.pth', 537 | 'efficientnet-b2': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b2-8bb594d6.pth', 538 | 'efficientnet-b3': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b3-5fb5a3c3.pth', 539 | 'efficientnet-b4': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b4-6ed6700e.pth', 540 | 'efficientnet-b5': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b5-b6417697.pth', 541 | 'efficientnet-b6': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b6-c76e70fd.pth', 542 | 'efficientnet-b7': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b7-dcc49843.pth', 543 | } 544 | 545 | def load_pretrained_weights(model, model_name, load_fc=True): 546 | """ Loads pretrained weights, and downloads if loading for the first time. 
""" 547 | state_dict = model_zoo.load_url(url_map[model_name]) 548 | if load_fc: 549 | model.load_state_dict(state_dict) 550 | else: 551 | state_dict.pop('_fc.weight') 552 | state_dict.pop('_fc.bias') 553 | res = model.load_state_dict(state_dict, strict=False) 554 | assert set(res.missing_keys) == set(['_fc.weight', '_fc.bias']), 'issue loading pretrained weights' 555 | print('Loaded pretrained weights for {}'.format(model_name)) 556 | --------------------------------------------------------------------------------