├── .circleci └── config.yml ├── .gitignore ├── .vscode └── settings.json ├── LICENSE ├── README.md ├── configs └── efficientdet-d0.yaml ├── datasets ├── __init__.py ├── augmentation.py ├── coco.py ├── coco_labels.txt ├── scripts │ ├── COCO2014.sh │ ├── COCO2017.sh │ ├── VOC2007.sh │ └── VOC2012.sh ├── visual_aug.py └── voc0712.py ├── demo.py ├── docs ├── arch.png ├── compare.png ├── demo.png ├── output.png ├── performance.png └── pytoan.gif ├── eval.py ├── models ├── __init__.py ├── bifpn.py ├── efficientdet.py ├── efficientnet.py ├── losses.py ├── module.py ├── retinahead.py └── utils.py ├── requirements.txt ├── test.py ├── train.py └── utils ├── __init__.py ├── config_eff.py ├── helper.py ├── metric.py ├── util.py ├── vis_bbox.py └── visualization.py /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | jobs: 3 | build: 4 | docker: 5 | - image: toandaominh1997/pytoan:latest 6 | steps: 7 | - checkout # check out the code in the project directory 8 | - run: | 9 | pip install flake8 10 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 11 | python test.py 12 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # vscode 107 | .vscode/* 108 | !.vscode/settings.json 109 | !.vscode/tasks.json 110 | !.vscode/launch.json 111 | !.vscode/extensions.json 112 | *.code-workspace 113 | 114 | 115 | saved/ 116 | weights/ 117 | val2017_bbox_results.json -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.pythonPath": "/home/toandm2/devtools/anaconda3/envs/pytoan/bin/python" 3 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Toan Dao Minh(bigkizd) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # EfficientDet: Scalable and Efficient Object Detection, in PyTorch 2 | A [PyTorch](http://pytorch.org/) implementation of [EfficientDet](https://arxiv.org/abs/1911.09070) from the 2019 paper by Mingxing Tan, Ruoming Pang, and Quoc V. Le, 3 | Google Research, Brain Team. The official and original implementation: coming soon.
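If you just want to try it out, a minimal setup sketch (assuming PyTorch is already installed; see the Installation section below) is to clone the repository and install the remaining Python packages:
```Shell
git clone https://github.com/toandaominh1997/EfficientDet.Pytorch.git
cd EfficientDet.Pytorch
pip install -r requirements.txt
```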
4 | 5 | 6 | 7 | 8 | # Fun with Demo: 9 | ```Shell 10 | python demo.py --weight ./checkpoint_VOC_efficientdet-d1_97.pth --threshold 0.6 --iou_threshold 0.5 --cam --score 11 | ``` 12 | 13 |
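The same thing can be driven from Python through the `Detect` helper defined in `demo.py`. A rough sketch of the equivalent calls is below; the checkpoint path is a placeholder, and `Detect` reads `num_class`/`network` back from the arguments stored in the checkpoint:
```python
# Rough sketch only: mirrors what demo.py's __main__ does with the flags above.
# Note that demo.py parses its command-line flags at import time, so this is
# easiest to run from a plain `python` session with no extra arguments.
from demo import Detect

detect = Detect(weights='./checkpoint_VOC_efficientdet-d1_97.pth')  # placeholder checkpoint path
detect.process(file_name='demo.png', show=True)  # single image; the annotated figure is saved to ./docs/demo.png
# detect.camera()                                # webcam loop, the equivalent of passing --cam
```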

14 | 15 |

16 | 17 | 18 | ### Table of Contents 19 | - Recent Update 20 | - Benchmarking 21 | - Installation 22 | 23 | - Prerequisites 24 | - Datasets 25 | - Train 26 | - Evaluate 27 | - Performance 28 | - Demo 29 | - Future Work 30 | - Reference 31 | 32 | 33 | 34 | 35 | 36 | 37 | ## Recent Update 38 | - [06/01/2020] Support both DistributedDataParallel and DataParallel, change augmentation, eval_voc 39 | - [17/12/2019] Add fast normalized fusion, augmentation with ratio, change RetinaHead, fix support for EfficientDet-D0->D7 40 | - [7/12/2019] Support EfficientDet-D0, EfficientDet-D1, EfficientDet-D2, EfficientDet-D3, EfficientDet-D4, ... Support configurable gradient accumulation steps and AdamW. 41 | ## Benchmarking 42 | 43 | We benchmark our code on two datasets, PASCAL VOC and COCO, using the EfficientNet-based family of architectures EfficientDet-D0->D7. Below are the results: 44 | 45 | 1) PASCAL VOC 2007 (Train/Test: 07trainval/07test, scale=600, ROI Align) 46 | 47 | model | mAP | 48 | ---------|--------| 49 | [EfficientDet-D0 (with weights)](https://drive.google.com/file/d/1r7MAyBfG5OK_9F_cU8yActUWxTHOuOpL/view?usp=sharing) | 62.16 50 | 51 | 52 | ## Installation 53 | - Install [PyTorch](http://pytorch.org/) by selecting your environment on the website and running the appropriate command. 54 | - Clone this repository and install the package [prerequisites](#prerequisites) below. 55 | - Then download the dataset by following the [instructions](#datasets) below. 56 | - Note: For training, we currently support [VOC](http://host.robots.ox.ac.uk/pascal/VOC/) and [COCO](http://mscoco.org/), and aim to add [ImageNet](http://www.image-net.org/) support soon. 57 | 58 | ### Prerequisites 59 | 60 | * Python 3.6+ 61 | * PyTorch 1.3+ 62 | * Torchvision 0.4.0+ (**a recent version is required because we rely on Torchvision's built-in NMS**) 63 | * the packages listed in requirements.txt 64 | ## Datasets 65 | To make things easy, we provide bash scripts to handle the dataset downloads and setup for you. We also provide simple dataset loaders that inherit `torch.utils.data.Dataset`, making them fully compatible with the `torchvision.datasets` [API](http://pytorch.org/docs/torchvision/datasets.html). 66 | 67 | ### VOC Dataset 68 | PASCAL VOC: Visual Object Classes 69 | 70 | ##### Download VOC2007 + VOC2012 trainval & test 71 | ```Shell 72 | # specify a directory for the dataset to be downloaded into, else the default is ~/data/ 73 | sh datasets/scripts/VOC2007.sh 74 | sh datasets/scripts/VOC2012.sh 75 | ``` 76 | 77 | ### COCO 78 | Microsoft COCO: Common Objects in Context 79 | 80 | ##### Download COCO 2017 81 | ```Shell 82 | # specify a directory for the dataset to be downloaded into, else the default is ~/data/ 83 | sh datasets/scripts/COCO2017.sh 84 | ``` 85 | 86 | ## Training EfficientDet 87 | 88 | - To train EfficientDet with the train script, simply specify the parameters listed in `train.py` as flags or change them manually.
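Every training option is exposed as a command-line flag, so argparse's built-in help prints the full, authoritative list (the flag names come from whatever `train.py` actually defines):
```Shell
python train.py --help
```
The examples below show the most common combinations.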
89 | 90 | ```Shell 91 | python train.py --network efficientdet-d0 # Example 92 | ``` 93 | 94 | - With VOC Dataset: 95 | ```Shell 96 | # DataParallel 97 | python train.py --dataset VOC --dataset_root /root/data/VOCdevkit/ --network efficientdet-d0 --batch_size 32 98 | # DistributedDataParallel with backend nccl 99 | python train.py --dataset VOC --dataset_root /root/data/VOCdevkit/ --network efficientdet-d0 --batch_size 32 --multiprocessing-distributed 100 | ``` 101 | - With COCO Dataset: 102 | ```Shell 103 | # DataParallel 104 | python train.py --dataset COCO --dataset_root ~/data/coco/ --network efficientdet-d0 --batch_size 32 105 | # DistributedDataParallel with backend nccl 106 | python train.py --dataset COCO --dataset_root ~/data/coco/ --network efficientdet-d0 --batch_size 32 --multiprocessing-distributed 107 | ``` 108 | 109 | ## Evaluation 110 | To evaluate a trained network: 111 | - With VOC Dataset: 112 | ```Shell 113 | python eval.py --dataset VOC --dataset_root ~/data/VOCdevkit --weight ./checkpoint_VOC_efficientdet-d0_261.pth 114 | ``` 115 | - With COCO Dataset: 116 | coming soon. 117 | ## Demo 118 | 119 | ```Shell 120 | python demo.py --threshold 0.5 --iou_threshold 0.5 --score --weight checkpoint_VOC_efficientdet-d1_34.pth --file_name demo.png 121 | ``` 122 | 123 | Output: 124 | 125 |

126 | 127 |

128 | 129 | ## Webcam Demo 130 | 131 | You can use a webcam in a real-time demo by running: 132 | ```Shell 133 | python demo.py --threshold 0.5 --iou_threshold 0.5 --cam --score --weight checkpoint_VOC_efficientdet-d1_34.pth 134 | ``` 135 | 136 | ## Performance 137 | 138 | 139 | 140 | 141 | ## TODO 142 | We have accumulated the following to-do list, which we hope to complete in the near future 143 | - Still to come: 144 | * [x] EfficientDet-[D0-7] 145 | * [x] GPU-Parallel 146 | * [x] NMS 147 | * [ ] Soft-NMS 148 | * [x] Pretrained model 149 | * [x] Demo 150 | * [ ] Model zoo 151 | * [ ] TorchScript 152 | * [ ] Mobile 153 | * [ ] C++ Onnx 154 | 155 | 156 | ## Authors 157 | 158 | * [**Toan Dao Minh**](https://github.com/toandaominh1997) 159 | 160 | ***Note:*** Unfortunately, this is just a hobby of ours and not a full-time job, so we'll do our best to keep things up to date, but no guarantees. That being said, thanks to everyone for your continued help and feedback as it is really appreciated. We will try to address everything as soon as possible. 161 | 162 | ## References 163 | - tanmingxing, rpang, qvl, et al. "EfficientDet: Scalable and Efficient Object Detection." [EfficientDet](https://arxiv.org/abs/1911.09070). 164 | - A list of other great EfficientDet ports that were sources of inspiration: 165 | * [EfficientNet](https://github.com/lukemelas/EfficientNet-PyTorch) 166 | * [SSD.Pytorch](https://github.com/amdegroot/ssd.pytorch) 167 | * [mmdetection](https://github.com/open-mmlab/mmdetection) 168 | * [RetinaNet.Pytorch](https://github.com/yhenon/pytorch-retinanet) 169 | * [NMS.Torchvision](https://pytorch.org/docs/stable/torchvision/ops.html) 170 | 171 | 172 | ## Citation 173 | 174 | @article{efficientdetpytoan, 175 | Author = {Toan Dao Minh}, 176 | Title = {A Pytorch Implementation of EfficientDet Object Detection}, 177 | Journal = {github.com/toandaominh1997/EfficientDet.Pytorch}, 178 | Year = {2019} 179 | } 180 | -------------------------------------------------------------------------------- /configs/efficientdet-d0.yaml: -------------------------------------------------------------------------------- 1 | SEED: 42 2 | DEVICE: [0, 1] 3 | # DATASET 4 | DATA_TRAIN: VOC 5 | 6 | GRADIENT_ACCUMULATION_STEPS: 1 7 | GRADIENT_CLIPPING: 1 8 | NUM_EPOCH: 500 9 | EARLY_STOPPING: 50 10 | VALIDATION_FREQUENCY: 2 11 | TENSORBOARD: True 12 | CHECKPOINT_DIR: ./saved 13 | RESUME_PATH: 14 | 15 | TRAIN_DATASET: 16 | PY: datasets 17 | CLASS: spoofDataset 18 | ARGS: 19 | root_dir: ./ 20 | phase: train 21 | 22 | VALID_DATASET: 23 | PY: datasets 24 | CLASS: spoofDataset 25 | ARGS: 26 | root_dir: ./ 27 | phase: valid 28 | 29 | TEST_DATASET: 30 | PY: datasets.dataset 31 | CLASS: spoofDataset 32 | ARGS: 33 | root_dir: ./data 34 | phase: valid 35 | 36 | TRAIN_DATALOADER: 37 | PY: torch.utils.data 38 | CLASS: DataLoader 39 | ARGS: 40 | batch_size: 8 41 | shuffle: True 42 | num_workers: 8 43 | pin_memory: True 44 | 45 | VALID_DATALOADER: 46 | PY: torch.utils.data 47 | CLASS: DataLoader 48 | ARGS: 49 | batch_size: 8 50 | shuffle: False 51 | num_workers: 8 52 | pin_memory: True 53 | 54 | TEST_DATALOADER: 55 | PY: torch.utils.data 56 | CLASS: DataLoader 57 | ARGS: 58 | batch_size: 8 59 | shuffle: False 60 | num_workers: 8 61 | 62 | MODEL: 63 | PY: models 64 | CLASS: EfficientDet 65 | ARGS: 66 | num_class: 21 67 | levels: 3 68 | num_channels: 128 69 | model_name: efficientnet-b0 70 | 71 | CRITERION: 72 | PY: layers.modules 73 | CLASS: MultiBoxLoss 74 | ARGS: 75 | num_classes: 21 76 | overlap_thresh: 0.5 77 | 
prior_for_matching: True 78 | bkg_label: 0 79 | neg_mining: True 80 | neg_pos: 3 81 | neg_overlap: 0.5 82 | encode_target: False 83 | use_gpu: False 84 | 85 | OPTIMIZER: 86 | PY: torch.optim 87 | CLASS: AdamW 88 | ARGS: 89 | lr: 0.0001 90 | weight_decay: 0.000005 91 | 92 | SCHEDULER: 93 | PY: torch.optim.lr_scheduler 94 | CLASS: ReduceLROnPlateau 95 | ARGS: 96 | factor: 0.15 97 | patience: 2 98 | -------------------------------------------------------------------------------- /datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .voc0712 import VOCDetection, VOC_CLASSES 2 | from .augmentation import get_augumentation, detection_collate, Resizer, Normalizer, Augmenter, collater 3 | from .coco import CocoDataset -------------------------------------------------------------------------------- /datasets/augmentation.py: -------------------------------------------------------------------------------- 1 | import albumentations as albu 2 | from albumentations.pytorch.transforms import ToTensor 3 | import torch 4 | import numpy as np 5 | import cv2 6 | 7 | 8 | def get_augumentation(phase, width=512, height=512, min_area=0., min_visibility=0.): 9 | list_transforms = [] 10 | if phase == 'train': 11 | list_transforms.extend([ 12 | albu.augmentations.transforms.LongestMaxSize( 13 | max_size=width, always_apply=True), 14 | albu.PadIfNeeded(min_height=height, min_width=width, 15 | always_apply=True, border_mode=0, value=[0, 0, 0]), 16 | albu.augmentations.transforms.RandomResizedCrop( 17 | height=height, 18 | width=width, p=0.3), 19 | albu.augmentations.transforms.Flip(), 20 | albu.augmentations.transforms.Transpose(), 21 | albu.OneOf([ 22 | albu.RandomBrightnessContrast(brightness_limit=0.5, 23 | contrast_limit=0.4), 24 | albu.RandomGamma(gamma_limit=(50, 150)), 25 | albu.NoOp() 26 | ]), 27 | albu.OneOf([ 28 | albu.RGBShift(r_shift_limit=20, b_shift_limit=15, 29 | g_shift_limit=15), 30 | albu.HueSaturationValue(hue_shift_limit=5, 31 | sat_shift_limit=5), 32 | albu.NoOp() 33 | ]), 34 | albu.CLAHE(p=0.8), 35 | albu.HorizontalFlip(p=0.5), 36 | albu.VerticalFlip(p=0.5), 37 | ]) 38 | if(phase == 'test' or phase == 'valid'): 39 | list_transforms.extend([ 40 | albu.Resize(height=height, width=width) 41 | ]) 42 | list_transforms.extend([ 43 | albu.Normalize(mean=(0.485, 0.456, 0.406), 44 | std=(0.229, 0.224, 0.225), p=1), 45 | ToTensor() 46 | ]) 47 | if(phase == 'test'): 48 | return albu.Compose(list_transforms) 49 | return albu.Compose(list_transforms, bbox_params=albu.BboxParams(format='pascal_voc', min_area=min_area, 50 | min_visibility=min_visibility, label_fields=['category_id'])) 51 | 52 | 53 | def detection_collate(batch): 54 | imgs = [s['image'] for s in batch] 55 | annots = [s['bboxes'] for s in batch] 56 | labels = [s['category_id'] for s in batch] 57 | 58 | max_num_annots = max(len(annot) for annot in annots) 59 | annot_padded = np.ones((len(annots), max_num_annots, 5))*-1 60 | 61 | if max_num_annots > 0: 62 | for idx, (annot, lab) in enumerate(zip(annots, labels)): 63 | if len(annot) > 0: 64 | annot_padded[idx, :len(annot), :4] = annot 65 | annot_padded[idx, :len(annot), 4] = lab 66 | return (torch.stack(imgs, 0), torch.FloatTensor(annot_padded)) 67 | 68 | 69 | def collater(data): 70 | imgs = [s['img'] for s in data] 71 | annots = [s['annot'] for s in data] 72 | scales = [s['scale'] for s in data] 73 | 74 | imgs = torch.from_numpy(np.stack(imgs, axis=0)) 75 | 76 | max_num_annots = max(annot.shape[0] for annot in annots) 77 | 78 | if 
max_num_annots > 0: 79 | 80 | annot_padded = torch.ones((len(annots), max_num_annots, 5)) * -1 81 | 82 | if max_num_annots > 0: 83 | for idx, annot in enumerate(annots): 84 | if annot.shape[0] > 0: 85 | annot_padded[idx, :annot.shape[0], :] = annot 86 | else: 87 | annot_padded = torch.ones((len(annots), 1, 5)) * -1 88 | 89 | imgs = imgs.permute(0, 3, 1, 2) 90 | 91 | return (imgs, torch.FloatTensor(annot_padded)) 92 | 93 | 94 | class Resizer(object): 95 | """Convert ndarrays in sample to Tensors.""" 96 | 97 | def __call__(self, sample, common_size=512): 98 | image, annots = sample['img'], sample['annot'] 99 | height, width, _ = image.shape 100 | if height > width: 101 | scale = common_size / height 102 | resized_height = common_size 103 | resized_width = int(width * scale) 104 | else: 105 | scale = common_size / width 106 | resized_height = int(height * scale) 107 | resized_width = common_size 108 | 109 | image = cv2.resize(image, (resized_width, resized_height)) 110 | 111 | new_image = np.zeros((common_size, common_size, 3)) 112 | new_image[0:resized_height, 0:resized_width] = image 113 | annots[:, :4] *= scale 114 | 115 | return {'img': torch.from_numpy(new_image), 'annot': torch.from_numpy(annots), 'scale': scale} 116 | 117 | 118 | class Augmenter(object): 119 | """Convert ndarrays in sample to Tensors.""" 120 | 121 | def __call__(self, sample, flip_x=0.5): 122 | if np.random.rand() < flip_x: 123 | image, annots = sample['img'], sample['annot'] 124 | image = image[:, ::-1, :] 125 | 126 | rows, cols, channels = image.shape 127 | 128 | x1 = annots[:, 0].copy() 129 | x2 = annots[:, 2].copy() 130 | 131 | x_tmp = x1.copy() 132 | 133 | annots[:, 0] = cols - x2 134 | annots[:, 2] = cols - x_tmp 135 | 136 | sample = {'img': image, 'annot': annots} 137 | 138 | return sample 139 | 140 | 141 | class Normalizer(object): 142 | 143 | def __init__(self): 144 | self.mean = np.array([[[0.485, 0.456, 0.406]]]) 145 | self.std = np.array([[[0.229, 0.224, 0.225]]]) 146 | 147 | def __call__(self, sample): 148 | image, annots = sample['img'], sample['annot'] 149 | 150 | return {'img': ((image.astype(np.float32) - self.mean) / self.std), 'annot': annots} 151 | -------------------------------------------------------------------------------- /datasets/coco.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import sys 3 | import os 4 | import torch 5 | import numpy as np 6 | import random 7 | import csv 8 | 9 | from torch.utils.data import Dataset, DataLoader 10 | from torchvision import transforms, utils 11 | from torch.utils.data.sampler import Sampler 12 | 13 | from pycocotools.coco import COCO 14 | 15 | import skimage.io 16 | import skimage.transform 17 | import skimage.color 18 | import skimage 19 | import cv2 20 | from PIL import Image 21 | 22 | 23 | class CocoDataset(Dataset): 24 | """Coco dataset.""" 25 | 26 | def __init__(self, root_dir, set_name='train2017', transform=None): 27 | """ 28 | Args: 29 | root_dir (string): COCO directory. 30 | transform (callable, optional): Optional transform to be applied 31 | on a sample. 
32 | """ 33 | self.root_dir = root_dir 34 | self.set_name = set_name 35 | self.transform = transform 36 | 37 | self.coco = COCO(os.path.join(self.root_dir, 'annotations', 38 | 'instances_' + self.set_name + '.json')) 39 | self.image_ids = self.coco.getImgIds() 40 | 41 | self.load_classes() 42 | 43 | def load_classes(self): 44 | # load class names (name -> label) 45 | categories = self.coco.loadCats(self.coco.getCatIds()) 46 | categories.sort(key=lambda x: x['id']) 47 | 48 | self.classes = {} 49 | self.coco_labels = {} 50 | self.coco_labels_inverse = {} 51 | for c in categories: 52 | self.coco_labels[len(self.classes)] = c['id'] 53 | self.coco_labels_inverse[c['id']] = len(self.classes) 54 | self.classes[c['name']] = len(self.classes) 55 | 56 | # also load the reverse (label -> name) 57 | self.labels = {} 58 | for key, value in self.classes.items(): 59 | self.labels[value] = key 60 | 61 | def __len__(self): 62 | return len(self.image_ids) 63 | 64 | def __getitem__(self, idx): 65 | 66 | img = self.load_image(idx) 67 | annot = self.load_annotations(idx) 68 | sample = {'img': img, 'annot': annot} 69 | if self.transform: 70 | sample = self.transform(sample) 71 | return sample 72 | 73 | def load_image(self, image_index): 74 | image_info = self.coco.loadImgs(self.image_ids[image_index])[0] 75 | path = os.path.join(self.root_dir, 'images', 76 | self.set_name, image_info['file_name']) 77 | img = cv2.imread(path) 78 | 79 | if len(img.shape) == 2: 80 | img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB) 81 | return img 82 | 83 | def load_annotations(self, image_index): 84 | # get ground truth annotations 85 | annotations_ids = self.coco.getAnnIds( 86 | imgIds=self.image_ids[image_index], iscrowd=False) 87 | annotations = np.zeros((0, 5)) 88 | 89 | # some images appear to miss annotations (like image with id 257034) 90 | if len(annotations_ids) == 0: 91 | return annotations 92 | 93 | # parse annotations 94 | coco_annotations = self.coco.loadAnns(annotations_ids) 95 | for idx, a in enumerate(coco_annotations): 96 | 97 | # some annotations have basically no width / height, skip them 98 | if a['bbox'][2] < 1 or a['bbox'][3] < 1: 99 | continue 100 | 101 | annotation = np.zeros((1, 5)) 102 | annotation[0, :4] = a['bbox'] 103 | annotation[0, 4] = self.coco_label_to_label(a['category_id']) 104 | annotations = np.append(annotations, annotation, axis=0) 105 | 106 | # transform from [x, y, w, h] to [x1, y1, x2, y2] 107 | annotations[:, 2] = annotations[:, 0] + annotations[:, 2] 108 | annotations[:, 3] = annotations[:, 1] + annotations[:, 3] 109 | 110 | return annotations 111 | 112 | def coco_label_to_label(self, coco_label): 113 | return self.coco_labels_inverse[coco_label] 114 | 115 | def label_to_coco_label(self, label): 116 | return self.coco_labels[label] 117 | 118 | def image_aspect_ratio(self, image_index): 119 | image = self.coco.loadImgs(self.image_ids[image_index])[0] 120 | return float(image['width']) / float(image['height']) 121 | 122 | def num_classes(self): 123 | return 80 124 | 125 | 126 | if __name__ == '__main__': 127 | from augmentation import get_augumentation 128 | dataset = CocoDataset(root_dir='/root/data/coco', set_name='trainval35k', 129 | transform=get_augumentation(phase='train')) 130 | sample = dataset[0] 131 | print('sample: ', sample) 132 | -------------------------------------------------------------------------------- /datasets/coco_labels.txt: -------------------------------------------------------------------------------- 1 | 1,1,person 2 | 2,2,bicycle 3 | 3,3,car 4 | 4,4,motorcycle 5 | 
5,5,airplane 6 | 6,6,bus 7 | 7,7,train 8 | 8,8,truck 9 | 9,9,boat 10 | 10,10,traffic light 11 | 11,11,fire hydrant 12 | 13,12,stop sign 13 | 14,13,parking meter 14 | 15,14,bench 15 | 16,15,bird 16 | 17,16,cat 17 | 18,17,dog 18 | 19,18,horse 19 | 20,19,sheep 20 | 21,20,cow 21 | 22,21,elephant 22 | 23,22,bear 23 | 24,23,zebra 24 | 25,24,giraffe 25 | 27,25,backpack 26 | 28,26,umbrella 27 | 31,27,handbag 28 | 32,28,tie 29 | 33,29,suitcase 30 | 34,30,frisbee 31 | 35,31,skis 32 | 36,32,snowboard 33 | 37,33,sports ball 34 | 38,34,kite 35 | 39,35,baseball bat 36 | 40,36,baseball glove 37 | 41,37,skateboard 38 | 42,38,surfboard 39 | 43,39,tennis racket 40 | 44,40,bottle 41 | 46,41,wine glass 42 | 47,42,cup 43 | 48,43,fork 44 | 49,44,knife 45 | 50,45,spoon 46 | 51,46,bowl 47 | 52,47,banana 48 | 53,48,apple 49 | 54,49,sandwich 50 | 55,50,orange 51 | 56,51,broccoli 52 | 57,52,carrot 53 | 58,53,hot dog 54 | 59,54,pizza 55 | 60,55,donut 56 | 61,56,cake 57 | 62,57,chair 58 | 63,58,couch 59 | 64,59,potted plant 60 | 65,60,bed 61 | 67,61,dining table 62 | 70,62,toilet 63 | 72,63,tv 64 | 73,64,laptop 65 | 74,65,mouse 66 | 75,66,remote 67 | 76,67,keyboard 68 | 77,68,cell phone 69 | 78,69,microwave 70 | 79,70,oven 71 | 80,71,toaster 72 | 81,72,sink 73 | 82,73,refrigerator 74 | 84,74,book 75 | 85,75,clock 76 | 86,76,vase 77 | 87,77,scissors 78 | 88,78,teddy bear 79 | 89,79,hair drier 80 | 90,80,toothbrush -------------------------------------------------------------------------------- /datasets/scripts/COCO2014.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | start=`date +%s` 4 | 5 | # handle optional download dir 6 | if [ -z "$1" ] 7 | then 8 | # navigate to ~/data 9 | echo "navigating to ~/data/ ..." 10 | mkdir -p ~/data 11 | cd ~/data/ 12 | mkdir -p ./coco 13 | cd ./coco 14 | mkdir -p ./images 15 | mkdir -p ./annotations 16 | else 17 | # check if specified dir is valid 18 | if [ ! -d $1 ]; then 19 | echo $1 " is not a valid directory" 20 | exit 0 21 | fi 22 | echo "navigating to " $1 " ..." 23 | cd $1 24 | fi 25 | 26 | if [ ! -d images ] 27 | then 28 | mkdir -p ./images 29 | fi 30 | 31 | # Download the image data. 32 | cd ./images 33 | echo "Downloading MSCOCO train images ..." 34 | curl -LO http://images.cocodataset.org/zips/train2014.zip 35 | echo "Downloading MSCOCO val images ..." 36 | curl -LO http://images.cocodataset.org/zips/val2014.zip 37 | 38 | cd ../ 39 | if [ ! -d annotations] 40 | then 41 | mkdir -p ./annotations 42 | fi 43 | 44 | # Download the annotation data. 45 | cd ./annotations 46 | echo "Downloading MSCOCO train/val annotations ..." 47 | curl -LO http://images.cocodataset.org/annotations/annotations_trainval2014.zip 48 | echo "Finished downloading. Now extracting ..." 49 | 50 | # Unzip data 51 | echo "Extracting train images ..." 52 | unzip ../images/train2014.zip -d ../images 53 | echo "Extracting val images ..." 54 | unzip ../images/val2014.zip -d ../images 55 | echo "Extracting annotations ..." 56 | unzip ./annotations_trainval2014.zip 57 | 58 | echo "Removing zip files ..." 59 | rm ../images/train2014.zip 60 | rm ../images/val2014.zip 61 | rm ./annotations_trainval2014.zip 62 | 63 | echo "Creating trainval35k dataset..." 
64 | 65 | # Download annotations json 66 | echo "Downloading trainval35k annotations from S3" 67 | curl -LO https://s3.amazonaws.com/amdegroot-datasets/instances_trainval35k.json.zip 68 | 69 | # combine train and val 70 | echo "Combining train and val images" 71 | mkdir ../images/trainval35k 72 | cd ../images/train2014 73 | find -maxdepth 1 -name '*.jpg' -exec cp -t ../trainval35k {} + # dir too large for cp 74 | cd ../val2014 75 | find -maxdepth 1 -name '*.jpg' -exec cp -t ../trainval35k {} + 76 | 77 | 78 | end=`date +%s` 79 | runtime=$((end-start)) 80 | 81 | echo "Completed in " $runtime " seconds" 82 | -------------------------------------------------------------------------------- /datasets/scripts/COCO2017.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | start=`date +%s` 4 | 5 | # handle optional download dir 6 | if [ -z "$1" ] 7 | then 8 | # navigate to ~/data 9 | echo "navigating to ~/data/ ..." 10 | mkdir -p ~/data 11 | cd ~/data/ 12 | mkdir -p ./coco 13 | cd ./coco 14 | mkdir -p ./images 15 | mkdir -p ./annotations 16 | else 17 | # check if specified dir is valid 18 | if [ ! -d $1 ]; then 19 | echo $1 " is not a valid directory" 20 | exit 0 21 | fi 22 | echo "navigating to " $1 " ..." 23 | cd $1 24 | fi 25 | 26 | if [ ! -d images ] 27 | then 28 | mkdir -p ./images 29 | fi 30 | 31 | # Download the image data. 32 | cd ./images 33 | echo "Downloading MSCOCO train images ..." 34 | curl -LO http://images.cocodataset.org/zips/train2017.zip 35 | echo "Downloading MSCOCO val images ..." 36 | curl -LO http://images.cocodataset.org/zips/val2017.zip 37 | 38 | cd ../ 39 | if [ ! -d annotations] 40 | then 41 | mkdir -p ./annotations 42 | fi 43 | 44 | # Download the annotation data. 45 | cd ./annotations 46 | echo "Downloading MSCOCO train/val annotations ..." 47 | curl -LO http://images.cocodataset.org/annotations/annotations_trainval2017.zip 48 | echo "Finished downloading. Now extracting ..." 49 | 50 | # Unzip data 51 | echo "Extracting train images ..." 52 | unzip ../images/train2017.zip -d ../images 53 | echo "Extracting val images ..." 54 | unzip ../images/val2017.zip -d ../images 55 | echo "Extracting annotations ..." 56 | unzip ./annotations_trainval2017.zip 57 | 58 | echo "Removing zip files ..." 59 | rm ../images/train2017.zip 60 | rm ../images/val2017.zip 61 | rm ./annotations_trainval2017.zip 62 | 63 | echo "Completed in " $runtime " seconds" 64 | -------------------------------------------------------------------------------- /datasets/scripts/VOC2007.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Ellis Brown 3 | 4 | start=`date +%s` 5 | 6 | # handle optional download dir 7 | if [ -z "$1" ] 8 | then 9 | # navigate to ~/data 10 | echo "navigating to ~/data/ ..." 11 | mkdir -p ~/data 12 | cd ~/data/ 13 | else 14 | # check if is valid directory 15 | if [ ! -d $1 ]; then 16 | echo $1 "is not a valid directory" 17 | exit 0 18 | fi 19 | echo "navigating to" $1 "..." 20 | cd $1 21 | fi 22 | 23 | echo "Downloading VOC2007 trainval ..." 24 | # Download the data. 25 | curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar 26 | echo "Downloading VOC2007 test data ..." 27 | curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar 28 | echo "Done downloading." 29 | 30 | # Extract data 31 | echo "Extracting trainval ..." 32 | tar -xvf VOCtrainval_06-Nov-2007.tar 33 | echo "Extracting test ..." 
34 | tar -xvf VOCtest_06-Nov-2007.tar 35 | echo "removing tars ..." 36 | rm VOCtrainval_06-Nov-2007.tar 37 | rm VOCtest_06-Nov-2007.tar 38 | 39 | end=`date +%s` 40 | runtime=$((end-start)) 41 | 42 | echo "Completed in" $runtime "seconds" -------------------------------------------------------------------------------- /datasets/scripts/VOC2012.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Ellis Brown 3 | 4 | start=`date +%s` 5 | 6 | # handle optional download dir 7 | if [ -z "$1" ] 8 | then 9 | # navigate to ~/data 10 | echo "navigating to ~/data/ ..." 11 | mkdir -p ~/data 12 | cd ~/data/ 13 | else 14 | # check if is valid directory 15 | if [ ! -d $1 ]; then 16 | echo $1 "is not a valid directory" 17 | exit 0 18 | fi 19 | echo "navigating to" $1 "..." 20 | cd $1 21 | fi 22 | 23 | echo "Downloading VOC2012 trainval ..." 24 | # Download the data. 25 | curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar 26 | echo "Done downloading." 27 | 28 | 29 | # Extract data 30 | echo "Extracting trainval ..." 31 | tar -xvf VOCtrainval_11-May-2012.tar 32 | echo "removing tar ..." 33 | rm VOCtrainval_11-May-2012.tar 34 | 35 | end=`date +%s` 36 | runtime=$((end-start)) 37 | 38 | echo "Completed in" $runtime "seconds" -------------------------------------------------------------------------------- /datasets/visual_aug.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | from augmentation import get_augumentation 3 | from voc0712 import VOCDetection 4 | import matplotlib.pyplot as plt 5 | EFFICIENTDET = { 6 | 'efficientdet-d0': {'input_size': 512, 7 | 'backbone': 'B0', 8 | 'W_bifpn': 64, 9 | 'D_bifpn': 2, 10 | 'D_class': 3}, 11 | 'efficientdet-d1': {'input_size': 640, 12 | 'backbone': 'B1', 13 | 'W_bifpn': 88, 14 | 'D_bifpn': 3, 15 | 'D_class': 3}, 16 | 'efficientdet-d2': {'input_size': 768, 17 | 'backbone': 'B2', 18 | 'W_bifpn': 112, 19 | 'D_bifpn': 4, 20 | 'D_class': 3}, 21 | } 22 | 23 | 24 | # Functions to visualize bounding boxes and class labels on an image. 
25 | # Based on https://github.com/facebookresearch/Detectron/blob/master/detectron/utils/vis.py 26 | 27 | BOX_COLOR = (255, 0, 0) 28 | TEXT_COLOR = (255, 255, 255) 29 | 30 | 31 | def visualize_bbox(img, bbox, class_id, class_idx_to_name, color=BOX_COLOR, thickness=2): 32 | x_min, y_min, x_max, y_max = bbox 33 | x_min, x_max, y_min, y_max = int(x_min), int(x_max), int(y_min), int(y_max) 34 | cv2.rectangle(img, (x_min, y_min), (x_max, y_max), 35 | color=color, thickness=thickness) 36 | # class_name = class_idx_to_name[class_id] 37 | # ((text_width, text_height), _) = cv2.getTextSize(class_name, cv2.FONT_HERSHEY_SIMPLEX, 0.35, 1) 38 | # cv2.rectangle(img, (x_min, y_min - int(1.3 * text_height)), (x_min + text_width, y_min), BOX_COLOR, -1) 39 | # cv2.putText(img, class_name, (x_min, y_min - int(0.3 * text_height)), cv2.FONT_HERSHEY_SIMPLEX, 0.35,TEXT_COLOR, lineType=cv2.LINE_AA) 40 | return img 41 | 42 | 43 | def visualize(annotations, category_id_to_name): 44 | img = annotations['image'].copy() 45 | for idx, bbox in enumerate(annotations['bboxes']): 46 | img = visualize_bbox( 47 | img, bbox, annotations['category_id'][idx], category_id_to_name) 48 | # plt.figure(figsize=(12, 12)) 49 | # plt.imshow(img) 50 | return img 51 | 52 | 53 | dataset_root = '/root/data/VOCdevkit' 54 | network = 'efficientdet-d0' 55 | dataset = VOCDetection(root=dataset_root, 56 | transform=get_augumentation(phase='train', width=EFFICIENTDET[network]['input_size'], height=EFFICIENTDET[network]['input_size'])) 57 | 58 | 59 | def visual_data(data, name): 60 | img = data['image'] 61 | bboxes = data['bboxes'] 62 | annotations = {'image': data['image'], 'bboxes': data['bboxes'], 'category_id': range( 63 | len(data['bboxes']))} 64 | category_id_to_name = {v: v for v in range(len(data['bboxes']))} 65 | 66 | img = visualize(annotations, category_id_to_name) 67 | cv2.imwrite(name, img) 68 | 69 | 70 | for i in range(20, 25): 71 | visual_data(dataset[i], "name"+str(i)+".png") 72 | -------------------------------------------------------------------------------- /datasets/voc0712.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | import sys 3 | import torch 4 | import torch.utils.data as data 5 | import cv2 6 | import numpy as np 7 | if sys.version_info[0] == 2: 8 | import xml.etree.cElementTree as ET 9 | else: 10 | import xml.etree.ElementTree as ET 11 | 12 | VOC_CLASSES = ( # always index 0 13 | 'aeroplane', 'bicycle', 'bird', 'boat', 14 | 'bottle', 'bus', 'car', 'cat', 'chair', 15 | 'cow', 'diningtable', 'dog', 'horse', 16 | 'motorbike', 'person', 'pottedplant', 17 | 'sheep', 'sofa', 'train', 'tvmonitor') 18 | 19 | # note: if you used our download scripts, this should be right 20 | VOC_ROOT = osp.join('/home/toandm2', "data/VOCdevkit/") 21 | 22 | 23 | class VOCAnnotationTransform(object): 24 | """Transforms a VOC annotation into a Tensor of bbox coords and label index 25 | Initilized with a dictionary lookup of classnames to indexes 26 | Arguments: 27 | class_to_ind (dict, optional): dictionary lookup of classnames -> indexes 28 | (default: alphabetic indexing of VOC's 20 classes) 29 | keep_difficult (bool, optional): keep difficult instances or not 30 | (default: False) 31 | height (int): height 32 | width (int): width 33 | """ 34 | 35 | def __init__(self, class_to_ind=None, keep_difficult=False): 36 | self.class_to_ind = class_to_ind or dict( 37 | zip(VOC_CLASSES, range(len(VOC_CLASSES)))) 38 | self.keep_difficult = keep_difficult 39 | 40 | def __call__(self, 
target, width, height): 41 | """ 42 | Arguments: 43 | target (annotation) : the target annotation to be made usable 44 | will be an ET.Element 45 | Returns: 46 | a list containing lists of bounding boxes [bbox coords, class name] 47 | """ 48 | res = [] 49 | for obj in target.iter('object'): 50 | difficult = int(obj.find('difficult').text) == 1 51 | if not self.keep_difficult and difficult: 52 | continue 53 | name = obj.find('name').text.lower().strip() 54 | bbox = obj.find('bndbox') 55 | 56 | pts = ['xmin', 'ymin', 'xmax', 'ymax'] 57 | bndbox = [] 58 | for i, pt in enumerate(pts): 59 | cur_pt = float(bbox.find(pt).text) - 1 60 | # scale height or width 61 | # cur_pt = cur_pt / width if i % 2 == 0 else cur_pt / height 62 | bndbox.append(cur_pt) 63 | label_idx = self.class_to_ind[name] 64 | bndbox.append(label_idx) 65 | res += [bndbox] # [xmin, ymin, xmax, ymax, label_ind] 66 | # img_id = target.find('filename').text[:-4] 67 | 68 | return res # [[xmin, ymin, xmax, ymax, label_ind], ... ] 69 | 70 | 71 | class VOCDetection(data.Dataset): 72 | """VOC Detection Dataset Object 73 | input is image, target is annotation 74 | Arguments: 75 | root (string): filepath to VOCdevkit folder. 76 | image_set (string): imageset to use (eg. 'train', 'val', 'test') 77 | transform (callable, optional): transformation to perform on the 78 | input image 79 | target_transform (callable, optional): transformation to perform on the 80 | target `annotation` 81 | (eg: take in caption string, return tensor of word indices) 82 | dataset_name (string, optional): which dataset to load 83 | (default: 'VOC2007') 84 | """ 85 | 86 | def __init__(self, root, 87 | image_sets=[('2007', 'trainval'), ('2012', 'trainval')], 88 | transform=None, target_transform=VOCAnnotationTransform(), 89 | dataset_name='VOC0712'): 90 | self.root = root 91 | self.image_set = image_sets 92 | self.transform = transform 93 | self.target_transform = target_transform 94 | self.name = dataset_name 95 | self._annopath = osp.join('%s', 'Annotations', '%s.xml') 96 | self._imgpath = osp.join('%s', 'JPEGImages', '%s.jpg') 97 | self.ids = list() 98 | for (year, name) in image_sets: 99 | rootpath = osp.join(self.root, 'VOC' + year) 100 | for line in open(osp.join(rootpath, 'ImageSets', 'Main', name + '.txt')): 101 | self.ids.append((rootpath, line.strip())) 102 | 103 | def __getitem__(self, index): 104 | img_id = self.ids[index] 105 | 106 | target = ET.parse(self._annopath % img_id).getroot() 107 | img = cv2.imread(self._imgpath % img_id) 108 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 109 | img = img.astype(np.float32)/255. 
110 | height, width, channels = img.shape 111 | 112 | if self.target_transform is not None: 113 | target = self.target_transform(target, width, height) 114 | target = np.array(target) 115 | sample = {'img': img, 'annot': target} 116 | if self.transform is not None: 117 | sample = self.transform(sample) 118 | return sample 119 | 120 | bbox = target[:, :4] 121 | labels = target[:, 4] 122 | 123 | if self.transform is not None: 124 | annotation = {'image': img, 'bboxes': bbox, 'category_id': labels} 125 | augmentation = self.transform(**annotation) 126 | img = augmentation['image'] 127 | bbox = augmentation['bboxes'] 128 | labels = augmentation['category_id'] 129 | return {'image': img, 'bboxes': bbox, 'category_id': labels} 130 | 131 | def __len__(self): 132 | return len(self.ids) 133 | 134 | def num_classes(self): 135 | return len(VOC_CLASSES) 136 | 137 | def label_to_name(self, label): 138 | return VOC_CLASSES[label] 139 | 140 | def load_annotations(self, index): 141 | img_id = self.ids[index] 142 | anno = ET.parse(self._annopath % img_id).getroot() 143 | gt = self.target_transform(anno, 1, 1) 144 | gt = np.array(gt) 145 | return gt 146 | -------------------------------------------------------------------------------- /demo.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import cv2 3 | from PIL import Image 4 | import matplotlib.pyplot as plt 5 | from models import EfficientDet 6 | from torchvision import transforms 7 | import numpy as np 8 | import skimage 9 | from datasets import get_augumentation, VOC_CLASSES 10 | from timeit import default_timer as timer 11 | import argparse 12 | import copy 13 | from utils import vis_bbox, EFFICIENTDET 14 | 15 | parser = argparse.ArgumentParser(description='EfficientDet') 16 | 17 | parser.add_argument('-n', '--network', default='efficientdet-d0', 18 | help='efficientdet-[d0, d1, ..]') 19 | parser.add_argument('-s', '--score', default=True, 20 | action="store_true", help='Show score') 21 | parser.add_argument('-t', '--threshold', default=0.6, 22 | type=float, help='Visualization threshold') 23 | parser.add_argument('-it', '--iou_threshold', default=0.6, 24 | type=float, help='Visualization threshold') 25 | parser.add_argument('-w', '--weight', default='./weights/voc0712.pth', 26 | type=str, help='Weight model path') 27 | parser.add_argument('-c', '--cam', 28 | action="store_true", help='Use camera') 29 | parser.add_argument('-f', '--file_name', default='pic.jpg', 30 | help='Image path') 31 | parser.add_argument('--num_class', default=21, type=int, 32 | help='Number of class used in model') 33 | args = parser.parse_args() 34 | 35 | 36 | class Detect(object): 37 | """ 38 | dir_name: Folder or image_file 39 | """ 40 | 41 | def __init__(self, weights, num_class=21, network='efficientdet-d0', size_image=(512, 512)): 42 | super(Detect, self).__init__() 43 | self.weights = weights 44 | self.size_image = size_image 45 | self.device = torch.device( 46 | "cuda:0" if torch.cuda.is_available() else 'cpu') 47 | self.transform = get_augumentation(phase='test') 48 | if(self.weights is not None): 49 | print('Load pretrained Model') 50 | checkpoint = torch.load( 51 | self.weights, map_location=lambda storage, loc: storage) 52 | params = checkpoint['parser'] 53 | num_class = params.num_class 54 | network = params.network 55 | 56 | self.model = EfficientDet(num_classes=num_class, 57 | network=network, 58 | W_bifpn=EFFICIENTDET[network]['W_bifpn'], 59 | D_bifpn=EFFICIENTDET[network]['D_bifpn'], 60 | 
D_class=EFFICIENTDET[network]['D_class'], 61 | is_training=False 62 | ) 63 | 64 | if(self.weights is not None): 65 | state_dict = checkpoint['state_dict'] 66 | self.model.load_state_dict(state_dict) 67 | if torch.cuda.is_available(): 68 | self.model = self.model.cuda() 69 | self.model.eval() 70 | 71 | def process(self, file_name=None, img=None, show=False): 72 | if file_name is not None: 73 | img = cv2.imread(file_name) 74 | origin_img = copy.deepcopy(img) 75 | augmentation = self.transform(image=img) 76 | img = augmentation['image'] 77 | img = img.to(self.device) 78 | img = img.unsqueeze(0) 79 | 80 | with torch.no_grad(): 81 | scores, classification, transformed_anchors = self.model(img) 82 | bboxes = list() 83 | labels = list() 84 | bbox_scores = list() 85 | colors = list() 86 | for j in range(scores.shape[0]): 87 | bbox = transformed_anchors[[j], :][0].data.cpu().numpy() 88 | x1 = int(bbox[0]*origin_img.shape[1]/self.size_image[1]) 89 | y1 = int(bbox[1]*origin_img.shape[0]/self.size_image[0]) 90 | x2 = int(bbox[2]*origin_img.shape[1]/self.size_image[1]) 91 | y2 = int(bbox[3]*origin_img.shape[0]/self.size_image[0]) 92 | bboxes.append([x1, y1, x2, y2]) 93 | label_name = VOC_CLASSES[int(classification[[j]])] 94 | labels.append(label_name) 95 | 96 | if(args.cam): 97 | cv2.rectangle(origin_img, (x1, y1), 98 | (x2, y2), (179, 255, 179), 2, 1) 99 | if args.score: 100 | score = np.around( 101 | scores[[j]].cpu().numpy(), decimals=2) * 100 102 | if(args.cam): 103 | labelSize, baseLine = cv2.getTextSize('{} {}'.format( 104 | label_name, int(score)), cv2.FONT_HERSHEY_SIMPLEX, 0.8, 2) 105 | cv2.rectangle( 106 | origin_img, (x1, y1-labelSize[1]), (x1+labelSize[0], y1+baseLine), (223, 128, 255), cv2.FILLED) 107 | cv2.putText( 108 | origin_img, '{} {}'.format(label_name, int(score)), 109 | (x1, y1), cv2.FONT_HERSHEY_SIMPLEX, 110 | 0.8, (0, 0, 0), 2 111 | ) 112 | bbox_scores.append(int(score)) 113 | else: 114 | if(args.cam): 115 | labelSize, baseLine = cv2.getTextSize('{}'.format( 116 | label_name), cv2.FONT_HERSHEY_SIMPLEX, 0.8, 2) 117 | cv2.rectangle( 118 | origin_img, (x1, y1-labelSize[1]), (x1+labelSize[0], y1+baseLine), (0, 102, 255), cv2.FILLED) 119 | cv2.putText( 120 | origin_img, '{} {}'.format(label_name, int(score)), 121 | (x1, y1), cv2.FONT_HERSHEY_SIMPLEX, 122 | 0.8, (0, 0, 0), 2 123 | ) 124 | if show: 125 | fig, ax = vis_bbox(img=origin_img, bbox=bboxes, 126 | label=labels, score=bbox_scores) 127 | fig.savefig('./docs/demo.png') 128 | plt.show() 129 | else: 130 | return origin_img 131 | 132 | def camera(self): 133 | cap = cv2.VideoCapture(0) 134 | if not cap.isOpened(): 135 | print("Unable to open camera") 136 | exit(-1) 137 | count_tfps = 1 138 | accum_time = 0 139 | curr_fps = 0 140 | fps = "FPS: ??" 
141 | prev_time = timer() 142 | while True: 143 | res, img = cap.read() 144 | curr_time = timer() 145 | exec_time = curr_time - prev_time 146 | prev_time = curr_time 147 | accum_time = accum_time + exec_time 148 | curr_fps = curr_fps + 1 149 | 150 | if accum_time > 1: 151 | accum_time = accum_time - 1 152 | fps = curr_fps 153 | curr_fps = 0 154 | if res: 155 | show_image = self.process(img=img) 156 | cv2.putText( 157 | show_image, "FPS: " + str(fps), (10, 20), 158 | cv2.FONT_HERSHEY_SIMPLEX, 0.9, (204, 51, 51), 2 159 | ) 160 | 161 | cv2.imshow("Detection", show_image) 162 | k = cv2.waitKey(1) 163 | if k == 27: 164 | break 165 | else: 166 | print("Unable to read image") 167 | exit(-1) 168 | count_tfps += 1 169 | cap.release() 170 | cv2.destroyAllWindows() 171 | 172 | 173 | if __name__ == '__main__': 174 | detect = Detect(weights=args.weight) 175 | print('cam: ', args.cam) 176 | if args.cam: 177 | detect.camera() 178 | else: 179 | detect.process(file_name=args.file_name, show=True) 180 | -------------------------------------------------------------------------------- /docs/arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toandaominh1997/EfficientDet.Pytorch/fbe56e58c9a2749520303d2d380427e5f01305ba/docs/arch.png -------------------------------------------------------------------------------- /docs/compare.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toandaominh1997/EfficientDet.Pytorch/fbe56e58c9a2749520303d2d380427e5f01305ba/docs/compare.png -------------------------------------------------------------------------------- /docs/demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toandaominh1997/EfficientDet.Pytorch/fbe56e58c9a2749520303d2d380427e5f01305ba/docs/demo.png -------------------------------------------------------------------------------- /docs/output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toandaominh1997/EfficientDet.Pytorch/fbe56e58c9a2749520303d2d380427e5f01305ba/docs/output.png -------------------------------------------------------------------------------- /docs/performance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toandaominh1997/EfficientDet.Pytorch/fbe56e58c9a2749520303d2d380427e5f01305ba/docs/performance.png -------------------------------------------------------------------------------- /docs/pytoan.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toandaominh1997/EfficientDet.Pytorch/fbe56e58c9a2749520303d2d380427e5f01305ba/docs/pytoan.gif -------------------------------------------------------------------------------- /eval.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import numpy as np 5 | import torch 6 | from torch.utils.data import DataLoader 7 | from torchvision import transforms 8 | from tqdm import tqdm 9 | from pycocotools.cocoeval import COCOeval 10 | import json 11 | 12 | from datasets import (Augmenter, CocoDataset, Normalizer, 13 | Resizer, VOCDetection, collater, detection_collate, 14 | get_augumentation) 15 | from models.efficientdet import EfficientDet 16 | from utils import EFFICIENTDET, get_state_dict 17 | 18 | 19 | def 
compute_overlap(a, b): 20 | """ 21 | Parameters 22 | ---------- 23 | a: (N, 4) ndarray of float 24 | b: (K, 4) ndarray of float 25 | Returns 26 | ------- 27 | overlaps: (N, K) ndarray of overlap between boxes and query_boxes 28 | """ 29 | area = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1]) 30 | 31 | iw = np.minimum(np.expand_dims( 32 | a[:, 2], axis=1), b[:, 2]) - np.maximum(np.expand_dims(a[:, 0], 1), b[:, 0]) 33 | ih = np.minimum(np.expand_dims( 34 | a[:, 3], axis=1), b[:, 3]) - np.maximum(np.expand_dims(a[:, 1], 1), b[:, 1]) 35 | 36 | iw = np.maximum(iw, 0) 37 | ih = np.maximum(ih, 0) 38 | 39 | ua = np.expand_dims((a[:, 2] - a[:, 0]) * 40 | (a[:, 3] - a[:, 1]), axis=1) + area - iw * ih 41 | 42 | ua = np.maximum(ua, np.finfo(float).eps) 43 | 44 | intersection = iw * ih 45 | 46 | return intersection / ua 47 | 48 | 49 | def _compute_ap(recall, precision): 50 | """ Compute the average precision, given the recall and precision curves. 51 | Code originally from https://github.com/rbgirshick/py-faster-rcnn. 52 | # Arguments 53 | recall: The recall curve (list). 54 | precision: The precision curve (list). 55 | # Returns 56 | The average precision as computed in py-faster-rcnn. 57 | """ 58 | # correct AP calculation 59 | # first append sentinel values at the end 60 | mrec = np.concatenate(([0.], recall, [1.])) 61 | mpre = np.concatenate(([0.], precision, [0.])) 62 | 63 | # compute the precision envelope 64 | for i in range(mpre.size - 1, 0, -1): 65 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 66 | 67 | # to calculate area under PR curve, look for points 68 | # where X axis (recall) changes value 69 | i = np.where(mrec[1:] != mrec[:-1])[0] 70 | 71 | # and sum (\Delta recall) * prec 72 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 73 | return ap 74 | 75 | 76 | def _get_detections(dataset, retinanet, score_threshold=0.05, max_detections=100, save_path=None): 77 | """ Get the detections from the retinanet using the generator. 78 | The result is a list of lists such that the size is: 79 | all_detections[num_images][num_classes] = detections[num_detections, 4 + num_classes] 80 | # Arguments 81 | dataset : The generator used to run images through the retinanet. 82 | retinanet : The retinanet to run on the images. 83 | score_threshold : The score confidence threshold to use. 84 | max_detections : The maximum number of detections to use per image. 85 | save_path : The path to save the images with visualized detections to. 86 | # Returns 87 | A list of lists containing the detections for each image in the generator. 
88 | """ 89 | all_detections = [[None for i in range( 90 | dataset.num_classes())] for j in range(len(dataset))] 91 | 92 | retinanet.eval() 93 | 94 | with torch.no_grad(): 95 | 96 | for index in range(len(dataset)): 97 | data = dataset[index] 98 | scale = data['scale'] 99 | 100 | # run network 101 | scores, labels, boxes = retinanet(data['img'].permute( 102 | 2, 0, 1).cuda().float().unsqueeze(dim=0)) 103 | scores = scores.cpu().numpy() 104 | labels = labels.cpu().numpy() 105 | boxes = boxes.cpu().numpy() 106 | 107 | # correct boxes for image scale 108 | boxes /= scale 109 | 110 | # select indices which have a score above the threshold 111 | indices = np.where(scores > score_threshold)[0] 112 | if indices.shape[0] > 0: 113 | # select those scores 114 | scores = scores[indices] 115 | 116 | # find the order with which to sort the scores 117 | scores_sort = np.argsort(-scores)[:max_detections] 118 | 119 | # select detections 120 | image_boxes = boxes[indices[scores_sort], :] 121 | image_scores = scores[scores_sort] 122 | image_labels = labels[indices[scores_sort]] 123 | image_detections = np.concatenate([image_boxes, np.expand_dims( 124 | image_scores, axis=1), np.expand_dims(image_labels, axis=1)], axis=1) 125 | 126 | # copy detections to all_detections 127 | for label in range(dataset.num_classes()): 128 | all_detections[index][label] = image_detections[image_detections[:, -1] == label, :-1] 129 | else: 130 | # copy detections to all_detections 131 | for label in range(dataset.num_classes()): 132 | all_detections[index][label] = np.zeros((0, 5)) 133 | 134 | print('{}/{}'.format(index + 1, len(dataset)), end='\r') 135 | 136 | return all_detections 137 | 138 | 139 | def _get_annotations(generator): 140 | """ Get the ground truth annotations from the generator. 141 | The result is a list of lists such that the size is: 142 | all_detections[num_images][num_classes] = annotations[num_detections, 5] 143 | # Arguments 144 | generator : The generator used to retrieve ground truth annotations. 145 | # Returns 146 | A list of lists containing the annotations for each image in the generator. 147 | """ 148 | all_annotations = [[None for i in range( 149 | generator.num_classes())] for j in range(len(generator))] 150 | 151 | for i in range(len(generator)): 152 | # load the annotations 153 | annotations = generator.load_annotations(i) 154 | 155 | # copy detections to all_annotations 156 | for label in range(generator.num_classes()): 157 | all_annotations[i][label] = annotations[annotations[:, 4] 158 | == label, :4].copy() 159 | 160 | print('{}/{}'.format(i + 1, len(generator)), end='\r') 161 | 162 | return all_annotations 163 | 164 | 165 | def evaluate( 166 | generator, 167 | retinanet, 168 | iou_threshold=0.5, 169 | score_threshold=0.05, 170 | max_detections=100, 171 | save_path=None 172 | ): 173 | """ Evaluate a given dataset using a given retinanet. 174 | # Arguments 175 | generator : The generator that represents the dataset to evaluate. 176 | retinanet : The retinanet to evaluate. 177 | iou_threshold : The threshold used to consider when a detection is positive or negative. 178 | score_threshold : The score confidence threshold to use for detections. 179 | max_detections : The maximum number of detections to use per image. 180 | save_path : The path to save images with visualized detections to. 181 | # Returns 182 | A dict mapping class names to mAP scores. 
183 | """ 184 | 185 | # gather all detections and annotations 186 | 187 | all_detections = _get_detections( 188 | generator, retinanet, score_threshold=score_threshold, max_detections=max_detections, save_path=save_path) 189 | all_annotations = _get_annotations(generator) 190 | 191 | average_precisions = {} 192 | 193 | for label in range(generator.num_classes()): 194 | false_positives = np.zeros((0,)) 195 | true_positives = np.zeros((0,)) 196 | scores = np.zeros((0,)) 197 | num_annotations = 0.0 198 | 199 | for i in range(len(generator)): 200 | detections = all_detections[i][label] 201 | annotations = all_annotations[i][label] 202 | num_annotations += annotations.shape[0] 203 | detected_annotations = [] 204 | 205 | for d in detections: 206 | scores = np.append(scores, d[4]) 207 | 208 | if annotations.shape[0] == 0: 209 | false_positives = np.append(false_positives, 1) 210 | true_positives = np.append(true_positives, 0) 211 | continue 212 | 213 | overlaps = compute_overlap( 214 | np.expand_dims(d, axis=0), annotations) 215 | assigned_annotation = np.argmax(overlaps, axis=1) 216 | max_overlap = overlaps[0, assigned_annotation] 217 | 218 | if max_overlap >= iou_threshold and assigned_annotation not in detected_annotations: 219 | false_positives = np.append(false_positives, 0) 220 | true_positives = np.append(true_positives, 1) 221 | detected_annotations.append(assigned_annotation) 222 | else: 223 | false_positives = np.append(false_positives, 1) 224 | true_positives = np.append(true_positives, 0) 225 | 226 | # no annotations -> AP for this class is 0 (is this correct?) 227 | if num_annotations == 0: 228 | average_precisions[label] = 0, 0 229 | continue 230 | 231 | # sort by score 232 | indices = np.argsort(-scores) 233 | false_positives = false_positives[indices] 234 | true_positives = true_positives[indices] 235 | 236 | # compute false positives and true positives 237 | false_positives = np.cumsum(false_positives) 238 | true_positives = np.cumsum(true_positives) 239 | 240 | # compute recall and precision 241 | recall = true_positives / num_annotations 242 | precision = true_positives / \ 243 | np.maximum(true_positives + false_positives, 244 | np.finfo(np.float64).eps) 245 | 246 | # compute average precision 247 | average_precision = _compute_ap(recall, precision) 248 | average_precisions[label] = average_precision, num_annotations 249 | 250 | print('\nmAP:') 251 | avg_mAP = [] 252 | for label in range(generator.num_classes()): 253 | label_name = generator.label_to_name(label) 254 | print('{}: {}'.format(label_name, average_precisions[label][0])) 255 | avg_mAP.append(average_precisions[label][0]) 256 | print('avg mAP: {}'.format(np.mean(avg_mAP))) 257 | return np.mean(avg_mAP), average_precisions 258 | 259 | 260 | def evaluate_coco(dataset, model, threshold=0.05): 261 | 262 | model.eval() 263 | 264 | with torch.no_grad(): 265 | 266 | # start collecting results 267 | results = [] 268 | image_ids = [] 269 | 270 | for index in range(len(dataset)): 271 | data = dataset[index] 272 | scale = data['scale'] 273 | 274 | # run network 275 | scores, labels, boxes = model(data['img'].permute( 276 | 2, 0, 1).cuda().float().unsqueeze(dim=0)) 277 | scores = scores.cpu() 278 | labels = labels.cpu() 279 | boxes = boxes.cpu() 280 | 281 | # correct boxes for image scale 282 | boxes /= scale 283 | 284 | if boxes.shape[0] > 0: 285 | # change to (x, y, w, h) (MS COCO standard) 286 | boxes[:, 2] -= boxes[:, 0] 287 | boxes[:, 3] -= boxes[:, 1] 288 | 289 | # compute predicted labels and scores 290 | # for box, 
score, label in zip(boxes[0], scores[0], labels[0]): 291 | for box_id in range(boxes.shape[0]): 292 | score = float(scores[box_id]) 293 | label = int(labels[box_id]) 294 | box = boxes[box_id, :] 295 | 296 | # scores are sorted, so we can break 297 | if score < threshold: 298 | break 299 | 300 | # append detection for each positively labeled class 301 | image_result = { 302 | 'image_id': dataset.image_ids[index], 303 | 'category_id': dataset.label_to_coco_label(label), 304 | 'score': float(score), 305 | 'bbox': box.tolist(), 306 | } 307 | 308 | # append detection to results 309 | results.append(image_result) 310 | 311 | # append image to list of processed images 312 | image_ids.append(dataset.image_ids[index]) 313 | 314 | # print progress 315 | print('{}/{}'.format(index, len(dataset)), end='\r') 316 | 317 | if not len(results): 318 | return 319 | 320 | # write output 321 | json.dump(results, open('{}_bbox_results.json'.format( 322 | dataset.set_name), 'w'), indent=4) 323 | 324 | # load results in COCO evaluation tool 325 | coco_true = dataset.coco 326 | coco_pred = coco_true.loadRes( 327 | '{}_bbox_results.json'.format(dataset.set_name)) 328 | 329 | # run COCO evaluation 330 | coco_eval = COCOeval(coco_true, coco_pred, 'bbox') 331 | coco_eval.params.imgIds = image_ids 332 | coco_eval.evaluate() 333 | coco_eval.accumulate() 334 | coco_eval.summarize() 335 | 336 | model.train() 337 | 338 | return 339 | 340 | 341 | if __name__ == '__main__': 342 | parser = argparse.ArgumentParser( 343 | description='EfficientDet Training With Pytorch') 344 | train_set = parser.add_mutually_exclusive_group() 345 | parser.add_argument('--dataset', default='VOC', choices=['VOC', 'COCO'], 346 | type=str, help='VOC or COCO') 347 | parser.add_argument('--dataset_root', default='/root/data/VOCdevkit/', 348 | help='Dataset root directory path [/root/data/VOCdevkit/, /root/data/coco/]') 349 | parser.add_argument('-t', '--threshold', default=0.4, 350 | type=float, help='Visualization threshold') 351 | parser.add_argument('-it', '--iou_threshold', default=0.5, 352 | type=float, help='Visualization threshold') 353 | parser.add_argument('--weight', default='./checkpoint_VOC_efficientdet-d0_248.pth', type=str, 354 | help='Checkpoint state_dict file to resume training from') 355 | args = parser.parse_args() 356 | 357 | if(args.weight is not None): 358 | resume_path = str(args.weight) 359 | print("Loading checkpoint: {} ...".format(resume_path)) 360 | checkpoint = torch.load( 361 | args.weight, map_location=lambda storage, loc: storage) 362 | params = checkpoint['parser'] 363 | args.num_class = params.num_class 364 | args.network = params.network 365 | model = EfficientDet( 366 | num_classes=args.num_class, 367 | network=args.network, 368 | W_bifpn=EFFICIENTDET[args.network]['W_bifpn'], 369 | D_bifpn=EFFICIENTDET[args.network]['D_bifpn'], 370 | D_class=EFFICIENTDET[args.network]['D_class'], 371 | is_training=False, 372 | threshold=args.threshold, 373 | iou_threshold=args.iou_threshold) 374 | model.load_state_dict(checkpoint['state_dict']) 375 | model = model.cuda() 376 | if(args.dataset == 'VOC'): 377 | valid_dataset = VOCDetection(root=args.dataset_root, image_sets=[('2007', 'test')], 378 | transform=transforms.Compose([Normalizer(), Resizer()])) 379 | evaluate(valid_dataset, model) 380 | else: 381 | valid_dataset = CocoDataset(root_dir=args.dataset_root, set_name='val2017', 382 | transform=transforms.Compose([Normalizer(), Resizer()])) 383 | evaluate_coco(valid_dataset, model) 384 | 
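# Usage sketch (illustrative; flags and defaults mirror the argparse setup above):
#   python eval.py --dataset VOC --dataset_root /root/data/VOCdevkit/ \
#       --weight ./checkpoint_VOC_efficientdet-d0_248.pth -t 0.4 -it 0.5
#   python eval.py --dataset COCO --dataset_root /root/data/coco/ --weight <checkpoint.pth>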
-------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | from .efficientdet import EfficientDet -------------------------------------------------------------------------------- /models/bifpn.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | 4 | 5 | from .module import ConvModule, xavier_init 6 | import torch 7 | 8 | 9 | class BIFPN(nn.Module): 10 | def __init__(self, 11 | in_channels, 12 | out_channels, 13 | num_outs, 14 | start_level=0, 15 | end_level=-1, 16 | stack=1, 17 | add_extra_convs=False, 18 | extra_convs_on_inputs=True, 19 | relu_before_extra_convs=False, 20 | no_norm_on_lateral=False, 21 | conv_cfg=None, 22 | norm_cfg=None, 23 | activation=None): 24 | super(BIFPN, self).__init__() 25 | assert isinstance(in_channels, list) 26 | self.in_channels = in_channels 27 | self.out_channels = out_channels 28 | self.num_ins = len(in_channels) 29 | self.num_outs = num_outs 30 | self.activation = activation 31 | self.relu_before_extra_convs = relu_before_extra_convs 32 | self.no_norm_on_lateral = no_norm_on_lateral 33 | self.stack = stack 34 | 35 | if end_level == -1: 36 | self.backbone_end_level = self.num_ins 37 | assert num_outs >= self.num_ins - start_level 38 | else: 39 | # if end_level < inputs, no extra level is allowed 40 | self.backbone_end_level = end_level 41 | assert end_level <= len(in_channels) 42 | assert num_outs == end_level - start_level 43 | self.start_level = start_level 44 | self.end_level = end_level 45 | self.add_extra_convs = add_extra_convs 46 | self.extra_convs_on_inputs = extra_convs_on_inputs 47 | 48 | self.lateral_convs = nn.ModuleList() 49 | self.fpn_convs = nn.ModuleList() 50 | self.stack_bifpn_convs = nn.ModuleList() 51 | 52 | for i in range(self.start_level, self.backbone_end_level): 53 | l_conv = ConvModule( 54 | in_channels[i], 55 | out_channels, 56 | 1, 57 | conv_cfg=conv_cfg, 58 | norm_cfg=norm_cfg if not self.no_norm_on_lateral else None, 59 | activation=self.activation, 60 | inplace=False) 61 | self.lateral_convs.append(l_conv) 62 | 63 | for ii in range(stack): 64 | self.stack_bifpn_convs.append(BiFPNModule(channels=out_channels, 65 | levels=self.backbone_end_level-self.start_level, 66 | conv_cfg=conv_cfg, 67 | norm_cfg=norm_cfg, 68 | activation=activation)) 69 | # add extra conv layers (e.g., RetinaNet) 70 | extra_levels = num_outs - self.backbone_end_level + self.start_level 71 | if add_extra_convs and extra_levels >= 1: 72 | for i in range(extra_levels): 73 | if i == 0 and self.extra_convs_on_inputs: 74 | in_channels = self.in_channels[self.backbone_end_level - 1] 75 | else: 76 | in_channels = out_channels 77 | extra_fpn_conv = ConvModule( 78 | in_channels, 79 | out_channels, 80 | 3, 81 | stride=2, 82 | padding=1, 83 | conv_cfg=conv_cfg, 84 | norm_cfg=norm_cfg, 85 | activation=self.activation, 86 | inplace=False) 87 | self.fpn_convs.append(extra_fpn_conv) 88 | self.init_weights() 89 | 90 | # default init_weights for conv(msra) and norm in ConvModule 91 | def init_weights(self): 92 | for m in self.modules(): 93 | if isinstance(m, nn.Conv2d): 94 | xavier_init(m, distribution='uniform') 95 | 96 | def forward(self, inputs): 97 | assert len(inputs) == len(self.in_channels) 98 | 99 | # build laterals 100 | laterals = [ 101 | lateral_conv(inputs[i + self.start_level]) 102 | for i, lateral_conv in enumerate(self.lateral_convs) 103 | 
] 104 | 105 | # part 1: build top-down and down-top path with stack 106 | used_backbone_levels = len(laterals) 107 | for bifpn_module in self.stack_bifpn_convs: 108 | laterals = bifpn_module(laterals) 109 | outs = laterals 110 | # part 2: add extra levels 111 | if self.num_outs > len(outs): 112 | # use max pool to get more levels on top of outputs 113 | # (e.g., Faster R-CNN, Mask R-CNN) 114 | if not self.add_extra_convs: 115 | for i in range(self.num_outs - used_backbone_levels): 116 | outs.append(F.max_pool2d(outs[-1], 1, stride=2)) 117 | # add conv layers on top of original feature maps (RetinaNet) 118 | else: 119 | if self.extra_convs_on_inputs: 120 | orig = inputs[self.backbone_end_level - 1] 121 | outs.append(self.fpn_convs[0](orig)) 122 | else: 123 | outs.append(self.fpn_convs[0](outs[-1])) 124 | for i in range(1, self.num_outs - used_backbone_levels): 125 | if self.relu_before_extra_convs: 126 | outs.append(self.fpn_convs[i](F.relu(outs[-1]))) 127 | else: 128 | outs.append(self.fpn_convs[i](outs[-1])) 129 | return tuple(outs) 130 | 131 | 132 | class BiFPNModule(nn.Module): 133 | def __init__(self, 134 | channels, 135 | levels, 136 | init=0.5, 137 | conv_cfg=None, 138 | norm_cfg=None, 139 | activation=None, 140 | eps=0.0001): 141 | super(BiFPNModule, self).__init__() 142 | self.activation = activation 143 | self.eps = eps 144 | self.levels = levels 145 | self.bifpn_convs = nn.ModuleList() 146 | # weighted 147 | self.w1 = nn.Parameter(torch.Tensor(2, levels).fill_(init)) 148 | self.relu1 = nn.ReLU() 149 | self.w2 = nn.Parameter(torch.Tensor(3, levels - 2).fill_(init)) 150 | self.relu2 = nn.ReLU() 151 | for jj in range(2): 152 | for i in range(self.levels-1): # 1,2,3 153 | fpn_conv = nn.Sequential( 154 | ConvModule( 155 | channels, 156 | channels, 157 | 3, 158 | padding=1, 159 | conv_cfg=conv_cfg, 160 | norm_cfg=norm_cfg, 161 | activation=self.activation, 162 | inplace=False) 163 | ) 164 | self.bifpn_convs.append(fpn_conv) 165 | 166 | # default init_weights for conv(msra) and norm in ConvModule 167 | def init_weights(self): 168 | for m in self.modules(): 169 | if isinstance(m, nn.Conv2d): 170 | xavier_init(m, distribution='uniform') 171 | 172 | def forward(self, inputs): 173 | assert len(inputs) == self.levels 174 | # build top-down and down-top path with stack 175 | levels = self.levels 176 | # w relu 177 | w1 = self.relu1(self.w1) 178 | w1 /= torch.sum(w1, dim=0) + self.eps # normalize 179 | w2 = self.relu2(self.w2) 180 | w2 /= torch.sum(w2, dim=0) + self.eps # normalize 181 | # build top-down 182 | idx_bifpn = 0 183 | pathtd = inputs 184 | inputs_clone = [] 185 | for in_tensor in inputs: 186 | inputs_clone.append(in_tensor.clone()) 187 | 188 | for i in range(levels - 1, 0, -1): 189 | pathtd[i - 1] = (w1[0, i-1]*pathtd[i - 1] + w1[1, i-1]*F.interpolate( 190 | pathtd[i], scale_factor=2, mode='nearest'))/(w1[0, i-1] + w1[1, i-1] + self.eps) 191 | pathtd[i - 1] = self.bifpn_convs[idx_bifpn](pathtd[i - 1]) 192 | idx_bifpn = idx_bifpn + 1 193 | # build down-top 194 | for i in range(0, levels - 2, 1): 195 | pathtd[i + 1] = (w2[0, i] * pathtd[i + 1] + w2[1, i] * F.max_pool2d(pathtd[i], kernel_size=2) + 196 | w2[2, i] * inputs_clone[i + 1])/(w2[0, i] + w2[1, i] + w2[2, i] + self.eps) 197 | pathtd[i + 1] = self.bifpn_convs[idx_bifpn](pathtd[i + 1]) 198 | idx_bifpn = idx_bifpn + 1 199 | 200 | pathtd[levels - 1] = (w1[0, levels-1] * pathtd[levels - 1] + w1[1, levels-1] * F.max_pool2d( 201 | pathtd[levels - 2], kernel_size=2))/(w1[0, levels-1] + w1[1, levels-1] + self.eps) 202 | pathtd[levels - 
1] = self.bifpn_convs[idx_bifpn](pathtd[levels - 1]) 203 | return pathtd 204 | -------------------------------------------------------------------------------- /models/efficientdet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import math 4 | from models.efficientnet import EfficientNet 5 | from models.bifpn import BIFPN 6 | from .retinahead import RetinaHead 7 | from models.module import RegressionModel, ClassificationModel, Anchors, ClipBoxes, BBoxTransform 8 | from torchvision.ops import nms 9 | from .losses import FocalLoss 10 | MODEL_MAP = { 11 | 'efficientdet-d0': 'efficientnet-b0', 12 | 'efficientdet-d1': 'efficientnet-b1', 13 | 'efficientdet-d2': 'efficientnet-b2', 14 | 'efficientdet-d3': 'efficientnet-b3', 15 | 'efficientdet-d4': 'efficientnet-b4', 16 | 'efficientdet-d5': 'efficientnet-b5', 17 | 'efficientdet-d6': 'efficientnet-b6', 18 | 'efficientdet-d7': 'efficientnet-b6', 19 | } 20 | 21 | 22 | class EfficientDet(nn.Module): 23 | def __init__(self, 24 | num_classes, 25 | network='efficientdet-d0', 26 | D_bifpn=3, 27 | W_bifpn=88, 28 | D_class=3, 29 | is_training=True, 30 | threshold=0.01, 31 | iou_threshold=0.5): 32 | super(EfficientDet, self).__init__() 33 | self.backbone = EfficientNet.from_pretrained(MODEL_MAP[network]) 34 | self.is_training = is_training 35 | self.neck = BIFPN(in_channels=self.backbone.get_list_features()[-5:], 36 | out_channels=W_bifpn, 37 | stack=D_bifpn, 38 | num_outs=5) 39 | self.bbox_head = RetinaHead(num_classes=num_classes, 40 | in_channels=W_bifpn) 41 | 42 | self.anchors = Anchors() 43 | self.regressBoxes = BBoxTransform() 44 | self.clipBoxes = ClipBoxes() 45 | self.threshold = threshold 46 | self.iou_threshold = iou_threshold 47 | for m in self.modules(): 48 | if isinstance(m, nn.Conv2d): 49 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 50 | m.weight.data.normal_(0, math.sqrt(2. 
/ n)) 51 | elif isinstance(m, nn.BatchNorm2d): 52 | m.weight.data.fill_(1) 53 | m.bias.data.zero_() 54 | self.freeze_bn() 55 | self.criterion = FocalLoss() 56 | 57 | def forward(self, inputs): 58 | if self.is_training: 59 | inputs, annotations = inputs 60 | else: 61 | inputs = inputs 62 | x = self.extract_feat(inputs) 63 | outs = self.bbox_head(x) 64 | classification = torch.cat([out for out in outs[0]], dim=1) 65 | regression = torch.cat([out for out in outs[1]], dim=1) 66 | anchors = self.anchors(inputs) 67 | if self.is_training: 68 | return self.criterion(classification, regression, anchors, annotations) 69 | else: 70 | transformed_anchors = self.regressBoxes(anchors, regression) 71 | transformed_anchors = self.clipBoxes(transformed_anchors, inputs) 72 | scores = torch.max(classification, dim=2, keepdim=True)[0] 73 | scores_over_thresh = (scores > self.threshold)[0, :, 0] 74 | 75 | if scores_over_thresh.sum() == 0: 76 | print('No boxes to NMS') 77 | # no boxes to NMS, just return 78 | return [torch.zeros(0), torch.zeros(0), torch.zeros(0, 4)] 79 | classification = classification[:, scores_over_thresh, :] 80 | transformed_anchors = transformed_anchors[:, scores_over_thresh, :] 81 | scores = scores[:, scores_over_thresh, :] 82 | anchors_nms_idx = nms( 83 | transformed_anchors[0, :, :], scores[0, :, 0], iou_threshold=self.iou_threshold) 84 | nms_scores, nms_class = classification[0, anchors_nms_idx, :].max( 85 | dim=1) 86 | return [nms_scores, nms_class, transformed_anchors[0, anchors_nms_idx, :]] 87 | 88 | def freeze_bn(self): 89 | '''Freeze BatchNorm layers.''' 90 | for layer in self.modules(): 91 | if isinstance(layer, nn.BatchNorm2d): 92 | layer.eval() 93 | 94 | def extract_feat(self, img): 95 | """ 96 | Directly extract features from the backbone+neck 97 | """ 98 | x = self.backbone(img) 99 | x = self.neck(x[-5:]) 100 | return x 101 | -------------------------------------------------------------------------------- /models/efficientnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | 5 | from models.utils import ( 6 | round_filters, 7 | round_repeats, 8 | drop_connect, 9 | get_same_padding_conv2d, 10 | get_model_params, 11 | efficientnet_params, 12 | load_pretrained_weights, 13 | Swish, 14 | MemoryEfficientSwish, 15 | ) 16 | 17 | 18 | class MBConvBlock(nn.Module): 19 | """ 20 | Mobile Inverted Residual Bottleneck Block 21 | Args: 22 | block_args (namedtuple): BlockArgs, see above 23 | global_params (namedtuple): GlobalParam, see above 24 | Attributes: 25 | has_se (bool): Whether the block contains a Squeeze and Excitation layer. 
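    Example (illustrative sketch; `get_model_params` is the helper imported from models.utils,
    called here the same way `from_name` calls it):
        blocks_args, global_params = get_model_params('efficientnet-b0', {'num_classes': 1000})
        block = MBConvBlock(blocks_args[0], global_params)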
26 | """ 27 | 28 | def __init__(self, block_args, global_params): 29 | super().__init__() 30 | self._block_args = block_args 31 | self._bn_mom = 1 - global_params.batch_norm_momentum 32 | self._bn_eps = global_params.batch_norm_epsilon 33 | self.has_se = (self._block_args.se_ratio is not None) and ( 34 | 0 < self._block_args.se_ratio <= 1) 35 | self.id_skip = block_args.id_skip # skip connection and drop connect 36 | 37 | # Get static or dynamic convolution depending on image size 38 | Conv2d = get_same_padding_conv2d(image_size=global_params.image_size) 39 | 40 | # Expansion phase 41 | inp = self._block_args.input_filters # number of input channels 42 | oup = self._block_args.input_filters * \ 43 | self._block_args.expand_ratio # number of output channels 44 | if self._block_args.expand_ratio != 1: 45 | self._expand_conv = Conv2d( 46 | in_channels=inp, out_channels=oup, kernel_size=1, bias=False) 47 | self._bn0 = nn.BatchNorm2d( 48 | num_features=oup, momentum=self._bn_mom, eps=self._bn_eps) 49 | # Depthwise convolution phase 50 | k = self._block_args.kernel_size 51 | s = self._block_args.stride 52 | self._depthwise_conv = Conv2d( 53 | in_channels=oup, out_channels=oup, groups=oup, # groups makes it depthwise 54 | kernel_size=k, stride=s, bias=False) 55 | self._bn1 = nn.BatchNorm2d( 56 | num_features=oup, momentum=self._bn_mom, eps=self._bn_eps) 57 | 58 | # Squeeze and Excitation layer, if desired 59 | if self.has_se: 60 | num_squeezed_channels = max( 61 | 1, int(self._block_args.input_filters * self._block_args.se_ratio)) 62 | self._se_reduce = Conv2d( 63 | in_channels=oup, out_channels=num_squeezed_channels, kernel_size=1) 64 | self._se_expand = Conv2d( 65 | in_channels=num_squeezed_channels, out_channels=oup, kernel_size=1) 66 | 67 | # Output phase 68 | final_oup = self._block_args.output_filters 69 | self._project_conv = Conv2d( 70 | in_channels=oup, out_channels=final_oup, kernel_size=1, bias=False) 71 | self._bn2 = nn.BatchNorm2d( 72 | num_features=final_oup, momentum=self._bn_mom, eps=self._bn_eps) 73 | self._swish = MemoryEfficientSwish() 74 | 75 | def forward(self, inputs, drop_connect_rate=None): 76 | """ 77 | :param inputs: input tensor 78 | :param drop_connect_rate: drop connect rate (float, between 0 and 1) 79 | :return: output of block 80 | """ 81 | 82 | # Expansion and Depthwise Convolution 83 | x = inputs 84 | if self._block_args.expand_ratio != 1: 85 | x = self._swish(self._bn0(self._expand_conv(inputs))) 86 | 87 | x = self._swish(self._bn1(self._depthwise_conv(x))) 88 | 89 | # Squeeze and Excitation 90 | if self.has_se: 91 | x_squeezed = F.adaptive_avg_pool2d(x, 1) 92 | x_squeezed = self._se_expand( 93 | self._swish(self._se_reduce(x_squeezed))) 94 | x = torch.sigmoid(x_squeezed) * x 95 | 96 | x = self._bn2(self._project_conv(x)) 97 | 98 | # Skip connection and drop connect 99 | input_filters, output_filters = self._block_args.input_filters, self._block_args.output_filters 100 | if self.id_skip and self._block_args.stride == 1 and input_filters == output_filters: 101 | if drop_connect_rate: 102 | x = drop_connect(x, p=drop_connect_rate, 103 | training=self.training) 104 | x = x + inputs # skip connection 105 | return x 106 | 107 | def set_swish(self, memory_efficient=True): 108 | """Sets swish function as memory efficient (for training) or standard (for export)""" 109 | self._swish = MemoryEfficientSwish() if memory_efficient else Swish() 110 | 111 | 112 | class EfficientNet(nn.Module): 113 | """ 114 | An EfficientNet model. 
Most easily loaded with the .from_name or .from_pretrained methods 115 | Args: 116 | blocks_args (list): A list of BlockArgs to construct blocks 117 | global_params (namedtuple): A set of GlobalParams shared between blocks 118 | Example: 119 | model = EfficientNet.from_pretrained('efficientnet-b0') 120 | """ 121 | 122 | def __init__(self, blocks_args=None, global_params=None): 123 | super().__init__() 124 | assert isinstance(blocks_args, list), 'blocks_args should be a list' 125 | assert len(blocks_args) > 0, 'block args must be greater than 0' 126 | self._global_params = global_params 127 | self._blocks_args = blocks_args 128 | 129 | # Get static or dynamic convolution depending on image size 130 | Conv2d = get_same_padding_conv2d(image_size=global_params.image_size) 131 | 132 | # Batch norm parameters 133 | bn_mom = 1 - self._global_params.batch_norm_momentum 134 | bn_eps = self._global_params.batch_norm_epsilon 135 | 136 | # Stem 137 | in_channels = 3 # rgb 138 | # number of output channels 139 | out_channels = round_filters(32, self._global_params) 140 | self._conv_stem = Conv2d( 141 | in_channels, out_channels, kernel_size=3, stride=2, bias=False) 142 | self._bn0 = nn.BatchNorm2d( 143 | num_features=out_channels, momentum=bn_mom, eps=bn_eps) 144 | 145 | # Build blocks 146 | self._blocks = nn.ModuleList([]) 147 | for i in range(len(self._blocks_args)): 148 | # Update block input and output filters based on depth multiplier. 149 | self._blocks_args[i] = self._blocks_args[i]._replace( 150 | input_filters=round_filters( 151 | self._blocks_args[i].input_filters, self._global_params), 152 | output_filters=round_filters( 153 | self._blocks_args[i].output_filters, self._global_params), 154 | num_repeat=round_repeats( 155 | self._blocks_args[i].num_repeat, self._global_params) 156 | ) 157 | 158 | # The first block needs to take care of stride and filter size increase. 
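            # Subsequent repeats of the same block reuse its output_filters as
            # input_filters and run with stride 1 (see the _replace call below).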
159 | self._blocks.append(MBConvBlock( 160 | self._blocks_args[i], self._global_params)) 161 | if self._blocks_args[i].num_repeat > 1: 162 | self._blocks_args[i] = self._blocks_args[i]._replace( 163 | input_filters=self._blocks_args[i].output_filters, stride=1) 164 | for _ in range(self._blocks_args[i].num_repeat - 1): 165 | self._blocks.append(MBConvBlock( 166 | self._blocks_args[i], self._global_params)) 167 | 168 | # Head'efficientdet-d0': 'efficientnet-b0', 169 | # output of final block 170 | in_channels = self._blocks_args[len( 171 | self._blocks_args)-1].output_filters 172 | out_channels = round_filters(1280, self._global_params) 173 | self._conv_head = Conv2d( 174 | in_channels, out_channels, kernel_size=1, bias=False) 175 | self._bn1 = nn.BatchNorm2d( 176 | num_features=out_channels, momentum=bn_mom, eps=bn_eps) 177 | 178 | # Final linear layer 179 | self._avg_pooling = nn.AdaptiveAvgPool2d(1) 180 | self._dropout = nn.Dropout(self._global_params.dropout_rate) 181 | self._fc = nn.Linear(out_channels, self._global_params.num_classes) 182 | self._swish = MemoryEfficientSwish() 183 | 184 | def set_swish(self, memory_efficient=True): 185 | """Sets swish function as memory efficient (for training) or standard (for export)""" 186 | self._swish = MemoryEfficientSwish() if memory_efficient else Swish() 187 | for block in self._blocks: 188 | block.set_swish(memory_efficient) 189 | 190 | def extract_features(self, inputs): 191 | """ Returns output of the final convolution layer """ 192 | # Stem 193 | x = self._swish(self._bn0(self._conv_stem(inputs))) 194 | 195 | P = [] 196 | index = 0 197 | num_repeat = 0 198 | # Blocks 199 | for idx, block in enumerate(self._blocks): 200 | drop_connect_rate = self._global_params.drop_connect_rate 201 | if drop_connect_rate: 202 | drop_connect_rate *= float(idx) / len(self._blocks) 203 | x = block(x, drop_connect_rate=drop_connect_rate) 204 | num_repeat = num_repeat + 1 205 | if(num_repeat == self._blocks_args[index].num_repeat): 206 | num_repeat = 0 207 | index = index + 1 208 | P.append(x) 209 | return P 210 | 211 | def forward(self, inputs): 212 | """ Calls extract_features to extract features, applies final linear layer, and returns logits. 
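            Note: in this detection backbone the final linear layer is not applied;
            the list of per-stage feature maps P from extract_features is returned instead.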
""" 213 | # Convolution layers 214 | P = self.extract_features(inputs) 215 | return P 216 | 217 | @classmethod 218 | def from_name(cls, model_name, override_params=None): 219 | cls._check_model_name_is_valid(model_name) 220 | blocks_args, global_params = get_model_params( 221 | model_name, override_params) 222 | return cls(blocks_args, global_params) 223 | 224 | @classmethod 225 | def from_pretrained(cls, model_name, num_classes=1000, in_channels=3): 226 | model = cls.from_name(model_name, override_params={ 227 | 'num_classes': num_classes}) 228 | load_pretrained_weights( 229 | model, model_name, load_fc=(num_classes == 1000)) 230 | if in_channels != 3: 231 | Conv2d = get_same_padding_conv2d( 232 | image_size=model._global_params.image_size) 233 | out_channels = round_filters(32, model._global_params) 234 | model._conv_stem = Conv2d( 235 | in_channels, out_channels, kernel_size=3, stride=2, bias=False) 236 | return model 237 | 238 | @classmethod 239 | def from_pretrained(cls, model_name, num_classes=1000): 240 | model = cls.from_name(model_name, override_params={ 241 | 'num_classes': num_classes}) 242 | load_pretrained_weights( 243 | model, model_name, load_fc=(num_classes == 1000)) 244 | 245 | return model 246 | 247 | @classmethod 248 | def get_image_size(cls, model_name): 249 | cls._check_model_name_is_valid(model_name) 250 | _, _, res, _ = efficientnet_params(model_name) 251 | return res 252 | 253 | @classmethod 254 | def _check_model_name_is_valid(cls, model_name, also_need_pretrained_weights=False): 255 | """ Validates model name. None that pretrained weights are only available for 256 | the first four models (efficientnet-b{i} for i in 0,1,2,3) at the moment. """ 257 | num_models = 4 if also_need_pretrained_weights else 8 258 | valid_models = ['efficientnet-b'+str(i) for i in range(num_models)] 259 | if model_name not in valid_models: 260 | raise ValueError('model_name should be one of: ' + 261 | ', '.join(valid_models)) 262 | 263 | def get_list_features(self): 264 | list_feature = [] 265 | for idx in range(len(self._blocks_args)): 266 | list_feature.append(self._blocks_args[idx].output_filters) 267 | 268 | return list_feature 269 | 270 | 271 | if __name__ == '__main__': 272 | model = EfficientNet.from_pretrained('efficientnet-b0') 273 | inputs = torch.randn(4, 3, 640, 640) 274 | P = model(inputs) 275 | for idx, p in enumerate(P): 276 | print('P{}: {}'.format(idx, p.size())) 277 | # print('model: ', model) 278 | -------------------------------------------------------------------------------- /models/losses.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | 5 | 6 | def calc_iou(a, b): 7 | area = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1]) 8 | 9 | iw = torch.min(torch.unsqueeze( 10 | a[:, 2], dim=1), b[:, 2]) - torch.max(torch.unsqueeze(a[:, 0], 1), b[:, 0]) 11 | ih = torch.min(torch.unsqueeze( 12 | a[:, 3], dim=1), b[:, 3]) - torch.max(torch.unsqueeze(a[:, 1], 1), b[:, 1]) 13 | 14 | iw = torch.clamp(iw, min=0) 15 | ih = torch.clamp(ih, min=0) 16 | 17 | ua = torch.unsqueeze((a[:, 2] - a[:, 0]) * 18 | (a[:, 3] - a[:, 1]), dim=1) + area - iw * ih 19 | 20 | ua = torch.clamp(ua, min=1e-8) 21 | 22 | intersection = iw * ih 23 | 24 | IoU = intersection / ua 25 | 26 | return IoU 27 | 28 | 29 | class FocalLoss(nn.Module): 30 | # def __init__(self): 31 | 32 | def forward(self, classifications, regressions, anchors, annotations): 33 | alpha = 0.25 34 | gamma = 2.0 35 | batch_size = 
classifications.shape[0] 36 | classification_losses = [] 37 | regression_losses = [] 38 | 39 | anchor = anchors[0, :, :] 40 | 41 | anchor_widths = anchor[:, 2] - anchor[:, 0] 42 | anchor_heights = anchor[:, 3] - anchor[:, 1] 43 | anchor_ctr_x = anchor[:, 0] + 0.5 * anchor_widths 44 | anchor_ctr_y = anchor[:, 1] + 0.5 * anchor_heights 45 | 46 | for j in range(batch_size): 47 | 48 | classification = classifications[j, :, :] 49 | regression = regressions[j, :, :] 50 | 51 | bbox_annotation = annotations[j, :, :] 52 | bbox_annotation = bbox_annotation[bbox_annotation[:, 4] != -1] 53 | 54 | if bbox_annotation.shape[0] == 0: 55 | regression_losses.append(torch.tensor(0).float().cuda()) 56 | classification_losses.append(torch.tensor(0).float().cuda()) 57 | 58 | continue 59 | 60 | classification = torch.clamp(classification, 1e-4, 1.0 - 1e-4) 61 | 62 | # num_anchors x num_annotations 63 | IoU = calc_iou(anchors[0, :, :], bbox_annotation[:, :4]) 64 | 65 | IoU_max, IoU_argmax = torch.max(IoU, dim=1) # num_anchors x 1 66 | 67 | #import pdb 68 | # pdb.set_trace() 69 | 70 | # compute the loss for classification 71 | targets = torch.ones(classification.shape) * -1 72 | targets = targets.cuda() 73 | 74 | targets[torch.lt(IoU_max, 0.4), :] = 0 75 | 76 | positive_indices = torch.ge(IoU_max, 0.5) 77 | 78 | num_positive_anchors = positive_indices.sum() 79 | 80 | assigned_annotations = bbox_annotation[IoU_argmax, :] 81 | 82 | targets[positive_indices, :] = 0 83 | targets[positive_indices, 84 | assigned_annotations[positive_indices, 4].long()] = 1 85 | 86 | alpha_factor = torch.ones(targets.shape).cuda() * alpha 87 | 88 | alpha_factor = torch.where( 89 | torch.eq(targets, 1.), alpha_factor, 1. - alpha_factor) 90 | focal_weight = torch.where( 91 | torch.eq(targets, 1.), 1. 
- classification, classification) 92 | focal_weight = alpha_factor * torch.pow(focal_weight, gamma) 93 | 94 | bce = -(targets * torch.log(classification) + 95 | (1.0 - targets) * torch.log(1.0 - classification)) 96 | 97 | # cls_loss = focal_weight * torch.pow(bce, gamma) 98 | cls_loss = focal_weight * bce 99 | 100 | cls_loss = torch.where( 101 | torch.ne(targets, -1.0), cls_loss, torch.zeros(cls_loss.shape).cuda()) 102 | 103 | classification_losses.append( 104 | cls_loss.sum()/torch.clamp(num_positive_anchors.float(), min=1.0)) 105 | 106 | # compute the loss for regression 107 | 108 | if positive_indices.sum() > 0: 109 | assigned_annotations = assigned_annotations[positive_indices, :] 110 | 111 | anchor_widths_pi = anchor_widths[positive_indices] 112 | anchor_heights_pi = anchor_heights[positive_indices] 113 | anchor_ctr_x_pi = anchor_ctr_x[positive_indices] 114 | anchor_ctr_y_pi = anchor_ctr_y[positive_indices] 115 | 116 | gt_widths = assigned_annotations[:, 117 | 2] - assigned_annotations[:, 0] 118 | gt_heights = assigned_annotations[:, 119 | 3] - assigned_annotations[:, 1] 120 | gt_ctr_x = assigned_annotations[:, 0] + 0.5 * gt_widths 121 | gt_ctr_y = assigned_annotations[:, 1] + 0.5 * gt_heights 122 | 123 | # clip widths to 1 124 | gt_widths = torch.clamp(gt_widths, min=1) 125 | gt_heights = torch.clamp(gt_heights, min=1) 126 | 127 | targets_dx = (gt_ctr_x - anchor_ctr_x_pi) / anchor_widths_pi 128 | targets_dy = (gt_ctr_y - anchor_ctr_y_pi) / anchor_heights_pi 129 | targets_dw = torch.log(gt_widths / anchor_widths_pi) 130 | targets_dh = torch.log(gt_heights / anchor_heights_pi) 131 | 132 | targets = torch.stack( 133 | (targets_dx, targets_dy, targets_dw, targets_dh)) 134 | targets = targets.t() 135 | 136 | targets = targets/torch.Tensor([[0.1, 0.1, 0.2, 0.2]]).cuda() 137 | 138 | negative_indices = 1 + (~positive_indices) 139 | 140 | regression_diff = torch.abs( 141 | targets - regression[positive_indices, :]) 142 | 143 | regression_loss = torch.where( 144 | torch.le(regression_diff, 1.0 / 9.0), 145 | 0.5 * 9.0 * torch.pow(regression_diff, 2), 146 | regression_diff - 0.5 / 9.0 147 | ) 148 | regression_losses.append(regression_loss.mean()) 149 | else: 150 | regression_losses.append(torch.tensor(0).float().cuda()) 151 | 152 | return torch.stack(classification_losses).mean(dim=0, keepdim=True), torch.stack(regression_losses).mean(dim=0, keepdim=True) 153 | -------------------------------------------------------------------------------- /models/module.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import torch 4 | import warnings 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | 8 | 9 | class BBoxTransform(nn.Module): 10 | 11 | def __init__(self, mean=None, std=None): 12 | super(BBoxTransform, self).__init__() 13 | if mean is None: 14 | self.mean = torch.from_numpy( 15 | np.array([0, 0, 0, 0]).astype(np.float32)) 16 | else: 17 | self.mean = mean 18 | if std is None: 19 | self.std = torch.from_numpy( 20 | np.array([0.1, 0.1, 0.2, 0.2]).astype(np.float32)) 21 | else: 22 | self.std = std 23 | 24 | def forward(self, boxes, deltas): 25 | 26 | widths = boxes[:, :, 2] - boxes[:, :, 0] 27 | heights = boxes[:, :, 3] - boxes[:, :, 1] 28 | ctr_x = boxes[:, :, 0] + 0.5 * widths 29 | ctr_y = boxes[:, :, 1] + 0.5 * heights 30 | 31 | dx = deltas[:, :, 0] * self.std[0] + self.mean[0] 32 | dy = deltas[:, :, 1] * self.std[1] + self.mean[1] 33 | dw = deltas[:, :, 2] * self.std[2] + self.mean[2] 34 | dh = deltas[:, :, 3] * 
self.std[3] + self.mean[3] 35 | 36 | pred_ctr_x = ctr_x + dx * widths 37 | pred_ctr_y = ctr_y + dy * heights 38 | pred_w = torch.exp(dw) * widths 39 | pred_h = torch.exp(dh) * heights 40 | 41 | pred_boxes_x1 = pred_ctr_x - 0.5 * pred_w 42 | pred_boxes_y1 = pred_ctr_y - 0.5 * pred_h 43 | pred_boxes_x2 = pred_ctr_x + 0.5 * pred_w 44 | pred_boxes_y2 = pred_ctr_y + 0.5 * pred_h 45 | 46 | pred_boxes = torch.stack( 47 | [pred_boxes_x1, pred_boxes_y1, pred_boxes_x2, pred_boxes_y2], dim=2) 48 | 49 | return pred_boxes 50 | 51 | 52 | class ClipBoxes(nn.Module): 53 | 54 | def __init__(self, width=None, height=None): 55 | super(ClipBoxes, self).__init__() 56 | 57 | def forward(self, boxes, img): 58 | 59 | batch_size, num_channels, height, width = img.shape 60 | 61 | boxes[:, :, 0] = torch.clamp(boxes[:, :, 0], min=0) 62 | boxes[:, :, 1] = torch.clamp(boxes[:, :, 1], min=0) 63 | 64 | boxes[:, :, 2] = torch.clamp(boxes[:, :, 2], max=width) 65 | boxes[:, :, 3] = torch.clamp(boxes[:, :, 3], max=height) 66 | 67 | return boxes 68 | 69 | 70 | class RegressionModel(nn.Module): 71 | def __init__(self, num_features_in, num_anchors=9, feature_size=256): 72 | super(RegressionModel, self).__init__() 73 | 74 | self.conv1 = nn.Conv2d( 75 | num_features_in, feature_size, kernel_size=3, padding=1) 76 | self.act1 = nn.ReLU() 77 | self.conv2 = nn.Conv2d(feature_size, feature_size, 78 | kernel_size=3, padding=1) 79 | self.act2 = nn.ReLU() 80 | self.conv3 = nn.Conv2d(feature_size, feature_size, 81 | kernel_size=3, padding=1) 82 | self.act3 = nn.ReLU() 83 | self.conv4 = nn.Conv2d(feature_size, feature_size, 84 | kernel_size=3, padding=1) 85 | self.act4 = nn.ReLU() 86 | self.output = nn.Conv2d( 87 | feature_size, num_anchors*4, kernel_size=3, padding=1) 88 | 89 | def forward(self, x): 90 | out = self.conv1(x) 91 | out = self.act1(out) 92 | out = self.conv2(out) 93 | out = self.act2(out) 94 | out = self.conv3(out) 95 | out = self.act3(out) 96 | out = self.conv4(out) 97 | out = self.act4(out) 98 | out = self.output(out) 99 | # out is B x C x W x H, with C = 4*num_anchors 100 | out = out.permute(0, 2, 3, 1) 101 | return out.contiguous().view(out.shape[0], -1, 4) 102 | 103 | 104 | class ClassificationModel(nn.Module): 105 | def __init__(self, num_features_in, num_anchors=9, num_classes=80, prior=0.01, feature_size=256): 106 | super(ClassificationModel, self).__init__() 107 | self.num_classes = num_classes 108 | self.num_anchors = num_anchors 109 | 110 | self.conv1 = nn.Conv2d( 111 | num_features_in, feature_size, kernel_size=3, padding=1) 112 | self.act1 = nn.ReLU() 113 | self.conv2 = nn.Conv2d(feature_size, feature_size, 114 | kernel_size=3, padding=1) 115 | self.act2 = nn.ReLU() 116 | self.conv3 = nn.Conv2d(feature_size, feature_size, 117 | kernel_size=3, padding=1) 118 | self.act3 = nn.ReLU() 119 | self.conv4 = nn.Conv2d(feature_size, feature_size, 120 | kernel_size=3, padding=1) 121 | self.act4 = nn.ReLU() 122 | self.output = nn.Conv2d( 123 | feature_size, num_anchors*num_classes, kernel_size=3, padding=1) 124 | self.output_act = nn.Sigmoid() 125 | 126 | def forward(self, x): 127 | out = self.conv1(x) 128 | out = self.act1(out) 129 | out = self.conv2(out) 130 | out = self.act2(out) 131 | out = self.conv3(out) 132 | out = self.act3(out) 133 | out = self.conv4(out) 134 | out = self.act4(out) 135 | out = self.output(out) 136 | out = self.output_act(out) 137 | # out is B x C x W x H, with C = n_classes + n_anchors 138 | out1 = out.permute(0, 2, 3, 1) 139 | batch_size, width, height, channels = out1.shape 140 | out2 = 
out1.view(batch_size, width, height, 141 | self.num_anchors, self.num_classes) 142 | return out2.contiguous().view(x.shape[0], -1, self.num_classes) 143 | 144 | 145 | class Anchors(nn.Module): 146 | def __init__(self, pyramid_levels=None, strides=None, sizes=None, ratios=None, scales=None): 147 | super(Anchors, self).__init__() 148 | 149 | if pyramid_levels is None: 150 | self.pyramid_levels = [3, 4, 5, 6, 7] 151 | if strides is None: 152 | self.strides = [2 ** x for x in self.pyramid_levels] 153 | if sizes is None: 154 | self.sizes = [2 ** (x + 2) for x in self.pyramid_levels] 155 | if ratios is None: 156 | self.ratios = np.array([0.5, 1, 2]) 157 | if scales is None: 158 | self.scales = np.array( 159 | [2 ** 0, 2 ** (1.0 / 3.0), 2 ** (2.0 / 3.0)]) 160 | 161 | def forward(self, image): 162 | 163 | image_shape = image.shape[2:] 164 | image_shape = np.array(image_shape) 165 | image_shapes = [(image_shape + 2 ** x - 1) // (2 ** x) 166 | for x in self.pyramid_levels] 167 | 168 | # compute anchors over all pyramid levels 169 | all_anchors = np.zeros((0, 4)).astype(np.float32) 170 | 171 | for idx, p in enumerate(self.pyramid_levels): 172 | anchors = generate_anchors( 173 | base_size=self.sizes[idx], ratios=self.ratios, scales=self.scales) 174 | shifted_anchors = shift( 175 | image_shapes[idx], self.strides[idx], anchors) 176 | all_anchors = np.append(all_anchors, shifted_anchors, axis=0) 177 | 178 | all_anchors = np.expand_dims(all_anchors, axis=0) 179 | 180 | return torch.from_numpy(all_anchors.astype(np.float32)).to(image.device) 181 | 182 | 183 | def generate_anchors(base_size=16, ratios=None, scales=None): 184 | """ 185 | Generate anchor (reference) windows by enumerating aspect ratios X 186 | scales w.r.t. a reference window. 187 | """ 188 | 189 | if ratios is None: 190 | ratios = np.array([0.5, 1, 2]) 191 | 192 | if scales is None: 193 | scales = np.array([2 ** 0, 2 ** (1.0 / 3.0), 2 ** (2.0 / 3.0)]) 194 | 195 | num_anchors = len(ratios) * len(scales) 196 | 197 | # initialize output anchors 198 | anchors = np.zeros((num_anchors, 4)) 199 | 200 | # scale base_size 201 | anchors[:, 2:] = base_size * np.tile(scales, (2, len(ratios))).T 202 | 203 | # compute areas of anchors 204 | areas = anchors[:, 2] * anchors[:, 3] 205 | 206 | # correct for ratios 207 | anchors[:, 2] = np.sqrt(areas / np.repeat(ratios, len(scales))) 208 | anchors[:, 3] = anchors[:, 2] * np.repeat(ratios, len(scales)) 209 | 210 | # transform from (x_ctr, y_ctr, w, h) -> (x1, y1, x2, y2) 211 | anchors[:, 0::2] -= np.tile(anchors[:, 2] * 0.5, (2, 1)).T 212 | anchors[:, 1::2] -= np.tile(anchors[:, 3] * 0.5, (2, 1)).T 213 | 214 | return anchors 215 | 216 | 217 | def compute_shape(image_shape, pyramid_levels): 218 | """Compute shapes based on pyramid levels. 
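    The shape at pyramid level x is ceil(image_shape / 2 ** x), computed below
    with integer arithmetic as (image_shape + 2 ** x - 1) // (2 ** x).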
219 | :param image_shape: 220 | :param pyramid_levels: 221 | :return: 222 | """ 223 | image_shape = np.array(image_shape[:2]) 224 | image_shapes = [(image_shape + 2 ** x - 1) // (2 ** x) 225 | for x in pyramid_levels] 226 | return image_shapes 227 | 228 | 229 | def anchors_for_shape( 230 | image_shape, 231 | pyramid_levels=None, 232 | ratios=None, 233 | scales=None, 234 | strides=None, 235 | sizes=None, 236 | shapes_callback=None, 237 | ): 238 | 239 | image_shapes = compute_shape(image_shape, pyramid_levels) 240 | 241 | # compute anchors over all pyramid levels 242 | all_anchors = np.zeros((0, 4)) 243 | for idx, p in enumerate(pyramid_levels): 244 | anchors = generate_anchors( 245 | base_size=sizes[idx], ratios=ratios, scales=scales) 246 | shifted_anchors = shift(image_shapes[idx], strides[idx], anchors) 247 | all_anchors = np.append(all_anchors, shifted_anchors, axis=0) 248 | 249 | return all_anchors 250 | 251 | 252 | def shift(shape, stride, anchors): 253 | shift_x = (np.arange(0, shape[1]) + 0.5) * stride 254 | shift_y = (np.arange(0, shape[0]) + 0.5) * stride 255 | 256 | shift_x, shift_y = np.meshgrid(shift_x, shift_y) 257 | 258 | shifts = np.vstack(( 259 | shift_x.ravel(), shift_y.ravel(), 260 | shift_x.ravel(), shift_y.ravel() 261 | )).transpose() 262 | 263 | # add A anchors (1, A, 4) to 264 | # cell K shifts (K, 1, 4) to get 265 | # shift anchors (K, A, 4) 266 | # reshape to (K*A, 4) shifted anchors 267 | A = anchors.shape[0] 268 | K = shifts.shape[0] 269 | all_anchors = (anchors.reshape((1, A, 4)) + 270 | shifts.reshape((1, K, 4)).transpose((1, 0, 2))) 271 | all_anchors = all_anchors.reshape((K * A, 4)) 272 | 273 | return all_anchors 274 | 275 | 276 | def conv_ws_2d(input, 277 | weight, 278 | bias=None, 279 | stride=1, 280 | padding=0, 281 | dilation=1, 282 | groups=1, 283 | eps=1e-5): 284 | c_in = weight.size(0) 285 | weight_flat = weight.view(c_in, -1) 286 | mean = weight_flat.mean(dim=1, keepdim=True).view(c_in, 1, 1, 1) 287 | std = weight_flat.std(dim=1, keepdim=True).view(c_in, 1, 1, 1) 288 | weight = (weight - mean) / (std + eps) 289 | return F.conv2d(input, weight, bias, stride, padding, dilation, groups) 290 | 291 | 292 | class ConvWS2d(nn.Conv2d): 293 | def __init__(self, 294 | in_channels, 295 | out_channels, 296 | kernel_size, 297 | stride=1, 298 | padding=0, 299 | dilation=1, 300 | groups=1, 301 | bias=True, 302 | eps=1e-5): 303 | super(ConvWS2d, self).__init__( 304 | in_channels, 305 | out_channels, 306 | kernel_size, 307 | stride=stride, 308 | padding=padding, 309 | dilation=dilation, 310 | groups=groups, 311 | bias=bias) 312 | self.eps = eps 313 | 314 | def forward(self, x): 315 | return conv_ws_2d(x, self.weight, self.bias, self.stride, self.padding, 316 | self.dilation, self.groups, self.eps) 317 | 318 | 319 | conv_cfg = { 320 | 'Conv': nn.Conv2d, 321 | 'ConvWS': ConvWS2d, 322 | # TODO: octave conv 323 | } 324 | 325 | 326 | def build_conv_layer(cfg, *args, **kwargs): 327 | """ Build convolution layer 328 | Args: 329 | cfg (None or dict): cfg should contain: 330 | type (str): identify conv layer type. 331 | layer args: args needed to instantiate a conv layer. 
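    Example (illustrative):
        conv = build_conv_layer(dict(type='ConvWS'), 64, 128, 3, padding=1)
        conv = build_conv_layer(None, 64, 128, 3, padding=1)  # falls back to nn.Conv2d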
332 | Returns: 333 | layer (nn.Module): created conv layer 334 | """ 335 | if cfg is None: 336 | cfg_ = dict(type='Conv') 337 | else: 338 | assert isinstance(cfg, dict) and 'type' in cfg 339 | cfg_ = cfg.copy() 340 | 341 | layer_type = cfg_.pop('type') 342 | if layer_type not in conv_cfg: 343 | raise KeyError('Unrecognized norm type {}'.format(layer_type)) 344 | else: 345 | conv_layer = conv_cfg[layer_type] 346 | 347 | layer = conv_layer(*args, **kwargs, **cfg_) 348 | 349 | return layer 350 | 351 | 352 | norm_cfg = { 353 | # format: layer_type: (abbreviation, module) 354 | 'BN': ('bn', nn.BatchNorm2d), 355 | 'SyncBN': ('bn', nn.SyncBatchNorm), 356 | 'GN': ('gn', nn.GroupNorm), 357 | # and potentially 'SN' 358 | } 359 | 360 | 361 | def build_norm_layer(cfg, num_features, postfix=''): 362 | """ Build normalization layer 363 | Args: 364 | cfg (dict): cfg should contain: 365 | type (str): identify norm layer type. 366 | layer args: args needed to instantiate a norm layer. 367 | requires_grad (bool): [optional] whether stop gradient updates 368 | num_features (int): number of channels from input. 369 | postfix (int, str): appended into norm abbreviation to 370 | create named layer. 371 | Returns: 372 | name (str): abbreviation + postfix 373 | layer (nn.Module): created norm layer 374 | """ 375 | assert isinstance(cfg, dict) and 'type' in cfg 376 | cfg_ = cfg.copy() 377 | 378 | layer_type = cfg_.pop('type') 379 | if layer_type not in norm_cfg: 380 | raise KeyError('Unrecognized norm type {}'.format(layer_type)) 381 | else: 382 | abbr, norm_layer = norm_cfg[layer_type] 383 | if norm_layer is None: 384 | raise NotImplementedError 385 | 386 | assert isinstance(postfix, (int, str)) 387 | name = abbr + str(postfix) 388 | 389 | requires_grad = cfg_.pop('requires_grad', True) 390 | cfg_.setdefault('eps', 1e-5) 391 | if layer_type != 'GN': 392 | layer = norm_layer(num_features, **cfg_) 393 | if layer_type == 'SyncBN': 394 | layer._specify_ddp_gpu_num(1) 395 | else: 396 | assert 'num_groups' in cfg_ 397 | layer = norm_layer(num_channels=num_features, **cfg_) 398 | 399 | for param in layer.parameters(): 400 | param.requires_grad = requires_grad 401 | 402 | return name, layer 403 | 404 | 405 | class ConvModule(nn.Module): 406 | """A conv block that contains conv/norm/activation layers. 407 | Args: 408 | in_channels (int): Same as nn.Conv2d. 409 | out_channels (int): Same as nn.Conv2d. 410 | kernel_size (int or tuple[int]): Same as nn.Conv2d. 411 | stride (int or tuple[int]): Same as nn.Conv2d. 412 | padding (int or tuple[int]): Same as nn.Conv2d. 413 | dilation (int or tuple[int]): Same as nn.Conv2d. 414 | groups (int): Same as nn.Conv2d. 415 | bias (bool or str): If specified as `auto`, it will be decided by the 416 | norm_cfg. Bias will be set as True if norm_cfg is None, otherwise 417 | False. 418 | conv_cfg (dict): Config dict for convolution layer. 419 | norm_cfg (dict): Config dict for normalization layer. 420 | activation (str or None): Activation type, "ReLU" by default. 421 | inplace (bool): Whether to use inplace mode for activation. 422 | order (tuple[str]): The order of conv/norm/activation layers. It is a 423 | sequence of "conv", "norm" and "act". Examples are 424 | ("conv", "norm", "act") and ("act", "conv", "norm"). 
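    Example (illustrative):
        conv = ConvModule(32, 64, 3, padding=1, norm_cfg=dict(type='BN'), activation='relu')
        out = conv(torch.randn(1, 32, 56, 56))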
425 | """ 426 | 427 | def __init__(self, 428 | in_channels, 429 | out_channels, 430 | kernel_size, 431 | stride=1, 432 | padding=0, 433 | dilation=1, 434 | groups=1, 435 | bias='auto', 436 | conv_cfg=None, 437 | norm_cfg=None, 438 | activation='relu', 439 | inplace=True, 440 | order=('conv', 'norm', 'act')): 441 | super(ConvModule, self).__init__() 442 | assert conv_cfg is None or isinstance(conv_cfg, dict) 443 | assert norm_cfg is None or isinstance(norm_cfg, dict) 444 | self.conv_cfg = conv_cfg 445 | self.norm_cfg = norm_cfg 446 | self.activation = activation 447 | self.inplace = inplace 448 | self.order = order 449 | assert isinstance(self.order, tuple) and len(self.order) == 3 450 | assert set(order) == set(['conv', 'norm', 'act']) 451 | 452 | self.with_norm = norm_cfg is not None 453 | self.with_activatation = activation is not None 454 | # if the conv layer is before a norm layer, bias is unnecessary. 455 | if bias == 'auto': 456 | bias = False if self.with_norm else True 457 | self.with_bias = bias 458 | 459 | if self.with_norm and self.with_bias: 460 | warnings.warn('ConvModule has norm and bias at the same time') 461 | 462 | # build convolution layer 463 | self.conv = build_conv_layer( 464 | conv_cfg, 465 | in_channels, 466 | out_channels, 467 | kernel_size, 468 | stride=stride, 469 | padding=padding, 470 | dilation=dilation, 471 | groups=groups, 472 | bias=bias) 473 | # export the attributes of self.conv to a higher level for convenience 474 | self.in_channels = self.conv.in_channels 475 | self.out_channels = self.conv.out_channels 476 | self.kernel_size = self.conv.kernel_size 477 | self.stride = self.conv.stride 478 | self.padding = self.conv.padding 479 | self.dilation = self.conv.dilation 480 | self.transposed = self.conv.transposed 481 | self.output_padding = self.conv.output_padding 482 | self.groups = self.conv.groups 483 | 484 | # build normalization layers 485 | if self.with_norm: 486 | # norm layer is after conv layer 487 | if order.index('norm') > order.index('conv'): 488 | norm_channels = out_channels 489 | else: 490 | norm_channels = in_channels 491 | self.norm_name, norm = build_norm_layer(norm_cfg, norm_channels) 492 | self.add_module(self.norm_name, norm) 493 | 494 | # build activation layer 495 | if self.with_activatation: 496 | # TODO: introduce `act_cfg` and supports more activation layers 497 | if self.activation not in ['relu']: 498 | raise ValueError('{} is currently not supported.'.format( 499 | self.activation)) 500 | if self.activation == 'relu': 501 | self.activate = nn.ReLU(inplace=inplace) 502 | 503 | @property 504 | def norm(self): 505 | return getattr(self, self.norm_name) 506 | 507 | def forward(self, x, activate=True, norm=True): 508 | for layer in self.order: 509 | if layer == 'conv': 510 | x = self.conv(x) 511 | elif layer == 'norm' and norm and self.with_norm: 512 | x = self.norm(x) 513 | elif layer == 'act' and activate and self.with_activatation: 514 | x = self.activate(x) 515 | return x 516 | 517 | 518 | def xavier_init(module, gain=1, bias=0, distribution='normal'): 519 | assert distribution in ['uniform', 'normal'] 520 | if distribution == 'uniform': 521 | nn.init.xavier_uniform_(module.weight, gain=gain) 522 | else: 523 | nn.init.xavier_normal_(module.weight, gain=gain) 524 | if hasattr(module, 'bias'): 525 | nn.init.constant_(module.bias, bias) 526 | 527 | 528 | def normal_init(module, mean=0, std=1, bias=0): 529 | nn.init.normal_(module.weight, mean, std) 530 | if hasattr(module, 'bias'): 531 | nn.init.constant_(module.bias, bias) 532 | 
533 | 534 | def uniform_init(module, a=0, b=1, bias=0): 535 | nn.init.uniform_(module.weight, a, b) 536 | if hasattr(module, 'bias'): 537 | nn.init.constant_(module.bias, bias) 538 | 539 | 540 | def kaiming_init(module, 541 | mode='fan_out', 542 | nonlinearity='relu', 543 | bias=0, 544 | distribution='normal'): 545 | assert distribution in ['uniform', 'normal'] 546 | if distribution == 'uniform': 547 | nn.init.kaiming_uniform_( 548 | module.weight, mode=mode, nonlinearity=nonlinearity) 549 | else: 550 | nn.init.kaiming_normal_( 551 | module.weight, mode=mode, nonlinearity=nonlinearity) 552 | if hasattr(module, 'bias'): 553 | nn.init.constant_(module.bias, bias) 554 | 555 | 556 | def bias_init_with_prob(prior_prob): 557 | """ initialize conv/fc bias value according to giving probablity""" 558 | bias_init = float(-np.log((1 - prior_prob) / prior_prob)) 559 | return bias_init 560 | -------------------------------------------------------------------------------- /models/retinahead.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | import numpy as np 4 | import torch.nn as nn 5 | 6 | from .module import ConvModule, bias_init_with_prob, normal_init 7 | from six.moves import map, zip 8 | 9 | 10 | def multi_apply(func, *args, **kwargs): 11 | pfunc = partial(func, **kwargs) if kwargs else func 12 | map_results = map(pfunc, *args) 13 | return tuple(map(list, zip(*map_results))) 14 | 15 | 16 | class RetinaHead(nn.Module): 17 | """ 18 | An anchor-based head used in [1]_. 19 | The head contains two subnetworks. The first classifies anchor boxes and 20 | the second regresses deltas for the anchors. 21 | References: 22 | .. [1] https://arxiv.org/pdf/1708.02002.pdf 23 | Example: 24 | >>> import torch 25 | >>> self = RetinaHead(11, 7) 26 | >>> x = torch.rand(1, 7, 32, 32) 27 | >>> cls_score, bbox_pred = self.forward_single(x) 28 | >>> # Each anchor predicts a score for each class except background 29 | >>> cls_per_anchor = cls_score.shape[1] / self.num_anchors 30 | >>> box_per_anchor = bbox_pred.shape[1] / self.num_anchors 31 | >>> assert cls_per_anchor == (self.num_classes - 1) 32 | >>> assert box_per_anchor == 4 33 | """ 34 | 35 | def __init__(self, 36 | num_classes, 37 | in_channels, 38 | feat_channels=256, 39 | anchor_scales=[8, 16, 32], 40 | anchor_ratios=[0.5, 1.0, 2.0], 41 | anchor_strides=[4, 8, 16, 32, 64], 42 | stacked_convs=4, 43 | octave_base_scale=4, 44 | scales_per_octave=3, 45 | conv_cfg=None, 46 | norm_cfg=None, 47 | **kwargs): 48 | super(RetinaHead, self).__init__() 49 | self.in_channels = in_channels 50 | self.num_classes = num_classes 51 | self.feat_channels = feat_channels 52 | self.anchor_scales = anchor_scales 53 | self.anchor_ratios = anchor_ratios 54 | self.anchor_strides = anchor_strides 55 | self.stacked_convs = stacked_convs 56 | self.octave_base_scale = octave_base_scale 57 | self.scales_per_octave = scales_per_octave 58 | self.conv_cfg = conv_cfg 59 | self.norm_cfg = norm_cfg 60 | octave_scales = np.array( 61 | [2**(i / scales_per_octave) for i in range(scales_per_octave)]) 62 | anchor_scales = octave_scales * octave_base_scale 63 | self.cls_out_channels = num_classes 64 | self.num_anchors = len(self.anchor_ratios) * len(self.anchor_scales) 65 | self._init_layers() 66 | 67 | def _init_layers(self): 68 | self.relu = nn.ReLU(inplace=True) 69 | self.cls_convs = nn.ModuleList() 70 | self.reg_convs = nn.ModuleList() 71 | for i in range(self.stacked_convs): 72 | chn = self.in_channels if i == 0 else 
self.feat_channels 73 | self.cls_convs.append( 74 | ConvModule( 75 | chn, 76 | self.feat_channels, 77 | 3, 78 | stride=1, 79 | padding=1, 80 | conv_cfg=self.conv_cfg, 81 | norm_cfg=self.norm_cfg)) 82 | self.reg_convs.append( 83 | ConvModule( 84 | chn, 85 | self.feat_channels, 86 | 3, 87 | stride=1, 88 | padding=1, 89 | conv_cfg=self.conv_cfg, 90 | norm_cfg=self.norm_cfg)) 91 | self.retina_cls = nn.Conv2d( 92 | self.feat_channels, 93 | self.num_anchors * self.cls_out_channels, 94 | 3, 95 | padding=1) 96 | self.retina_reg = nn.Conv2d( 97 | self.feat_channels, self.num_anchors * 4, 3, padding=1) 98 | self.output_act = nn.Sigmoid() 99 | 100 | def init_weights(self): 101 | for m in self.cls_convs: 102 | normal_init(m.conv, std=0.01) 103 | for m in self.reg_convs: 104 | normal_init(m.conv, std=0.01) 105 | bias_cls = bias_init_with_prob(0.01) 106 | normal_init(self.retina_cls, std=0.01, bias=bias_cls) 107 | normal_init(self.retina_reg, std=0.01) 108 | 109 | def forward_single(self, x): 110 | cls_feat = x 111 | reg_feat = x 112 | for cls_conv in self.cls_convs: 113 | cls_feat = cls_conv(cls_feat) 114 | for reg_conv in self.reg_convs: 115 | reg_feat = reg_conv(reg_feat) 116 | 117 | cls_score = self.retina_cls(cls_feat) 118 | cls_score = self.output_act(cls_score) 119 | # out is B x C x W x H, with C = n_classes + n_anchors 120 | cls_score = cls_score.permute(0, 2, 3, 1) 121 | batch_size, width, height, channels = cls_score.shape 122 | cls_score = cls_score.view( 123 | batch_size, width, height, self.num_anchors, self.num_classes) 124 | cls_score = cls_score.contiguous().view(x.size(0), -1, self.num_classes) 125 | 126 | bbox_pred = self.retina_reg(reg_feat) 127 | bbox_pred = bbox_pred.permute(0, 2, 3, 1) 128 | bbox_pred = bbox_pred.contiguous().view(bbox_pred.size(0), -1, 4) 129 | return cls_score, bbox_pred 130 | 131 | def forward(self, feats): 132 | return multi_apply(self.forward_single, feats) 133 | -------------------------------------------------------------------------------- /models/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | import math 3 | import collections 4 | from functools import partial 5 | import torch 6 | from torch import nn 7 | from torch.nn import functional as F 8 | from torch.utils import model_zoo 9 | 10 | ######################################################################## 11 | ############### HELPERS FUNCTIONS FOR MODEL ARCHITECTURE ############### 12 | ######################################################################## 13 | 14 | 15 | # Parameters for the entire model (stem, all blocks, and head) 16 | GlobalParams = collections.namedtuple('GlobalParams', [ 17 | 'batch_norm_momentum', 'batch_norm_epsilon', 'dropout_rate', 18 | 'num_classes', 'width_coefficient', 'depth_coefficient', 19 | 'depth_divisor', 'min_depth', 'drop_connect_rate', 'image_size']) 20 | 21 | # Parameters for an individual model block 22 | BlockArgs = collections.namedtuple('BlockArgs', [ 23 | 'kernel_size', 'num_repeat', 'input_filters', 'output_filters', 24 | 'expand_ratio', 'id_skip', 'stride', 'se_ratio']) 25 | 26 | # Change namedtuple defaults 27 | GlobalParams.__new__.__defaults__ = (None,) * len(GlobalParams._fields) 28 | BlockArgs.__new__.__defaults__ = (None,) * len(BlockArgs._fields) 29 | 30 | 31 | class SwishImplementation(torch.autograd.Function): 32 | @staticmethod 33 | def forward(ctx, i): 34 | result = i * torch.sigmoid(i) 35 | ctx.save_for_backward(i) 36 | return result 37 | 38 | @staticmethod 39 | def backward(ctx, 
grad_output): 40 | i = ctx.saved_variables[0] 41 | sigmoid_i = torch.sigmoid(i) 42 | return grad_output * (sigmoid_i * (1 + i * (1 - sigmoid_i))) 43 | 44 | 45 | class MemoryEfficientSwish(nn.Module): 46 | def forward(self, x): 47 | return SwishImplementation.apply(x) 48 | 49 | 50 | class Swish(nn.Module): 51 | def forward(self, x): 52 | return x * torch.sigmoid(x) 53 | 54 | 55 | def round_filters(filters, global_params): 56 | """ Calculate and round number of filters based on depth multiplier. """ 57 | multiplier = global_params.width_coefficient 58 | if not multiplier: 59 | return filters 60 | divisor = global_params.depth_divisor 61 | min_depth = global_params.min_depth 62 | filters *= multiplier 63 | min_depth = min_depth or divisor 64 | new_filters = max(min_depth, int( 65 | filters + divisor / 2) // divisor * divisor) 66 | if new_filters < 0.9 * filters: # prevent rounding by more than 10% 67 | new_filters += divisor 68 | return int(new_filters) 69 | 70 | 71 | def round_repeats(repeats, global_params): 72 | """ Round number of filters based on depth multiplier. """ 73 | multiplier = global_params.depth_coefficient 74 | if not multiplier: 75 | return repeats 76 | return int(math.ceil(multiplier * repeats)) 77 | 78 | 79 | def drop_connect(inputs, p, training): 80 | """ Drop connect. """ 81 | if not training: 82 | return inputs 83 | batch_size = inputs.shape[0] 84 | keep_prob = 1 - p 85 | random_tensor = keep_prob 86 | random_tensor += torch.rand([batch_size, 1, 1, 1], 87 | dtype=inputs.dtype, device=inputs.device) 88 | binary_tensor = torch.floor(random_tensor) 89 | output = inputs / keep_prob * binary_tensor 90 | return output 91 | 92 | 93 | def get_same_padding_conv2d(image_size=None): 94 | """ Chooses static padding if you have specified an image size, and dynamic padding otherwise. 95 | Static padding is necessary for ONNX exporting of models. 
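    Example (illustrative):
        Conv2d = get_same_padding_conv2d(image_size=None)  # -> Conv2dDynamicSamePadding
        Conv2d = get_same_padding_conv2d(image_size=224)   # -> partial(Conv2dStaticSamePadding, image_size=224)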
""" 96 | if image_size is None: 97 | return Conv2dDynamicSamePadding 98 | else: 99 | return partial(Conv2dStaticSamePadding, image_size=image_size) 100 | 101 | 102 | class Conv2dDynamicSamePadding(nn.Conv2d): 103 | """ 2D Convolutions like TensorFlow, for a dynamic image size """ 104 | 105 | def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, groups=1, bias=True): 106 | super().__init__(in_channels, out_channels, 107 | kernel_size, stride, 0, dilation, groups, bias) 108 | self.stride = self.stride if len(self.stride) == 2 else [ 109 | self.stride[0]] * 2 110 | 111 | def forward(self, x): 112 | ih, iw = x.size()[-2:] 113 | kh, kw = self.weight.size()[-2:] 114 | sh, sw = self.stride 115 | oh, ow = math.ceil(ih / sh), math.ceil(iw / sw) 116 | pad_h = max((oh - 1) * self.stride[0] + 117 | (kh - 1) * self.dilation[0] + 1 - ih, 0) 118 | pad_w = max((ow - 1) * self.stride[1] + 119 | (kw - 1) * self.dilation[1] + 1 - iw, 0) 120 | if pad_h > 0 or pad_w > 0: 121 | x = F.pad(x, [pad_w // 2, pad_w - pad_w // 122 | 2, pad_h // 2, pad_h - pad_h // 2]) 123 | return F.conv2d(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups) 124 | 125 | 126 | class Conv2dStaticSamePadding(nn.Conv2d): 127 | """ 2D Convolutions like TensorFlow, for a fixed image size""" 128 | 129 | def __init__(self, in_channels, out_channels, kernel_size, image_size=None, **kwargs): 130 | super().__init__(in_channels, out_channels, kernel_size, **kwargs) 131 | self.stride = self.stride if len(self.stride) == 2 else [ 132 | self.stride[0]] * 2 133 | 134 | # Calculate padding based on image size and save it 135 | assert image_size is not None 136 | ih, iw = image_size if type(image_size) == list else [ 137 | image_size, image_size] 138 | kh, kw = self.weight.size()[-2:] 139 | sh, sw = self.stride 140 | oh, ow = math.ceil(ih / sh), math.ceil(iw / sw) 141 | pad_h = max((oh - 1) * self.stride[0] + 142 | (kh - 1) * self.dilation[0] + 1 - ih, 0) 143 | pad_w = max((ow - 1) * self.stride[1] + 144 | (kw - 1) * self.dilation[1] + 1 - iw, 0) 145 | if pad_h > 0 or pad_w > 0: 146 | self.static_padding = nn.ZeroPad2d( 147 | (pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2)) 148 | else: 149 | self.static_padding = Identity() 150 | 151 | def forward(self, x): 152 | x = self.static_padding(x) 153 | x = F.conv2d(x, self.weight, self.bias, self.stride, 154 | self.padding, self.dilation, self.groups) 155 | return x 156 | 157 | 158 | class Identity(nn.Module): 159 | def __init__(self, ): 160 | super(Identity, self).__init__() 161 | 162 | def forward(self, input): 163 | return input 164 | 165 | 166 | ######################################################################## 167 | ############## HELPERS FUNCTIONS FOR LOADING MODEL PARAMS ############## 168 | ######################################################################## 169 | 170 | 171 | def efficientnet_params(model_name): 172 | """ Map EfficientNet model name to parameter coefficients. 
""" 173 | params_dict = { 174 | # Coefficients: width,depth,res,dropout 175 | 'efficientnet-b0': (1.0, 1.0, 224, 0.2), 176 | 'efficientnet-b1': (1.0, 1.1, 240, 0.2), 177 | 'efficientnet-b2': (1.1, 1.2, 260, 0.3), 178 | 'efficientnet-b3': (1.2, 1.4, 300, 0.3), 179 | 'efficientnet-b4': (1.4, 1.8, 380, 0.4), 180 | 'efficientnet-b5': (1.6, 2.2, 456, 0.4), 181 | 'efficientnet-b6': (1.8, 2.6, 528, 0.5), 182 | 'efficientnet-b7': (2.0, 3.1, 600, 0.5), 183 | } 184 | return params_dict[model_name] 185 | 186 | 187 | class BlockDecoder(object): 188 | """ Block Decoder for readability, straight from the official TensorFlow repository """ 189 | 190 | @staticmethod 191 | def _decode_block_string(block_string): 192 | """ Gets a block through a string notation of arguments. """ 193 | assert isinstance(block_string, str) 194 | 195 | ops = block_string.split('_') 196 | options = {} 197 | for op in ops: 198 | splits = re.split(r'(\d.*)', op) 199 | if len(splits) >= 2: 200 | key, value = splits[:2] 201 | options[key] = value 202 | 203 | # Check stride 204 | assert (('s' in options and len(options['s']) == 1) or 205 | (len(options['s']) == 2 and options['s'][0] == options['s'][1])) 206 | 207 | return BlockArgs( 208 | kernel_size=int(options['k']), 209 | num_repeat=int(options['r']), 210 | input_filters=int(options['i']), 211 | output_filters=int(options['o']), 212 | expand_ratio=int(options['e']), 213 | id_skip=('noskip' not in block_string), 214 | se_ratio=float(options['se']) if 'se' in options else None, 215 | stride=[int(options['s'][0])]) 216 | 217 | @staticmethod 218 | def _encode_block_string(block): 219 | """Encodes a block to a string.""" 220 | args = [ 221 | 'r%d' % block.num_repeat, 222 | 'k%d' % block.kernel_size, 223 | 's%d%d' % (block.strides[0], block.strides[1]), 224 | 'e%s' % block.expand_ratio, 225 | 'i%d' % block.input_filters, 226 | 'o%d' % block.output_filters 227 | ] 228 | if 0 < block.se_ratio <= 1: 229 | args.append('se%s' % block.se_ratio) 230 | if block.id_skip is False: 231 | args.append('noskip') 232 | return '_'.join(args) 233 | 234 | @staticmethod 235 | def decode(string_list): 236 | """ 237 | Decodes a list of string notations to specify blocks inside the network. 238 | :param string_list: a list of strings, each string is a notation of block 239 | :return: a list of BlockArgs namedtuples of block args 240 | """ 241 | assert isinstance(string_list, list) 242 | blocks_args = [] 243 | for block_string in string_list: 244 | blocks_args.append(BlockDecoder._decode_block_string(block_string)) 245 | return blocks_args 246 | 247 | @staticmethod 248 | def encode(blocks_args): 249 | """ 250 | Encodes a list of BlockArgs to a list of strings. 251 | :param blocks_args: a list of BlockArgs namedtuples of block args 252 | :return: a list of strings, each string is a notation of block 253 | """ 254 | block_strings = [] 255 | for block in blocks_args: 256 | block_strings.append(BlockDecoder._encode_block_string(block)) 257 | return block_strings 258 | 259 | 260 | def efficientnet(width_coefficient=None, depth_coefficient=None, dropout_rate=0.2, 261 | drop_connect_rate=0.2, image_size=None, num_classes=1000): 262 | """ Creates a efficientnet model. 
""" 263 | 264 | blocks_args = [ 265 | 'r1_k3_s11_e1_i32_o16_se0.25', 'r2_k3_s22_e6_i16_o24_se0.25', 266 | 'r2_k5_s22_e6_i24_o40_se0.25', 'r3_k3_s22_e6_i40_o80_se0.25', 267 | 'r3_k5_s22_e6_i80_o112_se0.25', 'r4_k5_s22_e6_i112_o192_se0.25', 268 | 'r1_k3_s22_e6_i192_o320_se0.25', 269 | ] 270 | blocks_args = BlockDecoder.decode(blocks_args) 271 | 272 | global_params = GlobalParams( 273 | batch_norm_momentum=0.99, 274 | batch_norm_epsilon=1e-3, 275 | dropout_rate=dropout_rate, 276 | drop_connect_rate=drop_connect_rate, 277 | # data_format='channels_last', # removed, this is always true in PyTorch 278 | num_classes=num_classes, 279 | width_coefficient=width_coefficient, 280 | depth_coefficient=depth_coefficient, 281 | depth_divisor=8, 282 | min_depth=None, 283 | image_size=image_size, 284 | ) 285 | 286 | return blocks_args, global_params 287 | 288 | 289 | def get_model_params(model_name, override_params): 290 | """ Get the block args and global params for a given model """ 291 | if model_name.startswith('efficientnet'): 292 | w, d, s, p = efficientnet_params(model_name) 293 | # note: all models have drop connect rate = 0.2 294 | blocks_args, global_params = efficientnet( 295 | width_coefficient=w, depth_coefficient=d, dropout_rate=p, image_size=s) 296 | else: 297 | raise NotImplementedError( 298 | 'model name is not pre-defined: %s' % model_name) 299 | if override_params: 300 | # ValueError will be raised here if override_params has fields not included in global_params. 301 | global_params = global_params._replace(**override_params) 302 | return blocks_args, global_params 303 | 304 | 305 | url_map = { 306 | 'efficientnet-b0': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b0-355c32eb.pth', 307 | 'efficientnet-b1': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b1-f1951068.pth', 308 | 'efficientnet-b2': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b2-8bb594d6.pth', 309 | 'efficientnet-b3': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b3-5fb5a3c3.pth', 310 | 'efficientnet-b4': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b4-6ed6700e.pth', 311 | 'efficientnet-b5': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b5-b6417697.pth', 312 | 'efficientnet-b6': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b6-c76e70fd.pth', 313 | 'efficientnet-b7': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b7-dcc49843.pth', 314 | } 315 | 316 | 317 | def load_pretrained_weights(model, model_name, load_fc=True): 318 | """ Loads pretrained weights, and downloads if loading for the first time. 
""" 319 | state_dict = model_zoo.load_url(url_map[model_name]) 320 | if load_fc: 321 | model.load_state_dict(state_dict) 322 | else: 323 | state_dict.pop('_fc.weight') 324 | state_dict.pop('_fc.bias') 325 | res = model.load_state_dict(state_dict, strict=False) 326 | assert set(res.missing_keys) == set( 327 | ['_fc.weight', '_fc.bias']), 'issue loading pretrained weights' 328 | print('Loaded pretrained weights for {}'.format(model_name)) 329 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | pandas 3 | torch 4 | torchvision 5 | pytoan 6 | albumentations 7 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from models import EfficientDet 3 | from models.efficientnet import EfficientNet 4 | 5 | if __name__ == '__main__': 6 | 7 | inputs = torch.randn(5, 3, 512, 512) 8 | 9 | # Test EfficientNet 10 | model = EfficientNet.from_pretrained('efficientnet-b0') 11 | inputs = torch.randn(4, 3, 512, 512) 12 | P = model(inputs) 13 | for idx, p in enumerate(P): 14 | print('P{}: {}'.format(idx, p.size())) 15 | 16 | # print('model: ', model) 17 | 18 | # Test inference 19 | model = EfficientDet(num_classes=20, is_training=False) 20 | output = model(inputs) 21 | for out in output: 22 | print(out.size()) 23 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import argparse 3 | import os 4 | import random 5 | import shutil 6 | import time 7 | import warnings 8 | import torch 9 | import torch.nn as nn 10 | import torch.nn.parallel 11 | import torch.backends.cudnn as cudnn 12 | import torch.distributed as dist 13 | import torch.optim 14 | import torch.multiprocessing as mp 15 | import torch.utils.data 16 | import torch.utils.data.distributed 17 | import torchvision.transforms as transforms 18 | import torchvision.datasets as datasets 19 | 20 | import os 21 | import sys 22 | import time 23 | import argparse 24 | import numpy as np 25 | import torch 26 | import torch.optim as optim 27 | import torch.backends.cudnn as cudnn 28 | from torch.utils.data import DataLoader 29 | 30 | from models.efficientdet import EfficientDet 31 | from models.losses import FocalLoss 32 | from datasets import VOCDetection, CocoDataset, get_augumentation, detection_collate, Resizer, Normalizer, Augmenter, collater 33 | from utils import EFFICIENTDET, get_state_dict 34 | from eval import evaluate, evaluate_coco 35 | 36 | parser = argparse.ArgumentParser(description='PyTorch ImageNet Training') 37 | parser.add_argument('--dataset', default='VOC', choices=['VOC', 'COCO'], 38 | type=str, help='VOC or COCO') 39 | parser.add_argument( 40 | '--dataset_root', 41 | default='/root/data/VOCdevkit/', 42 | help='Dataset root directory path [/root/data/VOCdevkit/, /root/data/coco/]') 43 | parser.add_argument('--network', default='efficientdet-d0', type=str, 44 | help='efficientdet-[d0, d1, ..]') 45 | 46 | parser.add_argument('--resume', default=None, type=str, 47 | help='Checkpoint state_dict file to resume training from') 48 | parser.add_argument('--num_epoch', default=500, type=int, 49 | help='Num epoch for training') 50 | parser.add_argument('--batch_size', default=32, type=int, 51 | help='Batch size for 
training') 52 | parser.add_argument('--num_class', default=20, type=int, 53 | help='Number of class used in model') 54 | parser.add_argument('--device', default=[0, 1], type=list, 55 | help='Use CUDA to train model') 56 | parser.add_argument('--grad_accumulation_steps', default=1, type=int, 57 | help='Number of gradient accumulation steps') 58 | parser.add_argument('--lr', '--learning-rate', default=1e-4, type=float, 59 | help='initial learning rate') 60 | parser.add_argument('--momentum', default=0.9, type=float, 61 | help='Momentum value for optim') 62 | parser.add_argument('--weight_decay', default=5e-4, type=float, 63 | help='Weight decay for SGD') 64 | parser.add_argument('--gamma', default=0.1, type=float, 65 | help='Gamma update for SGD') 66 | parser.add_argument('--save_folder', default='./saved/weights/', type=str, 67 | help='Directory for saving checkpoint models') 68 | parser.add_argument('-j', '--workers', default=4, type=int, metavar='N', 69 | help='number of data loading workers (default: 4)') 70 | parser.add_argument('--start_epoch', default=0, type=int, metavar='N', 71 | help='manual epoch number (useful on restarts)') 72 | parser.add_argument('--world-size', default=1, type=int, 73 | help='number of nodes for distributed training') 74 | parser.add_argument('--rank', default=0, type=int, 75 | help='node rank for distributed training') 76 | parser.add_argument('--dist-url', default='env://', type=str, 77 | help='url used to set up distributed training') 78 | parser.add_argument('--dist-backend', default='nccl', type=str, 79 | help='distributed backend') 80 | parser.add_argument('--seed', default=24, type=int, 81 | help='seed for initializing training. ') 82 | parser.add_argument('--gpu', default=None, type=int, 83 | help='GPU id to use.') 84 | parser.add_argument( 85 | '--multiprocessing-distributed', 86 | action='store_true', 87 | help='Use multi-processing distributed training to launch ' 88 | 'N processes per node, which has N GPUs. 
This is the ' 89 | 'fastest way to use PyTorch for either single node or ' 90 | 'multi node data parallel training') 91 | 92 | iteration = 1 93 | 94 | 95 | def train(train_loader, model, scheduler, optimizer, epoch, args): 96 | global iteration 97 | print("{} epoch: \t start training....".format(epoch)) 98 | start = time.time() 99 | total_loss = [] 100 | model.train() 101 | model.module.is_training = True 102 | model.module.freeze_bn() 103 | optimizer.zero_grad() 104 | for idx, (images, annotations) in enumerate(train_loader): 105 | images = images.cuda().float() 106 | annotations = annotations.cuda() 107 | classification_loss, regression_loss = model([images, annotations]) 108 | classification_loss = classification_loss.mean() 109 | regression_loss = regression_loss.mean() 110 | loss = classification_loss + regression_loss 111 | if bool(loss == 0): 112 | print('loss equal zero(0)') 113 | continue 114 | loss.backward() 115 | if (idx + 1) % args.grad_accumulation_steps == 0: 116 | torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1) 117 | optimizer.step() 118 | optimizer.zero_grad() 119 | 120 | total_loss.append(loss.item()) 121 | if(iteration % 300 == 0): 122 | print('{} iteration: training ...'.format(iteration)) 123 | ans = { 124 | 'epoch': epoch, 125 | 'iteration': iteration, 126 | 'cls_loss': classification_loss.item(), 127 | 'reg_loss': regression_loss.item(), 128 | 'mean_loss': np.mean(total_loss) 129 | } 130 | for key, value in ans.items(): 131 | print(' {:15s}: {}'.format(str(key), value)) 132 | iteration += 1 133 | scheduler.step(np.mean(total_loss)) 134 | result = { 135 | 'time': time.time() - start, 136 | 'loss': np.mean(total_loss) 137 | } 138 | for key, value in result.items(): 139 | print(' {:15s}: {}'.format(str(key), value)) 140 | 141 | 142 | def test(dataset, model, epoch, args): 143 | print("{} epoch: \t start validation....".format(epoch)) 144 | model = model.module 145 | model.eval() 146 | model.is_training = False 147 | with torch.no_grad(): 148 | if(args.dataset == 'VOC'): 149 | evaluate(dataset, model) 150 | else: 151 | evaluate_coco(dataset, model) 152 | 153 | 154 | def main_worker(gpu, ngpus_per_node, args): 155 | args.gpu = gpu 156 | if args.gpu is not None: 157 | print("Use GPU: {} for training".format(args.gpu)) 158 | 159 | if args.distributed: 160 | if args.dist_url == "env://" and args.rank == -1: 161 | # args.rank = int(os.environ["RANK"]) 162 | args.rank = 1 163 | if args.multiprocessing_distributed: 164 | # For multiprocessing distributed training, rank needs to be the 165 | # global rank among all the processes 166 | args.rank = args.rank * ngpus_per_node + gpu 167 | dist.init_process_group( 168 | backend=args.dist_backend, 169 | init_method=args.dist_url, 170 | world_size=args.world_size, 171 | rank=args.rank) 172 | 173 | # Training dataset 174 | train_dataset = [] 175 | if(args.dataset == 'VOC'): 176 | train_dataset = VOCDetection(root=args.dataset_root, transform=transforms.Compose( 177 | [Normalizer(), Augmenter(), Resizer()])) 178 | valid_dataset = VOCDetection(root=args.dataset_root, image_sets=[( 179 | '2007', 'test')], transform=transforms.Compose([Normalizer(), Resizer()])) 180 | args.num_class = train_dataset.num_classes() 181 | elif(args.dataset == 'COCO'): 182 | train_dataset = CocoDataset( 183 | root_dir=args.dataset_root, 184 | set_name='train2017', 185 | transform=transforms.Compose( 186 | [ 187 | Normalizer(), 188 | Augmenter(), 189 | Resizer()])) 190 | valid_dataset = CocoDataset( 191 | root_dir=args.dataset_root, 192 | 
set_name='val2017', 193 | transform=transforms.Compose( 194 | [ 195 | Normalizer(), 196 | Resizer()])) 197 | args.num_class = train_dataset.num_classes() 198 | 199 | train_loader = DataLoader(train_dataset, 200 | batch_size=args.batch_size, 201 | num_workers=args.workers, 202 | shuffle=True, 203 | collate_fn=collater, 204 | pin_memory=True) 205 | valid_loader = DataLoader(valid_dataset, 206 | batch_size=1, 207 | num_workers=args.workers, 208 | shuffle=False, 209 | collate_fn=collater, 210 | pin_memory=True) 211 | 212 | checkpoint = [] 213 | if(args.resume is not None): 214 | if os.path.isfile(args.resume): 215 | print("=> loading checkpoint '{}'".format(args.resume)) 216 | if args.gpu is None: 217 | checkpoint = torch.load(args.resume) 218 | else: 219 | # Map model to be loaded to specified single gpu. 220 | loc = 'cuda:{}'.format(args.gpu) 221 | checkpoint = torch.load(args.resume, map_location=loc) 222 | params = checkpoint['parser'] 223 | args.num_class = params.num_class 224 | args.network = params.network 225 | args.start_epoch = checkpoint['epoch'] + 1 226 | del params 227 | 228 | model = EfficientDet(num_classes=args.num_class, 229 | network=args.network, 230 | W_bifpn=EFFICIENTDET[args.network]['W_bifpn'], 231 | D_bifpn=EFFICIENTDET[args.network]['D_bifpn'], 232 | D_class=EFFICIENTDET[args.network]['D_class'] 233 | ) 234 | if(args.resume is not None): 235 | model.load_state_dict(checkpoint['state_dict']) 236 | del checkpoint 237 | if args.distributed: 238 | # For multiprocessing distributed, DistributedDataParallel constructor 239 | # should always set the single device scope, otherwise, 240 | # DistributedDataParallel will use all available devices. 241 | if args.gpu is not None: 242 | torch.cuda.set_device(args.gpu) 243 | model.cuda(args.gpu) 244 | # When using a single GPU per process and per 245 | # DistributedDataParallel, we need to divide the batch size 246 | # ourselves based on the total number of GPUs we have 247 | args.batch_size = int(args.batch_size / ngpus_per_node) 248 | args.workers = int( 249 | (args.workers + ngpus_per_node - 1) / ngpus_per_node) 250 | model = torch.nn.parallel.DistributedDataParallel( 251 | model, device_ids=[args.gpu], find_unused_parameters=True) 252 | print('Run with DistributedDataParallel with divice_ids....') 253 | else: 254 | model.cuda() 255 | # DistributedDataParallel will divide and allocate batch_size to all 256 | # available GPUs if device_ids are not set 257 | model = torch.nn.parallel.DistributedDataParallel(model) 258 | print('Run with DistributedDataParallel without device_ids....') 259 | elif args.gpu is not None: 260 | torch.cuda.set_device(args.gpu) 261 | model = model.cuda(args.gpu) 262 | else: 263 | model = model.cuda() 264 | print('Run with DataParallel ....') 265 | model = torch.nn.DataParallel(model).cuda() 266 | 267 | # define loss function (criterion) , optimizer, scheduler 268 | optimizer = optim.AdamW(model.parameters(), lr=args.lr) 269 | scheduler = optim.lr_scheduler.ReduceLROnPlateau( 270 | optimizer, patience=3, verbose=True) 271 | cudnn.benchmark = True 272 | 273 | for epoch in range(args.start_epoch, args.num_epoch): 274 | train(train_loader, model, scheduler, optimizer, epoch, args) 275 | 276 | if (epoch + 1) % 5 == 0: 277 | test(valid_dataset, model, epoch, args) 278 | 279 | state = { 280 | 'epoch': epoch, 281 | 'parser': args, 282 | 'state_dict': get_state_dict(model) 283 | } 284 | 285 | torch.save( 286 | state, 287 | os.path.join( 288 | args.save_folder, 289 | args.dataset, 290 | args.network, 291 | 
"checkpoint_{}.pth".format(epoch))) 292 | 293 | 294 | def main(): 295 | args = parser.parse_args() 296 | if(not os.path.exists(os.path.join(args.save_folder, args.dataset, args.network))): 297 | os.makedirs(os.path.join(args.save_folder, args.dataset, args.network)) 298 | if args.seed is not None: 299 | random.seed(args.seed) 300 | torch.manual_seed(args.seed) 301 | cudnn.deterministic = True 302 | warnings.warn('You have chosen to seed training. ' 303 | 'This will turn on the CUDNN deterministic setting, ' 304 | 'which can slow down your training considerably! ' 305 | 'You may see unexpected behavior when restarting ' 306 | 'from checkpoints.') 307 | 308 | if args.gpu is not None: 309 | warnings.warn('You have chosen a specific GPU. This will completely ' 310 | 'disable data parallelism.') 311 | os.environ['MASTER_ADDR'] = 'localhost' 312 | os.environ['MASTER_PORT'] = '12355' 313 | os.environ['WORLD_SIZE'] = '2' 314 | if args.dist_url == "env://" and args.world_size == -1: 315 | args.world_size = int(os.environ["WORLD_SIZE"]) 316 | 317 | args.distributed = args.world_size > 1 or args.multiprocessing_distributed 318 | ngpus_per_node = torch.cuda.device_count() 319 | if args.multiprocessing_distributed: 320 | # Since we have ngpus_per_node processes per node, the total world_size 321 | # needs to be adjusted accordingly 322 | args.world_size = ngpus_per_node * args.world_size 323 | # Use torch.multiprocessing.spawn to launch distributed processes: the 324 | # main_worker process function 325 | mp.spawn(main_worker, nprocs=ngpus_per_node, 326 | args=(ngpus_per_node, args)) 327 | else: 328 | # Simply call main_worker function 329 | main_worker(args.gpu, ngpus_per_node, args) 330 | 331 | 332 | if __name__ == "__main__": 333 | main() 334 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .helper import * 2 | from .util import * 3 | from .visualization import * 4 | from .vis_bbox import vis_bbox 5 | from .config_eff import * -------------------------------------------------------------------------------- /utils/config_eff.py: -------------------------------------------------------------------------------- 1 | EFFICIENTDET = { 2 | 'efficientdet-d0': {'input_size': 512, 3 | 'backbone': 'B0', 4 | 'W_bifpn': 64, 5 | 'D_bifpn': 2, 6 | 'D_class': 3}, 7 | 'efficientdet-d1': {'input_size': 640, 8 | 'backbone': 'B1', 9 | 'W_bifpn': 88, 10 | 'D_bifpn': 3, 11 | 'D_class': 3}, 12 | 'efficientdet-d2': {'input_size': 768, 13 | 'backbone': 'B2', 14 | 'W_bifpn': 112, 15 | 'D_bifpn': 4, 16 | 'D_class': 3}, 17 | 'efficientdet-d3': {'input_size': 896, 18 | 'backbone': 'B3', 19 | 'W_bifpn': 160, 20 | 'D_bifpn': 5, 21 | 'D_class': 4}, 22 | 'efficientdet-d4': {'input_size': 1024, 23 | 'backbone': 'B4', 24 | 'W_bifpn': 224, 25 | 'D_bifpn': 6, 26 | 'D_class': 4}, 27 | 'efficientdet-d5': {'input_size': 1280, 28 | 'backbone': 'B5', 29 | 'W_bifpn': 288, 30 | 'D_bifpn': 7, 31 | 'D_class': 4}, 32 | 'efficientdet-d6': {'input_size': 1408, 33 | 'backbone': 'B6', 34 | 'W_bifpn': 384, 35 | 'D_bifpn': 8, 36 | 'D_class': 5}, 37 | 'efficientdet-d7': {'input_size': 1636, 38 | 'backbone': 'B6', 39 | 'W_bifpn': 384, 40 | 'D_bifpn': 8, 41 | 'D_class': 5}, 42 | } -------------------------------------------------------------------------------- /utils/helper.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | import numpy as np 3 | import torch 4 
| import os 5 | import requests 6 | import socket 7 | import datetime 8 | import json 9 | 10 | 11 | def load_yaml(file_name): 12 | with open(file_name, 'r') as stream: 13 | config = yaml.load(stream, Loader=yaml.FullLoader) 14 | return config 15 | 16 | 17 | def init_seed(SEED=42): 18 | os.environ['PYTHONHASHSEED'] = str(SEED) 19 | np.random.seed(SEED) 20 | torch.manual_seed(SEED) 21 | torch.cuda.manual_seed(SEED) 22 | torch.backends.cudnn.deterministic = True 23 | 24 | 25 | def get_state_dict(model): 26 | if type(model) == torch.nn.DataParallel: 27 | state_dict = model.module.state_dict() 28 | else: 29 | state_dict = model.state_dict() 30 | return state_dict 31 | -------------------------------------------------------------------------------- /utils/metric.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import json 5 | import os 6 | 7 | import torch 8 | 9 | 10 | def compute_overlap(a, b): 11 | """ 12 | Parameters 13 | ---------- 14 | a: (N, 4) ndarray of float 15 | b: (K, 4) ndarray of float 16 | Returns 17 | ------- 18 | overlaps: (N, K) ndarray of overlap between boxes and query_boxes 19 | """ 20 | area = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1]) 21 | 22 | iw = np.minimum(np.expand_dims( 23 | a[:, 2], axis=1), b[:, 2]) - np.maximum(np.expand_dims(a[:, 0], 1), b[:, 0]) 24 | ih = np.minimum(np.expand_dims( 25 | a[:, 3], axis=1), b[:, 3]) - np.maximum(np.expand_dims(a[:, 1], 1), b[:, 1]) 26 | 27 | iw = np.maximum(iw, 0) 28 | ih = np.maximum(ih, 0) 29 | 30 | ua = np.expand_dims((a[:, 2] - a[:, 0]) * 31 | (a[:, 3] - a[:, 1]), axis=1) + area - iw * ih 32 | 33 | ua = np.maximum(ua, np.finfo(float).eps) 34 | 35 | intersection = iw * ih 36 | 37 | return intersection / ua 38 | 39 | 40 | def _compute_ap(recall, precision): 41 | """ Compute the average precision, given the recall and precision curves. 42 | Code originally from https://github.com/rbgirshick/py-faster-rcnn. 43 | # Arguments 44 | recall: The recall curve (list). 45 | precision: The precision curve (list). 46 | # Returns 47 | The average precision as computed in py-faster-rcnn. 48 | """ 49 | # correct AP calculation 50 | # first append sentinel values at the end 51 | mrec = np.concatenate(([0.], recall, [1.])) 52 | mpre = np.concatenate(([0.], precision, [0.])) 53 | 54 | # compute the precision envelope 55 | for i in range(mpre.size - 1, 0, -1): 56 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 57 | 58 | # to calculate area under PR curve, look for points 59 | # where X axis (recall) changes value 60 | i = np.where(mrec[1:] != mrec[:-1])[0] 61 | 62 | # and sum (\Delta recall) * prec 63 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 64 | return ap 65 | 66 | 67 | def _get_detections(dataset, model, score_threshold=0.05, max_detections=100, save_path=None): 68 | """ Get the detections from the retinanet using the generator. 69 | The result is a list of lists such that the size is: 70 | all_detections[num_images][num_classes] = detections[num_detections, 4 + num_classes] 71 | # Arguments 72 | dataset : The generator used to run images through the retinanet. 73 | retinanet : The retinanet to run on the images. 74 | score_threshold : The score confidence threshold to use. 75 | max_detections : The maximum number of detections to use per image. 76 | save_path : The path to save the images with visualized detections to. 77 | # Returns 78 | A list of lists containing the detections for each image in the generator. 
79 | """ 80 | all_detections = [[None for i in range( 81 | dataset.num_classes())] for j in range(len(dataset))] 82 | 83 | model.eval() 84 | 85 | with torch.no_grad(): 86 | for index in range(len(dataset)): 87 | data = dataset[index] 88 | scale = data['scale'] 89 | 90 | # run network 91 | scores, labels, boxes = model(data['img'].permute( 92 | 2, 0, 1).cuda().float().unsqueeze(dim=0)) 93 | scores = scores.cpu().numpy() 94 | labels = labels.cpu().numpy() 95 | boxes = boxes.cpu().numpy() 96 | 97 | # correct boxes for image scale 98 | boxes /= scale 99 | 100 | # select indices which have a score above the threshold 101 | indices = np.where(scores > score_threshold)[0] 102 | if indices.shape[0] > 0: 103 | # select those scores 104 | scores = scores[indices] 105 | 106 | # find the order with which to sort the scores 107 | scores_sort = np.argsort(-scores)[:max_detections] 108 | 109 | # select detections 110 | image_boxes = boxes[indices[scores_sort], :] 111 | image_scores = scores[scores_sort] 112 | image_labels = labels[indices[scores_sort]] 113 | image_detections = np.concatenate([image_boxes, np.expand_dims( 114 | image_scores, axis=1), np.expand_dims(image_labels, axis=1)], axis=1) 115 | 116 | # copy detections to all_detections 117 | for label in range(dataset.num_classes()): 118 | all_detections[index][label] = image_detections[image_detections[:, -1] == label, :-1] 119 | else: 120 | # copy detections to all_detections 121 | for label in range(dataset.num_classes()): 122 | all_detections[index][label] = np.zeros((0, 5)) 123 | 124 | print('{}/{}'.format(index + 1, len(dataset)), end='\r') 125 | 126 | return all_detections 127 | 128 | 129 | def _get_annotations(generator): 130 | """ Get the ground truth annotations from the generator. 131 | The result is a list of lists such that the size is: 132 | all_detections[num_images][num_classes] = annotations[num_detections, 5] 133 | # Arguments 134 | generator : The generator used to retrieve ground truth annotations. 135 | # Returns 136 | A list of lists containing the annotations for each image in the generator. 137 | """ 138 | all_annotations = [[None for i in range( 139 | generator.num_classes())] for j in range(len(generator))] 140 | 141 | for i in range(len(generator)): 142 | # load the annotations 143 | annotations = generator.load_annotations(i) 144 | 145 | # copy detections to all_annotations 146 | for label in range(generator.num_classes()): 147 | all_annotations[i][label] = annotations[annotations[:, 4] 148 | == label, :4].copy() 149 | 150 | print('{}/{}'.format(i + 1, len(generator)), end='\r') 151 | 152 | return all_annotations 153 | 154 | 155 | def evaluate( 156 | generator, 157 | retinanet, 158 | iou_threshold=0.5, 159 | score_threshold=0.05, 160 | max_detections=100, 161 | save_path=None 162 | ): 163 | """ Evaluate a given dataset using a given retinanet. 164 | # Arguments 165 | generator : The generator that represents the dataset to evaluate. 166 | retinanet : The retinanet to evaluate. 167 | iou_threshold : The threshold used to consider when a detection is positive or negative. 168 | score_threshold : The score confidence threshold to use for detections. 169 | max_detections : The maximum number of detections to use per image. 170 | save_path : The path to save images with visualized detections to. 171 | # Returns 172 | A dict mapping class names to mAP scores. 
173 | """ 174 | 175 | # gather all detections and annotations 176 | 177 | all_detections = _get_detections( 178 | generator, retinanet, score_threshold=score_threshold, max_detections=max_detections, save_path=save_path) 179 | all_annotations = _get_annotations(generator) 180 | 181 | average_precisions = {} 182 | 183 | for label in range(generator.num_classes()): 184 | false_positives = np.zeros((0,)) 185 | true_positives = np.zeros((0,)) 186 | scores = np.zeros((0,)) 187 | num_annotations = 0.0 188 | 189 | for i in range(len(generator)): 190 | detections = all_detections[i][label] 191 | annotations = all_annotations[i][label] 192 | num_annotations += annotations.shape[0] 193 | detected_annotations = [] 194 | 195 | for d in detections: 196 | scores = np.append(scores, d[4]) 197 | 198 | if annotations.shape[0] == 0: 199 | false_positives = np.append(false_positives, 1) 200 | true_positives = np.append(true_positives, 0) 201 | continue 202 | 203 | overlaps = compute_overlap( 204 | np.expand_dims(d, axis=0), annotations) 205 | assigned_annotation = np.argmax(overlaps, axis=1) 206 | max_overlap = overlaps[0, assigned_annotation] 207 | 208 | if max_overlap >= iou_threshold and assigned_annotation not in detected_annotations: 209 | false_positives = np.append(false_positives, 0) 210 | true_positives = np.append(true_positives, 1) 211 | detected_annotations.append(assigned_annotation) 212 | else: 213 | false_positives = np.append(false_positives, 1) 214 | true_positives = np.append(true_positives, 0) 215 | 216 | # no annotations -> AP for this class is 0 (is this correct?) 217 | if num_annotations == 0: 218 | average_precisions[label] = 0, 0 219 | continue 220 | 221 | # sort by score 222 | indices = np.argsort(-scores) 223 | false_positives = false_positives[indices] 224 | true_positives = true_positives[indices] 225 | 226 | # compute false positives and true positives 227 | false_positives = np.cumsum(false_positives) 228 | true_positives = np.cumsum(true_positives) 229 | 230 | # compute recall and precision 231 | recall = true_positives / num_annotations 232 | precision = true_positives / \ 233 | np.maximum(true_positives + false_positives, 234 | np.finfo(np.float64).eps) 235 | 236 | # compute average precision 237 | average_precision = _compute_ap(recall, precision) 238 | average_precisions[label] = average_precision, num_annotations 239 | 240 | print('\nmAP:') 241 | for label in range(generator.num_classes()): 242 | label_name = generator.label_to_name(label) 243 | print('{}: {}'.format(label_name, average_precisions[label][0])) 244 | 245 | return average_precisions 246 | -------------------------------------------------------------------------------- /utils/util.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | class MetricTracker: 5 | def __init__(self, *keys, writer=None): 6 | self.writer = writer 7 | self._data = pd.DataFrame( 8 | index=keys, columns=['total', 'counts', 'average']) 9 | self.reset() 10 | 11 | def reset(self): 12 | for col in self._data.columns: 13 | self._data[col].values[:] = 0 14 | 15 | def update(self, key, value, n=1): 16 | if self.writer is not None: 17 | self.writer.add_scalar(key, value) 18 | self._data.total[key] += value * n 19 | self._data.counts[key] += n 20 | self._data.average[key] = self._data.total[key] / \ 21 | self._data.counts[key] 22 | 23 | def avg(self, key): 24 | return self._data.average[key] 25 | 26 | def result(self): 27 | return dict(self._data.average) 28 | 
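Note: a minimal usage sketch for the MetricTracker above (not part of the repository; the keys 'cls_loss' and 'reg_loss' and the loss values are illustrative, the optional writer is left at its None default, and the import assumes the snippet runs from the repository root):

from utils.util import MetricTracker

# Track running averages of per-batch losses over one epoch.
tracker = MetricTracker('cls_loss', 'reg_loss')
for cls_loss, reg_loss in [(1.0, 0.5), (0.5, 0.25)]:   # stand-in batch losses
    tracker.update('cls_loss', cls_loss)               # adds value*n to total, n to counts
    tracker.update('reg_loss', reg_loss)
print(tracker.avg('cls_loss'))                         # 0.75
print(tracker.result())                                # {'cls_loss': 0.75, 'reg_loss': 0.375}
tracker.reset()                                        # zero the totals before the next epoch

Because update() divides the accumulated total by the accumulated count, passing n equal to the batch size gives a correctly weighted average when the last batch is smaller than the rest.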
-------------------------------------------------------------------------------- /utils/vis_bbox.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | from PIL import Image 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | 7 | def vis_bbox(img, bbox, label=None, score=None, 8 | instance_colors=None, alpha=1., linewidth=2., ax=None): 9 | """Visualize bounding boxes inside the image. 10 | Args: 11 | img (~numpy.ndarray): An array of shape :math:`(3, height, width)`. 12 | This is in RGB format and the range of its value is 13 | :math:`[0, 255]`. If this is :obj:`None`, no image is displayed. 14 | bbox (~numpy.ndarray): An array of shape :math:`(R, 4)`, where 15 | :math:`R` is the number of bounding boxes in the image. 16 | Each element is organized 17 | by :math:`(y_{min}, x_{min}, y_{max}, x_{max})` in the second axis. 18 | label (~numpy.ndarray): An integer array of shape :math:`(R,)`. 19 | The values correspond to id for label names stored in 20 | :obj:`label_names`. This is optional. 21 | score (~numpy.ndarray): A float array of shape :math:`(R,)`. 22 | Each value indicates how confident the prediction is. 23 | This is optional. 24 | label_names (iterable of strings): Name of labels ordered according 25 | to label ids. If this is :obj:`None`, labels will be skipped. 26 | instance_colors (iterable of tuples): List of colors. 27 | Each color is RGB format and the range of its values is 28 | :math:`[0, 255]`. The :obj:`i`-th element is the color used 29 | to visualize the :obj:`i`-th instance. 30 | If :obj:`instance_colors` is :obj:`None`, the red is used for 31 | all boxes. 32 | alpha (float): The value which determines transparency of the 33 | bounding boxes. The range of this value is :math:`[0, 1]`. 34 | linewidth (float): The thickness of the edges of the bounding boxes. 35 | ax (matplotlib.axes.Axis): The visualization is displayed on this 36 | axis. If this is :obj:`None` (default), a new axis is created. 37 | Returns: 38 | ~matploblib.axes.Axes: 39 | Returns the Axes object with the plot for further tweaking. 40 | from: https://github.com/chainer/chainercv 41 | """ 42 | 43 | if label is not None and not len(bbox) == len(label): 44 | raise ValueError('The length of label must be same as that of bbox') 45 | if score is not None and not len(bbox) == len(score): 46 | raise ValueError('The length of score must be same as that of bbox') 47 | 48 | # Returns newly instantiated matplotlib.axes.Axes object if ax is None 49 | if ax is None: 50 | fig = plt.figure() 51 | # ax = fig.add_subplot(1, 1, 1) 52 | h, w, _ = img.shape 53 | w_ = w / 60.0 54 | h_ = w_ * (h / w) 55 | fig.set_size_inches((w_, h_)) 56 | ax = plt.axes([0, 0, 1, 1]) 57 | ax.imshow(img.astype(np.uint8)) 58 | ax.axis('off') 59 | # If there is no bounding box to display, visualize the image and exit. 
60 | if len(bbox) == 0: 61 | return fig, ax 62 | 63 | if instance_colors is None: 64 | # Red 65 | instance_colors = np.zeros((len(bbox), 3), dtype=np.float32) 66 | instance_colors[:, 0] = 51 67 | instance_colors[:, 1] = 51 68 | instance_colors[:, 2] = 224 69 | instance_colors = np.array(instance_colors) 70 | 71 | for i, bb in enumerate(bbox): 72 | xy = (bb[0], bb[1]) 73 | height = bb[3] - bb[1] 74 | width = bb[2] - bb[0] 75 | color = instance_colors[i % len(instance_colors)] / 255 76 | ax.add_patch(plt.Rectangle( 77 | xy, width, height, fill=False, 78 | edgecolor=color, linewidth=linewidth, alpha=alpha)) 79 | 80 | caption = [] 81 | if label is not None: caption.append(str(label[i])) 82 | if score is not None and len(score) > 0: 83 | sc = score[i] 84 | caption.append('{}'.format(sc)) 85 | 86 | if len(caption) > 0: 87 | face_color = np.array([225, 51, 123])/255 88 | ax.text(bb[0], bb[1], 89 | ': '.join(caption), 90 | fontsize=12, 91 | color='black', 92 | style='italic', 93 | bbox={'facecolor': face_color, 'edgecolor': face_color, 'alpha': 1, 'pad': 0}) 94 | return fig, ax 95 | 96 | 97 | if __name__ == '__main__': 98 | img = cv2.imread('./../docs/output.png') 99 | print('img: ', img.shape) 100 | img = np.array(img) 101 | # img = img.convert('RGB') 102 | bbox = np.array([[50, 50, 200, 200]]) 103 | label = np.array(['toan']) 104 | score = np.array([100]) 105 | fig, ax = vis_bbox(img=img, 106 | bbox=bbox, 107 | label=label, 108 | score=score) 109 | fig.savefig('kaka.png') 110 | fig.show() 111 | plt.show() 112 | -------------------------------------------------------------------------------- /utils/visualization.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | from datetime import datetime 3 | 4 | 5 | class TensorboardWriter(): 6 | def __init__(self, log_dir, enabled): 7 | self.writer = None 8 | self.selected_module = "" 9 | 10 | if enabled: 11 | log_dir = str(log_dir) 12 | 13 | # Retrieve visualization writer. 14 | succeeded = False 15 | for module in ["torch.utils.tensorboard", "tensorboardX"]: 16 | try: 17 | self.writer = importlib.import_module( 18 | module).SummaryWriter(log_dir) 19 | succeeded = True 20 | break 21 | except ImportError: 22 | succeeded = False 23 | self.selected_module = module 24 | 25 | if not succeeded: 26 | message = "Warning: visualization (Tensorboard) is configured to use, but currently not installed on " \ 27 | "this machine. Please install TensorboardX with 'pip install tensorboardx', upgrade PyTorch to " \ 28 | "version >= 1.1 to use 'torch.utils.tensorboard' or turn off the option in the 'config.json' file." 29 | print(message) 30 | 31 | self.step = 0 32 | self.mode = '' 33 | 34 | self.tb_writer_ftns = { 35 | 'add_scalar', 'add_scalars', 'add_image', 'add_images', 'add_audio', 36 | 'add_text', 'add_histogram', 'add_pr_curve', 'add_embedding', 'add_graph' 37 | } 38 | self.tag_mode_exceptions = {'add_histogram', 'add_embedding'} 39 | self.timer = datetime.now() 40 | 41 | def set_step(self, step, mode='train'): 42 | self.mode = mode 43 | self.step = step 44 | if step == 0: 45 | self.timer = datetime.now() 46 | else: 47 | duration = datetime.now() - self.timer 48 | self.add_scalar('steps_per_sec', 1 / duration.total_seconds()) 49 | self.timer = datetime.now() 50 | 51 | def __getattr__(self, name): 52 | """ 53 | If visualization is configured to use: 54 | return add_data() methods of tensorboard with additional information (step, tag) added. 
55 | Otherwise: 56 | return a blank function handle that does nothing 57 | """ 58 | if name in self.tb_writer_ftns: 59 | add_data = getattr(self.writer, name, None) 60 | 61 | def wrapper(tag, data, *args, **kwargs): 62 | if add_data is not None: 63 | # add mode(train/valid) tag 64 | if name not in self.tag_mode_exceptions: 65 | tag = '{}/{}'.format(tag, self.mode) 66 | add_data(tag, data, self.step, *args, **kwargs) 67 | return wrapper 68 | else: 69 | # default action for returning methods defined in this class, set_step() for instance. 70 | try: 71 | attr = object.__getattribute__(self, name) 72 | except AttributeError: 73 | raise AttributeError("type object '{}' has no attribute '{}'".format( 74 | self.selected_module, name)) 75 | return attr 76 | --------------------------------------------------------------------------------
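Note: a minimal sketch of how the TensorboardWriter above is typically driven (not part of the repository; the log directory and loss values are placeholders, and the import assumes the snippet runs from the repository root):

from utils.visualization import TensorboardWriter

# enabled=True needs torch.utils.tensorboard or tensorboardX; if neither is
# available (or enabled=False), a warning is printed at construction and the
# add_* calls below become no-ops.
writer = TensorboardWriter('saved/log', enabled=True)
for step, loss in enumerate([0.9, 0.7, 0.5]):   # stand-in loss values
    writer.set_step(step, mode='train')         # also logs steps_per_sec for step > 0
    writer.add_scalar('loss', loss)             # proxied by __getattr__, written under the tag 'loss/train'

The same object can also be passed to MetricTracker(*keys, writer=writer) so that every update() call is mirrored to TensorBoard.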