├── .circleci
│   └── config.yml
├── .gitignore
├── .vscode
│   └── settings.json
├── LICENSE
├── README.md
├── configs
│   └── efficientdet-d0.yaml
├── datasets
│   ├── __init__.py
│   ├── augmentation.py
│   ├── coco.py
│   ├── coco_labels.txt
│   ├── scripts
│   │   ├── COCO2014.sh
│   │   ├── COCO2017.sh
│   │   ├── VOC2007.sh
│   │   └── VOC2012.sh
│   ├── visual_aug.py
│   └── voc0712.py
├── demo.py
├── docs
│   ├── arch.png
│   ├── compare.png
│   ├── demo.png
│   ├── output.png
│   ├── performance.png
│   └── pytoan.gif
├── eval.py
├── models
│   ├── __init__.py
│   ├── bifpn.py
│   ├── efficientdet.py
│   ├── efficientnet.py
│   ├── losses.py
│   ├── module.py
│   ├── retinahead.py
│   └── utils.py
├── requirements.txt
├── test.py
├── train.py
└── utils
    ├── __init__.py
    ├── config_eff.py
    ├── helper.py
    ├── metric.py
    ├── util.py
    ├── vis_bbox.py
    └── visualization.py
/.circleci/config.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | jobs:
3 |   build:
4 |     docker:
5 |       - image: toandaominh1997/pytoan:latest
6 |     steps:
7 |       - checkout # check out the code in the project directory
8 |       - run: |
9 |           pip install flake8
10 |           flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
11 |           python test.py
12 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
105 |
106 | # vscode
107 | .vscode/*
108 | !.vscode/settings.json
109 | !.vscode/tasks.json
110 | !.vscode/launch.json
111 | !.vscode/extensions.json
112 | *.code-workspace
113 |
114 |
115 | saved/
116 | weights/
117 | val2017_bbox_results.json
--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "python.pythonPath": "/home/toandm2/devtools/anaconda3/envs/pytoan/bin/python"
3 | }
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 Toan Dao Minh(bigkizd)
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # EfficientDet: Scalable and Efficient Object Detection, in PyTorch
2 | A [PyTorch](http://pytorch.org/) implementation of [EfficientDet](https://arxiv.org/abs/1911.09070) from the 2019 paper by Mingxing Tan, Ruoming Pang, and Quoc V. Le,
3 | Google Research, Brain Team. The official and original implementation: coming soon.
4 |
5 |
6 |
7 |
8 | # Fun with Demo:
9 | ```Shell
10 | python demo.py --weight ./checkpoint_VOC_efficientdet-d1_97.pth --threshold 0.6 --iou_threshold 0.5 --cam --score
11 | ```
12 |
13 |
14 |
15 |
16 |
17 |
18 | ### Table of Contents
19 | - Recent Update
20 | - Benchmarking
21 | - Installation
22 | - Prerequisites
24 | - Datasets
25 | - Train
26 | - Evaluate
27 | - Performance
28 | - Demo
29 | - Future Work
30 | - Reference
31 |
32 |
33 |
34 |
35 |
36 |
37 | ## Recent Update
38 | - [06/01/2020] Support both DistributedDataParallel and DataParallel, change augmentation, eval_voc
39 | - [17/12/2019] Add Fast normalized fusion, Augmentation with Ratio, Change RetinaHead, Fix Support EfficientDet-D0->D7
40 | - [7/12/2019] Support EfficientDet-D0, EfficientDet-D1, EfficientDet-D2, EfficientDet-D3, EfficientDet-D4, ... Support for configurable gradient accumulation steps and AdamW.
41 | ## Benchmarking
42 |
43 | We benchmark our code thoroughly on the PASCAL VOC and COCO datasets, using the EfficientDet family of network architectures (EfficientDet-D0 through D7). Below are the results:
44 |
45 | 1). PASCAL VOC 2007 (Train/Test: 07trainval/07test, scale=600, ROI Align)
46 |
47 | model | mAP |
48 | ---------|--------|
49 | [EfficientDet-D0(with Weight)](https://drive.google.com/file/d/1r7MAyBfG5OK_9F_cU8yActUWxTHOuOpL/view?usp=sharing) | 62.16
50 |
51 |
52 | ## Installation
53 | - Install [PyTorch](http://pytorch.org/) by selecting your environment on the website and running the appropriate command.
54 | - Clone this repository and install package [prerequisites](#prerequisites) below.
55 | - Then download the dataset by following the [instructions](#datasets) below.
56 | - Note: For training, we currently support [VOC](http://host.robots.ox.ac.uk/pascal/VOC/) and [COCO](http://mscoco.org/), and aim to add [ImageNet](http://www.image-net.org/) support soon.
57 |
58 | ### Prerequisites
59 |
60 | * Python 3.6+
61 | * PyTorch 1.3+
62 | * Torchvision 0.4.0+ (**a recent version is required because Torchvision now provides NMS.**)
63 | * requirements.txt
64 | ## Datasets
65 | To make things easy, we provide bash scripts to handle the dataset downloads and setup for you. We also provide simple dataset loaders that inherit `torch.utils.data.Dataset`, making them fully compatible with the `torchvision.datasets` [API](http://pytorch.org/docs/torchvision/datasets.html).
66 |
67 | ### VOC Dataset
68 | PASCAL VOC: Visual Object Classes
69 |
70 | ##### Download VOC2007 + VOC2012 trainval & test
71 | ```Shell
72 | # specify a directory for dataset to be downloaded into, else default is ~/data/
73 | sh datasets/scripts/VOC2007.sh
74 | sh datasets/scripts/VOC2012.sh
75 | ```
76 |
77 | ### COCO
78 | Microsoft COCO: Common Objects in Context
79 |
80 | ##### Download COCO 2017
81 | ```Shell
82 | # specify a directory for dataset to be downloaded into, else default is ~/data/
83 | sh datasets/scripts/COCO2017.sh
84 | ```
85 |
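
For reference, the loaders can also be wired up by hand. The snippet below is only an illustrative sketch (it is not taken from `train.py`, which is not reproduced here) and assumes the `Resizer`, `Normalizer`, `Augmenter`, and `collater` utilities exported by `datasets/__init__.py`:

```Python
# Illustrative only: build a VOC loader with the transforms shipped in datasets/.
from torch.utils.data import DataLoader
from torchvision import transforms

from datasets import VOCDetection, Normalizer, Augmenter, Resizer, collater

# Expects VOC2007 + VOC2012 under this root (see the download scripts above).
dataset = VOCDetection(root='/root/data/VOCdevkit/',
                       transform=transforms.Compose([Normalizer(), Augmenter(), Resizer()]))
loader = DataLoader(dataset, batch_size=8, shuffle=True,
                    num_workers=4, collate_fn=collater)

images, annotations = next(iter(loader))
# images: (B, 3, 512, 512); annotations: (B, max_boxes, 5), padded with -1
```
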
86 | ## Training EfficientDet
87 |
88 | - To train EfficientDet using the train script, simply specify the parameters listed in `train.py` as flags or change them manually.
89 |
90 | ```Shell
91 | python train.py --network efficientdet-d0 # Example
92 | ```
93 |
94 | - With VOC Dataset:
95 | ```Shell
96 | # DataParallel
97 | python train.py --dataset VOC --dataset_root /root/data/VOCdevkit/ --network efficientdet-d0 --batch_size 32
98 | # DistributedDataParallel with backend nccl
99 | python train.py --dataset VOC --dataset_root /root/data/VOCdevkit/ --network efficientdet-d0 --batch_size 32 --multiprocessing-distributed
100 | ```
101 | - With COCO Dataset:
102 | ```Shell
103 | # DataParallel
104 | python train.py --dataset COCO --dataset_root ~/data/coco/ --network efficientdet-d0 --batch_size 32
105 | # DistributedDataParallel with backend nccl
106 | python train.py --dataset COCO --dataset_root ~/data/coco/ --network efficientdet-d0 --batch_size 32 --multiprocessing-distributed
107 | ```
108 |
109 | ## Evaluation
110 | To evaluate a trained network:
111 | - With VOC Dataset:
112 | ```Shell
113 | python eval_voc.py --dataset_root ~/data/VOCdevkit --weight ./checkpoint_VOC_efficientdet-d0_261.pth
114 | ```
115 | - With COCO Dataset:
116 | coming soon.
117 | ## Demo
118 |
119 | ```Shell
120 | python demo.py --threshold 0.5 --iou_threshold 0.5 --score --weight checkpoint_VOC_efficientdet-d1_34.pth --file_name demo.png
121 | ```
122 |
123 | Output:
124 |
125 |
126 |
127 |
128 |
129 | ## Webcam Demo
130 |
131 | You can use a webcam in a real-time demo by running:
132 | ```Shell
133 | python demo.py --threshold 0.5 --iou_threshold 0.5 --cam --score --weight checkpoint_VOC_efficientdet-d1_34.pth
134 | ```
135 |
136 | ## Performance
137 |
138 |
139 |
140 |
141 | ## TODO
142 | We have accumulated the following to-do list, which we hope to complete in the near future:
143 | - Still to come:
144 | * [x] EfficientDet-[D0-7]
145 | * [x] GPU-Parallel
146 | * [x] NMS
147 | * [ ] Soft-NMS
148 | * [x] Pretrained model
149 | * [x] Demo
150 | * [ ] Model zoo
151 | * [ ] TorchScript
152 | * [ ] Mobile
153 | * [ ] C++ Onnx
154 |
155 |
156 | ## Authors
157 |
158 | * [**Toan Dao Minh**](https://github.com/toandaominh1997)
159 |
160 | ***Note:*** Unfortunately, this is just a hobby of ours and not a full-time job, so we'll do our best to keep things up to date, but no guarantees. That being said, thanks to everyone for your continued help and feedback as it is really appreciated. We will try to address everything as soon as possible.
161 |
162 | ## References
163 | - Mingxing Tan, Ruoming Pang, Quoc V. Le. "EfficientDet: Scalable and Efficient Object Detection." [arXiv:1911.09070](https://arxiv.org/abs/1911.09070).
164 | - A list of other great EfficientDet ports that were sources of inspiration:
165 | * [EfficientNet](https://github.com/lukemelas/EfficientNet-PyTorch)
166 | * [SSD.Pytorch](https://github.com/amdegroot/ssd.pytorch)
167 | * [mmdetection](https://github.com/open-mmlab/mmdetection)
168 | * [RetinaNet.Pytorch](https://github.com/yhenon/pytorch-retinanet)
169 | * [NMS.Torchvision](https://pytorch.org/docs/stable/torchvision/ops.html)
170 |
171 |
172 | ## Citation
173 |
174 | @article{efficientdetpytoan,
175 | Author = {Toan Dao Minh},
176 | Title = {A Pytorch Implementation of EfficientDet Object Detection},
177 | Journal = {github.com/toandaominh1997/EfficientDet.Pytorch},
178 | Year = {2019}
179 | }
180 |
--------------------------------------------------------------------------------
/configs/efficientdet-d0.yaml:
--------------------------------------------------------------------------------
1 | SEED: 42
2 | DEVICE: [0, 1]
3 | # DATASET
4 | DATA_TRAIN: VOC
5 |
6 | GRADIENT_ACCUMULATION_STEPS: 1
7 | GRADIENT_CLIPPING: 1
8 | NUM_EPOCH: 500
9 | EARLY_STOPPING: 50
10 | VALIDATION_FREQUENCY: 2
11 | TENSORBOARD: True
12 | CHECKPOINT_DIR: ./saved
13 | RESUME_PATH:
14 |
15 | TRAIN_DATASET:
16 |   PY: datasets
17 |   CLASS: spoofDataset
18 |   ARGS:
19 |     root_dir: ./
20 |     phase: train
21 |
22 | VALID_DATASET:
23 |   PY: datasets
24 |   CLASS: spoofDataset
25 |   ARGS:
26 |     root_dir: ./
27 |     phase: valid
28 |
29 | TEST_DATASET:
30 |   PY: datasets.dataset
31 |   CLASS: spoofDataset
32 |   ARGS:
33 |     root_dir: ./data
34 |     phase: valid
35 |
36 | TRAIN_DATALOADER:
37 |   PY: torch.utils.data
38 |   CLASS: DataLoader
39 |   ARGS:
40 |     batch_size: 8
41 |     shuffle: True
42 |     num_workers: 8
43 |     pin_memory: True
44 |
45 | VALID_DATALOADER:
46 |   PY: torch.utils.data
47 |   CLASS: DataLoader
48 |   ARGS:
49 |     batch_size: 8
50 |     shuffle: False
51 |     num_workers: 8
52 |     pin_memory: True
53 |
54 | TEST_DATALOADER:
55 |   PY: torch.utils.data
56 |   CLASS: DataLoader
57 |   ARGS:
58 |     batch_size: 8
59 |     shuffle: False
60 |     num_workers: 8
61 |
62 | MODEL:
63 |   PY: models
64 |   CLASS: EfficientDet
65 |   ARGS:
66 |     num_class: 21
67 |     levels: 3
68 |     num_channels: 128
69 |     model_name: efficientnet-b0
70 |
71 | CRITERION:
72 |   PY: layers.modules
73 |   CLASS: MultiBoxLoss
74 |   ARGS:
75 |     num_classes: 21
76 |     overlap_thresh: 0.5
77 |     prior_for_matching: True
78 |     bkg_label: 0
79 |     neg_mining: True
80 |     neg_pos: 3
81 |     neg_overlap: 0.5
82 |     encode_target: False
83 |     use_gpu: False
84 |
85 | OPTIMIZER:
86 |   PY: torch.optim
87 |   CLASS: AdamW
88 |   ARGS:
89 |     lr: 0.0001
90 |     weight_decay: 0.000005
91 |
92 | SCHEDULER:
93 |   PY: torch.optim.lr_scheduler
94 |   CLASS: ReduceLROnPlateau
95 |   ARGS:
96 |     factor: 0.15
97 |     patience: 2
98 |
--------------------------------------------------------------------------------
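A note on the config layout above: each `PY`/`CLASS`/`ARGS` block names a module, a class in that module, and the class's constructor arguments, which suggests objects are built dynamically at runtime. The actual loading is presumably handled by the helpers under `utils/` (not included in this dump); the sketch below only illustrates the idea, and `build_from_config` is a hypothetical name. Note that some entries in this particular file (e.g. `spoofDataset`) do not correspond to classes in the modules shown here.

```Python
# Hypothetical sketch of resolving a PY/CLASS/ARGS entry; not the repo's own loader.
import importlib
import yaml
import torch.nn as nn


def build_from_config(node, **overrides):
    """Instantiate node['CLASS'] from module node['PY'] with node['ARGS'] (+ overrides)."""
    cls = getattr(importlib.import_module(node['PY']), node['CLASS'])
    kwargs = dict(node.get('ARGS') or {})
    kwargs.update(overrides)
    return cls(**kwargs)


with open('configs/efficientdet-d0.yaml') as f:
    cfg = yaml.safe_load(f)

model = nn.Linear(4, 2)  # stand-in; the real model would come from cfg['MODEL']
optimizer = build_from_config(cfg['OPTIMIZER'], params=model.parameters())  # torch.optim.AdamW
scheduler = build_from_config(cfg['SCHEDULER'], optimizer=optimizer)        # ReduceLROnPlateau
```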
/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | from .voc0712 import VOCDetection, VOC_CLASSES
2 | from .augmentation import get_augumentation, detection_collate, Resizer, Normalizer, Augmenter, collater
3 | from .coco import CocoDataset
--------------------------------------------------------------------------------
/datasets/augmentation.py:
--------------------------------------------------------------------------------
1 | import albumentations as albu
2 | from albumentations.pytorch.transforms import ToTensor
3 | import torch
4 | import numpy as np
5 | import cv2
6 |
7 |
8 | def get_augumentation(phase, width=512, height=512, min_area=0., min_visibility=0.):
9 | list_transforms = []
10 | if phase == 'train':
11 | list_transforms.extend([
12 | albu.augmentations.transforms.LongestMaxSize(
13 | max_size=width, always_apply=True),
14 | albu.PadIfNeeded(min_height=height, min_width=width,
15 | always_apply=True, border_mode=0, value=[0, 0, 0]),
16 | albu.augmentations.transforms.RandomResizedCrop(
17 | height=height,
18 | width=width, p=0.3),
19 | albu.augmentations.transforms.Flip(),
20 | albu.augmentations.transforms.Transpose(),
21 | albu.OneOf([
22 | albu.RandomBrightnessContrast(brightness_limit=0.5,
23 | contrast_limit=0.4),
24 | albu.RandomGamma(gamma_limit=(50, 150)),
25 | albu.NoOp()
26 | ]),
27 | albu.OneOf([
28 | albu.RGBShift(r_shift_limit=20, b_shift_limit=15,
29 | g_shift_limit=15),
30 | albu.HueSaturationValue(hue_shift_limit=5,
31 | sat_shift_limit=5),
32 | albu.NoOp()
33 | ]),
34 | albu.CLAHE(p=0.8),
35 | albu.HorizontalFlip(p=0.5),
36 | albu.VerticalFlip(p=0.5),
37 | ])
38 | if(phase == 'test' or phase == 'valid'):
39 | list_transforms.extend([
40 | albu.Resize(height=height, width=width)
41 | ])
42 | list_transforms.extend([
43 | albu.Normalize(mean=(0.485, 0.456, 0.406),
44 | std=(0.229, 0.224, 0.225), p=1),
45 | ToTensor()
46 | ])
47 | if(phase == 'test'):
48 | return albu.Compose(list_transforms)
49 | return albu.Compose(list_transforms, bbox_params=albu.BboxParams(format='pascal_voc', min_area=min_area,
50 | min_visibility=min_visibility, label_fields=['category_id']))
51 |
52 |
53 | def detection_collate(batch):
54 | imgs = [s['image'] for s in batch]
55 | annots = [s['bboxes'] for s in batch]
56 | labels = [s['category_id'] for s in batch]
57 |
58 | max_num_annots = max(len(annot) for annot in annots)
59 | annot_padded = np.ones((len(annots), max_num_annots, 5))*-1
60 |
61 | if max_num_annots > 0:
62 | for idx, (annot, lab) in enumerate(zip(annots, labels)):
63 | if len(annot) > 0:
64 | annot_padded[idx, :len(annot), :4] = annot
65 | annot_padded[idx, :len(annot), 4] = lab
66 | return (torch.stack(imgs, 0), torch.FloatTensor(annot_padded))
67 |
68 |
69 | def collater(data):
70 | imgs = [s['img'] for s in data]
71 | annots = [s['annot'] for s in data]
72 | scales = [s['scale'] for s in data]
73 |
74 | imgs = torch.from_numpy(np.stack(imgs, axis=0))
75 |
76 | max_num_annots = max(annot.shape[0] for annot in annots)
77 |
78 | if max_num_annots > 0:
79 |
80 | annot_padded = torch.ones((len(annots), max_num_annots, 5)) * -1
81 |
82 | if max_num_annots > 0:
83 | for idx, annot in enumerate(annots):
84 | if annot.shape[0] > 0:
85 | annot_padded[idx, :annot.shape[0], :] = annot
86 | else:
87 | annot_padded = torch.ones((len(annots), 1, 5)) * -1
88 |
89 | imgs = imgs.permute(0, 3, 1, 2)
90 |
91 | return (imgs, torch.FloatTensor(annot_padded))
92 |
93 |
94 | class Resizer(object):
95 | """Convert ndarrays in sample to Tensors."""
96 |
97 | def __call__(self, sample, common_size=512):
98 | image, annots = sample['img'], sample['annot']
99 | height, width, _ = image.shape
100 | if height > width:
101 | scale = common_size / height
102 | resized_height = common_size
103 | resized_width = int(width * scale)
104 | else:
105 | scale = common_size / width
106 | resized_height = int(height * scale)
107 | resized_width = common_size
108 |
109 | image = cv2.resize(image, (resized_width, resized_height))
110 |
111 | new_image = np.zeros((common_size, common_size, 3))
112 | new_image[0:resized_height, 0:resized_width] = image
113 | annots[:, :4] *= scale
114 |
115 | return {'img': torch.from_numpy(new_image), 'annot': torch.from_numpy(annots), 'scale': scale}
116 |
117 |
118 | class Augmenter(object):
119 | """Convert ndarrays in sample to Tensors."""
120 |
121 | def __call__(self, sample, flip_x=0.5):
122 | if np.random.rand() < flip_x:
123 | image, annots = sample['img'], sample['annot']
124 | image = image[:, ::-1, :]
125 |
126 | rows, cols, channels = image.shape
127 |
128 | x1 = annots[:, 0].copy()
129 | x2 = annots[:, 2].copy()
130 |
131 | x_tmp = x1.copy()
132 |
133 | annots[:, 0] = cols - x2
134 | annots[:, 2] = cols - x_tmp
135 |
136 | sample = {'img': image, 'annot': annots}
137 |
138 | return sample
139 |
140 |
141 | class Normalizer(object):
142 |
143 | def __init__(self):
144 | self.mean = np.array([[[0.485, 0.456, 0.406]]])
145 | self.std = np.array([[[0.229, 0.224, 0.225]]])
146 |
147 | def __call__(self, sample):
148 | image, annots = sample['img'], sample['annot']
149 |
150 | return {'img': ((image.astype(np.float32) - self.mean) / self.std), 'annot': annots}
151 |
--------------------------------------------------------------------------------
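For reference, `get_augumentation` above returns an `albumentations.Compose` whose bbox parameters use the `pascal_voc` box format with a `category_id` label field, so it is called with keyword arguments. A minimal usage sketch (illustrative, with made-up data):

```Python
# Minimal sketch of calling the training transform defined above.
import numpy as np
from datasets.augmentation import get_augumentation

transform = get_augumentation(phase='train', width=512, height=512)

image = np.random.randint(0, 255, (480, 640, 3), dtype=np.uint8)  # HWC, uint8
bboxes = [[48, 60, 320, 400]]                                     # pascal_voc: [x_min, y_min, x_max, y_max]
out = transform(image=image, bboxes=bboxes, category_id=[12])

tensor_image = out['image']   # CHW torch tensor (albu.Normalize + ToTensor)
aug_boxes = out['bboxes']     # boxes remapped to the augmented image (may be filtered)
labels = out['category_id']
```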
/datasets/coco.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function, division
2 | import sys
3 | import os
4 | import torch
5 | import numpy as np
6 | import random
7 | import csv
8 |
9 | from torch.utils.data import Dataset, DataLoader
10 | from torchvision import transforms, utils
11 | from torch.utils.data.sampler import Sampler
12 |
13 | from pycocotools.coco import COCO
14 |
15 | import skimage.io
16 | import skimage.transform
17 | import skimage.color
18 | import skimage
19 | import cv2
20 | from PIL import Image
21 |
22 |
23 | class CocoDataset(Dataset):
24 | """Coco dataset."""
25 |
26 | def __init__(self, root_dir, set_name='train2017', transform=None):
27 | """
28 | Args:
29 | root_dir (string): COCO directory.
30 | transform (callable, optional): Optional transform to be applied
31 | on a sample.
32 | """
33 | self.root_dir = root_dir
34 | self.set_name = set_name
35 | self.transform = transform
36 |
37 | self.coco = COCO(os.path.join(self.root_dir, 'annotations',
38 | 'instances_' + self.set_name + '.json'))
39 | self.image_ids = self.coco.getImgIds()
40 |
41 | self.load_classes()
42 |
43 | def load_classes(self):
44 | # load class names (name -> label)
45 | categories = self.coco.loadCats(self.coco.getCatIds())
46 | categories.sort(key=lambda x: x['id'])
47 |
48 | self.classes = {}
49 | self.coco_labels = {}
50 | self.coco_labels_inverse = {}
51 | for c in categories:
52 | self.coco_labels[len(self.classes)] = c['id']
53 | self.coco_labels_inverse[c['id']] = len(self.classes)
54 | self.classes[c['name']] = len(self.classes)
55 |
56 | # also load the reverse (label -> name)
57 | self.labels = {}
58 | for key, value in self.classes.items():
59 | self.labels[value] = key
60 |
61 | def __len__(self):
62 | return len(self.image_ids)
63 |
64 | def __getitem__(self, idx):
65 |
66 | img = self.load_image(idx)
67 | annot = self.load_annotations(idx)
68 | sample = {'img': img, 'annot': annot}
69 | if self.transform:
70 | sample = self.transform(sample)
71 | return sample
72 |
73 | def load_image(self, image_index):
74 | image_info = self.coco.loadImgs(self.image_ids[image_index])[0]
75 | path = os.path.join(self.root_dir, 'images',
76 | self.set_name, image_info['file_name'])
77 | img = cv2.imread(path)
78 |
79 | if len(img.shape) == 2:
80 | img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
81 | return img
82 |
83 | def load_annotations(self, image_index):
84 | # get ground truth annotations
85 | annotations_ids = self.coco.getAnnIds(
86 | imgIds=self.image_ids[image_index], iscrowd=False)
87 | annotations = np.zeros((0, 5))
88 |
89 | # some images appear to miss annotations (like image with id 257034)
90 | if len(annotations_ids) == 0:
91 | return annotations
92 |
93 | # parse annotations
94 | coco_annotations = self.coco.loadAnns(annotations_ids)
95 | for idx, a in enumerate(coco_annotations):
96 |
97 | # some annotations have basically no width / height, skip them
98 | if a['bbox'][2] < 1 or a['bbox'][3] < 1:
99 | continue
100 |
101 | annotation = np.zeros((1, 5))
102 | annotation[0, :4] = a['bbox']
103 | annotation[0, 4] = self.coco_label_to_label(a['category_id'])
104 | annotations = np.append(annotations, annotation, axis=0)
105 |
106 | # transform from [x, y, w, h] to [x1, y1, x2, y2]
107 | annotations[:, 2] = annotations[:, 0] + annotations[:, 2]
108 | annotations[:, 3] = annotations[:, 1] + annotations[:, 3]
109 |
110 | return annotations
111 |
112 | def coco_label_to_label(self, coco_label):
113 | return self.coco_labels_inverse[coco_label]
114 |
115 | def label_to_coco_label(self, label):
116 | return self.coco_labels[label]
117 |
118 | def image_aspect_ratio(self, image_index):
119 | image = self.coco.loadImgs(self.image_ids[image_index])[0]
120 | return float(image['width']) / float(image['height'])
121 |
122 | def num_classes(self):
123 | return 80
124 |
125 |
126 | if __name__ == '__main__':
127 | from augmentation import get_augumentation
128 | dataset = CocoDataset(root_dir='/root/data/coco', set_name='trainval35k',
129 | transform=get_augumentation(phase='train'))
130 | sample = dataset[0]
131 | print('sample: ', sample)
132 |
--------------------------------------------------------------------------------
/datasets/coco_labels.txt:
--------------------------------------------------------------------------------
1 | 1,1,person
2 | 2,2,bicycle
3 | 3,3,car
4 | 4,4,motorcycle
5 | 5,5,airplane
6 | 6,6,bus
7 | 7,7,train
8 | 8,8,truck
9 | 9,9,boat
10 | 10,10,traffic light
11 | 11,11,fire hydrant
12 | 13,12,stop sign
13 | 14,13,parking meter
14 | 15,14,bench
15 | 16,15,bird
16 | 17,16,cat
17 | 18,17,dog
18 | 19,18,horse
19 | 20,19,sheep
20 | 21,20,cow
21 | 22,21,elephant
22 | 23,22,bear
23 | 24,23,zebra
24 | 25,24,giraffe
25 | 27,25,backpack
26 | 28,26,umbrella
27 | 31,27,handbag
28 | 32,28,tie
29 | 33,29,suitcase
30 | 34,30,frisbee
31 | 35,31,skis
32 | 36,32,snowboard
33 | 37,33,sports ball
34 | 38,34,kite
35 | 39,35,baseball bat
36 | 40,36,baseball glove
37 | 41,37,skateboard
38 | 42,38,surfboard
39 | 43,39,tennis racket
40 | 44,40,bottle
41 | 46,41,wine glass
42 | 47,42,cup
43 | 48,43,fork
44 | 49,44,knife
45 | 50,45,spoon
46 | 51,46,bowl
47 | 52,47,banana
48 | 53,48,apple
49 | 54,49,sandwich
50 | 55,50,orange
51 | 56,51,broccoli
52 | 57,52,carrot
53 | 58,53,hot dog
54 | 59,54,pizza
55 | 60,55,donut
56 | 61,56,cake
57 | 62,57,chair
58 | 63,58,couch
59 | 64,59,potted plant
60 | 65,60,bed
61 | 67,61,dining table
62 | 70,62,toilet
63 | 72,63,tv
64 | 73,64,laptop
65 | 74,65,mouse
66 | 75,66,remote
67 | 76,67,keyboard
68 | 77,68,cell phone
69 | 78,69,microwave
70 | 79,70,oven
71 | 80,71,toaster
72 | 81,72,sink
73 | 82,73,refrigerator
74 | 84,74,book
75 | 85,75,clock
76 | 86,76,vase
77 | 87,77,scissors
78 | 88,78,teddy bear
79 | 89,79,hair drier
80 | 90,80,toothbrush
--------------------------------------------------------------------------------
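Each row of `coco_labels.txt` above holds the COCO category id, a contiguous 1-to-80 label, and the class name. A tiny parsing sketch (illustrative; `CocoDataset.load_classes` builds its mapping from the COCO annotations instead of this file):

```Python
# Sketch: parse coco_labels.txt into {coco_category_id: (contiguous_label, class_name)}.
coco_id_to_label = {}
with open('datasets/coco_labels.txt') as f:
    for line in f:
        coco_id, label, name = line.strip().split(',', 2)
        coco_id_to_label[int(coco_id)] = (int(label), name)

print(coco_id_to_label[13])  # (12, 'stop sign')
```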
/datasets/scripts/COCO2014.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | start=`date +%s`
4 |
5 | # handle optional download dir
6 | if [ -z "$1" ]
7 | then
8 | # navigate to ~/data
9 | echo "navigating to ~/data/ ..."
10 | mkdir -p ~/data
11 | cd ~/data/
12 | mkdir -p ./coco
13 | cd ./coco
14 | mkdir -p ./images
15 | mkdir -p ./annotations
16 | else
17 | # check if specified dir is valid
18 | if [ ! -d $1 ]; then
19 | echo $1 " is not a valid directory"
20 | exit 0
21 | fi
22 | echo "navigating to " $1 " ..."
23 | cd $1
24 | fi
25 |
26 | if [ ! -d images ]
27 | then
28 | mkdir -p ./images
29 | fi
30 |
31 | # Download the image data.
32 | cd ./images
33 | echo "Downloading MSCOCO train images ..."
34 | curl -LO http://images.cocodataset.org/zips/train2014.zip
35 | echo "Downloading MSCOCO val images ..."
36 | curl -LO http://images.cocodataset.org/zips/val2014.zip
37 |
38 | cd ../
39 | if [ ! -d annotations ]
40 | then
41 | mkdir -p ./annotations
42 | fi
43 |
44 | # Download the annotation data.
45 | cd ./annotations
46 | echo "Downloading MSCOCO train/val annotations ..."
47 | curl -LO http://images.cocodataset.org/annotations/annotations_trainval2014.zip
48 | echo "Finished downloading. Now extracting ..."
49 |
50 | # Unzip data
51 | echo "Extracting train images ..."
52 | unzip ../images/train2014.zip -d ../images
53 | echo "Extracting val images ..."
54 | unzip ../images/val2014.zip -d ../images
55 | echo "Extracting annotations ..."
56 | unzip ./annotations_trainval2014.zip
57 |
58 | echo "Removing zip files ..."
59 | rm ../images/train2014.zip
60 | rm ../images/val2014.zip
61 | rm ./annotations_trainval2014.zip
62 |
63 | echo "Creating trainval35k dataset..."
64 |
65 | # Download annotations json
66 | echo "Downloading trainval35k annotations from S3"
67 | curl -LO https://s3.amazonaws.com/amdegroot-datasets/instances_trainval35k.json.zip
68 |
69 | # combine train and val
70 | echo "Combining train and val images"
71 | mkdir ../images/trainval35k
72 | cd ../images/train2014
73 | find -maxdepth 1 -name '*.jpg' -exec cp -t ../trainval35k {} + # dir too large for cp
74 | cd ../val2014
75 | find -maxdepth 1 -name '*.jpg' -exec cp -t ../trainval35k {} +
76 |
77 |
78 | end=`date +%s`
79 | runtime=$((end-start))
80 |
81 | echo "Completed in " $runtime " seconds"
82 |
--------------------------------------------------------------------------------
/datasets/scripts/COCO2017.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | start=`date +%s`
4 |
5 | # handle optional download dir
6 | if [ -z "$1" ]
7 | then
8 | # navigate to ~/data
9 | echo "navigating to ~/data/ ..."
10 | mkdir -p ~/data
11 | cd ~/data/
12 | mkdir -p ./coco
13 | cd ./coco
14 | mkdir -p ./images
15 | mkdir -p ./annotations
16 | else
17 | # check if specified dir is valid
18 | if [ ! -d $1 ]; then
19 | echo $1 " is not a valid directory"
20 | exit 0
21 | fi
22 | echo "navigating to " $1 " ..."
23 | cd $1
24 | fi
25 |
26 | if [ ! -d images ]
27 | then
28 | mkdir -p ./images
29 | fi
30 |
31 | # Download the image data.
32 | cd ./images
33 | echo "Downloading MSCOCO train images ..."
34 | curl -LO http://images.cocodataset.org/zips/train2017.zip
35 | echo "Downloading MSCOCO val images ..."
36 | curl -LO http://images.cocodataset.org/zips/val2017.zip
37 |
38 | cd ../
39 | if [ ! -d annotations ]
40 | then
41 | mkdir -p ./annotations
42 | fi
43 |
44 | # Download the annotation data.
45 | cd ./annotations
46 | echo "Downloading MSCOCO train/val annotations ..."
47 | curl -LO http://images.cocodataset.org/annotations/annotations_trainval2017.zip
48 | echo "Finished downloading. Now extracting ..."
49 |
50 | # Unzip data
51 | echo "Extracting train images ..."
52 | unzip ../images/train2017.zip -d ../images
53 | echo "Extracting val images ..."
54 | unzip ../images/val2017.zip -d ../images
55 | echo "Extracting annotations ..."
56 | unzip ./annotations_trainval2017.zip
57 |
58 | echo "Removing zip files ..."
59 | rm ../images/train2017.zip
60 | rm ../images/val2017.zip
61 | rm ./annotations_trainval2017.zip
62 |
63 | end=`date +%s`
64 | runtime=$((end-start))
65 | echo "Completed in " $runtime " seconds"
66 |
--------------------------------------------------------------------------------
/datasets/scripts/VOC2007.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Ellis Brown
3 |
4 | start=`date +%s`
5 |
6 | # handle optional download dir
7 | if [ -z "$1" ]
8 | then
9 | # navigate to ~/data
10 | echo "navigating to ~/data/ ..."
11 | mkdir -p ~/data
12 | cd ~/data/
13 | else
14 | # check if is valid directory
15 | if [ ! -d $1 ]; then
16 | echo $1 "is not a valid directory"
17 | exit 0
18 | fi
19 | echo "navigating to" $1 "..."
20 | cd $1
21 | fi
22 |
23 | echo "Downloading VOC2007 trainval ..."
24 | # Download the data.
25 | curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar
26 | echo "Downloading VOC2007 test data ..."
27 | curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar
28 | echo "Done downloading."
29 |
30 | # Extract data
31 | echo "Extracting trainval ..."
32 | tar -xvf VOCtrainval_06-Nov-2007.tar
33 | echo "Extracting test ..."
34 | tar -xvf VOCtest_06-Nov-2007.tar
35 | echo "removing tars ..."
36 | rm VOCtrainval_06-Nov-2007.tar
37 | rm VOCtest_06-Nov-2007.tar
38 |
39 | end=`date +%s`
40 | runtime=$((end-start))
41 |
42 | echo "Completed in" $runtime "seconds"
--------------------------------------------------------------------------------
/datasets/scripts/VOC2012.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Ellis Brown
3 |
4 | start=`date +%s`
5 |
6 | # handle optional download dir
7 | if [ -z "$1" ]
8 | then
9 | # navigate to ~/data
10 | echo "navigating to ~/data/ ..."
11 | mkdir -p ~/data
12 | cd ~/data/
13 | else
14 | # check if is valid directory
15 | if [ ! -d $1 ]; then
16 | echo $1 "is not a valid directory"
17 | exit 0
18 | fi
19 | echo "navigating to" $1 "..."
20 | cd $1
21 | fi
22 |
23 | echo "Downloading VOC2012 trainval ..."
24 | # Download the data.
25 | curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar
26 | echo "Done downloading."
27 |
28 |
29 | # Extract data
30 | echo "Extracting trainval ..."
31 | tar -xvf VOCtrainval_11-May-2012.tar
32 | echo "removing tar ..."
33 | rm VOCtrainval_11-May-2012.tar
34 |
35 | end=`date +%s`
36 | runtime=$((end-start))
37 |
38 | echo "Completed in" $runtime "seconds"
--------------------------------------------------------------------------------
/datasets/visual_aug.py:
--------------------------------------------------------------------------------
1 | import cv2
2 | from augmentation import get_augumentation
3 | from voc0712 import VOCDetection
4 | import matplotlib.pyplot as plt
5 | EFFICIENTDET = {
6 | 'efficientdet-d0': {'input_size': 512,
7 | 'backbone': 'B0',
8 | 'W_bifpn': 64,
9 | 'D_bifpn': 2,
10 | 'D_class': 3},
11 | 'efficientdet-d1': {'input_size': 640,
12 | 'backbone': 'B1',
13 | 'W_bifpn': 88,
14 | 'D_bifpn': 3,
15 | 'D_class': 3},
16 | 'efficientdet-d2': {'input_size': 768,
17 | 'backbone': 'B2',
18 | 'W_bifpn': 112,
19 | 'D_bifpn': 4,
20 | 'D_class': 3},
21 | }
22 |
23 |
24 | # Functions to visualize bounding boxes and class labels on an image.
25 | # Based on https://github.com/facebookresearch/Detectron/blob/master/detectron/utils/vis.py
26 |
27 | BOX_COLOR = (255, 0, 0)
28 | TEXT_COLOR = (255, 255, 255)
29 |
30 |
31 | def visualize_bbox(img, bbox, class_id, class_idx_to_name, color=BOX_COLOR, thickness=2):
32 | x_min, y_min, x_max, y_max = bbox
33 | x_min, x_max, y_min, y_max = int(x_min), int(x_max), int(y_min), int(y_max)
34 | cv2.rectangle(img, (x_min, y_min), (x_max, y_max),
35 | color=color, thickness=thickness)
36 | # class_name = class_idx_to_name[class_id]
37 | # ((text_width, text_height), _) = cv2.getTextSize(class_name, cv2.FONT_HERSHEY_SIMPLEX, 0.35, 1)
38 | # cv2.rectangle(img, (x_min, y_min - int(1.3 * text_height)), (x_min + text_width, y_min), BOX_COLOR, -1)
39 | # cv2.putText(img, class_name, (x_min, y_min - int(0.3 * text_height)), cv2.FONT_HERSHEY_SIMPLEX, 0.35,TEXT_COLOR, lineType=cv2.LINE_AA)
40 | return img
41 |
42 |
43 | def visualize(annotations, category_id_to_name):
44 | img = annotations['image'].copy()
45 | for idx, bbox in enumerate(annotations['bboxes']):
46 | img = visualize_bbox(
47 | img, bbox, annotations['category_id'][idx], category_id_to_name)
48 | # plt.figure(figsize=(12, 12))
49 | # plt.imshow(img)
50 | return img
51 |
52 |
53 | dataset_root = '/root/data/VOCdevkit'
54 | network = 'efficientdet-d0'
55 | dataset = VOCDetection(root=dataset_root,
56 | transform=get_augumentation(phase='train', width=EFFICIENTDET[network]['input_size'], height=EFFICIENTDET[network]['input_size']))
57 |
58 |
59 | def visual_data(data, name):
60 | img = data['image']
61 | bboxes = data['bboxes']
62 | annotations = {'image': data['image'], 'bboxes': data['bboxes'], 'category_id': range(
63 | len(data['bboxes']))}
64 | category_id_to_name = {v: v for v in range(len(data['bboxes']))}
65 |
66 | img = visualize(annotations, category_id_to_name)
67 | cv2.imwrite(name, img)
68 |
69 |
70 | for i in range(20, 25):
71 | visual_data(dataset[i], "name"+str(i)+".png")
72 |
--------------------------------------------------------------------------------
/datasets/voc0712.py:
--------------------------------------------------------------------------------
1 | import os.path as osp
2 | import sys
3 | import torch
4 | import torch.utils.data as data
5 | import cv2
6 | import numpy as np
7 | if sys.version_info[0] == 2:
8 | import xml.etree.cElementTree as ET
9 | else:
10 | import xml.etree.ElementTree as ET
11 |
12 | VOC_CLASSES = ( # always index 0
13 | 'aeroplane', 'bicycle', 'bird', 'boat',
14 | 'bottle', 'bus', 'car', 'cat', 'chair',
15 | 'cow', 'diningtable', 'dog', 'horse',
16 | 'motorbike', 'person', 'pottedplant',
17 | 'sheep', 'sofa', 'train', 'tvmonitor')
18 |
19 | # note: if you used our download scripts, this should be right
20 | VOC_ROOT = osp.join('/home/toandm2', "data/VOCdevkit/")
21 |
22 |
23 | class VOCAnnotationTransform(object):
24 | """Transforms a VOC annotation into a Tensor of bbox coords and label index
25 | Initilized with a dictionary lookup of classnames to indexes
26 | Arguments:
27 | class_to_ind (dict, optional): dictionary lookup of classnames -> indexes
28 | (default: alphabetic indexing of VOC's 20 classes)
29 | keep_difficult (bool, optional): keep difficult instances or not
30 | (default: False)
31 | height (int): height
32 | width (int): width
33 | """
34 |
35 | def __init__(self, class_to_ind=None, keep_difficult=False):
36 | self.class_to_ind = class_to_ind or dict(
37 | zip(VOC_CLASSES, range(len(VOC_CLASSES))))
38 | self.keep_difficult = keep_difficult
39 |
40 | def __call__(self, target, width, height):
41 | """
42 | Arguments:
43 | target (annotation) : the target annotation to be made usable
44 | will be an ET.Element
45 | Returns:
46 | a list containing lists of bounding boxes [bbox coords, class name]
47 | """
48 | res = []
49 | for obj in target.iter('object'):
50 | difficult = int(obj.find('difficult').text) == 1
51 | if not self.keep_difficult and difficult:
52 | continue
53 | name = obj.find('name').text.lower().strip()
54 | bbox = obj.find('bndbox')
55 |
56 | pts = ['xmin', 'ymin', 'xmax', 'ymax']
57 | bndbox = []
58 | for i, pt in enumerate(pts):
59 | cur_pt = float(bbox.find(pt).text) - 1
60 | # scale height or width
61 | # cur_pt = cur_pt / width if i % 2 == 0 else cur_pt / height
62 | bndbox.append(cur_pt)
63 | label_idx = self.class_to_ind[name]
64 | bndbox.append(label_idx)
65 | res += [bndbox] # [xmin, ymin, xmax, ymax, label_ind]
66 | # img_id = target.find('filename').text[:-4]
67 |
68 | return res # [[xmin, ymin, xmax, ymax, label_ind], ... ]
69 |
70 |
71 | class VOCDetection(data.Dataset):
72 | """VOC Detection Dataset Object
73 | input is image, target is annotation
74 | Arguments:
75 | root (string): filepath to VOCdevkit folder.
76 | image_set (string): imageset to use (eg. 'train', 'val', 'test')
77 | transform (callable, optional): transformation to perform on the
78 | input image
79 | target_transform (callable, optional): transformation to perform on the
80 | target `annotation`
81 | (eg: take in caption string, return tensor of word indices)
82 | dataset_name (string, optional): which dataset to load
83 | (default: 'VOC2007')
84 | """
85 |
86 | def __init__(self, root,
87 | image_sets=[('2007', 'trainval'), ('2012', 'trainval')],
88 | transform=None, target_transform=VOCAnnotationTransform(),
89 | dataset_name='VOC0712'):
90 | self.root = root
91 | self.image_set = image_sets
92 | self.transform = transform
93 | self.target_transform = target_transform
94 | self.name = dataset_name
95 | self._annopath = osp.join('%s', 'Annotations', '%s.xml')
96 | self._imgpath = osp.join('%s', 'JPEGImages', '%s.jpg')
97 | self.ids = list()
98 | for (year, name) in image_sets:
99 | rootpath = osp.join(self.root, 'VOC' + year)
100 | for line in open(osp.join(rootpath, 'ImageSets', 'Main', name + '.txt')):
101 | self.ids.append((rootpath, line.strip()))
102 |
103 | def __getitem__(self, index):
104 | img_id = self.ids[index]
105 |
106 | target = ET.parse(self._annopath % img_id).getroot()
107 | img = cv2.imread(self._imgpath % img_id)
108 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
109 | img = img.astype(np.float32)/255.
110 | height, width, channels = img.shape
111 |
112 | if self.target_transform is not None:
113 | target = self.target_transform(target, width, height)
114 | target = np.array(target)
115 | sample = {'img': img, 'annot': target}
116 | if self.transform is not None:
117 | sample = self.transform(sample)
118 | return sample  # NOTE: this early return makes the albumentations-style branch below unreachable
119 |
120 | bbox = target[:, :4]
121 | labels = target[:, 4]
122 |
123 | if self.transform is not None:
124 | annotation = {'image': img, 'bboxes': bbox, 'category_id': labels}
125 | augmentation = self.transform(**annotation)
126 | img = augmentation['image']
127 | bbox = augmentation['bboxes']
128 | labels = augmentation['category_id']
129 | return {'image': img, 'bboxes': bbox, 'category_id': labels}
130 |
131 | def __len__(self):
132 | return len(self.ids)
133 |
134 | def num_classes(self):
135 | return len(VOC_CLASSES)
136 |
137 | def label_to_name(self, label):
138 | return VOC_CLASSES[label]
139 |
140 | def load_annotations(self, index):
141 | img_id = self.ids[index]
142 | anno = ET.parse(self._annopath % img_id).getroot()
143 | gt = self.target_transform(anno, 1, 1)
144 | gt = np.array(gt)
145 | return gt
146 |
--------------------------------------------------------------------------------
/demo.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import cv2
3 | from PIL import Image
4 | import matplotlib.pyplot as plt
5 | from models import EfficientDet
6 | from torchvision import transforms
7 | import numpy as np
8 | import skimage
9 | from datasets import get_augumentation, VOC_CLASSES
10 | from timeit import default_timer as timer
11 | import argparse
12 | import copy
13 | from utils import vis_bbox, EFFICIENTDET
14 |
15 | parser = argparse.ArgumentParser(description='EfficientDet')
16 |
17 | parser.add_argument('-n', '--network', default='efficientdet-d0',
18 | help='efficientdet-[d0, d1, ..]')
19 | parser.add_argument('-s', '--score', default=True,
20 | action="store_true", help='Show score')
21 | parser.add_argument('-t', '--threshold', default=0.6,
22 | type=float, help='Visualization threshold')
23 | parser.add_argument('-it', '--iou_threshold', default=0.6,
24 | type=float, help='Visualization threshold')
25 | parser.add_argument('-w', '--weight', default='./weights/voc0712.pth',
26 | type=str, help='Weight model path')
27 | parser.add_argument('-c', '--cam',
28 | action="store_true", help='Use camera')
29 | parser.add_argument('-f', '--file_name', default='pic.jpg',
30 | help='Image path')
31 | parser.add_argument('--num_class', default=21, type=int,
32 | help='Number of class used in model')
33 | args = parser.parse_args()
34 |
35 |
36 | class Detect(object):
37 | """
38 | dir_name: Folder or image_file
39 | """
40 |
41 | def __init__(self, weights, num_class=21, network='efficientdet-d0', size_image=(512, 512)):
42 | super(Detect, self).__init__()
43 | self.weights = weights
44 | self.size_image = size_image
45 | self.device = torch.device(
46 | "cuda:0" if torch.cuda.is_available() else 'cpu')
47 | self.transform = get_augumentation(phase='test')
48 | if(self.weights is not None):
49 | print('Load pretrained Model')
50 | checkpoint = torch.load(
51 | self.weights, map_location=lambda storage, loc: storage)
52 | params = checkpoint['parser']
53 | num_class = params.num_class
54 | network = params.network
55 |
56 | self.model = EfficientDet(num_classes=num_class,
57 | network=network,
58 | W_bifpn=EFFICIENTDET[network]['W_bifpn'],
59 | D_bifpn=EFFICIENTDET[network]['D_bifpn'],
60 | D_class=EFFICIENTDET[network]['D_class'],
61 | is_training=False
62 | )
63 |
64 | if(self.weights is not None):
65 | state_dict = checkpoint['state_dict']
66 | self.model.load_state_dict(state_dict)
67 | if torch.cuda.is_available():
68 | self.model = self.model.cuda()
69 | self.model.eval()
70 |
71 | def process(self, file_name=None, img=None, show=False):
72 | if file_name is not None:
73 | img = cv2.imread(file_name)
74 | origin_img = copy.deepcopy(img)
75 | augmentation = self.transform(image=img)
76 | img = augmentation['image']
77 | img = img.to(self.device)
78 | img = img.unsqueeze(0)
79 |
80 | with torch.no_grad():
81 | scores, classification, transformed_anchors = self.model(img)
82 | bboxes = list()
83 | labels = list()
84 | bbox_scores = list()
85 | colors = list()
86 | for j in range(scores.shape[0]):
87 | bbox = transformed_anchors[[j], :][0].data.cpu().numpy()
88 | x1 = int(bbox[0]*origin_img.shape[1]/self.size_image[1])
89 | y1 = int(bbox[1]*origin_img.shape[0]/self.size_image[0])
90 | x2 = int(bbox[2]*origin_img.shape[1]/self.size_image[1])
91 | y2 = int(bbox[3]*origin_img.shape[0]/self.size_image[0])
92 | bboxes.append([x1, y1, x2, y2])
93 | label_name = VOC_CLASSES[int(classification[[j]])]
94 | labels.append(label_name)
95 |
96 | if(args.cam):
97 | cv2.rectangle(origin_img, (x1, y1),
98 | (x2, y2), (179, 255, 179), 2, 1)
99 | if args.score:
100 | score = np.around(
101 | scores[[j]].cpu().numpy(), decimals=2) * 100
102 | if(args.cam):
103 | labelSize, baseLine = cv2.getTextSize('{} {}'.format(
104 | label_name, int(score)), cv2.FONT_HERSHEY_SIMPLEX, 0.8, 2)
105 | cv2.rectangle(
106 | origin_img, (x1, y1-labelSize[1]), (x1+labelSize[0], y1+baseLine), (223, 128, 255), cv2.FILLED)
107 | cv2.putText(
108 | origin_img, '{} {}'.format(label_name, int(score)),
109 | (x1, y1), cv2.FONT_HERSHEY_SIMPLEX,
110 | 0.8, (0, 0, 0), 2
111 | )
112 | bbox_scores.append(int(score))
113 | else:
114 | if(args.cam):
115 | labelSize, baseLine = cv2.getTextSize('{}'.format(
116 | label_name), cv2.FONT_HERSHEY_SIMPLEX, 0.8, 2)
117 | cv2.rectangle(
118 | origin_img, (x1, y1-labelSize[1]), (x1+labelSize[0], y1+baseLine), (0, 102, 255), cv2.FILLED)
119 | cv2.putText(
120 | origin_img, '{}'.format(label_name),
121 | (x1, y1), cv2.FONT_HERSHEY_SIMPLEX,
122 | 0.8, (0, 0, 0), 2
123 | )
124 | if show:
125 | fig, ax = vis_bbox(img=origin_img, bbox=bboxes,
126 | label=labels, score=bbox_scores)
127 | fig.savefig('./docs/demo.png')
128 | plt.show()
129 | else:
130 | return origin_img
131 |
132 | def camera(self):
133 | cap = cv2.VideoCapture(0)
134 | if not cap.isOpened():
135 | print("Unable to open camera")
136 | exit(-1)
137 | count_tfps = 1
138 | accum_time = 0
139 | curr_fps = 0
140 | fps = "FPS: ??"
141 | prev_time = timer()
142 | while True:
143 | res, img = cap.read()
144 | curr_time = timer()
145 | exec_time = curr_time - prev_time
146 | prev_time = curr_time
147 | accum_time = accum_time + exec_time
148 | curr_fps = curr_fps + 1
149 |
150 | if accum_time > 1:
151 | accum_time = accum_time - 1
152 | fps = curr_fps
153 | curr_fps = 0
154 | if res:
155 | show_image = self.process(img=img)
156 | cv2.putText(
157 | show_image, "FPS: " + str(fps), (10, 20),
158 | cv2.FONT_HERSHEY_SIMPLEX, 0.9, (204, 51, 51), 2
159 | )
160 |
161 | cv2.imshow("Detection", show_image)
162 | k = cv2.waitKey(1)
163 | if k == 27:
164 | break
165 | else:
166 | print("Unable to read image")
167 | exit(-1)
168 | count_tfps += 1
169 | cap.release()
170 | cv2.destroyAllWindows()
171 |
172 |
173 | if __name__ == '__main__':
174 | detect = Detect(weights=args.weight)
175 | print('cam: ', args.cam)
176 | if args.cam:
177 | detect.camera()
178 | else:
179 | detect.process(file_name=args.file_name, show=True)
180 |
--------------------------------------------------------------------------------
/docs/arch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toandaominh1997/EfficientDet.Pytorch/fbe56e58c9a2749520303d2d380427e5f01305ba/docs/arch.png
--------------------------------------------------------------------------------
/docs/compare.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toandaominh1997/EfficientDet.Pytorch/fbe56e58c9a2749520303d2d380427e5f01305ba/docs/compare.png
--------------------------------------------------------------------------------
/docs/demo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toandaominh1997/EfficientDet.Pytorch/fbe56e58c9a2749520303d2d380427e5f01305ba/docs/demo.png
--------------------------------------------------------------------------------
/docs/output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toandaominh1997/EfficientDet.Pytorch/fbe56e58c9a2749520303d2d380427e5f01305ba/docs/output.png
--------------------------------------------------------------------------------
/docs/performance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toandaominh1997/EfficientDet.Pytorch/fbe56e58c9a2749520303d2d380427e5f01305ba/docs/performance.png
--------------------------------------------------------------------------------
/docs/pytoan.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toandaominh1997/EfficientDet.Pytorch/fbe56e58c9a2749520303d2d380427e5f01305ba/docs/pytoan.gif
--------------------------------------------------------------------------------
/eval.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 |
4 | import numpy as np
5 | import torch
6 | from torch.utils.data import DataLoader
7 | from torchvision import transforms
8 | from tqdm import tqdm
9 | from pycocotools.cocoeval import COCOeval
10 | import json
11 |
12 | from datasets import (Augmenter, CocoDataset, Normalizer,
13 | Resizer, VOCDetection, collater, detection_collate,
14 | get_augumentation)
15 | from models.efficientdet import EfficientDet
16 | from utils import EFFICIENTDET, get_state_dict
17 |
18 |
19 | def compute_overlap(a, b):
20 | """
21 | Parameters
22 | ----------
23 | a: (N, 4) ndarray of float
24 | b: (K, 4) ndarray of float
25 | Returns
26 | -------
27 | overlaps: (N, K) ndarray of overlap between boxes and query_boxes
28 | """
29 | area = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1])
30 |
31 | iw = np.minimum(np.expand_dims(
32 | a[:, 2], axis=1), b[:, 2]) - np.maximum(np.expand_dims(a[:, 0], 1), b[:, 0])
33 | ih = np.minimum(np.expand_dims(
34 | a[:, 3], axis=1), b[:, 3]) - np.maximum(np.expand_dims(a[:, 1], 1), b[:, 1])
35 |
36 | iw = np.maximum(iw, 0)
37 | ih = np.maximum(ih, 0)
38 |
39 | ua = np.expand_dims((a[:, 2] - a[:, 0]) *
40 | (a[:, 3] - a[:, 1]), axis=1) + area - iw * ih
41 |
42 | ua = np.maximum(ua, np.finfo(float).eps)
43 |
44 | intersection = iw * ih
45 |
46 | return intersection / ua
47 |
48 |
49 | def _compute_ap(recall, precision):
50 | """ Compute the average precision, given the recall and precision curves.
51 | Code originally from https://github.com/rbgirshick/py-faster-rcnn.
52 | # Arguments
53 | recall: The recall curve (list).
54 | precision: The precision curve (list).
55 | # Returns
56 | The average precision as computed in py-faster-rcnn.
57 | """
58 | # correct AP calculation
59 | # first append sentinel values at the end
60 | mrec = np.concatenate(([0.], recall, [1.]))
61 | mpre = np.concatenate(([0.], precision, [0.]))
62 |
63 | # compute the precision envelope
64 | for i in range(mpre.size - 1, 0, -1):
65 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])
66 |
67 | # to calculate area under PR curve, look for points
68 | # where X axis (recall) changes value
69 | i = np.where(mrec[1:] != mrec[:-1])[0]
70 |
71 | # and sum (\Delta recall) * prec
72 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])
73 | return ap
74 |
75 |
76 | def _get_detections(dataset, retinanet, score_threshold=0.05, max_detections=100, save_path=None):
77 | """ Get the detections from the retinanet using the generator.
78 | The result is a list of lists such that the size is:
79 | all_detections[num_images][num_classes] = detections[num_detections, 4 + num_classes]
80 | # Arguments
81 | dataset : The generator used to run images through the retinanet.
82 | retinanet : The retinanet to run on the images.
83 | score_threshold : The score confidence threshold to use.
84 | max_detections : The maximum number of detections to use per image.
85 | save_path : The path to save the images with visualized detections to.
86 | # Returns
87 | A list of lists containing the detections for each image in the generator.
88 | """
89 | all_detections = [[None for i in range(
90 | dataset.num_classes())] for j in range(len(dataset))]
91 |
92 | retinanet.eval()
93 |
94 | with torch.no_grad():
95 |
96 | for index in range(len(dataset)):
97 | data = dataset[index]
98 | scale = data['scale']
99 |
100 | # run network
101 | scores, labels, boxes = retinanet(data['img'].permute(
102 | 2, 0, 1).cuda().float().unsqueeze(dim=0))
103 | scores = scores.cpu().numpy()
104 | labels = labels.cpu().numpy()
105 | boxes = boxes.cpu().numpy()
106 |
107 | # correct boxes for image scale
108 | boxes /= scale
109 |
110 | # select indices which have a score above the threshold
111 | indices = np.where(scores > score_threshold)[0]
112 | if indices.shape[0] > 0:
113 | # select those scores
114 | scores = scores[indices]
115 |
116 | # find the order with which to sort the scores
117 | scores_sort = np.argsort(-scores)[:max_detections]
118 |
119 | # select detections
120 | image_boxes = boxes[indices[scores_sort], :]
121 | image_scores = scores[scores_sort]
122 | image_labels = labels[indices[scores_sort]]
123 | image_detections = np.concatenate([image_boxes, np.expand_dims(
124 | image_scores, axis=1), np.expand_dims(image_labels, axis=1)], axis=1)
125 |
126 | # copy detections to all_detections
127 | for label in range(dataset.num_classes()):
128 | all_detections[index][label] = image_detections[image_detections[:, -1] == label, :-1]
129 | else:
130 | # copy detections to all_detections
131 | for label in range(dataset.num_classes()):
132 | all_detections[index][label] = np.zeros((0, 5))
133 |
134 | print('{}/{}'.format(index + 1, len(dataset)), end='\r')
135 |
136 | return all_detections
137 |
138 |
139 | def _get_annotations(generator):
140 | """ Get the ground truth annotations from the generator.
141 | The result is a list of lists such that the size is:
142 | all_detections[num_images][num_classes] = annotations[num_detections, 5]
143 | # Arguments
144 | generator : The generator used to retrieve ground truth annotations.
145 | # Returns
146 | A list of lists containing the annotations for each image in the generator.
147 | """
148 | all_annotations = [[None for i in range(
149 | generator.num_classes())] for j in range(len(generator))]
150 |
151 | for i in range(len(generator)):
152 | # load the annotations
153 | annotations = generator.load_annotations(i)
154 |
155 | # copy detections to all_annotations
156 | for label in range(generator.num_classes()):
157 | all_annotations[i][label] = annotations[annotations[:, 4]
158 | == label, :4].copy()
159 |
160 | print('{}/{}'.format(i + 1, len(generator)), end='\r')
161 |
162 | return all_annotations
163 |
164 |
165 | def evaluate(
166 | generator,
167 | retinanet,
168 | iou_threshold=0.5,
169 | score_threshold=0.05,
170 | max_detections=100,
171 | save_path=None
172 | ):
173 | """ Evaluate a given dataset using a given retinanet.
174 | # Arguments
175 | generator : The generator that represents the dataset to evaluate.
176 | retinanet : The retinanet to evaluate.
177 | iou_threshold : The threshold used to consider when a detection is positive or negative.
178 | score_threshold : The score confidence threshold to use for detections.
179 | max_detections : The maximum number of detections to use per image.
180 | save_path : The path to save images with visualized detections to.
181 | # Returns
182 | A dict mapping class names to mAP scores.
183 | """
184 |
185 | # gather all detections and annotations
186 |
187 | all_detections = _get_detections(
188 | generator, retinanet, score_threshold=score_threshold, max_detections=max_detections, save_path=save_path)
189 | all_annotations = _get_annotations(generator)
190 |
191 | average_precisions = {}
192 |
193 | for label in range(generator.num_classes()):
194 | false_positives = np.zeros((0,))
195 | true_positives = np.zeros((0,))
196 | scores = np.zeros((0,))
197 | num_annotations = 0.0
198 |
199 | for i in range(len(generator)):
200 | detections = all_detections[i][label]
201 | annotations = all_annotations[i][label]
202 | num_annotations += annotations.shape[0]
203 | detected_annotations = []
204 |
205 | for d in detections:
206 | scores = np.append(scores, d[4])
207 |
208 | if annotations.shape[0] == 0:
209 | false_positives = np.append(false_positives, 1)
210 | true_positives = np.append(true_positives, 0)
211 | continue
212 |
213 | overlaps = compute_overlap(
214 | np.expand_dims(d, axis=0), annotations)
215 | assigned_annotation = np.argmax(overlaps, axis=1)
216 | max_overlap = overlaps[0, assigned_annotation]
217 |
218 | if max_overlap >= iou_threshold and assigned_annotation not in detected_annotations:
219 | false_positives = np.append(false_positives, 0)
220 | true_positives = np.append(true_positives, 1)
221 | detected_annotations.append(assigned_annotation)
222 | else:
223 | false_positives = np.append(false_positives, 1)
224 | true_positives = np.append(true_positives, 0)
225 |
226 | # no annotations -> AP for this class is 0 (is this correct?)
227 | if num_annotations == 0:
228 | average_precisions[label] = 0, 0
229 | continue
230 |
231 | # sort by score
232 | indices = np.argsort(-scores)
233 | false_positives = false_positives[indices]
234 | true_positives = true_positives[indices]
235 |
236 | # compute false positives and true positives
237 | false_positives = np.cumsum(false_positives)
238 | true_positives = np.cumsum(true_positives)
239 |
240 | # compute recall and precision
241 | recall = true_positives / num_annotations
242 | precision = true_positives / \
243 | np.maximum(true_positives + false_positives,
244 | np.finfo(np.float64).eps)
245 |
246 | # compute average precision
247 | average_precision = _compute_ap(recall, precision)
248 | average_precisions[label] = average_precision, num_annotations
249 |
250 | print('\nmAP:')
251 | avg_mAP = []
252 | for label in range(generator.num_classes()):
253 | label_name = generator.label_to_name(label)
254 | print('{}: {}'.format(label_name, average_precisions[label][0]))
255 | avg_mAP.append(average_precisions[label][0])
256 | print('avg mAP: {}'.format(np.mean(avg_mAP)))
257 | return np.mean(avg_mAP), average_precisions
258 |
259 |
260 | def evaluate_coco(dataset, model, threshold=0.05):
261 |
262 | model.eval()
263 |
264 | with torch.no_grad():
265 |
266 | # start collecting results
267 | results = []
268 | image_ids = []
269 |
270 | for index in range(len(dataset)):
271 | data = dataset[index]
272 | scale = data['scale']
273 |
274 | # run network
275 | scores, labels, boxes = model(data['img'].permute(
276 | 2, 0, 1).cuda().float().unsqueeze(dim=0))
277 | scores = scores.cpu()
278 | labels = labels.cpu()
279 | boxes = boxes.cpu()
280 |
281 | # correct boxes for image scale
282 | boxes /= scale
283 |
284 | if boxes.shape[0] > 0:
285 | # change to (x, y, w, h) (MS COCO standard)
286 | boxes[:, 2] -= boxes[:, 0]
287 | boxes[:, 3] -= boxes[:, 1]
288 |
289 | # compute predicted labels and scores
290 | # for box, score, label in zip(boxes[0], scores[0], labels[0]):
291 | for box_id in range(boxes.shape[0]):
292 | score = float(scores[box_id])
293 | label = int(labels[box_id])
294 | box = boxes[box_id, :]
295 |
296 | # scores are sorted, so we can break
297 | if score < threshold:
298 | break
299 |
300 | # append detection for each positively labeled class
301 | image_result = {
302 | 'image_id': dataset.image_ids[index],
303 | 'category_id': dataset.label_to_coco_label(label),
304 | 'score': float(score),
305 | 'bbox': box.tolist(),
306 | }
307 |
308 | # append detection to results
309 | results.append(image_result)
310 |
311 | # append image to list of processed images
312 | image_ids.append(dataset.image_ids[index])
313 |
314 | # print progress
315 | print('{}/{}'.format(index, len(dataset)), end='\r')
316 |
317 | if not len(results):
318 | return
319 |
320 | # write output
321 | json.dump(results, open('{}_bbox_results.json'.format(
322 | dataset.set_name), 'w'), indent=4)
323 |
324 | # load results in COCO evaluation tool
325 | coco_true = dataset.coco
326 | coco_pred = coco_true.loadRes(
327 | '{}_bbox_results.json'.format(dataset.set_name))
328 |
329 | # run COCO evaluation
330 | coco_eval = COCOeval(coco_true, coco_pred, 'bbox')
331 | coco_eval.params.imgIds = image_ids
332 | coco_eval.evaluate()
333 | coco_eval.accumulate()
334 | coco_eval.summarize()
335 |
336 | model.train()
337 |
338 | return
339 |
340 |
341 | if __name__ == '__main__':
342 | parser = argparse.ArgumentParser(
343 |         description='EfficientDet Evaluation With Pytorch')
344 | train_set = parser.add_mutually_exclusive_group()
345 | parser.add_argument('--dataset', default='VOC', choices=['VOC', 'COCO'],
346 | type=str, help='VOC or COCO')
347 | parser.add_argument('--dataset_root', default='/root/data/VOCdevkit/',
348 | help='Dataset root directory path [/root/data/VOCdevkit/, /root/data/coco/]')
349 |     parser.add_argument('-t', '--threshold', default=0.4,
350 |                         type=float, help='Score threshold for keeping detections')
351 |     parser.add_argument('-it', '--iou_threshold', default=0.5,
352 |                         type=float, help='IoU threshold used for NMS')
353 | parser.add_argument('--weight', default='./checkpoint_VOC_efficientdet-d0_248.pth', type=str,
354 |                         help='Checkpoint state_dict file to evaluate')
355 | args = parser.parse_args()
356 |
357 | if(args.weight is not None):
358 | resume_path = str(args.weight)
359 | print("Loading checkpoint: {} ...".format(resume_path))
360 | checkpoint = torch.load(
361 | args.weight, map_location=lambda storage, loc: storage)
362 | params = checkpoint['parser']
363 | args.num_class = params.num_class
364 | args.network = params.network
365 | model = EfficientDet(
366 | num_classes=args.num_class,
367 | network=args.network,
368 | W_bifpn=EFFICIENTDET[args.network]['W_bifpn'],
369 | D_bifpn=EFFICIENTDET[args.network]['D_bifpn'],
370 | D_class=EFFICIENTDET[args.network]['D_class'],
371 | is_training=False,
372 | threshold=args.threshold,
373 | iou_threshold=args.iou_threshold)
374 | model.load_state_dict(checkpoint['state_dict'])
375 | model = model.cuda()
376 | if(args.dataset == 'VOC'):
377 | valid_dataset = VOCDetection(root=args.dataset_root, image_sets=[('2007', 'test')],
378 | transform=transforms.Compose([Normalizer(), Resizer()]))
379 | evaluate(valid_dataset, model)
380 | else:
381 | valid_dataset = CocoDataset(root_dir=args.dataset_root, set_name='val2017',
382 | transform=transforms.Compose([Normalizer(), Resizer()]))
383 | evaluate_coco(valid_dataset, model)
384 |
--------------------------------------------------------------------------------
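The per-class loop in evaluate() above reduces to: sort detections by confidence, mark each as TP or FP against unclaimed ground truth, take cumulative sums, and turn the resulting precision/recall curve into an AP value. The toy sketch below reproduces that bookkeeping with made-up numbers; the VOC-style precision-envelope interpolation at the end is an assumption and may differ in detail from the repo's _compute_ap.

# Toy illustration of the TP/FP -> precision/recall -> AP pipeline used in evaluate().
import numpy as np

scores = np.array([0.9, 0.8, 0.7, 0.6])          # detection confidences (one class)
true_positives = np.array([1, 0, 1, 0])          # 1 if matched an unclaimed GT with IoU >= 0.5
false_positives = 1 - true_positives
num_annotations = 3.0                            # ground-truth boxes for this class

order = np.argsort(-scores)                      # sort by descending confidence
tp = np.cumsum(true_positives[order])
fp = np.cumsum(false_positives[order])
recall = tp / num_annotations
precision = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)

# VOC-style AP: area under the precision envelope (an approximation of _compute_ap)
mrec = np.concatenate(([0.0], recall, [1.0]))
mpre = np.concatenate(([0.0], precision, [0.0]))
for i in range(mpre.size - 1, 0, -1):
    mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])
idx = np.where(mrec[1:] != mrec[:-1])[0]
ap = np.sum((mrec[idx + 1] - mrec[idx]) * mpre[idx + 1])
print('AP: {:.3f}'.format(ap))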
/models/__init__.py:
--------------------------------------------------------------------------------
1 | from .efficientdet import EfficientDet
--------------------------------------------------------------------------------
/models/bifpn.py:
--------------------------------------------------------------------------------
1 | import torch.nn as nn
2 | import torch.nn.functional as F
3 |
4 |
5 | from .module import ConvModule, xavier_init
6 | import torch
7 |
8 |
9 | class BIFPN(nn.Module):
10 | def __init__(self,
11 | in_channels,
12 | out_channels,
13 | num_outs,
14 | start_level=0,
15 | end_level=-1,
16 | stack=1,
17 | add_extra_convs=False,
18 | extra_convs_on_inputs=True,
19 | relu_before_extra_convs=False,
20 | no_norm_on_lateral=False,
21 | conv_cfg=None,
22 | norm_cfg=None,
23 | activation=None):
24 | super(BIFPN, self).__init__()
25 | assert isinstance(in_channels, list)
26 | self.in_channels = in_channels
27 | self.out_channels = out_channels
28 | self.num_ins = len(in_channels)
29 | self.num_outs = num_outs
30 | self.activation = activation
31 | self.relu_before_extra_convs = relu_before_extra_convs
32 | self.no_norm_on_lateral = no_norm_on_lateral
33 | self.stack = stack
34 |
35 | if end_level == -1:
36 | self.backbone_end_level = self.num_ins
37 | assert num_outs >= self.num_ins - start_level
38 | else:
39 | # if end_level < inputs, no extra level is allowed
40 | self.backbone_end_level = end_level
41 | assert end_level <= len(in_channels)
42 | assert num_outs == end_level - start_level
43 | self.start_level = start_level
44 | self.end_level = end_level
45 | self.add_extra_convs = add_extra_convs
46 | self.extra_convs_on_inputs = extra_convs_on_inputs
47 |
48 | self.lateral_convs = nn.ModuleList()
49 | self.fpn_convs = nn.ModuleList()
50 | self.stack_bifpn_convs = nn.ModuleList()
51 |
52 | for i in range(self.start_level, self.backbone_end_level):
53 | l_conv = ConvModule(
54 | in_channels[i],
55 | out_channels,
56 | 1,
57 | conv_cfg=conv_cfg,
58 | norm_cfg=norm_cfg if not self.no_norm_on_lateral else None,
59 | activation=self.activation,
60 | inplace=False)
61 | self.lateral_convs.append(l_conv)
62 |
63 | for ii in range(stack):
64 | self.stack_bifpn_convs.append(BiFPNModule(channels=out_channels,
65 | levels=self.backbone_end_level-self.start_level,
66 | conv_cfg=conv_cfg,
67 | norm_cfg=norm_cfg,
68 | activation=activation))
69 | # add extra conv layers (e.g., RetinaNet)
70 | extra_levels = num_outs - self.backbone_end_level + self.start_level
71 | if add_extra_convs and extra_levels >= 1:
72 | for i in range(extra_levels):
73 | if i == 0 and self.extra_convs_on_inputs:
74 | in_channels = self.in_channels[self.backbone_end_level - 1]
75 | else:
76 | in_channels = out_channels
77 | extra_fpn_conv = ConvModule(
78 | in_channels,
79 | out_channels,
80 | 3,
81 | stride=2,
82 | padding=1,
83 | conv_cfg=conv_cfg,
84 | norm_cfg=norm_cfg,
85 | activation=self.activation,
86 | inplace=False)
87 | self.fpn_convs.append(extra_fpn_conv)
88 | self.init_weights()
89 |
90 | # default init_weights for conv(msra) and norm in ConvModule
91 | def init_weights(self):
92 | for m in self.modules():
93 | if isinstance(m, nn.Conv2d):
94 | xavier_init(m, distribution='uniform')
95 |
96 | def forward(self, inputs):
97 | assert len(inputs) == len(self.in_channels)
98 |
99 | # build laterals
100 | laterals = [
101 | lateral_conv(inputs[i + self.start_level])
102 | for i, lateral_conv in enumerate(self.lateral_convs)
103 | ]
104 |
105 | # part 1: build top-down and down-top path with stack
106 | used_backbone_levels = len(laterals)
107 | for bifpn_module in self.stack_bifpn_convs:
108 | laterals = bifpn_module(laterals)
109 | outs = laterals
110 | # part 2: add extra levels
111 | if self.num_outs > len(outs):
112 | # use max pool to get more levels on top of outputs
113 | # (e.g., Faster R-CNN, Mask R-CNN)
114 | if not self.add_extra_convs:
115 | for i in range(self.num_outs - used_backbone_levels):
116 | outs.append(F.max_pool2d(outs[-1], 1, stride=2))
117 | # add conv layers on top of original feature maps (RetinaNet)
118 | else:
119 | if self.extra_convs_on_inputs:
120 | orig = inputs[self.backbone_end_level - 1]
121 | outs.append(self.fpn_convs[0](orig))
122 | else:
123 | outs.append(self.fpn_convs[0](outs[-1]))
124 | for i in range(1, self.num_outs - used_backbone_levels):
125 | if self.relu_before_extra_convs:
126 | outs.append(self.fpn_convs[i](F.relu(outs[-1])))
127 | else:
128 | outs.append(self.fpn_convs[i](outs[-1]))
129 | return tuple(outs)
130 |
131 |
132 | class BiFPNModule(nn.Module):
133 | def __init__(self,
134 | channels,
135 | levels,
136 | init=0.5,
137 | conv_cfg=None,
138 | norm_cfg=None,
139 | activation=None,
140 | eps=0.0001):
141 | super(BiFPNModule, self).__init__()
142 | self.activation = activation
143 | self.eps = eps
144 | self.levels = levels
145 | self.bifpn_convs = nn.ModuleList()
146 | # weighted
147 | self.w1 = nn.Parameter(torch.Tensor(2, levels).fill_(init))
148 | self.relu1 = nn.ReLU()
149 | self.w2 = nn.Parameter(torch.Tensor(3, levels - 2).fill_(init))
150 | self.relu2 = nn.ReLU()
151 | for jj in range(2):
152 | for i in range(self.levels-1): # 1,2,3
153 | fpn_conv = nn.Sequential(
154 | ConvModule(
155 | channels,
156 | channels,
157 | 3,
158 | padding=1,
159 | conv_cfg=conv_cfg,
160 | norm_cfg=norm_cfg,
161 | activation=self.activation,
162 | inplace=False)
163 | )
164 | self.bifpn_convs.append(fpn_conv)
165 |
166 | # default init_weights for conv(msra) and norm in ConvModule
167 | def init_weights(self):
168 | for m in self.modules():
169 | if isinstance(m, nn.Conv2d):
170 | xavier_init(m, distribution='uniform')
171 |
172 | def forward(self, inputs):
173 | assert len(inputs) == self.levels
174 | # build top-down and down-top path with stack
175 | levels = self.levels
176 | # w relu
177 | w1 = self.relu1(self.w1)
178 | w1 /= torch.sum(w1, dim=0) + self.eps # normalize
179 | w2 = self.relu2(self.w2)
180 | w2 /= torch.sum(w2, dim=0) + self.eps # normalize
181 | # build top-down
182 | idx_bifpn = 0
183 | pathtd = inputs
184 | inputs_clone = []
185 | for in_tensor in inputs:
186 | inputs_clone.append(in_tensor.clone())
187 |
188 | for i in range(levels - 1, 0, -1):
189 | pathtd[i - 1] = (w1[0, i-1]*pathtd[i - 1] + w1[1, i-1]*F.interpolate(
190 | pathtd[i], scale_factor=2, mode='nearest'))/(w1[0, i-1] + w1[1, i-1] + self.eps)
191 | pathtd[i - 1] = self.bifpn_convs[idx_bifpn](pathtd[i - 1])
192 | idx_bifpn = idx_bifpn + 1
193 | # build down-top
194 | for i in range(0, levels - 2, 1):
195 | pathtd[i + 1] = (w2[0, i] * pathtd[i + 1] + w2[1, i] * F.max_pool2d(pathtd[i], kernel_size=2) +
196 | w2[2, i] * inputs_clone[i + 1])/(w2[0, i] + w2[1, i] + w2[2, i] + self.eps)
197 | pathtd[i + 1] = self.bifpn_convs[idx_bifpn](pathtd[i + 1])
198 | idx_bifpn = idx_bifpn + 1
199 |
200 | pathtd[levels - 1] = (w1[0, levels-1] * pathtd[levels - 1] + w1[1, levels-1] * F.max_pool2d(
201 | pathtd[levels - 2], kernel_size=2))/(w1[0, levels-1] + w1[1, levels-1] + self.eps)
202 | pathtd[levels - 1] = self.bifpn_convs[idx_bifpn](pathtd[levels - 1])
203 | return pathtd
204 |
--------------------------------------------------------------------------------
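The weighting inside BiFPNModule is EfficientDet's fast normalized fusion: ReLU keeps the learnable weights non-negative, and they are normalized by their sum plus a small epsilon before blending feature maps. A minimal numeric sketch of that idea, on toy tensors and independent of the module itself:

# Fast normalized fusion of two toy feature maps: w_i >= 0, normalized weights sum to ~1.
import torch
import torch.nn.functional as F

eps = 1e-4
w = F.relu(torch.tensor([0.7, -0.2]))      # raw weights; negative ones are clamped to 0
w = w / (w.sum() + eps)                    # -> roughly [1.0, 0.0]

feat_a = torch.ones(1, 8, 16, 16)
feat_b = torch.full((1, 8, 16, 16), 3.0)
fused = w[0] * feat_a + w[1] * feat_b      # weighted sum, same shape as the inputs
print(w.tolist(), fused.mean().item())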
/models/efficientdet.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import math
4 | from models.efficientnet import EfficientNet
5 | from models.bifpn import BIFPN
6 | from .retinahead import RetinaHead
7 | from models.module import RegressionModel, ClassificationModel, Anchors, ClipBoxes, BBoxTransform
8 | from torchvision.ops import nms
9 | from .losses import FocalLoss
10 | MODEL_MAP = {
11 | 'efficientdet-d0': 'efficientnet-b0',
12 | 'efficientdet-d1': 'efficientnet-b1',
13 | 'efficientdet-d2': 'efficientnet-b2',
14 | 'efficientdet-d3': 'efficientnet-b3',
15 | 'efficientdet-d4': 'efficientnet-b4',
16 | 'efficientdet-d5': 'efficientnet-b5',
17 | 'efficientdet-d6': 'efficientnet-b6',
18 | 'efficientdet-d7': 'efficientnet-b6',
19 | }
20 |
21 |
22 | class EfficientDet(nn.Module):
23 | def __init__(self,
24 | num_classes,
25 | network='efficientdet-d0',
26 | D_bifpn=3,
27 | W_bifpn=88,
28 | D_class=3,
29 | is_training=True,
30 | threshold=0.01,
31 | iou_threshold=0.5):
32 | super(EfficientDet, self).__init__()
33 | self.backbone = EfficientNet.from_pretrained(MODEL_MAP[network])
34 | self.is_training = is_training
35 | self.neck = BIFPN(in_channels=self.backbone.get_list_features()[-5:],
36 | out_channels=W_bifpn,
37 | stack=D_bifpn,
38 | num_outs=5)
39 | self.bbox_head = RetinaHead(num_classes=num_classes,
40 | in_channels=W_bifpn)
41 |
42 | self.anchors = Anchors()
43 | self.regressBoxes = BBoxTransform()
44 | self.clipBoxes = ClipBoxes()
45 | self.threshold = threshold
46 | self.iou_threshold = iou_threshold
47 | for m in self.modules():
48 | if isinstance(m, nn.Conv2d):
49 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
50 | m.weight.data.normal_(0, math.sqrt(2. / n))
51 | elif isinstance(m, nn.BatchNorm2d):
52 | m.weight.data.fill_(1)
53 | m.bias.data.zero_()
54 | self.freeze_bn()
55 | self.criterion = FocalLoss()
56 |
57 | def forward(self, inputs):
58 |         if self.is_training:
59 |             inputs, annotations = inputs
62 | x = self.extract_feat(inputs)
63 | outs = self.bbox_head(x)
64 | classification = torch.cat([out for out in outs[0]], dim=1)
65 | regression = torch.cat([out for out in outs[1]], dim=1)
66 | anchors = self.anchors(inputs)
67 | if self.is_training:
68 | return self.criterion(classification, regression, anchors, annotations)
69 | else:
70 | transformed_anchors = self.regressBoxes(anchors, regression)
71 | transformed_anchors = self.clipBoxes(transformed_anchors, inputs)
72 | scores = torch.max(classification, dim=2, keepdim=True)[0]
73 | scores_over_thresh = (scores > self.threshold)[0, :, 0]
74 |
75 | if scores_over_thresh.sum() == 0:
76 | print('No boxes to NMS')
77 | # no boxes to NMS, just return
78 | return [torch.zeros(0), torch.zeros(0), torch.zeros(0, 4)]
79 | classification = classification[:, scores_over_thresh, :]
80 | transformed_anchors = transformed_anchors[:, scores_over_thresh, :]
81 | scores = scores[:, scores_over_thresh, :]
82 | anchors_nms_idx = nms(
83 | transformed_anchors[0, :, :], scores[0, :, 0], iou_threshold=self.iou_threshold)
84 | nms_scores, nms_class = classification[0, anchors_nms_idx, :].max(
85 | dim=1)
86 | return [nms_scores, nms_class, transformed_anchors[0, anchors_nms_idx, :]]
87 |
88 | def freeze_bn(self):
89 | '''Freeze BatchNorm layers.'''
90 | for layer in self.modules():
91 | if isinstance(layer, nn.BatchNorm2d):
92 | layer.eval()
93 |
94 | def extract_feat(self, img):
95 | """
96 | Directly extract features from the backbone+neck
97 | """
98 | x = self.backbone(img)
99 | x = self.neck(x[-5:])
100 | return x
101 |
--------------------------------------------------------------------------------
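At inference time, EfficientDet.forward above keeps anchors whose best class score clears self.threshold, runs torchvision's nms at self.iou_threshold, and returns (scores, classes, boxes). A small standalone sketch of that post-processing step on hand-made predictions (all numbers are toy values, not tied to the model):

# Score thresholding + NMS on toy predictions, mirroring the inference branch above.
import torch
from torchvision.ops import nms

boxes = torch.tensor([[10., 10., 50., 50.],
                      [12., 12., 52., 52.],     # heavy overlap with the first box
                      [100., 100., 140., 140.]])
scores = torch.tensor([0.90, 0.80, 0.05])
classification = torch.tensor([[0.90, 0.10],
                               [0.80, 0.20],
                               [0.05, 0.02]])   # per-class scores, 2 classes

keep_mask = scores > 0.01                       # score threshold, as in EfficientDet.forward
keep_idx = nms(boxes[keep_mask], scores[keep_mask], iou_threshold=0.5)
nms_scores, nms_class = classification[keep_mask][keep_idx].max(dim=1)
print(keep_idx.tolist(), nms_scores.tolist(), nms_class.tolist())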
/models/efficientnet.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 | from torch.nn import functional as F
4 |
5 | from models.utils import (
6 | round_filters,
7 | round_repeats,
8 | drop_connect,
9 | get_same_padding_conv2d,
10 | get_model_params,
11 | efficientnet_params,
12 | load_pretrained_weights,
13 | Swish,
14 | MemoryEfficientSwish,
15 | )
16 |
17 |
18 | class MBConvBlock(nn.Module):
19 | """
20 | Mobile Inverted Residual Bottleneck Block
21 | Args:
22 | block_args (namedtuple): BlockArgs, see above
23 | global_params (namedtuple): GlobalParam, see above
24 | Attributes:
25 | has_se (bool): Whether the block contains a Squeeze and Excitation layer.
26 | """
27 |
28 | def __init__(self, block_args, global_params):
29 | super().__init__()
30 | self._block_args = block_args
31 | self._bn_mom = 1 - global_params.batch_norm_momentum
32 | self._bn_eps = global_params.batch_norm_epsilon
33 | self.has_se = (self._block_args.se_ratio is not None) and (
34 | 0 < self._block_args.se_ratio <= 1)
35 | self.id_skip = block_args.id_skip # skip connection and drop connect
36 |
37 | # Get static or dynamic convolution depending on image size
38 | Conv2d = get_same_padding_conv2d(image_size=global_params.image_size)
39 |
40 | # Expansion phase
41 | inp = self._block_args.input_filters # number of input channels
42 | oup = self._block_args.input_filters * \
43 | self._block_args.expand_ratio # number of output channels
44 | if self._block_args.expand_ratio != 1:
45 | self._expand_conv = Conv2d(
46 | in_channels=inp, out_channels=oup, kernel_size=1, bias=False)
47 | self._bn0 = nn.BatchNorm2d(
48 | num_features=oup, momentum=self._bn_mom, eps=self._bn_eps)
49 | # Depthwise convolution phase
50 | k = self._block_args.kernel_size
51 | s = self._block_args.stride
52 | self._depthwise_conv = Conv2d(
53 | in_channels=oup, out_channels=oup, groups=oup, # groups makes it depthwise
54 | kernel_size=k, stride=s, bias=False)
55 | self._bn1 = nn.BatchNorm2d(
56 | num_features=oup, momentum=self._bn_mom, eps=self._bn_eps)
57 |
58 | # Squeeze and Excitation layer, if desired
59 | if self.has_se:
60 | num_squeezed_channels = max(
61 | 1, int(self._block_args.input_filters * self._block_args.se_ratio))
62 | self._se_reduce = Conv2d(
63 | in_channels=oup, out_channels=num_squeezed_channels, kernel_size=1)
64 | self._se_expand = Conv2d(
65 | in_channels=num_squeezed_channels, out_channels=oup, kernel_size=1)
66 |
67 | # Output phase
68 | final_oup = self._block_args.output_filters
69 | self._project_conv = Conv2d(
70 | in_channels=oup, out_channels=final_oup, kernel_size=1, bias=False)
71 | self._bn2 = nn.BatchNorm2d(
72 | num_features=final_oup, momentum=self._bn_mom, eps=self._bn_eps)
73 | self._swish = MemoryEfficientSwish()
74 |
75 | def forward(self, inputs, drop_connect_rate=None):
76 | """
77 | :param inputs: input tensor
78 | :param drop_connect_rate: drop connect rate (float, between 0 and 1)
79 | :return: output of block
80 | """
81 |
82 | # Expansion and Depthwise Convolution
83 | x = inputs
84 | if self._block_args.expand_ratio != 1:
85 | x = self._swish(self._bn0(self._expand_conv(inputs)))
86 |
87 | x = self._swish(self._bn1(self._depthwise_conv(x)))
88 |
89 | # Squeeze and Excitation
90 | if self.has_se:
91 | x_squeezed = F.adaptive_avg_pool2d(x, 1)
92 | x_squeezed = self._se_expand(
93 | self._swish(self._se_reduce(x_squeezed)))
94 | x = torch.sigmoid(x_squeezed) * x
95 |
96 | x = self._bn2(self._project_conv(x))
97 |
98 | # Skip connection and drop connect
99 | input_filters, output_filters = self._block_args.input_filters, self._block_args.output_filters
100 | if self.id_skip and self._block_args.stride == 1 and input_filters == output_filters:
101 | if drop_connect_rate:
102 | x = drop_connect(x, p=drop_connect_rate,
103 | training=self.training)
104 | x = x + inputs # skip connection
105 | return x
106 |
107 | def set_swish(self, memory_efficient=True):
108 | """Sets swish function as memory efficient (for training) or standard (for export)"""
109 | self._swish = MemoryEfficientSwish() if memory_efficient else Swish()
110 |
111 |
112 | class EfficientNet(nn.Module):
113 | """
114 | An EfficientNet model. Most easily loaded with the .from_name or .from_pretrained methods
115 | Args:
116 | blocks_args (list): A list of BlockArgs to construct blocks
117 | global_params (namedtuple): A set of GlobalParams shared between blocks
118 | Example:
119 | model = EfficientNet.from_pretrained('efficientnet-b0')
120 | """
121 |
122 | def __init__(self, blocks_args=None, global_params=None):
123 | super().__init__()
124 | assert isinstance(blocks_args, list), 'blocks_args should be a list'
125 | assert len(blocks_args) > 0, 'block args must be greater than 0'
126 | self._global_params = global_params
127 | self._blocks_args = blocks_args
128 |
129 | # Get static or dynamic convolution depending on image size
130 | Conv2d = get_same_padding_conv2d(image_size=global_params.image_size)
131 |
132 | # Batch norm parameters
133 | bn_mom = 1 - self._global_params.batch_norm_momentum
134 | bn_eps = self._global_params.batch_norm_epsilon
135 |
136 | # Stem
137 | in_channels = 3 # rgb
138 | # number of output channels
139 | out_channels = round_filters(32, self._global_params)
140 | self._conv_stem = Conv2d(
141 | in_channels, out_channels, kernel_size=3, stride=2, bias=False)
142 | self._bn0 = nn.BatchNorm2d(
143 | num_features=out_channels, momentum=bn_mom, eps=bn_eps)
144 |
145 | # Build blocks
146 | self._blocks = nn.ModuleList([])
147 | for i in range(len(self._blocks_args)):
148 | # Update block input and output filters based on depth multiplier.
149 | self._blocks_args[i] = self._blocks_args[i]._replace(
150 | input_filters=round_filters(
151 | self._blocks_args[i].input_filters, self._global_params),
152 | output_filters=round_filters(
153 | self._blocks_args[i].output_filters, self._global_params),
154 | num_repeat=round_repeats(
155 | self._blocks_args[i].num_repeat, self._global_params)
156 | )
157 |
158 | # The first block needs to take care of stride and filter size increase.
159 | self._blocks.append(MBConvBlock(
160 | self._blocks_args[i], self._global_params))
161 | if self._blocks_args[i].num_repeat > 1:
162 | self._blocks_args[i] = self._blocks_args[i]._replace(
163 | input_filters=self._blocks_args[i].output_filters, stride=1)
164 | for _ in range(self._blocks_args[i].num_repeat - 1):
165 | self._blocks.append(MBConvBlock(
166 | self._blocks_args[i], self._global_params))
167 |
168 |         # Head
169 | # output of final block
170 | in_channels = self._blocks_args[len(
171 | self._blocks_args)-1].output_filters
172 | out_channels = round_filters(1280, self._global_params)
173 | self._conv_head = Conv2d(
174 | in_channels, out_channels, kernel_size=1, bias=False)
175 | self._bn1 = nn.BatchNorm2d(
176 | num_features=out_channels, momentum=bn_mom, eps=bn_eps)
177 |
178 | # Final linear layer
179 | self._avg_pooling = nn.AdaptiveAvgPool2d(1)
180 | self._dropout = nn.Dropout(self._global_params.dropout_rate)
181 | self._fc = nn.Linear(out_channels, self._global_params.num_classes)
182 | self._swish = MemoryEfficientSwish()
183 |
184 | def set_swish(self, memory_efficient=True):
185 | """Sets swish function as memory efficient (for training) or standard (for export)"""
186 | self._swish = MemoryEfficientSwish() if memory_efficient else Swish()
187 | for block in self._blocks:
188 | block.set_swish(memory_efficient)
189 |
190 | def extract_features(self, inputs):
191 |         """ Returns the list of feature maps collected at the end of each block group """
192 | # Stem
193 | x = self._swish(self._bn0(self._conv_stem(inputs)))
194 |
195 | P = []
196 | index = 0
197 | num_repeat = 0
198 | # Blocks
199 | for idx, block in enumerate(self._blocks):
200 | drop_connect_rate = self._global_params.drop_connect_rate
201 | if drop_connect_rate:
202 | drop_connect_rate *= float(idx) / len(self._blocks)
203 | x = block(x, drop_connect_rate=drop_connect_rate)
204 | num_repeat = num_repeat + 1
205 | if(num_repeat == self._blocks_args[index].num_repeat):
206 | num_repeat = 0
207 | index = index + 1
208 | P.append(x)
209 | return P
210 |
211 | def forward(self, inputs):
212 |         """ Calls extract_features and returns the list of per-stage feature maps (no classification head is applied). """
213 | # Convolution layers
214 | P = self.extract_features(inputs)
215 | return P
216 |
217 | @classmethod
218 | def from_name(cls, model_name, override_params=None):
219 | cls._check_model_name_is_valid(model_name)
220 | blocks_args, global_params = get_model_params(
221 | model_name, override_params)
222 | return cls(blocks_args, global_params)
223 |
224 |     @classmethod
225 |     def from_pretrained(cls, model_name, num_classes=1000, in_channels=3):
226 |         model = cls.from_name(model_name, override_params={
227 |                               'num_classes': num_classes})
228 |         load_pretrained_weights(
229 |             model, model_name, load_fc=(num_classes == 1000))
230 |         if in_channels != 3:
231 |             Conv2d = get_same_padding_conv2d(
232 |                 image_size=model._global_params.image_size)
233 |             out_channels = round_filters(32, model._global_params)
234 |             model._conv_stem = Conv2d(
235 |                 in_channels, out_channels, kernel_size=3, stride=2, bias=False)
236 |         return model
246 |
247 | @classmethod
248 | def get_image_size(cls, model_name):
249 | cls._check_model_name_is_valid(model_name)
250 | _, _, res, _ = efficientnet_params(model_name)
251 | return res
252 |
253 | @classmethod
254 | def _check_model_name_is_valid(cls, model_name, also_need_pretrained_weights=False):
255 |         """ Validates model name. Note that pretrained weights are only available for
256 | the first four models (efficientnet-b{i} for i in 0,1,2,3) at the moment. """
257 | num_models = 4 if also_need_pretrained_weights else 8
258 | valid_models = ['efficientnet-b'+str(i) for i in range(num_models)]
259 | if model_name not in valid_models:
260 | raise ValueError('model_name should be one of: ' +
261 | ', '.join(valid_models))
262 |
263 | def get_list_features(self):
264 | list_feature = []
265 | for idx in range(len(self._blocks_args)):
266 | list_feature.append(self._blocks_args[idx].output_filters)
267 |
268 | return list_feature
269 |
270 |
271 | if __name__ == '__main__':
272 | model = EfficientNet.from_pretrained('efficientnet-b0')
273 | inputs = torch.randn(4, 3, 640, 640)
274 | P = model(inputs)
275 | for idx, p in enumerate(P):
276 | print('P{}: {}'.format(idx, p.size()))
277 | # print('model: ', model)
278 |
--------------------------------------------------------------------------------
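A quick way to see what the backbone hands to the neck: get_list_features() lists the output channels of each block group, and forward()/extract_features() returns one feature map per group, of which EfficientDet takes the last five. A small sketch, assuming the repository root is on PYTHONPATH; from_name avoids downloading pretrained weights, and the channel list in the comment is the expected efficientnet-b0 configuration:

# Inspect the per-stage feature maps the backbone exposes; the last five channel
# counts are what EfficientDet passes to BIFPN as in_channels.
import torch
from models.efficientnet import EfficientNet

backbone = EfficientNet.from_name('efficientnet-b0')   # no pretrained download needed
backbone.eval()
print(backbone.get_list_features())                    # expected: [16, 24, 40, 80, 112, 192, 320]
with torch.no_grad():
    feats = backbone(torch.randn(1, 3, 224, 224))
for f, c in zip(feats, backbone.get_list_features()):
    assert f.shape[1] == c                             # channels match the reported list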
/models/losses.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | import torch.nn as nn
4 |
5 |
6 | def calc_iou(a, b):
7 | area = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1])
8 |
9 | iw = torch.min(torch.unsqueeze(
10 | a[:, 2], dim=1), b[:, 2]) - torch.max(torch.unsqueeze(a[:, 0], 1), b[:, 0])
11 | ih = torch.min(torch.unsqueeze(
12 | a[:, 3], dim=1), b[:, 3]) - torch.max(torch.unsqueeze(a[:, 1], 1), b[:, 1])
13 |
14 | iw = torch.clamp(iw, min=0)
15 | ih = torch.clamp(ih, min=0)
16 |
17 | ua = torch.unsqueeze((a[:, 2] - a[:, 0]) *
18 | (a[:, 3] - a[:, 1]), dim=1) + area - iw * ih
19 |
20 | ua = torch.clamp(ua, min=1e-8)
21 |
22 | intersection = iw * ih
23 |
24 | IoU = intersection / ua
25 |
26 | return IoU
27 |
28 |
29 | class FocalLoss(nn.Module):
30 | # def __init__(self):
31 |
32 | def forward(self, classifications, regressions, anchors, annotations):
33 | alpha = 0.25
34 | gamma = 2.0
35 | batch_size = classifications.shape[0]
36 | classification_losses = []
37 | regression_losses = []
38 |
39 | anchor = anchors[0, :, :]
40 |
41 | anchor_widths = anchor[:, 2] - anchor[:, 0]
42 | anchor_heights = anchor[:, 3] - anchor[:, 1]
43 | anchor_ctr_x = anchor[:, 0] + 0.5 * anchor_widths
44 | anchor_ctr_y = anchor[:, 1] + 0.5 * anchor_heights
45 |
46 | for j in range(batch_size):
47 |
48 | classification = classifications[j, :, :]
49 | regression = regressions[j, :, :]
50 |
51 | bbox_annotation = annotations[j, :, :]
52 | bbox_annotation = bbox_annotation[bbox_annotation[:, 4] != -1]
53 |
54 | if bbox_annotation.shape[0] == 0:
55 | regression_losses.append(torch.tensor(0).float().cuda())
56 | classification_losses.append(torch.tensor(0).float().cuda())
57 |
58 | continue
59 |
60 | classification = torch.clamp(classification, 1e-4, 1.0 - 1e-4)
61 |
62 | # num_anchors x num_annotations
63 | IoU = calc_iou(anchors[0, :, :], bbox_annotation[:, :4])
64 |
65 | IoU_max, IoU_argmax = torch.max(IoU, dim=1) # num_anchors x 1
66 |
67 | #import pdb
68 | # pdb.set_trace()
69 |
70 | # compute the loss for classification
71 | targets = torch.ones(classification.shape) * -1
72 | targets = targets.cuda()
73 |
74 | targets[torch.lt(IoU_max, 0.4), :] = 0
75 |
76 | positive_indices = torch.ge(IoU_max, 0.5)
77 |
78 | num_positive_anchors = positive_indices.sum()
79 |
80 | assigned_annotations = bbox_annotation[IoU_argmax, :]
81 |
82 | targets[positive_indices, :] = 0
83 | targets[positive_indices,
84 | assigned_annotations[positive_indices, 4].long()] = 1
85 |
86 | alpha_factor = torch.ones(targets.shape).cuda() * alpha
87 |
88 | alpha_factor = torch.where(
89 | torch.eq(targets, 1.), alpha_factor, 1. - alpha_factor)
90 | focal_weight = torch.where(
91 | torch.eq(targets, 1.), 1. - classification, classification)
92 | focal_weight = alpha_factor * torch.pow(focal_weight, gamma)
93 |
94 | bce = -(targets * torch.log(classification) +
95 | (1.0 - targets) * torch.log(1.0 - classification))
96 |
97 | # cls_loss = focal_weight * torch.pow(bce, gamma)
98 | cls_loss = focal_weight * bce
99 |
100 | cls_loss = torch.where(
101 | torch.ne(targets, -1.0), cls_loss, torch.zeros(cls_loss.shape).cuda())
102 |
103 | classification_losses.append(
104 | cls_loss.sum()/torch.clamp(num_positive_anchors.float(), min=1.0))
105 |
106 | # compute the loss for regression
107 |
108 | if positive_indices.sum() > 0:
109 | assigned_annotations = assigned_annotations[positive_indices, :]
110 |
111 | anchor_widths_pi = anchor_widths[positive_indices]
112 | anchor_heights_pi = anchor_heights[positive_indices]
113 | anchor_ctr_x_pi = anchor_ctr_x[positive_indices]
114 | anchor_ctr_y_pi = anchor_ctr_y[positive_indices]
115 |
116 | gt_widths = assigned_annotations[:,
117 | 2] - assigned_annotations[:, 0]
118 | gt_heights = assigned_annotations[:,
119 | 3] - assigned_annotations[:, 1]
120 | gt_ctr_x = assigned_annotations[:, 0] + 0.5 * gt_widths
121 | gt_ctr_y = assigned_annotations[:, 1] + 0.5 * gt_heights
122 |
123 | # clip widths to 1
124 | gt_widths = torch.clamp(gt_widths, min=1)
125 | gt_heights = torch.clamp(gt_heights, min=1)
126 |
127 | targets_dx = (gt_ctr_x - anchor_ctr_x_pi) / anchor_widths_pi
128 | targets_dy = (gt_ctr_y - anchor_ctr_y_pi) / anchor_heights_pi
129 | targets_dw = torch.log(gt_widths / anchor_widths_pi)
130 | targets_dh = torch.log(gt_heights / anchor_heights_pi)
131 |
132 | targets = torch.stack(
133 | (targets_dx, targets_dy, targets_dw, targets_dh))
134 | targets = targets.t()
135 |
136 | targets = targets/torch.Tensor([[0.1, 0.1, 0.2, 0.2]]).cuda()
137 |
138 | negative_indices = 1 + (~positive_indices)
139 |
140 | regression_diff = torch.abs(
141 | targets - regression[positive_indices, :])
142 |
143 | regression_loss = torch.where(
144 | torch.le(regression_diff, 1.0 / 9.0),
145 | 0.5 * 9.0 * torch.pow(regression_diff, 2),
146 | regression_diff - 0.5 / 9.0
147 | )
148 | regression_losses.append(regression_loss.mean())
149 | else:
150 | regression_losses.append(torch.tensor(0).float().cuda())
151 |
152 | return torch.stack(classification_losses).mean(dim=0, keepdim=True), torch.stack(regression_losses).mean(dim=0, keepdim=True)
153 |
--------------------------------------------------------------------------------
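The classification branch of FocalLoss above is alpha-balanced binary cross-entropy scaled by (1 - p_t)^gamma, so confidently classified anchors contribute almost nothing. A tiny CPU-only numeric sketch of that weighting (the module itself moves its tensors to CUDA):

# Focal-loss weighting on a handful of toy anchor scores (alpha=0.25, gamma=2.0).
import torch

alpha, gamma = 0.25, 2.0
p = torch.tensor([0.02, 0.30, 0.95])   # predicted class probabilities
t = torch.tensor([0.0, 0.0, 1.0])      # targets: two negatives, one positive

p = torch.clamp(p, 1e-4, 1.0 - 1e-4)
alpha_factor = torch.where(t == 1.0, torch.full_like(p, alpha), torch.full_like(p, 1.0 - alpha))
focal_weight = alpha_factor * torch.where(t == 1.0, 1.0 - p, p) ** gamma
bce = -(t * torch.log(p) + (1.0 - t) * torch.log(1.0 - p))
print((focal_weight * bce).tolist())   # the easy negative (p=0.02) is almost ignored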
/models/module.py:
--------------------------------------------------------------------------------
1 |
2 | import numpy as np
3 | import torch
4 | import warnings
5 | import torch.nn as nn
6 | import torch.nn.functional as F
7 |
8 |
9 | class BBoxTransform(nn.Module):
10 |
11 | def __init__(self, mean=None, std=None):
12 | super(BBoxTransform, self).__init__()
13 | if mean is None:
14 | self.mean = torch.from_numpy(
15 | np.array([0, 0, 0, 0]).astype(np.float32))
16 | else:
17 | self.mean = mean
18 | if std is None:
19 | self.std = torch.from_numpy(
20 | np.array([0.1, 0.1, 0.2, 0.2]).astype(np.float32))
21 | else:
22 | self.std = std
23 |
24 | def forward(self, boxes, deltas):
25 |
26 | widths = boxes[:, :, 2] - boxes[:, :, 0]
27 | heights = boxes[:, :, 3] - boxes[:, :, 1]
28 | ctr_x = boxes[:, :, 0] + 0.5 * widths
29 | ctr_y = boxes[:, :, 1] + 0.5 * heights
30 |
31 | dx = deltas[:, :, 0] * self.std[0] + self.mean[0]
32 | dy = deltas[:, :, 1] * self.std[1] + self.mean[1]
33 | dw = deltas[:, :, 2] * self.std[2] + self.mean[2]
34 | dh = deltas[:, :, 3] * self.std[3] + self.mean[3]
35 |
36 | pred_ctr_x = ctr_x + dx * widths
37 | pred_ctr_y = ctr_y + dy * heights
38 | pred_w = torch.exp(dw) * widths
39 | pred_h = torch.exp(dh) * heights
40 |
41 | pred_boxes_x1 = pred_ctr_x - 0.5 * pred_w
42 | pred_boxes_y1 = pred_ctr_y - 0.5 * pred_h
43 | pred_boxes_x2 = pred_ctr_x + 0.5 * pred_w
44 | pred_boxes_y2 = pred_ctr_y + 0.5 * pred_h
45 |
46 | pred_boxes = torch.stack(
47 | [pred_boxes_x1, pred_boxes_y1, pred_boxes_x2, pred_boxes_y2], dim=2)
48 |
49 | return pred_boxes
50 |
51 |
52 | class ClipBoxes(nn.Module):
53 |
54 | def __init__(self, width=None, height=None):
55 | super(ClipBoxes, self).__init__()
56 |
57 | def forward(self, boxes, img):
58 |
59 | batch_size, num_channels, height, width = img.shape
60 |
61 | boxes[:, :, 0] = torch.clamp(boxes[:, :, 0], min=0)
62 | boxes[:, :, 1] = torch.clamp(boxes[:, :, 1], min=0)
63 |
64 | boxes[:, :, 2] = torch.clamp(boxes[:, :, 2], max=width)
65 | boxes[:, :, 3] = torch.clamp(boxes[:, :, 3], max=height)
66 |
67 | return boxes
68 |
69 |
70 | class RegressionModel(nn.Module):
71 | def __init__(self, num_features_in, num_anchors=9, feature_size=256):
72 | super(RegressionModel, self).__init__()
73 |
74 | self.conv1 = nn.Conv2d(
75 | num_features_in, feature_size, kernel_size=3, padding=1)
76 | self.act1 = nn.ReLU()
77 | self.conv2 = nn.Conv2d(feature_size, feature_size,
78 | kernel_size=3, padding=1)
79 | self.act2 = nn.ReLU()
80 | self.conv3 = nn.Conv2d(feature_size, feature_size,
81 | kernel_size=3, padding=1)
82 | self.act3 = nn.ReLU()
83 | self.conv4 = nn.Conv2d(feature_size, feature_size,
84 | kernel_size=3, padding=1)
85 | self.act4 = nn.ReLU()
86 | self.output = nn.Conv2d(
87 | feature_size, num_anchors*4, kernel_size=3, padding=1)
88 |
89 | def forward(self, x):
90 | out = self.conv1(x)
91 | out = self.act1(out)
92 | out = self.conv2(out)
93 | out = self.act2(out)
94 | out = self.conv3(out)
95 | out = self.act3(out)
96 | out = self.conv4(out)
97 | out = self.act4(out)
98 | out = self.output(out)
99 | # out is B x C x W x H, with C = 4*num_anchors
100 | out = out.permute(0, 2, 3, 1)
101 | return out.contiguous().view(out.shape[0], -1, 4)
102 |
103 |
104 | class ClassificationModel(nn.Module):
105 | def __init__(self, num_features_in, num_anchors=9, num_classes=80, prior=0.01, feature_size=256):
106 | super(ClassificationModel, self).__init__()
107 | self.num_classes = num_classes
108 | self.num_anchors = num_anchors
109 |
110 | self.conv1 = nn.Conv2d(
111 | num_features_in, feature_size, kernel_size=3, padding=1)
112 | self.act1 = nn.ReLU()
113 | self.conv2 = nn.Conv2d(feature_size, feature_size,
114 | kernel_size=3, padding=1)
115 | self.act2 = nn.ReLU()
116 | self.conv3 = nn.Conv2d(feature_size, feature_size,
117 | kernel_size=3, padding=1)
118 | self.act3 = nn.ReLU()
119 | self.conv4 = nn.Conv2d(feature_size, feature_size,
120 | kernel_size=3, padding=1)
121 | self.act4 = nn.ReLU()
122 | self.output = nn.Conv2d(
123 | feature_size, num_anchors*num_classes, kernel_size=3, padding=1)
124 | self.output_act = nn.Sigmoid()
125 |
126 | def forward(self, x):
127 | out = self.conv1(x)
128 | out = self.act1(out)
129 | out = self.conv2(out)
130 | out = self.act2(out)
131 | out = self.conv3(out)
132 | out = self.act3(out)
133 | out = self.conv4(out)
134 | out = self.act4(out)
135 | out = self.output(out)
136 | out = self.output_act(out)
137 |         # out is B x C x W x H, with C = n_classes * n_anchors
138 | out1 = out.permute(0, 2, 3, 1)
139 | batch_size, width, height, channels = out1.shape
140 | out2 = out1.view(batch_size, width, height,
141 | self.num_anchors, self.num_classes)
142 | return out2.contiguous().view(x.shape[0], -1, self.num_classes)
143 |
144 |
145 | class Anchors(nn.Module):
146 | def __init__(self, pyramid_levels=None, strides=None, sizes=None, ratios=None, scales=None):
147 | super(Anchors, self).__init__()
148 |
149 |         if pyramid_levels is None:
150 |             pyramid_levels = [3, 4, 5, 6, 7]
151 |         if strides is None:
152 |             strides = [2 ** x for x in pyramid_levels]
153 |         if sizes is None:
154 |             sizes = [2 ** (x + 2) for x in pyramid_levels]
155 |         if ratios is None:
156 |             ratios = np.array([0.5, 1, 2])
157 |         if scales is None:
158 |             scales = np.array([2 ** 0, 2 ** (1.0 / 3.0), 2 ** (2.0 / 3.0)])
159 |         self.pyramid_levels, self.strides, self.sizes, self.ratios, self.scales = pyramid_levels, strides, sizes, ratios, scales
160 |
161 | def forward(self, image):
162 |
163 | image_shape = image.shape[2:]
164 | image_shape = np.array(image_shape)
165 | image_shapes = [(image_shape + 2 ** x - 1) // (2 ** x)
166 | for x in self.pyramid_levels]
167 |
168 | # compute anchors over all pyramid levels
169 | all_anchors = np.zeros((0, 4)).astype(np.float32)
170 |
171 | for idx, p in enumerate(self.pyramid_levels):
172 | anchors = generate_anchors(
173 | base_size=self.sizes[idx], ratios=self.ratios, scales=self.scales)
174 | shifted_anchors = shift(
175 | image_shapes[idx], self.strides[idx], anchors)
176 | all_anchors = np.append(all_anchors, shifted_anchors, axis=0)
177 |
178 | all_anchors = np.expand_dims(all_anchors, axis=0)
179 |
180 | return torch.from_numpy(all_anchors.astype(np.float32)).to(image.device)
181 |
182 |
183 | def generate_anchors(base_size=16, ratios=None, scales=None):
184 | """
185 | Generate anchor (reference) windows by enumerating aspect ratios X
186 | scales w.r.t. a reference window.
187 | """
188 |
189 | if ratios is None:
190 | ratios = np.array([0.5, 1, 2])
191 |
192 | if scales is None:
193 | scales = np.array([2 ** 0, 2 ** (1.0 / 3.0), 2 ** (2.0 / 3.0)])
194 |
195 | num_anchors = len(ratios) * len(scales)
196 |
197 | # initialize output anchors
198 | anchors = np.zeros((num_anchors, 4))
199 |
200 | # scale base_size
201 | anchors[:, 2:] = base_size * np.tile(scales, (2, len(ratios))).T
202 |
203 | # compute areas of anchors
204 | areas = anchors[:, 2] * anchors[:, 3]
205 |
206 | # correct for ratios
207 | anchors[:, 2] = np.sqrt(areas / np.repeat(ratios, len(scales)))
208 | anchors[:, 3] = anchors[:, 2] * np.repeat(ratios, len(scales))
209 |
210 | # transform from (x_ctr, y_ctr, w, h) -> (x1, y1, x2, y2)
211 | anchors[:, 0::2] -= np.tile(anchors[:, 2] * 0.5, (2, 1)).T
212 | anchors[:, 1::2] -= np.tile(anchors[:, 3] * 0.5, (2, 1)).T
213 |
214 | return anchors
215 |
216 |
217 | def compute_shape(image_shape, pyramid_levels):
218 | """Compute shapes based on pyramid levels.
219 | :param image_shape:
220 | :param pyramid_levels:
221 | :return:
222 | """
223 | image_shape = np.array(image_shape[:2])
224 | image_shapes = [(image_shape + 2 ** x - 1) // (2 ** x)
225 | for x in pyramid_levels]
226 | return image_shapes
227 |
228 |
229 | def anchors_for_shape(
230 | image_shape,
231 | pyramid_levels=None,
232 | ratios=None,
233 | scales=None,
234 | strides=None,
235 | sizes=None,
236 | shapes_callback=None,
237 | ):
238 |
239 | image_shapes = compute_shape(image_shape, pyramid_levels)
240 |
241 | # compute anchors over all pyramid levels
242 | all_anchors = np.zeros((0, 4))
243 | for idx, p in enumerate(pyramid_levels):
244 | anchors = generate_anchors(
245 | base_size=sizes[idx], ratios=ratios, scales=scales)
246 | shifted_anchors = shift(image_shapes[idx], strides[idx], anchors)
247 | all_anchors = np.append(all_anchors, shifted_anchors, axis=0)
248 |
249 | return all_anchors
250 |
251 |
252 | def shift(shape, stride, anchors):
253 | shift_x = (np.arange(0, shape[1]) + 0.5) * stride
254 | shift_y = (np.arange(0, shape[0]) + 0.5) * stride
255 |
256 | shift_x, shift_y = np.meshgrid(shift_x, shift_y)
257 |
258 | shifts = np.vstack((
259 | shift_x.ravel(), shift_y.ravel(),
260 | shift_x.ravel(), shift_y.ravel()
261 | )).transpose()
262 |
263 | # add A anchors (1, A, 4) to
264 | # cell K shifts (K, 1, 4) to get
265 | # shift anchors (K, A, 4)
266 | # reshape to (K*A, 4) shifted anchors
267 | A = anchors.shape[0]
268 | K = shifts.shape[0]
269 | all_anchors = (anchors.reshape((1, A, 4)) +
270 | shifts.reshape((1, K, 4)).transpose((1, 0, 2)))
271 | all_anchors = all_anchors.reshape((K * A, 4))
272 |
273 | return all_anchors
274 |
275 |
276 | def conv_ws_2d(input,
277 | weight,
278 | bias=None,
279 | stride=1,
280 | padding=0,
281 | dilation=1,
282 | groups=1,
283 | eps=1e-5):
284 | c_in = weight.size(0)
285 | weight_flat = weight.view(c_in, -1)
286 | mean = weight_flat.mean(dim=1, keepdim=True).view(c_in, 1, 1, 1)
287 | std = weight_flat.std(dim=1, keepdim=True).view(c_in, 1, 1, 1)
288 | weight = (weight - mean) / (std + eps)
289 | return F.conv2d(input, weight, bias, stride, padding, dilation, groups)
290 |
291 |
292 | class ConvWS2d(nn.Conv2d):
293 | def __init__(self,
294 | in_channels,
295 | out_channels,
296 | kernel_size,
297 | stride=1,
298 | padding=0,
299 | dilation=1,
300 | groups=1,
301 | bias=True,
302 | eps=1e-5):
303 | super(ConvWS2d, self).__init__(
304 | in_channels,
305 | out_channels,
306 | kernel_size,
307 | stride=stride,
308 | padding=padding,
309 | dilation=dilation,
310 | groups=groups,
311 | bias=bias)
312 | self.eps = eps
313 |
314 | def forward(self, x):
315 | return conv_ws_2d(x, self.weight, self.bias, self.stride, self.padding,
316 | self.dilation, self.groups, self.eps)
317 |
318 |
319 | conv_cfg = {
320 | 'Conv': nn.Conv2d,
321 | 'ConvWS': ConvWS2d,
322 | # TODO: octave conv
323 | }
324 |
325 |
326 | def build_conv_layer(cfg, *args, **kwargs):
327 | """ Build convolution layer
328 | Args:
329 | cfg (None or dict): cfg should contain:
330 | type (str): identify conv layer type.
331 | layer args: args needed to instantiate a conv layer.
332 | Returns:
333 | layer (nn.Module): created conv layer
334 | """
335 | if cfg is None:
336 | cfg_ = dict(type='Conv')
337 | else:
338 | assert isinstance(cfg, dict) and 'type' in cfg
339 | cfg_ = cfg.copy()
340 |
341 | layer_type = cfg_.pop('type')
342 | if layer_type not in conv_cfg:
343 |         raise KeyError('Unrecognized conv type {}'.format(layer_type))
344 | else:
345 | conv_layer = conv_cfg[layer_type]
346 |
347 | layer = conv_layer(*args, **kwargs, **cfg_)
348 |
349 | return layer
350 |
351 |
352 | norm_cfg = {
353 | # format: layer_type: (abbreviation, module)
354 | 'BN': ('bn', nn.BatchNorm2d),
355 | 'SyncBN': ('bn', nn.SyncBatchNorm),
356 | 'GN': ('gn', nn.GroupNorm),
357 | # and potentially 'SN'
358 | }
359 |
360 |
361 | def build_norm_layer(cfg, num_features, postfix=''):
362 | """ Build normalization layer
363 | Args:
364 | cfg (dict): cfg should contain:
365 | type (str): identify norm layer type.
366 | layer args: args needed to instantiate a norm layer.
367 | requires_grad (bool): [optional] whether stop gradient updates
368 | num_features (int): number of channels from input.
369 | postfix (int, str): appended into norm abbreviation to
370 | create named layer.
371 | Returns:
372 | name (str): abbreviation + postfix
373 | layer (nn.Module): created norm layer
374 | """
375 | assert isinstance(cfg, dict) and 'type' in cfg
376 | cfg_ = cfg.copy()
377 |
378 | layer_type = cfg_.pop('type')
379 | if layer_type not in norm_cfg:
380 | raise KeyError('Unrecognized norm type {}'.format(layer_type))
381 | else:
382 | abbr, norm_layer = norm_cfg[layer_type]
383 | if norm_layer is None:
384 | raise NotImplementedError
385 |
386 | assert isinstance(postfix, (int, str))
387 | name = abbr + str(postfix)
388 |
389 | requires_grad = cfg_.pop('requires_grad', True)
390 | cfg_.setdefault('eps', 1e-5)
391 | if layer_type != 'GN':
392 | layer = norm_layer(num_features, **cfg_)
393 | if layer_type == 'SyncBN':
394 | layer._specify_ddp_gpu_num(1)
395 | else:
396 | assert 'num_groups' in cfg_
397 | layer = norm_layer(num_channels=num_features, **cfg_)
398 |
399 | for param in layer.parameters():
400 | param.requires_grad = requires_grad
401 |
402 | return name, layer
403 |
404 |
405 | class ConvModule(nn.Module):
406 | """A conv block that contains conv/norm/activation layers.
407 | Args:
408 | in_channels (int): Same as nn.Conv2d.
409 | out_channels (int): Same as nn.Conv2d.
410 | kernel_size (int or tuple[int]): Same as nn.Conv2d.
411 | stride (int or tuple[int]): Same as nn.Conv2d.
412 | padding (int or tuple[int]): Same as nn.Conv2d.
413 | dilation (int or tuple[int]): Same as nn.Conv2d.
414 | groups (int): Same as nn.Conv2d.
415 | bias (bool or str): If specified as `auto`, it will be decided by the
416 | norm_cfg. Bias will be set as True if norm_cfg is None, otherwise
417 | False.
418 | conv_cfg (dict): Config dict for convolution layer.
419 | norm_cfg (dict): Config dict for normalization layer.
420 | activation (str or None): Activation type, "ReLU" by default.
421 | inplace (bool): Whether to use inplace mode for activation.
422 | order (tuple[str]): The order of conv/norm/activation layers. It is a
423 | sequence of "conv", "norm" and "act". Examples are
424 | ("conv", "norm", "act") and ("act", "conv", "norm").
425 | """
426 |
427 | def __init__(self,
428 | in_channels,
429 | out_channels,
430 | kernel_size,
431 | stride=1,
432 | padding=0,
433 | dilation=1,
434 | groups=1,
435 | bias='auto',
436 | conv_cfg=None,
437 | norm_cfg=None,
438 | activation='relu',
439 | inplace=True,
440 | order=('conv', 'norm', 'act')):
441 | super(ConvModule, self).__init__()
442 | assert conv_cfg is None or isinstance(conv_cfg, dict)
443 | assert norm_cfg is None or isinstance(norm_cfg, dict)
444 | self.conv_cfg = conv_cfg
445 | self.norm_cfg = norm_cfg
446 | self.activation = activation
447 | self.inplace = inplace
448 | self.order = order
449 | assert isinstance(self.order, tuple) and len(self.order) == 3
450 | assert set(order) == set(['conv', 'norm', 'act'])
451 |
452 | self.with_norm = norm_cfg is not None
453 | self.with_activatation = activation is not None
454 | # if the conv layer is before a norm layer, bias is unnecessary.
455 | if bias == 'auto':
456 | bias = False if self.with_norm else True
457 | self.with_bias = bias
458 |
459 | if self.with_norm and self.with_bias:
460 | warnings.warn('ConvModule has norm and bias at the same time')
461 |
462 | # build convolution layer
463 | self.conv = build_conv_layer(
464 | conv_cfg,
465 | in_channels,
466 | out_channels,
467 | kernel_size,
468 | stride=stride,
469 | padding=padding,
470 | dilation=dilation,
471 | groups=groups,
472 | bias=bias)
473 | # export the attributes of self.conv to a higher level for convenience
474 | self.in_channels = self.conv.in_channels
475 | self.out_channels = self.conv.out_channels
476 | self.kernel_size = self.conv.kernel_size
477 | self.stride = self.conv.stride
478 | self.padding = self.conv.padding
479 | self.dilation = self.conv.dilation
480 | self.transposed = self.conv.transposed
481 | self.output_padding = self.conv.output_padding
482 | self.groups = self.conv.groups
483 |
484 | # build normalization layers
485 | if self.with_norm:
486 | # norm layer is after conv layer
487 | if order.index('norm') > order.index('conv'):
488 | norm_channels = out_channels
489 | else:
490 | norm_channels = in_channels
491 | self.norm_name, norm = build_norm_layer(norm_cfg, norm_channels)
492 | self.add_module(self.norm_name, norm)
493 |
494 | # build activation layer
495 | if self.with_activatation:
496 | # TODO: introduce `act_cfg` and supports more activation layers
497 | if self.activation not in ['relu']:
498 | raise ValueError('{} is currently not supported.'.format(
499 | self.activation))
500 | if self.activation == 'relu':
501 | self.activate = nn.ReLU(inplace=inplace)
502 |
503 | @property
504 | def norm(self):
505 | return getattr(self, self.norm_name)
506 |
507 | def forward(self, x, activate=True, norm=True):
508 | for layer in self.order:
509 | if layer == 'conv':
510 | x = self.conv(x)
511 | elif layer == 'norm' and norm and self.with_norm:
512 | x = self.norm(x)
513 | elif layer == 'act' and activate and self.with_activatation:
514 | x = self.activate(x)
515 | return x
516 |
517 |
518 | def xavier_init(module, gain=1, bias=0, distribution='normal'):
519 | assert distribution in ['uniform', 'normal']
520 | if distribution == 'uniform':
521 | nn.init.xavier_uniform_(module.weight, gain=gain)
522 | else:
523 | nn.init.xavier_normal_(module.weight, gain=gain)
524 |     if hasattr(module, 'bias') and module.bias is not None:
525 | nn.init.constant_(module.bias, bias)
526 |
527 |
528 | def normal_init(module, mean=0, std=1, bias=0):
529 | nn.init.normal_(module.weight, mean, std)
530 |     if hasattr(module, 'bias') and module.bias is not None:
531 | nn.init.constant_(module.bias, bias)
532 |
533 |
534 | def uniform_init(module, a=0, b=1, bias=0):
535 | nn.init.uniform_(module.weight, a, b)
536 |     if hasattr(module, 'bias') and module.bias is not None:
537 | nn.init.constant_(module.bias, bias)
538 |
539 |
540 | def kaiming_init(module,
541 | mode='fan_out',
542 | nonlinearity='relu',
543 | bias=0,
544 | distribution='normal'):
545 | assert distribution in ['uniform', 'normal']
546 | if distribution == 'uniform':
547 | nn.init.kaiming_uniform_(
548 | module.weight, mode=mode, nonlinearity=nonlinearity)
549 | else:
550 | nn.init.kaiming_normal_(
551 | module.weight, mode=mode, nonlinearity=nonlinearity)
552 |     if hasattr(module, 'bias') and module.bias is not None:
553 | nn.init.constant_(module.bias, bias)
554 |
555 |
556 | def bias_init_with_prob(prior_prob):
557 |     """ initialize conv/fc bias value according to the given probability"""
558 | bias_init = float(-np.log((1 - prior_prob) / prior_prob))
559 | return bias_init
560 |
--------------------------------------------------------------------------------
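Two quick self-contained checks for building blocks defined above, assuming the repository root is on PYTHONPATH: ConvModule's bias='auto' rule (the conv bias is dropped when a norm layer follows), and the 3 ratios x 3 scales = 9 anchors that generate_anchors()/shift() tile over a feature map.

# ConvModule: with bias='auto', the conv bias is dropped when a norm layer follows.
import torch
from models.module import ConvModule, generate_anchors, shift

m_bn = ConvModule(16, 32, 3, padding=1, norm_cfg=dict(type='BN'))
m_plain = ConvModule(16, 32, 3, padding=1)
print(m_bn.conv.bias is None, m_plain.conv.bias is not None)      # True True
print(m_bn(torch.randn(1, 16, 64, 64)).shape)                     # torch.Size([1, 32, 64, 64])

# Anchors: 3 ratios x 3 scales = 9 reference boxes, tiled over an 8x8 stride-32 map.
anchors = generate_anchors(base_size=32)
shifted = shift(shape=(8, 8), stride=32, anchors=anchors)
print(anchors.shape, shifted.shape)                               # (9, 4) (576, 4)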
/models/retinahead.py:
--------------------------------------------------------------------------------
1 | from functools import partial
2 |
3 | import numpy as np
4 | import torch.nn as nn
5 |
6 | from .module import ConvModule, bias_init_with_prob, normal_init
7 | from six.moves import map, zip
8 |
9 |
10 | def multi_apply(func, *args, **kwargs):
11 | pfunc = partial(func, **kwargs) if kwargs else func
12 | map_results = map(pfunc, *args)
13 | return tuple(map(list, zip(*map_results)))
14 |
15 |
16 | class RetinaHead(nn.Module):
17 | """
18 | An anchor-based head used in [1]_.
19 | The head contains two subnetworks. The first classifies anchor boxes and
20 | the second regresses deltas for the anchors.
21 | References:
22 | .. [1] https://arxiv.org/pdf/1708.02002.pdf
23 | Example:
24 | >>> import torch
25 | >>> self = RetinaHead(11, 7)
26 | >>> x = torch.rand(1, 7, 32, 32)
27 | >>> cls_score, bbox_pred = self.forward_single(x)
28 |     >>> # Each anchor predicts a sigmoid score per class (no separate background class)
29 |     >>> # and a 4-vector of box deltas; outputs are flattened to (B, H*W*A, ...)
30 |     >>> assert cls_score.shape == (1, 32 * 32 * self.num_anchors, self.num_classes)
31 |     >>> assert bbox_pred.shape == (1, 32 * 32 * self.num_anchors, 4)
33 | """
34 |
35 | def __init__(self,
36 | num_classes,
37 | in_channels,
38 | feat_channels=256,
39 | anchor_scales=[8, 16, 32],
40 | anchor_ratios=[0.5, 1.0, 2.0],
41 | anchor_strides=[4, 8, 16, 32, 64],
42 | stacked_convs=4,
43 | octave_base_scale=4,
44 | scales_per_octave=3,
45 | conv_cfg=None,
46 | norm_cfg=None,
47 | **kwargs):
48 | super(RetinaHead, self).__init__()
49 | self.in_channels = in_channels
50 | self.num_classes = num_classes
51 | self.feat_channels = feat_channels
52 | self.anchor_scales = anchor_scales
53 | self.anchor_ratios = anchor_ratios
54 | self.anchor_strides = anchor_strides
55 | self.stacked_convs = stacked_convs
56 | self.octave_base_scale = octave_base_scale
57 | self.scales_per_octave = scales_per_octave
58 | self.conv_cfg = conv_cfg
59 | self.norm_cfg = norm_cfg
60 | octave_scales = np.array(
61 | [2**(i / scales_per_octave) for i in range(scales_per_octave)])
62 | anchor_scales = octave_scales * octave_base_scale
63 | self.cls_out_channels = num_classes
64 | self.num_anchors = len(self.anchor_ratios) * len(self.anchor_scales)
65 | self._init_layers()
66 |
67 | def _init_layers(self):
68 | self.relu = nn.ReLU(inplace=True)
69 | self.cls_convs = nn.ModuleList()
70 | self.reg_convs = nn.ModuleList()
71 | for i in range(self.stacked_convs):
72 | chn = self.in_channels if i == 0 else self.feat_channels
73 | self.cls_convs.append(
74 | ConvModule(
75 | chn,
76 | self.feat_channels,
77 | 3,
78 | stride=1,
79 | padding=1,
80 | conv_cfg=self.conv_cfg,
81 | norm_cfg=self.norm_cfg))
82 | self.reg_convs.append(
83 | ConvModule(
84 | chn,
85 | self.feat_channels,
86 | 3,
87 | stride=1,
88 | padding=1,
89 | conv_cfg=self.conv_cfg,
90 | norm_cfg=self.norm_cfg))
91 | self.retina_cls = nn.Conv2d(
92 | self.feat_channels,
93 | self.num_anchors * self.cls_out_channels,
94 | 3,
95 | padding=1)
96 | self.retina_reg = nn.Conv2d(
97 | self.feat_channels, self.num_anchors * 4, 3, padding=1)
98 | self.output_act = nn.Sigmoid()
99 |
100 | def init_weights(self):
101 | for m in self.cls_convs:
102 | normal_init(m.conv, std=0.01)
103 | for m in self.reg_convs:
104 | normal_init(m.conv, std=0.01)
105 | bias_cls = bias_init_with_prob(0.01)
106 | normal_init(self.retina_cls, std=0.01, bias=bias_cls)
107 | normal_init(self.retina_reg, std=0.01)
108 |
109 | def forward_single(self, x):
110 | cls_feat = x
111 | reg_feat = x
112 | for cls_conv in self.cls_convs:
113 | cls_feat = cls_conv(cls_feat)
114 | for reg_conv in self.reg_convs:
115 | reg_feat = reg_conv(reg_feat)
116 |
117 | cls_score = self.retina_cls(cls_feat)
118 | cls_score = self.output_act(cls_score)
119 |         # out is B x C x W x H, with C = n_classes * n_anchors
120 | cls_score = cls_score.permute(0, 2, 3, 1)
121 | batch_size, width, height, channels = cls_score.shape
122 | cls_score = cls_score.view(
123 | batch_size, width, height, self.num_anchors, self.num_classes)
124 | cls_score = cls_score.contiguous().view(x.size(0), -1, self.num_classes)
125 |
126 | bbox_pred = self.retina_reg(reg_feat)
127 | bbox_pred = bbox_pred.permute(0, 2, 3, 1)
128 | bbox_pred = bbox_pred.contiguous().view(bbox_pred.size(0), -1, 4)
129 | return cls_score, bbox_pred
130 |
131 | def forward(self, feats):
132 | return multi_apply(self.forward_single, feats)
133 |
--------------------------------------------------------------------------------
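A minimal shape check for RetinaHead, assuming the repository root is on PYTHONPATH. With the default 3 anchor ratios and 3 scales there are 9 anchors per location, so each pyramid level contributes H*W*9 rows to the flattened outputs:

# Feed five BiFPN-sized feature maps through RetinaHead and inspect the output shapes.
import torch
from models.retinahead import RetinaHead

head = RetinaHead(num_classes=20, in_channels=88)
feats = [torch.randn(1, 88, s, s) for s in (64, 32, 16, 8, 4)]
cls_scores, bbox_preds = head(feats)
print(cls_scores[0].shape)   # torch.Size([1, 64*64*9, 20])
print(bbox_preds[0].shape)   # torch.Size([1, 64*64*9, 4])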
/models/utils.py:
--------------------------------------------------------------------------------
1 | import re
2 | import math
3 | import collections
4 | from functools import partial
5 | import torch
6 | from torch import nn
7 | from torch.nn import functional as F
8 | from torch.utils import model_zoo
9 |
10 | ########################################################################
11 | ############### HELPER FUNCTIONS FOR MODEL ARCHITECTURE ################
12 | ########################################################################
13 |
14 |
15 | # Parameters for the entire model (stem, all blocks, and head)
16 | GlobalParams = collections.namedtuple('GlobalParams', [
17 | 'batch_norm_momentum', 'batch_norm_epsilon', 'dropout_rate',
18 | 'num_classes', 'width_coefficient', 'depth_coefficient',
19 | 'depth_divisor', 'min_depth', 'drop_connect_rate', 'image_size'])
20 |
21 | # Parameters for an individual model block
22 | BlockArgs = collections.namedtuple('BlockArgs', [
23 | 'kernel_size', 'num_repeat', 'input_filters', 'output_filters',
24 | 'expand_ratio', 'id_skip', 'stride', 'se_ratio'])
25 |
26 | # Change namedtuple defaults
27 | GlobalParams.__new__.__defaults__ = (None,) * len(GlobalParams._fields)
28 | BlockArgs.__new__.__defaults__ = (None,) * len(BlockArgs._fields)
29 |
30 |
31 | class SwishImplementation(torch.autograd.Function):
32 | @staticmethod
33 | def forward(ctx, i):
34 | result = i * torch.sigmoid(i)
35 | ctx.save_for_backward(i)
36 | return result
37 |
38 | @staticmethod
39 | def backward(ctx, grad_output):
40 | i = ctx.saved_variables[0]
41 | sigmoid_i = torch.sigmoid(i)
42 | return grad_output * (sigmoid_i * (1 + i * (1 - sigmoid_i)))
43 |
44 |
45 | class MemoryEfficientSwish(nn.Module):
46 | def forward(self, x):
47 | return SwishImplementation.apply(x)
48 |
49 |
50 | class Swish(nn.Module):
51 | def forward(self, x):
52 | return x * torch.sigmoid(x)
53 |
54 |
55 | def round_filters(filters, global_params):
56 | """ Calculate and round number of filters based on depth multiplier. """
57 | multiplier = global_params.width_coefficient
58 | if not multiplier:
59 | return filters
60 | divisor = global_params.depth_divisor
61 | min_depth = global_params.min_depth
62 | filters *= multiplier
63 | min_depth = min_depth or divisor
64 | new_filters = max(min_depth, int(
65 | filters + divisor / 2) // divisor * divisor)
66 | if new_filters < 0.9 * filters: # prevent rounding by more than 10%
67 | new_filters += divisor
68 | return int(new_filters)
69 |
70 |
71 | def round_repeats(repeats, global_params):
72 | """ Round number of filters based on depth multiplier. """
73 | multiplier = global_params.depth_coefficient
74 | if not multiplier:
75 | return repeats
76 | return int(math.ceil(multiplier * repeats))
77 |
78 |
79 | def drop_connect(inputs, p, training):
80 | """ Drop connect. """
81 | if not training:
82 | return inputs
83 | batch_size = inputs.shape[0]
84 | keep_prob = 1 - p
85 | random_tensor = keep_prob
86 | random_tensor += torch.rand([batch_size, 1, 1, 1],
87 | dtype=inputs.dtype, device=inputs.device)
88 | binary_tensor = torch.floor(random_tensor)
89 | output = inputs / keep_prob * binary_tensor
90 | return output
91 |
92 |
93 | def get_same_padding_conv2d(image_size=None):
94 | """ Chooses static padding if you have specified an image size, and dynamic padding otherwise.
95 | Static padding is necessary for ONNX exporting of models. """
96 | if image_size is None:
97 | return Conv2dDynamicSamePadding
98 | else:
99 | return partial(Conv2dStaticSamePadding, image_size=image_size)
100 |
101 |
102 | class Conv2dDynamicSamePadding(nn.Conv2d):
103 | """ 2D Convolutions like TensorFlow, for a dynamic image size """
104 |
105 | def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, groups=1, bias=True):
106 | super().__init__(in_channels, out_channels,
107 | kernel_size, stride, 0, dilation, groups, bias)
108 | self.stride = self.stride if len(self.stride) == 2 else [
109 | self.stride[0]] * 2
110 |
111 | def forward(self, x):
112 | ih, iw = x.size()[-2:]
113 | kh, kw = self.weight.size()[-2:]
114 | sh, sw = self.stride
115 | oh, ow = math.ceil(ih / sh), math.ceil(iw / sw)
116 | pad_h = max((oh - 1) * self.stride[0] +
117 | (kh - 1) * self.dilation[0] + 1 - ih, 0)
118 | pad_w = max((ow - 1) * self.stride[1] +
119 | (kw - 1) * self.dilation[1] + 1 - iw, 0)
120 | if pad_h > 0 or pad_w > 0:
121 | x = F.pad(x, [pad_w // 2, pad_w - pad_w //
122 | 2, pad_h // 2, pad_h - pad_h // 2])
123 | return F.conv2d(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups)
124 |
125 |
126 | class Conv2dStaticSamePadding(nn.Conv2d):
127 | """ 2D Convolutions like TensorFlow, for a fixed image size"""
128 |
129 | def __init__(self, in_channels, out_channels, kernel_size, image_size=None, **kwargs):
130 | super().__init__(in_channels, out_channels, kernel_size, **kwargs)
131 | self.stride = self.stride if len(self.stride) == 2 else [
132 | self.stride[0]] * 2
133 |
134 | # Calculate padding based on image size and save it
135 | assert image_size is not None
136 | ih, iw = image_size if type(image_size) == list else [
137 | image_size, image_size]
138 | kh, kw = self.weight.size()[-2:]
139 | sh, sw = self.stride
140 | oh, ow = math.ceil(ih / sh), math.ceil(iw / sw)
141 | pad_h = max((oh - 1) * self.stride[0] +
142 | (kh - 1) * self.dilation[0] + 1 - ih, 0)
143 | pad_w = max((ow - 1) * self.stride[1] +
144 | (kw - 1) * self.dilation[1] + 1 - iw, 0)
145 | if pad_h > 0 or pad_w > 0:
146 | self.static_padding = nn.ZeroPad2d(
147 | (pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2))
148 | else:
149 | self.static_padding = Identity()
150 |
151 | def forward(self, x):
152 | x = self.static_padding(x)
153 | x = F.conv2d(x, self.weight, self.bias, self.stride,
154 | self.padding, self.dilation, self.groups)
155 | return x
156 |
157 |
158 | class Identity(nn.Module):
159 | def __init__(self, ):
160 | super(Identity, self).__init__()
161 |
162 | def forward(self, input):
163 | return input
164 |
165 |
166 | ########################################################################
167 | ############## HELPERS FUNCTIONS FOR LOADING MODEL PARAMS ##############
168 | ########################################################################
169 |
170 |
171 | def efficientnet_params(model_name):
172 | """ Map EfficientNet model name to parameter coefficients. """
173 | params_dict = {
174 | # Coefficients: width,depth,res,dropout
175 | 'efficientnet-b0': (1.0, 1.0, 224, 0.2),
176 | 'efficientnet-b1': (1.0, 1.1, 240, 0.2),
177 | 'efficientnet-b2': (1.1, 1.2, 260, 0.3),
178 | 'efficientnet-b3': (1.2, 1.4, 300, 0.3),
179 | 'efficientnet-b4': (1.4, 1.8, 380, 0.4),
180 | 'efficientnet-b5': (1.6, 2.2, 456, 0.4),
181 | 'efficientnet-b6': (1.8, 2.6, 528, 0.5),
182 | 'efficientnet-b7': (2.0, 3.1, 600, 0.5),
183 | }
184 | return params_dict[model_name]
185 |
186 |
187 | class BlockDecoder(object):
188 | """ Block Decoder for readability, straight from the official TensorFlow repository """
189 |
190 | @staticmethod
191 | def _decode_block_string(block_string):
192 | """ Gets a block through a string notation of arguments. """
193 | assert isinstance(block_string, str)
194 |
195 | ops = block_string.split('_')
196 | options = {}
197 | for op in ops:
198 | splits = re.split(r'(\d.*)', op)
199 | if len(splits) >= 2:
200 | key, value = splits[:2]
201 | options[key] = value
202 |
203 | # Check stride
204 | assert (('s' in options and len(options['s']) == 1) or
205 | (len(options['s']) == 2 and options['s'][0] == options['s'][1]))
206 |
207 | return BlockArgs(
208 | kernel_size=int(options['k']),
209 | num_repeat=int(options['r']),
210 | input_filters=int(options['i']),
211 | output_filters=int(options['o']),
212 | expand_ratio=int(options['e']),
213 | id_skip=('noskip' not in block_string),
214 | se_ratio=float(options['se']) if 'se' in options else None,
215 | stride=[int(options['s'][0])])
216 |
217 | @staticmethod
218 | def _encode_block_string(block):
219 | """Encodes a block to a string."""
220 | args = [
221 | 'r%d' % block.num_repeat,
222 | 'k%d' % block.kernel_size,
223 | 's%d%d' % (block.stride[0], block.stride[0]),  # BlockArgs stores a single stride value
224 | 'e%s' % block.expand_ratio,
225 | 'i%d' % block.input_filters,
226 | 'o%d' % block.output_filters
227 | ]
228 | if block.se_ratio is not None and 0 < block.se_ratio <= 1:
229 | args.append('se%s' % block.se_ratio)
230 | if block.id_skip is False:
231 | args.append('noskip')
232 | return '_'.join(args)
233 |
234 | @staticmethod
235 | def decode(string_list):
236 | """
237 | Decodes a list of string notations to specify blocks inside the network.
238 | :param string_list: a list of strings, each string is a notation of block
239 | :return: a list of BlockArgs namedtuples of block args
240 | """
241 | assert isinstance(string_list, list)
242 | blocks_args = []
243 | for block_string in string_list:
244 | blocks_args.append(BlockDecoder._decode_block_string(block_string))
245 | return blocks_args
246 |
247 | @staticmethod
248 | def encode(blocks_args):
249 | """
250 | Encodes a list of BlockArgs to a list of strings.
251 | :param blocks_args: a list of BlockArgs namedtuples of block args
252 | :return: a list of strings, each string is a notation of block
253 | """
254 | block_strings = []
255 | for block in blocks_args:
256 | block_strings.append(BlockDecoder._encode_block_string(block))
257 | return block_strings
258 |
259 |
260 | def efficientnet(width_coefficient=None, depth_coefficient=None, dropout_rate=0.2,
261 | drop_connect_rate=0.2, image_size=None, num_classes=1000):
262 | """ Creates a efficientnet model. """
263 |
264 | blocks_args = [
265 | 'r1_k3_s11_e1_i32_o16_se0.25', 'r2_k3_s22_e6_i16_o24_se0.25',
266 | 'r2_k5_s22_e6_i24_o40_se0.25', 'r3_k3_s22_e6_i40_o80_se0.25',
267 | 'r3_k5_s22_e6_i80_o112_se0.25', 'r4_k5_s22_e6_i112_o192_se0.25',
268 | 'r1_k3_s22_e6_i192_o320_se0.25',
269 | ]
270 | blocks_args = BlockDecoder.decode(blocks_args)
271 |
272 | global_params = GlobalParams(
273 | batch_norm_momentum=0.99,
274 | batch_norm_epsilon=1e-3,
275 | dropout_rate=dropout_rate,
276 | drop_connect_rate=drop_connect_rate,
277 | # data_format='channels_last', # removed, this is always true in PyTorch
278 | num_classes=num_classes,
279 | width_coefficient=width_coefficient,
280 | depth_coefficient=depth_coefficient,
281 | depth_divisor=8,
282 | min_depth=None,
283 | image_size=image_size,
284 | )
285 |
286 | return blocks_args, global_params
287 |
288 |
289 | def get_model_params(model_name, override_params):
290 | """ Get the block args and global params for a given model """
291 | if model_name.startswith('efficientnet'):
292 | w, d, s, p = efficientnet_params(model_name)
293 | # note: all models have drop connect rate = 0.2
294 | blocks_args, global_params = efficientnet(
295 | width_coefficient=w, depth_coefficient=d, dropout_rate=p, image_size=s)
296 | else:
297 | raise NotImplementedError(
298 | 'model name is not pre-defined: %s' % model_name)
299 | if override_params:
300 | # ValueError will be raised here if override_params has fields not included in global_params.
301 | global_params = global_params._replace(**override_params)
302 | return blocks_args, global_params
303 |
304 |
305 | url_map = {
306 | 'efficientnet-b0': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b0-355c32eb.pth',
307 | 'efficientnet-b1': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b1-f1951068.pth',
308 | 'efficientnet-b2': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b2-8bb594d6.pth',
309 | 'efficientnet-b3': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b3-5fb5a3c3.pth',
310 | 'efficientnet-b4': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b4-6ed6700e.pth',
311 | 'efficientnet-b5': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b5-b6417697.pth',
312 | 'efficientnet-b6': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b6-c76e70fd.pth',
313 | 'efficientnet-b7': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b7-dcc49843.pth',
314 | }
315 |
316 |
317 | def load_pretrained_weights(model, model_name, load_fc=True):
318 | """ Loads pretrained weights, and downloads if loading for the first time. """
319 | state_dict = model_zoo.load_url(url_map[model_name])
320 | if load_fc:
321 | model.load_state_dict(state_dict)
322 | else:
323 | state_dict.pop('_fc.weight')
324 | state_dict.pop('_fc.bias')
325 | res = model.load_state_dict(state_dict, strict=False)
326 | assert set(res.missing_keys) == set(
327 | ['_fc.weight', '_fc.bias']), 'issue loading pretrained weights'
328 | print('Loaded pretrained weights for {}'.format(model_name))
329 |
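330 | # Minimal usage sketch of the scaling helpers (values below are illustrative
331 | # and not used elsewhere in this repo):
332 | #   blocks_args, global_params = get_model_params('efficientnet-b2', None)
333 | #   round_filters(32, global_params)   # 32 * 1.1 = 35.2 -> snapped to a multiple of 8 -> 32
334 | #   round_repeats(2, global_params)    # ceil(1.2 * 2) = 3
335 | #   Conv2d = get_same_padding_conv2d(image_size=global_params.image_size)
336 | #   conv = Conv2d(3, round_filters(32, global_params), kernel_size=3, stride=2)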
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | pandas
3 | torch
4 | torchvision
5 | pytoan
6 | albumentations
7 |
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from models import EfficientDet
3 | from models.efficientnet import EfficientNet
4 |
5 | if __name__ == '__main__':
6 |
7 | # a single random input batch is shared by both checks below
8 | inputs = torch.randn(4, 3, 512, 512)
9 |
10 | # Test EfficientNet
11 | model = EfficientNet.from_pretrained('efficientnet-b0')
12 | P = model(inputs)
13 | for idx, p in enumerate(P):
14 | print('P{}: {}'.format(idx, p.size()))
15 |
16 | # print('model: ', model)
17 |
18 | # Test inference
19 | model = EfficientDet(num_classes=20, is_training=False)
20 | output = model(inputs)
21 | for out in output:
22 | print(out.size())
23 |
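24 | # Note (informal): this is only a smoke test. With is_training=False the
25 | # detector output is consumed elsewhere (see utils/metric.py) as
26 | # scores, labels, boxes; this script just prints each output's size.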
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
1 | from tqdm import tqdm
2 | import argparse
3 | import os
4 | import random
5 | import shutil
6 | import time
7 | import warnings
8 | import torch
9 | import torch.nn as nn
10 | import torch.nn.parallel
11 | import torch.backends.cudnn as cudnn
12 | import torch.distributed as dist
13 | import torch.optim
14 | import torch.multiprocessing as mp
15 | import torch.utils.data
16 | import torch.utils.data.distributed
17 | import torchvision.transforms as transforms
18 | import torchvision.datasets as datasets
19 |
20 | import os
21 | import sys
22 | import time
23 | import argparse
24 | import numpy as np
25 | import torch
26 | import torch.optim as optim
27 | import torch.backends.cudnn as cudnn
28 | from torch.utils.data import DataLoader
29 |
30 | from models.efficientdet import EfficientDet
31 | from models.losses import FocalLoss
32 | from datasets import VOCDetection, CocoDataset, get_augumentation, detection_collate, Resizer, Normalizer, Augmenter, collater
33 | from utils import EFFICIENTDET, get_state_dict
34 | from eval import evaluate, evaluate_coco
35 |
36 | parser = argparse.ArgumentParser(description='EfficientDet Training')
37 | parser.add_argument('--dataset', default='VOC', choices=['VOC', 'COCO'],
38 | type=str, help='VOC or COCO')
39 | parser.add_argument(
40 | '--dataset_root',
41 | default='/root/data/VOCdevkit/',
42 | help='Dataset root directory path [/root/data/VOCdevkit/, /root/data/coco/]')
43 | parser.add_argument('--network', default='efficientdet-d0', type=str,
44 | help='efficientdet-[d0, d1, ..]')
45 |
46 | parser.add_argument('--resume', default=None, type=str,
47 | help='Checkpoint state_dict file to resume training from')
48 | parser.add_argument('--num_epoch', default=500, type=int,
49 | help='Num epoch for training')
50 | parser.add_argument('--batch_size', default=32, type=int,
51 | help='Batch size for training')
52 | parser.add_argument('--num_class', default=20, type=int,
53 | help='Number of class used in model')
54 | parser.add_argument('--device', default=[0, 1], type=int, nargs='+',
55 | help='GPU device ids to use for training')
56 | parser.add_argument('--grad_accumulation_steps', default=1, type=int,
57 | help='Number of gradient accumulation steps')
58 | parser.add_argument('--lr', '--learning-rate', default=1e-4, type=float,
59 | help='initial learning rate')
60 | parser.add_argument('--momentum', default=0.9, type=float,
61 | help='Momentum value for optim')
62 | parser.add_argument('--weight_decay', default=5e-4, type=float,
63 | help='Weight decay for SGD')
64 | parser.add_argument('--gamma', default=0.1, type=float,
65 | help='Gamma update for SGD')
66 | parser.add_argument('--save_folder', default='./saved/weights/', type=str,
67 | help='Directory for saving checkpoint models')
68 | parser.add_argument('-j', '--workers', default=4, type=int, metavar='N',
69 | help='number of data loading workers (default: 4)')
70 | parser.add_argument('--start_epoch', default=0, type=int, metavar='N',
71 | help='manual epoch number (useful on restarts)')
72 | parser.add_argument('--world-size', default=1, type=int,
73 | help='number of nodes for distributed training')
74 | parser.add_argument('--rank', default=0, type=int,
75 | help='node rank for distributed training')
76 | parser.add_argument('--dist-url', default='env://', type=str,
77 | help='url used to set up distributed training')
78 | parser.add_argument('--dist-backend', default='nccl', type=str,
79 | help='distributed backend')
80 | parser.add_argument('--seed', default=24, type=int,
81 | help='seed for initializing training. ')
82 | parser.add_argument('--gpu', default=None, type=int,
83 | help='GPU id to use.')
84 | parser.add_argument(
85 | '--multiprocessing-distributed',
86 | action='store_true',
87 | help='Use multi-processing distributed training to launch '
88 | 'N processes per node, which has N GPUs. This is the '
89 | 'fastest way to use PyTorch for either single node or '
90 | 'multi node data parallel training')
91 |
92 | iteration = 1
93 |
94 |
95 | def train(train_loader, model, scheduler, optimizer, epoch, args):
96 | global iteration
97 | print("{} epoch: \t start training....".format(epoch))
98 | start = time.time()
99 | total_loss = []
100 | model.train()
101 | model.module.is_training = True
102 | model.module.freeze_bn()
103 | optimizer.zero_grad()
104 | for idx, (images, annotations) in enumerate(train_loader):
105 | images = images.cuda().float()
106 | annotations = annotations.cuda()
107 | classification_loss, regression_loss = model([images, annotations])
108 | classification_loss = classification_loss.mean()
109 | regression_loss = regression_loss.mean()
110 | loss = classification_loss + regression_loss
111 | if bool(loss == 0):
112 | print('loss equal zero(0)')
113 | continue
114 | loss.backward()
115 | if (idx + 1) % args.grad_accumulation_steps == 0:
116 | torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
117 | optimizer.step()
118 | optimizer.zero_grad()
119 |
120 | total_loss.append(loss.item())
121 | if(iteration % 300 == 0):
122 | print('{} iteration: training ...'.format(iteration))
123 | ans = {
124 | 'epoch': epoch,
125 | 'iteration': iteration,
126 | 'cls_loss': classification_loss.item(),
127 | 'reg_loss': regression_loss.item(),
128 | 'mean_loss': np.mean(total_loss)
129 | }
130 | for key, value in ans.items():
131 | print(' {:15s}: {}'.format(str(key), value))
132 | iteration += 1
133 | scheduler.step(np.mean(total_loss))
134 | result = {
135 | 'time': time.time() - start,
136 | 'loss': np.mean(total_loss)
137 | }
138 | for key, value in result.items():
139 | print(' {:15s}: {}'.format(str(key), value))
140 |
141 |
142 | def test(dataset, model, epoch, args):
143 | print("{} epoch: \t start validation....".format(epoch))
144 | model = model.module
145 | model.eval()
146 | model.is_training = False
147 | with torch.no_grad():
148 | if(args.dataset == 'VOC'):
149 | evaluate(dataset, model)
150 | else:
151 | evaluate_coco(dataset, model)
152 |
153 |
154 | def main_worker(gpu, ngpus_per_node, args):
155 | args.gpu = gpu
156 | if args.gpu is not None:
157 | print("Use GPU: {} for training".format(args.gpu))
158 |
159 | if args.distributed:
160 | if args.dist_url == "env://" and args.rank == -1:
161 | # args.rank = int(os.environ["RANK"])
162 | args.rank = 1
163 | if args.multiprocessing_distributed:
164 | # For multiprocessing distributed training, rank needs to be the
165 | # global rank among all the processes
166 | args.rank = args.rank * ngpus_per_node + gpu
167 | dist.init_process_group(
168 | backend=args.dist_backend,
169 | init_method=args.dist_url,
170 | world_size=args.world_size,
171 | rank=args.rank)
172 |
173 | # Training dataset
174 | train_dataset = []
175 | if(args.dataset == 'VOC'):
176 | train_dataset = VOCDetection(root=args.dataset_root, transform=transforms.Compose(
177 | [Normalizer(), Augmenter(), Resizer()]))
178 | valid_dataset = VOCDetection(root=args.dataset_root, image_sets=[(
179 | '2007', 'test')], transform=transforms.Compose([Normalizer(), Resizer()]))
180 | args.num_class = train_dataset.num_classes()
181 | elif(args.dataset == 'COCO'):
182 | train_dataset = CocoDataset(
183 | root_dir=args.dataset_root,
184 | set_name='train2017',
185 | transform=transforms.Compose(
186 | [
187 | Normalizer(),
188 | Augmenter(),
189 | Resizer()]))
190 | valid_dataset = CocoDataset(
191 | root_dir=args.dataset_root,
192 | set_name='val2017',
193 | transform=transforms.Compose(
194 | [
195 | Normalizer(),
196 | Resizer()]))
197 | args.num_class = train_dataset.num_classes()
198 |
199 | train_loader = DataLoader(train_dataset,
200 | batch_size=args.batch_size,
201 | num_workers=args.workers,
202 | shuffle=True,
203 | collate_fn=collater,
204 | pin_memory=True)
205 | valid_loader = DataLoader(valid_dataset,
206 | batch_size=1,
207 | num_workers=args.workers,
208 | shuffle=False,
209 | collate_fn=collater,
210 | pin_memory=True)
211 |
212 | checkpoint = []
213 | if(args.resume is not None):
214 | if os.path.isfile(args.resume):
215 | print("=> loading checkpoint '{}'".format(args.resume))
216 | if args.gpu is None:
217 | checkpoint = torch.load(args.resume)
218 | else:
219 | # Map model to be loaded to specified single gpu.
220 | loc = 'cuda:{}'.format(args.gpu)
221 | checkpoint = torch.load(args.resume, map_location=loc)
222 | params = checkpoint['parser']
223 | args.num_class = params.num_class
224 | args.network = params.network
225 | args.start_epoch = checkpoint['epoch'] + 1
226 | del params
227 |
228 | model = EfficientDet(num_classes=args.num_class,
229 | network=args.network,
230 | W_bifpn=EFFICIENTDET[args.network]['W_bifpn'],
231 | D_bifpn=EFFICIENTDET[args.network]['D_bifpn'],
232 | D_class=EFFICIENTDET[args.network]['D_class']
233 | )
234 | if(args.resume is not None):
235 | model.load_state_dict(checkpoint['state_dict'])
236 | del checkpoint
237 | if args.distributed:
238 | # For multiprocessing distributed, DistributedDataParallel constructor
239 | # should always set the single device scope, otherwise,
240 | # DistributedDataParallel will use all available devices.
241 | if args.gpu is not None:
242 | torch.cuda.set_device(args.gpu)
243 | model.cuda(args.gpu)
244 | # When using a single GPU per process and per
245 | # DistributedDataParallel, we need to divide the batch size
246 | # ourselves based on the total number of GPUs we have
247 | args.batch_size = int(args.batch_size / ngpus_per_node)
248 | args.workers = int(
249 | (args.workers + ngpus_per_node - 1) / ngpus_per_node)
250 | model = torch.nn.parallel.DistributedDataParallel(
251 | model, device_ids=[args.gpu], find_unused_parameters=True)
252 | print('Run with DistributedDataParallel with device_ids....')
253 | else:
254 | model.cuda()
255 | # DistributedDataParallel will divide and allocate batch_size to all
256 | # available GPUs if device_ids are not set
257 | model = torch.nn.parallel.DistributedDataParallel(model)
258 | print('Run with DistributedDataParallel without device_ids....')
259 | elif args.gpu is not None:
260 | torch.cuda.set_device(args.gpu)
261 | model = model.cuda(args.gpu)
262 | else:
263 | model = model.cuda()
264 | print('Run with DataParallel ....')
265 | model = torch.nn.DataParallel(model).cuda()
266 |
267 | # define loss function (criterion) , optimizer, scheduler
268 | optimizer = optim.AdamW(model.parameters(), lr=args.lr)
269 | scheduler = optim.lr_scheduler.ReduceLROnPlateau(
270 | optimizer, patience=3, verbose=True)
271 | cudnn.benchmark = True
272 |
273 | for epoch in range(args.start_epoch, args.num_epoch):
274 | train(train_loader, model, scheduler, optimizer, epoch, args)
275 |
276 | if (epoch + 1) % 5 == 0:
277 | test(valid_dataset, model, epoch, args)
278 |
279 | state = {
280 | 'epoch': epoch,
281 | 'parser': args,
282 | 'state_dict': get_state_dict(model)
283 | }
284 |
285 | torch.save(
286 | state,
287 | os.path.join(
288 | args.save_folder,
289 | args.dataset,
290 | args.network,
291 | "checkpoint_{}.pth".format(epoch)))
292 |
293 |
294 | def main():
295 | args = parser.parse_args()
296 | if(not os.path.exists(os.path.join(args.save_folder, args.dataset, args.network))):
297 | os.makedirs(os.path.join(args.save_folder, args.dataset, args.network))
298 | if args.seed is not None:
299 | random.seed(args.seed)
300 | torch.manual_seed(args.seed)
301 | cudnn.deterministic = True
302 | warnings.warn('You have chosen to seed training. '
303 | 'This will turn on the CUDNN deterministic setting, '
304 | 'which can slow down your training considerably! '
305 | 'You may see unexpected behavior when restarting '
306 | 'from checkpoints.')
307 |
308 | if args.gpu is not None:
309 | warnings.warn('You have chosen a specific GPU. This will completely '
310 | 'disable data parallelism.')
311 | os.environ['MASTER_ADDR'] = 'localhost'
312 | os.environ['MASTER_PORT'] = '12355'
313 | os.environ['WORLD_SIZE'] = '2'
314 | if args.dist_url == "env://" and args.world_size == -1:
315 | args.world_size = int(os.environ["WORLD_SIZE"])
316 |
317 | args.distributed = args.world_size > 1 or args.multiprocessing_distributed
318 | ngpus_per_node = torch.cuda.device_count()
319 | if args.multiprocessing_distributed:
320 | # Since we have ngpus_per_node processes per node, the total world_size
321 | # needs to be adjusted accordingly
322 | args.world_size = ngpus_per_node * args.world_size
323 | # Use torch.multiprocessing.spawn to launch distributed processes: the
324 | # main_worker process function
325 | mp.spawn(main_worker, nprocs=ngpus_per_node,
326 | args=(ngpus_per_node, args))
327 | else:
328 | # Simply call main_worker function
329 | main_worker(args.gpu, ngpus_per_node, args)
330 |
331 |
332 | if __name__ == "__main__":
333 | main()
334 |
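335 | # Example invocation (sketch; paths and sizes are illustrative):
336 | #   python train.py --dataset VOC --dataset_root /root/data/VOCdevkit/ \
337 | #       --network efficientdet-d0 --batch_size 32 --num_epoch 500
338 | # Checkpoints are written to <save_folder>/<dataset>/<network>/checkpoint_<epoch>.pth
339 | # after every epoch, and validation runs every 5 epochs.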
--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .helper import *
2 | from .util import *
3 | from .visualization import *
4 | from .vis_bbox import vis_bbox
5 | from .config_eff import *
--------------------------------------------------------------------------------
/utils/config_eff.py:
--------------------------------------------------------------------------------
1 | EFFICIENTDET = {
2 | 'efficientdet-d0': {'input_size': 512,
3 | 'backbone': 'B0',
4 | 'W_bifpn': 64,
5 | 'D_bifpn': 2,
6 | 'D_class': 3},
7 | 'efficientdet-d1': {'input_size': 640,
8 | 'backbone': 'B1',
9 | 'W_bifpn': 88,
10 | 'D_bifpn': 3,
11 | 'D_class': 3},
12 | 'efficientdet-d2': {'input_size': 768,
13 | 'backbone': 'B2',
14 | 'W_bifpn': 112,
15 | 'D_bifpn': 4,
16 | 'D_class': 3},
17 | 'efficientdet-d3': {'input_size': 896,
18 | 'backbone': 'B3',
19 | 'W_bifpn': 160,
20 | 'D_bifpn': 5,
21 | 'D_class': 4},
22 | 'efficientdet-d4': {'input_size': 1024,
23 | 'backbone': 'B4',
24 | 'W_bifpn': 224,
25 | 'D_bifpn': 6,
26 | 'D_class': 4},
27 | 'efficientdet-d5': {'input_size': 1280,
28 | 'backbone': 'B5',
29 | 'W_bifpn': 288,
30 | 'D_bifpn': 7,
31 | 'D_class': 4},
32 | 'efficientdet-d6': {'input_size': 1408,
33 | 'backbone': 'B6',
34 | 'W_bifpn': 384,
35 | 'D_bifpn': 8,
36 | 'D_class': 5},
37 | 'efficientdet-d7': {'input_size': 1636,
38 | 'backbone': 'B6',
39 | 'W_bifpn': 384,
40 | 'D_bifpn': 8,
41 | 'D_class': 5},
42 | }
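43 |
44 | # Usage sketch: these entries are consumed in train.py, e.g.
45 | #   EFFICIENTDET['efficientdet-d0']
46 | #   -> {'input_size': 512, 'backbone': 'B0', 'W_bifpn': 64, 'D_bifpn': 2, 'D_class': 3}
47 | # W_bifpn, D_bifpn and D_class are (by name) the BiFPN width, BiFPN depth and
48 | # prediction-head depth passed to the EfficientDet constructor.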
--------------------------------------------------------------------------------
/utils/helper.py:
--------------------------------------------------------------------------------
1 | import yaml
2 | import numpy as np
3 | import torch
4 | import os
5 | import requests
6 | import socket
7 | import datetime
8 | import json
9 |
10 |
11 | def load_yaml(file_name):
12 | with open(file_name, 'r') as stream:
13 | config = yaml.load(stream, Loader=yaml.FullLoader)
14 | return config
15 |
16 |
17 | def init_seed(SEED=42):
18 | os.environ['PYTHONHASHSEED'] = str(SEED)
19 | np.random.seed(SEED)
20 | torch.manual_seed(SEED)
21 | torch.cuda.manual_seed(SEED)
22 | torch.backends.cudnn.deterministic = True
23 |
24 |
25 | def get_state_dict(model):
26 | if type(model) == torch.nn.DataParallel:
27 | state_dict = model.module.state_dict()
28 | else:
29 | state_dict = model.state_dict()
30 | return state_dict
31 |
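32 | # Usage sketch (the file name below is illustrative):
33 | #   config = load_yaml('configs/efficientdet-d0.yaml')
34 | #   init_seed(42)
35 | #   state_dict = get_state_dict(model)   # unwraps nn.DataParallel if needed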
--------------------------------------------------------------------------------
/utils/metric.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 |
3 | import numpy as np
4 | import json
5 | import os
6 |
7 | import torch
8 |
9 |
10 | def compute_overlap(a, b):
11 | """
12 | Parameters
13 | ----------
14 | a: (N, 4) ndarray of float
15 | b: (K, 4) ndarray of float
16 | Returns
17 | -------
18 | overlaps: (N, K) ndarray of overlap between boxes and query_boxes
19 | """
20 | area = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1])
21 |
22 | iw = np.minimum(np.expand_dims(
23 | a[:, 2], axis=1), b[:, 2]) - np.maximum(np.expand_dims(a[:, 0], 1), b[:, 0])
24 | ih = np.minimum(np.expand_dims(
25 | a[:, 3], axis=1), b[:, 3]) - np.maximum(np.expand_dims(a[:, 1], 1), b[:, 1])
26 |
27 | iw = np.maximum(iw, 0)
28 | ih = np.maximum(ih, 0)
29 |
30 | ua = np.expand_dims((a[:, 2] - a[:, 0]) *
31 | (a[:, 3] - a[:, 1]), axis=1) + area - iw * ih
32 |
33 | ua = np.maximum(ua, np.finfo(float).eps)
34 |
35 | intersection = iw * ih
36 |
37 | return intersection / ua
38 |
39 |
40 | def _compute_ap(recall, precision):
41 | """ Compute the average precision, given the recall and precision curves.
42 | Code originally from https://github.com/rbgirshick/py-faster-rcnn.
43 | # Arguments
44 | recall: The recall curve (list).
45 | precision: The precision curve (list).
46 | # Returns
47 | The average precision as computed in py-faster-rcnn.
48 | """
49 | # correct AP calculation
50 | # first append sentinel values at the end
51 | mrec = np.concatenate(([0.], recall, [1.]))
52 | mpre = np.concatenate(([0.], precision, [0.]))
53 |
54 | # compute the precision envelope
55 | for i in range(mpre.size - 1, 0, -1):
56 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])
57 |
58 | # to calculate area under PR curve, look for points
59 | # where X axis (recall) changes value
60 | i = np.where(mrec[1:] != mrec[:-1])[0]
61 |
62 | # and sum (\Delta recall) * prec
63 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])
64 | return ap
65 |
66 |
67 | def _get_detections(dataset, model, score_threshold=0.05, max_detections=100, save_path=None):
68 | """ Get the detections from the retinanet using the generator.
69 | The result is a list of lists such that the size is:
70 | all_detections[num_images][num_classes] = detections[num_detections, 4 + num_classes]
71 | # Arguments
72 | dataset : The dataset that yields the images to run through the model.
73 | model : The detection model to run on the images.
74 | score_threshold : The score confidence threshold to use.
75 | max_detections : The maximum number of detections to use per image.
76 | save_path : The path to save the images with visualized detections to.
77 | # Returns
78 | A list of lists containing the detections for each image in the generator.
79 | """
80 | all_detections = [[None for i in range(
81 | dataset.num_classes())] for j in range(len(dataset))]
82 |
83 | model.eval()
84 |
85 | with torch.no_grad():
86 | for index in range(len(dataset)):
87 | data = dataset[index]
88 | scale = data['scale']
89 |
90 | # run network
91 | scores, labels, boxes = model(data['img'].permute(
92 | 2, 0, 1).cuda().float().unsqueeze(dim=0))
93 | scores = scores.cpu().numpy()
94 | labels = labels.cpu().numpy()
95 | boxes = boxes.cpu().numpy()
96 |
97 | # correct boxes for image scale
98 | boxes /= scale
99 |
100 | # select indices which have a score above the threshold
101 | indices = np.where(scores > score_threshold)[0]
102 | if indices.shape[0] > 0:
103 | # select those scores
104 | scores = scores[indices]
105 |
106 | # find the order with which to sort the scores
107 | scores_sort = np.argsort(-scores)[:max_detections]
108 |
109 | # select detections
110 | image_boxes = boxes[indices[scores_sort], :]
111 | image_scores = scores[scores_sort]
112 | image_labels = labels[indices[scores_sort]]
113 | image_detections = np.concatenate([image_boxes, np.expand_dims(
114 | image_scores, axis=1), np.expand_dims(image_labels, axis=1)], axis=1)
115 |
116 | # copy detections to all_detections
117 | for label in range(dataset.num_classes()):
118 | all_detections[index][label] = image_detections[image_detections[:, -1] == label, :-1]
119 | else:
120 | # copy detections to all_detections
121 | for label in range(dataset.num_classes()):
122 | all_detections[index][label] = np.zeros((0, 5))
123 |
124 | print('{}/{}'.format(index + 1, len(dataset)), end='\r')
125 |
126 | return all_detections
127 |
128 |
129 | def _get_annotations(generator):
130 | """ Get the ground truth annotations from the generator.
131 | The result is a list of lists such that the size is:
132 | all_detections[num_images][num_classes] = annotations[num_detections, 5]
133 | # Arguments
134 | generator : The generator used to retrieve ground truth annotations.
135 | # Returns
136 | A list of lists containing the annotations for each image in the generator.
137 | """
138 | all_annotations = [[None for i in range(
139 | generator.num_classes())] for j in range(len(generator))]
140 |
141 | for i in range(len(generator)):
142 | # load the annotations
143 | annotations = generator.load_annotations(i)
144 |
145 | # copy detections to all_annotations
146 | for label in range(generator.num_classes()):
147 | all_annotations[i][label] = annotations[annotations[:, 4]
148 | == label, :4].copy()
149 |
150 | print('{}/{}'.format(i + 1, len(generator)), end='\r')
151 |
152 | return all_annotations
153 |
154 |
155 | def evaluate(
156 | generator,
157 | retinanet,
158 | iou_threshold=0.5,
159 | score_threshold=0.05,
160 | max_detections=100,
161 | save_path=None
162 | ):
163 | """ Evaluate a given dataset using a given retinanet.
164 | # Arguments
165 | generator : The generator that represents the dataset to evaluate.
166 | retinanet : The retinanet to evaluate.
167 | iou_threshold : The threshold used to consider when a detection is positive or negative.
168 | score_threshold : The score confidence threshold to use for detections.
169 | max_detections : The maximum number of detections to use per image.
170 | save_path : The path to save images with visualized detections to.
171 | # Returns
172 | A dict mapping class names to mAP scores.
173 | """
174 |
175 | # gather all detections and annotations
176 |
177 | all_detections = _get_detections(
178 | generator, retinanet, score_threshold=score_threshold, max_detections=max_detections, save_path=save_path)
179 | all_annotations = _get_annotations(generator)
180 |
181 | average_precisions = {}
182 |
183 | for label in range(generator.num_classes()):
184 | false_positives = np.zeros((0,))
185 | true_positives = np.zeros((0,))
186 | scores = np.zeros((0,))
187 | num_annotations = 0.0
188 |
189 | for i in range(len(generator)):
190 | detections = all_detections[i][label]
191 | annotations = all_annotations[i][label]
192 | num_annotations += annotations.shape[0]
193 | detected_annotations = []
194 |
195 | for d in detections:
196 | scores = np.append(scores, d[4])
197 |
198 | if annotations.shape[0] == 0:
199 | false_positives = np.append(false_positives, 1)
200 | true_positives = np.append(true_positives, 0)
201 | continue
202 |
203 | overlaps = compute_overlap(
204 | np.expand_dims(d, axis=0), annotations)
205 | assigned_annotation = np.argmax(overlaps, axis=1)
206 | max_overlap = overlaps[0, assigned_annotation]
207 |
208 | if max_overlap >= iou_threshold and assigned_annotation not in detected_annotations:
209 | false_positives = np.append(false_positives, 0)
210 | true_positives = np.append(true_positives, 1)
211 | detected_annotations.append(assigned_annotation)
212 | else:
213 | false_positives = np.append(false_positives, 1)
214 | true_positives = np.append(true_positives, 0)
215 |
216 | # no annotations -> AP for this class is 0 (is this correct?)
217 | if num_annotations == 0:
218 | average_precisions[label] = 0, 0
219 | continue
220 |
221 | # sort by score
222 | indices = np.argsort(-scores)
223 | false_positives = false_positives[indices]
224 | true_positives = true_positives[indices]
225 |
226 | # compute false positives and true positives
227 | false_positives = np.cumsum(false_positives)
228 | true_positives = np.cumsum(true_positives)
229 |
230 | # compute recall and precision
231 | recall = true_positives / num_annotations
232 | precision = true_positives / \
233 | np.maximum(true_positives + false_positives,
234 | np.finfo(np.float64).eps)
235 |
236 | # compute average precision
237 | average_precision = _compute_ap(recall, precision)
238 | average_precisions[label] = average_precision, num_annotations
239 |
240 | print('\nmAP:')
241 | for label in range(generator.num_classes()):
242 | label_name = generator.label_to_name(label)
243 | print('{}: {}'.format(label_name, average_precisions[label][0]))
244 |
245 | return average_precisions
246 |
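247 | # Worked example for compute_overlap (illustrative values):
248 | #   a = np.array([[0., 0., 10., 10.]])   # one 10x10 box
249 | #   b = np.array([[5., 5., 15., 15.]])   # one 10x10 box shifted by (5, 5)
250 | #   intersection = 5 * 5 = 25, union = 100 + 100 - 25 = 175
251 | #   compute_overlap(a, b) -> [[25 / 175]] ~= [[0.143]]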
--------------------------------------------------------------------------------
/utils/util.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 |
4 | class MetricTracker:
5 | def __init__(self, *keys, writer=None):
6 | self.writer = writer
7 | self._data = pd.DataFrame(
8 | index=keys, columns=['total', 'counts', 'average'])
9 | self.reset()
10 |
11 | def reset(self):
12 | for col in self._data.columns:
13 | self._data[col].values[:] = 0
14 |
15 | def update(self, key, value, n=1):
16 | if self.writer is not None:
17 | self.writer.add_scalar(key, value)
18 | self._data.total[key] += value * n
19 | self._data.counts[key] += n
20 | self._data.average[key] = self._data.total[key] / \
21 | self._data.counts[key]
22 |
23 | def avg(self, key):
24 | return self._data.average[key]
25 |
26 | def result(self):
27 | return dict(self._data.average)
28 |
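29 | # Usage sketch:
30 | #   tracker = MetricTracker('loss', 'cls_loss')
31 | #   tracker.update('loss', 0.5, n=4)   # accumulates 4 samples with value 0.5
32 | #   tracker.avg('loss')                # -> 0.5
33 | #   tracker.result()                   # -> {'loss': 0.5, 'cls_loss': 0.0}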
--------------------------------------------------------------------------------
/utils/vis_bbox.py:
--------------------------------------------------------------------------------
1 | import cv2
2 | from PIL import Image
3 | import numpy as np
4 | import matplotlib.pyplot as plt
5 |
6 |
7 | def vis_bbox(img, bbox, label=None, score=None,
8 | instance_colors=None, alpha=1., linewidth=2., ax=None):
9 | """Visualize bounding boxes inside the image.
10 | Args:
11 | img (~numpy.ndarray): An array of shape :math:`(3, height, width)`.
12 | This is in RGB format and the range of its value is
13 | :math:`[0, 255]`. If this is :obj:`None`, no image is displayed.
14 | bbox (~numpy.ndarray): An array of shape :math:`(R, 4)`, where
15 | :math:`R` is the number of bounding boxes in the image.
16 | Each element is organized
17 | by :math:`(y_{min}, x_{min}, y_{max}, x_{max})` in the second axis.
18 | label (~numpy.ndarray): An integer array of shape :math:`(R,)`.
19 | The values correspond to id for label names stored in
20 | :obj:`label_names`. This is optional.
21 | score (~numpy.ndarray): A float array of shape :math:`(R,)`.
22 | Each value indicates how confident the prediction is.
23 | This is optional.
24 | Note that the raw :obj:`label` values are drawn directly as captions;
25 | there is no separate :obj:`label_names` argument in this implementation.
26 | instance_colors (iterable of tuples): List of colors.
27 | Each color is RGB format and the range of its values is
28 | :math:`[0, 255]`. The :obj:`i`-th element is the color used
29 | to visualize the :obj:`i`-th instance.
30 | If :obj:`instance_colors` is :obj:`None`, red is used for
31 | all boxes.
32 | alpha (float): The value which determines transparency of the
33 | bounding boxes. The range of this value is :math:`[0, 1]`.
34 | linewidth (float): The thickness of the edges of the bounding boxes.
35 | ax (matplotlib.axes.Axis): The visualization is displayed on this
36 | axis. If this is :obj:`None` (default), a new axis is created.
37 | Returns:
38 | ~matploblib.axes.Axes:
39 | Returns the Axes object with the plot for further tweaking.
40 | from: https://github.com/chainer/chainercv
41 | """
42 |
43 | if label is not None and not len(bbox) == len(label):
44 | raise ValueError('The length of label must be same as that of bbox')
45 | if score is not None and not len(bbox) == len(score):
46 | raise ValueError('The length of score must be same as that of bbox')
47 |
48 | # Returns newly instantiated matplotlib.axes.Axes object if ax is None
49 | if ax is None:
50 | fig = plt.figure()
51 | # ax = fig.add_subplot(1, 1, 1)
52 | h, w, _ = img.shape
53 | w_ = w / 60.0
54 | h_ = w_ * (h / w)
55 | fig.set_size_inches((w_, h_))
56 | ax = plt.axes([0, 0, 1, 1])
57 | ax.imshow(img.astype(np.uint8))
58 | ax.axis('off')
59 | # If there is no bounding box to display, visualize the image and exit.
60 | if len(bbox) == 0:
61 | return fig, ax
62 |
63 | if instance_colors is None:
64 | # Red
65 | instance_colors = np.zeros((len(bbox), 3), dtype=np.float32)
66 | instance_colors[:, 0] = 51
67 | instance_colors[:, 1] = 51
68 | instance_colors[:, 2] = 224
69 | instance_colors = np.array(instance_colors)
70 |
71 | for i, bb in enumerate(bbox):
72 | xy = (bb[0], bb[1])
73 | height = bb[3] - bb[1]
74 | width = bb[2] - bb[0]
75 | color = instance_colors[i % len(instance_colors)] / 255
76 | ax.add_patch(plt.Rectangle(
77 | xy, width, height, fill=False,
78 | edgecolor=color, linewidth=linewidth, alpha=alpha))
79 |
80 | caption = []
81 | if label is not None:
82 | caption.append('{}'.format(label[i]))
83 | if score is not None:
84 | caption.append('{}'.format(score[i]))
85 |
86 | if len(caption) > 0:
87 | face_color = np.array([225, 51, 123])/255
88 | ax.text(bb[0], bb[1],
89 | ': '.join(caption),
90 | fontsize=12,
91 | color='black',
92 | style='italic',
93 | bbox={'facecolor': face_color, 'edgecolor': face_color, 'alpha': 1, 'pad': 0})
94 | return fig, ax
95 |
96 |
97 | if __name__ == '__main__':
98 | img = cv2.imread('./../docs/output.png')
99 | print('img: ', img.shape)
100 | img = np.array(img)
101 | # img = img.convert('RGB')
102 | bbox = np.array([[50, 50, 200, 200]])
103 | label = np.array(['toan'])
104 | score = np.array([100])
105 | # vis_bbox returns (fig, ax) and has no label_names argument
106 | fig, ax = vis_bbox(img=img,
107 | bbox=bbox,
108 | label=label,
109 | score=score,
110 | )
111 | fig.savefig('kaka.png')
112 | fig.show()
113 | plt.show()
114 |
--------------------------------------------------------------------------------
/utils/visualization.py:
--------------------------------------------------------------------------------
1 | import importlib
2 | from datetime import datetime
3 |
4 |
5 | class TensorboardWriter():
6 | def __init__(self, log_dir, enabled):
7 | self.writer = None
8 | self.selected_module = ""
9 |
10 | if enabled:
11 | log_dir = str(log_dir)
12 |
13 | # Retrieve vizualization writer.
14 | succeeded = False
15 | for module in ["torch.utils.tensorboard", "tensorboardX"]:
16 | try:
17 | self.writer = importlib.import_module(
18 | module).SummaryWriter(log_dir)
19 | succeeded = True
20 | break
21 | except ImportError:
22 | succeeded = False
23 | self.selected_module = module
24 |
25 | if not succeeded:
26 | message = "Warning: visualization (Tensorboard) is configured to use, but currently not installed on " \
27 | "this machine. Please install TensorboardX with 'pip install tensorboardx', upgrade PyTorch to " \
28 | "version >= 1.1 to use 'torch.utils.tensorboard' or turn off the option in the 'config.json' file."
29 | print(message)
30 |
31 | self.step = 0
32 | self.mode = ''
33 |
34 | self.tb_writer_ftns = {
35 | 'add_scalar', 'add_scalars', 'add_image', 'add_images', 'add_audio',
36 | 'add_text', 'add_histogram', 'add_pr_curve', 'add_embedding', 'add_graph'
37 | }
38 | self.tag_mode_exceptions = {'add_histogram', 'add_embedding'}
39 | self.timer = datetime.now()
40 |
41 | def set_step(self, step, mode='train'):
42 | self.mode = mode
43 | self.step = step
44 | if step == 0:
45 | self.timer = datetime.now()
46 | else:
47 | duration = datetime.now() - self.timer
48 | self.add_scalar('steps_per_sec', 1 / duration.total_seconds())
49 | self.timer = datetime.now()
50 |
51 | def __getattr__(self, name):
52 | """
53 | If visualization is configured to be used:
54 | return add_data() methods of tensorboard with additional information (step, tag) added.
55 | Otherwise:
56 | return a blank function handle that does nothing
57 | """
58 | if name in self.tb_writer_ftns:
59 | add_data = getattr(self.writer, name, None)
60 |
61 | def wrapper(tag, data, *args, **kwargs):
62 | if add_data is not None:
63 | # add mode(train/valid) tag
64 | if name not in self.tag_mode_exceptions:
65 | tag = '{}/{}'.format(tag, self.mode)
66 | add_data(tag, data, self.step, *args, **kwargs)
67 | return wrapper
68 | else:
69 | # default action for returning methods defined in this class, set_step() for instance.
70 | try:
71 | attr = object.__getattribute__(self, name)
72 | except AttributeError:
73 | raise AttributeError("type object '{}' has no attribute '{}'".format(
74 | self.selected_module, name))
75 | return attr
76 |
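77 | # Usage sketch (the log directory is illustrative):
78 | #   writer = TensorboardWriter('saved/log', enabled=True)
79 | #   writer.set_step(step, mode='train')
80 | #   writer.add_scalar('loss', loss_value)   # logged under tag 'loss/train' at `step`
81 | # With enabled=False, or when no tensorboard backend is installed, the add_*
82 | # calls silently do nothing.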
--------------------------------------------------------------------------------