├── .circleci └── config.yml ├── .gitignore ├── .vscode └── settings.json ├── LICENSE ├── README.md ├── configs └── efficientdet-d0.yaml ├── datasets ├── __init__.py ├── augmentation.py ├── coco.py ├── coco_labels.txt ├── scripts │ ├── COCO2014.sh │ ├── COCO2017.sh │ ├── VOC2007.sh │ └── VOC2012.sh ├── visual_aug.py └── voc0712.py ├── demo.py ├── docs ├── arch.png ├── compare.png ├── demo.png ├── output.png ├── performance.png └── pytoan.gif ├── eval.py ├── models ├── __init__.py ├── bifpn.py ├── efficientdet.py ├── efficientnet.py ├── losses.py ├── module.py ├── retinahead.py └── utils.py ├── requirements.txt ├── test.py ├── train.py └── utils ├── __init__.py ├── config_eff.py ├── helper.py ├── metric.py ├── util.py ├── vis_bbox.py └── visualization.py /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | jobs: 3 | build: 4 | docker: 5 | - image: toandaominh1997/pytoan:latest 6 | steps: 7 | - checkout # check out the code in the project directory 8 | - run: | 9 | pip install flake8 10 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 11 | python test.py 12 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # vscode 107 | .vscode/* 108 | !.vscode/settings.json 109 | !.vscode/tasks.json 110 | !.vscode/launch.json 111 | !.vscode/extensions.json 112 | *.code-workspace 113 | 114 | 115 | saved/ 116 | weights/ 117 | val2017_bbox_results.json -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.pythonPath": "/home/toandm2/devtools/anaconda3/envs/pytoan/bin/python" 3 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Toan Dao Minh(bigkizd) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # EfficientDet: Scalable and Efficient Object Detection, in PyTorch 2 | A [PyTorch](http://pytorch.org/) implementation of [EfficientDet](https://arxiv.org/abs/1911.09070) from the 2019 paper by Mingxing Tan, Ruoming Pang, and Quoc V. Le, 3 | Google Research, Brain Team. The official and original implementation: coming soon.
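If you just want to try it out, a minimal setup sketch (assuming PyTorch is already installed; see the Installation section below) is to clone the repository and install the remaining Python packages:
```Shell
git clone https://github.com/toandaominh1997/EfficientDet.Pytorch.git
cd EfficientDet.Pytorch
pip install -r requirements.txt
```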
4 | 5 | 6 | 7 | 8 | # Fun with Demo: 9 | ```Shell 10 | python demo.py --weight ./checkpoint_VOC_efficientdet-d1_97.pth --threshold 0.6 --iou_threshold 0.5 --cam --score 11 | ``` 12 | 13 |
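The same thing can be driven from Python through the `Detect` helper defined in `demo.py`. A rough sketch of the equivalent calls is below; the checkpoint path is a placeholder, and `Detect` reads `num_class`/`network` back from the arguments stored in the checkpoint:
```python
# Rough sketch only: mirrors what demo.py's __main__ does with the flags above.
# Note that demo.py parses its command-line flags at import time, so this is
# easiest to run from a plain `python` session with no extra arguments.
from demo import Detect

detect = Detect(weights='./checkpoint_VOC_efficientdet-d1_97.pth')  # placeholder checkpoint path
detect.process(file_name='demo.png', show=True)  # single image; the annotated figure is saved to ./docs/demo.png
# detect.camera()                                # webcam loop, the equivalent of passing --cam
```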

14 | 15 |

16 | 17 | 18 | ### Table of Contents 19 | - Recent Update 20 | - Benchmarking 21 | - Installation 22 | 23 | - Prerequisites 24 | - Datasets 25 | - Train 26 | - Evaluate 27 | - Performance 28 | - Demo 29 | - Future Work 30 | - Reference 31 | 32 | 33 | 34 | 35 | 36 | 37 | ## Recent Update 38 | - [06/01/2020] Support both DistributedDataParallel and DataParallel, change augmentation, eval_voc 39 | - [17/12/2019] Add fast normalized fusion, augmentation with ratio, change RetinaHead, fix support for EfficientDet-D0->D7 40 | - [7/12/2019] Support EfficientDet-D0, EfficientDet-D1, EfficientDet-D2, EfficientDet-D3, EfficientDet-D4, ... Support configurable gradient accumulation steps and AdamW. 41 | ## Benchmarking 42 | 43 | We benchmark our code on two datasets, PASCAL VOC and COCO, using the EfficientNet-based family of architectures EfficientDet-D0->D7. Below are the results: 44 | 45 | 1) PASCAL VOC 2007 (Train/Test: 07trainval/07test, scale=600, ROI Align) 46 | 47 | model | mAP | 48 | ---------|--------| 49 | [EfficientDet-D0 (with weights)](https://drive.google.com/file/d/1r7MAyBfG5OK_9F_cU8yActUWxTHOuOpL/view?usp=sharing) | 62.16 50 | 51 | 52 | ## Installation 53 | - Install [PyTorch](http://pytorch.org/) by selecting your environment on the website and running the appropriate command. 54 | - Clone this repository and install the package [prerequisites](#prerequisites) below. 55 | - Then download the dataset by following the [instructions](#datasets) below. 56 | - Note: For training, we currently support [VOC](http://host.robots.ox.ac.uk/pascal/VOC/) and [COCO](http://mscoco.org/), and aim to add [ImageNet](http://www.image-net.org/) support soon. 57 | 58 | ### Prerequisites 59 | 60 | * Python 3.6+ 61 | * PyTorch 1.3+ 62 | * Torchvision 0.4.0+ (**a recent version is required because we rely on Torchvision's built-in NMS**) 63 | * the packages listed in requirements.txt 64 | ## Datasets 65 | To make things easy, we provide bash scripts to handle the dataset downloads and setup for you. We also provide simple dataset loaders that inherit `torch.utils.data.Dataset`, making them fully compatible with the `torchvision.datasets` [API](http://pytorch.org/docs/torchvision/datasets.html). 66 | 67 | ### VOC Dataset 68 | PASCAL VOC: Visual Object Classes 69 | 70 | ##### Download VOC2007 + VOC2012 trainval & test 71 | ```Shell 72 | # specify a directory for the dataset to be downloaded into, else the default is ~/data/ 73 | sh datasets/scripts/VOC2007.sh 74 | sh datasets/scripts/VOC2012.sh 75 | ``` 76 | 77 | ### COCO 78 | Microsoft COCO: Common Objects in Context 79 | 80 | ##### Download COCO 2017 81 | ```Shell 82 | # specify a directory for the dataset to be downloaded into, else the default is ~/data/ 83 | sh datasets/scripts/COCO2017.sh 84 | ``` 85 | 86 | ## Training EfficientDet 87 | 88 | - To train EfficientDet with the train script, simply specify the parameters listed in `train.py` as flags or change them manually.
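Every training option is exposed as a command-line flag, so argparse's built-in help prints the full, authoritative list (the flag names come from whatever `train.py` actually defines):
```Shell
python train.py --help
```
The examples below show the most common combinations.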
89 | 90 | ```Shell 91 | python train.py --network efficientdet-d0 # Example 92 | ``` 93 | 94 | - With VOC Dataset: 95 | ```Shell 96 | # DataParallel 97 | python train.py --dataset VOC --dataset_root /root/data/VOCdevkit/ --network efficientdet-d0 --batch_size 32 98 | # DistributedDataParallel with backend nccl 99 | python train.py --dataset VOC --dataset_root /root/data/VOCdevkit/ --network efficientdet-d0 --batch_size 32 --multiprocessing-distributed 100 | ``` 101 | - With COCO Dataset: 102 | ```Shell 103 | # DataParallel 104 | python train.py --dataset COCO --dataset_root ~/data/coco/ --network efficientdet-d0 --batch_size 32 105 | # DistributedDataParallel with backend nccl 106 | python train.py --dataset COCO --dataset_root ~/data/coco/ --network efficientdet-d0 --batch_size 32 --multiprocessing-distributed 107 | ``` 108 | 109 | ## Evaluation 110 | To evaluate a trained network: 111 | - With VOC Dataset: 112 | ```Shell 113 | python eval.py --dataset VOC --dataset_root ~/data/VOCdevkit --weight ./checkpoint_VOC_efficientdet-d0_261.pth 114 | ``` 115 | - With COCO Dataset: 116 | coming soon. 117 | ## Demo 118 | 119 | ```Shell 120 | python demo.py --threshold 0.5 --iou_threshold 0.5 --score --weight checkpoint_VOC_efficientdet-d1_34.pth --file_name demo.png 121 | ``` 122 | 123 | Output: 124 | 125 |

126 | 127 |

128 | 129 | ## Webcam Demo 130 | 131 | You can use a webcam in a real-time demo by running: 132 | ```Shell 133 | python demo.py --threshold 0.5 --iou_threshold 0.5 --cam --score --weight checkpoint_VOC_efficientdet-d1_34.pth 134 | ``` 135 | 136 | ## Performance 137 | 138 | 139 | 140 | 141 | ## TODO 142 | We have accumulated the following to-do list, which we hope to complete in the near future 143 | - Still to come: 144 | * [x] EfficientDet-[D0-7] 145 | * [x] GPU-Parallel 146 | * [x] NMS 147 | * [ ] Soft-NMS 148 | * [x] Pretrained model 149 | * [x] Demo 150 | * [ ] Model zoo 151 | * [ ] TorchScript 152 | * [ ] Mobile 153 | * [ ] C++ Onnx 154 | 155 | 156 | ## Authors 157 | 158 | * [**Toan Dao Minh**](https://github.com/toandaominh1997) 159 | 160 | ***Note:*** Unfortunately, this is just a hobby of ours and not a full-time job, so we'll do our best to keep things up to date, but no guarantees. That being said, thanks to everyone for your continued help and feedback as it is really appreciated. We will try to address everything as soon as possible. 161 | 162 | ## References 163 | - tanmingxing, rpang, qvl, et al. "EfficientDet: Scalable and Efficient Object Detection." [EfficientDet](https://arxiv.org/abs/1911.09070). 164 | - A list of other great EfficientDet ports that were sources of inspiration: 165 | * [EfficientNet](https://github.com/lukemelas/EfficientNet-PyTorch) 166 | * [SSD.Pytorch](https://github.com/amdegroot/ssd.pytorch) 167 | * [mmdetection](https://github.com/open-mmlab/mmdetection) 168 | * [RetinaNet.Pytorch](https://github.com/yhenon/pytorch-retinanet) 169 | * [NMS.Torchvision](https://pytorch.org/docs/stable/torchvision/ops.html) 170 | 171 | 172 | ## Citation 173 | 174 | @article{efficientdetpytoan, 175 | Author = {Toan Dao Minh}, 176 | Title = {A Pytorch Implementation of EfficientDet Object Detection}, 177 | Journal = {github.com/toandaominh1997/EfficientDet.Pytorch}, 178 | Year = {2019} 179 | } 180 | -------------------------------------------------------------------------------- /configs/efficientdet-d0.yaml: -------------------------------------------------------------------------------- 1 | SEED: 42 2 | DEVICE: [0, 1] 3 | # DATASET 4 | DATA_TRAIN: VOC 5 | 6 | GRADIENT_ACCUMULATION_STEPS: 1 7 | GRADIENT_CLIPPING: 1 8 | NUM_EPOCH: 500 9 | EARLY_STOPPING: 50 10 | VALIDATION_FREQUENCY: 2 11 | TENSORBOARD: True 12 | CHECKPOINT_DIR: ./saved 13 | RESUME_PATH: 14 | 15 | TRAIN_DATASET: 16 | PY: datasets 17 | CLASS: spoofDataset 18 | ARGS: 19 | root_dir: ./ 20 | phase: train 21 | 22 | VALID_DATASET: 23 | PY: datasets 24 | CLASS: spoofDataset 25 | ARGS: 26 | root_dir: ./ 27 | phase: valid 28 | 29 | TEST_DATASET: 30 | PY: datasets.dataset 31 | CLASS: spoofDataset 32 | ARGS: 33 | root_dir: ./data 34 | phase: valid 35 | 36 | TRAIN_DATALOADER: 37 | PY: torch.utils.data 38 | CLASS: DataLoader 39 | ARGS: 40 | batch_size: 8 41 | shuffle: True 42 | num_workers: 8 43 | pin_memory: True 44 | 45 | VALID_DATALOADER: 46 | PY: torch.utils.data 47 | CLASS: DataLoader 48 | ARGS: 49 | batch_size: 8 50 | shuffle: False 51 | num_workers: 8 52 | pin_memory: True 53 | 54 | TEST_DATALOADER: 55 | PY: torch.utils.data 56 | CLASS: DataLoader 57 | ARGS: 58 | batch_size: 8 59 | shuffle: False 60 | num_workers: 8 61 | 62 | MODEL: 63 | PY: models 64 | CLASS: EfficientDet 65 | ARGS: 66 | num_class: 21 67 | levels: 3 68 | num_channels: 128 69 | model_name: efficientnet-b0 70 | 71 | CRITERION: 72 | PY: layers.modules 73 | CLASS: MultiBoxLoss 74 | ARGS: 75 | num_classes: 21 76 | overlap_thresh: 0.5 77 | 
prior_for_matching: True 78 | bkg_label: 0 79 | neg_mining: True 80 | neg_pos: 3 81 | neg_overlap: 0.5 82 | encode_target: False 83 | use_gpu: False 84 | 85 | OPTIMIZER: 86 | PY: torch.optim 87 | CLASS: AdamW 88 | ARGS: 89 | lr: 0.0001 90 | weight_decay: 0.000005 91 | 92 | SCHEDULER: 93 | PY: torch.optim.lr_scheduler 94 | CLASS: ReduceLROnPlateau 95 | ARGS: 96 | factor: 0.15 97 | patience: 2 98 | -------------------------------------------------------------------------------- /datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .voc0712 import VOCDetection, VOC_CLASSES 2 | from .augmentation import get_augumentation, detection_collate, Resizer, Normalizer, Augmenter, collater 3 | from .coco import CocoDataset -------------------------------------------------------------------------------- /datasets/augmentation.py: -------------------------------------------------------------------------------- 1 | import albumentations as albu 2 | from albumentations.pytorch.transforms import ToTensor 3 | import torch 4 | import numpy as np 5 | import cv2 6 | 7 | 8 | def get_augumentation(phase, width=512, height=512, min_area=0., min_visibility=0.): 9 | list_transforms = [] 10 | if phase == 'train': 11 | list_transforms.extend([ 12 | albu.augmentations.transforms.LongestMaxSize( 13 | max_size=width, always_apply=True), 14 | albu.PadIfNeeded(min_height=height, min_width=width, 15 | always_apply=True, border_mode=0, value=[0, 0, 0]), 16 | albu.augmentations.transforms.RandomResizedCrop( 17 | height=height, 18 | width=width, p=0.3), 19 | albu.augmentations.transforms.Flip(), 20 | albu.augmentations.transforms.Transpose(), 21 | albu.OneOf([ 22 | albu.RandomBrightnessContrast(brightness_limit=0.5, 23 | contrast_limit=0.4), 24 | albu.RandomGamma(gamma_limit=(50, 150)), 25 | albu.NoOp() 26 | ]), 27 | albu.OneOf([ 28 | albu.RGBShift(r_shift_limit=20, b_shift_limit=15, 29 | g_shift_limit=15), 30 | albu.HueSaturationValue(hue_shift_limit=5, 31 | sat_shift_limit=5), 32 | albu.NoOp() 33 | ]), 34 | albu.CLAHE(p=0.8), 35 | albu.HorizontalFlip(p=0.5), 36 | albu.VerticalFlip(p=0.5), 37 | ]) 38 | if(phase == 'test' or phase == 'valid'): 39 | list_transforms.extend([ 40 | albu.Resize(height=height, width=width) 41 | ]) 42 | list_transforms.extend([ 43 | albu.Normalize(mean=(0.485, 0.456, 0.406), 44 | std=(0.229, 0.224, 0.225), p=1), 45 | ToTensor() 46 | ]) 47 | if(phase == 'test'): 48 | return albu.Compose(list_transforms) 49 | return albu.Compose(list_transforms, bbox_params=albu.BboxParams(format='pascal_voc', min_area=min_area, 50 | min_visibility=min_visibility, label_fields=['category_id'])) 51 | 52 | 53 | def detection_collate(batch): 54 | imgs = [s['image'] for s in batch] 55 | annots = [s['bboxes'] for s in batch] 56 | labels = [s['category_id'] for s in batch] 57 | 58 | max_num_annots = max(len(annot) for annot in annots) 59 | annot_padded = np.ones((len(annots), max_num_annots, 5))*-1 60 | 61 | if max_num_annots > 0: 62 | for idx, (annot, lab) in enumerate(zip(annots, labels)): 63 | if len(annot) > 0: 64 | annot_padded[idx, :len(annot), :4] = annot 65 | annot_padded[idx, :len(annot), 4] = lab 66 | return (torch.stack(imgs, 0), torch.FloatTensor(annot_padded)) 67 | 68 | 69 | def collater(data): 70 | imgs = [s['img'] for s in data] 71 | annots = [s['annot'] for s in data] 72 | scales = [s['scale'] for s in data] 73 | 74 | imgs = torch.from_numpy(np.stack(imgs, axis=0)) 75 | 76 | max_num_annots = max(annot.shape[0] for annot in annots) 77 | 78 | if 
max_num_annots > 0: 79 | 80 | annot_padded = torch.ones((len(annots), max_num_annots, 5)) * -1 81 | 82 | if max_num_annots > 0: 83 | for idx, annot in enumerate(annots): 84 | if annot.shape[0] > 0: 85 | annot_padded[idx, :annot.shape[0], :] = annot 86 | else: 87 | annot_padded = torch.ones((len(annots), 1, 5)) * -1 88 | 89 | imgs = imgs.permute(0, 3, 1, 2) 90 | 91 | return (imgs, torch.FloatTensor(annot_padded)) 92 | 93 | 94 | class Resizer(object): 95 | """Convert ndarrays in sample to Tensors.""" 96 | 97 | def __call__(self, sample, common_size=512): 98 | image, annots = sample['img'], sample['annot'] 99 | height, width, _ = image.shape 100 | if height > width: 101 | scale = common_size / height 102 | resized_height = common_size 103 | resized_width = int(width * scale) 104 | else: 105 | scale = common_size / width 106 | resized_height = int(height * scale) 107 | resized_width = common_size 108 | 109 | image = cv2.resize(image, (resized_width, resized_height)) 110 | 111 | new_image = np.zeros((common_size, common_size, 3)) 112 | new_image[0:resized_height, 0:resized_width] = image 113 | annots[:, :4] *= scale 114 | 115 | return {'img': torch.from_numpy(new_image), 'annot': torch.from_numpy(annots), 'scale': scale} 116 | 117 | 118 | class Augmenter(object): 119 | """Convert ndarrays in sample to Tensors.""" 120 | 121 | def __call__(self, sample, flip_x=0.5): 122 | if np.random.rand() < flip_x: 123 | image, annots = sample['img'], sample['annot'] 124 | image = image[:, ::-1, :] 125 | 126 | rows, cols, channels = image.shape 127 | 128 | x1 = annots[:, 0].copy() 129 | x2 = annots[:, 2].copy() 130 | 131 | x_tmp = x1.copy() 132 | 133 | annots[:, 0] = cols - x2 134 | annots[:, 2] = cols - x_tmp 135 | 136 | sample = {'img': image, 'annot': annots} 137 | 138 | return sample 139 | 140 | 141 | class Normalizer(object): 142 | 143 | def __init__(self): 144 | self.mean = np.array([[[0.485, 0.456, 0.406]]]) 145 | self.std = np.array([[[0.229, 0.224, 0.225]]]) 146 | 147 | def __call__(self, sample): 148 | image, annots = sample['img'], sample['annot'] 149 | 150 | return {'img': ((image.astype(np.float32) - self.mean) / self.std), 'annot': annots} 151 | -------------------------------------------------------------------------------- /datasets/coco.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import sys 3 | import os 4 | import torch 5 | import numpy as np 6 | import random 7 | import csv 8 | 9 | from torch.utils.data import Dataset, DataLoader 10 | from torchvision import transforms, utils 11 | from torch.utils.data.sampler import Sampler 12 | 13 | from pycocotools.coco import COCO 14 | 15 | import skimage.io 16 | import skimage.transform 17 | import skimage.color 18 | import skimage 19 | import cv2 20 | from PIL import Image 21 | 22 | 23 | class CocoDataset(Dataset): 24 | """Coco dataset.""" 25 | 26 | def __init__(self, root_dir, set_name='train2017', transform=None): 27 | """ 28 | Args: 29 | root_dir (string): COCO directory. 30 | transform (callable, optional): Optional transform to be applied 31 | on a sample. 
32 | """ 33 | self.root_dir = root_dir 34 | self.set_name = set_name 35 | self.transform = transform 36 | 37 | self.coco = COCO(os.path.join(self.root_dir, 'annotations', 38 | 'instances_' + self.set_name + '.json')) 39 | self.image_ids = self.coco.getImgIds() 40 | 41 | self.load_classes() 42 | 43 | def load_classes(self): 44 | # load class names (name -> label) 45 | categories = self.coco.loadCats(self.coco.getCatIds()) 46 | categories.sort(key=lambda x: x['id']) 47 | 48 | self.classes = {} 49 | self.coco_labels = {} 50 | self.coco_labels_inverse = {} 51 | for c in categories: 52 | self.coco_labels[len(self.classes)] = c['id'] 53 | self.coco_labels_inverse[c['id']] = len(self.classes) 54 | self.classes[c['name']] = len(self.classes) 55 | 56 | # also load the reverse (label -> name) 57 | self.labels = {} 58 | for key, value in self.classes.items(): 59 | self.labels[value] = key 60 | 61 | def __len__(self): 62 | return len(self.image_ids) 63 | 64 | def __getitem__(self, idx): 65 | 66 | img = self.load_image(idx) 67 | annot = self.load_annotations(idx) 68 | sample = {'img': img, 'annot': annot} 69 | if self.transform: 70 | sample = self.transform(sample) 71 | return sample 72 | 73 | def load_image(self, image_index): 74 | image_info = self.coco.loadImgs(self.image_ids[image_index])[0] 75 | path = os.path.join(self.root_dir, 'images', 76 | self.set_name, image_info['file_name']) 77 | img = cv2.imread(path) 78 | 79 | if len(img.shape) == 2: 80 | img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB) 81 | return img 82 | 83 | def load_annotations(self, image_index): 84 | # get ground truth annotations 85 | annotations_ids = self.coco.getAnnIds( 86 | imgIds=self.image_ids[image_index], iscrowd=False) 87 | annotations = np.zeros((0, 5)) 88 | 89 | # some images appear to miss annotations (like image with id 257034) 90 | if len(annotations_ids) == 0: 91 | return annotations 92 | 93 | # parse annotations 94 | coco_annotations = self.coco.loadAnns(annotations_ids) 95 | for idx, a in enumerate(coco_annotations): 96 | 97 | # some annotations have basically no width / height, skip them 98 | if a['bbox'][2] < 1 or a['bbox'][3] < 1: 99 | continue 100 | 101 | annotation = np.zeros((1, 5)) 102 | annotation[0, :4] = a['bbox'] 103 | annotation[0, 4] = self.coco_label_to_label(a['category_id']) 104 | annotations = np.append(annotations, annotation, axis=0) 105 | 106 | # transform from [x, y, w, h] to [x1, y1, x2, y2] 107 | annotations[:, 2] = annotations[:, 0] + annotations[:, 2] 108 | annotations[:, 3] = annotations[:, 1] + annotations[:, 3] 109 | 110 | return annotations 111 | 112 | def coco_label_to_label(self, coco_label): 113 | return self.coco_labels_inverse[coco_label] 114 | 115 | def label_to_coco_label(self, label): 116 | return self.coco_labels[label] 117 | 118 | def image_aspect_ratio(self, image_index): 119 | image = self.coco.loadImgs(self.image_ids[image_index])[0] 120 | return float(image['width']) / float(image['height']) 121 | 122 | def num_classes(self): 123 | return 80 124 | 125 | 126 | if __name__ == '__main__': 127 | from augmentation import get_augumentation 128 | dataset = CocoDataset(root_dir='/root/data/coco', set_name='trainval35k', 129 | transform=get_augumentation(phase='train')) 130 | sample = dataset[0] 131 | print('sample: ', sample) 132 | -------------------------------------------------------------------------------- /datasets/coco_labels.txt: -------------------------------------------------------------------------------- 1 | 1,1,person 2 | 2,2,bicycle 3 | 3,3,car 4 | 4,4,motorcycle 5 | 
5,5,airplane 6 | 6,6,bus 7 | 7,7,train 8 | 8,8,truck 9 | 9,9,boat 10 | 10,10,traffic light 11 | 11,11,fire hydrant 12 | 13,12,stop sign 13 | 14,13,parking meter 14 | 15,14,bench 15 | 16,15,bird 16 | 17,16,cat 17 | 18,17,dog 18 | 19,18,horse 19 | 20,19,sheep 20 | 21,20,cow 21 | 22,21,elephant 22 | 23,22,bear 23 | 24,23,zebra 24 | 25,24,giraffe 25 | 27,25,backpack 26 | 28,26,umbrella 27 | 31,27,handbag 28 | 32,28,tie 29 | 33,29,suitcase 30 | 34,30,frisbee 31 | 35,31,skis 32 | 36,32,snowboard 33 | 37,33,sports ball 34 | 38,34,kite 35 | 39,35,baseball bat 36 | 40,36,baseball glove 37 | 41,37,skateboard 38 | 42,38,surfboard 39 | 43,39,tennis racket 40 | 44,40,bottle 41 | 46,41,wine glass 42 | 47,42,cup 43 | 48,43,fork 44 | 49,44,knife 45 | 50,45,spoon 46 | 51,46,bowl 47 | 52,47,banana 48 | 53,48,apple 49 | 54,49,sandwich 50 | 55,50,orange 51 | 56,51,broccoli 52 | 57,52,carrot 53 | 58,53,hot dog 54 | 59,54,pizza 55 | 60,55,donut 56 | 61,56,cake 57 | 62,57,chair 58 | 63,58,couch 59 | 64,59,potted plant 60 | 65,60,bed 61 | 67,61,dining table 62 | 70,62,toilet 63 | 72,63,tv 64 | 73,64,laptop 65 | 74,65,mouse 66 | 75,66,remote 67 | 76,67,keyboard 68 | 77,68,cell phone 69 | 78,69,microwave 70 | 79,70,oven 71 | 80,71,toaster 72 | 81,72,sink 73 | 82,73,refrigerator 74 | 84,74,book 75 | 85,75,clock 76 | 86,76,vase 77 | 87,77,scissors 78 | 88,78,teddy bear 79 | 89,79,hair drier 80 | 90,80,toothbrush -------------------------------------------------------------------------------- /datasets/scripts/COCO2014.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | start=`date +%s` 4 | 5 | # handle optional download dir 6 | if [ -z "$1" ] 7 | then 8 | # navigate to ~/data 9 | echo "navigating to ~/data/ ..." 10 | mkdir -p ~/data 11 | cd ~/data/ 12 | mkdir -p ./coco 13 | cd ./coco 14 | mkdir -p ./images 15 | mkdir -p ./annotations 16 | else 17 | # check if specified dir is valid 18 | if [ ! -d $1 ]; then 19 | echo $1 " is not a valid directory" 20 | exit 0 21 | fi 22 | echo "navigating to " $1 " ..." 23 | cd $1 24 | fi 25 | 26 | if [ ! -d images ] 27 | then 28 | mkdir -p ./images 29 | fi 30 | 31 | # Download the image data. 32 | cd ./images 33 | echo "Downloading MSCOCO train images ..." 34 | curl -LO http://images.cocodataset.org/zips/train2014.zip 35 | echo "Downloading MSCOCO val images ..." 36 | curl -LO http://images.cocodataset.org/zips/val2014.zip 37 | 38 | cd ../ 39 | if [ ! -d annotations] 40 | then 41 | mkdir -p ./annotations 42 | fi 43 | 44 | # Download the annotation data. 45 | cd ./annotations 46 | echo "Downloading MSCOCO train/val annotations ..." 47 | curl -LO http://images.cocodataset.org/annotations/annotations_trainval2014.zip 48 | echo "Finished downloading. Now extracting ..." 49 | 50 | # Unzip data 51 | echo "Extracting train images ..." 52 | unzip ../images/train2014.zip -d ../images 53 | echo "Extracting val images ..." 54 | unzip ../images/val2014.zip -d ../images 55 | echo "Extracting annotations ..." 56 | unzip ./annotations_trainval2014.zip 57 | 58 | echo "Removing zip files ..." 59 | rm ../images/train2014.zip 60 | rm ../images/val2014.zip 61 | rm ./annotations_trainval2014.zip 62 | 63 | echo "Creating trainval35k dataset..." 
64 | 65 | # Download annotations json 66 | echo "Downloading trainval35k annotations from S3" 67 | curl -LO https://s3.amazonaws.com/amdegroot-datasets/instances_trainval35k.json.zip 68 | 69 | # combine train and val 70 | echo "Combining train and val images" 71 | mkdir ../images/trainval35k 72 | cd ../images/train2014 73 | find -maxdepth 1 -name '*.jpg' -exec cp -t ../trainval35k {} + # dir too large for cp 74 | cd ../val2014 75 | find -maxdepth 1 -name '*.jpg' -exec cp -t ../trainval35k {} + 76 | 77 | 78 | end=`date +%s` 79 | runtime=$((end-start)) 80 | 81 | echo "Completed in " $runtime " seconds" 82 | -------------------------------------------------------------------------------- /datasets/scripts/COCO2017.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | start=`date +%s` 4 | 5 | # handle optional download dir 6 | if [ -z "$1" ] 7 | then 8 | # navigate to ~/data 9 | echo "navigating to ~/data/ ..." 10 | mkdir -p ~/data 11 | cd ~/data/ 12 | mkdir -p ./coco 13 | cd ./coco 14 | mkdir -p ./images 15 | mkdir -p ./annotations 16 | else 17 | # check if specified dir is valid 18 | if [ ! -d $1 ]; then 19 | echo $1 " is not a valid directory" 20 | exit 0 21 | fi 22 | echo "navigating to " $1 " ..." 23 | cd $1 24 | fi 25 | 26 | if [ ! -d images ] 27 | then 28 | mkdir -p ./images 29 | fi 30 | 31 | # Download the image data. 32 | cd ./images 33 | echo "Downloading MSCOCO train images ..." 34 | curl -LO http://images.cocodataset.org/zips/train2017.zip 35 | echo "Downloading MSCOCO val images ..." 36 | curl -LO http://images.cocodataset.org/zips/val2017.zip 37 | 38 | cd ../ 39 | if [ ! -d annotations] 40 | then 41 | mkdir -p ./annotations 42 | fi 43 | 44 | # Download the annotation data. 45 | cd ./annotations 46 | echo "Downloading MSCOCO train/val annotations ..." 47 | curl -LO http://images.cocodataset.org/annotations/annotations_trainval2017.zip 48 | echo "Finished downloading. Now extracting ..." 49 | 50 | # Unzip data 51 | echo "Extracting train images ..." 52 | unzip ../images/train2017.zip -d ../images 53 | echo "Extracting val images ..." 54 | unzip ../images/val2017.zip -d ../images 55 | echo "Extracting annotations ..." 56 | unzip ./annotations_trainval2017.zip 57 | 58 | echo "Removing zip files ..." 59 | rm ../images/train2017.zip 60 | rm ../images/val2017.zip 61 | rm ./annotations_trainval2017.zip 62 | 63 | echo "Completed in " $runtime " seconds" 64 | -------------------------------------------------------------------------------- /datasets/scripts/VOC2007.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Ellis Brown 3 | 4 | start=`date +%s` 5 | 6 | # handle optional download dir 7 | if [ -z "$1" ] 8 | then 9 | # navigate to ~/data 10 | echo "navigating to ~/data/ ..." 11 | mkdir -p ~/data 12 | cd ~/data/ 13 | else 14 | # check if is valid directory 15 | if [ ! -d $1 ]; then 16 | echo $1 "is not a valid directory" 17 | exit 0 18 | fi 19 | echo "navigating to" $1 "..." 20 | cd $1 21 | fi 22 | 23 | echo "Downloading VOC2007 trainval ..." 24 | # Download the data. 25 | curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar 26 | echo "Downloading VOC2007 test data ..." 27 | curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar 28 | echo "Done downloading." 29 | 30 | # Extract data 31 | echo "Extracting trainval ..." 32 | tar -xvf VOCtrainval_06-Nov-2007.tar 33 | echo "Extracting test ..." 
34 | tar -xvf VOCtest_06-Nov-2007.tar 35 | echo "removing tars ..." 36 | rm VOCtrainval_06-Nov-2007.tar 37 | rm VOCtest_06-Nov-2007.tar 38 | 39 | end=`date +%s` 40 | runtime=$((end-start)) 41 | 42 | echo "Completed in" $runtime "seconds" -------------------------------------------------------------------------------- /datasets/scripts/VOC2012.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Ellis Brown 3 | 4 | start=`date +%s` 5 | 6 | # handle optional download dir 7 | if [ -z "$1" ] 8 | then 9 | # navigate to ~/data 10 | echo "navigating to ~/data/ ..." 11 | mkdir -p ~/data 12 | cd ~/data/ 13 | else 14 | # check if is valid directory 15 | if [ ! -d $1 ]; then 16 | echo $1 "is not a valid directory" 17 | exit 0 18 | fi 19 | echo "navigating to" $1 "..." 20 | cd $1 21 | fi 22 | 23 | echo "Downloading VOC2012 trainval ..." 24 | # Download the data. 25 | curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar 26 | echo "Done downloading." 27 | 28 | 29 | # Extract data 30 | echo "Extracting trainval ..." 31 | tar -xvf VOCtrainval_11-May-2012.tar 32 | echo "removing tar ..." 33 | rm VOCtrainval_11-May-2012.tar 34 | 35 | end=`date +%s` 36 | runtime=$((end-start)) 37 | 38 | echo "Completed in" $runtime "seconds" -------------------------------------------------------------------------------- /datasets/visual_aug.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | from augmentation import get_augumentation 3 | from voc0712 import VOCDetection 4 | import matplotlib.pyplot as plt 5 | EFFICIENTDET = { 6 | 'efficientdet-d0': {'input_size': 512, 7 | 'backbone': 'B0', 8 | 'W_bifpn': 64, 9 | 'D_bifpn': 2, 10 | 'D_class': 3}, 11 | 'efficientdet-d1': {'input_size': 640, 12 | 'backbone': 'B1', 13 | 'W_bifpn': 88, 14 | 'D_bifpn': 3, 15 | 'D_class': 3}, 16 | 'efficientdet-d2': {'input_size': 768, 17 | 'backbone': 'B2', 18 | 'W_bifpn': 112, 19 | 'D_bifpn': 4, 20 | 'D_class': 3}, 21 | } 22 | 23 | 24 | # Functions to visualize bounding boxes and class labels on an image. 
25 | # Based on https://github.com/facebookresearch/Detectron/blob/master/detectron/utils/vis.py 26 | 27 | BOX_COLOR = (255, 0, 0) 28 | TEXT_COLOR = (255, 255, 255) 29 | 30 | 31 | def visualize_bbox(img, bbox, class_id, class_idx_to_name, color=BOX_COLOR, thickness=2): 32 | x_min, y_min, x_max, y_max = bbox 33 | x_min, x_max, y_min, y_max = int(x_min), int(x_max), int(y_min), int(y_max) 34 | cv2.rectangle(img, (x_min, y_min), (x_max, y_max), 35 | color=color, thickness=thickness) 36 | # class_name = class_idx_to_name[class_id] 37 | # ((text_width, text_height), _) = cv2.getTextSize(class_name, cv2.FONT_HERSHEY_SIMPLEX, 0.35, 1) 38 | # cv2.rectangle(img, (x_min, y_min - int(1.3 * text_height)), (x_min + text_width, y_min), BOX_COLOR, -1) 39 | # cv2.putText(img, class_name, (x_min, y_min - int(0.3 * text_height)), cv2.FONT_HERSHEY_SIMPLEX, 0.35,TEXT_COLOR, lineType=cv2.LINE_AA) 40 | return img 41 | 42 | 43 | def visualize(annotations, category_id_to_name): 44 | img = annotations['image'].copy() 45 | for idx, bbox in enumerate(annotations['bboxes']): 46 | img = visualize_bbox( 47 | img, bbox, annotations['category_id'][idx], category_id_to_name) 48 | # plt.figure(figsize=(12, 12)) 49 | # plt.imshow(img) 50 | return img 51 | 52 | 53 | dataset_root = '/root/data/VOCdevkit' 54 | network = 'efficientdet-d0' 55 | dataset = VOCDetection(root=dataset_root, 56 | transform=get_augumentation(phase='train', width=EFFICIENTDET[network]['input_size'], height=EFFICIENTDET[network]['input_size'])) 57 | 58 | 59 | def visual_data(data, name): 60 | img = data['image'] 61 | bboxes = data['bboxes'] 62 | annotations = {'image': data['image'], 'bboxes': data['bboxes'], 'category_id': range( 63 | len(data['bboxes']))} 64 | category_id_to_name = {v: v for v in range(len(data['bboxes']))} 65 | 66 | img = visualize(annotations, category_id_to_name) 67 | cv2.imwrite(name, img) 68 | 69 | 70 | for i in range(20, 25): 71 | visual_data(dataset[i], "name"+str(i)+".png") 72 | -------------------------------------------------------------------------------- /datasets/voc0712.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | import sys 3 | import torch 4 | import torch.utils.data as data 5 | import cv2 6 | import numpy as np 7 | if sys.version_info[0] == 2: 8 | import xml.etree.cElementTree as ET 9 | else: 10 | import xml.etree.ElementTree as ET 11 | 12 | VOC_CLASSES = ( # always index 0 13 | 'aeroplane', 'bicycle', 'bird', 'boat', 14 | 'bottle', 'bus', 'car', 'cat', 'chair', 15 | 'cow', 'diningtable', 'dog', 'horse', 16 | 'motorbike', 'person', 'pottedplant', 17 | 'sheep', 'sofa', 'train', 'tvmonitor') 18 | 19 | # note: if you used our download scripts, this should be right 20 | VOC_ROOT = osp.join('/home/toandm2', "data/VOCdevkit/") 21 | 22 | 23 | class VOCAnnotationTransform(object): 24 | """Transforms a VOC annotation into a Tensor of bbox coords and label index 25 | Initilized with a dictionary lookup of classnames to indexes 26 | Arguments: 27 | class_to_ind (dict, optional): dictionary lookup of classnames -> indexes 28 | (default: alphabetic indexing of VOC's 20 classes) 29 | keep_difficult (bool, optional): keep difficult instances or not 30 | (default: False) 31 | height (int): height 32 | width (int): width 33 | """ 34 | 35 | def __init__(self, class_to_ind=None, keep_difficult=False): 36 | self.class_to_ind = class_to_ind or dict( 37 | zip(VOC_CLASSES, range(len(VOC_CLASSES)))) 38 | self.keep_difficult = keep_difficult 39 | 40 | def __call__(self, 
target, width, height): 41 | """ 42 | Arguments: 43 | target (annotation) : the target annotation to be made usable 44 | will be an ET.Element 45 | Returns: 46 | a list containing lists of bounding boxes [bbox coords, class name] 47 | """ 48 | res = [] 49 | for obj in target.iter('object'): 50 | difficult = int(obj.find('difficult').text) == 1 51 | if not self.keep_difficult and difficult: 52 | continue 53 | name = obj.find('name').text.lower().strip() 54 | bbox = obj.find('bndbox') 55 | 56 | pts = ['xmin', 'ymin', 'xmax', 'ymax'] 57 | bndbox = [] 58 | for i, pt in enumerate(pts): 59 | cur_pt = float(bbox.find(pt).text) - 1 60 | # scale height or width 61 | # cur_pt = cur_pt / width if i % 2 == 0 else cur_pt / height 62 | bndbox.append(cur_pt) 63 | label_idx = self.class_to_ind[name] 64 | bndbox.append(label_idx) 65 | res += [bndbox] # [xmin, ymin, xmax, ymax, label_ind] 66 | # img_id = target.find('filename').text[:-4] 67 | 68 | return res # [[xmin, ymin, xmax, ymax, label_ind], ... ] 69 | 70 | 71 | class VOCDetection(data.Dataset): 72 | """VOC Detection Dataset Object 73 | input is image, target is annotation 74 | Arguments: 75 | root (string): filepath to VOCdevkit folder. 76 | image_set (string): imageset to use (eg. 'train', 'val', 'test') 77 | transform (callable, optional): transformation to perform on the 78 | input image 79 | target_transform (callable, optional): transformation to perform on the 80 | target `annotation` 81 | (eg: take in caption string, return tensor of word indices) 82 | dataset_name (string, optional): which dataset to load 83 | (default: 'VOC2007') 84 | """ 85 | 86 | def __init__(self, root, 87 | image_sets=[('2007', 'trainval'), ('2012', 'trainval')], 88 | transform=None, target_transform=VOCAnnotationTransform(), 89 | dataset_name='VOC0712'): 90 | self.root = root 91 | self.image_set = image_sets 92 | self.transform = transform 93 | self.target_transform = target_transform 94 | self.name = dataset_name 95 | self._annopath = osp.join('%s', 'Annotations', '%s.xml') 96 | self._imgpath = osp.join('%s', 'JPEGImages', '%s.jpg') 97 | self.ids = list() 98 | for (year, name) in image_sets: 99 | rootpath = osp.join(self.root, 'VOC' + year) 100 | for line in open(osp.join(rootpath, 'ImageSets', 'Main', name + '.txt')): 101 | self.ids.append((rootpath, line.strip())) 102 | 103 | def __getitem__(self, index): 104 | img_id = self.ids[index] 105 | 106 | target = ET.parse(self._annopath % img_id).getroot() 107 | img = cv2.imread(self._imgpath % img_id) 108 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 109 | img = img.astype(np.float32)/255. 
110 | height, width, channels = img.shape 111 | 112 | if self.target_transform is not None: 113 | target = self.target_transform(target, width, height) 114 | target = np.array(target) 115 | sample = {'img': img, 'annot': target} 116 | if self.transform is not None: 117 | sample = self.transform(sample) 118 | return sample 119 | 120 | bbox = target[:, :4] 121 | labels = target[:, 4] 122 | 123 | if self.transform is not None: 124 | annotation = {'image': img, 'bboxes': bbox, 'category_id': labels} 125 | augmentation = self.transform(**annotation) 126 | img = augmentation['image'] 127 | bbox = augmentation['bboxes'] 128 | labels = augmentation['category_id'] 129 | return {'image': img, 'bboxes': bbox, 'category_id': labels} 130 | 131 | def __len__(self): 132 | return len(self.ids) 133 | 134 | def num_classes(self): 135 | return len(VOC_CLASSES) 136 | 137 | def label_to_name(self, label): 138 | return VOC_CLASSES[label] 139 | 140 | def load_annotations(self, index): 141 | img_id = self.ids[index] 142 | anno = ET.parse(self._annopath % img_id).getroot() 143 | gt = self.target_transform(anno, 1, 1) 144 | gt = np.array(gt) 145 | return gt 146 | -------------------------------------------------------------------------------- /demo.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import cv2 3 | from PIL import Image 4 | import matplotlib.pyplot as plt 5 | from models import EfficientDet 6 | from torchvision import transforms 7 | import numpy as np 8 | import skimage 9 | from datasets import get_augumentation, VOC_CLASSES 10 | from timeit import default_timer as timer 11 | import argparse 12 | import copy 13 | from utils import vis_bbox, EFFICIENTDET 14 | 15 | parser = argparse.ArgumentParser(description='EfficientDet') 16 | 17 | parser.add_argument('-n', '--network', default='efficientdet-d0', 18 | help='efficientdet-[d0, d1, ..]') 19 | parser.add_argument('-s', '--score', default=True, 20 | action="store_true", help='Show score') 21 | parser.add_argument('-t', '--threshold', default=0.6, 22 | type=float, help='Visualization threshold') 23 | parser.add_argument('-it', '--iou_threshold', default=0.6, 24 | type=float, help='Visualization threshold') 25 | parser.add_argument('-w', '--weight', default='./weights/voc0712.pth', 26 | type=str, help='Weight model path') 27 | parser.add_argument('-c', '--cam', 28 | action="store_true", help='Use camera') 29 | parser.add_argument('-f', '--file_name', default='pic.jpg', 30 | help='Image path') 31 | parser.add_argument('--num_class', default=21, type=int, 32 | help='Number of class used in model') 33 | args = parser.parse_args() 34 | 35 | 36 | class Detect(object): 37 | """ 38 | dir_name: Folder or image_file 39 | """ 40 | 41 | def __init__(self, weights, num_class=21, network='efficientdet-d0', size_image=(512, 512)): 42 | super(Detect, self).__init__() 43 | self.weights = weights 44 | self.size_image = size_image 45 | self.device = torch.device( 46 | "cuda:0" if torch.cuda.is_available() else 'cpu') 47 | self.transform = get_augumentation(phase='test') 48 | if(self.weights is not None): 49 | print('Load pretrained Model') 50 | checkpoint = torch.load( 51 | self.weights, map_location=lambda storage, loc: storage) 52 | params = checkpoint['parser'] 53 | num_class = params.num_class 54 | network = params.network 55 | 56 | self.model = EfficientDet(num_classes=num_class, 57 | network=network, 58 | W_bifpn=EFFICIENTDET[network]['W_bifpn'], 59 | D_bifpn=EFFICIENTDET[network]['D_bifpn'], 60 | 
D_class=EFFICIENTDET[network]['D_class'], 61 | is_training=False 62 | ) 63 | 64 | if(self.weights is not None): 65 | state_dict = checkpoint['state_dict'] 66 | self.model.load_state_dict(state_dict) 67 | if torch.cuda.is_available(): 68 | self.model = self.model.cuda() 69 | self.model.eval() 70 | 71 | def process(self, file_name=None, img=None, show=False): 72 | if file_name is not None: 73 | img = cv2.imread(file_name) 74 | origin_img = copy.deepcopy(img) 75 | augmentation = self.transform(image=img) 76 | img = augmentation['image'] 77 | img = img.to(self.device) 78 | img = img.unsqueeze(0) 79 | 80 | with torch.no_grad(): 81 | scores, classification, transformed_anchors = self.model(img) 82 | bboxes = list() 83 | labels = list() 84 | bbox_scores = list() 85 | colors = list() 86 | for j in range(scores.shape[0]): 87 | bbox = transformed_anchors[[j], :][0].data.cpu().numpy() 88 | x1 = int(bbox[0]*origin_img.shape[1]/self.size_image[1]) 89 | y1 = int(bbox[1]*origin_img.shape[0]/self.size_image[0]) 90 | x2 = int(bbox[2]*origin_img.shape[1]/self.size_image[1]) 91 | y2 = int(bbox[3]*origin_img.shape[0]/self.size_image[0]) 92 | bboxes.append([x1, y1, x2, y2]) 93 | label_name = VOC_CLASSES[int(classification[[j]])] 94 | labels.append(label_name) 95 | 96 | if(args.cam): 97 | cv2.rectangle(origin_img, (x1, y1), 98 | (x2, y2), (179, 255, 179), 2, 1) 99 | if args.score: 100 | score = np.around( 101 | scores[[j]].cpu().numpy(), decimals=2) * 100 102 | if(args.cam): 103 | labelSize, baseLine = cv2.getTextSize('{} {}'.format( 104 | label_name, int(score)), cv2.FONT_HERSHEY_SIMPLEX, 0.8, 2) 105 | cv2.rectangle( 106 | origin_img, (x1, y1-labelSize[1]), (x1+labelSize[0], y1+baseLine), (223, 128, 255), cv2.FILLED) 107 | cv2.putText( 108 | origin_img, '{} {}'.format(label_name, int(score)), 109 | (x1, y1), cv2.FONT_HERSHEY_SIMPLEX, 110 | 0.8, (0, 0, 0), 2 111 | ) 112 | bbox_scores.append(int(score)) 113 | else: 114 | if(args.cam): 115 | labelSize, baseLine = cv2.getTextSize('{}'.format( 116 | label_name), cv2.FONT_HERSHEY_SIMPLEX, 0.8, 2) 117 | cv2.rectangle( 118 | origin_img, (x1, y1-labelSize[1]), (x1+labelSize[0], y1+baseLine), (0, 102, 255), cv2.FILLED) 119 | cv2.putText( 120 | origin_img, '{} {}'.format(label_name, int(score)), 121 | (x1, y1), cv2.FONT_HERSHEY_SIMPLEX, 122 | 0.8, (0, 0, 0), 2 123 | ) 124 | if show: 125 | fig, ax = vis_bbox(img=origin_img, bbox=bboxes, 126 | label=labels, score=bbox_scores) 127 | fig.savefig('./docs/demo.png') 128 | plt.show() 129 | else: 130 | return origin_img 131 | 132 | def camera(self): 133 | cap = cv2.VideoCapture(0) 134 | if not cap.isOpened(): 135 | print("Unable to open camera") 136 | exit(-1) 137 | count_tfps = 1 138 | accum_time = 0 139 | curr_fps = 0 140 | fps = "FPS: ??" 
141 | prev_time = timer() 142 | while True: 143 | res, img = cap.read() 144 | curr_time = timer() 145 | exec_time = curr_time - prev_time 146 | prev_time = curr_time 147 | accum_time = accum_time + exec_time 148 | curr_fps = curr_fps + 1 149 | 150 | if accum_time > 1: 151 | accum_time = accum_time - 1 152 | fps = curr_fps 153 | curr_fps = 0 154 | if res: 155 | show_image = self.process(img=img) 156 | cv2.putText( 157 | show_image, "FPS: " + str(fps), (10, 20), 158 | cv2.FONT_HERSHEY_SIMPLEX, 0.9, (204, 51, 51), 2 159 | ) 160 | 161 | cv2.imshow("Detection", show_image) 162 | k = cv2.waitKey(1) 163 | if k == 27: 164 | break 165 | else: 166 | print("Unable to read image") 167 | exit(-1) 168 | count_tfps += 1 169 | cap.release() 170 | cv2.destroyAllWindows() 171 | 172 | 173 | if __name__ == '__main__': 174 | detect = Detect(weights=args.weight) 175 | print('cam: ', args.cam) 176 | if args.cam: 177 | detect.camera() 178 | else: 179 | detect.process(file_name=args.file_name, show=True) 180 | -------------------------------------------------------------------------------- /docs/arch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toandaominh1997/EfficientDet.Pytorch/fbe56e58c9a2749520303d2d380427e5f01305ba/docs/arch.png -------------------------------------------------------------------------------- /docs/compare.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toandaominh1997/EfficientDet.Pytorch/fbe56e58c9a2749520303d2d380427e5f01305ba/docs/compare.png -------------------------------------------------------------------------------- /docs/demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toandaominh1997/EfficientDet.Pytorch/fbe56e58c9a2749520303d2d380427e5f01305ba/docs/demo.png -------------------------------------------------------------------------------- /docs/output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toandaominh1997/EfficientDet.Pytorch/fbe56e58c9a2749520303d2d380427e5f01305ba/docs/output.png -------------------------------------------------------------------------------- /docs/performance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toandaominh1997/EfficientDet.Pytorch/fbe56e58c9a2749520303d2d380427e5f01305ba/docs/performance.png -------------------------------------------------------------------------------- /docs/pytoan.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toandaominh1997/EfficientDet.Pytorch/fbe56e58c9a2749520303d2d380427e5f01305ba/docs/pytoan.gif -------------------------------------------------------------------------------- /eval.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import numpy as np 5 | import torch 6 | from torch.utils.data import DataLoader 7 | from torchvision import transforms 8 | from tqdm import tqdm 9 | from pycocotools.cocoeval import COCOeval 10 | import json 11 | 12 | from datasets import (Augmenter, CocoDataset, Normalizer, 13 | Resizer, VOCDetection, collater, detection_collate, 14 | get_augumentation) 15 | from models.efficientdet import EfficientDet 16 | from utils import EFFICIENTDET, get_state_dict 17 | 18 | 19 | def 
compute_overlap(a, b): 20 | """ 21 | Parameters 22 | ---------- 23 | a: (N, 4) ndarray of float 24 | b: (K, 4) ndarray of float 25 | Returns 26 | ------- 27 | overlaps: (N, K) ndarray of overlap between boxes and query_boxes 28 | """ 29 | area = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1]) 30 | 31 | iw = np.minimum(np.expand_dims( 32 | a[:, 2], axis=1), b[:, 2]) - np.maximum(np.expand_dims(a[:, 0], 1), b[:, 0]) 33 | ih = np.minimum(np.expand_dims( 34 | a[:, 3], axis=1), b[:, 3]) - np.maximum(np.expand_dims(a[:, 1], 1), b[:, 1]) 35 | 36 | iw = np.maximum(iw, 0) 37 | ih = np.maximum(ih, 0) 38 | 39 | ua = np.expand_dims((a[:, 2] - a[:, 0]) * 40 | (a[:, 3] - a[:, 1]), axis=1) + area - iw * ih 41 | 42 | ua = np.maximum(ua, np.finfo(float).eps) 43 | 44 | intersection = iw * ih 45 | 46 | return intersection / ua 47 | 48 | 49 | def _compute_ap(recall, precision): 50 | """ Compute the average precision, given the recall and precision curves. 51 | Code originally from https://github.com/rbgirshick/py-faster-rcnn. 52 | # Arguments 53 | recall: The recall curve (list). 54 | precision: The precision curve (list). 55 | # Returns 56 | The average precision as computed in py-faster-rcnn. 57 | """ 58 | # correct AP calculation 59 | # first append sentinel values at the end 60 | mrec = np.concatenate(([0.], recall, [1.])) 61 | mpre = np.concatenate(([0.], precision, [0.])) 62 | 63 | # compute the precision envelope 64 | for i in range(mpre.size - 1, 0, -1): 65 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 66 | 67 | # to calculate area under PR curve, look for points 68 | # where X axis (recall) changes value 69 | i = np.where(mrec[1:] != mrec[:-1])[0] 70 | 71 | # and sum (\Delta recall) * prec 72 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 73 | return ap 74 | 75 | 76 | def _get_detections(dataset, retinanet, score_threshold=0.05, max_detections=100, save_path=None): 77 | """ Get the detections from the retinanet using the generator. 78 | The result is a list of lists such that the size is: 79 | all_detections[num_images][num_classes] = detections[num_detections, 4 + num_classes] 80 | # Arguments 81 | dataset : The generator used to run images through the retinanet. 82 | retinanet : The retinanet to run on the images. 83 | score_threshold : The score confidence threshold to use. 84 | max_detections : The maximum number of detections to use per image. 85 | save_path : The path to save the images with visualized detections to. 86 | # Returns 87 | A list of lists containing the detections for each image in the generator. 
88 | """ 89 | all_detections = [[None for i in range( 90 | dataset.num_classes())] for j in range(len(dataset))] 91 | 92 | retinanet.eval() 93 | 94 | with torch.no_grad(): 95 | 96 | for index in range(len(dataset)): 97 | data = dataset[index] 98 | scale = data['scale'] 99 | 100 | # run network 101 | scores, labels, boxes = retinanet(data['img'].permute( 102 | 2, 0, 1).cuda().float().unsqueeze(dim=0)) 103 | scores = scores.cpu().numpy() 104 | labels = labels.cpu().numpy() 105 | boxes = boxes.cpu().numpy() 106 | 107 | # correct boxes for image scale 108 | boxes /= scale 109 | 110 | # select indices which have a score above the threshold 111 | indices = np.where(scores > score_threshold)[0] 112 | if indices.shape[0] > 0: 113 | # select those scores 114 | scores = scores[indices] 115 | 116 | # find the order with which to sort the scores 117 | scores_sort = np.argsort(-scores)[:max_detections] 118 | 119 | # select detections 120 | image_boxes = boxes[indices[scores_sort], :] 121 | image_scores = scores[scores_sort] 122 | image_labels = labels[indices[scores_sort]] 123 | image_detections = np.concatenate([image_boxes, np.expand_dims( 124 | image_scores, axis=1), np.expand_dims(image_labels, axis=1)], axis=1) 125 | 126 | # copy detections to all_detections 127 | for label in range(dataset.num_classes()): 128 | all_detections[index][label] = image_detections[image_detections[:, -1] == label, :-1] 129 | else: 130 | # copy detections to all_detections 131 | for label in range(dataset.num_classes()): 132 | all_detections[index][label] = np.zeros((0, 5)) 133 | 134 | print('{}/{}'.format(index + 1, len(dataset)), end='\r') 135 | 136 | return all_detections 137 | 138 | 139 | def _get_annotations(generator): 140 | """ Get the ground truth annotations from the generator. 141 | The result is a list of lists such that the size is: 142 | all_detections[num_images][num_classes] = annotations[num_detections, 5] 143 | # Arguments 144 | generator : The generator used to retrieve ground truth annotations. 145 | # Returns 146 | A list of lists containing the annotations for each image in the generator. 147 | """ 148 | all_annotations = [[None for i in range( 149 | generator.num_classes())] for j in range(len(generator))] 150 | 151 | for i in range(len(generator)): 152 | # load the annotations 153 | annotations = generator.load_annotations(i) 154 | 155 | # copy detections to all_annotations 156 | for label in range(generator.num_classes()): 157 | all_annotations[i][label] = annotations[annotations[:, 4] 158 | == label, :4].copy() 159 | 160 | print('{}/{}'.format(i + 1, len(generator)), end='\r') 161 | 162 | return all_annotations 163 | 164 | 165 | def evaluate( 166 | generator, 167 | retinanet, 168 | iou_threshold=0.5, 169 | score_threshold=0.05, 170 | max_detections=100, 171 | save_path=None 172 | ): 173 | """ Evaluate a given dataset using a given retinanet. 174 | # Arguments 175 | generator : The generator that represents the dataset to evaluate. 176 | retinanet : The retinanet to evaluate. 177 | iou_threshold : The threshold used to consider when a detection is positive or negative. 178 | score_threshold : The score confidence threshold to use for detections. 179 | max_detections : The maximum number of detections to use per image. 180 | save_path : The path to save images with visualized detections to. 181 | # Returns 182 | A dict mapping class names to mAP scores. 
183 | """ 184 | 185 | # gather all detections and annotations 186 | 187 | all_detections = _get_detections( 188 | generator, retinanet, score_threshold=score_threshold, max_detections=max_detections, save_path=save_path) 189 | all_annotations = _get_annotations(generator) 190 | 191 | average_precisions = {} 192 | 193 | for label in range(generator.num_classes()): 194 | false_positives = np.zeros((0,)) 195 | true_positives = np.zeros((0,)) 196 | scores = np.zeros((0,)) 197 | num_annotations = 0.0 198 | 199 | for i in range(len(generator)): 200 | detections = all_detections[i][label] 201 | annotations = all_annotations[i][label] 202 | num_annotations += annotations.shape[0] 203 | detected_annotations = [] 204 | 205 | for d in detections: 206 | scores = np.append(scores, d[4]) 207 | 208 | if annotations.shape[0] == 0: 209 | false_positives = np.append(false_positives, 1) 210 | true_positives = np.append(true_positives, 0) 211 | continue 212 | 213 | overlaps = compute_overlap( 214 | np.expand_dims(d, axis=0), annotations) 215 | assigned_annotation = np.argmax(overlaps, axis=1) 216 | max_overlap = overlaps[0, assigned_annotation] 217 | 218 | if max_overlap >= iou_threshold and assigned_annotation not in detected_annotations: 219 | false_positives = np.append(false_positives, 0) 220 | true_positives = np.append(true_positives, 1) 221 | detected_annotations.append(assigned_annotation) 222 | else: 223 | false_positives = np.append(false_positives, 1) 224 | true_positives = np.append(true_positives, 0) 225 | 226 | # no annotations -> AP for this class is 0 (is this correct?) 227 | if num_annotations == 0: 228 | average_precisions[label] = 0, 0 229 | continue 230 | 231 | # sort by score 232 | indices = np.argsort(-scores) 233 | false_positives = false_positives[indices] 234 | true_positives = true_positives[indices] 235 | 236 | # compute false positives and true positives 237 | false_positives = np.cumsum(false_positives) 238 | true_positives = np.cumsum(true_positives) 239 | 240 | # compute recall and precision 241 | recall = true_positives / num_annotations 242 | precision = true_positives / \ 243 | np.maximum(true_positives + false_positives, 244 | np.finfo(np.float64).eps) 245 | 246 | # compute average precision 247 | average_precision = _compute_ap(recall, precision) 248 | average_precisions[label] = average_precision, num_annotations 249 | 250 | print('\nmAP:') 251 | avg_mAP = [] 252 | for label in range(generator.num_classes()): 253 | label_name = generator.label_to_name(label) 254 | print('{}: {}'.format(label_name, average_precisions[label][0])) 255 | avg_mAP.append(average_precisions[label][0]) 256 | print('avg mAP: {}'.format(np.mean(avg_mAP))) 257 | return np.mean(avg_mAP), average_precisions 258 | 259 | 260 | def evaluate_coco(dataset, model, threshold=0.05): 261 | 262 | model.eval() 263 | 264 | with torch.no_grad(): 265 | 266 | # start collecting results 267 | results = [] 268 | image_ids = [] 269 | 270 | for index in range(len(dataset)): 271 | data = dataset[index] 272 | scale = data['scale'] 273 | 274 | # run network 275 | scores, labels, boxes = model(data['img'].permute( 276 | 2, 0, 1).cuda().float().unsqueeze(dim=0)) 277 | scores = scores.cpu() 278 | labels = labels.cpu() 279 | boxes = boxes.cpu() 280 | 281 | # correct boxes for image scale 282 | boxes /= scale 283 | 284 | if boxes.shape[0] > 0: 285 | # change to (x, y, w, h) (MS COCO standard) 286 | boxes[:, 2] -= boxes[:, 0] 287 | boxes[:, 3] -= boxes[:, 1] 288 | 289 | # compute predicted labels and scores 290 | # for box, 
score, label in zip(boxes[0], scores[0], labels[0]): 291 | for box_id in range(boxes.shape[0]): 292 | score = float(scores[box_id]) 293 | label = int(labels[box_id]) 294 | box = boxes[box_id, :] 295 | 296 | # scores are sorted, so we can break 297 | if score < threshold: 298 | break 299 | 300 | # append detection for each positively labeled class 301 | image_result = { 302 | 'image_id': dataset.image_ids[index], 303 | 'category_id': dataset.label_to_coco_label(label), 304 | 'score': float(score), 305 | 'bbox': box.tolist(), 306 | } 307 | 308 | # append detection to results 309 | results.append(image_result) 310 | 311 | # append image to list of processed images 312 | image_ids.append(dataset.image_ids[index]) 313 | 314 | # print progress 315 | print('{}/{}'.format(index, len(dataset)), end='\r') 316 | 317 | if not len(results): 318 | return 319 | 320 | # write output 321 | json.dump(results, open('{}_bbox_results.json'.format( 322 | dataset.set_name), 'w'), indent=4) 323 | 324 | # load results in COCO evaluation tool 325 | coco_true = dataset.coco 326 | coco_pred = coco_true.loadRes( 327 | '{}_bbox_results.json'.format(dataset.set_name)) 328 | 329 | # run COCO evaluation 330 | coco_eval = COCOeval(coco_true, coco_pred, 'bbox') 331 | coco_eval.params.imgIds = image_ids 332 | coco_eval.evaluate() 333 | coco_eval.accumulate() 334 | coco_eval.summarize() 335 | 336 | model.train() 337 | 338 | return 339 | 340 | 341 | if __name__ == '__main__': 342 | parser = argparse.ArgumentParser( 343 | description='EfficientDet Training With Pytorch') 344 | train_set = parser.add_mutually_exclusive_group() 345 | parser.add_argument('--dataset', default='VOC', choices=['VOC', 'COCO'], 346 | type=str, help='VOC or COCO') 347 | parser.add_argument('--dataset_root', default='/root/data/VOCdevkit/', 348 | help='Dataset root directory path [/root/data/VOCdevkit/, /root/data/coco/]') 349 | parser.add_argument('-t', '--threshold', default=0.4, 350 | type=float, help='Visualization threshold') 351 | parser.add_argument('-it', '--iou_threshold', default=0.5, 352 | type=float, help='Visualization threshold') 353 | parser.add_argument('--weight', default='./checkpoint_VOC_efficientdet-d0_248.pth', type=str, 354 | help='Checkpoint state_dict file to resume training from') 355 | args = parser.parse_args() 356 | 357 | if(args.weight is not None): 358 | resume_path = str(args.weight) 359 | print("Loading checkpoint: {} ...".format(resume_path)) 360 | checkpoint = torch.load( 361 | args.weight, map_location=lambda storage, loc: storage) 362 | params = checkpoint['parser'] 363 | args.num_class = params.num_class 364 | args.network = params.network 365 | model = EfficientDet( 366 | num_classes=args.num_class, 367 | network=args.network, 368 | W_bifpn=EFFICIENTDET[args.network]['W_bifpn'], 369 | D_bifpn=EFFICIENTDET[args.network]['D_bifpn'], 370 | D_class=EFFICIENTDET[args.network]['D_class'], 371 | is_training=False, 372 | threshold=args.threshold, 373 | iou_threshold=args.iou_threshold) 374 | model.load_state_dict(checkpoint['state_dict']) 375 | model = model.cuda() 376 | if(args.dataset == 'VOC'): 377 | valid_dataset = VOCDetection(root=args.dataset_root, image_sets=[('2007', 'test')], 378 | transform=transforms.Compose([Normalizer(), Resizer()])) 379 | evaluate(valid_dataset, model) 380 | else: 381 | valid_dataset = CocoDataset(root_dir=args.dataset_root, set_name='val2017', 382 | transform=transforms.Compose([Normalizer(), Resizer()])) 383 | evaluate_coco(valid_dataset, model) 384 | 
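# Usage sketch (illustrative; flags and defaults mirror the argparse setup above):
#   python eval.py --dataset VOC --dataset_root /root/data/VOCdevkit/ \
#       --weight ./checkpoint_VOC_efficientdet-d0_248.pth -t 0.4 -it 0.5
#   python eval.py --dataset COCO --dataset_root /root/data/coco/ --weight <checkpoint.pth>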
-------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | from .efficientdet import EfficientDet -------------------------------------------------------------------------------- /models/bifpn.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | 4 | 5 | from .module import ConvModule, xavier_init 6 | import torch 7 | 8 | 9 | class BIFPN(nn.Module): 10 | def __init__(self, 11 | in_channels, 12 | out_channels, 13 | num_outs, 14 | start_level=0, 15 | end_level=-1, 16 | stack=1, 17 | add_extra_convs=False, 18 | extra_convs_on_inputs=True, 19 | relu_before_extra_convs=False, 20 | no_norm_on_lateral=False, 21 | conv_cfg=None, 22 | norm_cfg=None, 23 | activation=None): 24 | super(BIFPN, self).__init__() 25 | assert isinstance(in_channels, list) 26 | self.in_channels = in_channels 27 | self.out_channels = out_channels 28 | self.num_ins = len(in_channels) 29 | self.num_outs = num_outs 30 | self.activation = activation 31 | self.relu_before_extra_convs = relu_before_extra_convs 32 | self.no_norm_on_lateral = no_norm_on_lateral 33 | self.stack = stack 34 | 35 | if end_level == -1: 36 | self.backbone_end_level = self.num_ins 37 | assert num_outs >= self.num_ins - start_level 38 | else: 39 | # if end_level < inputs, no extra level is allowed 40 | self.backbone_end_level = end_level 41 | assert end_level <= len(in_channels) 42 | assert num_outs == end_level - start_level 43 | self.start_level = start_level 44 | self.end_level = end_level 45 | self.add_extra_convs = add_extra_convs 46 | self.extra_convs_on_inputs = extra_convs_on_inputs 47 | 48 | self.lateral_convs = nn.ModuleList() 49 | self.fpn_convs = nn.ModuleList() 50 | self.stack_bifpn_convs = nn.ModuleList() 51 | 52 | for i in range(self.start_level, self.backbone_end_level): 53 | l_conv = ConvModule( 54 | in_channels[i], 55 | out_channels, 56 | 1, 57 | conv_cfg=conv_cfg, 58 | norm_cfg=norm_cfg if not self.no_norm_on_lateral else None, 59 | activation=self.activation, 60 | inplace=False) 61 | self.lateral_convs.append(l_conv) 62 | 63 | for ii in range(stack): 64 | self.stack_bifpn_convs.append(BiFPNModule(channels=out_channels, 65 | levels=self.backbone_end_level-self.start_level, 66 | conv_cfg=conv_cfg, 67 | norm_cfg=norm_cfg, 68 | activation=activation)) 69 | # add extra conv layers (e.g., RetinaNet) 70 | extra_levels = num_outs - self.backbone_end_level + self.start_level 71 | if add_extra_convs and extra_levels >= 1: 72 | for i in range(extra_levels): 73 | if i == 0 and self.extra_convs_on_inputs: 74 | in_channels = self.in_channels[self.backbone_end_level - 1] 75 | else: 76 | in_channels = out_channels 77 | extra_fpn_conv = ConvModule( 78 | in_channels, 79 | out_channels, 80 | 3, 81 | stride=2, 82 | padding=1, 83 | conv_cfg=conv_cfg, 84 | norm_cfg=norm_cfg, 85 | activation=self.activation, 86 | inplace=False) 87 | self.fpn_convs.append(extra_fpn_conv) 88 | self.init_weights() 89 | 90 | # default init_weights for conv(msra) and norm in ConvModule 91 | def init_weights(self): 92 | for m in self.modules(): 93 | if isinstance(m, nn.Conv2d): 94 | xavier_init(m, distribution='uniform') 95 | 96 | def forward(self, inputs): 97 | assert len(inputs) == len(self.in_channels) 98 | 99 | # build laterals 100 | laterals = [ 101 | lateral_conv(inputs[i + self.start_level]) 102 | for i, lateral_conv in enumerate(self.lateral_convs) 103 | 
] 104 | 105 | # part 1: build top-down and down-top path with stack 106 | used_backbone_levels = len(laterals) 107 | for bifpn_module in self.stack_bifpn_convs: 108 | laterals = bifpn_module(laterals) 109 | outs = laterals 110 | # part 2: add extra levels 111 | if self.num_outs > len(outs): 112 | # use max pool to get more levels on top of outputs 113 | # (e.g., Faster R-CNN, Mask R-CNN) 114 | if not self.add_extra_convs: 115 | for i in range(self.num_outs - used_backbone_levels): 116 | outs.append(F.max_pool2d(outs[-1], 1, stride=2)) 117 | # add conv layers on top of original feature maps (RetinaNet) 118 | else: 119 | if self.extra_convs_on_inputs: 120 | orig = inputs[self.backbone_end_level - 1] 121 | outs.append(self.fpn_convs[0](orig)) 122 | else: 123 | outs.append(self.fpn_convs[0](outs[-1])) 124 | for i in range(1, self.num_outs - used_backbone_levels): 125 | if self.relu_before_extra_convs: 126 | outs.append(self.fpn_convs[i](F.relu(outs[-1]))) 127 | else: 128 | outs.append(self.fpn_convs[i](outs[-1])) 129 | return tuple(outs) 130 | 131 | 132 | class BiFPNModule(nn.Module): 133 | def __init__(self, 134 | channels, 135 | levels, 136 | init=0.5, 137 | conv_cfg=None, 138 | norm_cfg=None, 139 | activation=None, 140 | eps=0.0001): 141 | super(BiFPNModule, self).__init__() 142 | self.activation = activation 143 | self.eps = eps 144 | self.levels = levels 145 | self.bifpn_convs = nn.ModuleList() 146 | # weighted 147 | self.w1 = nn.Parameter(torch.Tensor(2, levels).fill_(init)) 148 | self.relu1 = nn.ReLU() 149 | self.w2 = nn.Parameter(torch.Tensor(3, levels - 2).fill_(init)) 150 | self.relu2 = nn.ReLU() 151 | for jj in range(2): 152 | for i in range(self.levels-1): # 1,2,3 153 | fpn_conv = nn.Sequential( 154 | ConvModule( 155 | channels, 156 | channels, 157 | 3, 158 | padding=1, 159 | conv_cfg=conv_cfg, 160 | norm_cfg=norm_cfg, 161 | activation=self.activation, 162 | inplace=False) 163 | ) 164 | self.bifpn_convs.append(fpn_conv) 165 | 166 | # default init_weights for conv(msra) and norm in ConvModule 167 | def init_weights(self): 168 | for m in self.modules(): 169 | if isinstance(m, nn.Conv2d): 170 | xavier_init(m, distribution='uniform') 171 | 172 | def forward(self, inputs): 173 | assert len(inputs) == self.levels 174 | # build top-down and down-top path with stack 175 | levels = self.levels 176 | # w relu 177 | w1 = self.relu1(self.w1) 178 | w1 /= torch.sum(w1, dim=0) + self.eps # normalize 179 | w2 = self.relu2(self.w2) 180 | w2 /= torch.sum(w2, dim=0) + self.eps # normalize 181 | # build top-down 182 | idx_bifpn = 0 183 | pathtd = inputs 184 | inputs_clone = [] 185 | for in_tensor in inputs: 186 | inputs_clone.append(in_tensor.clone()) 187 | 188 | for i in range(levels - 1, 0, -1): 189 | pathtd[i - 1] = (w1[0, i-1]*pathtd[i - 1] + w1[1, i-1]*F.interpolate( 190 | pathtd[i], scale_factor=2, mode='nearest'))/(w1[0, i-1] + w1[1, i-1] + self.eps) 191 | pathtd[i - 1] = self.bifpn_convs[idx_bifpn](pathtd[i - 1]) 192 | idx_bifpn = idx_bifpn + 1 193 | # build down-top 194 | for i in range(0, levels - 2, 1): 195 | pathtd[i + 1] = (w2[0, i] * pathtd[i + 1] + w2[1, i] * F.max_pool2d(pathtd[i], kernel_size=2) + 196 | w2[2, i] * inputs_clone[i + 1])/(w2[0, i] + w2[1, i] + w2[2, i] + self.eps) 197 | pathtd[i + 1] = self.bifpn_convs[idx_bifpn](pathtd[i + 1]) 198 | idx_bifpn = idx_bifpn + 1 199 | 200 | pathtd[levels - 1] = (w1[0, levels-1] * pathtd[levels - 1] + w1[1, levels-1] * F.max_pool2d( 201 | pathtd[levels - 2], kernel_size=2))/(w1[0, levels-1] + w1[1, levels-1] + self.eps) 202 | pathtd[levels - 
1] = self.bifpn_convs[idx_bifpn](pathtd[levels - 1]) 203 | return pathtd 204 | -------------------------------------------------------------------------------- /models/efficientdet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import math 4 | from models.efficientnet import EfficientNet 5 | from models.bifpn import BIFPN 6 | from .retinahead import RetinaHead 7 | from models.module import RegressionModel, ClassificationModel, Anchors, ClipBoxes, BBoxTransform 8 | from torchvision.ops import nms 9 | from .losses import FocalLoss 10 | MODEL_MAP = { 11 | 'efficientdet-d0': 'efficientnet-b0', 12 | 'efficientdet-d1': 'efficientnet-b1', 13 | 'efficientdet-d2': 'efficientnet-b2', 14 | 'efficientdet-d3': 'efficientnet-b3', 15 | 'efficientdet-d4': 'efficientnet-b4', 16 | 'efficientdet-d5': 'efficientnet-b5', 17 | 'efficientdet-d6': 'efficientnet-b6', 18 | 'efficientdet-d7': 'efficientnet-b6', 19 | } 20 | 21 | 22 | class EfficientDet(nn.Module): 23 | def __init__(self, 24 | num_classes, 25 | network='efficientdet-d0', 26 | D_bifpn=3, 27 | W_bifpn=88, 28 | D_class=3, 29 | is_training=True, 30 | threshold=0.01, 31 | iou_threshold=0.5): 32 | super(EfficientDet, self).__init__() 33 | self.backbone = EfficientNet.from_pretrained(MODEL_MAP[network]) 34 | self.is_training = is_training 35 | self.neck = BIFPN(in_channels=self.backbone.get_list_features()[-5:], 36 | out_channels=W_bifpn, 37 | stack=D_bifpn, 38 | num_outs=5) 39 | self.bbox_head = RetinaHead(num_classes=num_classes, 40 | in_channels=W_bifpn) 41 | 42 | self.anchors = Anchors() 43 | self.regressBoxes = BBoxTransform() 44 | self.clipBoxes = ClipBoxes() 45 | self.threshold = threshold 46 | self.iou_threshold = iou_threshold 47 | for m in self.modules(): 48 | if isinstance(m, nn.Conv2d): 49 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 50 | m.weight.data.normal_(0, math.sqrt(2. 
/ n)) 51 | elif isinstance(m, nn.BatchNorm2d): 52 | m.weight.data.fill_(1) 53 | m.bias.data.zero_() 54 | self.freeze_bn() 55 | self.criterion = FocalLoss() 56 | 57 | def forward(self, inputs): 58 | if self.is_training: 59 | inputs, annotations = inputs 60 | else: 61 | inputs = inputs 62 | x = self.extract_feat(inputs) 63 | outs = self.bbox_head(x) 64 | classification = torch.cat([out for out in outs[0]], dim=1) 65 | regression = torch.cat([out for out in outs[1]], dim=1) 66 | anchors = self.anchors(inputs) 67 | if self.is_training: 68 | return self.criterion(classification, regression, anchors, annotations) 69 | else: 70 | transformed_anchors = self.regressBoxes(anchors, regression) 71 | transformed_anchors = self.clipBoxes(transformed_anchors, inputs) 72 | scores = torch.max(classification, dim=2, keepdim=True)[0] 73 | scores_over_thresh = (scores > self.threshold)[0, :, 0] 74 | 75 | if scores_over_thresh.sum() == 0: 76 | print('No boxes to NMS') 77 | # no boxes to NMS, just return 78 | return [torch.zeros(0), torch.zeros(0), torch.zeros(0, 4)] 79 | classification = classification[:, scores_over_thresh, :] 80 | transformed_anchors = transformed_anchors[:, scores_over_thresh, :] 81 | scores = scores[:, scores_over_thresh, :] 82 | anchors_nms_idx = nms( 83 | transformed_anchors[0, :, :], scores[0, :, 0], iou_threshold=self.iou_threshold) 84 | nms_scores, nms_class = classification[0, anchors_nms_idx, :].max( 85 | dim=1) 86 | return [nms_scores, nms_class, transformed_anchors[0, anchors_nms_idx, :]] 87 | 88 | def freeze_bn(self): 89 | '''Freeze BatchNorm layers.''' 90 | for layer in self.modules(): 91 | if isinstance(layer, nn.BatchNorm2d): 92 | layer.eval() 93 | 94 | def extract_feat(self, img): 95 | """ 96 | Directly extract features from the backbone+neck 97 | """ 98 | x = self.backbone(img) 99 | x = self.neck(x[-5:]) 100 | return x 101 | -------------------------------------------------------------------------------- /models/efficientnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | 5 | from models.utils import ( 6 | round_filters, 7 | round_repeats, 8 | drop_connect, 9 | get_same_padding_conv2d, 10 | get_model_params, 11 | efficientnet_params, 12 | load_pretrained_weights, 13 | Swish, 14 | MemoryEfficientSwish, 15 | ) 16 | 17 | 18 | class MBConvBlock(nn.Module): 19 | """ 20 | Mobile Inverted Residual Bottleneck Block 21 | Args: 22 | block_args (namedtuple): BlockArgs, see above 23 | global_params (namedtuple): GlobalParam, see above 24 | Attributes: 25 | has_se (bool): Whether the block contains a Squeeze and Excitation layer. 
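    Example (illustrative sketch; `get_model_params` is the helper imported from models.utils,
    called here the same way `from_name` calls it):
        blocks_args, global_params = get_model_params('efficientnet-b0', {'num_classes': 1000})
        block = MBConvBlock(blocks_args[0], global_params)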
26 | """ 27 | 28 | def __init__(self, block_args, global_params): 29 | super().__init__() 30 | self._block_args = block_args 31 | self._bn_mom = 1 - global_params.batch_norm_momentum 32 | self._bn_eps = global_params.batch_norm_epsilon 33 | self.has_se = (self._block_args.se_ratio is not None) and ( 34 | 0 < self._block_args.se_ratio <= 1) 35 | self.id_skip = block_args.id_skip # skip connection and drop connect 36 | 37 | # Get static or dynamic convolution depending on image size 38 | Conv2d = get_same_padding_conv2d(image_size=global_params.image_size) 39 | 40 | # Expansion phase 41 | inp = self._block_args.input_filters # number of input channels 42 | oup = self._block_args.input_filters * \ 43 | self._block_args.expand_ratio # number of output channels 44 | if self._block_args.expand_ratio != 1: 45 | self._expand_conv = Conv2d( 46 | in_channels=inp, out_channels=oup, kernel_size=1, bias=False) 47 | self._bn0 = nn.BatchNorm2d( 48 | num_features=oup, momentum=self._bn_mom, eps=self._bn_eps) 49 | # Depthwise convolution phase 50 | k = self._block_args.kernel_size 51 | s = self._block_args.stride 52 | self._depthwise_conv = Conv2d( 53 | in_channels=oup, out_channels=oup, groups=oup, # groups makes it depthwise 54 | kernel_size=k, stride=s, bias=False) 55 | self._bn1 = nn.BatchNorm2d( 56 | num_features=oup, momentum=self._bn_mom, eps=self._bn_eps) 57 | 58 | # Squeeze and Excitation layer, if desired 59 | if self.has_se: 60 | num_squeezed_channels = max( 61 | 1, int(self._block_args.input_filters * self._block_args.se_ratio)) 62 | self._se_reduce = Conv2d( 63 | in_channels=oup, out_channels=num_squeezed_channels, kernel_size=1) 64 | self._se_expand = Conv2d( 65 | in_channels=num_squeezed_channels, out_channels=oup, kernel_size=1) 66 | 67 | # Output phase 68 | final_oup = self._block_args.output_filters 69 | self._project_conv = Conv2d( 70 | in_channels=oup, out_channels=final_oup, kernel_size=1, bias=False) 71 | self._bn2 = nn.BatchNorm2d( 72 | num_features=final_oup, momentum=self._bn_mom, eps=self._bn_eps) 73 | self._swish = MemoryEfficientSwish() 74 | 75 | def forward(self, inputs, drop_connect_rate=None): 76 | """ 77 | :param inputs: input tensor 78 | :param drop_connect_rate: drop connect rate (float, between 0 and 1) 79 | :return: output of block 80 | """ 81 | 82 | # Expansion and Depthwise Convolution 83 | x = inputs 84 | if self._block_args.expand_ratio != 1: 85 | x = self._swish(self._bn0(self._expand_conv(inputs))) 86 | 87 | x = self._swish(self._bn1(self._depthwise_conv(x))) 88 | 89 | # Squeeze and Excitation 90 | if self.has_se: 91 | x_squeezed = F.adaptive_avg_pool2d(x, 1) 92 | x_squeezed = self._se_expand( 93 | self._swish(self._se_reduce(x_squeezed))) 94 | x = torch.sigmoid(x_squeezed) * x 95 | 96 | x = self._bn2(self._project_conv(x)) 97 | 98 | # Skip connection and drop connect 99 | input_filters, output_filters = self._block_args.input_filters, self._block_args.output_filters 100 | if self.id_skip and self._block_args.stride == 1 and input_filters == output_filters: 101 | if drop_connect_rate: 102 | x = drop_connect(x, p=drop_connect_rate, 103 | training=self.training) 104 | x = x + inputs # skip connection 105 | return x 106 | 107 | def set_swish(self, memory_efficient=True): 108 | """Sets swish function as memory efficient (for training) or standard (for export)""" 109 | self._swish = MemoryEfficientSwish() if memory_efficient else Swish() 110 | 111 | 112 | class EfficientNet(nn.Module): 113 | """ 114 | An EfficientNet model. 
Most easily loaded with the .from_name or .from_pretrained methods 115 | Args: 116 | blocks_args (list): A list of BlockArgs to construct blocks 117 | global_params (namedtuple): A set of GlobalParams shared between blocks 118 | Example: 119 | model = EfficientNet.from_pretrained('efficientnet-b0') 120 | """ 121 | 122 | def __init__(self, blocks_args=None, global_params=None): 123 | super().__init__() 124 | assert isinstance(blocks_args, list), 'blocks_args should be a list' 125 | assert len(blocks_args) > 0, 'block args must be greater than 0' 126 | self._global_params = global_params 127 | self._blocks_args = blocks_args 128 | 129 | # Get static or dynamic convolution depending on image size 130 | Conv2d = get_same_padding_conv2d(image_size=global_params.image_size) 131 | 132 | # Batch norm parameters 133 | bn_mom = 1 - self._global_params.batch_norm_momentum 134 | bn_eps = self._global_params.batch_norm_epsilon 135 | 136 | # Stem 137 | in_channels = 3 # rgb 138 | # number of output channels 139 | out_channels = round_filters(32, self._global_params) 140 | self._conv_stem = Conv2d( 141 | in_channels, out_channels, kernel_size=3, stride=2, bias=False) 142 | self._bn0 = nn.BatchNorm2d( 143 | num_features=out_channels, momentum=bn_mom, eps=bn_eps) 144 | 145 | # Build blocks 146 | self._blocks = nn.ModuleList([]) 147 | for i in range(len(self._blocks_args)): 148 | # Update block input and output filters based on depth multiplier. 149 | self._blocks_args[i] = self._blocks_args[i]._replace( 150 | input_filters=round_filters( 151 | self._blocks_args[i].input_filters, self._global_params), 152 | output_filters=round_filters( 153 | self._blocks_args[i].output_filters, self._global_params), 154 | num_repeat=round_repeats( 155 | self._blocks_args[i].num_repeat, self._global_params) 156 | ) 157 | 158 | # The first block needs to take care of stride and filter size increase. 
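            # Subsequent repeats of the same block reuse its output_filters as
            # input_filters and run with stride 1 (see the _replace call below).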
159 | self._blocks.append(MBConvBlock( 160 | self._blocks_args[i], self._global_params)) 161 | if self._blocks_args[i].num_repeat > 1: 162 | self._blocks_args[i] = self._blocks_args[i]._replace( 163 | input_filters=self._blocks_args[i].output_filters, stride=1) 164 | for _ in range(self._blocks_args[i].num_repeat - 1): 165 | self._blocks.append(MBConvBlock( 166 | self._blocks_args[i], self._global_params)) 167 | 168 | # Head'efficientdet-d0': 'efficientnet-b0', 169 | # output of final block 170 | in_channels = self._blocks_args[len( 171 | self._blocks_args)-1].output_filters 172 | out_channels = round_filters(1280, self._global_params) 173 | self._conv_head = Conv2d( 174 | in_channels, out_channels, kernel_size=1, bias=False) 175 | self._bn1 = nn.BatchNorm2d( 176 | num_features=out_channels, momentum=bn_mom, eps=bn_eps) 177 | 178 | # Final linear layer 179 | self._avg_pooling = nn.AdaptiveAvgPool2d(1) 180 | self._dropout = nn.Dropout(self._global_params.dropout_rate) 181 | self._fc = nn.Linear(out_channels, self._global_params.num_classes) 182 | self._swish = MemoryEfficientSwish() 183 | 184 | def set_swish(self, memory_efficient=True): 185 | """Sets swish function as memory efficient (for training) or standard (for export)""" 186 | self._swish = MemoryEfficientSwish() if memory_efficient else Swish() 187 | for block in self._blocks: 188 | block.set_swish(memory_efficient) 189 | 190 | def extract_features(self, inputs): 191 | """ Returns output of the final convolution layer """ 192 | # Stem 193 | x = self._swish(self._bn0(self._conv_stem(inputs))) 194 | 195 | P = [] 196 | index = 0 197 | num_repeat = 0 198 | # Blocks 199 | for idx, block in enumerate(self._blocks): 200 | drop_connect_rate = self._global_params.drop_connect_rate 201 | if drop_connect_rate: 202 | drop_connect_rate *= float(idx) / len(self._blocks) 203 | x = block(x, drop_connect_rate=drop_connect_rate) 204 | num_repeat = num_repeat + 1 205 | if(num_repeat == self._blocks_args[index].num_repeat): 206 | num_repeat = 0 207 | index = index + 1 208 | P.append(x) 209 | return P 210 | 211 | def forward(self, inputs): 212 | """ Calls extract_features to extract features, applies final linear layer, and returns logits. 
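            Note: in this detection backbone the final linear layer is not applied;
            the list of per-stage feature maps P from extract_features is returned instead.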
""" 213 | # Convolution layers 214 | P = self.extract_features(inputs) 215 | return P 216 | 217 | @classmethod 218 | def from_name(cls, model_name, override_params=None): 219 | cls._check_model_name_is_valid(model_name) 220 | blocks_args, global_params = get_model_params( 221 | model_name, override_params) 222 | return cls(blocks_args, global_params) 223 | 224 | @classmethod 225 | def from_pretrained(cls, model_name, num_classes=1000, in_channels=3): 226 | model = cls.from_name(model_name, override_params={ 227 | 'num_classes': num_classes}) 228 | load_pretrained_weights( 229 | model, model_name, load_fc=(num_classes == 1000)) 230 | if in_channels != 3: 231 | Conv2d = get_same_padding_conv2d( 232 | image_size=model._global_params.image_size) 233 | out_channels = round_filters(32, model._global_params) 234 | model._conv_stem = Conv2d( 235 | in_channels, out_channels, kernel_size=3, stride=2, bias=False) 236 | return model 237 | 238 | @classmethod 239 | def from_pretrained(cls, model_name, num_classes=1000): 240 | model = cls.from_name(model_name, override_params={ 241 | 'num_classes': num_classes}) 242 | load_pretrained_weights( 243 | model, model_name, load_fc=(num_classes == 1000)) 244 | 245 | return model 246 | 247 | @classmethod 248 | def get_image_size(cls, model_name): 249 | cls._check_model_name_is_valid(model_name) 250 | _, _, res, _ = efficientnet_params(model_name) 251 | return res 252 | 253 | @classmethod 254 | def _check_model_name_is_valid(cls, model_name, also_need_pretrained_weights=False): 255 | """ Validates model name. None that pretrained weights are only available for 256 | the first four models (efficientnet-b{i} for i in 0,1,2,3) at the moment. """ 257 | num_models = 4 if also_need_pretrained_weights else 8 258 | valid_models = ['efficientnet-b'+str(i) for i in range(num_models)] 259 | if model_name not in valid_models: 260 | raise ValueError('model_name should be one of: ' + 261 | ', '.join(valid_models)) 262 | 263 | def get_list_features(self): 264 | list_feature = [] 265 | for idx in range(len(self._blocks_args)): 266 | list_feature.append(self._blocks_args[idx].output_filters) 267 | 268 | return list_feature 269 | 270 | 271 | if __name__ == '__main__': 272 | model = EfficientNet.from_pretrained('efficientnet-b0') 273 | inputs = torch.randn(4, 3, 640, 640) 274 | P = model(inputs) 275 | for idx, p in enumerate(P): 276 | print('P{}: {}'.format(idx, p.size())) 277 | # print('model: ', model) 278 | -------------------------------------------------------------------------------- /models/losses.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | 5 | 6 | def calc_iou(a, b): 7 | area = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1]) 8 | 9 | iw = torch.min(torch.unsqueeze( 10 | a[:, 2], dim=1), b[:, 2]) - torch.max(torch.unsqueeze(a[:, 0], 1), b[:, 0]) 11 | ih = torch.min(torch.unsqueeze( 12 | a[:, 3], dim=1), b[:, 3]) - torch.max(torch.unsqueeze(a[:, 1], 1), b[:, 1]) 13 | 14 | iw = torch.clamp(iw, min=0) 15 | ih = torch.clamp(ih, min=0) 16 | 17 | ua = torch.unsqueeze((a[:, 2] - a[:, 0]) * 18 | (a[:, 3] - a[:, 1]), dim=1) + area - iw * ih 19 | 20 | ua = torch.clamp(ua, min=1e-8) 21 | 22 | intersection = iw * ih 23 | 24 | IoU = intersection / ua 25 | 26 | return IoU 27 | 28 | 29 | class FocalLoss(nn.Module): 30 | # def __init__(self): 31 | 32 | def forward(self, classifications, regressions, anchors, annotations): 33 | alpha = 0.25 34 | gamma = 2.0 35 | batch_size = 
classifications.shape[0] 36 | classification_losses = [] 37 | regression_losses = [] 38 | 39 | anchor = anchors[0, :, :] 40 | 41 | anchor_widths = anchor[:, 2] - anchor[:, 0] 42 | anchor_heights = anchor[:, 3] - anchor[:, 1] 43 | anchor_ctr_x = anchor[:, 0] + 0.5 * anchor_widths 44 | anchor_ctr_y = anchor[:, 1] + 0.5 * anchor_heights 45 | 46 | for j in range(batch_size): 47 | 48 | classification = classifications[j, :, :] 49 | regression = regressions[j, :, :] 50 | 51 | bbox_annotation = annotations[j, :, :] 52 | bbox_annotation = bbox_annotation[bbox_annotation[:, 4] != -1] 53 | 54 | if bbox_annotation.shape[0] == 0: 55 | regression_losses.append(torch.tensor(0).float().cuda()) 56 | classification_losses.append(torch.tensor(0).float().cuda()) 57 | 58 | continue 59 | 60 | classification = torch.clamp(classification, 1e-4, 1.0 - 1e-4) 61 | 62 | # num_anchors x num_annotations 63 | IoU = calc_iou(anchors[0, :, :], bbox_annotation[:, :4]) 64 | 65 | IoU_max, IoU_argmax = torch.max(IoU, dim=1) # num_anchors x 1 66 | 67 | #import pdb 68 | # pdb.set_trace() 69 | 70 | # compute the loss for classification 71 | targets = torch.ones(classification.shape) * -1 72 | targets = targets.cuda() 73 | 74 | targets[torch.lt(IoU_max, 0.4), :] = 0 75 | 76 | positive_indices = torch.ge(IoU_max, 0.5) 77 | 78 | num_positive_anchors = positive_indices.sum() 79 | 80 | assigned_annotations = bbox_annotation[IoU_argmax, :] 81 | 82 | targets[positive_indices, :] = 0 83 | targets[positive_indices, 84 | assigned_annotations[positive_indices, 4].long()] = 1 85 | 86 | alpha_factor = torch.ones(targets.shape).cuda() * alpha 87 | 88 | alpha_factor = torch.where( 89 | torch.eq(targets, 1.), alpha_factor, 1. - alpha_factor) 90 | focal_weight = torch.where( 91 | torch.eq(targets, 1.), 1. 
- classification, classification) 92 | focal_weight = alpha_factor * torch.pow(focal_weight, gamma) 93 | 94 | bce = -(targets * torch.log(classification) + 95 | (1.0 - targets) * torch.log(1.0 - classification)) 96 | 97 | # cls_loss = focal_weight * torch.pow(bce, gamma) 98 | cls_loss = focal_weight * bce 99 | 100 | cls_loss = torch.where( 101 | torch.ne(targets, -1.0), cls_loss, torch.zeros(cls_loss.shape).cuda()) 102 | 103 | classification_losses.append( 104 | cls_loss.sum()/torch.clamp(num_positive_anchors.float(), min=1.0)) 105 | 106 | # compute the loss for regression 107 | 108 | if positive_indices.sum() > 0: 109 | assigned_annotations = assigned_annotations[positive_indices, :] 110 | 111 | anchor_widths_pi = anchor_widths[positive_indices] 112 | anchor_heights_pi = anchor_heights[positive_indices] 113 | anchor_ctr_x_pi = anchor_ctr_x[positive_indices] 114 | anchor_ctr_y_pi = anchor_ctr_y[positive_indices] 115 | 116 | gt_widths = assigned_annotations[:, 117 | 2] - assigned_annotations[:, 0] 118 | gt_heights = assigned_annotations[:, 119 | 3] - assigned_annotations[:, 1] 120 | gt_ctr_x = assigned_annotations[:, 0] + 0.5 * gt_widths 121 | gt_ctr_y = assigned_annotations[:, 1] + 0.5 * gt_heights 122 | 123 | # clip widths to 1 124 | gt_widths = torch.clamp(gt_widths, min=1) 125 | gt_heights = torch.clamp(gt_heights, min=1) 126 | 127 | targets_dx = (gt_ctr_x - anchor_ctr_x_pi) / anchor_widths_pi 128 | targets_dy = (gt_ctr_y - anchor_ctr_y_pi) / anchor_heights_pi 129 | targets_dw = torch.log(gt_widths / anchor_widths_pi) 130 | targets_dh = torch.log(gt_heights / anchor_heights_pi) 131 | 132 | targets = torch.stack( 133 | (targets_dx, targets_dy, targets_dw, targets_dh)) 134 | targets = targets.t() 135 | 136 | targets = targets/torch.Tensor([[0.1, 0.1, 0.2, 0.2]]).cuda() 137 | 138 | negative_indices = 1 + (~positive_indices) 139 | 140 | regression_diff = torch.abs( 141 | targets - regression[positive_indices, :]) 142 | 143 | regression_loss = torch.where( 144 | torch.le(regression_diff, 1.0 / 9.0), 145 | 0.5 * 9.0 * torch.pow(regression_diff, 2), 146 | regression_diff - 0.5 / 9.0 147 | ) 148 | regression_losses.append(regression_loss.mean()) 149 | else: 150 | regression_losses.append(torch.tensor(0).float().cuda()) 151 | 152 | return torch.stack(classification_losses).mean(dim=0, keepdim=True), torch.stack(regression_losses).mean(dim=0, keepdim=True) 153 | -------------------------------------------------------------------------------- /models/module.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import torch 4 | import warnings 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | 8 | 9 | class BBoxTransform(nn.Module): 10 | 11 | def __init__(self, mean=None, std=None): 12 | super(BBoxTransform, self).__init__() 13 | if mean is None: 14 | self.mean = torch.from_numpy( 15 | np.array([0, 0, 0, 0]).astype(np.float32)) 16 | else: 17 | self.mean = mean 18 | if std is None: 19 | self.std = torch.from_numpy( 20 | np.array([0.1, 0.1, 0.2, 0.2]).astype(np.float32)) 21 | else: 22 | self.std = std 23 | 24 | def forward(self, boxes, deltas): 25 | 26 | widths = boxes[:, :, 2] - boxes[:, :, 0] 27 | heights = boxes[:, :, 3] - boxes[:, :, 1] 28 | ctr_x = boxes[:, :, 0] + 0.5 * widths 29 | ctr_y = boxes[:, :, 1] + 0.5 * heights 30 | 31 | dx = deltas[:, :, 0] * self.std[0] + self.mean[0] 32 | dy = deltas[:, :, 1] * self.std[1] + self.mean[1] 33 | dw = deltas[:, :, 2] * self.std[2] + self.mean[2] 34 | dh = deltas[:, :, 3] * 
self.std[3] + self.mean[3] 35 | 36 | pred_ctr_x = ctr_x + dx * widths 37 | pred_ctr_y = ctr_y + dy * heights 38 | pred_w = torch.exp(dw) * widths 39 | pred_h = torch.exp(dh) * heights 40 | 41 | pred_boxes_x1 = pred_ctr_x - 0.5 * pred_w 42 | pred_boxes_y1 = pred_ctr_y - 0.5 * pred_h 43 | pred_boxes_x2 = pred_ctr_x + 0.5 * pred_w 44 | pred_boxes_y2 = pred_ctr_y + 0.5 * pred_h 45 | 46 | pred_boxes = torch.stack( 47 | [pred_boxes_x1, pred_boxes_y1, pred_boxes_x2, pred_boxes_y2], dim=2) 48 | 49 | return pred_boxes 50 | 51 | 52 | class ClipBoxes(nn.Module): 53 | 54 | def __init__(self, width=None, height=None): 55 | super(ClipBoxes, self).__init__() 56 | 57 | def forward(self, boxes, img): 58 | 59 | batch_size, num_channels, height, width = img.shape 60 | 61 | boxes[:, :, 0] = torch.clamp(boxes[:, :, 0], min=0) 62 | boxes[:, :, 1] = torch.clamp(boxes[:, :, 1], min=0) 63 | 64 | boxes[:, :, 2] = torch.clamp(boxes[:, :, 2], max=width) 65 | boxes[:, :, 3] = torch.clamp(boxes[:, :, 3], max=height) 66 | 67 | return boxes 68 | 69 | 70 | class RegressionModel(nn.Module): 71 | def __init__(self, num_features_in, num_anchors=9, feature_size=256): 72 | super(RegressionModel, self).__init__() 73 | 74 | self.conv1 = nn.Conv2d( 75 | num_features_in, feature_size, kernel_size=3, padding=1) 76 | self.act1 = nn.ReLU() 77 | self.conv2 = nn.Conv2d(feature_size, feature_size, 78 | kernel_size=3, padding=1) 79 | self.act2 = nn.ReLU() 80 | self.conv3 = nn.Conv2d(feature_size, feature_size, 81 | kernel_size=3, padding=1) 82 | self.act3 = nn.ReLU() 83 | self.conv4 = nn.Conv2d(feature_size, feature_size, 84 | kernel_size=3, padding=1) 85 | self.act4 = nn.ReLU() 86 | self.output = nn.Conv2d( 87 | feature_size, num_anchors*4, kernel_size=3, padding=1) 88 | 89 | def forward(self, x): 90 | out = self.conv1(x) 91 | out = self.act1(out) 92 | out = self.conv2(out) 93 | out = self.act2(out) 94 | out = self.conv3(out) 95 | out = self.act3(out) 96 | out = self.conv4(out) 97 | out = self.act4(out) 98 | out = self.output(out) 99 | # out is B x C x W x H, with C = 4*num_anchors 100 | out = out.permute(0, 2, 3, 1) 101 | return out.contiguous().view(out.shape[0], -1, 4) 102 | 103 | 104 | class ClassificationModel(nn.Module): 105 | def __init__(self, num_features_in, num_anchors=9, num_classes=80, prior=0.01, feature_size=256): 106 | super(ClassificationModel, self).__init__() 107 | self.num_classes = num_classes 108 | self.num_anchors = num_anchors 109 | 110 | self.conv1 = nn.Conv2d( 111 | num_features_in, feature_size, kernel_size=3, padding=1) 112 | self.act1 = nn.ReLU() 113 | self.conv2 = nn.Conv2d(feature_size, feature_size, 114 | kernel_size=3, padding=1) 115 | self.act2 = nn.ReLU() 116 | self.conv3 = nn.Conv2d(feature_size, feature_size, 117 | kernel_size=3, padding=1) 118 | self.act3 = nn.ReLU() 119 | self.conv4 = nn.Conv2d(feature_size, feature_size, 120 | kernel_size=3, padding=1) 121 | self.act4 = nn.ReLU() 122 | self.output = nn.Conv2d( 123 | feature_size, num_anchors*num_classes, kernel_size=3, padding=1) 124 | self.output_act = nn.Sigmoid() 125 | 126 | def forward(self, x): 127 | out = self.conv1(x) 128 | out = self.act1(out) 129 | out = self.conv2(out) 130 | out = self.act2(out) 131 | out = self.conv3(out) 132 | out = self.act3(out) 133 | out = self.conv4(out) 134 | out = self.act4(out) 135 | out = self.output(out) 136 | out = self.output_act(out) 137 | # out is B x C x W x H, with C = n_classes + n_anchors 138 | out1 = out.permute(0, 2, 3, 1) 139 | batch_size, width, height, channels = out1.shape 140 | out2 = 
out1.view(batch_size, width, height, 141 | self.num_anchors, self.num_classes) 142 | return out2.contiguous().view(x.shape[0], -1, self.num_classes) 143 | 144 | 145 | class Anchors(nn.Module): 146 | def __init__(self, pyramid_levels=None, strides=None, sizes=None, ratios=None, scales=None): 147 | super(Anchors, self).__init__() 148 | 149 | if pyramid_levels is None: 150 | self.pyramid_levels = [3, 4, 5, 6, 7] 151 | if strides is None: 152 | self.strides = [2 ** x for x in self.pyramid_levels] 153 | if sizes is None: 154 | self.sizes = [2 ** (x + 2) for x in self.pyramid_levels] 155 | if ratios is None: 156 | self.ratios = np.array([0.5, 1, 2]) 157 | if scales is None: 158 | self.scales = np.array( 159 | [2 ** 0, 2 ** (1.0 / 3.0), 2 ** (2.0 / 3.0)]) 160 | 161 | def forward(self, image): 162 | 163 | image_shape = image.shape[2:] 164 | image_shape = np.array(image_shape) 165 | image_shapes = [(image_shape + 2 ** x - 1) // (2 ** x) 166 | for x in self.pyramid_levels] 167 | 168 | # compute anchors over all pyramid levels 169 | all_anchors = np.zeros((0, 4)).astype(np.float32) 170 | 171 | for idx, p in enumerate(self.pyramid_levels): 172 | anchors = generate_anchors( 173 | base_size=self.sizes[idx], ratios=self.ratios, scales=self.scales) 174 | shifted_anchors = shift( 175 | image_shapes[idx], self.strides[idx], anchors) 176 | all_anchors = np.append(all_anchors, shifted_anchors, axis=0) 177 | 178 | all_anchors = np.expand_dims(all_anchors, axis=0) 179 | 180 | return torch.from_numpy(all_anchors.astype(np.float32)).to(image.device) 181 | 182 | 183 | def generate_anchors(base_size=16, ratios=None, scales=None): 184 | """ 185 | Generate anchor (reference) windows by enumerating aspect ratios X 186 | scales w.r.t. a reference window. 187 | """ 188 | 189 | if ratios is None: 190 | ratios = np.array([0.5, 1, 2]) 191 | 192 | if scales is None: 193 | scales = np.array([2 ** 0, 2 ** (1.0 / 3.0), 2 ** (2.0 / 3.0)]) 194 | 195 | num_anchors = len(ratios) * len(scales) 196 | 197 | # initialize output anchors 198 | anchors = np.zeros((num_anchors, 4)) 199 | 200 | # scale base_size 201 | anchors[:, 2:] = base_size * np.tile(scales, (2, len(ratios))).T 202 | 203 | # compute areas of anchors 204 | areas = anchors[:, 2] * anchors[:, 3] 205 | 206 | # correct for ratios 207 | anchors[:, 2] = np.sqrt(areas / np.repeat(ratios, len(scales))) 208 | anchors[:, 3] = anchors[:, 2] * np.repeat(ratios, len(scales)) 209 | 210 | # transform from (x_ctr, y_ctr, w, h) -> (x1, y1, x2, y2) 211 | anchors[:, 0::2] -= np.tile(anchors[:, 2] * 0.5, (2, 1)).T 212 | anchors[:, 1::2] -= np.tile(anchors[:, 3] * 0.5, (2, 1)).T 213 | 214 | return anchors 215 | 216 | 217 | def compute_shape(image_shape, pyramid_levels): 218 | """Compute shapes based on pyramid levels. 
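    The shape at pyramid level x is ceil(image_shape / 2 ** x), computed below
    with integer arithmetic as (image_shape + 2 ** x - 1) // (2 ** x).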
219 | :param image_shape: 220 | :param pyramid_levels: 221 | :return: 222 | """ 223 | image_shape = np.array(image_shape[:2]) 224 | image_shapes = [(image_shape + 2 ** x - 1) // (2 ** x) 225 | for x in pyramid_levels] 226 | return image_shapes 227 | 228 | 229 | def anchors_for_shape( 230 | image_shape, 231 | pyramid_levels=None, 232 | ratios=None, 233 | scales=None, 234 | strides=None, 235 | sizes=None, 236 | shapes_callback=None, 237 | ): 238 | 239 | image_shapes = compute_shape(image_shape, pyramid_levels) 240 | 241 | # compute anchors over all pyramid levels 242 | all_anchors = np.zeros((0, 4)) 243 | for idx, p in enumerate(pyramid_levels): 244 | anchors = generate_anchors( 245 | base_size=sizes[idx], ratios=ratios, scales=scales) 246 | shifted_anchors = shift(image_shapes[idx], strides[idx], anchors) 247 | all_anchors = np.append(all_anchors, shifted_anchors, axis=0) 248 | 249 | return all_anchors 250 | 251 | 252 | def shift(shape, stride, anchors): 253 | shift_x = (np.arange(0, shape[1]) + 0.5) * stride 254 | shift_y = (np.arange(0, shape[0]) + 0.5) * stride 255 | 256 | shift_x, shift_y = np.meshgrid(shift_x, shift_y) 257 | 258 | shifts = np.vstack(( 259 | shift_x.ravel(), shift_y.ravel(), 260 | shift_x.ravel(), shift_y.ravel() 261 | )).transpose() 262 | 263 | # add A anchors (1, A, 4) to 264 | # cell K shifts (K, 1, 4) to get 265 | # shift anchors (K, A, 4) 266 | # reshape to (K*A, 4) shifted anchors 267 | A = anchors.shape[0] 268 | K = shifts.shape[0] 269 | all_anchors = (anchors.reshape((1, A, 4)) + 270 | shifts.reshape((1, K, 4)).transpose((1, 0, 2))) 271 | all_anchors = all_anchors.reshape((K * A, 4)) 272 | 273 | return all_anchors 274 | 275 | 276 | def conv_ws_2d(input, 277 | weight, 278 | bias=None, 279 | stride=1, 280 | padding=0, 281 | dilation=1, 282 | groups=1, 283 | eps=1e-5): 284 | c_in = weight.size(0) 285 | weight_flat = weight.view(c_in, -1) 286 | mean = weight_flat.mean(dim=1, keepdim=True).view(c_in, 1, 1, 1) 287 | std = weight_flat.std(dim=1, keepdim=True).view(c_in, 1, 1, 1) 288 | weight = (weight - mean) / (std + eps) 289 | return F.conv2d(input, weight, bias, stride, padding, dilation, groups) 290 | 291 | 292 | class ConvWS2d(nn.Conv2d): 293 | def __init__(self, 294 | in_channels, 295 | out_channels, 296 | kernel_size, 297 | stride=1, 298 | padding=0, 299 | dilation=1, 300 | groups=1, 301 | bias=True, 302 | eps=1e-5): 303 | super(ConvWS2d, self).__init__( 304 | in_channels, 305 | out_channels, 306 | kernel_size, 307 | stride=stride, 308 | padding=padding, 309 | dilation=dilation, 310 | groups=groups, 311 | bias=bias) 312 | self.eps = eps 313 | 314 | def forward(self, x): 315 | return conv_ws_2d(x, self.weight, self.bias, self.stride, self.padding, 316 | self.dilation, self.groups, self.eps) 317 | 318 | 319 | conv_cfg = { 320 | 'Conv': nn.Conv2d, 321 | 'ConvWS': ConvWS2d, 322 | # TODO: octave conv 323 | } 324 | 325 | 326 | def build_conv_layer(cfg, *args, **kwargs): 327 | """ Build convolution layer 328 | Args: 329 | cfg (None or dict): cfg should contain: 330 | type (str): identify conv layer type. 331 | layer args: args needed to instantiate a conv layer. 
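    Example (illustrative):
        conv = build_conv_layer(dict(type='ConvWS'), 64, 128, 3, padding=1)
        conv = build_conv_layer(None, 64, 128, 3, padding=1)  # falls back to nn.Conv2d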
332 | Returns: 333 | layer (nn.Module): created conv layer 334 | """ 335 | if cfg is None: 336 | cfg_ = dict(type='Conv') 337 | else: 338 | assert isinstance(cfg, dict) and 'type' in cfg 339 | cfg_ = cfg.copy() 340 | 341 | layer_type = cfg_.pop('type') 342 | if layer_type not in conv_cfg: 343 | raise KeyError('Unrecognized norm type {}'.format(layer_type)) 344 | else: 345 | conv_layer = conv_cfg[layer_type] 346 | 347 | layer = conv_layer(*args, **kwargs, **cfg_) 348 | 349 | return layer 350 | 351 | 352 | norm_cfg = { 353 | # format: layer_type: (abbreviation, module) 354 | 'BN': ('bn', nn.BatchNorm2d), 355 | 'SyncBN': ('bn', nn.SyncBatchNorm), 356 | 'GN': ('gn', nn.GroupNorm), 357 | # and potentially 'SN' 358 | } 359 | 360 | 361 | def build_norm_layer(cfg, num_features, postfix=''): 362 | """ Build normalization layer 363 | Args: 364 | cfg (dict): cfg should contain: 365 | type (str): identify norm layer type. 366 | layer args: args needed to instantiate a norm layer. 367 | requires_grad (bool): [optional] whether stop gradient updates 368 | num_features (int): number of channels from input. 369 | postfix (int, str): appended into norm abbreviation to 370 | create named layer. 371 | Returns: 372 | name (str): abbreviation + postfix 373 | layer (nn.Module): created norm layer 374 | """ 375 | assert isinstance(cfg, dict) and 'type' in cfg 376 | cfg_ = cfg.copy() 377 | 378 | layer_type = cfg_.pop('type') 379 | if layer_type not in norm_cfg: 380 | raise KeyError('Unrecognized norm type {}'.format(layer_type)) 381 | else: 382 | abbr, norm_layer = norm_cfg[layer_type] 383 | if norm_layer is None: 384 | raise NotImplementedError 385 | 386 | assert isinstance(postfix, (int, str)) 387 | name = abbr + str(postfix) 388 | 389 | requires_grad = cfg_.pop('requires_grad', True) 390 | cfg_.setdefault('eps', 1e-5) 391 | if layer_type != 'GN': 392 | layer = norm_layer(num_features, **cfg_) 393 | if layer_type == 'SyncBN': 394 | layer._specify_ddp_gpu_num(1) 395 | else: 396 | assert 'num_groups' in cfg_ 397 | layer = norm_layer(num_channels=num_features, **cfg_) 398 | 399 | for param in layer.parameters(): 400 | param.requires_grad = requires_grad 401 | 402 | return name, layer 403 | 404 | 405 | class ConvModule(nn.Module): 406 | """A conv block that contains conv/norm/activation layers. 407 | Args: 408 | in_channels (int): Same as nn.Conv2d. 409 | out_channels (int): Same as nn.Conv2d. 410 | kernel_size (int or tuple[int]): Same as nn.Conv2d. 411 | stride (int or tuple[int]): Same as nn.Conv2d. 412 | padding (int or tuple[int]): Same as nn.Conv2d. 413 | dilation (int or tuple[int]): Same as nn.Conv2d. 414 | groups (int): Same as nn.Conv2d. 415 | bias (bool or str): If specified as `auto`, it will be decided by the 416 | norm_cfg. Bias will be set as True if norm_cfg is None, otherwise 417 | False. 418 | conv_cfg (dict): Config dict for convolution layer. 419 | norm_cfg (dict): Config dict for normalization layer. 420 | activation (str or None): Activation type, "ReLU" by default. 421 | inplace (bool): Whether to use inplace mode for activation. 422 | order (tuple[str]): The order of conv/norm/activation layers. It is a 423 | sequence of "conv", "norm" and "act". Examples are 424 | ("conv", "norm", "act") and ("act", "conv", "norm"). 
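    Example (illustrative):
        conv = ConvModule(32, 64, 3, padding=1, norm_cfg=dict(type='BN'), activation='relu')
        out = conv(torch.randn(1, 32, 56, 56))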
425 | """ 426 | 427 | def __init__(self, 428 | in_channels, 429 | out_channels, 430 | kernel_size, 431 | stride=1, 432 | padding=0, 433 | dilation=1, 434 | groups=1, 435 | bias='auto', 436 | conv_cfg=None, 437 | norm_cfg=None, 438 | activation='relu', 439 | inplace=True, 440 | order=('conv', 'norm', 'act')): 441 | super(ConvModule, self).__init__() 442 | assert conv_cfg is None or isinstance(conv_cfg, dict) 443 | assert norm_cfg is None or isinstance(norm_cfg, dict) 444 | self.conv_cfg = conv_cfg 445 | self.norm_cfg = norm_cfg 446 | self.activation = activation 447 | self.inplace = inplace 448 | self.order = order 449 | assert isinstance(self.order, tuple) and len(self.order) == 3 450 | assert set(order) == set(['conv', 'norm', 'act']) 451 | 452 | self.with_norm = norm_cfg is not None 453 | self.with_activatation = activation is not None 454 | # if the conv layer is before a norm layer, bias is unnecessary. 455 | if bias == 'auto': 456 | bias = False if self.with_norm else True 457 | self.with_bias = bias 458 | 459 | if self.with_norm and self.with_bias: 460 | warnings.warn('ConvModule has norm and bias at the same time') 461 | 462 | # build convolution layer 463 | self.conv = build_conv_layer( 464 | conv_cfg, 465 | in_channels, 466 | out_channels, 467 | kernel_size, 468 | stride=stride, 469 | padding=padding, 470 | dilation=dilation, 471 | groups=groups, 472 | bias=bias) 473 | # export the attributes of self.conv to a higher level for convenience 474 | self.in_channels = self.conv.in_channels 475 | self.out_channels = self.conv.out_channels 476 | self.kernel_size = self.conv.kernel_size 477 | self.stride = self.conv.stride 478 | self.padding = self.conv.padding 479 | self.dilation = self.conv.dilation 480 | self.transposed = self.conv.transposed 481 | self.output_padding = self.conv.output_padding 482 | self.groups = self.conv.groups 483 | 484 | # build normalization layers 485 | if self.with_norm: 486 | # norm layer is after conv layer 487 | if order.index('norm') > order.index('conv'): 488 | norm_channels = out_channels 489 | else: 490 | norm_channels = in_channels 491 | self.norm_name, norm = build_norm_layer(norm_cfg, norm_channels) 492 | self.add_module(self.norm_name, norm) 493 | 494 | # build activation layer 495 | if self.with_activatation: 496 | # TODO: introduce `act_cfg` and supports more activation layers 497 | if self.activation not in ['relu']: 498 | raise ValueError('{} is currently not supported.'.format( 499 | self.activation)) 500 | if self.activation == 'relu': 501 | self.activate = nn.ReLU(inplace=inplace) 502 | 503 | @property 504 | def norm(self): 505 | return getattr(self, self.norm_name) 506 | 507 | def forward(self, x, activate=True, norm=True): 508 | for layer in self.order: 509 | if layer == 'conv': 510 | x = self.conv(x) 511 | elif layer == 'norm' and norm and self.with_norm: 512 | x = self.norm(x) 513 | elif layer == 'act' and activate and self.with_activatation: 514 | x = self.activate(x) 515 | return x 516 | 517 | 518 | def xavier_init(module, gain=1, bias=0, distribution='normal'): 519 | assert distribution in ['uniform', 'normal'] 520 | if distribution == 'uniform': 521 | nn.init.xavier_uniform_(module.weight, gain=gain) 522 | else: 523 | nn.init.xavier_normal_(module.weight, gain=gain) 524 | if hasattr(module, 'bias'): 525 | nn.init.constant_(module.bias, bias) 526 | 527 | 528 | def normal_init(module, mean=0, std=1, bias=0): 529 | nn.init.normal_(module.weight, mean, std) 530 | if hasattr(module, 'bias'): 531 | nn.init.constant_(module.bias, bias) 532 | 
533 | 534 | def uniform_init(module, a=0, b=1, bias=0): 535 | nn.init.uniform_(module.weight, a, b) 536 | if hasattr(module, 'bias'): 537 | nn.init.constant_(module.bias, bias) 538 | 539 | 540 | def kaiming_init(module, 541 | mode='fan_out', 542 | nonlinearity='relu', 543 | bias=0, 544 | distribution='normal'): 545 | assert distribution in ['uniform', 'normal'] 546 | if distribution == 'uniform': 547 | nn.init.kaiming_uniform_( 548 | module.weight, mode=mode, nonlinearity=nonlinearity) 549 | else: 550 | nn.init.kaiming_normal_( 551 | module.weight, mode=mode, nonlinearity=nonlinearity) 552 | if hasattr(module, 'bias'): 553 | nn.init.constant_(module.bias, bias) 554 | 555 | 556 | def bias_init_with_prob(prior_prob): 557 | """ initialize conv/fc bias value according to giving probablity""" 558 | bias_init = float(-np.log((1 - prior_prob) / prior_prob)) 559 | return bias_init 560 | -------------------------------------------------------------------------------- /models/retinahead.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | 3 | import numpy as np 4 | import torch.nn as nn 5 | 6 | from .module import ConvModule, bias_init_with_prob, normal_init 7 | from six.moves import map, zip 8 | 9 | 10 | def multi_apply(func, *args, **kwargs): 11 | pfunc = partial(func, **kwargs) if kwargs else func 12 | map_results = map(pfunc, *args) 13 | return tuple(map(list, zip(*map_results))) 14 | 15 | 16 | class RetinaHead(nn.Module): 17 | """ 18 | An anchor-based head used in [1]_. 19 | The head contains two subnetworks. The first classifies anchor boxes and 20 | the second regresses deltas for the anchors. 21 | References: 22 | .. [1] https://arxiv.org/pdf/1708.02002.pdf 23 | Example: 24 | >>> import torch 25 | >>> self = RetinaHead(11, 7) 26 | >>> x = torch.rand(1, 7, 32, 32) 27 | >>> cls_score, bbox_pred = self.forward_single(x) 28 | >>> # Each anchor predicts a score for each class except background 29 | >>> cls_per_anchor = cls_score.shape[1] / self.num_anchors 30 | >>> box_per_anchor = bbox_pred.shape[1] / self.num_anchors 31 | >>> assert cls_per_anchor == (self.num_classes - 1) 32 | >>> assert box_per_anchor == 4 33 | """ 34 | 35 | def __init__(self, 36 | num_classes, 37 | in_channels, 38 | feat_channels=256, 39 | anchor_scales=[8, 16, 32], 40 | anchor_ratios=[0.5, 1.0, 2.0], 41 | anchor_strides=[4, 8, 16, 32, 64], 42 | stacked_convs=4, 43 | octave_base_scale=4, 44 | scales_per_octave=3, 45 | conv_cfg=None, 46 | norm_cfg=None, 47 | **kwargs): 48 | super(RetinaHead, self).__init__() 49 | self.in_channels = in_channels 50 | self.num_classes = num_classes 51 | self.feat_channels = feat_channels 52 | self.anchor_scales = anchor_scales 53 | self.anchor_ratios = anchor_ratios 54 | self.anchor_strides = anchor_strides 55 | self.stacked_convs = stacked_convs 56 | self.octave_base_scale = octave_base_scale 57 | self.scales_per_octave = scales_per_octave 58 | self.conv_cfg = conv_cfg 59 | self.norm_cfg = norm_cfg 60 | octave_scales = np.array( 61 | [2**(i / scales_per_octave) for i in range(scales_per_octave)]) 62 | anchor_scales = octave_scales * octave_base_scale 63 | self.cls_out_channels = num_classes 64 | self.num_anchors = len(self.anchor_ratios) * len(self.anchor_scales) 65 | self._init_layers() 66 | 67 | def _init_layers(self): 68 | self.relu = nn.ReLU(inplace=True) 69 | self.cls_convs = nn.ModuleList() 70 | self.reg_convs = nn.ModuleList() 71 | for i in range(self.stacked_convs): 72 | chn = self.in_channels if i == 0 else 
self.feat_channels 73 | self.cls_convs.append( 74 | ConvModule( 75 | chn, 76 | self.feat_channels, 77 | 3, 78 | stride=1, 79 | padding=1, 80 | conv_cfg=self.conv_cfg, 81 | norm_cfg=self.norm_cfg)) 82 | self.reg_convs.append( 83 | ConvModule( 84 | chn, 85 | self.feat_channels, 86 | 3, 87 | stride=1, 88 | padding=1, 89 | conv_cfg=self.conv_cfg, 90 | norm_cfg=self.norm_cfg)) 91 | self.retina_cls = nn.Conv2d( 92 | self.feat_channels, 93 | self.num_anchors * self.cls_out_channels, 94 | 3, 95 | padding=1) 96 | self.retina_reg = nn.Conv2d( 97 | self.feat_channels, self.num_anchors * 4, 3, padding=1) 98 | self.output_act = nn.Sigmoid() 99 | 100 | def init_weights(self): 101 | for m in self.cls_convs: 102 | normal_init(m.conv, std=0.01) 103 | for m in self.reg_convs: 104 | normal_init(m.conv, std=0.01) 105 | bias_cls = bias_init_with_prob(0.01) 106 | normal_init(self.retina_cls, std=0.01, bias=bias_cls) 107 | normal_init(self.retina_reg, std=0.01) 108 | 109 | def forward_single(self, x): 110 | cls_feat = x 111 | reg_feat = x 112 | for cls_conv in self.cls_convs: 113 | cls_feat = cls_conv(cls_feat) 114 | for reg_conv in self.reg_convs: 115 | reg_feat = reg_conv(reg_feat) 116 | 117 | cls_score = self.retina_cls(cls_feat) 118 | cls_score = self.output_act(cls_score) 119 | # out is B x C x W x H, with C = n_classes + n_anchors 120 | cls_score = cls_score.permute(0, 2, 3, 1) 121 | batch_size, width, height, channels = cls_score.shape 122 | cls_score = cls_score.view( 123 | batch_size, width, height, self.num_anchors, self.num_classes) 124 | cls_score = cls_score.contiguous().view(x.size(0), -1, self.num_classes) 125 | 126 | bbox_pred = self.retina_reg(reg_feat) 127 | bbox_pred = bbox_pred.permute(0, 2, 3, 1) 128 | bbox_pred = bbox_pred.contiguous().view(bbox_pred.size(0), -1, 4) 129 | return cls_score, bbox_pred 130 | 131 | def forward(self, feats): 132 | return multi_apply(self.forward_single, feats) 133 | -------------------------------------------------------------------------------- /models/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | import math 3 | import collections 4 | from functools import partial 5 | import torch 6 | from torch import nn 7 | from torch.nn import functional as F 8 | from torch.utils import model_zoo 9 | 10 | ######################################################################## 11 | ############### HELPERS FUNCTIONS FOR MODEL ARCHITECTURE ############### 12 | ######################################################################## 13 | 14 | 15 | # Parameters for the entire model (stem, all blocks, and head) 16 | GlobalParams = collections.namedtuple('GlobalParams', [ 17 | 'batch_norm_momentum', 'batch_norm_epsilon', 'dropout_rate', 18 | 'num_classes', 'width_coefficient', 'depth_coefficient', 19 | 'depth_divisor', 'min_depth', 'drop_connect_rate', 'image_size']) 20 | 21 | # Parameters for an individual model block 22 | BlockArgs = collections.namedtuple('BlockArgs', [ 23 | 'kernel_size', 'num_repeat', 'input_filters', 'output_filters', 24 | 'expand_ratio', 'id_skip', 'stride', 'se_ratio']) 25 | 26 | # Change namedtuple defaults 27 | GlobalParams.__new__.__defaults__ = (None,) * len(GlobalParams._fields) 28 | BlockArgs.__new__.__defaults__ = (None,) * len(BlockArgs._fields) 29 | 30 | 31 | class SwishImplementation(torch.autograd.Function): 32 | @staticmethod 33 | def forward(ctx, i): 34 | result = i * torch.sigmoid(i) 35 | ctx.save_for_backward(i) 36 | return result 37 | 38 | @staticmethod 39 | def backward(ctx, 
grad_output): 40 | i = ctx.saved_variables[0] 41 | sigmoid_i = torch.sigmoid(i) 42 | return grad_output * (sigmoid_i * (1 + i * (1 - sigmoid_i))) 43 | 44 | 45 | class MemoryEfficientSwish(nn.Module): 46 | def forward(self, x): 47 | return SwishImplementation.apply(x) 48 | 49 | 50 | class Swish(nn.Module): 51 | def forward(self, x): 52 | return x * torch.sigmoid(x) 53 | 54 | 55 | def round_filters(filters, global_params): 56 | """ Calculate and round number of filters based on depth multiplier. """ 57 | multiplier = global_params.width_coefficient 58 | if not multiplier: 59 | return filters 60 | divisor = global_params.depth_divisor 61 | min_depth = global_params.min_depth 62 | filters *= multiplier 63 | min_depth = min_depth or divisor 64 | new_filters = max(min_depth, int( 65 | filters + divisor / 2) // divisor * divisor) 66 | if new_filters < 0.9 * filters: # prevent rounding by more than 10% 67 | new_filters += divisor 68 | return int(new_filters) 69 | 70 | 71 | def round_repeats(repeats, global_params): 72 | """ Round number of filters based on depth multiplier. """ 73 | multiplier = global_params.depth_coefficient 74 | if not multiplier: 75 | return repeats 76 | return int(math.ceil(multiplier * repeats)) 77 | 78 | 79 | def drop_connect(inputs, p, training): 80 | """ Drop connect. """ 81 | if not training: 82 | return inputs 83 | batch_size = inputs.shape[0] 84 | keep_prob = 1 - p 85 | random_tensor = keep_prob 86 | random_tensor += torch.rand([batch_size, 1, 1, 1], 87 | dtype=inputs.dtype, device=inputs.device) 88 | binary_tensor = torch.floor(random_tensor) 89 | output = inputs / keep_prob * binary_tensor 90 | return output 91 | 92 | 93 | def get_same_padding_conv2d(image_size=None): 94 | """ Chooses static padding if you have specified an image size, and dynamic padding otherwise. 95 | Static padding is necessary for ONNX exporting of models. 
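    Example (illustrative):
        Conv2d = get_same_padding_conv2d(image_size=None)  # -> Conv2dDynamicSamePadding
        Conv2d = get_same_padding_conv2d(image_size=224)   # -> partial(Conv2dStaticSamePadding, image_size=224)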
""" 96 | if image_size is None: 97 | return Conv2dDynamicSamePadding 98 | else: 99 | return partial(Conv2dStaticSamePadding, image_size=image_size) 100 | 101 | 102 | class Conv2dDynamicSamePadding(nn.Conv2d): 103 | """ 2D Convolutions like TensorFlow, for a dynamic image size """ 104 | 105 | def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, groups=1, bias=True): 106 | super().__init__(in_channels, out_channels, 107 | kernel_size, stride, 0, dilation, groups, bias) 108 | self.stride = self.stride if len(self.stride) == 2 else [ 109 | self.stride[0]] * 2 110 | 111 | def forward(self, x): 112 | ih, iw = x.size()[-2:] 113 | kh, kw = self.weight.size()[-2:] 114 | sh, sw = self.stride 115 | oh, ow = math.ceil(ih / sh), math.ceil(iw / sw) 116 | pad_h = max((oh - 1) * self.stride[0] + 117 | (kh - 1) * self.dilation[0] + 1 - ih, 0) 118 | pad_w = max((ow - 1) * self.stride[1] + 119 | (kw - 1) * self.dilation[1] + 1 - iw, 0) 120 | if pad_h > 0 or pad_w > 0: 121 | x = F.pad(x, [pad_w // 2, pad_w - pad_w // 122 | 2, pad_h // 2, pad_h - pad_h // 2]) 123 | return F.conv2d(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups) 124 | 125 | 126 | class Conv2dStaticSamePadding(nn.Conv2d): 127 | """ 2D Convolutions like TensorFlow, for a fixed image size""" 128 | 129 | def __init__(self, in_channels, out_channels, kernel_size, image_size=None, **kwargs): 130 | super().__init__(in_channels, out_channels, kernel_size, **kwargs) 131 | self.stride = self.stride if len(self.stride) == 2 else [ 132 | self.stride[0]] * 2 133 | 134 | # Calculate padding based on image size and save it 135 | assert image_size is not None 136 | ih, iw = image_size if type(image_size) == list else [ 137 | image_size, image_size] 138 | kh, kw = self.weight.size()[-2:] 139 | sh, sw = self.stride 140 | oh, ow = math.ceil(ih / sh), math.ceil(iw / sw) 141 | pad_h = max((oh - 1) * self.stride[0] + 142 | (kh - 1) * self.dilation[0] + 1 - ih, 0) 143 | pad_w = max((ow - 1) * self.stride[1] + 144 | (kw - 1) * self.dilation[1] + 1 - iw, 0) 145 | if pad_h > 0 or pad_w > 0: 146 | self.static_padding = nn.ZeroPad2d( 147 | (pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2)) 148 | else: 149 | self.static_padding = Identity() 150 | 151 | def forward(self, x): 152 | x = self.static_padding(x) 153 | x = F.conv2d(x, self.weight, self.bias, self.stride, 154 | self.padding, self.dilation, self.groups) 155 | return x 156 | 157 | 158 | class Identity(nn.Module): 159 | def __init__(self, ): 160 | super(Identity, self).__init__() 161 | 162 | def forward(self, input): 163 | return input 164 | 165 | 166 | ######################################################################## 167 | ############## HELPERS FUNCTIONS FOR LOADING MODEL PARAMS ############## 168 | ######################################################################## 169 | 170 | 171 | def efficientnet_params(model_name): 172 | """ Map EfficientNet model name to parameter coefficients. 
""" 173 | params_dict = { 174 | # Coefficients: width,depth,res,dropout 175 | 'efficientnet-b0': (1.0, 1.0, 224, 0.2), 176 | 'efficientnet-b1': (1.0, 1.1, 240, 0.2), 177 | 'efficientnet-b2': (1.1, 1.2, 260, 0.3), 178 | 'efficientnet-b3': (1.2, 1.4, 300, 0.3), 179 | 'efficientnet-b4': (1.4, 1.8, 380, 0.4), 180 | 'efficientnet-b5': (1.6, 2.2, 456, 0.4), 181 | 'efficientnet-b6': (1.8, 2.6, 528, 0.5), 182 | 'efficientnet-b7': (2.0, 3.1, 600, 0.5), 183 | } 184 | return params_dict[model_name] 185 | 186 | 187 | class BlockDecoder(object): 188 | """ Block Decoder for readability, straight from the official TensorFlow repository """ 189 | 190 | @staticmethod 191 | def _decode_block_string(block_string): 192 | """ Gets a block through a string notation of arguments. """ 193 | assert isinstance(block_string, str) 194 | 195 | ops = block_string.split('_') 196 | options = {} 197 | for op in ops: 198 | splits = re.split(r'(\d.*)', op) 199 | if len(splits) >= 2: 200 | key, value = splits[:2] 201 | options[key] = value 202 | 203 | # Check stride 204 | assert (('s' in options and len(options['s']) == 1) or 205 | (len(options['s']) == 2 and options['s'][0] == options['s'][1])) 206 | 207 | return BlockArgs( 208 | kernel_size=int(options['k']), 209 | num_repeat=int(options['r']), 210 | input_filters=int(options['i']), 211 | output_filters=int(options['o']), 212 | expand_ratio=int(options['e']), 213 | id_skip=('noskip' not in block_string), 214 | se_ratio=float(options['se']) if 'se' in options else None, 215 | stride=[int(options['s'][0])]) 216 | 217 | @staticmethod 218 | def _encode_block_string(block): 219 | """Encodes a block to a string.""" 220 | args = [ 221 | 'r%d' % block.num_repeat, 222 | 'k%d' % block.kernel_size, 223 | 's%d%d' % (block.strides[0], block.strides[1]), 224 | 'e%s' % block.expand_ratio, 225 | 'i%d' % block.input_filters, 226 | 'o%d' % block.output_filters 227 | ] 228 | if 0 < block.se_ratio <= 1: 229 | args.append('se%s' % block.se_ratio) 230 | if block.id_skip is False: 231 | args.append('noskip') 232 | return '_'.join(args) 233 | 234 | @staticmethod 235 | def decode(string_list): 236 | """ 237 | Decodes a list of string notations to specify blocks inside the network. 238 | :param string_list: a list of strings, each string is a notation of block 239 | :return: a list of BlockArgs namedtuples of block args 240 | """ 241 | assert isinstance(string_list, list) 242 | blocks_args = [] 243 | for block_string in string_list: 244 | blocks_args.append(BlockDecoder._decode_block_string(block_string)) 245 | return blocks_args 246 | 247 | @staticmethod 248 | def encode(blocks_args): 249 | """ 250 | Encodes a list of BlockArgs to a list of strings. 251 | :param blocks_args: a list of BlockArgs namedtuples of block args 252 | :return: a list of strings, each string is a notation of block 253 | """ 254 | block_strings = [] 255 | for block in blocks_args: 256 | block_strings.append(BlockDecoder._encode_block_string(block)) 257 | return block_strings 258 | 259 | 260 | def efficientnet(width_coefficient=None, depth_coefficient=None, dropout_rate=0.2, 261 | drop_connect_rate=0.2, image_size=None, num_classes=1000): 262 | """ Creates a efficientnet model. 
""" 263 | 264 | blocks_args = [ 265 | 'r1_k3_s11_e1_i32_o16_se0.25', 'r2_k3_s22_e6_i16_o24_se0.25', 266 | 'r2_k5_s22_e6_i24_o40_se0.25', 'r3_k3_s22_e6_i40_o80_se0.25', 267 | 'r3_k5_s22_e6_i80_o112_se0.25', 'r4_k5_s22_e6_i112_o192_se0.25', 268 | 'r1_k3_s22_e6_i192_o320_se0.25', 269 | ] 270 | blocks_args = BlockDecoder.decode(blocks_args) 271 | 272 | global_params = GlobalParams( 273 | batch_norm_momentum=0.99, 274 | batch_norm_epsilon=1e-3, 275 | dropout_rate=dropout_rate, 276 | drop_connect_rate=drop_connect_rate, 277 | # data_format='channels_last', # removed, this is always true in PyTorch 278 | num_classes=num_classes, 279 | width_coefficient=width_coefficient, 280 | depth_coefficient=depth_coefficient, 281 | depth_divisor=8, 282 | min_depth=None, 283 | image_size=image_size, 284 | ) 285 | 286 | return blocks_args, global_params 287 | 288 | 289 | def get_model_params(model_name, override_params): 290 | """ Get the block args and global params for a given model """ 291 | if model_name.startswith('efficientnet'): 292 | w, d, s, p = efficientnet_params(model_name) 293 | # note: all models have drop connect rate = 0.2 294 | blocks_args, global_params = efficientnet( 295 | width_coefficient=w, depth_coefficient=d, dropout_rate=p, image_size=s) 296 | else: 297 | raise NotImplementedError( 298 | 'model name is not pre-defined: %s' % model_name) 299 | if override_params: 300 | # ValueError will be raised here if override_params has fields not included in global_params. 301 | global_params = global_params._replace(**override_params) 302 | return blocks_args, global_params 303 | 304 | 305 | url_map = { 306 | 'efficientnet-b0': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b0-355c32eb.pth', 307 | 'efficientnet-b1': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b1-f1951068.pth', 308 | 'efficientnet-b2': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b2-8bb594d6.pth', 309 | 'efficientnet-b3': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b3-5fb5a3c3.pth', 310 | 'efficientnet-b4': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b4-6ed6700e.pth', 311 | 'efficientnet-b5': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b5-b6417697.pth', 312 | 'efficientnet-b6': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b6-c76e70fd.pth', 313 | 'efficientnet-b7': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b7-dcc49843.pth', 314 | } 315 | 316 | 317 | def load_pretrained_weights(model, model_name, load_fc=True): 318 | """ Loads pretrained weights, and downloads if loading for the first time. 
""" 319 | state_dict = model_zoo.load_url(url_map[model_name]) 320 | if load_fc: 321 | model.load_state_dict(state_dict) 322 | else: 323 | state_dict.pop('_fc.weight') 324 | state_dict.pop('_fc.bias') 325 | res = model.load_state_dict(state_dict, strict=False) 326 | assert set(res.missing_keys) == set( 327 | ['_fc.weight', '_fc.bias']), 'issue loading pretrained weights' 328 | print('Loaded pretrained weights for {}'.format(model_name)) 329 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | pandas 3 | torch 4 | torchvision 5 | pytoan 6 | albumentations 7 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from models import EfficientDet 3 | from models.efficientnet import EfficientNet 4 | 5 | if __name__ == '__main__': 6 | 7 | inputs = torch.randn(5, 3, 512, 512) 8 | 9 | # Test EfficientNet 10 | model = EfficientNet.from_pretrained('efficientnet-b0') 11 | inputs = torch.randn(4, 3, 512, 512) 12 | P = model(inputs) 13 | for idx, p in enumerate(P): 14 | print('P{}: {}'.format(idx, p.size())) 15 | 16 | # print('model: ', model) 17 | 18 | # Test inference 19 | model = EfficientDet(num_classes=20, is_training=False) 20 | output = model(inputs) 21 | for out in output: 22 | print(out.size()) 23 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | from tqdm import tqdm 2 | import argparse 3 | import os 4 | import random 5 | import shutil 6 | import time 7 | import warnings 8 | import torch 9 | import torch.nn as nn 10 | import torch.nn.parallel 11 | import torch.backends.cudnn as cudnn 12 | import torch.distributed as dist 13 | import torch.optim 14 | import torch.multiprocessing as mp 15 | import torch.utils.data 16 | import torch.utils.data.distributed 17 | import torchvision.transforms as transforms 18 | import torchvision.datasets as datasets 19 | 20 | import os 21 | import sys 22 | import time 23 | import argparse 24 | import numpy as np 25 | import torch 26 | import torch.optim as optim 27 | import torch.backends.cudnn as cudnn 28 | from torch.utils.data import DataLoader 29 | 30 | from models.efficientdet import EfficientDet 31 | from models.losses import FocalLoss 32 | from datasets import VOCDetection, CocoDataset, get_augumentation, detection_collate, Resizer, Normalizer, Augmenter, collater 33 | from utils import EFFICIENTDET, get_state_dict 34 | from eval import evaluate, evaluate_coco 35 | 36 | parser = argparse.ArgumentParser(description='PyTorch ImageNet Training') 37 | parser.add_argument('--dataset', default='VOC', choices=['VOC', 'COCO'], 38 | type=str, help='VOC or COCO') 39 | parser.add_argument( 40 | '--dataset_root', 41 | default='/root/data/VOCdevkit/', 42 | help='Dataset root directory path [/root/data/VOCdevkit/, /root/data/coco/]') 43 | parser.add_argument('--network', default='efficientdet-d0', type=str, 44 | help='efficientdet-[d0, d1, ..]') 45 | 46 | parser.add_argument('--resume', default=None, type=str, 47 | help='Checkpoint state_dict file to resume training from') 48 | parser.add_argument('--num_epoch', default=500, type=int, 49 | help='Num epoch for training') 50 | parser.add_argument('--batch_size', default=32, type=int, 51 | help='Batch size for 
training') 52 | parser.add_argument('--num_class', default=20, type=int, 53 | help='Number of class used in model') 54 | parser.add_argument('--device', default=[0, 1], type=list, 55 | help='Use CUDA to train model') 56 | parser.add_argument('--grad_accumulation_steps', default=1, type=int, 57 | help='Number of gradient accumulation steps') 58 | parser.add_argument('--lr', '--learning-rate', default=1e-4, type=float, 59 | help='initial learning rate') 60 | parser.add_argument('--momentum', default=0.9, type=float, 61 | help='Momentum value for optim') 62 | parser.add_argument('--weight_decay', default=5e-4, type=float, 63 | help='Weight decay for SGD') 64 | parser.add_argument('--gamma', default=0.1, type=float, 65 | help='Gamma update for SGD') 66 | parser.add_argument('--save_folder', default='./saved/weights/', type=str, 67 | help='Directory for saving checkpoint models') 68 | parser.add_argument('-j', '--workers', default=4, type=int, metavar='N', 69 | help='number of data loading workers (default: 4)') 70 | parser.add_argument('--start_epoch', default=0, type=int, metavar='N', 71 | help='manual epoch number (useful on restarts)') 72 | parser.add_argument('--world-size', default=1, type=int, 73 | help='number of nodes for distributed training') 74 | parser.add_argument('--rank', default=0, type=int, 75 | help='node rank for distributed training') 76 | parser.add_argument('--dist-url', default='env://', type=str, 77 | help='url used to set up distributed training') 78 | parser.add_argument('--dist-backend', default='nccl', type=str, 79 | help='distributed backend') 80 | parser.add_argument('--seed', default=24, type=int, 81 | help='seed for initializing training. ') 82 | parser.add_argument('--gpu', default=None, type=int, 83 | help='GPU id to use.') 84 | parser.add_argument( 85 | '--multiprocessing-distributed', 86 | action='store_true', 87 | help='Use multi-processing distributed training to launch ' 88 | 'N processes per node, which has N GPUs. 
This is the ' 89 | 'fastest way to use PyTorch for either single node or ' 90 | 'multi node data parallel training') 91 | 92 | iteration = 1 93 | 94 | 95 | def train(train_loader, model, scheduler, optimizer, epoch, args): 96 | global iteration 97 | print("{} epoch: \t start training....".format(epoch)) 98 | start = time.time() 99 | total_loss = [] 100 | model.train() 101 | model.module.is_training = True 102 | model.module.freeze_bn() 103 | optimizer.zero_grad() 104 | for idx, (images, annotations) in enumerate(train_loader): 105 | images = images.cuda().float() 106 | annotations = annotations.cuda() 107 | classification_loss, regression_loss = model([images, annotations]) 108 | classification_loss = classification_loss.mean() 109 | regression_loss = regression_loss.mean() 110 | loss = classification_loss + regression_loss 111 | if bool(loss == 0): 112 | print('loss equal zero(0)') 113 | continue 114 | loss.backward() 115 | if (idx + 1) % args.grad_accumulation_steps == 0: 116 | torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1) 117 | optimizer.step() 118 | optimizer.zero_grad() 119 | 120 | total_loss.append(loss.item()) 121 | if(iteration % 300 == 0): 122 | print('{} iteration: training ...'.format(iteration)) 123 | ans = { 124 | 'epoch': epoch, 125 | 'iteration': iteration, 126 | 'cls_loss': classification_loss.item(), 127 | 'reg_loss': regression_loss.item(), 128 | 'mean_loss': np.mean(total_loss) 129 | } 130 | for key, value in ans.items(): 131 | print(' {:15s}: {}'.format(str(key), value)) 132 | iteration += 1 133 | scheduler.step(np.mean(total_loss)) 134 | result = { 135 | 'time': time.time() - start, 136 | 'loss': np.mean(total_loss) 137 | } 138 | for key, value in result.items(): 139 | print(' {:15s}: {}'.format(str(key), value)) 140 | 141 | 142 | def test(dataset, model, epoch, args): 143 | print("{} epoch: \t start validation....".format(epoch)) 144 | model = model.module 145 | model.eval() 146 | model.is_training = False 147 | with torch.no_grad(): 148 | if(args.dataset == 'VOC'): 149 | evaluate(dataset, model) 150 | else: 151 | evaluate_coco(dataset, model) 152 | 153 | 154 | def main_worker(gpu, ngpus_per_node, args): 155 | args.gpu = gpu 156 | if args.gpu is not None: 157 | print("Use GPU: {} for training".format(args.gpu)) 158 | 159 | if args.distributed: 160 | if args.dist_url == "env://" and args.rank == -1: 161 | # args.rank = int(os.environ["RANK"]) 162 | args.rank = 1 163 | if args.multiprocessing_distributed: 164 | # For multiprocessing distributed training, rank needs to be the 165 | # global rank among all the processes 166 | args.rank = args.rank * ngpus_per_node + gpu 167 | dist.init_process_group( 168 | backend=args.dist_backend, 169 | init_method=args.dist_url, 170 | world_size=args.world_size, 171 | rank=args.rank) 172 | 173 | # Training dataset 174 | train_dataset = [] 175 | if(args.dataset == 'VOC'): 176 | train_dataset = VOCDetection(root=args.dataset_root, transform=transforms.Compose( 177 | [Normalizer(), Augmenter(), Resizer()])) 178 | valid_dataset = VOCDetection(root=args.dataset_root, image_sets=[( 179 | '2007', 'test')], transform=transforms.Compose([Normalizer(), Resizer()])) 180 | args.num_class = train_dataset.num_classes() 181 | elif(args.dataset == 'COCO'): 182 | train_dataset = CocoDataset( 183 | root_dir=args.dataset_root, 184 | set_name='train2017', 185 | transform=transforms.Compose( 186 | [ 187 | Normalizer(), 188 | Augmenter(), 189 | Resizer()])) 190 | valid_dataset = CocoDataset( 191 | root_dir=args.dataset_root, 192 | 
set_name='val2017', 193 | transform=transforms.Compose( 194 | [ 195 | Normalizer(), 196 | Resizer()])) 197 | args.num_class = train_dataset.num_classes() 198 | 199 | train_loader = DataLoader(train_dataset, 200 | batch_size=args.batch_size, 201 | num_workers=args.workers, 202 | shuffle=True, 203 | collate_fn=collater, 204 | pin_memory=True) 205 | valid_loader = DataLoader(valid_dataset, 206 | batch_size=1, 207 | num_workers=args.workers, 208 | shuffle=False, 209 | collate_fn=collater, 210 | pin_memory=True) 211 | 212 | checkpoint = [] 213 | if(args.resume is not None): 214 | if os.path.isfile(args.resume): 215 | print("=> loading checkpoint '{}'".format(args.resume)) 216 | if args.gpu is None: 217 | checkpoint = torch.load(args.resume) 218 | else: 219 | # Map model to be loaded to specified single gpu. 220 | loc = 'cuda:{}'.format(args.gpu) 221 | checkpoint = torch.load(args.resume, map_location=loc) 222 | params = checkpoint['parser'] 223 | args.num_class = params.num_class 224 | args.network = params.network 225 | args.start_epoch = checkpoint['epoch'] + 1 226 | del params 227 | 228 | model = EfficientDet(num_classes=args.num_class, 229 | network=args.network, 230 | W_bifpn=EFFICIENTDET[args.network]['W_bifpn'], 231 | D_bifpn=EFFICIENTDET[args.network]['D_bifpn'], 232 | D_class=EFFICIENTDET[args.network]['D_class'] 233 | ) 234 | if(args.resume is not None): 235 | model.load_state_dict(checkpoint['state_dict']) 236 | del checkpoint 237 | if args.distributed: 238 | # For multiprocessing distributed, DistributedDataParallel constructor 239 | # should always set the single device scope, otherwise, 240 | # DistributedDataParallel will use all available devices. 241 | if args.gpu is not None: 242 | torch.cuda.set_device(args.gpu) 243 | model.cuda(args.gpu) 244 | # When using a single GPU per process and per 245 | # DistributedDataParallel, we need to divide the batch size 246 | # ourselves based on the total number of GPUs we have 247 | args.batch_size = int(args.batch_size / ngpus_per_node) 248 | args.workers = int( 249 | (args.workers + ngpus_per_node - 1) / ngpus_per_node) 250 | model = torch.nn.parallel.DistributedDataParallel( 251 | model, device_ids=[args.gpu], find_unused_parameters=True) 252 | print('Run with DistributedDataParallel with divice_ids....') 253 | else: 254 | model.cuda() 255 | # DistributedDataParallel will divide and allocate batch_size to all 256 | # available GPUs if device_ids are not set 257 | model = torch.nn.parallel.DistributedDataParallel(model) 258 | print('Run with DistributedDataParallel without device_ids....') 259 | elif args.gpu is not None: 260 | torch.cuda.set_device(args.gpu) 261 | model = model.cuda(args.gpu) 262 | else: 263 | model = model.cuda() 264 | print('Run with DataParallel ....') 265 | model = torch.nn.DataParallel(model).cuda() 266 | 267 | # define loss function (criterion) , optimizer, scheduler 268 | optimizer = optim.AdamW(model.parameters(), lr=args.lr) 269 | scheduler = optim.lr_scheduler.ReduceLROnPlateau( 270 | optimizer, patience=3, verbose=True) 271 | cudnn.benchmark = True 272 | 273 | for epoch in range(args.start_epoch, args.num_epoch): 274 | train(train_loader, model, scheduler, optimizer, epoch, args) 275 | 276 | if (epoch + 1) % 5 == 0: 277 | test(valid_dataset, model, epoch, args) 278 | 279 | state = { 280 | 'epoch': epoch, 281 | 'parser': args, 282 | 'state_dict': get_state_dict(model) 283 | } 284 | 285 | torch.save( 286 | state, 287 | os.path.join( 288 | args.save_folder, 289 | args.dataset, 290 | args.network, 291 | 
"checkpoint_{}.pth".format(epoch))) 292 | 293 | 294 | def main(): 295 | args = parser.parse_args() 296 | if(not os.path.exists(os.path.join(args.save_folder, args.dataset, args.network))): 297 | os.makedirs(os.path.join(args.save_folder, args.dataset, args.network)) 298 | if args.seed is not None: 299 | random.seed(args.seed) 300 | torch.manual_seed(args.seed) 301 | cudnn.deterministic = True 302 | warnings.warn('You have chosen to seed training. ' 303 | 'This will turn on the CUDNN deterministic setting, ' 304 | 'which can slow down your training considerably! ' 305 | 'You may see unexpected behavior when restarting ' 306 | 'from checkpoints.') 307 | 308 | if args.gpu is not None: 309 | warnings.warn('You have chosen a specific GPU. This will completely ' 310 | 'disable data parallelism.') 311 | os.environ['MASTER_ADDR'] = 'localhost' 312 | os.environ['MASTER_PORT'] = '12355' 313 | os.environ['WORLD_SIZE'] = '2' 314 | if args.dist_url == "env://" and args.world_size == -1: 315 | args.world_size = int(os.environ["WORLD_SIZE"]) 316 | 317 | args.distributed = args.world_size > 1 or args.multiprocessing_distributed 318 | ngpus_per_node = torch.cuda.device_count() 319 | if args.multiprocessing_distributed: 320 | # Since we have ngpus_per_node processes per node, the total world_size 321 | # needs to be adjusted accordingly 322 | args.world_size = ngpus_per_node * args.world_size 323 | # Use torch.multiprocessing.spawn to launch distributed processes: the 324 | # main_worker process function 325 | mp.spawn(main_worker, nprocs=ngpus_per_node, 326 | args=(ngpus_per_node, args)) 327 | else: 328 | # Simply call main_worker function 329 | main_worker(args.gpu, ngpus_per_node, args) 330 | 331 | 332 | if __name__ == "__main__": 333 | main() 334 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .helper import * 2 | from .util import * 3 | from .visualization import * 4 | from .vis_bbox import vis_bbox 5 | from .config_eff import * -------------------------------------------------------------------------------- /utils/config_eff.py: -------------------------------------------------------------------------------- 1 | EFFICIENTDET = { 2 | 'efficientdet-d0': {'input_size': 512, 3 | 'backbone': 'B0', 4 | 'W_bifpn': 64, 5 | 'D_bifpn': 2, 6 | 'D_class': 3}, 7 | 'efficientdet-d1': {'input_size': 640, 8 | 'backbone': 'B1', 9 | 'W_bifpn': 88, 10 | 'D_bifpn': 3, 11 | 'D_class': 3}, 12 | 'efficientdet-d2': {'input_size': 768, 13 | 'backbone': 'B2', 14 | 'W_bifpn': 112, 15 | 'D_bifpn': 4, 16 | 'D_class': 3}, 17 | 'efficientdet-d3': {'input_size': 896, 18 | 'backbone': 'B3', 19 | 'W_bifpn': 160, 20 | 'D_bifpn': 5, 21 | 'D_class': 4}, 22 | 'efficientdet-d4': {'input_size': 1024, 23 | 'backbone': 'B4', 24 | 'W_bifpn': 224, 25 | 'D_bifpn': 6, 26 | 'D_class': 4}, 27 | 'efficientdet-d5': {'input_size': 1280, 28 | 'backbone': 'B5', 29 | 'W_bifpn': 288, 30 | 'D_bifpn': 7, 31 | 'D_class': 4}, 32 | 'efficientdet-d6': {'input_size': 1408, 33 | 'backbone': 'B6', 34 | 'W_bifpn': 384, 35 | 'D_bifpn': 8, 36 | 'D_class': 5}, 37 | 'efficientdet-d7': {'input_size': 1636, 38 | 'backbone': 'B6', 39 | 'W_bifpn': 384, 40 | 'D_bifpn': 8, 41 | 'D_class': 5}, 42 | } -------------------------------------------------------------------------------- /utils/helper.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | import numpy as np 3 | import torch 4 
| import os 5 | import requests 6 | import socket 7 | import datetime 8 | import json 9 | 10 | 11 | def load_yaml(file_name): 12 | with open(file_name, 'r') as stream: 13 | config = yaml.load(stream, Loader=yaml.FullLoader) 14 | return config 15 | 16 | 17 | def init_seed(SEED=42): 18 | os.environ['PYTHONHASHSEED'] = str(SEED) 19 | np.random.seed(SEED) 20 | torch.manual_seed(SEED) 21 | torch.cuda.manual_seed(SEED) 22 | torch.backends.cudnn.deterministic = True 23 | 24 | 25 | def get_state_dict(model): 26 | if type(model) == torch.nn.DataParallel: 27 | state_dict = model.module.state_dict() 28 | else: 29 | state_dict = model.state_dict() 30 | return state_dict 31 | -------------------------------------------------------------------------------- /utils/metric.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import json 5 | import os 6 | 7 | import torch 8 | 9 | 10 | def compute_overlap(a, b): 11 | """ 12 | Parameters 13 | ---------- 14 | a: (N, 4) ndarray of float 15 | b: (K, 4) ndarray of float 16 | Returns 17 | ------- 18 | overlaps: (N, K) ndarray of overlap between boxes and query_boxes 19 | """ 20 | area = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1]) 21 | 22 | iw = np.minimum(np.expand_dims( 23 | a[:, 2], axis=1), b[:, 2]) - np.maximum(np.expand_dims(a[:, 0], 1), b[:, 0]) 24 | ih = np.minimum(np.expand_dims( 25 | a[:, 3], axis=1), b[:, 3]) - np.maximum(np.expand_dims(a[:, 1], 1), b[:, 1]) 26 | 27 | iw = np.maximum(iw, 0) 28 | ih = np.maximum(ih, 0) 29 | 30 | ua = np.expand_dims((a[:, 2] - a[:, 0]) * 31 | (a[:, 3] - a[:, 1]), axis=1) + area - iw * ih 32 | 33 | ua = np.maximum(ua, np.finfo(float).eps) 34 | 35 | intersection = iw * ih 36 | 37 | return intersection / ua 38 | 39 | 40 | def _compute_ap(recall, precision): 41 | """ Compute the average precision, given the recall and precision curves. 42 | Code originally from https://github.com/rbgirshick/py-faster-rcnn. 43 | # Arguments 44 | recall: The recall curve (list). 45 | precision: The precision curve (list). 46 | # Returns 47 | The average precision as computed in py-faster-rcnn. 48 | """ 49 | # correct AP calculation 50 | # first append sentinel values at the end 51 | mrec = np.concatenate(([0.], recall, [1.])) 52 | mpre = np.concatenate(([0.], precision, [0.])) 53 | 54 | # compute the precision envelope 55 | for i in range(mpre.size - 1, 0, -1): 56 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 57 | 58 | # to calculate area under PR curve, look for points 59 | # where X axis (recall) changes value 60 | i = np.where(mrec[1:] != mrec[:-1])[0] 61 | 62 | # and sum (\Delta recall) * prec 63 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 64 | return ap 65 | 66 | 67 | def _get_detections(dataset, model, score_threshold=0.05, max_detections=100, save_path=None): 68 | """ Get the detections from the retinanet using the generator. 69 | The result is a list of lists such that the size is: 70 | all_detections[num_images][num_classes] = detections[num_detections, 4 + num_classes] 71 | # Arguments 72 | dataset : The generator used to run images through the retinanet. 73 | retinanet : The retinanet to run on the images. 74 | score_threshold : The score confidence threshold to use. 75 | max_detections : The maximum number of detections to use per image. 76 | save_path : The path to save the images with visualized detections to. 77 | # Returns 78 | A list of lists containing the detections for each image in the generator. 
79 | """ 80 | all_detections = [[None for i in range( 81 | dataset.num_classes())] for j in range(len(dataset))] 82 | 83 | model.eval() 84 | 85 | with torch.no_grad(): 86 | for index in range(len(dataset)): 87 | data = dataset[index] 88 | scale = data['scale'] 89 | 90 | # run network 91 | scores, labels, boxes = model(data['img'].permute( 92 | 2, 0, 1).cuda().float().unsqueeze(dim=0)) 93 | scores = scores.cpu().numpy() 94 | labels = labels.cpu().numpy() 95 | boxes = boxes.cpu().numpy() 96 | 97 | # correct boxes for image scale 98 | boxes /= scale 99 | 100 | # select indices which have a score above the threshold 101 | indices = np.where(scores > score_threshold)[0] 102 | if indices.shape[0] > 0: 103 | # select those scores 104 | scores = scores[indices] 105 | 106 | # find the order with which to sort the scores 107 | scores_sort = np.argsort(-scores)[:max_detections] 108 | 109 | # select detections 110 | image_boxes = boxes[indices[scores_sort], :] 111 | image_scores = scores[scores_sort] 112 | image_labels = labels[indices[scores_sort]] 113 | image_detections = np.concatenate([image_boxes, np.expand_dims( 114 | image_scores, axis=1), np.expand_dims(image_labels, axis=1)], axis=1) 115 | 116 | # copy detections to all_detections 117 | for label in range(dataset.num_classes()): 118 | all_detections[index][label] = image_detections[image_detections[:, -1] == label, :-1] 119 | else: 120 | # copy detections to all_detections 121 | for label in range(dataset.num_classes()): 122 | all_detections[index][label] = np.zeros((0, 5)) 123 | 124 | print('{}/{}'.format(index + 1, len(dataset)), end='\r') 125 | 126 | return all_detections 127 | 128 | 129 | def _get_annotations(generator): 130 | """ Get the ground truth annotations from the generator. 131 | The result is a list of lists such that the size is: 132 | all_detections[num_images][num_classes] = annotations[num_detections, 5] 133 | # Arguments 134 | generator : The generator used to retrieve ground truth annotations. 135 | # Returns 136 | A list of lists containing the annotations for each image in the generator. 137 | """ 138 | all_annotations = [[None for i in range( 139 | generator.num_classes())] for j in range(len(generator))] 140 | 141 | for i in range(len(generator)): 142 | # load the annotations 143 | annotations = generator.load_annotations(i) 144 | 145 | # copy detections to all_annotations 146 | for label in range(generator.num_classes()): 147 | all_annotations[i][label] = annotations[annotations[:, 4] 148 | == label, :4].copy() 149 | 150 | print('{}/{}'.format(i + 1, len(generator)), end='\r') 151 | 152 | return all_annotations 153 | 154 | 155 | def evaluate( 156 | generator, 157 | retinanet, 158 | iou_threshold=0.5, 159 | score_threshold=0.05, 160 | max_detections=100, 161 | save_path=None 162 | ): 163 | """ Evaluate a given dataset using a given retinanet. 164 | # Arguments 165 | generator : The generator that represents the dataset to evaluate. 166 | retinanet : The retinanet to evaluate. 167 | iou_threshold : The threshold used to consider when a detection is positive or negative. 168 | score_threshold : The score confidence threshold to use for detections. 169 | max_detections : The maximum number of detections to use per image. 170 | save_path : The path to save images with visualized detections to. 171 | # Returns 172 | A dict mapping class names to mAP scores. 
173 | """ 174 | 175 | # gather all detections and annotations 176 | 177 | all_detections = _get_detections( 178 | generator, retinanet, score_threshold=score_threshold, max_detections=max_detections, save_path=save_path) 179 | all_annotations = _get_annotations(generator) 180 | 181 | average_precisions = {} 182 | 183 | for label in range(generator.num_classes()): 184 | false_positives = np.zeros((0,)) 185 | true_positives = np.zeros((0,)) 186 | scores = np.zeros((0,)) 187 | num_annotations = 0.0 188 | 189 | for i in range(len(generator)): 190 | detections = all_detections[i][label] 191 | annotations = all_annotations[i][label] 192 | num_annotations += annotations.shape[0] 193 | detected_annotations = [] 194 | 195 | for d in detections: 196 | scores = np.append(scores, d[4]) 197 | 198 | if annotations.shape[0] == 0: 199 | false_positives = np.append(false_positives, 1) 200 | true_positives = np.append(true_positives, 0) 201 | continue 202 | 203 | overlaps = compute_overlap( 204 | np.expand_dims(d, axis=0), annotations) 205 | assigned_annotation = np.argmax(overlaps, axis=1) 206 | max_overlap = overlaps[0, assigned_annotation] 207 | 208 | if max_overlap >= iou_threshold and assigned_annotation not in detected_annotations: 209 | false_positives = np.append(false_positives, 0) 210 | true_positives = np.append(true_positives, 1) 211 | detected_annotations.append(assigned_annotation) 212 | else: 213 | false_positives = np.append(false_positives, 1) 214 | true_positives = np.append(true_positives, 0) 215 | 216 | # no annotations -> AP for this class is 0 (is this correct?) 217 | if num_annotations == 0: 218 | average_precisions[label] = 0, 0 219 | continue 220 | 221 | # sort by score 222 | indices = np.argsort(-scores) 223 | false_positives = false_positives[indices] 224 | true_positives = true_positives[indices] 225 | 226 | # compute false positives and true positives 227 | false_positives = np.cumsum(false_positives) 228 | true_positives = np.cumsum(true_positives) 229 | 230 | # compute recall and precision 231 | recall = true_positives / num_annotations 232 | precision = true_positives / \ 233 | np.maximum(true_positives + false_positives, 234 | np.finfo(np.float64).eps) 235 | 236 | # compute average precision 237 | average_precision = _compute_ap(recall, precision) 238 | average_precisions[label] = average_precision, num_annotations 239 | 240 | print('\nmAP:') 241 | for label in range(generator.num_classes()): 242 | label_name = generator.label_to_name(label) 243 | print('{}: {}'.format(label_name, average_precisions[label][0])) 244 | 245 | return average_precisions 246 | -------------------------------------------------------------------------------- /utils/util.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | class MetricTracker: 5 | def __init__(self, *keys, writer=None): 6 | self.writer = writer 7 | self._data = pd.DataFrame( 8 | index=keys, columns=['total', 'counts', 'average']) 9 | self.reset() 10 | 11 | def reset(self): 12 | for col in self._data.columns: 13 | self._data[col].values[:] = 0 14 | 15 | def update(self, key, value, n=1): 16 | if self.writer is not None: 17 | self.writer.add_scalar(key, value) 18 | self._data.total[key] += value * n 19 | self._data.counts[key] += n 20 | self._data.average[key] = self._data.total[key] / \ 21 | self._data.counts[key] 22 | 23 | def avg(self, key): 24 | return self._data.average[key] 25 | 26 | def result(self): 27 | return dict(self._data.average) 28 | 
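Note: a minimal usage sketch for the MetricTracker above (not part of the repository; the keys 'cls_loss' and 'reg_loss' and the loss values are illustrative, the optional writer is left at its None default, and the import assumes the snippet runs from the repository root):

from utils.util import MetricTracker

# Track running averages of per-batch losses over one epoch.
tracker = MetricTracker('cls_loss', 'reg_loss')
for cls_loss, reg_loss in [(1.0, 0.5), (0.5, 0.25)]:   # stand-in batch losses
    tracker.update('cls_loss', cls_loss)               # adds value*n to total, n to counts
    tracker.update('reg_loss', reg_loss)
print(tracker.avg('cls_loss'))                         # 0.75
print(tracker.result())                                # {'cls_loss': 0.75, 'reg_loss': 0.375}
tracker.reset()                                        # zero the totals before the next epoch

Because update() divides the accumulated total by the accumulated count, passing n equal to the batch size gives a correctly weighted average when the last batch is smaller than the rest.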
-------------------------------------------------------------------------------- /utils/vis_bbox.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | from PIL import Image 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | 6 | 7 | def vis_bbox(img, bbox, label=None, score=None, 8 | instance_colors=None, alpha=1., linewidth=2., ax=None): 9 | """Visualize bounding boxes inside the image. 10 | Args: 11 | img (~numpy.ndarray): An array of shape :math:`(3, height, width)`. 12 | This is in RGB format and the range of its value is 13 | :math:`[0, 255]`. If this is :obj:`None`, no image is displayed. 14 | bbox (~numpy.ndarray): An array of shape :math:`(R, 4)`, where 15 | :math:`R` is the number of bounding boxes in the image. 16 | Each element is organized 17 | by :math:`(y_{min}, x_{min}, y_{max}, x_{max})` in the second axis. 18 | label (~numpy.ndarray): An integer array of shape :math:`(R,)`. 19 | The values correspond to id for label names stored in 20 | :obj:`label_names`. This is optional. 21 | score (~numpy.ndarray): A float array of shape :math:`(R,)`. 22 | Each value indicates how confident the prediction is. 23 | This is optional. 24 | label_names (iterable of strings): Name of labels ordered according 25 | to label ids. If this is :obj:`None`, labels will be skipped. 26 | instance_colors (iterable of tuples): List of colors. 27 | Each color is RGB format and the range of its values is 28 | :math:`[0, 255]`. The :obj:`i`-th element is the color used 29 | to visualize the :obj:`i`-th instance. 30 | If :obj:`instance_colors` is :obj:`None`, the red is used for 31 | all boxes. 32 | alpha (float): The value which determines transparency of the 33 | bounding boxes. The range of this value is :math:`[0, 1]`. 34 | linewidth (float): The thickness of the edges of the bounding boxes. 35 | ax (matplotlib.axes.Axis): The visualization is displayed on this 36 | axis. If this is :obj:`None` (default), a new axis is created. 37 | Returns: 38 | ~matploblib.axes.Axes: 39 | Returns the Axes object with the plot for further tweaking. 40 | from: https://github.com/chainer/chainercv 41 | """ 42 | 43 | if label is not None and not len(bbox) == len(label): 44 | raise ValueError('The length of label must be same as that of bbox') 45 | if score is not None and not len(bbox) == len(score): 46 | raise ValueError('The length of score must be same as that of bbox') 47 | 48 | # Returns newly instantiated matplotlib.axes.Axes object if ax is None 49 | if ax is None: 50 | fig = plt.figure() 51 | # ax = fig.add_subplot(1, 1, 1) 52 | h, w, _ = img.shape 53 | w_ = w / 60.0 54 | h_ = w_ * (h / w) 55 | fig.set_size_inches((w_, h_)) 56 | ax = plt.axes([0, 0, 1, 1]) 57 | ax.imshow(img.astype(np.uint8)) 58 | ax.axis('off') 59 | # If there is no bounding box to display, visualize the image and exit. 
60 | if len(bbox) == 0: 61 | return fig, ax 62 | 63 | if instance_colors is None: 64 | # Red 65 | instance_colors = np.zeros((len(bbox), 3), dtype=np.float32) 66 | instance_colors[:, 0] = 51 67 | instance_colors[:, 1] = 51 68 | instance_colors[:, 2] = 224 69 | instance_colors = np.array(instance_colors) 70 | 71 | for i, bb in enumerate(bbox): 72 | xy = (bb[0], bb[1]) 73 | height = bb[3] - bb[1] 74 | width = bb[2] - bb[0] 75 | color = instance_colors[i % len(instance_colors)] / 255 76 | ax.add_patch(plt.Rectangle( 77 | xy, width, height, fill=False, 78 | edgecolor=color, linewidth=linewidth, alpha=alpha)) 79 | 80 | caption = [] 81 | if label is not None: caption.append(str(label[i])) 82 | if score is not None and len(score) > 0: 83 | sc = score[i] 84 | caption.append('{}'.format(sc)) 85 | 86 | if len(caption) > 0: 87 | face_color = np.array([225, 51, 123])/255 88 | ax.text(bb[0], bb[1], 89 | ': '.join(caption), 90 | fontsize=12, 91 | color='black', 92 | style='italic', 93 | bbox={'facecolor': face_color, 'edgecolor': face_color, 'alpha': 1, 'pad': 0}) 94 | return fig, ax 95 | 96 | 97 | if __name__ == '__main__': 98 | img = cv2.imread('./../docs/output.png') 99 | print('img: ', img.shape) 100 | img = np.array(img) 101 | # img = img.convert('RGB') 102 | bbox = np.array([[50, 50, 200, 200]]) 103 | label = np.array(['toan']) 104 | score = np.array([100]) 105 | fig, ax = vis_bbox(img=img, 106 | bbox=bbox, 107 | label=label, 108 | score=score) 109 | fig.savefig('kaka.png') 110 | fig.show() 111 | plt.show() 112 | -------------------------------------------------------------------------------- /utils/visualization.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | from datetime import datetime 3 | 4 | 5 | class TensorboardWriter(): 6 | def __init__(self, log_dir, enabled): 7 | self.writer = None 8 | self.selected_module = "" 9 | 10 | if enabled: 11 | log_dir = str(log_dir) 12 | 13 | # Retrieve visualization writer. 14 | succeeded = False 15 | for module in ["torch.utils.tensorboard", "tensorboardX"]: 16 | try: 17 | self.writer = importlib.import_module( 18 | module).SummaryWriter(log_dir) 19 | succeeded = True 20 | break 21 | except ImportError: 22 | succeeded = False 23 | self.selected_module = module 24 | 25 | if not succeeded: 26 | message = "Warning: visualization (Tensorboard) is configured to use, but currently not installed on " \ 27 | "this machine. Please install TensorboardX with 'pip install tensorboardx', upgrade PyTorch to " \ 28 | "version >= 1.1 to use 'torch.utils.tensorboard' or turn off the option in the 'config.json' file." 29 | print(message) 30 | 31 | self.step = 0 32 | self.mode = '' 33 | 34 | self.tb_writer_ftns = { 35 | 'add_scalar', 'add_scalars', 'add_image', 'add_images', 'add_audio', 36 | 'add_text', 'add_histogram', 'add_pr_curve', 'add_embedding', 'add_graph' 37 | } 38 | self.tag_mode_exceptions = {'add_histogram', 'add_embedding'} 39 | self.timer = datetime.now() 40 | 41 | def set_step(self, step, mode='train'): 42 | self.mode = mode 43 | self.step = step 44 | if step == 0: 45 | self.timer = datetime.now() 46 | else: 47 | duration = datetime.now() - self.timer 48 | self.add_scalar('steps_per_sec', 1 / duration.total_seconds()) 49 | self.timer = datetime.now() 50 | 51 | def __getattr__(self, name): 52 | """ 53 | If visualization is configured to use: 54 | return add_data() methods of tensorboard with additional information (step, tag) added. 
55 | Otherwise: 56 | return a blank function handle that does nothing 57 | """ 58 | if name in self.tb_writer_ftns: 59 | add_data = getattr(self.writer, name, None) 60 | 61 | def wrapper(tag, data, *args, **kwargs): 62 | if add_data is not None: 63 | # add mode(train/valid) tag 64 | if name not in self.tag_mode_exceptions: 65 | tag = '{}/{}'.format(tag, self.mode) 66 | add_data(tag, data, self.step, *args, **kwargs) 67 | return wrapper 68 | else: 69 | # default action for returning methods defined in this class, set_step() for instance. 70 | try: 71 | attr = object.__getattribute__(self, name) 72 | except AttributeError: 73 | raise AttributeError("type object '{}' has no attribute '{}'".format( 74 | self.selected_module, name)) 75 | return attr 76 | --------------------------------------------------------------------------------
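Note: a minimal sketch of how the TensorboardWriter above is typically driven (not part of the repository; the log directory and loss values are placeholders, and the import assumes the snippet runs from the repository root):

from utils.visualization import TensorboardWriter

# enabled=True needs torch.utils.tensorboard or tensorboardX; if neither is
# available (or enabled=False), a warning is printed at construction and the
# add_* calls below become no-ops.
writer = TensorboardWriter('saved/log', enabled=True)
for step, loss in enumerate([0.9, 0.7, 0.5]):   # stand-in loss values
    writer.set_step(step, mode='train')         # also logs steps_per_sec for step > 0
    writer.add_scalar('loss', loss)             # proxied by __getattr__, written under the tag 'loss/train'

The same object can also be passed to MetricTracker(*keys, writer=writer) so that every update() call is mirrored to TensorBoard.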