├── .gitignore ├── DEVELOP_GUIDE.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── TROUBLESHOOTING.md ├── configs ├── efficient_net_b3_ssd300_voc0712.yaml ├── mobilenet_v2_ssd320_voc0712.yaml ├── mobilenet_v3_ssd320_voc0712.yaml ├── vgg_ssd300_coco_trainval35k.yaml ├── vgg_ssd300_voc0712.yaml ├── vgg_ssd512_coco_trainval35k.yaml └── vgg_ssd512_voc0712.yaml ├── demo.py ├── demo ├── 000342.jpg ├── 000542.jpg ├── 003123.jpg ├── 004101.jpg └── 008591.jpg ├── figures ├── 004545.jpg ├── losses.png ├── lr.png └── metrics.png ├── outputs └── .gitignore ├── requirements.txt ├── setup.py ├── ssd ├── __init__.py ├── config │ ├── __init__.py │ ├── defaults.py │ └── path_catlog.py ├── data │ ├── __init__.py │ ├── build.py │ ├── datasets │ │ ├── __init__.py │ │ ├── coco.py │ │ ├── evaluation │ │ │ ├── __init__.py │ │ │ ├── coco │ │ │ │ └── __init__.py │ │ │ └── voc │ │ │ │ ├── __init__.py │ │ │ │ └── eval_detection_voc.py │ │ └── voc.py │ ├── samplers │ │ ├── __init__.py │ │ ├── distributed.py │ │ └── iteration_based_batch_sampler.py │ └── transforms │ │ ├── __init__.py │ │ ├── target_transform.py │ │ └── transforms.py ├── engine │ ├── __init__.py │ ├── inference.py │ └── trainer.py ├── layers │ ├── __init__.py │ └── separable_conv.py ├── modeling │ ├── __init__.py │ ├── anchors │ │ ├── __init__.py │ │ └── prior_box.py │ ├── backbone │ │ ├── __init__.py │ │ ├── efficient_net │ │ │ ├── __init__.py │ │ │ ├── efficient_net.py │ │ │ └── utils.py │ │ ├── mobilenet.py │ │ ├── mobilenetv3.py │ │ └── vgg.py │ ├── box_head │ │ ├── __init__.py │ │ ├── box_head.py │ │ ├── box_predictor.py │ │ ├── inference.py │ │ └── loss.py │ ├── detector │ │ ├── __init__.py │ │ └── ssd_detector.py │ └── registry.py ├── solver │ ├── __init__.py │ ├── build.py │ └── lr_scheduler.py ├── structures │ ├── __init__.py │ └── container.py └── utils │ ├── __init__.py │ ├── box_utils.py │ ├── checkpoint.py │ ├── dist_util.py │ ├── logger.py │ ├── metric_logger.py │ ├── misc.py │ ├── model_zoo.py │ ├── nms.py │ └── registry.py ├── test.py └── train.py /.gitignore: -------------------------------------------------------------------------------- 1 | # compilation and distribution 2 | __pycache__ 3 | *.pyc 4 | *.so 5 | ext/build/ 6 | ext/torch_extension.egg-info/ 7 | dist/ 8 | *.egg-info 9 | 10 | # pytorch/python/numpy formats 11 | *.pth 12 | *.pkl 13 | *.npy 14 | 15 | # ipython/jupyter notebooks 16 | *.ipynb 17 | **/.ipynb_checkpoints/ 18 | 19 | # Editor temporaries 20 | *.swn 21 | *.swo 22 | *.swp 23 | *~ 24 | 25 | # Pycharm editor settings 26 | .idea 27 | .DS_Store 28 | -------------------------------------------------------------------------------- /DEVELOP_GUIDE.md: -------------------------------------------------------------------------------- 1 | # Develop Guide 2 | 3 | ## Custom Dataset 4 | Adding your custom dataset is simple and flexible. 5 | For example, create `ssd/data/datasets/my_dataset.py`: 6 | ```python 7 | import torch.utils.data 8 | import numpy as np 9 | from ssd.structures.container import Container 10 | 11 | class MyDataset(torch.utils.data.Dataset): 12 | def __init__(self, ..., transform=None, target_transform=None): 13 | # as you would do normally 14 | ... 15 | self.transform = transform 16 | self.target_transform = target_transform 17 | 18 | def __getitem__(self, index): 19 | # load the image as a PIL Image 20 | image = ... 21 | 22 | # load the bounding boxes in x1, y1, x2, y2 order.
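# (a minimal sketch of what loading might look like — `self.annotations`
# is a hypothetical per-image list of dicts, not part of this repo:)
#   anns = self.annotations[index]
#   boxes = np.array([a["bbox"] for a in anns], dtype=np.float32)
#   labels = np.array([a["label"] for a in anns], dtype=np.int64)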
23 | boxes = np.zeros((N, 4), dtype=np.float32) 24 | # and labels 25 | labels = np.zeros((N, ), dtype=np.int64) 26 | 27 | if self.transform: 28 | image, boxes, labels = self.transform(image, boxes, labels) 29 | if self.target_transform: 30 | boxes, labels = self.target_transform(boxes, labels) 31 | targets = Container( 32 | boxes=boxes, 33 | labels=labels, 34 | ) 35 | # return the image, the targets and the index in your dataset 36 | return image, targets, index 37 | ``` 38 | 39 | in `ssd/data/datasets/__init__.py` 40 | ```python 41 | from .my_dataset import MyDataset 42 | 43 | _DATASETS = { 44 | 'VOCDataset': VOCDataset, 45 | 'COCODataset': COCODataset, 46 | 'MyDataset': MyDataset, 47 | } 48 | ``` 49 | 50 | in `ssd/config/path_catlog.py`: 51 | ```python 52 | DATASETS = { 53 | ... 54 | 'my_custom_dataset': { 55 | "arg1": "your/arg", 56 | "arg2": "your/arg", 57 | }, 58 | ... 59 | } 60 | 61 | @staticmethod 62 | def get(name): 63 | ... 64 | if name == 'my_custom_dataset': 65 | attrs = DatasetCatalog.DATASETS[name] 66 | return dict(factory="MyDataset", args=attrs) 67 | ... 68 | ``` 69 | 70 | in your `config.yaml`: 71 | ```yaml 72 | DATASETS: 73 | TRAIN: ("my_custom_dataset", ) 74 | TEST: ("my_custom_test_dataset", ) 75 | ``` 76 | 77 | ### Test 78 | While the aforementioned example should work for training, it's also easy to add your custom test code: 79 | in `ssd/data/datasets/evaluation/__init__.py` 80 | ```python 81 | if isinstance(dataset, MyDataset): 82 | return my_own_evaluation(**args) 83 | ``` 84 | 85 | ## Custom Backbone 86 | 87 | It is very simple to add your own backbone for SSD. 88 | For example, create `ssd/modeling/backbone/my_backbone.py`: 89 | ```python 90 | import torch.nn as nn 91 | 92 | from ssd.modeling import registry 93 | from ssd.utils.model_zoo import load_state_dict_from_url 94 | 95 | 96 | class MyBackbone(nn.Module): 97 | def __init__(self, cfg): 98 | super().__init__() 99 | ... 100 | 101 | def forward(self, x): 102 | features = [] 103 | 104 | # forward your network 105 | 106 | # append each feature map you want to do prediction on.
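# (a sketch under assumed names — `self.stage1` ... `self.stage4` are
# hypothetical sub-modules; each appended map must match, in order, one
# entry of MODEL.BACKBONE.OUT_CHANNELS:)
#   x = self.stage1(x); feature1 = x
#   x = self.stage2(x); feature2 = x
#   x = self.stage3(x); feature3 = x
#   x = self.stage4(x); feature4 = x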
107 | 108 | features.append(feature1) 109 | features.append(feature2) 110 | features.append(feature3) 111 | features.append(feature4) 112 | 113 | # return them as a tuple 114 | return tuple(features) 115 | 116 | @registry.BACKBONES.register('my_backbone') 117 | def my_backbone(cfg, pretrained=True): 118 | model = MyBackbone(cfg) 119 | model_url = 'your_model_url' 120 | if pretrained: 121 | model.init_from_pretrain(load_state_dict_from_url(model_url)) 122 | return model 123 | ``` 124 | in `ssd/modeling/backbone/__init__.py`: 125 | ```python 126 | from .my_backbone import MyBackbone 127 | ``` 128 | 129 | in your `config.yaml`: 130 | ```yaml 131 | MODEL: 132 | BACKBONE: 133 | NAME: 'my_backbone' 134 | OUT_CHANNELS: (-, -, -, -) # should match feature1 - feature4's out_channels in MyBackbone 135 | PRIORS: 136 | FEATURE_MAPS: [-, -, -, -] # feature1 - feature4's size 137 | STRIDES: [-, -, -, -] # feature1 - feature4's output stride 138 | MIN_SIZES: [21, 45, 99, 153] # your custom anchor settings 139 | MAX_SIZES: [45, 99, 153, 207] 140 | ASPECT_RATIOS: [[2, 3], [2, 3], [2, 3], [2, 3]] 141 | BOXES_PER_LOCATION: [6, 6, 6, 6] 142 | ``` -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 lufficc 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include configs *.yaml 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # High quality, fast, modular reference implementation of SSD in PyTorch 1.0 2 | 3 | 4 | This repository implements [SSD (Single Shot MultiBox Detector)](https://arxiv.org/abs/1512.02325). The implementation is heavily influenced by the projects [ssd.pytorch](https://github.com/amdegroot/ssd.pytorch), [pytorch-ssd](https://github.com/qfgaohao/pytorch-ssd) and [maskrcnn-benchmark](https://github.com/facebookresearch/maskrcnn-benchmark). This repository aims to be the code base for research based on SSD. 5 | 6 |
7 | ![Example SSD output (vgg_ssd300_voc0712).](figures/004545.jpg) 8 | 9 |
10 | 11 | | Losses | Learning rate | Metrics | 12 | | :-----------: |:-------------:| :------:| 13 | | ![losses](figures/losses.png) | ![lr](figures/lr.png) | ![metric](figures/metrics.png) | 14 | 15 | ## Highlights 16 | 17 | - **PyTorch 1.0**: Supports PyTorch 1.0 or higher. 18 | - **Multi-GPU training and inference**: We use `DistributedDataParallel`; you can train or test with an arbitrary number of GPUs, and the training schedule will change accordingly. 19 | - **Modular**: Add your own modules without pain. We abstract `backbone`, `Detector`, `BoxHead`, `BoxPredictor`, etc., so you can replace every component with your own code without changing the code base. For example, you can add [EfficientNet](https://github.com/lukemelas/EfficientNet-PyTorch) as a backbone: just add `efficient_net.py` (already added), register it, and specify it in the config file. It's done! 20 | - **CPU support for inference**: runs on the CPU at inference time. 21 | - **Smooth and enjoyable training procedure**: we save the state of the model, optimizer, scheduler, and training iteration, so you can stop your training and resume exactly from the saved point without changing your training `CMD`. 22 | - **Batched inference**: can perform inference using multiple images per batch per GPU. 23 | - **Evaluating during training**: evaluate your model every `eval_step` iterations to check whether performance is improving. 24 | - **Metrics Visualization**: visualize metric details in TensorBoard, such as AP, APl, APm and APs for the COCO dataset, or mAP and the 20 per-category APs for the VOC dataset. 25 | - **Auto download**: load pre-trained weights from a URL and cache them. 26 | ## Installation 27 | ### Requirements 28 | 29 | 1. Python3 30 | 1. PyTorch 1.0 or higher 31 | 1. yacs 32 | 1. [Vizer](https://github.com/lufficc/Vizer) 33 | 1. GCC >= 4.9 34 | 1. OpenCV 35 | 36 | 37 | ### Step-by-step installation 38 | 39 | ```bash 40 | git clone https://github.com/lufficc/SSD.git 41 | cd SSD 42 | # Required packages: torch torchvision yacs tqdm opencv-python vizer 43 | pip install -r requirements.txt 44 | 45 | # Done! That's ALL! No BUILD! No bothering SETUP! 46 | 47 | # It's recommended to install the latest release of torch and torchvision. 48 | ``` 49 | 50 | 51 | ## Train 52 | 53 | ### Setting Up Datasets 54 | #### Pascal VOC 55 | 56 | For the Pascal VOC dataset, make the folder structure like this: 57 | ``` 58 | VOC_ROOT 59 | |__ VOC2007 60 | |_ JPEGImages 61 | |_ Annotations 62 | |_ ImageSets 63 | |_ SegmentationClass 64 | |__ VOC2012 65 | |_ JPEGImages 66 | |_ Annotations 67 | |_ ImageSets 68 | |_ SegmentationClass 69 | |__ ... 70 | ``` 71 | `VOC_ROOT` defaults to the `datasets` folder in the current project; you can create symlinks inside `datasets` or `export VOC_ROOT="/path/to/voc_root"`. 72 | 73 | #### COCO 74 | 75 | For the COCO dataset, make the folder structure like this: 76 | ``` 77 | COCO_ROOT 78 | |__ annotations 79 | |_ instances_valminusminival2014.json 80 | |_ instances_minival2014.json 81 | |_ instances_train2014.json 82 | |_ instances_val2014.json 83 | |_ ... 84 | |__ train2014 85 | |_ .jpg 86 | |_ ... 87 | |_ .jpg 88 | |__ val2014 89 | |_ .jpg 90 | |_ ... 91 | |_ .jpg 92 | |__ ... 93 | ``` 94 | `COCO_ROOT` defaults to the `datasets` folder in the current project; you can create symlinks inside `datasets` or `export COCO_ROOT="/path/to/coco_root"`.
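For example, a minimal setup sketch (the `/data/...` paths below are illustrative, not part of this repo):
```bash
# link existing datasets into the default `datasets` folder
mkdir -p datasets
ln -s /data/VOCdevkit/VOC2007 datasets/VOC2007
ln -s /data/VOCdevkit/VOC2012 datasets/VOC2012
# or point the dataset roots at the data directly
export VOC_ROOT="/data/VOCdevkit"
export COCO_ROOT="/data/coco"
```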
95 | 96 | ### Single GPU training 97 | 98 | ```bash 99 | # for example, train SSD300: 100 | python train.py --config-file configs/vgg_ssd300_voc0712.yaml 101 | ``` 102 | ### Multi-GPU training 103 | 104 | ```bash 105 | # for example, train SSD300 with 4 GPUs: 106 | export NGPUS=4 107 | python -m torch.distributed.launch --nproc_per_node=$NGPUS train.py --config-file configs/vgg_ssd300_voc0712.yaml SOLVER.WARMUP_FACTOR 0.03333 SOLVER.WARMUP_ITERS 1000 108 | ``` 109 | The provided configuration files assume that we are running on a single GPU. When changing the number of GPUs, the hyper-parameters (lr, max_iter, ...) should also change according to this paper: [Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour](https://arxiv.org/abs/1706.02677). 110 | 111 | ## Evaluate 112 | 113 | ### Single GPU evaluating 114 | 115 | ```bash 116 | # for example, evaluate SSD300: 117 | python test.py --config-file configs/vgg_ssd300_voc0712.yaml 118 | ``` 119 | 120 | ### Multi-GPU evaluating 121 | 122 | ```bash 123 | # for example, evaluate SSD300 with 4 GPUs: 124 | export NGPUS=4 125 | python -m torch.distributed.launch --nproc_per_node=$NGPUS test.py --config-file configs/vgg_ssd300_voc0712.yaml 126 | ``` 127 | 128 | ## Demo 129 | 130 | Predicting images in a folder is simple: 131 | ```bash 132 | python demo.py --config-file configs/vgg_ssd300_voc0712.yaml --images_dir demo --ckpt https://github.com/lufficc/SSD/releases/download/1.2/vgg_ssd300_voc0712.pth 133 | ``` 134 | It will download and cache `vgg_ssd300_voc0712.pth` automatically, and the predicted images with boxes, scores and label names will be saved to the `demo/result` folder by default. 135 | 136 | You will see a similar output: 137 | ```text 138 | (0001/0005) 004101.jpg: objects 01 | load 010ms | inference 033ms | FPS 31 139 | (0002/0005) 003123.jpg: objects 05 | load 009ms | inference 019ms | FPS 53 140 | (0003/0005) 000342.jpg: objects 02 | load 009ms | inference 019ms | FPS 51 141 | (0004/0005) 008591.jpg: objects 02 | load 008ms | inference 020ms | FPS 50 142 | (0005/0005) 000542.jpg: objects 01 | load 011ms | inference 019ms | FPS 53 143 | ``` 144 | 145 | ## MODEL ZOO 146 | ### Original Paper: 147 | 148 | | | VOC2007 test | coco test-dev2015 | 149 | | :-----: | :----------: | :----------: | 150 | | SSD300* | 77.2 | 25.1 | 151 | | SSD512* | 79.8 | 28.8 | 152 | 153 | ### COCO: 154 | 155 | | Backbone | Input Size | box AP | Model Size | Download | 156 | | :------------: | :----------:| :--------------------------: | :--------: | :-------: | 157 | | VGG16 | 300 | 25.2 | 262MB | [model](https://github.com/lufficc/SSD/releases/download/1.2/vgg_ssd300_coco_trainval35k.pth) | 158 | | VGG16 | 512 | 29.0 | 275MB | [model](https://github.com/lufficc/SSD/releases/download/1.2/vgg_ssd512_coco_trainval35k.pth) | 159 | 160 | ### PASCAL VOC: 161 | 162 | | Backbone | Input Size | mAP | Model Size | Download | 163 | | :--------------: | :----------:| :--------------------------: | :--------: | :-------: | 164 | | VGG16 | 300 | 77.7 | 201MB | [model](https://github.com/lufficc/SSD/releases/download/1.2/vgg_ssd300_voc0712.pth) | 165 | | VGG16 | 512 | 80.7 | 207MB | [model](https://github.com/lufficc/SSD/releases/download/1.2/vgg_ssd512_voc0712.pth) | 166 | | Mobilenet V2 | 320 | 68.9 | 25.5MB | [model](https://github.com/lufficc/SSD/releases/download/1.2/mobilenet_v2_ssd320_voc0712_v2.pth) | 167 | | Mobilenet V3 | 320 | 69.5 | 29.9MB | [model](https://github.com/lufficc/SSD/releases/download/1.2/mobilenet_v3_ssd320_voc0712.pth) | 168 | | EfficientNet-B3 | 300 | 
73.9 | 97.1MB | [model](https://github.com/lufficc/SSD/releases/download/1.2/efficient_net_b3_ssd300_voc0712.pth) | 169 | 170 | ## Develop Guide 171 | 172 | If you want to add your custom components, please see [DEVELOP_GUIDE.md](DEVELOP_GUIDE.md) for more details. 173 | 174 | 175 | ## Troubleshooting 176 | If you have issues running or compiling this code, we have compiled a list of common issues in [TROUBLESHOOTING.md](TROUBLESHOOTING.md). If your issue is not present there, please feel free to open a new issue. 177 | 178 | ## Citations 179 | If you use this project in your research, please cite it. 180 | ```text 181 | @misc{lufficc2018ssd, 182 | author = {Congcong Li}, 183 | title = {{High quality, fast, modular reference implementation of SSD in PyTorch}}, 184 | year = {2018}, 185 | howpublished = {\url{https://github.com/lufficc/SSD}} 186 | } 187 | ``` -------------------------------------------------------------------------------- /TROUBLESHOOTING.md: -------------------------------------------------------------------------------- 1 | # Troubleshooting 2 | 3 | ## RuntimeError: merge_sort: failed to synchronize: an illegal memory access was encountered 4 | 5 | This occurs in the multi-box loss: the sort fails because of NaN values. It may be a bug in `log_softmax`: https://github.com/pytorch/pytorch/issues/14335. Three ways to solve it: 6 | 1. Use a smaller warmup factor, like 0.1 (append `SOLVER.WARMUP_FACTOR 0.1` to the end of your train cmd). 7 | 1. Use longer warmup iterations, like 1000 (append `SOLVER.WARMUP_ITERS 1000` to the end of your train cmd). 8 | 1. Try the workaround [described in the forums by Jinserk Baik](https://discuss.pytorch.org/t/ctcloss-performance-of-pytorch-1-0-0/27524/29) -------------------------------------------------------------------------------- /configs/efficient_net_b3_ssd300_voc0712.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | NUM_CLASSES: 21 3 | BACKBONE: 4 | NAME: 'efficient_net-b3' 5 | OUT_CHANNELS: (48, 136, 384, 256, 256, 256) 6 | INPUT: 7 | IMAGE_SIZE: 300 8 | DATASETS: 9 | TRAIN: ("voc_2007_trainval", "voc_2012_trainval") 10 | TEST: ("voc_2007_test", ) 11 | SOLVER: 12 | MAX_ITER: 160000 13 | LR_STEPS: [105000, 135000] 14 | GAMMA: 0.1 15 | BATCH_SIZE: 24 16 | LR: 1e-3 17 | 18 | OUTPUT_DIR: 'outputs/efficient_net_b3_ssd300_voc0712' -------------------------------------------------------------------------------- /configs/mobilenet_v2_ssd320_voc0712.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | NUM_CLASSES: 21 3 | BOX_HEAD: 4 | PREDICTOR: 'SSDLiteBoxPredictor' 5 | BACKBONE: 6 | NAME: 'mobilenet_v2' 7 | OUT_CHANNELS: (96, 1280, 512, 256, 256, 64) 8 | PRIORS: 9 | FEATURE_MAPS: [20, 10, 5, 3, 2, 1] 10 | STRIDES: [16, 32, 64, 107, 160, 320] 11 | MIN_SIZES: [60, 105, 150, 195, 240, 285] 12 | MAX_SIZES: [105, 150, 195, 240, 285, 330] 13 | ASPECT_RATIOS: [[2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3]] 14 | BOXES_PER_LOCATION: [6, 6, 6, 6, 6, 6] 15 | INPUT: 16 | IMAGE_SIZE: 320 17 | DATASETS: 18 | TRAIN: ("voc_2007_trainval", "voc_2012_trainval") 19 | TEST: ("voc_2007_test", ) 20 | SOLVER: 21 | MAX_ITER: 120000 22 | LR_STEPS: [80000, 100000] 23 | GAMMA: 0.1 24 | BATCH_SIZE: 32 25 | LR: 1e-3 26 | 27 | OUTPUT_DIR: 'outputs/mobilenet_v2_ssd320_voc0712' -------------------------------------------------------------------------------- /configs/mobilenet_v3_ssd320_voc0712.yaml: -------------------------------------------------------------------------------- 1 | 
MODEL: 2 | NUM_CLASSES: 21 3 | BOX_HEAD: 4 | PREDICTOR: 'SSDLiteBoxPredictor' 5 | BACKBONE: 6 | NAME: 'mobilenet_v3' 7 | OUT_CHANNELS: (112, 960, 512, 256, 256, 64) 8 | PRIORS: 9 | FEATURE_MAPS: [20, 10, 5, 3, 2, 1] 10 | STRIDES: [16, 32, 64, 107, 160, 320] 11 | MIN_SIZES: [60, 105, 150, 195, 240, 285] 12 | MAX_SIZES: [105, 150, 195, 240, 285, 330] 13 | ASPECT_RATIOS: [[2, 3], [2, 3], [2, 3], [2, 3], [2, 3], [2, 3]] 14 | BOXES_PER_LOCATION: [6, 6, 6, 6, 6, 6] 15 | INPUT: 16 | IMAGE_SIZE: 320 17 | DATASETS: 18 | TRAIN: ("voc_2007_trainval", "voc_2012_trainval") 19 | TEST: ("voc_2007_test", ) 20 | SOLVER: 21 | MAX_ITER: 120000 22 | LR_STEPS: [80000, 100000] 23 | GAMMA: 0.1 24 | BATCH_SIZE: 32 25 | LR: 1e-3 26 | 27 | OUTPUT_DIR: 'outputs/mobilenet_v3_ssd320_voc0712' 28 | -------------------------------------------------------------------------------- /configs/vgg_ssd300_coco_trainval35k.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | NUM_CLASSES: 81 3 | PRIORS: 4 | FEATURE_MAPS: [38, 19, 10, 5, 3, 1] 5 | STRIDES: [8, 16, 32, 64, 100, 300] 6 | MIN_SIZES: [21, 45, 99, 153, 207, 261] 7 | MAX_SIZES: [45, 99, 153, 207, 261, 315] 8 | ASPECT_RATIOS: [[2], [2, 3], [2, 3], [2, 3], [2], [2]] 9 | BOXES_PER_LOCATION: [4, 6, 6, 6, 4, 4] 10 | INPUT: 11 | IMAGE_SIZE: 300 12 | DATASETS: 13 | TRAIN: ("coco_2014_train", "coco_2014_valminusminival") 14 | TEST: ("coco_2014_minival", ) 15 | SOLVER: 16 | MAX_ITER: 400000 17 | LR_STEPS: [280000, 360000] 18 | GAMMA: 0.1 19 | BATCH_SIZE: 32 20 | LR: 1e-3 21 | 22 | OUTPUT_DIR: 'outputs/vgg_ssd300_coco_trainval35k' -------------------------------------------------------------------------------- /configs/vgg_ssd300_voc0712.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | NUM_CLASSES: 21 3 | INPUT: 4 | IMAGE_SIZE: 300 5 | DATASETS: 6 | TRAIN: ("voc_2007_trainval", "voc_2012_trainval") 7 | TEST: ("voc_2007_test", ) 8 | SOLVER: 9 | MAX_ITER: 120000 10 | LR_STEPS: [80000, 100000] 11 | GAMMA: 0.1 12 | BATCH_SIZE: 32 13 | LR: 1e-3 14 | 15 | OUTPUT_DIR: 'outputs/vgg_ssd300_voc0712' -------------------------------------------------------------------------------- /configs/vgg_ssd512_coco_trainval35k.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | NUM_CLASSES: 81 3 | BACKBONE: 4 | OUT_CHANNELS: (512, 1024, 512, 256, 256, 256, 256) 5 | PRIORS: 6 | FEATURE_MAPS: [64, 32, 16, 8, 4, 2, 1] 7 | STRIDES: [8, 16, 32, 64, 128, 256, 512] 8 | MIN_SIZES: [20.48, 51.2, 133.12, 215.04, 296.96, 378.88, 460.8] 9 | MAX_SIZES: [51.2, 133.12, 215.04, 296.96, 378.88, 460.8, 542.72] 10 | ASPECT_RATIOS: [[2], [2, 3], [2, 3], [2, 3], [2, 3], [2], [2]] 11 | BOXES_PER_LOCATION: [4, 6, 6, 6, 6, 4, 4] 12 | INPUT: 13 | IMAGE_SIZE: 512 14 | DATASETS: 15 | TRAIN: ("coco_2014_train", "coco_2014_valminusminival") 16 | TEST: ("coco_2014_minival", ) 17 | SOLVER: 18 | MAX_ITER: 520000 19 | LR_STEPS: [360000, 480000] 20 | GAMMA: 0.1 21 | BATCH_SIZE: 24 22 | LR: 1e-3 23 | 24 | OUTPUT_DIR: 'outputs/vgg_ssd512_coco_trainval35k' -------------------------------------------------------------------------------- /configs/vgg_ssd512_voc0712.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | NUM_CLASSES: 21 3 | BACKBONE: 4 | OUT_CHANNELS: (512, 1024, 512, 256, 256, 256, 256) 5 | PRIORS: 6 | FEATURE_MAPS: [64, 32, 16, 8, 4, 2, 1] 7 | STRIDES: [8, 16, 32, 64, 128, 256, 512] 8 | MIN_SIZES: [35.84, 76.8, 153.6, 230.4, 
307.2, 384.0, 460.8] 9 | MAX_SIZES: [76.8, 153.6, 230.4, 307.2, 384.0, 460.8, 537.65] 10 | ASPECT_RATIOS: [[2], [2, 3], [2, 3], [2, 3], [2, 3], [2], [2]] 11 | BOXES_PER_LOCATION: [4, 6, 6, 6, 6, 4, 4] 12 | INPUT: 13 | IMAGE_SIZE: 512 14 | DATASETS: 15 | TRAIN: ("voc_2007_trainval", "voc_2012_trainval") 16 | TEST: ("voc_2007_test", ) 17 | SOLVER: 18 | MAX_ITER: 120000 19 | LR_STEPS: [80000, 100000] 20 | GAMMA: 0.1 21 | BATCH_SIZE: 24 22 | LR: 1e-3 23 | 24 | OUTPUT_DIR: 'outputs/vgg_ssd512_voc0712' -------------------------------------------------------------------------------- /demo.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import time 4 | 5 | import torch 6 | from PIL import Image 7 | from vizer.draw import draw_boxes 8 | 9 | from ssd.config import cfg 10 | from ssd.data.datasets import COCODataset, VOCDataset 11 | import argparse 12 | import numpy as np 13 | 14 | from ssd.data.transforms import build_transforms 15 | from ssd.modeling.detector import build_detection_model 16 | from ssd.utils import mkdir 17 | from ssd.utils.checkpoint import CheckPointer 18 | 19 | 20 | @torch.no_grad() 21 | def run_demo(cfg, ckpt, score_threshold, images_dir, output_dir, dataset_type): 22 | if dataset_type == "voc": 23 | class_names = VOCDataset.class_names 24 | elif dataset_type == 'coco': 25 | class_names = COCODataset.class_names 26 | else: 27 | raise NotImplementedError('Not implemented now.') 28 | device = torch.device(cfg.MODEL.DEVICE) 29 | 30 | model = build_detection_model(cfg) 31 | model = model.to(device) 32 | checkpointer = CheckPointer(model, save_dir=cfg.OUTPUT_DIR) 33 | checkpointer.load(ckpt, use_latest=ckpt is None) 34 | weight_file = ckpt if ckpt else checkpointer.get_checkpoint_file() 35 | print('Loaded weights from {}'.format(weight_file)) 36 | 37 | image_paths = glob.glob(os.path.join(images_dir, '*.jpg')) 38 | mkdir(output_dir) 39 | 40 | cpu_device = torch.device("cpu") 41 | transforms = build_transforms(cfg, is_train=False) 42 | model.eval() 43 | for i, image_path in enumerate(image_paths): 44 | start = time.time() 45 | image_name = os.path.basename(image_path) 46 | 47 | image = np.array(Image.open(image_path).convert("RGB")) 48 | height, width = image.shape[:2] 49 | images = transforms(image)[0].unsqueeze(0) 50 | load_time = time.time() - start 51 | 52 | start = time.time() 53 | result = model(images.to(device))[0] 54 | inference_time = time.time() - start 55 | 56 | result = result.resize((width, height)).to(cpu_device).numpy() 57 | boxes, labels, scores = result['boxes'], result['labels'], result['scores'] 58 | 59 | indices = scores > score_threshold 60 | boxes = boxes[indices] 61 | labels = labels[indices] 62 | scores = scores[indices] 63 | meters = ' | '.join( 64 | [ 65 | 'objects {:02d}'.format(len(boxes)), 66 | 'load {:03d}ms'.format(round(load_time * 1000)), 67 | 'inference {:03d}ms'.format(round(inference_time * 1000)), 68 | 'FPS {}'.format(round(1.0 / inference_time)) 69 | ] 70 | ) 71 | print('({:04d}/{:04d}) {}: {}'.format(i + 1, len(image_paths), image_name, meters)) 72 | 73 | drawn_image = draw_boxes(image, boxes, labels, scores, class_names).astype(np.uint8) 74 | Image.fromarray(drawn_image).save(os.path.join(output_dir, image_name)) 75 | 76 | 77 | def main(): 78 | parser = argparse.ArgumentParser(description="SSD Demo.") 79 | parser.add_argument( 80 | "--config-file", 81 | default="", 82 | metavar="FILE", 83 | help="path to config file", 84 | type=str, 85 | ) 86 | parser.add_argument("--ckpt", 
type=str, default=None, help="Trained weights.") 87 | parser.add_argument("--score_threshold", type=float, default=0.7) 88 | parser.add_argument("--images_dir", default='demo', type=str, help='Specify an image dir to run prediction on.') 89 | parser.add_argument("--output_dir", default='demo/result', type=str, help='Specify a dir to save the predicted images.') 90 | parser.add_argument("--dataset_type", default="voc", type=str, help='Specify the dataset type. Currently supports voc and coco.') 91 | 92 | parser.add_argument( 93 | "opts", 94 | help="Modify config options using the command-line", 95 | default=None, 96 | nargs=argparse.REMAINDER, 97 | ) 98 | args = parser.parse_args() 99 | print(args) 100 | 101 | cfg.merge_from_file(args.config_file) 102 | cfg.merge_from_list(args.opts) 103 | cfg.freeze() 104 | 105 | print("Loaded configuration file {}".format(args.config_file)) 106 | with open(args.config_file, "r") as cf: 107 | config_str = "\n" + cf.read() 108 | print(config_str) 109 | print("Running with config:\n{}".format(cfg)) 110 | 111 | run_demo(cfg=cfg, 112 | ckpt=args.ckpt, 113 | score_threshold=args.score_threshold, 114 | images_dir=args.images_dir, 115 | output_dir=args.output_dir, 116 | dataset_type=args.dataset_type) 117 | 118 | 119 | if __name__ == '__main__': 120 | main() 121 | -------------------------------------------------------------------------------- /demo/000342.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lufficc/SSD/68dc0a20efaf3997e58b616afaaaa21bf8ca3c05/demo/000342.jpg -------------------------------------------------------------------------------- /demo/000542.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lufficc/SSD/68dc0a20efaf3997e58b616afaaaa21bf8ca3c05/demo/000542.jpg -------------------------------------------------------------------------------- /demo/003123.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lufficc/SSD/68dc0a20efaf3997e58b616afaaaa21bf8ca3c05/demo/003123.jpg -------------------------------------------------------------------------------- /demo/004101.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lufficc/SSD/68dc0a20efaf3997e58b616afaaaa21bf8ca3c05/demo/004101.jpg -------------------------------------------------------------------------------- /demo/008591.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lufficc/SSD/68dc0a20efaf3997e58b616afaaaa21bf8ca3c05/demo/008591.jpg -------------------------------------------------------------------------------- /figures/004545.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lufficc/SSD/68dc0a20efaf3997e58b616afaaaa21bf8ca3c05/figures/004545.jpg -------------------------------------------------------------------------------- /figures/losses.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lufficc/SSD/68dc0a20efaf3997e58b616afaaaa21bf8ca3c05/figures/losses.png -------------------------------------------------------------------------------- /figures/lr.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lufficc/SSD/68dc0a20efaf3997e58b616afaaaa21bf8ca3c05/figures/lr.png -------------------------------------------------------------------------------- /figures/metrics.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lufficc/SSD/68dc0a20efaf3997e58b616afaaaa21bf8ca3c05/figures/metrics.png -------------------------------------------------------------------------------- /outputs/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lufficc/SSD/68dc0a20efaf3997e58b616afaaaa21bf8ca3c05/outputs/.gitignore -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=1.3 2 | torchvision>=0.3 3 | yacs 4 | tqdm 5 | opencv-python 6 | vizer -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | with open("README.md", "r") as fh: 4 | long_description = fh.read() 5 | 6 | setup( 7 | name="torch-ssd", 8 | version="1.2.0", 9 | packages=find_packages(exclude=['ext']), 10 | install_requires=[ 11 | "torch>=1.3", 12 | "torchvision>=0.3", 13 | "opencv-python~=4.0", 14 | "yacs==0.1.6", 15 | "Vizer~=0.1.4", 16 | ], 17 | author="Congcong Li", 18 | author_email="luffy.lcc@gmail.com", 19 | description="High quality, fast, modular reference implementation of SSD in PyTorch", 20 | long_description=long_description, 21 | long_description_content_type="text/markdown", 22 | url="https://github.com/lufficc/SSD", 23 | classifiers=[ 24 | "Programming Language :: Python :: 3", 25 | "License :: OSI Approved :: MIT License", 26 | "Operating System :: OS Independent", 27 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 28 | ], 29 | license="MIT", 30 | python_requires=">=3.6", 31 | include_package_data=True, 32 | ) 33 | -------------------------------------------------------------------------------- /ssd/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lufficc/SSD/68dc0a20efaf3997e58b616afaaaa21bf8ca3c05/ssd/__init__.py -------------------------------------------------------------------------------- /ssd/config/__init__.py: -------------------------------------------------------------------------------- 1 | from .defaults import _C as cfg 2 | -------------------------------------------------------------------------------- /ssd/config/defaults.py: -------------------------------------------------------------------------------- 1 | from yacs.config import CfgNode as CN 2 | 3 | _C = CN() 4 | 5 | _C.MODEL = CN() 6 | _C.MODEL.META_ARCHITECTURE = 'SSDDetector' 7 | _C.MODEL.DEVICE = "cuda" 8 | # match default boxes to any ground truth with jaccard overlap higher than a threshold (0.5) 9 | _C.MODEL.THRESHOLD = 0.5 10 | _C.MODEL.NUM_CLASSES = 21 11 | # Hard negative mining 12 | _C.MODEL.NEG_POS_RATIO = 3 13 | _C.MODEL.CENTER_VARIANCE = 0.1 14 | _C.MODEL.SIZE_VARIANCE = 0.2 15 | 16 | # ---------------------------------------------------------------------------- # 17 | # Backbone 18 | # ---------------------------------------------------------------------------- # 19 | _C.MODEL.BACKBONE = CN() 20 | _C.MODEL.BACKBONE.NAME = 'vgg' 21 | _C.MODEL.BACKBONE.OUT_CHANNELS = (512, 
1024, 512, 256, 256, 256) 22 | _C.MODEL.BACKBONE.PRETRAINED = True 23 | 24 | # ----------------------------------------------------------------------------- 25 | # PRIORS 26 | # ----------------------------------------------------------------------------- 27 | _C.MODEL.PRIORS = CN() 28 | _C.MODEL.PRIORS.FEATURE_MAPS = [38, 19, 10, 5, 3, 1] 29 | _C.MODEL.PRIORS.STRIDES = [8, 16, 32, 64, 100, 300] 30 | _C.MODEL.PRIORS.MIN_SIZES = [30, 60, 111, 162, 213, 264] 31 | _C.MODEL.PRIORS.MAX_SIZES = [60, 111, 162, 213, 264, 315] 32 | _C.MODEL.PRIORS.ASPECT_RATIOS = [[2], [2, 3], [2, 3], [2, 3], [2], [2]] 33 | # With 1 extra aspect ratio, every location has 4 boxes; with 2 ratios, 6 boxes. 34 | # #boxes = 2 + #ratios * 2 35 | _C.MODEL.PRIORS.BOXES_PER_LOCATION = [4, 6, 6, 6, 4, 4] # number of boxes per feature map location 36 | _C.MODEL.PRIORS.CLIP = True 37 | 38 | # ----------------------------------------------------------------------------- 39 | # Box Head 40 | # ----------------------------------------------------------------------------- 41 | _C.MODEL.BOX_HEAD = CN() 42 | _C.MODEL.BOX_HEAD.NAME = 'SSDBoxHead' 43 | _C.MODEL.BOX_HEAD.PREDICTOR = 'SSDBoxPredictor' 44 | 45 | # ----------------------------------------------------------------------------- 46 | # INPUT 47 | # ----------------------------------------------------------------------------- 48 | _C.INPUT = CN() 49 | # Image size 50 | _C.INPUT.IMAGE_SIZE = 300 51 | # Values to be used for image normalization, RGB layout 52 | _C.INPUT.PIXEL_MEAN = [123, 117, 104] 53 | 54 | # ----------------------------------------------------------------------------- 55 | # Dataset 56 | # ----------------------------------------------------------------------------- 57 | _C.DATASETS = CN() 58 | # List of the dataset names for training, as present in path_catlog.py 59 | _C.DATASETS.TRAIN = () 60 | # List of the dataset names for testing, as present in path_catlog.py 61 | _C.DATASETS.TEST = () 62 | 63 | # ----------------------------------------------------------------------------- 64 | # DataLoader 65 | # ----------------------------------------------------------------------------- 66 | _C.DATA_LOADER = CN() 67 | # Number of data loading threads 68 | _C.DATA_LOADER.NUM_WORKERS = 8 69 | _C.DATA_LOADER.PIN_MEMORY = True 70 | 71 | # ---------------------------------------------------------------------------- # 72 | # Solver 73 | # ---------------------------------------------------------------------------- # 74 | _C.SOLVER = CN() 75 | # train configs 76 | _C.SOLVER.MAX_ITER = 120000 77 | _C.SOLVER.LR_STEPS = [80000, 100000] 78 | _C.SOLVER.GAMMA = 0.1 79 | _C.SOLVER.BATCH_SIZE = 32 80 | _C.SOLVER.LR = 1e-3 81 | _C.SOLVER.MOMENTUM = 0.9 82 | _C.SOLVER.WEIGHT_DECAY = 5e-4 83 | _C.SOLVER.WARMUP_FACTOR = 1.0 / 3 84 | _C.SOLVER.WARMUP_ITERS = 500 85 | 86 | # ---------------------------------------------------------------------------- # 87 | # Specific test options 88 | # ---------------------------------------------------------------------------- # 89 | _C.TEST = CN() 90 | _C.TEST.NMS_THRESHOLD = 0.45 91 | _C.TEST.CONFIDENCE_THRESHOLD = 0.01 92 | _C.TEST.MAX_PER_CLASS = -1 93 | _C.TEST.MAX_PER_IMAGE = 100 94 | _C.TEST.BATCH_SIZE = 10 95 | 96 | _C.OUTPUT_DIR = 'outputs' 97 | -------------------------------------------------------------------------------- /ssd/config/path_catlog.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | class DatasetCatalog: 5 | DATA_DIR = 'datasets' 6 | DATASETS = { 7 | 'voc_2007_train': { 8 
| "data_dir": "VOC2007", 9 | "split": "train" 10 | }, 11 | 'voc_2007_val': { 12 | "data_dir": "VOC2007", 13 | "split": "val" 14 | }, 15 | 'voc_2007_trainval': { 16 | "data_dir": "VOC2007", 17 | "split": "trainval" 18 | }, 19 | 'voc_2007_test': { 20 | "data_dir": "VOC2007", 21 | "split": "test" 22 | }, 23 | 'voc_2012_train': { 24 | "data_dir": "VOC2012", 25 | "split": "train" 26 | }, 27 | 'voc_2012_val': { 28 | "data_dir": "VOC2012", 29 | "split": "val" 30 | }, 31 | 'voc_2012_trainval': { 32 | "data_dir": "VOC2012", 33 | "split": "trainval" 34 | }, 35 | 'voc_2012_test': { 36 | "data_dir": "VOC2012", 37 | "split": "test" 38 | }, 39 | 'coco_2014_valminusminival': { 40 | "data_dir": "val2014", 41 | "ann_file": "annotations/instances_valminusminival2014.json" 42 | }, 43 | 'coco_2014_minival': { 44 | "data_dir": "val2014", 45 | "ann_file": "annotations/instances_minival2014.json" 46 | }, 47 | 'coco_2014_train': { 48 | "data_dir": "train2014", 49 | "ann_file": "annotations/instances_train2014.json" 50 | }, 51 | 'coco_2014_val': { 52 | "data_dir": "val2014", 53 | "ann_file": "annotations/instances_val2014.json" 54 | }, 55 | } 56 | 57 | @staticmethod 58 | def get(name): 59 | if "voc" in name: 60 | voc_root = DatasetCatalog.DATA_DIR 61 | if 'VOC_ROOT' in os.environ: 62 | voc_root = os.environ['VOC_ROOT'] 63 | 64 | attrs = DatasetCatalog.DATASETS[name] 65 | args = dict( 66 | data_dir=os.path.join(voc_root, attrs["data_dir"]), 67 | split=attrs["split"], 68 | ) 69 | return dict(factory="VOCDataset", args=args) 70 | elif "coco" in name: 71 | coco_root = DatasetCatalog.DATA_DIR 72 | if 'COCO_ROOT' in os.environ: 73 | coco_root = os.environ['COCO_ROOT'] 74 | 75 | attrs = DatasetCatalog.DATASETS[name] 76 | args = dict( 77 | data_dir=os.path.join(coco_root, attrs["data_dir"]), 78 | ann_file=os.path.join(coco_root, attrs["ann_file"]), 79 | ) 80 | return dict(factory="COCODataset", args=args) 81 | 82 | raise RuntimeError("Dataset not available: {}".format(name)) 83 | -------------------------------------------------------------------------------- /ssd/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lufficc/SSD/68dc0a20efaf3997e58b616afaaaa21bf8ca3c05/ssd/data/__init__.py -------------------------------------------------------------------------------- /ssd/data/build.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.utils.data import DataLoader 3 | from torch.utils.data.dataloader import default_collate 4 | 5 | from ssd.data import samplers 6 | from ssd.data.datasets import build_dataset 7 | from ssd.data.transforms import build_transforms, build_target_transform 8 | from ssd.structures.container import Container 9 | 10 | 11 | class BatchCollator: 12 | def __init__(self, is_train=True): 13 | self.is_train = is_train 14 | 15 | def __call__(self, batch): 16 | transposed_batch = list(zip(*batch)) 17 | images = default_collate(transposed_batch[0]) 18 | img_ids = default_collate(transposed_batch[2]) 19 | 20 | if self.is_train: 21 | list_targets = transposed_batch[1] 22 | targets = Container( 23 | {key: default_collate([d[key] for d in list_targets]) for key in list_targets[0]} 24 | ) 25 | else: 26 | targets = None 27 | return images, targets, img_ids 28 | 29 | 30 | def make_data_loader(cfg, is_train=True, distributed=False, max_iter=None, start_iter=0): 31 | train_transform = build_transforms(cfg, is_train=is_train) 32 | target_transform = 
build_target_transform(cfg) if is_train else None 33 | dataset_list = cfg.DATASETS.TRAIN if is_train else cfg.DATASETS.TEST 34 | datasets = build_dataset(dataset_list, transform=train_transform, target_transform=target_transform, is_train=is_train) 35 | 36 | shuffle = is_train 37 | 38 | data_loaders = [] 39 | 40 | for dataset in datasets: 41 | if distributed: 42 | sampler = samplers.DistributedSampler(dataset, shuffle=shuffle) 43 | elif shuffle: 44 | sampler = torch.utils.data.RandomSampler(dataset) 45 | else: 46 | sampler = torch.utils.data.sampler.SequentialSampler(dataset) 47 | 48 | batch_size = cfg.SOLVER.BATCH_SIZE if is_train else cfg.TEST.BATCH_SIZE 49 | batch_sampler = torch.utils.data.sampler.BatchSampler(sampler=sampler, batch_size=batch_size, drop_last=False) 50 | if max_iter is not None: 51 | batch_sampler = samplers.IterationBasedBatchSampler(batch_sampler, num_iterations=max_iter, start_iter=start_iter) 52 | 53 | data_loader = DataLoader(dataset, num_workers=cfg.DATA_LOADER.NUM_WORKERS, batch_sampler=batch_sampler, 54 | pin_memory=cfg.DATA_LOADER.PIN_MEMORY, collate_fn=BatchCollator(is_train)) 55 | data_loaders.append(data_loader) 56 | 57 | if is_train: 58 | # during training, a single (possibly concatenated) data_loader is returned 59 | assert len(data_loaders) == 1 60 | return data_loaders[0] 61 | return data_loaders 62 | -------------------------------------------------------------------------------- /ssd/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import ConcatDataset 2 | 3 | from ssd.config.path_catlog import DatasetCatalog 4 | from .voc import VOCDataset 5 | from .coco import COCODataset 6 | 7 | _DATASETS = { 8 | 'VOCDataset': VOCDataset, 9 | 'COCODataset': COCODataset, 10 | } 11 | 12 | 13 | def build_dataset(dataset_list, transform=None, target_transform=None, is_train=True): 14 | assert len(dataset_list) > 0 15 | datasets = [] 16 | for dataset_name in dataset_list: 17 | data = DatasetCatalog.get(dataset_name) 18 | args = data['args'] 19 | factory = _DATASETS[data['factory']] 20 | args['transform'] = transform 21 | args['target_transform'] = target_transform 22 | if factory == VOCDataset: 23 | args['keep_difficult'] = not is_train 24 | elif factory == COCODataset: 25 | args['remove_empty'] = is_train 26 | dataset = factory(**args) 27 | datasets.append(dataset) 28 | # for testing, return a list of datasets 29 | if not is_train: 30 | return datasets 31 | dataset = datasets[0] 32 | if len(datasets) > 1: 33 | dataset = ConcatDataset(datasets) 34 | 35 | return [dataset] 36 | -------------------------------------------------------------------------------- /ssd/data/datasets/coco.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch.utils.data 3 | import numpy as np 4 | from PIL import Image 5 | 6 | from ssd.structures.container import Container 7 | 8 | 9 | class COCODataset(torch.utils.data.Dataset): 10 | class_names = ('__background__', 11 | 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 12 | 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 13 | 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 14 | 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 15 | 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 16 | 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 17 | 'kite', 'baseball bat', 'baseball glove', 'skateboard', 18 | 'surfboard', 'tennis racket', 'bottle', 'wine glass', 
'cup', 19 | 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 20 | 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 21 | 'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 22 | 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 23 | 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 24 | 'refrigerator', 'book', 'clock', 'vase', 'scissors', 25 | 'teddy bear', 'hair drier', 'toothbrush') 26 | 27 | def __init__(self, data_dir, ann_file, transform=None, target_transform=None, remove_empty=False): 28 | from pycocotools.coco import COCO 29 | self.coco = COCO(ann_file) 30 | self.data_dir = data_dir 31 | self.transform = transform 32 | self.target_transform = target_transform 33 | self.remove_empty = remove_empty 34 | if self.remove_empty: 35 | # when training, images without annotations are removed. 36 | self.ids = list(self.coco.imgToAnns.keys()) 37 | else: 38 | # when testing, all images used. 39 | self.ids = list(self.coco.imgs.keys()) 40 | coco_categories = sorted(self.coco.getCatIds()) 41 | self.coco_id_to_contiguous_id = {coco_id: i + 1 for i, coco_id in enumerate(coco_categories)} 42 | self.contiguous_id_to_coco_id = {v: k for k, v in self.coco_id_to_contiguous_id.items()} 43 | 44 | def __getitem__(self, index): 45 | image_id = self.ids[index] 46 | boxes, labels = self._get_annotation(image_id) 47 | image = self._read_image(image_id) 48 | if self.transform: 49 | image, boxes, labels = self.transform(image, boxes, labels) 50 | if self.target_transform: 51 | boxes, labels = self.target_transform(boxes, labels) 52 | targets = Container( 53 | boxes=boxes, 54 | labels=labels, 55 | ) 56 | return image, targets, index 57 | 58 | def get_annotation(self, index): 59 | image_id = self.ids[index] 60 | return image_id, self._get_annotation(image_id) 61 | 62 | def __len__(self): 63 | return len(self.ids) 64 | 65 | def _get_annotation(self, image_id): 66 | ann_ids = self.coco.getAnnIds(imgIds=image_id) 67 | ann = self.coco.loadAnns(ann_ids) 68 | # filter crowd annotations 69 | ann = [obj for obj in ann if obj["iscrowd"] == 0] 70 | boxes = np.array([self._xywh2xyxy(obj["bbox"]) for obj in ann], np.float32).reshape((-1, 4)) 71 | labels = np.array([self.coco_id_to_contiguous_id[obj["category_id"]] for obj in ann], np.int64).reshape((-1,)) 72 | # remove invalid boxes 73 | keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) 74 | boxes = boxes[keep] 75 | labels = labels[keep] 76 | return boxes, labels 77 | 78 | def _xywh2xyxy(self, box): 79 | x1, y1, w, h = box 80 | return [x1, y1, x1 + w, y1 + h] 81 | 82 | def get_img_info(self, index): 83 | image_id = self.ids[index] 84 | img_data = self.coco.imgs[image_id] 85 | return img_data 86 | 87 | def _read_image(self, image_id): 88 | file_name = self.coco.loadImgs(image_id)[0]['file_name'] 89 | image_file = os.path.join(self.data_dir, file_name) 90 | image = Image.open(image_file).convert("RGB") 91 | image = np.array(image) 92 | return image 93 | -------------------------------------------------------------------------------- /ssd/data/datasets/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from ssd.data.datasets import VOCDataset, COCODataset 2 | from .coco import coco_evaluation 3 | from .voc import voc_evaluation 4 | 5 | 6 | def evaluate(dataset, predictions, output_dir, **kwargs): 7 | """evaluate dataset using different methods based on dataset type. 
8 | Args: 9 | dataset: Dataset object 10 | predictions(list[(boxes, labels, scores)]): Each item in the list represents the 11 | prediction results for one image. And the index should match the dataset index. 12 | output_dir: output folder, to save evaluation files or results. 13 | Returns: 14 | evaluation result 15 | """ 16 | args = dict( 17 | dataset=dataset, predictions=predictions, output_dir=output_dir, **kwargs, 18 | ) 19 | if isinstance(dataset, VOCDataset): 20 | return voc_evaluation(**args) 21 | elif isinstance(dataset, COCODataset): 22 | return coco_evaluation(**args) 23 | else: 24 | raise NotImplementedError 25 | -------------------------------------------------------------------------------- /ssd/data/datasets/evaluation/coco/__init__.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | from datetime import datetime 5 | 6 | 7 | def coco_evaluation(dataset, predictions, output_dir, iteration=None): 8 | coco_results = [] 9 | for i, prediction in enumerate(predictions): 10 | img_info = dataset.get_img_info(i) 11 | prediction = prediction.resize((img_info['width'], img_info['height'])).numpy() 12 | boxes, labels, scores = prediction['boxes'], prediction['labels'], prediction['scores'] 13 | 14 | image_id, annotation = dataset.get_annotation(i) 15 | class_mapper = dataset.contiguous_id_to_coco_id 16 | if labels.shape[0] == 0: 17 | continue 18 | 19 | boxes = boxes.tolist() 20 | labels = labels.tolist() 21 | scores = scores.tolist() 22 | coco_results.extend( 23 | [ 24 | { 25 | "image_id": image_id, 26 | "category_id": class_mapper[labels[k]], 27 | "bbox": [box[0], box[1], box[2] - box[0], box[3] - box[1]], # to xywh format 28 | "score": scores[k], 29 | } 30 | for k, box in enumerate(boxes) 31 | ] 32 | ) 33 | iou_type = 'bbox' 34 | json_result_file = os.path.join(output_dir, iou_type + ".json") 35 | logger = logging.getLogger("SSD.inference") 36 | logger.info('Writing results to {}...'.format(json_result_file)) 37 | with open(json_result_file, "w") as f: 38 | json.dump(coco_results, f) 39 | from pycocotools.cocoeval import COCOeval 40 | coco_gt = dataset.coco 41 | coco_dt = coco_gt.loadRes(json_result_file) 42 | coco_eval = COCOeval(coco_gt, coco_dt, iou_type) 43 | coco_eval.evaluate() 44 | coco_eval.accumulate() 45 | coco_eval.summarize() 46 | 47 | result_strings = [] 48 | keys = ["AP", "AP50", "AP75", "APs", "APm", "APl"] 49 | metrics = {} 50 | for i, key in enumerate(keys): 51 | metrics[key] = coco_eval.stats[i] 52 | logger.info('{:<10}: {}'.format(key, round(coco_eval.stats[i], 3))) 53 | result_strings.append('{:<10}: {}'.format(key, round(coco_eval.stats[i], 3))) 54 | 55 | if iteration is not None: 56 | result_path = os.path.join(output_dir, 'result_{:07d}.txt'.format(iteration)) 57 | else: 58 | result_path = os.path.join(output_dir, 'result_{}.txt'.format(datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))) 59 | with open(result_path, "w") as f: 60 | f.write('\n'.join(result_strings)) 61 | 62 | return dict(metrics=metrics) 63 | -------------------------------------------------------------------------------- /ssd/data/datasets/evaluation/voc/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | from datetime import datetime 4 | 5 | import numpy as np 6 | 7 | from .eval_detection_voc import eval_detection_voc 8 | 9 | 10 | def voc_evaluation(dataset, predictions, output_dir, iteration=None): 11 | class_names = dataset.class_names 
12 | 13 | pred_boxes_list = [] 14 | pred_labels_list = [] 15 | pred_scores_list = [] 16 | gt_boxes_list = [] 17 | gt_labels_list = [] 18 | gt_difficults = [] 19 | 20 | for i in range(len(dataset)): 21 | image_id, annotation = dataset.get_annotation(i) 22 | gt_boxes, gt_labels, is_difficult = annotation 23 | gt_boxes_list.append(gt_boxes) 24 | gt_labels_list.append(gt_labels) 25 | gt_difficults.append(is_difficult.astype(bool)) 26 | 27 | img_info = dataset.get_img_info(i) 28 | prediction = predictions[i] 29 | prediction = prediction.resize((img_info['width'], img_info['height'])).numpy() 30 | boxes, labels, scores = prediction['boxes'], prediction['labels'], prediction['scores'] 31 | 32 | pred_boxes_list.append(boxes) 33 | pred_labels_list.append(labels) 34 | pred_scores_list.append(scores) 35 | result = eval_detection_voc(pred_bboxes=pred_boxes_list, 36 | pred_labels=pred_labels_list, 37 | pred_scores=pred_scores_list, 38 | gt_bboxes=gt_boxes_list, 39 | gt_labels=gt_labels_list, 40 | gt_difficults=gt_difficults, 41 | iou_thresh=0.5, 42 | use_07_metric=True) 43 | logger = logging.getLogger("SSD.inference") 44 | result_str = "mAP: {:.4f}\n".format(result["map"]) 45 | metrics = {'mAP': result["map"]} 46 | for i, ap in enumerate(result["ap"]): 47 | if i == 0: # skip background 48 | continue 49 | metrics[class_names[i]] = ap 50 | result_str += "{:<16}: {:.4f}\n".format(class_names[i], ap) 51 | logger.info(result_str) 52 | 53 | if iteration is not None: 54 | result_path = os.path.join(output_dir, 'result_{:07d}.txt'.format(iteration)) 55 | else: 56 | result_path = os.path.join(output_dir, 'result_{}.txt'.format(datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))) 57 | with open(result_path, "w") as f: 58 | f.write(result_str) 59 | 60 | return dict(metrics=metrics) 61 | -------------------------------------------------------------------------------- /ssd/data/datasets/evaluation/voc/eval_detection_voc.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | from collections import defaultdict 4 | import itertools 5 | import numpy as np 6 | import six 7 | 8 | 9 | def bbox_iou(bbox_a, bbox_b): 10 | """Calculate the Intersection of Unions (IoUs) between bounding boxes. 11 | IoU is calculated as a ratio of area of the intersection 12 | and area of the union. 13 | This function accepts both :obj:`numpy.ndarray` and :obj:`cupy.ndarray` as 14 | inputs. Please note that both :obj:`bbox_a` and :obj:`bbox_b` need to be 15 | the same type. 16 | The output is the same type as the inputs. 17 | Args: 18 | bbox_a (array): An array whose shape is :math:`(N, 4)`. 19 | :math:`N` is the number of bounding boxes. 20 | The dtype should be :obj:`numpy.float32`. 21 | bbox_b (array): An array similar to :obj:`bbox_a`, 22 | whose shape is :math:`(K, 4)`. 23 | The dtype should be :obj:`numpy.float32`. 24 | Returns: 25 | array: 26 | An array whose shape is :math:`(N, K)`. \ 27 | An element at index :math:`(n, k)` contains IoUs between \ 28 | :math:`n` th bounding box in :obj:`bbox_a` and :math:`k` th bounding \ 29 | box in :obj:`bbox_b`. 
30 | """ 31 | if bbox_a.shape[1] != 4 or bbox_b.shape[1] != 4: 32 | raise IndexError 33 | 34 | # top left 35 | tl = np.maximum(bbox_a[:, None, :2], bbox_b[:, :2]) 36 | # bottom right 37 | br = np.minimum(bbox_a[:, None, 2:], bbox_b[:, 2:]) 38 | 39 | area_i = np.prod(br - tl, axis=2) * (tl < br).all(axis=2) 40 | area_a = np.prod(bbox_a[:, 2:] - bbox_a[:, :2], axis=1) 41 | area_b = np.prod(bbox_b[:, 2:] - bbox_b[:, :2], axis=1) 42 | return area_i / (area_a[:, None] + area_b - area_i) 43 | 44 | 45 | def eval_detection_voc( 46 | pred_bboxes, 47 | pred_labels, 48 | pred_scores, 49 | gt_bboxes, 50 | gt_labels, 51 | gt_difficults=None, 52 | iou_thresh=0.5, 53 | use_07_metric=False): 54 | """Calculate average precisions based on evaluation code of PASCAL VOC. 55 | 56 | This function evaluates predicted bounding boxes obtained from a dataset 57 | which has :math:`N` images by using average precision for each class. 58 | The code is based on the evaluation code used in PASCAL VOC Challenge. 59 | 60 | Args: 61 | pred_bboxes (iterable of numpy.ndarray): An iterable of :math:`N` 62 | sets of bounding boxes. 63 | Its index corresponds to an index for the base dataset. 64 | Each element of :obj:`pred_bboxes` is a set of coordinates 65 | of bounding boxes. This is an array whose shape is :math:`(R, 4)`, 66 | where :math:`R` corresponds 67 | to the number of bounding boxes, which may vary among images. 68 | The second axis corresponds to 69 | :math:`y_{min}, x_{min}, y_{max}, x_{max}` of a bounding box. 70 | pred_labels (iterable of numpy.ndarray): An iterable of labels. 71 | Similar to :obj:`pred_bboxes`, its index corresponds to an 72 | index for the base dataset. Its length is :math:`N`. 73 | pred_scores (iterable of numpy.ndarray): An iterable of confidence 74 | scores for predicted bounding boxes. Similar to :obj:`pred_bboxes`, 75 | its index corresponds to an index for the base dataset. 76 | Its length is :math:`N`. 77 | gt_bboxes (iterable of numpy.ndarray): An iterable of ground truth 78 | bounding boxes 79 | whose length is :math:`N`. An element of :obj:`gt_bboxes` is a 80 | bounding box whose shape is :math:`(R, 4)`. Note that the number of 81 | bounding boxes in each image does not need to be the same as the number 82 | of corresponding predicted boxes. 83 | gt_labels (iterable of numpy.ndarray): An iterable of ground truth 84 | labels which are organized similarly to :obj:`gt_bboxes`. 85 | gt_difficults (iterable of numpy.ndarray): An iterable of boolean 86 | arrays which is organized similarly to :obj:`gt_bboxes`. 87 | This tells whether the 88 | corresponding ground truth bounding box is difficult or not. 89 | By default, this is :obj:`None`. In that case, this function 90 | considers all bounding boxes to be not difficult. 91 | iou_thresh (float): A prediction is correct if its Intersection over 92 | Union with the ground truth is above this value. 93 | use_07_metric (bool): Whether to use PASCAL VOC 2007 evaluation metric 94 | for calculating average precision. The default value is 95 | :obj:`False`. 96 | 97 | Returns: 98 | dict: 99 | 100 | The keys, value-types and the description of the values are listed 101 | below. 102 | 103 | * **ap** (*numpy.ndarray*): An array of average precisions. \ 104 | The :math:`l`-th value corresponds to the average precision \ 105 | for class :math:`l`. If class :math:`l` does not exist in \ 106 | either :obj:`pred_labels` or :obj:`gt_labels`, the corresponding \ 107 | value is set to :obj:`numpy.nan`. 
108 | * **map** (*float*): The average of Average Precisions over classes. 109 | 110 | """ 111 | 112 | prec, rec = calc_detection_voc_prec_rec(pred_bboxes, 113 | pred_labels, 114 | pred_scores, 115 | gt_bboxes, 116 | gt_labels, 117 | gt_difficults, 118 | iou_thresh=iou_thresh) 119 | 120 | ap = calc_detection_voc_ap(prec, rec, use_07_metric=use_07_metric) 121 | 122 | return {'ap': ap, 'map': np.nanmean(ap)} 123 | 124 | 125 | def calc_detection_voc_prec_rec( 126 | pred_bboxes, pred_labels, pred_scores, gt_bboxes, gt_labels, 127 | gt_difficults=None, 128 | iou_thresh=0.5): 129 | """Calculate precision and recall based on evaluation code of PASCAL VOC. 130 | 131 | This function calculates precision and recall of 132 | predicted bounding boxes obtained from a dataset which has :math:`N` 133 | images. 134 | The code is based on the evaluation code used in the PASCAL VOC Challenge. 135 | 136 | Args: 137 | pred_bboxes (iterable of numpy.ndarray): An iterable of :math:`N` 138 | sets of bounding boxes. 139 | Its index corresponds to an index for the base dataset. 140 | Each element of :obj:`pred_bboxes` is a set of coordinates 141 | of bounding boxes. This is an array whose shape is :math:`(R, 4)`, 142 | where :math:`R` corresponds 143 | to the number of bounding boxes, which may vary among images. 144 | The second axis corresponds to 145 | :math:`y_{min}, x_{min}, y_{max}, x_{max}` of a bounding box. 146 | pred_labels (iterable of numpy.ndarray): An iterable of labels. 147 | Similar to :obj:`pred_bboxes`, its index corresponds to an 148 | index for the base dataset. Its length is :math:`N`. 149 | pred_scores (iterable of numpy.ndarray): An iterable of confidence 150 | scores for predicted bounding boxes. Similar to :obj:`pred_bboxes`, 151 | its index corresponds to an index for the base dataset. 152 | Its length is :math:`N`. 153 | gt_bboxes (iterable of numpy.ndarray): An iterable of ground truth 154 | bounding boxes 155 | whose length is :math:`N`. An element of :obj:`gt_bboxes` is an 156 | array of bounding boxes whose shape is :math:`(R, 4)`. Note that the number of 157 | bounding boxes in each image does not need to be the same as the number 158 | of corresponding predicted boxes. 159 | gt_labels (iterable of numpy.ndarray): An iterable of ground truth 160 | labels which are organized similarly to :obj:`gt_bboxes`. 161 | gt_difficults (iterable of numpy.ndarray): An iterable of boolean 162 | arrays which are organized similarly to :obj:`gt_bboxes`. 163 | This tells whether the 164 | corresponding ground truth bounding box is difficult or not. 165 | By default, this is :obj:`None`. In that case, this function 166 | treats all bounding boxes as not difficult. 167 | iou_thresh (float): A prediction is correct if its Intersection over 168 | Union with the ground truth is above this value. 169 | 170 | Returns: 171 | tuple of two lists: 172 | This function returns two lists: :obj:`prec` and :obj:`rec`. 173 | 174 | * :obj:`prec`: A list of arrays. :obj:`prec[l]` is precision \ 175 | for class :math:`l`. If class :math:`l` does not exist in \ 176 | either :obj:`pred_labels` or :obj:`gt_labels`, :obj:`prec[l]` is \ 177 | set to :obj:`None`. 178 | * :obj:`rec`: A list of arrays. :obj:`rec[l]` is recall \ 179 | for class :math:`l`. If no ground truth of class :math:`l` that is not \ 180 | marked as difficult exists in \ 181 | :obj:`gt_labels`, :obj:`rec[l]` is \ 182 | set to :obj:`None`. 
183 | 184 | """ 185 | 186 | pred_bboxes = iter(pred_bboxes) 187 | pred_labels = iter(pred_labels) 188 | pred_scores = iter(pred_scores) 189 | gt_bboxes = iter(gt_bboxes) 190 | gt_labels = iter(gt_labels) 191 | if gt_difficults is None: 192 | gt_difficults = itertools.repeat(None) 193 | else: 194 | gt_difficults = iter(gt_difficults) 195 | 196 | n_pos = defaultdict(int) 197 | score = defaultdict(list) 198 | match = defaultdict(list) 199 | 200 | for pred_bbox, pred_label, pred_score, gt_bbox, gt_label, gt_difficult in \ 201 | six.moves.zip( 202 | pred_bboxes, pred_labels, pred_scores, 203 | gt_bboxes, gt_labels, gt_difficults): 204 | 205 | if gt_difficult is None: 206 | gt_difficult = np.zeros(gt_bbox.shape[0], dtype=bool) 207 | 208 | for l in np.unique(np.concatenate((pred_label, gt_label)).astype(int)): 209 | pred_mask_l = pred_label == l 210 | pred_bbox_l = pred_bbox[pred_mask_l] 211 | pred_score_l = pred_score[pred_mask_l] 212 | # sort by score 213 | order = pred_score_l.argsort()[::-1] 214 | pred_bbox_l = pred_bbox_l[order] 215 | pred_score_l = pred_score_l[order] 216 | 217 | gt_mask_l = gt_label == l 218 | gt_bbox_l = gt_bbox[gt_mask_l] 219 | gt_difficult_l = gt_difficult[gt_mask_l] 220 | 221 | n_pos[l] += np.logical_not(gt_difficult_l).sum() 222 | score[l].extend(pred_score_l) 223 | 224 | if len(pred_bbox_l) == 0: 225 | continue 226 | if len(gt_bbox_l) == 0: 227 | match[l].extend((0,) * pred_bbox_l.shape[0]) 228 | continue 229 | 230 | # The VOC evaluation protocol treats box coordinates as inclusive integers, hence the +1 on the max corners. 231 | pred_bbox_l = pred_bbox_l.copy() 232 | pred_bbox_l[:, 2:] += 1 233 | gt_bbox_l = gt_bbox_l.copy() 234 | gt_bbox_l[:, 2:] += 1 235 | 236 | iou = bbox_iou(pred_bbox_l, gt_bbox_l) 237 | gt_index = iou.argmax(axis=1) 238 | # set -1 if there is no matching ground truth 239 | gt_index[iou.max(axis=1) < iou_thresh] = -1 240 | del iou 241 | 242 | selec = np.zeros(gt_bbox_l.shape[0], dtype=bool) 243 | for gt_idx in gt_index: 244 | if gt_idx >= 0: 245 | if gt_difficult_l[gt_idx]: 246 | match[l].append(-1) 247 | else: 248 | if not selec[gt_idx]: 249 | match[l].append(1) 250 | else: 251 | match[l].append(0) 252 | selec[gt_idx] = True 253 | else: 254 | match[l].append(0) 255 | 256 | for iter_ in ( 257 | pred_bboxes, pred_labels, pred_scores, 258 | gt_bboxes, gt_labels, gt_difficults): 259 | if next(iter_, None) is not None: 260 | raise ValueError('Lengths of input iterables need to be the same.') 261 | 262 | n_fg_class = max(n_pos.keys()) + 1 263 | prec = [None] * n_fg_class 264 | rec = [None] * n_fg_class 265 | 266 | for l in n_pos.keys(): 267 | score_l = np.array(score[l]) 268 | match_l = np.array(match[l], dtype=np.int8) 269 | 270 | order = score_l.argsort()[::-1] 271 | match_l = match_l[order] 272 | 273 | tp = np.cumsum(match_l == 1) 274 | fp = np.cumsum(match_l == 0) 275 | 276 | # If an element of fp + tp is 0, 277 | # the corresponding element of prec[l] is nan. 278 | prec[l] = tp / (fp + tp) 279 | # If n_pos[l] is 0, rec[l] is None. 280 | if n_pos[l] > 0: 281 | rec[l] = tp / n_pos[l] 282 | 283 | return prec, rec 284 | 285 | 286 | def calc_detection_voc_ap(prec, rec, use_07_metric=False): 287 | """Calculate average precisions based on evaluation code of PASCAL VOC. 288 | 289 | This function calculates average precisions 290 | from given precisions and recalls. 291 | The code is based on the evaluation code used in the PASCAL VOC Challenge. 292 | 293 | Args: 294 | prec (list of numpy.array): A list of arrays. 295 | :obj:`prec[l]` indicates precision for class :math:`l`. 
296 | If :obj:`prec[l]` is :obj:`None`, this function returns 297 | :obj:`numpy.nan` for class :math:`l`. 298 | rec (list of numpy.array): A list of arrays. 299 | :obj:`rec[l]` indicates recall for class :math:`l`. 300 | If :obj:`rec[l]` is :obj:`None`, this function returns 301 | :obj:`numpy.nan` for class :math:`l`. 302 | use_07_metric (bool): Whether to use PASCAL VOC 2007 evaluation metric 303 | for calculating average precision. The default value is 304 | :obj:`False`. 305 | 306 | Returns: 307 | ~numpy.ndarray: 308 | This function returns an array of average precisions. 309 | The :math:`l`-th value corresponds to the average precision 310 | for class :math:`l`. If :obj:`prec[l]` or :obj:`rec[l]` is 311 | :obj:`None`, the corresponding value is set to :obj:`numpy.nan`. 312 | 313 | """ 314 | 315 | n_fg_class = len(prec) 316 | ap = np.empty(n_fg_class) 317 | for l in six.moves.range(n_fg_class): 318 | if prec[l] is None or rec[l] is None: 319 | ap[l] = np.nan 320 | continue 321 | 322 | if use_07_metric: 323 | # 11 point metric 324 | ap[l] = 0 325 | for t in np.arange(0., 1.1, 0.1): 326 | if np.sum(rec[l] >= t) == 0: 327 | p = 0 328 | else: 329 | p = np.max(np.nan_to_num(prec[l])[rec[l] >= t]) 330 | ap[l] += p / 11 331 | else: 332 | # correct AP calculation 333 | # first append sentinel values at the end 334 | mpre = np.concatenate(([0], np.nan_to_num(prec[l]), [0])) 335 | mrec = np.concatenate(([0], rec[l], [1])) 336 | 337 | mpre = np.maximum.accumulate(mpre[::-1])[::-1] 338 | 339 | # to calculate area under PR curve, look for points 340 | # where X axis (recall) changes value 341 | i = np.where(mrec[1:] != mrec[:-1])[0] 342 | 343 | # and sum (\Delta recall) * prec 344 | ap[l] = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 345 | 346 | return ap 347 | -------------------------------------------------------------------------------- /ssd/data/datasets/voc.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch.utils.data 3 | import numpy as np 4 | import xml.etree.ElementTree as ET 5 | from PIL import Image 6 | 7 | from ssd.structures.container import Container 8 | 9 | 10 | class VOCDataset(torch.utils.data.Dataset): 11 | class_names = ('__background__', 12 | 'aeroplane', 'bicycle', 'bird', 'boat', 13 | 'bottle', 'bus', 'car', 'cat', 'chair', 14 | 'cow', 'diningtable', 'dog', 'horse', 15 | 'motorbike', 'person', 'pottedplant', 16 | 'sheep', 'sofa', 'train', 'tvmonitor') 17 | 18 | def __init__(self, data_dir, split, transform=None, target_transform=None, keep_difficult=False): 19 | """Dataset for VOC data. 20 | Args: 21 | data_dir: the root of the VOC2007 or VOC2012 dataset, the directory contains the following sub-directories: 22 | Annotations, ImageSets, JPEGImages, SegmentationClass, SegmentationObject. 
23 | """ 24 | self.data_dir = data_dir 25 | self.split = split 26 | self.transform = transform 27 | self.target_transform = target_transform 28 | image_sets_file = os.path.join(self.data_dir, "ImageSets", "Main", "%s.txt" % self.split) 29 | self.ids = VOCDataset._read_image_ids(image_sets_file) 30 | self.keep_difficult = keep_difficult 31 | 32 | self.class_dict = {class_name: i for i, class_name in enumerate(self.class_names)} 33 | 34 | def __getitem__(self, index): 35 | image_id = self.ids[index] 36 | boxes, labels, is_difficult = self._get_annotation(image_id) 37 | if not self.keep_difficult: 38 | boxes = boxes[is_difficult == 0] 39 | labels = labels[is_difficult == 0] 40 | image = self._read_image(image_id) 41 | if self.transform: 42 | image, boxes, labels = self.transform(image, boxes, labels) 43 | if self.target_transform: 44 | boxes, labels = self.target_transform(boxes, labels) 45 | targets = Container( 46 | boxes=boxes, 47 | labels=labels, 48 | ) 49 | return image, targets, index 50 | 51 | def get_annotation(self, index): 52 | image_id = self.ids[index] 53 | return image_id, self._get_annotation(image_id) 54 | 55 | def __len__(self): 56 | return len(self.ids) 57 | 58 | @staticmethod 59 | def _read_image_ids(image_sets_file): 60 | ids = [] 61 | with open(image_sets_file) as f: 62 | for line in f: 63 | ids.append(line.rstrip()) 64 | return ids 65 | 66 | def _get_annotation(self, image_id): 67 | annotation_file = os.path.join(self.data_dir, "Annotations", "%s.xml" % image_id) 68 | objects = ET.parse(annotation_file).findall("object") 69 | boxes = [] 70 | labels = [] 71 | is_difficult = [] 72 | for obj in objects: 73 | class_name = obj.find('name').text.lower().strip() 74 | bbox = obj.find('bndbox') 75 | # VOC annotations follow the MATLAB convention, in which indexes start from 1, so subtract 1 to get 0-based coordinates 76 | x1 = float(bbox.find('xmin').text) - 1 77 | y1 = float(bbox.find('ymin').text) - 1 78 | x2 = float(bbox.find('xmax').text) - 1 79 | y2 = float(bbox.find('ymax').text) - 1 80 | boxes.append([x1, y1, x2, y2]) 81 | labels.append(self.class_dict[class_name]) 82 | is_difficult_str = obj.find('difficult').text 83 | is_difficult.append(int(is_difficult_str) if is_difficult_str else 0) 84 | 85 | return (np.array(boxes, dtype=np.float32), 86 | np.array(labels, dtype=np.int64), 87 | np.array(is_difficult, dtype=np.uint8)) 88 | 89 | def get_img_info(self, index): 90 | img_id = self.ids[index] 91 | annotation_file = os.path.join(self.data_dir, "Annotations", "%s.xml" % img_id) 92 | anno = ET.parse(annotation_file).getroot() 93 | size = anno.find("size") 94 | im_info = tuple(map(int, (size.find("height").text, size.find("width").text))) 95 | return {"height": im_info[0], "width": im_info[1]} 96 | 97 | def _read_image(self, image_id): 98 | image_file = os.path.join(self.data_dir, "JPEGImages", "%s.jpg" % image_id) 99 | image = Image.open(image_file).convert("RGB") 100 | image = np.array(image) 101 | return image 102 | -------------------------------------------------------------------------------- /ssd/data/samplers/__init__.py: -------------------------------------------------------------------------------- 1 | from .iteration_based_batch_sampler import IterationBasedBatchSampler 2 | from .distributed import DistributedSampler 3 | 4 | __all__ = ['IterationBasedBatchSampler', 'DistributedSampler'] 5 | -------------------------------------------------------------------------------- /ssd/data/samplers/distributed.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. 
and its affiliates. All Rights Reserved. 2 | # Code is copy-pasted exactly as in torch.utils.data.distributed. 3 | # FIXME remove this once c10d fixes the bug it has 4 | import math 5 | import torch 6 | import torch.distributed as dist 7 | from torch.utils.data.sampler import Sampler 8 | 9 | 10 | class DistributedSampler(Sampler): 11 | """Sampler that restricts data loading to a subset of the dataset. 12 | It is especially useful in conjunction with 13 | :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each 14 | process can pass a DistributedSampler instance as a DataLoader sampler, 15 | and load a subset of the original dataset that is exclusive to it. 16 | .. note:: 17 | Dataset is assumed to be of constant size. 18 | Arguments: 19 | dataset: Dataset used for sampling. 20 | num_replicas (optional): Number of processes participating in 21 | distributed training. 22 | rank (optional): Rank of the current process within num_replicas. 23 | """ 24 | 25 | def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True): 26 | if num_replicas is None: 27 | if not dist.is_available(): 28 | raise RuntimeError("Requires distributed package to be available") 29 | num_replicas = dist.get_world_size() 30 | if rank is None: 31 | if not dist.is_available(): 32 | raise RuntimeError("Requires distributed package to be available") 33 | rank = dist.get_rank() 34 | self.dataset = dataset 35 | self.num_replicas = num_replicas 36 | self.rank = rank 37 | self.epoch = 0 38 | self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) 39 | self.total_size = self.num_samples * self.num_replicas 40 | self.shuffle = shuffle 41 | 42 | def __iter__(self): 43 | if self.shuffle: 44 | # deterministically shuffle based on epoch 45 | g = torch.Generator() 46 | g.manual_seed(self.epoch) 47 | indices = torch.randperm(len(self.dataset), generator=g).tolist() 48 | else: 49 | indices = torch.arange(len(self.dataset)).tolist() 50 | 51 | # add extra samples to make it evenly divisible 52 | indices += indices[: (self.total_size - len(indices))] 53 | assert len(indices) == self.total_size 54 | 55 | # subsample 56 | offset = self.num_samples * self.rank 57 | indices = indices[offset: offset + self.num_samples] 58 | assert len(indices) == self.num_samples 59 | 60 | return iter(indices) 61 | 62 | def __len__(self): 63 | return self.num_samples 64 | 65 | def set_epoch(self, epoch): 66 | self.epoch = epoch 67 | -------------------------------------------------------------------------------- /ssd/data/samplers/iteration_based_batch_sampler.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data.sampler import BatchSampler 2 | 3 | 4 | class IterationBasedBatchSampler(BatchSampler): 5 | """ 6 | Wraps a BatchSampler, re-sampling from it until 7 | a specified number of iterations have been sampled 8 | """ 9 | 10 | def __init__(self, batch_sampler, num_iterations, start_iter=0): 11 | self.batch_sampler = batch_sampler 12 | self.num_iterations = num_iterations 13 | self.start_iter = start_iter 14 | 15 | def __iter__(self): 16 | iteration = self.start_iter 17 | while iteration <= self.num_iterations: 18 | # if the underlying sampler has a set_epoch method, like 19 | # DistributedSampler, used for making each process see 20 | # a different split of the dataset, then set it 21 | if hasattr(self.batch_sampler.sampler, "set_epoch"): 22 | self.batch_sampler.sampler.set_epoch(iteration) 23 | for batch in self.batch_sampler: 24 | iteration += 1 25 | if 
iteration > self.num_iterations: 26 | break 27 | yield batch 28 | 29 | def __len__(self): 30 | return self.num_iterations 31 | -------------------------------------------------------------------------------- /ssd/data/transforms/__init__.py: -------------------------------------------------------------------------------- 1 | from ssd.modeling.anchors.prior_box import PriorBox 2 | from .target_transform import SSDTargetTransform 3 | from .transforms import * 4 | 5 | 6 | def build_transforms(cfg, is_train=True): 7 | if is_train: 8 | transform = [ 9 | ConvertFromInts(), 10 | PhotometricDistort(), 11 | Expand(cfg.INPUT.PIXEL_MEAN), 12 | RandomSampleCrop(), 13 | RandomMirror(), 14 | ToPercentCoords(), 15 | Resize(cfg.INPUT.IMAGE_SIZE), 16 | SubtractMeans(cfg.INPUT.PIXEL_MEAN), 17 | ToTensor(), 18 | ] 19 | else: 20 | transform = [ 21 | Resize(cfg.INPUT.IMAGE_SIZE), 22 | SubtractMeans(cfg.INPUT.PIXEL_MEAN), 23 | ToTensor() 24 | ] 25 | transform = Compose(transform) 26 | return transform 27 | 28 | 29 | def build_target_transform(cfg): 30 | transform = SSDTargetTransform(PriorBox(cfg)(), 31 | cfg.MODEL.CENTER_VARIANCE, 32 | cfg.MODEL.SIZE_VARIANCE, 33 | cfg.MODEL.THRESHOLD) 34 | return transform 35 | -------------------------------------------------------------------------------- /ssd/data/transforms/target_transform.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | from ssd.utils import box_utils 5 | 6 | 7 | class SSDTargetTransform: 8 | def __init__(self, center_form_priors, center_variance, size_variance, iou_threshold): 9 | self.center_form_priors = center_form_priors 10 | self.corner_form_priors = box_utils.center_form_to_corner_form(center_form_priors) 11 | self.center_variance = center_variance 12 | self.size_variance = size_variance 13 | self.iou_threshold = iou_threshold 14 | 15 | def __call__(self, gt_boxes, gt_labels): 16 | if type(gt_boxes) is np.ndarray: 17 | gt_boxes = torch.from_numpy(gt_boxes) 18 | if type(gt_labels) is np.ndarray: 19 | gt_labels = torch.from_numpy(gt_labels) 20 | boxes, labels = box_utils.assign_priors(gt_boxes, gt_labels, 21 | self.corner_form_priors, self.iou_threshold) 22 | boxes = box_utils.corner_form_to_center_form(boxes) 23 | locations = box_utils.convert_boxes_to_locations(boxes, self.center_form_priors, self.center_variance, self.size_variance) 24 | 25 | return locations, labels 26 | -------------------------------------------------------------------------------- /ssd/data/transforms/transforms.py: -------------------------------------------------------------------------------- 1 | # from https://github.com/amdegroot/ssd.pytorch 2 | 3 | 4 | import torch 5 | from torchvision import transforms 6 | import cv2 7 | import numpy as np 8 | import types 9 | from numpy import random 10 | 11 | 12 | def intersect(box_a, box_b): 13 | max_xy = np.minimum(box_a[:, 2:], box_b[2:]) 14 | min_xy = np.maximum(box_a[:, :2], box_b[:2]) 15 | inter = np.clip((max_xy - min_xy), a_min=0, a_max=np.inf) 16 | return inter[:, 0] * inter[:, 1] 17 | 18 | 19 | def jaccard_numpy(box_a, box_b): 20 | """Compute the jaccard overlap of two sets of boxes. The jaccard overlap 21 | is simply the intersection over union of two boxes. 
22 | E.g.: 23 | A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B) 24 | Args: 25 | box_a: Multiple bounding boxes, Shape: [num_boxes, 4] 26 | box_b: Single bounding box, Shape: [4] 27 | Return: 28 | jaccard overlap: Shape: [box_a.shape[0]] 29 | """ 30 | inter = intersect(box_a, box_b) 31 | area_a = ((box_a[:, 2] - box_a[:, 0]) * 32 | (box_a[:, 3] - box_a[:, 1])) # [A] 33 | area_b = ((box_b[2] - box_b[0]) * 34 | (box_b[3] - box_b[1])) # scalar 35 | union = area_a + area_b - inter 36 | return inter / union # [A] 37 | 38 | 39 | def remove_empty_boxes(boxes, labels): 40 | """Removes bounding boxes whose width or height is 0, together with their labels 41 | 42 | Args: 43 | boxes (ndarray): NP Array with bounding boxes as lines 44 | * BBOX[x1, y1, x2, y2] 45 | labels (ndarray): Labels corresponding to the boxes 46 | 47 | Returns: 48 | ndarray: Valid bounding boxes 49 | ndarray: Corresponding labels 50 | """ 51 | del_boxes = [] 52 | for idx, box in enumerate(boxes): 53 | if box[0] == box[2] or box[1] == box[3]: 54 | del_boxes.append(idx) 55 | 56 | return np.delete(boxes, del_boxes, 0), np.delete(labels, del_boxes) 57 | 58 | 59 | class Compose(object): 60 | """Composes several augmentations together. 61 | Args: 62 | transforms (List[Transform]): list of transforms to compose. 63 | Example: 64 | >>> augmentations.Compose([ 65 | >>> transforms.CenterCrop(10), 66 | >>> transforms.ToTensor(), 67 | >>> ]) 68 | """ 69 | 70 | def __init__(self, transforms): 71 | self.transforms = transforms 72 | 73 | def __call__(self, img, boxes=None, labels=None): 74 | for t in self.transforms: 75 | img, boxes, labels = t(img, boxes, labels) 76 | if boxes is not None: 77 | boxes, labels = remove_empty_boxes(boxes, labels) 78 | return img, boxes, labels 79 | 80 | 81 | class Lambda(object): 82 | """Applies a lambda as a transform.""" 83 | 84 | def __init__(self, lambd): 85 | assert isinstance(lambd, types.LambdaType) 86 | self.lambd = lambd 87 | 88 | def __call__(self, img, boxes=None, labels=None): 89 | return self.lambd(img, boxes, labels) 90 | 91 | 92 | class ConvertFromInts(object): 93 | def __call__(self, image, boxes=None, labels=None): 94 | return image.astype(np.float32), boxes, labels 95 | 96 | 97 | class SubtractMeans(object): 98 | def __init__(self, mean): 99 | self.mean = np.array(mean, dtype=np.float32) 100 | 101 | def __call__(self, image, boxes=None, labels=None): 102 | image = image.astype(np.float32) 103 | image -= self.mean 104 | return image.astype(np.float32), boxes, labels 105 | 106 | 107 | class ToAbsoluteCoords(object): 108 | def __call__(self, image, boxes=None, labels=None): 109 | height, width, channels = image.shape 110 | boxes[:, 0] *= width 111 | boxes[:, 2] *= width 112 | boxes[:, 1] *= height 113 | boxes[:, 3] *= height 114 | 115 | return image, boxes, labels 116 | 117 | 118 | class ToPercentCoords(object): 119 | def __call__(self, image, boxes=None, labels=None): 120 | height, width, channels = image.shape 121 | boxes[:, 0] /= width 122 | boxes[:, 2] /= width 123 | boxes[:, 1] /= height 124 | boxes[:, 3] /= height 125 | 126 | return image, boxes, labels 127 | 128 | 129 | class Resize(object): 130 | def __init__(self, size=300): 131 | self.size = size 132 | 133 | def __call__(self, image, boxes=None, labels=None): 134 | image = cv2.resize(image, (self.size, 135 | self.size)) 136 | return image, boxes, labels 137 | 138 | 139 | class RandomSaturation(object): 140 | def __init__(self, lower=0.5, upper=1.5): 141 | self.lower = lower 142 | self.upper = upper 143 | assert self.upper >= self.lower, "saturation upper must be >= lower." 144 | assert self.lower >= 0, "saturation lower must be non-negative." 145 | 146 | def __call__(self, image, boxes=None, labels=None): 147 | if random.randint(2): 148 | image[:, :, 1] *= random.uniform(self.lower, self.upper) 149 | 150 | return image, boxes, labels 151 | 152 | 153 | class RandomHue(object): 154 | def __init__(self, delta=18.0): 155 | assert delta >= 0.0 and delta <= 360.0 156 | self.delta = delta 157 | 158 | def __call__(self, image, boxes=None, labels=None): 159 | if random.randint(2): 160 | image[:, :, 0] += random.uniform(-self.delta, self.delta) 161 | image[:, :, 0][image[:, :, 0] > 360.0] -= 360.0 162 | image[:, :, 0][image[:, :, 0] < 0.0] += 360.0 163 | return image, boxes, labels 164 | 165 | 166 | class RandomLightingNoise(object): 167 | def __init__(self): 168 | self.perms = ((0, 1, 2), (0, 2, 1), 169 | (1, 0, 2), (1, 2, 0), 170 | (2, 0, 1), (2, 1, 0)) 171 | 172 | def __call__(self, image, boxes=None, labels=None): 173 | if random.randint(2): 174 | swap = self.perms[random.randint(len(self.perms))] 175 | shuffle = SwapChannels(swap) # shuffle channels 176 | image = shuffle(image) 177 | return image, boxes, labels 178 | 179 | 180 | class ConvertColor(object): 181 | def __init__(self, current, transform): 182 | self.transform = transform 183 | self.current = current 184 | 185 | def __call__(self, image, boxes=None, labels=None): 186 | if self.current == 'BGR' and self.transform == 'HSV': 187 | image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV) 188 | elif self.current == 'RGB' and self.transform == 'HSV': 189 | image = cv2.cvtColor(image, cv2.COLOR_RGB2HSV) 190 | elif self.current == 'BGR' and self.transform == 'RGB': 191 | image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) 192 | elif self.current == 'HSV' and self.transform == 'BGR': 193 | image = cv2.cvtColor(image, cv2.COLOR_HSV2BGR) 194 | elif self.current == 'HSV' and self.transform == "RGB": 195 | image = cv2.cvtColor(image, cv2.COLOR_HSV2RGB) 196 | else: 197 | raise NotImplementedError 198 | return image, boxes, labels 199 | 200 | 201 | class RandomContrast(object): 202 | def __init__(self, lower=0.5, upper=1.5): 203 | self.lower = lower 204 | self.upper = upper 205 | assert self.upper >= self.lower, "contrast upper must be >= lower." 206 | assert self.lower >= 0, "contrast lower must be non-negative."
207 | 208 | # expects float image 209 | def __call__(self, image, boxes=None, labels=None): 210 | if random.randint(2): 211 | alpha = random.uniform(self.lower, self.upper) 212 | image *= alpha 213 | return image, boxes, labels 214 | 215 | 216 | class RandomBrightness(object): 217 | def __init__(self, delta=32): 218 | assert delta >= 0.0 219 | assert delta <= 255.0 220 | self.delta = delta 221 | 222 | def __call__(self, image, boxes=None, labels=None): 223 | if random.randint(2): 224 | delta = random.uniform(-self.delta, self.delta) 225 | image += delta 226 | return image, boxes, labels 227 | 228 | 229 | class ToCV2Image(object): 230 | def __call__(self, tensor, boxes=None, labels=None): 231 | return tensor.cpu().numpy().astype(np.float32).transpose((1, 2, 0)), boxes, labels 232 | 233 | 234 | class ToTensor(object): 235 | def __call__(self, cvimage, boxes=None, labels=None): 236 | return torch.from_numpy(cvimage.astype(np.float32)).permute(2, 0, 1), boxes, labels 237 | 238 | 239 | class RandomSampleCrop(object): 240 | """Crop 241 | Arguments: 242 | img (Image): the image being input during training 243 | boxes (Tensor): the original bounding boxes in pt form 244 | labels (Tensor): the class labels for each bbox 245 | mode (float tuple): the min and max jaccard overlaps 246 | Return: 247 | (img, boxes, classes) 248 | img (Image): the cropped image 249 | boxes (Tensor): the adjusted bounding boxes in pt form 250 | labels (Tensor): the class labels for each bbox 251 | """ 252 | 253 | def __init__(self): 254 | self.sample_options = ( 255 | # using entire original input image 256 | None, 257 | # sample a patch s.t. MIN jaccard w/ obj in .1,.3,.4,.7,.9 258 | (0.1, None), 259 | (0.3, None), 260 | (0.7, None), 261 | (0.9, None), 262 | # randomly sample a patch 263 | (None, None), 264 | ) 265 | 266 | def __call__(self, image, boxes=None, labels=None): 267 | # guard against no boxes 268 | if boxes is not None and boxes.shape[0] == 0: 269 | return image, boxes, labels 270 | height, width, _ = image.shape 271 | while True: 272 | # randomly choose a mode 273 | mode = self.sample_options[random.randint(0, len(self.sample_options))] 274 | if mode is None: 275 | return image, boxes, labels 276 | 277 | min_iou, max_iou = mode 278 | if min_iou is None: 279 | min_iou = float('-inf') 280 | if max_iou is None: 281 | max_iou = float('inf') 282 | 283 | # max trials (50) 284 | for _ in range(50): 285 | current_image = image 286 | 287 | w = random.uniform(0.3 * width, width) 288 | h = random.uniform(0.3 * height, height) 289 | 290 | # aspect ratio constraint b/t .5 & 2 291 | if h / w < 0.5 or h / w > 2: 292 | continue 293 | 294 | left = random.uniform(width - w) 295 | top = random.uniform(height - h) 296 | 297 | # convert to integer rect x1,y1,x2,y2 298 | rect = np.array([int(left), int(top), int(left + w), int(top + h)]) 299 | 300 | # calculate IoU (jaccard overlap) b/t the cropped and gt boxes 301 | overlap = jaccard_numpy(boxes, rect) 302 | 303 | # is the min and max overlap constraint satisfied? 
if not, try again 304 | if overlap.max() < min_iou or overlap.min() > max_iou: 305 | continue 306 | 307 | # cut the crop from the image 308 | current_image = current_image[rect[1]:rect[3], rect[0]:rect[2], 309 | :] 310 | 311 | # keep overlap with gt box IF center in sampled patch 312 | centers = (boxes[:, :2] + boxes[:, 2:]) / 2.0 313 | 314 | # mask in all gt boxes whose centers are below and to the right of the crop's top-left corner 315 | m1 = (rect[0] < centers[:, 0]) * (rect[1] < centers[:, 1]) 316 | 317 | # mask in all gt boxes whose centers are above and to the left of the crop's bottom-right corner 318 | m2 = (rect[2] > centers[:, 0]) * (rect[3] > centers[:, 1]) 319 | 320 | # mask in boxes where both m1 and m2 are true, i.e. the center lies inside the crop 321 | mask = m1 * m2 322 | 323 | # have any valid boxes? try again if not 324 | if not mask.any(): 325 | continue 326 | 327 | # take only matching gt boxes 328 | current_boxes = boxes[mask, :].copy() 329 | 330 | # take only matching gt labels 331 | current_labels = labels[mask] 332 | 333 | # clip the box's top-left corner to the crop's 334 | current_boxes[:, :2] = np.maximum(current_boxes[:, :2], 335 | rect[:2]) 336 | # adjust to crop (by subtracting crop's left,top) 337 | current_boxes[:, :2] -= rect[:2] 338 | 339 | current_boxes[:, 2:] = np.minimum(current_boxes[:, 2:], 340 | rect[2:]) 341 | # adjust to crop (by subtracting crop's left,top) 342 | current_boxes[:, 2:] -= rect[:2] 343 | 344 | return current_image, current_boxes, current_labels 345 | 346 | 347 | class Expand(object): 348 | def __init__(self, mean): 349 | self.mean = mean 350 | 351 | def __call__(self, image, boxes, labels): 352 | if random.randint(2): 353 | return image, boxes, labels 354 | 355 | height, width, depth = image.shape 356 | ratio = random.uniform(1, 4) 357 | left = random.uniform(0, width * ratio - width) 358 | top = random.uniform(0, height * ratio - height) 359 | 360 | expand_image = np.zeros( 361 | (int(height * ratio), int(width * ratio), depth), 362 | dtype=image.dtype) 363 | expand_image[:, :, :] = self.mean 364 | expand_image[int(top):int(top + height), 365 | int(left):int(left + width)] = image 366 | image = expand_image 367 | 368 | boxes = boxes.copy() 369 | boxes[:, :2] += (int(left), int(top)) 370 | boxes[:, 2:] += (int(left), int(top)) 371 | 372 | return image, boxes, labels 373 | 374 | 375 | class RandomMirror(object): 376 | def __call__(self, image, boxes, classes): 377 | _, width, _ = image.shape 378 | if random.randint(2): 379 | image = image[:, ::-1] 380 | boxes = boxes.copy() 381 | boxes[:, 0::2] = width - boxes[:, 2::-2] 382 | return image, boxes, classes 383 | 384 | 385 | class SwapChannels(object): 386 | """Transforms a tensorized image by swapping the channels in the order 387 | specified in the swap tuple. 
388 | Args: 389 | swaps (int triple): final order of channels 390 | eg: (2, 1, 0) 391 | """ 392 | 393 | def __init__(self, swaps): 394 | self.swaps = swaps 395 | 396 | def __call__(self, image): 397 | """ 398 | Args: 399 | image (Tensor): image tensor to be transformed 400 | Return: 401 | a tensor with channels swapped according to swap 402 | """ 403 | # if torch.is_tensor(image): 404 | # image = image.data.cpu().numpy() 405 | # else: 406 | # image = np.array(image) 407 | image = image[:, :, self.swaps] 408 | return image 409 | 410 | 411 | class PhotometricDistort(object): 412 | def __init__(self): 413 | self.pd = [ 414 | RandomContrast(), # RGB 415 | ConvertColor(current="RGB", transform='HSV'), # HSV 416 | RandomSaturation(), # HSV 417 | RandomHue(), # HSV 418 | ConvertColor(current='HSV', transform='RGB'), # RGB 419 | RandomContrast() # RGB 420 | ] 421 | self.rand_brightness = RandomBrightness() 422 | self.rand_light_noise = RandomLightingNoise() 423 | 424 | def __call__(self, image, boxes, labels): 425 | im = image.copy() 426 | im, boxes, labels = self.rand_brightness(im, boxes, labels) 427 | if random.randint(2): 428 | distort = Compose(self.pd[:-1]) 429 | else: 430 | distort = Compose(self.pd[1:]) 431 | im, boxes, labels = distort(im, boxes, labels) 432 | return self.rand_light_noise(im, boxes, labels) 433 | -------------------------------------------------------------------------------- /ssd/engine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lufficc/SSD/68dc0a20efaf3997e58b616afaaaa21bf8ca3c05/ssd/engine/__init__.py -------------------------------------------------------------------------------- /ssd/engine/inference.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | 4 | import torch 5 | import torch.utils.data 6 | from tqdm import tqdm 7 | 8 | from ssd.data.build import make_data_loader 9 | from ssd.data.datasets.evaluation import evaluate 10 | 11 | from ssd.utils import dist_util, mkdir 12 | from ssd.utils.dist_util import synchronize, is_main_process 13 | 14 | 15 | def _accumulate_predictions_from_multiple_gpus(predictions_per_gpu): 16 | all_predictions = dist_util.all_gather(predictions_per_gpu) 17 | if not dist_util.is_main_process(): 18 | return 19 | # merge the list of dicts 20 | predictions = {} 21 | for p in all_predictions: 22 | predictions.update(p) 23 | # the merged dict is keyed by image index; sort the keys so it can be flattened into a list 24 | image_ids = list(sorted(predictions.keys())) 25 | if len(image_ids) != image_ids[-1] + 1: 26 | logger = logging.getLogger("SSD.inference") 27 | logger.warning( 28 | "Number of images that were gathered from multiple processes is not " 29 | "a contiguous set. 
Some images might be missing from the evaluation" 30 | ) 31 | 32 | # convert to a list 33 | predictions = [predictions[i] for i in image_ids] 34 | return predictions 35 | 36 | 37 | def compute_on_dataset(model, data_loader, device): 38 | results_dict = {} 39 | for batch in tqdm(data_loader): 40 | images, targets, image_ids = batch 41 | cpu_device = torch.device("cpu") 42 | with torch.no_grad(): 43 | outputs = model(images.to(device)) 44 | 45 | outputs = [o.to(cpu_device) for o in outputs] 46 | results_dict.update( 47 | {int(img_id): result for img_id, result in zip(image_ids, outputs)} 48 | ) 49 | return results_dict 50 | 51 | 52 | def inference(model, data_loader, dataset_name, device, output_folder=None, use_cached=False, **kwargs): 53 | dataset = data_loader.dataset 54 | logger = logging.getLogger("SSD.inference") 55 | logger.info("Evaluating {} dataset({} images):".format(dataset_name, len(dataset))) 56 | predictions_path = os.path.join(output_folder, 'predictions.pth') 57 | if use_cached and os.path.exists(predictions_path): 58 | predictions = torch.load(predictions_path, map_location='cpu') 59 | else: 60 | predictions = compute_on_dataset(model, data_loader, device) 61 | synchronize() 62 | predictions = _accumulate_predictions_from_multiple_gpus(predictions) 63 | if not is_main_process(): 64 | return 65 | if output_folder: 66 | torch.save(predictions, predictions_path) 67 | return evaluate(dataset=dataset, predictions=predictions, output_dir=output_folder, **kwargs) 68 | 69 | 70 | @torch.no_grad() 71 | def do_evaluation(cfg, model, distributed, **kwargs): 72 | if isinstance(model, torch.nn.parallel.DistributedDataParallel): 73 | model = model.module 74 | model.eval() 75 | device = torch.device(cfg.MODEL.DEVICE) 76 | data_loaders_val = make_data_loader(cfg, is_train=False, distributed=distributed) 77 | eval_results = [] 78 | for dataset_name, data_loader in zip(cfg.DATASETS.TEST, data_loaders_val): 79 | output_folder = os.path.join(cfg.OUTPUT_DIR, "inference", dataset_name) 80 | if not os.path.exists(output_folder): 81 | mkdir(output_folder) 82 | eval_result = inference(model, data_loader, dataset_name, device, output_folder, **kwargs) 83 | eval_results.append(eval_result) 84 | return eval_results 85 | -------------------------------------------------------------------------------- /ssd/engine/trainer.py: -------------------------------------------------------------------------------- 1 | import collections.abc 2 | import datetime 3 | import logging 4 | import os 5 | import time 6 | import torch 7 | import torch.distributed as dist 8 | 9 | from ssd.engine.inference import do_evaluation 10 | from ssd.utils import dist_util 11 | from ssd.utils.metric_logger import MetricLogger 12 | 13 | 14 | def write_metric(eval_result, prefix, summary_writer, global_step): 15 | for key in eval_result: 16 | value = eval_result[key] 17 | tag = '{}/{}'.format(prefix, key) 18 | if isinstance(value, collections.abc.Mapping): 19 | write_metric(value, tag, summary_writer, global_step) 20 | else: 21 | summary_writer.add_scalar(tag, value, global_step=global_step) 22 | 23 | 24 | def reduce_loss_dict(loss_dict): 25 | """ 26 | Reduce the loss dictionary from all processes so that process with rank 27 | 0 has the averaged results. Returns a dict with the same fields as 28 | loss_dict, after reduction. 
29 | """ 30 | world_size = dist_util.get_world_size() 31 | if world_size < 2: 32 | return loss_dict 33 | with torch.no_grad(): 34 | loss_names = [] 35 | all_losses = [] 36 | for k in sorted(loss_dict.keys()): 37 | loss_names.append(k) 38 | all_losses.append(loss_dict[k]) 39 | all_losses = torch.stack(all_losses, dim=0) 40 | dist.reduce(all_losses, dst=0) 41 | if dist.get_rank() == 0: 42 | # only main process gets accumulated, so only divide by 43 | # world_size in this case 44 | all_losses /= world_size 45 | reduced_losses = {k: v for k, v in zip(loss_names, all_losses)} 46 | return reduced_losses 47 | 48 | 49 | def do_train(cfg, model, 50 | data_loader, 51 | optimizer, 52 | scheduler, 53 | checkpointer, 54 | device, 55 | arguments, 56 | args): 57 | logger = logging.getLogger("SSD.trainer") 58 | logger.info("Start training ...") 59 | meters = MetricLogger() 60 | 61 | model.train() 62 | save_to_disk = dist_util.get_rank() == 0 63 | if args.use_tensorboard and save_to_disk: 64 | try: 65 | from torch.utils.tensorboard import SummaryWriter 66 | except ImportError: 67 | from tensorboardX import SummaryWriter 68 | summary_writer = SummaryWriter(log_dir=os.path.join(cfg.OUTPUT_DIR, 'tf_logs')) 69 | else: 70 | summary_writer = None 71 | 72 | max_iter = len(data_loader) 73 | start_iter = arguments["iteration"] 74 | start_training_time = time.time() 75 | end = time.time() 76 | for iteration, (images, targets, _) in enumerate(data_loader, start_iter): 77 | iteration = iteration + 1 78 | arguments["iteration"] = iteration 79 | 80 | images = images.to(device) 81 | targets = targets.to(device) 82 | loss_dict = model(images, targets=targets) 83 | loss = sum(loss for loss in loss_dict.values()) 84 | 85 | # reduce losses over all GPUs for logging purposes 86 | loss_dict_reduced = reduce_loss_dict(loss_dict) 87 | losses_reduced = sum(loss for loss in loss_dict_reduced.values()) 88 | meters.update(total_loss=losses_reduced, **loss_dict_reduced) 89 | 90 | optimizer.zero_grad() 91 | loss.backward() 92 | optimizer.step() 93 | scheduler.step() 94 | 95 | batch_time = time.time() - end 96 | end = time.time() 97 | meters.update(time=batch_time) 98 | if iteration % args.log_step == 0: 99 | eta_seconds = meters.time.global_avg * (max_iter - iteration) 100 | eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) 101 | if device == "cuda": 102 | logger.info( 103 | meters.delimiter.join([ 104 | "iter: {iter:06d}", 105 | "lr: {lr:.5f}", 106 | '{meters}', 107 | "eta: {eta}", 108 | 'mem: {mem}M', 109 | ]).format( 110 | iter=iteration, 111 | lr=optimizer.param_groups[0]['lr'], 112 | meters=str(meters), 113 | eta=eta_string, 114 | mem=round(torch.cuda.max_memory_allocated() / 1024.0 / 1024.0), 115 | ) 116 | ) 117 | else: 118 | logger.info( 119 | meters.delimiter.join([ 120 | "iter: {iter:06d}", 121 | "lr: {lr:.5f}", 122 | '{meters}', 123 | "eta: {eta}", 124 | ]).format( 125 | iter=iteration, 126 | lr=optimizer.param_groups[0]['lr'], 127 | meters=str(meters), 128 | eta=eta_string, 129 | ) 130 | ) 131 | if summary_writer: 132 | global_step = iteration 133 | summary_writer.add_scalar('losses/total_loss', losses_reduced, global_step=global_step) 134 | for loss_name, loss_item in loss_dict_reduced.items(): 135 | summary_writer.add_scalar('losses/{}'.format(loss_name), loss_item, global_step=global_step) 136 | summary_writer.add_scalar('lr', optimizer.param_groups[0]['lr'], global_step=global_step) 137 | 138 | if iteration % args.save_step == 0: 139 | checkpointer.save("model_{:06d}".format(iteration), **arguments) 140 | 
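# Periodic evaluation: every process enters do_evaluation (it synchronizes
# and gathers predictions across GPUs internally), but only rank 0 receives
# the merged results, so the TensorBoard writes below are guarded by rank.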
141 | if args.eval_step > 0 and iteration % args.eval_step == 0 and not iteration == max_iter: 142 | eval_results = do_evaluation(cfg, model, distributed=args.distributed, iteration=iteration) 143 | if dist_util.get_rank() == 0 and summary_writer: 144 | for eval_result, dataset in zip(eval_results, cfg.DATASETS.TEST): 145 | write_metric(eval_result['metrics'], 'metrics/' + dataset, summary_writer, iteration) 146 | model.train() # *IMPORTANT*: change to train mode after eval. 147 | 148 | checkpointer.save("model_final", **arguments) 149 | # compute training time 150 | total_training_time = int(time.time() - start_training_time) 151 | total_time_str = str(datetime.timedelta(seconds=total_training_time)) 152 | logger.info("Total training time: {} ({:.4f} s / it)".format(total_time_str, total_training_time / max_iter)) 153 | return model 154 | -------------------------------------------------------------------------------- /ssd/layers/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.init as init 4 | from .separable_conv import SeparableConv2d 5 | 6 | __all__ = ['L2Norm', 'SeparableConv2d'] 7 | 8 | 9 | class L2Norm(nn.Module): 10 | def __init__(self, n_channels, scale): 11 | super(L2Norm, self).__init__() 12 | self.n_channels = n_channels 13 | self.gamma = scale or None 14 | self.eps = 1e-10 15 | self.weight = nn.Parameter(torch.Tensor(self.n_channels)) 16 | self.reset_parameters() 17 | 18 | def reset_parameters(self): 19 | init.constant_(self.weight, self.gamma) 20 | 21 | def forward(self, x): 22 | norm = x.pow(2).sum(dim=1, keepdim=True).sqrt() + self.eps 23 | x = torch.div(x, norm) 24 | out = self.weight.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as(x) * x 25 | return out 26 | -------------------------------------------------------------------------------- /ssd/layers/separable_conv.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | 4 | class SeparableConv2d(nn.Module): 5 | def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=0, onnx_compatible=False): 6 | super().__init__() 7 | ReLU = nn.ReLU if onnx_compatible else nn.ReLU6 8 | self.conv = nn.Sequential( 9 | nn.Conv2d(in_channels=in_channels, out_channels=in_channels, kernel_size=kernel_size, 10 | groups=in_channels, stride=stride, padding=padding), 11 | nn.BatchNorm2d(in_channels), 12 | ReLU(), 13 | nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=1), 14 | ) 15 | 16 | def forward(self, x): 17 | return self.conv(x) 18 | -------------------------------------------------------------------------------- /ssd/modeling/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lufficc/SSD/68dc0a20efaf3997e58b616afaaaa21bf8ca3c05/ssd/modeling/__init__.py -------------------------------------------------------------------------------- /ssd/modeling/anchors/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lufficc/SSD/68dc0a20efaf3997e58b616afaaaa21bf8ca3c05/ssd/modeling/anchors/__init__.py -------------------------------------------------------------------------------- /ssd/modeling/anchors/prior_box.py: -------------------------------------------------------------------------------- 1 | from itertools import product 2 | 3 | import torch 4 | from math import sqrt 5 | 6 | 7 | 
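# A quick sanity check on prior counts (a sketch assuming the standard SSD300
# settings: FEATURE_MAPS = [38, 19, 10, 5, 3, 1] and
# ASPECT_RATIOS = [[2], [2, 3], [2, 3], [2, 3], [2], [2]]). Each location
# emits 2 square boxes plus 2 boxes per aspect ratio, i.e. [4, 6, 6, 6, 4, 4]
# boxes per location, for a total of
#   38*38*4 + 19*19*6 + 10*10*6 + 5*5*6 + 3*3*4 + 1*1*4 = 8732 priors.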
class PriorBox: 8 | def __init__(self, cfg): 9 | self.image_size = cfg.INPUT.IMAGE_SIZE 10 | prior_config = cfg.MODEL.PRIORS 11 | self.feature_maps = prior_config.FEATURE_MAPS 12 | self.min_sizes = prior_config.MIN_SIZES 13 | self.max_sizes = prior_config.MAX_SIZES 14 | self.strides = prior_config.STRIDES 15 | self.aspect_ratios = prior_config.ASPECT_RATIOS 16 | self.clip = prior_config.CLIP 17 | 18 | def __call__(self): 19 | """Generate SSD Prior Boxes. 20 | It returns the center, height and width of the priors. The values are relative to the image size 21 | Returns: 22 | priors (num_priors, 4): The prior boxes represented as [[center_x, center_y, w, h]]. All the values 23 | are relative to the image size. 24 | """ 25 | priors = [] 26 | for k, f in enumerate(self.feature_maps): 27 | scale = self.image_size / self.strides[k] 28 | for i, j in product(range(f), repeat=2): 29 | # unit center x,y 30 | cx = (j + 0.5) / scale 31 | cy = (i + 0.5) / scale 32 | 33 | # small sized square box 34 | size = self.min_sizes[k] 35 | h = w = size / self.image_size 36 | priors.append([cx, cy, w, h]) 37 | 38 | # big sized square box 39 | size = sqrt(self.min_sizes[k] * self.max_sizes[k]) 40 | h = w = size / self.image_size 41 | priors.append([cx, cy, w, h]) 42 | 43 | # change h/w ratio of the small sized box 44 | size = self.min_sizes[k] 45 | h = w = size / self.image_size 46 | for ratio in self.aspect_ratios[k]: 47 | ratio = sqrt(ratio) 48 | priors.append([cx, cy, w * ratio, h / ratio]) 49 | priors.append([cx, cy, w / ratio, h * ratio]) 50 | 51 | priors = torch.tensor(priors) 52 | if self.clip: 53 | priors.clamp_(max=1, min=0) 54 | return priors 55 | -------------------------------------------------------------------------------- /ssd/modeling/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | from ssd.modeling import registry 2 | from .vgg import VGG 3 | from .mobilenet import MobileNetV2 4 | from .efficient_net import EfficientNet 5 | from .mobilenetv3 import MobileNetV3 6 | 7 | __all__ = ['build_backbone', 'VGG', 'MobileNetV2', 'EfficientNet', 'MobileNetV3'] 8 | 9 | 10 | def build_backbone(cfg): 11 | return registry.BACKBONES[cfg.MODEL.BACKBONE.NAME](cfg, cfg.MODEL.BACKBONE.PRETRAINED) 12 | -------------------------------------------------------------------------------- /ssd/modeling/backbone/efficient_net/__init__.py: -------------------------------------------------------------------------------- 1 | from ssd.modeling import registry 2 | from .efficient_net import EfficientNet 3 | 4 | __all__ = ['efficient_net_b3', 'EfficientNet'] 5 | 6 | 7 | @registry.BACKBONES.register('efficient_net-b3') 8 | def efficient_net_b3(cfg, pretrained=True): 9 | if pretrained: 10 | model = EfficientNet.from_pretrained('efficientnet-b3') 11 | else: 12 | model = EfficientNet.from_name('efficientnet-b3') 13 | 14 | return model 15 | -------------------------------------------------------------------------------- /ssd/modeling/backbone/efficient_net/efficient_net.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | from .utils import ( 5 | relu_fn, 6 | round_filters, 7 | round_repeats, 8 | drop_connect, 9 | Conv2dSamePadding, 10 | get_model_params, 11 | efficientnet_params, 12 | load_pretrained_weights, 13 | ) 14 | 15 | INDICES = { 16 | 'efficientnet-b3': [7, 17, 25] 17 | } 18 | 19 | EXTRAS = { 20 | 'efficientnet-b3': [ 21 | # in, out, k, s, p 22 | [(384, 
128, 1, 1, 0), (128, 256, 3, 2, 1)], # 5 x 5 23 | [(256, 128, 1, 1, 0), (128, 256, 3, 1, 0)], # 3 x 3 24 | [(256, 128, 1, 1, 0), (128, 256, 3, 1, 0)], # 1 x 1 25 | 26 | ] 27 | } 28 | 29 | 30 | def add_extras(cfgs): 31 | extras = nn.ModuleList() 32 | for cfg in cfgs: 33 | extra = [] 34 | for params in cfg: 35 | in_channels, out_channels, kernel_size, stride, padding = params 36 | extra.append(nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding)) 37 | extra.append(nn.ReLU()) 38 | extras.append(nn.Sequential(*extra)) 39 | return extras 40 | 41 | 42 | class MBConvBlock(nn.Module): 43 | """ 44 | Mobile Inverted Residual Bottleneck Block 45 | 46 | Args: 47 | block_args (namedtuple): BlockArgs, see above 48 | global_params (namedtuple): GlobalParam, see above 49 | 50 | Attributes: 51 | has_se (bool): Whether the block contains a Squeeze and Excitation layer. 52 | """ 53 | 54 | def __init__(self, block_args, global_params): 55 | super().__init__() 56 | self._block_args = block_args 57 | self._bn_mom = 1 - global_params.batch_norm_momentum 58 | self._bn_eps = global_params.batch_norm_epsilon 59 | self.has_se = (self._block_args.se_ratio is not None) and (0 < self._block_args.se_ratio <= 1) 60 | self.id_skip = block_args.id_skip # skip connection and drop connect 61 | 62 | # Expansion phase 63 | inp = self._block_args.input_filters # number of input channels 64 | oup = self._block_args.input_filters * self._block_args.expand_ratio # number of output channels 65 | if self._block_args.expand_ratio != 1: 66 | self._expand_conv = Conv2dSamePadding(in_channels=inp, out_channels=oup, kernel_size=1, bias=False) 67 | self._bn0 = nn.BatchNorm2d(num_features=oup, momentum=self._bn_mom, eps=self._bn_eps) 68 | 69 | # Depthwise convolution phase 70 | k = self._block_args.kernel_size 71 | s = self._block_args.stride 72 | self._depthwise_conv = Conv2dSamePadding( 73 | in_channels=oup, out_channels=oup, groups=oup, # groups makes it depthwise 74 | kernel_size=k, stride=s, bias=False) 75 | self._bn1 = nn.BatchNorm2d(num_features=oup, momentum=self._bn_mom, eps=self._bn_eps) 76 | 77 | # Squeeze and Excitation layer, if desired 78 | if self.has_se: 79 | num_squeezed_channels = max(1, int(self._block_args.input_filters * self._block_args.se_ratio)) 80 | self._se_reduce = Conv2dSamePadding(in_channels=oup, out_channels=num_squeezed_channels, kernel_size=1) 81 | self._se_expand = Conv2dSamePadding(in_channels=num_squeezed_channels, out_channels=oup, kernel_size=1) 82 | 83 | # Output phase 84 | final_oup = self._block_args.output_filters 85 | self._project_conv = Conv2dSamePadding(in_channels=oup, out_channels=final_oup, kernel_size=1, bias=False) 86 | self._bn2 = nn.BatchNorm2d(num_features=final_oup, momentum=self._bn_mom, eps=self._bn_eps) 87 | 88 | def forward(self, inputs, drop_connect_rate=None): 89 | """ 90 | :param inputs: input tensor 91 | :param drop_connect_rate: drop connect rate (float, between 0 and 1) 92 | :return: output of block 93 | """ 94 | 95 | # Expansion and Depthwise Convolution 96 | x = inputs 97 | if self._block_args.expand_ratio != 1: 98 | x = relu_fn(self._bn0(self._expand_conv(inputs))) 99 | x = relu_fn(self._bn1(self._depthwise_conv(x))) 100 | 101 | # Squeeze and Excitation 102 | if self.has_se: 103 | x_squeezed = F.adaptive_avg_pool2d(x, 1) 104 | x_squeezed = self._se_expand(relu_fn(self._se_reduce(x_squeezed))) 105 | x = torch.sigmoid(x_squeezed) * x 106 | 107 | x = self._bn2(self._project_conv(x)) 108 | 109 | # Skip connection and drop connect 110 | input_filters, 
output_filters = self._block_args.input_filters, self._block_args.output_filters 111 | if self.id_skip and self._block_args.stride == 1 and input_filters == output_filters: 112 | if drop_connect_rate: 113 | x = drop_connect(x, p=drop_connect_rate, training=self.training) 114 | x = x + inputs # skip connection 115 | return x 116 | 117 | 118 | class EfficientNet(nn.Module): 119 | """ 120 | An EfficientNet model. Most easily loaded with the .from_name or .from_pretrained methods 121 | 122 | Args: 123 | blocks_args (list): A list of BlockArgs to construct blocks 124 | global_params (namedtuple): A set of GlobalParams shared between blocks 125 | 126 | Example: 127 | model = EfficientNet.from_pretrained('efficientnet-b0') 128 | 129 | """ 130 | 131 | def __init__(self, model_name, blocks_args=None, global_params=None): 132 | super().__init__() 133 | self.indices = INDICES[model_name] 134 | self.extras = add_extras(EXTRAS[model_name]) 135 | assert isinstance(blocks_args, list), 'blocks_args should be a list' 136 | assert len(blocks_args) > 0, 'blocks_args must not be empty' 137 | self._global_params = global_params 138 | self._blocks_args = blocks_args 139 | 140 | # Batch norm parameters 141 | bn_mom = 1 - self._global_params.batch_norm_momentum 142 | bn_eps = self._global_params.batch_norm_epsilon 143 | 144 | # Stem 145 | in_channels = 3 # rgb 146 | out_channels = round_filters(32, self._global_params) # number of output channels 147 | self._conv_stem = Conv2dSamePadding(in_channels, out_channels, kernel_size=3, stride=2, bias=False) 148 | self._bn0 = nn.BatchNorm2d(num_features=out_channels, momentum=bn_mom, eps=bn_eps) 149 | 150 | # Build blocks 151 | self._blocks = nn.ModuleList([]) 152 | for block_args in self._blocks_args: 153 | 154 | # Update block input and output filters based on depth multiplier. 155 | block_args = block_args._replace( 156 | input_filters=round_filters(block_args.input_filters, self._global_params), 157 | output_filters=round_filters(block_args.output_filters, self._global_params), 158 | num_repeat=round_repeats(block_args.num_repeat, self._global_params) 159 | ) 160 | 161 | # The first block needs to take care of stride and filter size increase. 162 | self._blocks.append(MBConvBlock(block_args, self._global_params)) 163 | if block_args.num_repeat > 1: 164 | block_args = block_args._replace(input_filters=block_args.output_filters, stride=1) 165 | for _ in range(block_args.num_repeat - 1): 166 | self._blocks.append(MBConvBlock(block_args, self._global_params)) 167 | self.reset_parameters() 168 | 169 | def reset_parameters(self): 170 | for m in self.extras.modules(): 171 | if isinstance(m, nn.Conv2d): 172 | nn.init.xavier_uniform_(m.weight) 173 | nn.init.zeros_(m.bias) 174 | 175 | def extract_features(self, inputs): 176 | """ Returns the final convolutional feature map together with the intermediate feature maps selected by self.indices """ 177 | 178 | # Stem 179 | x = relu_fn(self._bn0(self._conv_stem(inputs))) 180 | 181 | features = [] 182 | 183 | # Blocks 184 | for idx, block in enumerate(self._blocks): 185 | drop_connect_rate = self._global_params.drop_connect_rate 186 | if drop_connect_rate: 187 | drop_connect_rate *= float(idx) / len(self._blocks) 188 | x = block(x, drop_connect_rate) 189 | if idx in self.indices: 190 | features.append(x) 191 | 192 | return x, features 193 | 194 | def forward(self, inputs): 195 | """ Calls extract_features to extract backbone feature maps, then applies the extra layers and returns the tuple of feature maps consumed by the SSD box head. 
""" 196 | 197 | # Convolution layers 198 | x, features = self.extract_features(inputs) 199 | 200 | for layer in self.extras: 201 | x = layer(x) 202 | features.append(x) 203 | 204 | return tuple(features) 205 | 206 | @classmethod 207 | def from_name(cls, model_name, override_params=None): 208 | cls._check_model_name_is_valid(model_name) 209 | blocks_args, global_params = get_model_params(model_name, override_params) 210 | return EfficientNet(model_name, blocks_args, global_params) 211 | 212 | @classmethod 213 | def from_pretrained(cls, model_name): 214 | model = EfficientNet.from_name(model_name) 215 | load_pretrained_weights(model, model_name) 216 | return model 217 | 218 | @classmethod 219 | def get_image_size(cls, model_name): 220 | cls._check_model_name_is_valid(model_name) 221 | _, _, res, _ = efficientnet_params(model_name) 222 | return res 223 | 224 | @classmethod 225 | def _check_model_name_is_valid(cls, model_name, also_need_pretrained_weights=False): 226 | """ Validates model name. None that pretrained weights are only available for 227 | the first four models (efficientnet-b{i} for i in 0,1,2,3) at the moment. """ 228 | num_models = 4 if also_need_pretrained_weights else 8 229 | valid_models = ['efficientnet_b' + str(i) for i in range(num_models)] 230 | if model_name.replace('-', '_') not in valid_models: 231 | raise ValueError('model_name should be one of: ' + ', '.join(valid_models)) 232 | -------------------------------------------------------------------------------- /ssd/modeling/backbone/efficient_net/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains helper functions for building the model and for loading model parameters. 3 | These helper functions are built to mirror those in the official TensorFlow implementation. 4 | """ 5 | 6 | import re 7 | import math 8 | import collections 9 | import torch 10 | from torch import nn 11 | from torch.nn import functional as F 12 | from ssd.utils.model_zoo import load_state_dict_from_url 13 | 14 | ######################################################################## 15 | ############### HELPERS FUNCTIONS FOR MODEL ARCHITECTURE ############### 16 | ######################################################################## 17 | 18 | 19 | # Parameters for the entire model (stem, all blocks, and head) 20 | 21 | GlobalParams = collections.namedtuple('GlobalParams', [ 22 | 'batch_norm_momentum', 'batch_norm_epsilon', 'dropout_rate', 23 | 'num_classes', 'width_coefficient', 'depth_coefficient', 24 | 'depth_divisor', 'min_depth', 'drop_connect_rate', ]) 25 | 26 | # Parameters for an individual model block 27 | BlockArgs = collections.namedtuple('BlockArgs', [ 28 | 'kernel_size', 'num_repeat', 'input_filters', 'output_filters', 29 | 'expand_ratio', 'id_skip', 'stride', 'se_ratio']) 30 | 31 | # Change namedtuple defaults 32 | GlobalParams.__new__.__defaults__ = (None,) * len(GlobalParams._fields) 33 | BlockArgs.__new__.__defaults__ = (None,) * len(BlockArgs._fields) 34 | 35 | 36 | def relu_fn(x): 37 | """ Swish activation function """ 38 | return x * torch.sigmoid(x) 39 | 40 | 41 | def round_filters(filters, global_params): 42 | """ Calculate and round number of filters based on depth multiplier. 
""" 43 | multiplier = global_params.width_coefficient 44 | if not multiplier: 45 | return filters 46 | divisor = global_params.depth_divisor 47 | min_depth = global_params.min_depth 48 | filters *= multiplier 49 | min_depth = min_depth or divisor 50 | new_filters = max(min_depth, int(filters + divisor / 2) // divisor * divisor) 51 | if new_filters < 0.9 * filters: # prevent rounding by more than 10% 52 | new_filters += divisor 53 | return int(new_filters) 54 | 55 | 56 | def round_repeats(repeats, global_params): 57 | """ Round number of filters based on depth multiplier. """ 58 | multiplier = global_params.depth_coefficient 59 | if not multiplier: 60 | return repeats 61 | return int(math.ceil(multiplier * repeats)) 62 | 63 | 64 | def drop_connect(inputs, p, training): 65 | """ Drop connect. """ 66 | if not training: return inputs 67 | batch_size = inputs.shape[0] 68 | keep_prob = 1 - p 69 | random_tensor = keep_prob 70 | random_tensor += torch.rand([batch_size, 1, 1, 1], dtype=inputs.dtype, device=inputs.device) 71 | binary_tensor = torch.floor(random_tensor) 72 | output = inputs / keep_prob * binary_tensor 73 | return output 74 | 75 | 76 | class Conv2dSamePadding(nn.Conv2d): 77 | """ 2D Convolutions like TensorFlow """ 78 | 79 | def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, groups=1, bias=True): 80 | super().__init__(in_channels, out_channels, kernel_size, stride, 0, dilation, groups, bias) 81 | self.stride = self.stride if len(self.stride) == 2 else [self.stride[0]] * 2 82 | 83 | def forward(self, x): 84 | ih, iw = x.size()[-2:] 85 | kh, kw = self.weight.size()[-2:] 86 | sh, sw = self.stride 87 | oh, ow = math.ceil(ih / sh), math.ceil(iw / sw) 88 | pad_h = max((oh - 1) * self.stride[0] + (kh - 1) * self.dilation[0] + 1 - ih, 0) 89 | pad_w = max((ow - 1) * self.stride[1] + (kw - 1) * self.dilation[1] + 1 - iw, 0) 90 | if pad_h > 0 or pad_w > 0: 91 | x = F.pad(x, [pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2]) 92 | return F.conv2d(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups) 93 | 94 | 95 | ######################################################################## 96 | ############## HELPERS FUNCTIONS FOR LOADING MODEL PARAMS ############## 97 | ######################################################################## 98 | 99 | 100 | def efficientnet_params(model_name): 101 | """ Map EfficientNet model name to parameter coefficients. """ 102 | params_dict = { 103 | # Coefficients: width,depth,res,dropout 104 | 'efficientnet-b0': (1.0, 1.0, 224, 0.2), 105 | 'efficientnet-b1': (1.0, 1.1, 240, 0.2), 106 | 'efficientnet-b2': (1.1, 1.2, 260, 0.3), 107 | 'efficientnet-b3': (1.2, 1.4, 300, 0.3), 108 | 'efficientnet-b4': (1.4, 1.8, 380, 0.4), 109 | 'efficientnet-b5': (1.6, 2.2, 456, 0.4), 110 | 'efficientnet-b6': (1.8, 2.6, 528, 0.5), 111 | 'efficientnet-b7': (2.0, 3.1, 600, 0.5), 112 | } 113 | return params_dict[model_name] 114 | 115 | 116 | class BlockDecoder(object): 117 | """ Block Decoder for readability, straight from the official TensorFlow repository """ 118 | 119 | @staticmethod 120 | def _decode_block_string(block_string): 121 | """ Gets a block through a string notation of arguments. 
""" 122 | assert isinstance(block_string, str) 123 | 124 | ops = block_string.split('_') 125 | options = {} 126 | for op in ops: 127 | splits = re.split(r'(\d.*)', op) 128 | if len(splits) >= 2: 129 | key, value = splits[:2] 130 | options[key] = value 131 | 132 | # Check stride 133 | assert (('s' in options and len(options['s']) == 1) or 134 | (len(options['s']) == 2 and options['s'][0] == options['s'][1])) 135 | 136 | return BlockArgs( 137 | kernel_size=int(options['k']), 138 | num_repeat=int(options['r']), 139 | input_filters=int(options['i']), 140 | output_filters=int(options['o']), 141 | expand_ratio=int(options['e']), 142 | id_skip=('noskip' not in block_string), 143 | se_ratio=float(options['se']) if 'se' in options else None, 144 | stride=[int(options['s'][0])]) 145 | 146 | @staticmethod 147 | def _encode_block_string(block): 148 | """Encodes a block to a string.""" 149 | args = [ 150 | 'r%d' % block.num_repeat, 151 | 'k%d' % block.kernel_size, 152 | 's%d%d' % (block.strides[0], block.strides[1]), 153 | 'e%s' % block.expand_ratio, 154 | 'i%d' % block.input_filters, 155 | 'o%d' % block.output_filters 156 | ] 157 | if 0 < block.se_ratio <= 1: 158 | args.append('se%s' % block.se_ratio) 159 | if block.id_skip is False: 160 | args.append('noskip') 161 | return '_'.join(args) 162 | 163 | @staticmethod 164 | def decode(string_list): 165 | """ 166 | Decodes a list of string notations to specify blocks inside the network. 167 | 168 | :param string_list: a list of strings, each string is a notation of block 169 | :return: a list of BlockArgs namedtuples of block args 170 | """ 171 | assert isinstance(string_list, list) 172 | blocks_args = [] 173 | for block_string in string_list: 174 | blocks_args.append(BlockDecoder._decode_block_string(block_string)) 175 | return blocks_args 176 | 177 | @staticmethod 178 | def encode(blocks_args): 179 | """ 180 | Encodes a list of BlockArgs to a list of strings. 181 | 182 | :param blocks_args: a list of BlockArgs namedtuples of block args 183 | :return: a list of strings, each string is a notation of block 184 | """ 185 | block_strings = [] 186 | for block in blocks_args: 187 | block_strings.append(BlockDecoder._encode_block_string(block)) 188 | return block_strings 189 | 190 | 191 | def efficientnet(width_coefficient=None, depth_coefficient=None, 192 | dropout_rate=0.2, drop_connect_rate=0.2): 193 | """ Creates a efficientnet model. 
""" 194 | 195 | blocks_args = [ 196 | 'r1_k3_s11_e1_i32_o16_se0.25', 'r2_k3_s22_e6_i16_o24_se0.25', 197 | 'r2_k5_s22_e6_i24_o40_se0.25', 'r3_k3_s22_e6_i40_o80_se0.25', 198 | 'r3_k5_s11_e6_i80_o112_se0.25', 'r4_k5_s22_e6_i112_o192_se0.25', 199 | 'r1_k3_s11_e6_i192_o320_se0.25', 200 | ] 201 | blocks_args = BlockDecoder.decode(blocks_args) 202 | 203 | global_params = GlobalParams( 204 | batch_norm_momentum=0.99, 205 | batch_norm_epsilon=1e-3, 206 | dropout_rate=dropout_rate, 207 | drop_connect_rate=drop_connect_rate, 208 | # data_format='channels_last', # removed, this is always true in PyTorch 209 | num_classes=1000, 210 | width_coefficient=width_coefficient, 211 | depth_coefficient=depth_coefficient, 212 | depth_divisor=8, 213 | min_depth=None 214 | ) 215 | 216 | return blocks_args, global_params 217 | 218 | 219 | def get_model_params(model_name, override_params): 220 | """ Get the block args and global params for a given model """ 221 | if model_name.startswith('efficientnet'): 222 | w, d, _, p = efficientnet_params(model_name) 223 | # note: all models have drop connect rate = 0.2 224 | blocks_args, global_params = efficientnet(width_coefficient=w, depth_coefficient=d, dropout_rate=p) 225 | else: 226 | raise NotImplementedError('model name is not pre-defined: %s' % model_name) 227 | if override_params: 228 | # ValueError will be raised here if override_params has fields not included in global_params. 229 | global_params = global_params._replace(**override_params) 230 | return blocks_args, global_params 231 | 232 | 233 | url_map = { 234 | 'efficientnet-b0': 'http://storage.googleapis.com/public-models/efficientnet-b0-08094119.pth', 235 | 'efficientnet-b1': 'http://storage.googleapis.com/public-models/efficientnet-b1-dbc7070a.pth', 236 | 'efficientnet-b2': 'http://storage.googleapis.com/public-models/efficientnet-b2-27687264.pth', 237 | 'efficientnet-b3': 'http://storage.googleapis.com/public-models/efficientnet-b3-c8376fa2.pth', 238 | 'efficientnet-b4': 'http://storage.googleapis.com/public-models/efficientnet-b4-e116e8b3.pth', 239 | 'efficientnet-b5': 'http://storage.googleapis.com/public-models/efficientnet-b5-586e6cc6.pth', 240 | } 241 | 242 | 243 | def load_pretrained_weights(model, model_name): 244 | """ Loads pretrained weights, and downloads if loading for the first time. 
""" 245 | state_dict = load_state_dict_from_url(url_map[model_name]) 246 | model.load_state_dict(state_dict, strict=False) 247 | print('Loaded pretrained weights for {}'.format(model_name)) 248 | -------------------------------------------------------------------------------- /ssd/modeling/backbone/mobilenet.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | from ssd.modeling import registry 4 | from ssd.utils.model_zoo import load_state_dict_from_url 5 | 6 | model_urls = { 7 | 'mobilenet_v2': 'https://download.pytorch.org/models/mobilenet_v2-b0353104.pth', 8 | } 9 | 10 | 11 | class ConvBNReLU(nn.Sequential): 12 | def __init__(self, in_planes, out_planes, kernel_size=3, stride=1, groups=1): 13 | padding = (kernel_size - 1) // 2 14 | super(ConvBNReLU, self).__init__( 15 | nn.Conv2d(in_planes, out_planes, kernel_size, stride, padding, groups=groups, bias=False), 16 | nn.BatchNorm2d(out_planes), 17 | nn.ReLU6(inplace=True) 18 | ) 19 | 20 | 21 | class InvertedResidual(nn.Module): 22 | def __init__(self, inp, oup, stride, expand_ratio): 23 | super(InvertedResidual, self).__init__() 24 | self.stride = stride 25 | assert stride in [1, 2] 26 | 27 | hidden_dim = int(round(inp * expand_ratio)) 28 | self.use_res_connect = self.stride == 1 and inp == oup 29 | 30 | layers = [] 31 | if expand_ratio != 1: 32 | # pw 33 | layers.append(ConvBNReLU(inp, hidden_dim, kernel_size=1)) 34 | layers.extend([ 35 | # dw 36 | ConvBNReLU(hidden_dim, hidden_dim, stride=stride, groups=hidden_dim), 37 | # pw-linear 38 | nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False), 39 | nn.BatchNorm2d(oup), 40 | ]) 41 | self.conv = nn.Sequential(*layers) 42 | 43 | def forward(self, x): 44 | if self.use_res_connect: 45 | return x + self.conv(x) 46 | else: 47 | return self.conv(x) 48 | 49 | 50 | class MobileNetV2(nn.Module): 51 | def __init__(self, width_mult=1.0, inverted_residual_setting=None): 52 | super(MobileNetV2, self).__init__() 53 | block = InvertedResidual 54 | input_channel = 32 55 | last_channel = 1280 56 | 57 | if inverted_residual_setting is None: 58 | inverted_residual_setting = [ 59 | # t, c, n, s 60 | [1, 16, 1, 1], 61 | [6, 24, 2, 2], 62 | [6, 32, 3, 2], 63 | [6, 64, 4, 2], 64 | [6, 96, 3, 1], 65 | [6, 160, 3, 2], 66 | [6, 320, 1, 1], 67 | ] 68 | 69 | # only check the first element, assuming user knows t,c,n,s are required 70 | if len(inverted_residual_setting) == 0 or len(inverted_residual_setting[0]) != 4: 71 | raise ValueError("inverted_residual_setting should be non-empty " 72 | "or a 4-element list, got {}".format(inverted_residual_setting)) 73 | 74 | # building first layer 75 | input_channel = int(input_channel * width_mult) 76 | self.last_channel = int(last_channel * max(1.0, width_mult)) 77 | features = [ConvBNReLU(3, input_channel, stride=2)] 78 | # building inverted residual blocks 79 | for t, c, n, s in inverted_residual_setting: 80 | output_channel = int(c * width_mult) 81 | for i in range(n): 82 | stride = s if i == 0 else 1 83 | features.append(block(input_channel, output_channel, stride, expand_ratio=t)) 84 | input_channel = output_channel 85 | # building last several layers 86 | features.append(ConvBNReLU(input_channel, self.last_channel, kernel_size=1)) 87 | # make it nn.Sequential 88 | self.features = nn.Sequential(*features) 89 | self.extras = nn.ModuleList([ 90 | InvertedResidual(1280, 512, 2, 0.2), 91 | InvertedResidual(512, 256, 2, 0.25), 92 | InvertedResidual(256, 256, 2, 0.5), 93 | InvertedResidual(256, 64, 2, 0.25) 94 | ]) 95 | 
96 | self.reset_parameters() 97 | 98 | def reset_parameters(self): 99 | # weight initialization 100 | for m in self.modules(): 101 | if isinstance(m, nn.Conv2d): 102 | nn.init.kaiming_normal_(m.weight, mode='fan_out') 103 | if m.bias is not None: 104 | nn.init.zeros_(m.bias) 105 | elif isinstance(m, nn.BatchNorm2d): 106 | nn.init.ones_(m.weight) 107 | nn.init.zeros_(m.bias) 108 | elif isinstance(m, nn.Linear): 109 | nn.init.normal_(m.weight, 0, 0.01) 110 | nn.init.zeros_(m.bias) 111 | 112 | def forward(self, x): 113 | features = [] 114 | for i in range(14): 115 | x = self.features[i](x) 116 | features.append(x) 117 | 118 | for i in range(14, len(self.features)): 119 | x = self.features[i](x) 120 | features.append(x) 121 | 122 | for i in range(len(self.extras)): 123 | x = self.extras[i](x) 124 | features.append(x) 125 | 126 | return tuple(features) 127 | 128 | 129 | @registry.BACKBONES.register('mobilenet_v2') 130 | def mobilenet_v2(cfg, pretrained=True): 131 | model = MobileNetV2() 132 | if pretrained: 133 | model.load_state_dict(load_state_dict_from_url(model_urls['mobilenet_v2']), strict=False) 134 | return model 135 | -------------------------------------------------------------------------------- /ssd/modeling/backbone/mobilenetv3.py: -------------------------------------------------------------------------------- 1 | """ 2 | Creates a MobileNetV3 Model as defined in: 3 | Andrew Howard, Mark Sandler, Grace Chu, Liang-Chieh Chen, Bo Chen, Mingxing Tan, Weijun Wang, Yukun Zhu, Ruoming Pang, Vijay Vasudevan, Quoc V. Le, Hartwig Adam. (2019). 4 | Searching for MobileNetV3 5 | arXiv preprint arXiv:1905.02244. 6 | 7 | 8 | @ Credit from https://github.com/d-li14/mobilenetv3.pytorch 9 | @ Modified by Chakkrit Termritthikun (https://github.com/chakkritte) 10 | 11 | """ 12 | 13 | import torch.nn as nn 14 | import math 15 | 16 | from ssd.modeling import registry 17 | from ssd.utils.model_zoo import load_state_dict_from_url 18 | 19 | model_urls = { 20 | 'mobilenet_v3': 'https://github.com/d-li14/mobilenetv3.pytorch/raw/master/pretrained/mobilenetv3-large-1cd25616.pth', 21 | } 22 | 23 | 24 | def _make_divisible(v, divisor, min_value=None): 25 | """ 26 | This function is taken from the original tf repo. 27 | It ensures that all layers have a channel number that is divisible by 8 28 | It can be seen here: 29 | https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py 30 | :param v: 31 | :param divisor: 32 | :param min_value: 33 | :return: 34 | """ 35 | if min_value is None: 36 | min_value = divisor 37 | new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) 38 | # Make sure that round down does not go down by more than 10%. 
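# e.g. _make_divisible(38.4, 8) -> 40, but _make_divisible(9, 8) -> 16:
# 9 first rounds down to 8, which is more than 10% below 9, so it is bumped up a divisor.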
39 | if new_v < 0.9 * v: 40 | new_v += divisor 41 | return new_v 42 | 43 | 44 | class h_sigmoid(nn.Module): 45 | def __init__(self, inplace=True): 46 | super(h_sigmoid, self).__init__() 47 | self.relu = nn.ReLU6(inplace=inplace) 48 | 49 | def forward(self, x): 50 | return self.relu(x + 3) / 6 51 | 52 | 53 | class h_swish(nn.Module): 54 | def __init__(self, inplace=True): 55 | super(h_swish, self).__init__() 56 | self.sigmoid = h_sigmoid(inplace=inplace) 57 | 58 | def forward(self, x): 59 | return x * self.sigmoid(x) 60 | 61 | 62 | class SELayer(nn.Module): 63 | def __init__(self, channel, reduction=4): 64 | super(SELayer, self).__init__() 65 | self.avg_pool = nn.AdaptiveAvgPool2d(1) 66 | self.fc = nn.Sequential( 67 | nn.Linear(channel, _make_divisible(channel // reduction, 8)), 68 | nn.ReLU(inplace=True), 69 | nn.Linear(_make_divisible(channel // reduction, 8), channel), 70 | h_sigmoid() 71 | ) 72 | 73 | def forward(self, x): 74 | b, c, _, _ = x.size() 75 | y = self.avg_pool(x).view(b, c) 76 | y = self.fc(y).view(b, c, 1, 1) 77 | return x * y 78 | 79 | 80 | def conv_3x3_bn(inp, oup, stride): 81 | return nn.Sequential( 82 | nn.Conv2d(inp, oup, 3, stride, 1, bias=False), 83 | nn.BatchNorm2d(oup), 84 | h_swish() 85 | ) 86 | 87 | 88 | def conv_1x1_bn(inp, oup): 89 | return nn.Sequential( 90 | nn.Conv2d(inp, oup, 1, 1, 0, bias=False), 91 | nn.BatchNorm2d(oup), 92 | h_swish() 93 | ) 94 | 95 | 96 | class InvertedResidual(nn.Module): 97 | def __init__(self, inp, hidden_dim, oup, kernel_size, stride, use_se, use_hs): 98 | super(InvertedResidual, self).__init__() 99 | assert stride in [1, 2] 100 | 101 | self.identity = stride == 1 and inp == oup 102 | 103 | if inp == hidden_dim: 104 | self.conv = nn.Sequential( 105 | # dw 106 | nn.Conv2d(hidden_dim, hidden_dim, kernel_size, stride, (kernel_size - 1) // 2, groups=hidden_dim, bias=False), 107 | nn.BatchNorm2d(hidden_dim), 108 | h_swish() if use_hs else nn.ReLU(inplace=True), 109 | # Squeeze-and-Excite 110 | SELayer(hidden_dim) if use_se else nn.Identity(), 111 | # pw-linear 112 | nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False), 113 | nn.BatchNorm2d(oup), 114 | ) 115 | else: 116 | self.conv = nn.Sequential( 117 | # pw 118 | nn.Conv2d(inp, hidden_dim, 1, 1, 0, bias=False), 119 | nn.BatchNorm2d(hidden_dim), 120 | h_swish() if use_hs else nn.ReLU(inplace=True), 121 | # dw 122 | nn.Conv2d(hidden_dim, hidden_dim, kernel_size, stride, (kernel_size - 1) // 2, groups=hidden_dim, bias=False), 123 | nn.BatchNorm2d(hidden_dim), 124 | # Squeeze-and-Excite 125 | SELayer(hidden_dim) if use_se else nn.Identity(), 126 | h_swish() if use_hs else nn.ReLU(inplace=True), 127 | # pw-linear 128 | nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False), 129 | nn.BatchNorm2d(oup), 130 | ) 131 | 132 | def forward(self, x): 133 | if self.identity: 134 | return x + self.conv(x) 135 | else: 136 | return self.conv(x) 137 | 138 | 139 | class MobileNetV3(nn.Module): 140 | def __init__(self, mode='large', num_classes=1000, width_mult=1.): 141 | super(MobileNetV3, self).__init__() 142 | # setting of inverted residual blocks 143 | self.cfgs = [ 144 | # k, t, c, SE, HS, s 145 | [3, 1, 16, 0, 0, 1], 146 | [3, 4, 24, 0, 0, 2], 147 | [3, 3, 24, 0, 0, 1], 148 | [5, 3, 40, 1, 0, 2], 149 | [5, 3, 40, 1, 0, 1], 150 | [5, 3, 40, 1, 0, 1], 151 | [3, 6, 80, 0, 1, 2], 152 | [3, 2.5, 80, 0, 1, 1], 153 | [3, 2.3, 80, 0, 1, 1], 154 | [3, 2.3, 80, 0, 1, 1], 155 | [3, 6, 112, 1, 1, 1], 156 | [3, 6, 112, 1, 1, 1], 157 | [5, 6, 160, 1, 1, 2], 158 | [5, 6, 160, 1, 1, 1], 159 | [5, 6, 160, 1, 1, 1]] 160 | 161 | 
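# Each row above is one MobileNetV3-Large block:
# (kernel k, expansion ratio t, output channels c, use SE, use h-swish HS, stride s).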
assert mode in ['large', 'small'] 162 | 163 | # building first layer 164 | input_channel = _make_divisible(16 * width_mult, 8) 165 | 166 | layers = [conv_3x3_bn(3, input_channel, 2)] 167 | # building inverted residual blocks 168 | block = InvertedResidual 169 | for k, t, c, use_se, use_hs, s in self.cfgs: 170 | output_channel = _make_divisible(c * width_mult, 8) 171 | exp_size = _make_divisible(input_channel * t, 8) 172 | layers.append(block(input_channel, exp_size, output_channel, k, s, use_se, use_hs)) 173 | input_channel = output_channel 174 | # building last several layers 175 | layers.append(conv_1x1_bn(input_channel, exp_size)) 176 | self.features = nn.Sequential(*layers) 177 | self.extras = nn.ModuleList([ 178 | InvertedResidual(960, _make_divisible(960 * 0.2, 8), 512, 3, 2, True, True), 179 | InvertedResidual(512, _make_divisible(512 * 0.25, 8), 256, 3, 2, True, True), 180 | InvertedResidual(256, _make_divisible(256 * 0.5, 8), 256, 3, 2, True, True), 181 | InvertedResidual(256, _make_divisible(256 * 0.25, 8), 64, 3, 2, True, True), 182 | ]) 183 | 184 | self.reset_parameters() 185 | 186 | def forward(self, x): 187 | features = [] 188 | for i in range(13): 189 | x = self.features[i](x) 190 | features.append(x) 191 | 192 | for i in range(13, len(self.features)): 193 | x = self.features[i](x) 194 | features.append(x) 195 | 196 | for i in range(len(self.extras)): 197 | x = self.extras[i](x) 198 | features.append(x) 199 | 200 | return tuple(features) 201 | 202 | def reset_parameters(self): 203 | for m in self.modules(): 204 | if isinstance(m, nn.Conv2d): 205 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 206 | m.weight.data.normal_(0, math.sqrt(2. / n)) 207 | if m.bias is not None: 208 | m.bias.data.zero_() 209 | elif isinstance(m, nn.BatchNorm2d): 210 | m.weight.data.fill_(1) 211 | m.bias.data.zero_() 212 | elif isinstance(m, nn.Linear): 213 | n = m.weight.size(1) 214 | m.weight.data.normal_(0, 0.01) 215 | m.bias.data.zero_() 216 | 217 | 218 | @registry.BACKBONES.register('mobilenet_v3') 219 | def mobilenet_v3(cfg, pretrained=True): 220 | model = MobileNetV3() 221 | if pretrained: 222 | model.load_state_dict(load_state_dict_from_url(model_urls['mobilenet_v3']), strict=False) 223 | return model 224 | -------------------------------------------------------------------------------- /ssd/modeling/backbone/vgg.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.nn.functional as F 3 | 4 | from ssd.layers import L2Norm 5 | from ssd.modeling import registry 6 | from ssd.utils.model_zoo import load_state_dict_from_url 7 | 8 | model_urls = { 9 | 'vgg': 'https://s3.amazonaws.com/amdegroot-models/vgg16_reducedfc.pth', 10 | } 11 | 12 | 13 | # borrowed from https://github.com/amdegroot/ssd.pytorch/blob/master/ssd.py 14 | def add_vgg(cfg, batch_norm=False): 15 | layers = [] 16 | in_channels = 3 17 | for v in cfg: 18 | if v == 'M': 19 | layers += [nn.MaxPool2d(kernel_size=2, stride=2)] 20 | elif v == 'C': 21 | layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)] 22 | else: 23 | conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1) 24 | if batch_norm: 25 | layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)] 26 | else: 27 | layers += [conv2d, nn.ReLU(inplace=True)] 28 | in_channels = v 29 | pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1) 30 | conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6) 31 | conv7 = nn.Conv2d(1024, 1024, kernel_size=1) 32 | layers += [pool5, 
conv6, 33 | nn.ReLU(inplace=True), conv7, nn.ReLU(inplace=True)] 34 | return layers 35 | 36 | 37 | def add_extras(cfg, i, size=300): 38 | # Extra layers added to VGG for feature scaling 39 | layers = [] 40 | in_channels = i 41 | flag = False 42 | for k, v in enumerate(cfg): 43 | if in_channels != 'S': 44 | if v == 'S': 45 | layers += [nn.Conv2d(in_channels, cfg[k + 1], kernel_size=(1, 3)[flag], stride=2, padding=1)] 46 | else: 47 | layers += [nn.Conv2d(in_channels, v, kernel_size=(1, 3)[flag])] 48 | flag = not flag 49 | in_channels = v 50 | if size == 512: 51 | layers.append(nn.Conv2d(in_channels, 128, kernel_size=1, stride=1)) 52 | layers.append(nn.Conv2d(128, 256, kernel_size=4, stride=1, padding=1)) 53 | return layers 54 | 55 | 56 | vgg_base = { 57 | '300': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M', 58 | 512, 512, 512], 59 | '512': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M', 60 | 512, 512, 512], 61 | } 62 | extras_base = { 63 | '300': [256, 'S', 512, 128, 'S', 256, 128, 256, 128, 256], 64 | '512': [256, 'S', 512, 128, 'S', 256, 128, 'S', 256, 128, 'S', 256], 65 | } 66 | 67 | 68 | class VGG(nn.Module): 69 | def __init__(self, cfg): 70 | super().__init__() 71 | size = cfg.INPUT.IMAGE_SIZE 72 | vgg_config = vgg_base[str(size)] 73 | extras_config = extras_base[str(size)] 74 | 75 | self.vgg = nn.ModuleList(add_vgg(vgg_config)) 76 | self.extras = nn.ModuleList(add_extras(extras_config, i=1024, size=size)) 77 | self.l2_norm = L2Norm(512, scale=20) 78 | self.reset_parameters() 79 | 80 | def reset_parameters(self): 81 | for m in self.extras.modules(): 82 | if isinstance(m, nn.Conv2d): 83 | nn.init.xavier_uniform_(m.weight) 84 | nn.init.zeros_(m.bias) 85 | 86 | def init_from_pretrain(self, state_dict): 87 | self.vgg.load_state_dict(state_dict) 88 | 89 | def forward(self, x): 90 | features = [] 91 | for i in range(23): 92 | x = self.vgg[i](x) 93 | s = self.l2_norm(x) # Conv4_3 L2 normalization 94 | features.append(s) 95 | 96 | # apply vgg up to fc7 97 | for i in range(23, len(self.vgg)): 98 | x = self.vgg[i](x) 99 | features.append(x) 100 | 101 | for k, v in enumerate(self.extras): 102 | x = F.relu(v(x), inplace=True) 103 | if k % 2 == 1: 104 | features.append(x) 105 | 106 | return tuple(features) 107 | 108 | 109 | @registry.BACKBONES.register('vgg') 110 | def vgg(cfg, pretrained=True): 111 | model = VGG(cfg) 112 | if pretrained: 113 | model.init_from_pretrain(load_state_dict_from_url(model_urls['vgg'])) 114 | return model 115 | -------------------------------------------------------------------------------- /ssd/modeling/box_head/__init__.py: -------------------------------------------------------------------------------- 1 | from ssd.modeling import registry 2 | from .box_head import SSDBoxHead 3 | 4 | __all__ = ['build_box_head', 'SSDBoxHead'] 5 | 6 | 7 | def build_box_head(cfg): 8 | return registry.BOX_HEADS[cfg.MODEL.BOX_HEAD.NAME](cfg) 9 | -------------------------------------------------------------------------------- /ssd/modeling/box_head/box_head.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | import torch.nn.functional as F 3 | 4 | from ssd.modeling import registry 5 | from ssd.modeling.anchors.prior_box import PriorBox 6 | from ssd.modeling.box_head.box_predictor import make_box_predictor 7 | from ssd.utils import box_utils 8 | from .inference import PostProcessor 9 | from .loss import MultiBoxLoss 10 | 11 | 12 | @registry.BOX_HEADS.register('SSDBoxHead') 13 | class 
SSDBoxHead(nn.Module): 14 | def __init__(self, cfg): 15 | super().__init__() 16 | self.cfg = cfg 17 | self.predictor = make_box_predictor(cfg) 18 | self.loss_evaluator = MultiBoxLoss(neg_pos_ratio=cfg.MODEL.NEG_POS_RATIO) 19 | self.post_processor = PostProcessor(cfg) 20 | self.priors = None 21 | 22 | def forward(self, features, targets=None): 23 | cls_logits, bbox_pred = self.predictor(features) 24 | if self.training: 25 | return self._forward_train(cls_logits, bbox_pred, targets) 26 | else: 27 | return self._forward_test(cls_logits, bbox_pred) 28 | 29 | def _forward_train(self, cls_logits, bbox_pred, targets): 30 | gt_boxes, gt_labels = targets['boxes'], targets['labels'] 31 | reg_loss, cls_loss = self.loss_evaluator(cls_logits, bbox_pred, gt_labels, gt_boxes) 32 | loss_dict = dict( 33 | reg_loss=reg_loss, 34 | cls_loss=cls_loss, 35 | ) 36 | detections = (cls_logits, bbox_pred) 37 | return detections, loss_dict 38 | 39 | def _forward_test(self, cls_logits, bbox_pred): 40 | if self.priors is None: 41 | self.priors = PriorBox(self.cfg)().to(bbox_pred.device) 42 | scores = F.softmax(cls_logits, dim=2) 43 | boxes = box_utils.convert_locations_to_boxes( 44 | bbox_pred, self.priors, self.cfg.MODEL.CENTER_VARIANCE, self.cfg.MODEL.SIZE_VARIANCE 45 | ) 46 | boxes = box_utils.center_form_to_corner_form(boxes) 47 | detections = (scores, boxes) 48 | detections = self.post_processor(detections) 49 | return detections, {} 50 | -------------------------------------------------------------------------------- /ssd/modeling/box_head/box_predictor.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | 4 | from ssd.layers import SeparableConv2d 5 | from ssd.modeling import registry 6 | 7 | 8 | class BoxPredictor(nn.Module): 9 | def __init__(self, cfg): 10 | super().__init__() 11 | self.cfg = cfg 12 | self.cls_headers = nn.ModuleList() 13 | self.reg_headers = nn.ModuleList() 14 | for level, (boxes_per_location, out_channels) in enumerate(zip(cfg.MODEL.PRIORS.BOXES_PER_LOCATION, cfg.MODEL.BACKBONE.OUT_CHANNELS)): 15 | self.cls_headers.append(self.cls_block(level, out_channels, boxes_per_location)) 16 | self.reg_headers.append(self.reg_block(level, out_channels, boxes_per_location)) 17 | self.reset_parameters() 18 | 19 | def cls_block(self, level, out_channels, boxes_per_location): 20 | raise NotImplementedError 21 | 22 | def reg_block(self, level, out_channels, boxes_per_location): 23 | raise NotImplementedError 24 | 25 | def reset_parameters(self): 26 | for m in self.modules(): 27 | if isinstance(m, nn.Conv2d): 28 | nn.init.xavier_uniform_(m.weight) 29 | nn.init.zeros_(m.bias) 30 | 31 | def forward(self, features): 32 | cls_logits = [] 33 | bbox_pred = [] 34 | for feature, cls_header, reg_header in zip(features, self.cls_headers, self.reg_headers): 35 | cls_logits.append(cls_header(feature).permute(0, 2, 3, 1).contiguous()) 36 | bbox_pred.append(reg_header(feature).permute(0, 2, 3, 1).contiguous()) 37 | 38 | batch_size = features[0].shape[0] 39 | cls_logits = torch.cat([c.view(c.shape[0], -1) for c in cls_logits], dim=1).view(batch_size, -1, self.cfg.MODEL.NUM_CLASSES) 40 | bbox_pred = torch.cat([l.view(l.shape[0], -1) for l in bbox_pred], dim=1).view(batch_size, -1, 4) 41 | 42 | return cls_logits, bbox_pred 43 | 44 | 45 | @registry.BOX_PREDICTORS.register('SSDBoxPredictor') 46 | class SSDBoxPredictor(BoxPredictor): 47 | def cls_block(self, level, out_channels, boxes_per_location): 48 | return nn.Conv2d(out_channels, 
boxes_per_location * self.cfg.MODEL.NUM_CLASSES, kernel_size=3, stride=1, padding=1) 49 | 50 | def reg_block(self, level, out_channels, boxes_per_location): 51 | return nn.Conv2d(out_channels, boxes_per_location * 4, kernel_size=3, stride=1, padding=1) 52 | 53 | 54 | @registry.BOX_PREDICTORS.register('SSDLiteBoxPredictor') 55 | class SSDLiteBoxPredictor(BoxPredictor): 56 | def cls_block(self, level, out_channels, boxes_per_location): 57 | num_levels = len(self.cfg.MODEL.BACKBONE.OUT_CHANNELS) 58 | if level == num_levels - 1: 59 | return nn.Conv2d(out_channels, boxes_per_location * self.cfg.MODEL.NUM_CLASSES, kernel_size=1) 60 | return SeparableConv2d(out_channels, boxes_per_location * self.cfg.MODEL.NUM_CLASSES, kernel_size=3, stride=1, padding=1) 61 | 62 | def reg_block(self, level, out_channels, boxes_per_location): 63 | num_levels = len(self.cfg.MODEL.BACKBONE.OUT_CHANNELS) 64 | if level == num_levels - 1: 65 | return nn.Conv2d(out_channels, boxes_per_location * 4, kernel_size=1) 66 | return SeparableConv2d(out_channels, boxes_per_location * 4, kernel_size=3, stride=1, padding=1) 67 | 68 | 69 | def make_box_predictor(cfg): 70 | return registry.BOX_PREDICTORS[cfg.MODEL.BOX_HEAD.PREDICTOR](cfg) 71 | -------------------------------------------------------------------------------- /ssd/modeling/box_head/inference.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from ssd.structures.container import Container 4 | from ssd.utils.nms import batched_nms 5 | 6 | 7 | class PostProcessor: 8 | def __init__(self, cfg): 9 | super().__init__() 10 | self.cfg = cfg 11 | self.width = cfg.INPUT.IMAGE_SIZE 12 | self.height = cfg.INPUT.IMAGE_SIZE 13 | 14 | def __call__(self, detections): 15 | batches_scores, batches_boxes = detections 16 | device = batches_scores.device 17 | batch_size = batches_scores.size(0) 18 | results = [] 19 | for batch_id in range(batch_size): 20 | scores, boxes = batches_scores[batch_id], batches_boxes[batch_id] # (N, #CLS) (N, 4) 21 | num_boxes = scores.shape[0] 22 | num_classes = scores.shape[1] 23 | 24 | boxes = boxes.view(num_boxes, 1, 4).expand(num_boxes, num_classes, 4) 25 | labels = torch.arange(num_classes, device=device) 26 | labels = labels.view(1, num_classes).expand_as(scores) 27 | 28 | # remove predictions with the background label 29 | boxes = boxes[:, 1:] 30 | scores = scores[:, 1:] 31 | labels = labels[:, 1:] 32 | 33 | # batch everything, by making every class prediction be a separate instance 34 | boxes = boxes.reshape(-1, 4) 35 | scores = scores.reshape(-1) 36 | labels = labels.reshape(-1) 37 | 38 | # remove low scoring boxes 39 | indices = torch.nonzero(scores > self.cfg.TEST.CONFIDENCE_THRESHOLD).squeeze(1) 40 | boxes, scores, labels = boxes[indices], scores[indices], labels[indices] 41 | 42 | boxes[:, 0::2] *= self.width 43 | boxes[:, 1::2] *= self.height 44 | 45 | keep = batched_nms(boxes, scores, labels, self.cfg.TEST.NMS_THRESHOLD) 46 | # keep only topk scoring predictions 47 | keep = keep[:self.cfg.TEST.MAX_PER_IMAGE] 48 | boxes, scores, labels = boxes[keep], scores[keep], labels[keep] 49 | 50 | container = Container(boxes=boxes, labels=labels, scores=scores) 51 | container.img_width = self.width 52 | container.img_height = self.height 53 | results.append(container) 54 | return results 55 | -------------------------------------------------------------------------------- /ssd/modeling/box_head/loss.py: -------------------------------------------------------------------------------- 1 | import 
torch.nn as nn
2 | import torch.nn.functional as F
3 | import torch
4 |
5 | from ssd.utils import box_utils
6 |
7 |
8 | class MultiBoxLoss(nn.Module):
9 | def __init__(self, neg_pos_ratio):
10 | """Implement SSD MultiBox Loss.
11 |
12 | Basically, MultiBox loss combines classification loss
13 | and Smooth L1 regression loss.
14 | """
15 | super(MultiBoxLoss, self).__init__()
16 | self.neg_pos_ratio = neg_pos_ratio
17 |
18 | def forward(self, confidence, predicted_locations, labels, gt_locations):
19 | """Compute classification loss and smooth l1 loss.
20 |
21 | Args:
22 | confidence (batch_size, num_priors, num_classes): class predictions.
23 | predicted_locations (batch_size, num_priors, 4): predicted locations.
24 | labels (batch_size, num_priors): real labels of all the priors.
25 | gt_locations (batch_size, num_priors, 4): real boxes corresponding to all the priors.
26 | """
27 | num_classes = confidence.size(2)
28 | with torch.no_grad():
29 | # derived from cross_entropy = -sum(log(p)); rank negatives by the background log-probability
30 | loss = -F.log_softmax(confidence, dim=2)[:, :, 0]
31 | mask = box_utils.hard_negative_mining(loss, labels, self.neg_pos_ratio)
32 |
33 | confidence = confidence[mask, :]
34 | classification_loss = F.cross_entropy(confidence.view(-1, num_classes), labels[mask], reduction='sum')
35 |
36 | pos_mask = labels > 0
37 | predicted_locations = predicted_locations[pos_mask, :].view(-1, 4)
38 | gt_locations = gt_locations[pos_mask, :].view(-1, 4)
39 | smooth_l1_loss = F.smooth_l1_loss(predicted_locations, gt_locations, reduction='sum')
40 | num_pos = gt_locations.size(0)
41 | return smooth_l1_loss / num_pos, classification_loss / num_pos
42 |
-------------------------------------------------------------------------------- /ssd/modeling/detector/__init__.py: --------------------------------------------------------------------------------
1 | from .ssd_detector import SSDDetector
2 |
3 | _DETECTION_META_ARCHITECTURES = {
4 | "SSDDetector": SSDDetector
5 | }
6 |
7 |
8 | def build_detection_model(cfg):
9 | meta_arch = _DETECTION_META_ARCHITECTURES[cfg.MODEL.META_ARCHITECTURE]
10 | return meta_arch(cfg)
11 |
-------------------------------------------------------------------------------- /ssd/modeling/detector/ssd_detector.py: --------------------------------------------------------------------------------
1 | from torch import nn
2 |
3 | from ssd.modeling.backbone import build_backbone
4 | from ssd.modeling.box_head import build_box_head
5 |
6 |
7 | class SSDDetector(nn.Module):
8 | def __init__(self, cfg):
9 | super().__init__()
10 | self.cfg = cfg
11 | self.backbone = build_backbone(cfg)
12 | self.box_head = build_box_head(cfg)
13 |
14 | def forward(self, images, targets=None):
15 | features = self.backbone(images)
16 | detections, detector_losses = self.box_head(features, targets)
17 | if self.training:
18 | return detector_losses
19 | return detections
20 |
-------------------------------------------------------------------------------- /ssd/modeling/registry.py: --------------------------------------------------------------------------------
1 | from ssd.utils.registry import Registry
2 |
3 | BACKBONES = Registry()
4 | BOX_HEADS = Registry()
5 | BOX_PREDICTORS = Registry()
6 |
-------------------------------------------------------------------------------- /ssd/solver/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/lufficc/SSD/68dc0a20efaf3997e58b616afaaaa21bf8ca3c05/ssd/solver/__init__.py
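(Putting the modeling pieces above together: a minimal inference sketch, assuming `images` is a
normalized NCHW batch sized per cfg.INPUT.IMAGE_SIZE; see test.py below for the full pipeline.)

    from ssd.config import cfg
    from ssd.modeling.detector import build_detection_model

    cfg.merge_from_file('configs/vgg_ssd300_voc0712.yaml')
    cfg.freeze()
    model = build_detection_model(cfg)  # SSDDetector = backbone + box head
    model.eval()
    detections = model(images)  # one Container(boxes, labels, scores) per image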
-------------------------------------------------------------------------------- /ssd/solver/build.py: --------------------------------------------------------------------------------
1 | import torch
2 |
3 | from .lr_scheduler import WarmupMultiStepLR
4 |
5 |
6 | def make_optimizer(cfg, model, lr=None):
7 | lr = cfg.SOLVER.BASE_LR if lr is None else lr
8 | return torch.optim.SGD(model.parameters(), lr=lr, momentum=cfg.SOLVER.MOMENTUM, weight_decay=cfg.SOLVER.WEIGHT_DECAY)
9 |
10 |
11 | def make_lr_scheduler(cfg, optimizer, milestones=None):
12 | return WarmupMultiStepLR(optimizer=optimizer,
13 | milestones=cfg.SOLVER.LR_STEPS if milestones is None else milestones,
14 | gamma=cfg.SOLVER.GAMMA,
15 | warmup_factor=cfg.SOLVER.WARMUP_FACTOR,
16 | warmup_iters=cfg.SOLVER.WARMUP_ITERS)
17 |
-------------------------------------------------------------------------------- /ssd/solver/lr_scheduler.py: --------------------------------------------------------------------------------
1 | from bisect import bisect_right
2 |
3 | from torch.optim.lr_scheduler import _LRScheduler
4 |
5 |
6 | class WarmupMultiStepLR(_LRScheduler):
7 | def __init__(self, optimizer, milestones, gamma=0.1, warmup_factor=1.0 / 3,
8 | warmup_iters=500, last_epoch=-1):
9 | if not list(milestones) == sorted(milestones):
10 | raise ValueError(
11 | "Milestones should be a list of"
12 | " increasing integers. Got {}".format(milestones)
13 | )
14 |
15 | self.milestones = milestones
16 | self.gamma = gamma
17 | self.warmup_factor = warmup_factor
18 | self.warmup_iters = warmup_iters
19 | super().__init__(optimizer, last_epoch)
20 |
21 | def get_lr(self):
22 | warmup_factor = 1
23 | if self.last_epoch < self.warmup_iters:
24 | alpha = float(self.last_epoch) / self.warmup_iters
25 | warmup_factor = self.warmup_factor * (1 - alpha) + alpha
26 | return [
27 | base_lr
28 | * warmup_factor
29 | * self.gamma ** bisect_right(self.milestones, self.last_epoch)
30 | for base_lr in self.base_lrs
31 | ]
32 |
-------------------------------------------------------------------------------- /ssd/structures/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/lufficc/SSD/68dc0a20efaf3997e58b616afaaaa21bf8ca3c05/ssd/structures/__init__.py
-------------------------------------------------------------------------------- /ssd/structures/container.py: --------------------------------------------------------------------------------
1 | class Container:
2 | """
3 | Helper class for managing boxes, labels, etc.
4 | It does not inherit from dict because `default_collate` converts dict subclasses back to plain dicts.
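Example (mirroring how the box head consumes targets):
    targets = Container(boxes=boxes, labels=labels)
    targets['boxes']    # dict-style access
    targets.to(device)  # forwards .to() to every value that supports it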
5 | """ 6 | 7 | def __init__(self, *args, **kwargs): 8 | self._data_dict = dict(*args, **kwargs) 9 | 10 | def __setattr__(self, key, value): 11 | object.__setattr__(self, key, value) 12 | 13 | def __getitem__(self, key): 14 | return self._data_dict[key] 15 | 16 | def __iter__(self): 17 | return self._data_dict.__iter__() 18 | 19 | def __setitem__(self, key, value): 20 | self._data_dict[key] = value 21 | 22 | def _call(self, name, *args, **kwargs): 23 | keys = list(self._data_dict.keys()) 24 | for key in keys: 25 | value = self._data_dict[key] 26 | if hasattr(value, name): 27 | self._data_dict[key] = getattr(value, name)(*args, **kwargs) 28 | return self 29 | 30 | def to(self, *args, **kwargs): 31 | return self._call('to', *args, **kwargs) 32 | 33 | def numpy(self): 34 | return self._call('numpy') 35 | 36 | def resize(self, size): 37 | """resize boxes 38 | Args: 39 | size: (width, height) 40 | Returns: 41 | self 42 | """ 43 | img_width = getattr(self, 'img_width', -1) 44 | img_height = getattr(self, 'img_height', -1) 45 | assert img_width > 0 and img_height > 0 46 | assert 'boxes' in self._data_dict 47 | boxes = self._data_dict['boxes'] 48 | new_width, new_height = size 49 | boxes[:, 0::2] *= (new_width / img_width) 50 | boxes[:, 1::2] *= (new_height / img_height) 51 | return self 52 | 53 | def __repr__(self): 54 | return self._data_dict.__repr__() 55 | -------------------------------------------------------------------------------- /ssd/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .misc import * 2 | -------------------------------------------------------------------------------- /ssd/utils/box_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import math 3 | 4 | 5 | def convert_locations_to_boxes(locations, priors, center_variance, 6 | size_variance): 7 | """Convert regressional location results of SSD into boxes in the form of (center_x, center_y, h, w). 8 | 9 | The conversion: 10 | $$predicted\_center * center_variance = \frac {real\_center - prior\_center} {prior\_hw}$$ 11 | $$exp(predicted\_hw * size_variance) = \frac {real\_hw} {prior\_hw}$$ 12 | We do it in the inverse direction here. 13 | Args: 14 | locations (batch_size, num_priors, 4): the regression output of SSD. It will contain the outputs as well. 15 | priors (num_priors, 4) or (batch_size/1, num_priors, 4): prior boxes. 16 | center_variance: a float used to change the scale of center. 17 | size_variance: a float used to change of scale of size. 18 | Returns: 19 | boxes: priors: [[center_x, center_y, w, h]]. All the values 20 | are relative to the image size. 21 | """ 22 | # priors can have one dimension less. 
23 | if priors.dim() + 1 == locations.dim():
24 | priors = priors.unsqueeze(0)
25 | return torch.cat([
26 | locations[..., :2] * center_variance * priors[..., 2:] + priors[..., :2],
27 | torch.exp(locations[..., 2:] * size_variance) * priors[..., 2:]
28 | ], dim=locations.dim() - 1)
29 |
30 |
31 | def convert_boxes_to_locations(center_form_boxes, center_form_priors, center_variance, size_variance):
32 | # priors can have one dimension less
33 | if center_form_priors.dim() + 1 == center_form_boxes.dim():
34 | center_form_priors = center_form_priors.unsqueeze(0)
35 | return torch.cat([
36 | (center_form_boxes[..., :2] - center_form_priors[..., :2]) / center_form_priors[..., 2:] / center_variance,
37 | torch.log(center_form_boxes[..., 2:] / center_form_priors[..., 2:]) / size_variance
38 | ], dim=center_form_boxes.dim() - 1)
39 |
40 |
41 | def area_of(left_top, right_bottom) -> torch.Tensor:
42 | """Compute the areas of rectangles given two corners.
43 |
44 | Args:
45 | left_top (N, 2): left top corner.
46 | right_bottom (N, 2): right bottom corner.
47 |
48 | Returns:
49 | area (N): return the area.
50 | """
51 | hw = torch.clamp(right_bottom - left_top, min=0.0)
52 | return hw[..., 0] * hw[..., 1]
53 |
54 |
55 | def iou_of(boxes0, boxes1, eps=1e-5):
56 | """Return intersection-over-union (Jaccard index) of boxes.
57 |
58 | Args:
59 | boxes0 (N, 4): ground truth boxes.
60 | boxes1 (N or 1, 4): predicted boxes.
61 | eps: a small number to avoid 0 as denominator.
62 | Returns:
63 | iou (N): IoU values.
64 | """
65 | overlap_left_top = torch.max(boxes0[..., :2], boxes1[..., :2])
66 | overlap_right_bottom = torch.min(boxes0[..., 2:], boxes1[..., 2:])
67 |
68 | overlap_area = area_of(overlap_left_top, overlap_right_bottom)
69 | area0 = area_of(boxes0[..., :2], boxes0[..., 2:])
70 | area1 = area_of(boxes1[..., :2], boxes1[..., 2:])
71 | return overlap_area / (area0 + area1 - overlap_area + eps)
72 |
73 |
74 | def assign_priors(gt_boxes, gt_labels, corner_form_priors,
75 | iou_threshold):
76 | """Assign ground truth boxes and targets to priors.
77 |
78 | Args:
79 | gt_boxes (num_targets, 4): ground truth boxes.
80 | gt_labels (num_targets): labels of targets.
81 | corner_form_priors (num_priors, 4): corner form priors.
82 | Returns:
83 | boxes (num_priors, 4): real values for priors.
84 | labels (num_priors): labels for priors.
85 | """
86 | # size: num_priors x num_targets
87 | ious = iou_of(gt_boxes.unsqueeze(0), corner_form_priors.unsqueeze(1))
88 | # size: num_priors
89 | best_target_per_prior, best_target_per_prior_index = ious.max(1)
90 | # size: num_targets
91 | best_prior_per_target, best_prior_per_target_index = ious.max(0)
92 |
93 | for target_index, prior_index in enumerate(best_prior_per_target_index):
94 | best_target_per_prior_index[prior_index] = target_index
95 | # 2.0 is used to make sure every target has a prior assigned
96 | best_target_per_prior.index_fill_(0, best_prior_per_target_index, 2)
97 | # size: num_priors
98 | labels = gt_labels[best_target_per_prior_index]
99 | labels[best_target_per_prior < iou_threshold] = 0  # the background id
100 | boxes = gt_boxes[best_target_per_prior_index]
101 | return boxes, labels
102 |
103 |
104 | def hard_negative_mining(loss, labels, neg_pos_ratio):
105 | """
106 | It is used to suppress the presence of a large number of negative predictions.
107 | It works on image level, not batch level.
108 | For any example/image, it keeps all the positive predictions and
109 | cuts the number of negative predictions to make sure the ratio
110 | between the negative examples and positive examples is no more
111 | than the given ratio for an image.
112 |
113 | Args:
114 | loss (N, num_priors): the loss for each example.
115 | labels (N, num_priors): the labels.
116 | neg_pos_ratio: the ratio between the negative examples and positive examples.
117 | """
118 | pos_mask = labels > 0
119 | num_pos = pos_mask.long().sum(dim=1, keepdim=True)
120 | num_neg = num_pos * neg_pos_ratio
121 |
122 | loss[pos_mask] = -math.inf
123 | _, indexes = loss.sort(dim=1, descending=True)  # sort losses per image
124 | _, orders = indexes.sort(dim=1)  # double argsort yields each prior's rank
125 | neg_mask = orders < num_neg
126 | return pos_mask | neg_mask
127 |
128 |
129 | def center_form_to_corner_form(locations):
130 | return torch.cat([locations[..., :2] - locations[..., 2:] / 2,
131 | locations[..., :2] + locations[..., 2:] / 2], locations.dim() - 1)
132 |
133 |
134 | def corner_form_to_center_form(boxes):
135 | return torch.cat([
136 | (boxes[..., :2] + boxes[..., 2:]) / 2,
137 | boxes[..., 2:] - boxes[..., :2]
138 | ], boxes.dim() - 1)
139 |
-------------------------------------------------------------------------------- /ssd/utils/checkpoint.py: --------------------------------------------------------------------------------
1 | import logging
2 | import os
3 |
4 | import torch
5 | from torch.nn.parallel import DistributedDataParallel
6 |
7 | from ssd.utils.model_zoo import cache_url
8 |
9 |
10 | class CheckPointer:
11 | _last_checkpoint_name = 'last_checkpoint.txt'
12 |
13 | def __init__(self,
14 | model,
15 | optimizer=None,
16 | scheduler=None,
17 | save_dir="",
18 | save_to_disk=None,
19 | logger=None):
20 | self.model = model
21 | self.optimizer = optimizer
22 | self.scheduler = scheduler
23 | self.save_dir = save_dir
24 | self.save_to_disk = save_to_disk
25 | if logger is None:
26 | logger = logging.getLogger(__name__)
27 | self.logger = logger
28 |
29 | def save(self, name, **kwargs):
30 | if not self.save_dir:
31 | return
32 |
33 | if not self.save_to_disk:
34 | return
35 |
36 | data = {}
37 | if isinstance(self.model, DistributedDataParallel):
38 | data['model'] = self.model.module.state_dict()
39 | else:
40 | data['model'] = self.model.state_dict()
41 | if self.optimizer is not None:
42 | data["optimizer"] = self.optimizer.state_dict()
43 | if self.scheduler is not None:
44 | data["scheduler"] = self.scheduler.state_dict()
45 | data.update(kwargs)
46 |
47 | save_file = os.path.join(self.save_dir, "{}.pth".format(name))
48 | self.logger.info("Saving checkpoint to {}".format(save_file))
49 | torch.save(data, save_file)
50 |
51 | self.tag_last_checkpoint(save_file)
52 |
53 | def load(self, f=None, use_latest=True):
54 | if self.has_checkpoint() and use_latest:
55 | # override argument with existing checkpoint
56 | f = self.get_checkpoint_file()
57 | if not f:
58 | # no checkpoint could be found
59 | self.logger.info("No checkpoint found.")
60 | return {}
61 |
62 | self.logger.info("Loading checkpoint from {}".format(f))
63 | checkpoint = self._load_file(f)
64 | model = self.model
65 | if isinstance(model, DistributedDataParallel):
66 | model = self.model.module
67 |
68 | model.load_state_dict(checkpoint.pop("model"))
69 | if "optimizer" in checkpoint and self.optimizer:
70 | self.logger.info("Loading optimizer from {}".format(f))
71 | self.optimizer.load_state_dict(checkpoint.pop("optimizer"))
72 | if "scheduler" in checkpoint and self.scheduler:
73 |
self.logger.info("Loading scheduler from {}".format(f)) 74 | self.scheduler.load_state_dict(checkpoint.pop("scheduler")) 75 | 76 | # return any further checkpoint data 77 | return checkpoint 78 | 79 | def get_checkpoint_file(self): 80 | save_file = os.path.join(self.save_dir, self._last_checkpoint_name) 81 | try: 82 | with open(save_file, "r") as f: 83 | last_saved = f.read() 84 | last_saved = last_saved.strip() 85 | except IOError: 86 | # if file doesn't exist, maybe because it has just been 87 | # deleted by a separate process 88 | last_saved = "" 89 | return last_saved 90 | 91 | def has_checkpoint(self): 92 | save_file = os.path.join(self.save_dir, self._last_checkpoint_name) 93 | return os.path.exists(save_file) 94 | 95 | def tag_last_checkpoint(self, last_filename): 96 | save_file = os.path.join(self.save_dir, self._last_checkpoint_name) 97 | with open(save_file, "w") as f: 98 | f.write(last_filename) 99 | 100 | def _load_file(self, f): 101 | # download url files 102 | if f.startswith("http"): 103 | # if the file is a url path, download it and cache it 104 | cached_f = cache_url(f) 105 | self.logger.info("url {} cached in {}".format(f, cached_f)) 106 | f = cached_f 107 | return torch.load(f, map_location=torch.device("cpu")) 108 | -------------------------------------------------------------------------------- /ssd/utils/dist_util.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | import torch 4 | import torch.distributed as dist 5 | 6 | 7 | def get_world_size(): 8 | if not dist.is_available(): 9 | return 1 10 | if not dist.is_initialized(): 11 | return 1 12 | return dist.get_world_size() 13 | 14 | 15 | def get_rank(): 16 | if not dist.is_available(): 17 | return 0 18 | if not dist.is_initialized(): 19 | return 0 20 | return dist.get_rank() 21 | 22 | 23 | def is_main_process(): 24 | return get_rank() == 0 25 | 26 | 27 | def synchronize(): 28 | """ 29 | Helper function to synchronize (barrier) among all processes when 30 | using distributed training 31 | """ 32 | if not dist.is_available(): 33 | return 34 | if not dist.is_initialized(): 35 | return 36 | world_size = dist.get_world_size() 37 | if world_size == 1: 38 | return 39 | dist.barrier() 40 | 41 | 42 | def _encode(encoded_data, data): 43 | # gets a byte representation for the data 44 | encoded_bytes = pickle.dumps(data) 45 | # convert this byte string into a byte tensor 46 | storage = torch.ByteStorage.from_buffer(encoded_bytes) 47 | tensor = torch.ByteTensor(storage).to("cuda") 48 | # encoding: first byte is the size and then rest is the data 49 | s = tensor.numel() 50 | assert s <= 255, "Can't encode data greater than 255 bytes" 51 | # put the encoded data in encoded_data 52 | encoded_data[0] = s 53 | encoded_data[1: (s + 1)] = tensor 54 | 55 | 56 | def all_gather(data): 57 | """ 58 | Run all_gather on arbitrary picklable data (not necessarily tensors) 59 | Args: 60 | data: any picklable object 61 | Returns: 62 | list[data]: list of data gathered from each rank 63 | """ 64 | world_size = get_world_size() 65 | if world_size == 1: 66 | return [data] 67 | 68 | # serialized to a Tensor 69 | buffer = pickle.dumps(data) 70 | storage = torch.ByteStorage.from_buffer(buffer) 71 | tensor = torch.ByteTensor(storage).to("cuda") 72 | 73 | # obtain Tensor size of each rank 74 | local_size = torch.LongTensor([tensor.numel()]).to("cuda") 75 | size_list = [torch.LongTensor([0]).to("cuda") for _ in range(world_size)] 76 | dist.all_gather(size_list, local_size) 77 | size_list = 
[int(size.item()) for size in size_list] 78 | max_size = max(size_list) 79 | 80 | # receiving Tensor from all ranks 81 | # we pad the tensor because torch all_gather does not support 82 | # gathering tensors of different shapes 83 | tensor_list = [] 84 | for _ in size_list: 85 | tensor_list.append(torch.ByteTensor(size=(max_size,)).to("cuda")) 86 | if local_size != max_size: 87 | padding = torch.ByteTensor(size=(max_size - local_size,)).to("cuda") 88 | tensor = torch.cat((tensor, padding), dim=0) 89 | dist.all_gather(tensor_list, tensor) 90 | 91 | data_list = [] 92 | for size, tensor in zip(size_list, tensor_list): 93 | buffer = tensor.cpu().numpy().tobytes()[:size] 94 | data_list.append(pickle.loads(buffer)) 95 | 96 | return data_list 97 | -------------------------------------------------------------------------------- /ssd/utils/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import sys 4 | 5 | 6 | def setup_logger(name, distributed_rank, save_dir=None): 7 | logger = logging.getLogger(name) 8 | logger.setLevel(logging.DEBUG) 9 | # don't log results for the non-master process 10 | if distributed_rank > 0: 11 | return logger 12 | stream_handler = logging.StreamHandler(stream=sys.stdout) 13 | stream_handler.setLevel(logging.DEBUG) 14 | formatter = logging.Formatter("%(asctime)s %(name)s %(levelname)s: %(message)s") 15 | stream_handler.setFormatter(formatter) 16 | logger.addHandler(stream_handler) 17 | if save_dir: 18 | fh = logging.FileHandler(os.path.join(save_dir, 'log.txt')) 19 | fh.setLevel(logging.DEBUG) 20 | fh.setFormatter(formatter) 21 | logger.addHandler(fh) 22 | return logger 23 | -------------------------------------------------------------------------------- /ssd/utils/metric_logger.py: -------------------------------------------------------------------------------- 1 | from collections import deque, defaultdict 2 | import numpy as np 3 | import torch 4 | 5 | 6 | class SmoothedValue: 7 | """Track a series of values and provide access to smoothed values over a 8 | window or the global series average. 
9 | """ 10 | 11 | def __init__(self, window_size=10): 12 | self.deque = deque(maxlen=window_size) 13 | self.value = np.nan 14 | self.series = [] 15 | self.total = 0.0 16 | self.count = 0 17 | 18 | def update(self, value): 19 | self.deque.append(value) 20 | self.series.append(value) 21 | self.count += 1 22 | self.total += value 23 | self.value = value 24 | 25 | @property 26 | def median(self): 27 | values = np.array(self.deque) 28 | return np.median(values) 29 | 30 | @property 31 | def avg(self): 32 | values = np.array(self.deque) 33 | return np.mean(values) 34 | 35 | @property 36 | def global_avg(self): 37 | return self.total / self.count 38 | 39 | 40 | class MetricLogger: 41 | def __init__(self, delimiter=", "): 42 | self.meters = defaultdict(SmoothedValue) 43 | self.delimiter = delimiter 44 | 45 | def update(self, **kwargs): 46 | for k, v in kwargs.items(): 47 | if isinstance(v, torch.Tensor): 48 | v = v.item() 49 | assert isinstance(v, (float, int)) 50 | self.meters[k].update(v) 51 | 52 | def __getattr__(self, attr): 53 | if attr in self.meters: 54 | return self.meters[attr] 55 | if attr in self.__dict__: 56 | return self.__dict__[attr] 57 | raise AttributeError("'{}' object has no attribute '{}'".format( 58 | type(self).__name__, attr)) 59 | 60 | def __str__(self): 61 | loss_str = [] 62 | for name, meter in self.meters.items(): 63 | loss_str.append( 64 | "{}: {:.3f} ({:.3f})".format(name, meter.avg, meter.global_avg) 65 | ) 66 | return self.delimiter.join(loss_str) 67 | -------------------------------------------------------------------------------- /ssd/utils/misc.py: -------------------------------------------------------------------------------- 1 | import errno 2 | import os 3 | 4 | 5 | def str2bool(s): 6 | return s.lower() in ('true', '1') 7 | 8 | 9 | def mkdir(path): 10 | try: 11 | os.makedirs(path) 12 | except OSError as e: 13 | if e.errno != errno.EEXIST: 14 | raise 15 | -------------------------------------------------------------------------------- /ssd/utils/model_zoo.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import os 3 | import sys 4 | 5 | import torch 6 | 7 | from ssd.utils.dist_util import is_main_process, synchronize 8 | 9 | from torch.hub import download_url_to_file 10 | from torch.hub import urlparse 11 | from torch.hub import HASH_REGEX 12 | 13 | 14 | # very similar to https://github.com/pytorch/pytorch/blob/master/torch/utils/model_zoo.py 15 | # but with a few improvements and modifications 16 | def cache_url(url, model_dir=None, progress=True): 17 | r"""Loads the Torch serialized object at the given URL. 18 | If the object is already present in `model_dir`, it's deserialized and 19 | returned. The filename part of the URL should follow the naming convention 20 | ``filename-.ext`` where ```` is the first eight or more 21 | digits of the SHA256 hash of the contents of the file. The hash is used to 22 | ensure unique names and to verify the contents of the file. 23 | The default value of `model_dir` is ``$TORCH_HOME/models`` where 24 | ``$TORCH_HOME`` defaults to ``~/.torch``. The default directory can be 25 | overridden with the ``$TORCH_MODEL_ZOO`` environment variable. 
26 | Args:
27 | url (string): URL of the object to download
28 | model_dir (string, optional): directory in which to save the object
29 | progress (bool, optional): whether or not to display a progress bar to stderr
30 | Example:
31 | >>> cached_file = ssd.utils.model_zoo.cache_url('https://s3.amazonaws.com/pytorch/models/resnet18-5c106cde.pth')
32 | """
33 | if model_dir is None:
34 | torch_home = os.path.expanduser(os.getenv("TORCH_HOME", "~/.torch"))
35 | model_dir = os.getenv("TORCH_MODEL_ZOO", os.path.join(torch_home, "models"))
36 | if not os.path.exists(model_dir):
37 | os.makedirs(model_dir)
38 | parts = urlparse(url)
39 | filename = os.path.basename(parts.path)
40 | if filename == "model_final.pkl":
41 | # workaround as pre-trained Caffe2 models from Detectron have all the same filename
42 | # so make the full path the filename by replacing / with _
43 | filename = parts.path.replace("/", "_")
44 | cached_file = os.path.join(model_dir, filename)
45 | if not os.path.exists(cached_file) and is_main_process():
46 | sys.stderr.write('Downloading: "{}" to {}\n'.format(url, cached_file))
47 | hash_prefix = HASH_REGEX.search(filename)
48 | if hash_prefix is not None:
49 | hash_prefix = hash_prefix.group(1)
50 | # workaround: Caffe2 models don't have a hash, but follow the R-50 convention,
51 | # which matches the hash PyTorch uses. So we skip the hash matching
52 | # if the hash_prefix is less than 6 characters
53 | if len(hash_prefix) < 6:
54 | hash_prefix = None
55 | download_url_to_file(url, cached_file, hash_prefix, progress=progress)
56 | synchronize()
57 | return cached_file
58 |
59 |
60 | def load_state_dict_from_url(url, map_location='cpu'):
61 | cached_file = cache_url(url)
62 | return torch.load(cached_file, map_location=map_location)
63 |
-------------------------------------------------------------------------------- /ssd/utils/nms.py: --------------------------------------------------------------------------------
1 | import sys
2 | import warnings
3 |
4 | import torch
5 | import torchvision
6 |
7 | try:  # a string comparison on torchvision.__version__ breaks for '0.10.0'+, so probe the op directly
8 | _nms = torchvision.ops.nms
9 | except AttributeError:
10 | warnings.warn('No NMS is available. Please upgrade torchvision to 0.3.0+')
11 | sys.exit(-1)
12 |
13 |
14 | def nms(boxes, scores, nms_thresh):
15 | """ Performs non-maximum suppression, run on GPU or CPU according to
16 | boxes's device.
17 | Args:
18 | boxes(Tensor[N, 4]): boxes in (x1, y1, x2, y2) format, use absolute coordinates (or relative coordinates)
19 | scores(Tensor[N]): scores
20 | nms_thresh(float): thresh
21 | Returns:
22 | indices kept.
23 | """
24 | keep = _nms(boxes, scores, nms_thresh)
25 | return keep
26 |
27 |
28 | def batched_nms(boxes, scores, idxs, iou_threshold):
29 | """
30 | Performs non-maximum suppression in a batched fashion.
31 |
32 | Each index value corresponds to a category, and NMS
33 | will not be applied between elements of different categories.
34 |
35 | Parameters
36 | ----------
37 | boxes : Tensor[N, 4]
38 | boxes where NMS will be performed. They
39 | are expected to be in (x1, y1, x2, y2) format
40 | scores : Tensor[N]
41 | scores for each one of the boxes
42 | idxs : Tensor[N]
43 | indices of the categories for each one of the boxes.
/ssd/utils/nms.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import warnings
3 | 
4 | import torch
5 | import torchvision
6 | 
7 | # feature check instead of a string comparison on __version__: lexicographic
8 | # ordering breaks for versions such as '0.10.0', which sorts before '0.3.0'
9 | if hasattr(torchvision, 'ops') and hasattr(torchvision.ops, 'nms'):
10 |     _nms = torchvision.ops.nms
11 | else:
12 |     warnings.warn('No NMS is available. Please upgrade torchvision to 0.3.0+')
13 |     sys.exit(-1)
14 | 
15 | 
16 | def nms(boxes, scores, nms_thresh):
17 |     """Performs non-maximum suppression, run on GPU or CPU according to
18 |     the device of ``boxes``.
19 |     Args:
20 |         boxes (Tensor[N, 4]): boxes in (x1, y1, x2, y2) format, in absolute
21 |             or relative coordinates
22 |         scores (Tensor[N]): scores for each box
23 |         nms_thresh (float): IoU threshold
24 |     Returns:
25 |         indices of the boxes that were kept
26 |     """
27 |     keep = _nms(boxes, scores, nms_thresh)
28 |     return keep
29 | 
30 | 
31 | def batched_nms(boxes, scores, idxs, iou_threshold):
32 |     """
33 |     Performs non-maximum suppression in a batched fashion.
34 | 
35 |     Each index value corresponds to a category, and NMS
36 |     will not be applied between elements of different categories.
37 | 
38 |     Parameters
39 |     ----------
40 |     boxes : Tensor[N, 4]
41 |         boxes where NMS will be performed. They
42 |         are expected to be in (x1, y1, x2, y2) format
43 |     scores : Tensor[N]
44 |         scores for each one of the boxes
45 |     idxs : Tensor[N]
46 |         indices of the categories for each one of the boxes
47 |     iou_threshold : float
48 |         discards all overlapping boxes
49 |         with IoU > iou_threshold
50 | 
51 |     Returns
52 |     -------
53 |     keep : Tensor
54 |         int64 tensor with the indices of
55 |         the elements that have been kept by NMS, sorted
56 |         in decreasing order of scores
57 |     """
58 |     if boxes.numel() == 0:
59 |         return torch.empty((0,), dtype=torch.int64, device=boxes.device)
60 |     # strategy: in order to perform NMS independently per class,
61 |     # we add an offset to all the boxes. The offset depends
62 |     # only on the class idx, and is large enough so that boxes
63 |     # from different classes do not overlap
64 |     max_coordinate = boxes.max()
65 |     offsets = idxs.to(boxes) * (max_coordinate + 1)
66 |     boxes_for_nms = boxes + offsets[:, None]
67 |     keep = nms(boxes_for_nms, scores, iou_threshold)
68 |     return keep
69 | 
--------------------------------------------------------------------------------
/ssd/utils/registry.py:
--------------------------------------------------------------------------------
1 | def _register_generic(module_dict, module_name, module):
2 |     assert module_name not in module_dict
3 |     module_dict[module_name] = module
4 | 
5 | 
6 | class Registry(dict):
7 |     """
8 |     A helper class for managing module registration; it extends a dictionary
9 |     and provides a register function.
10 |     Eg. creating a registry:
11 |         some_registry = Registry({"default": default_module})
12 |     There are two ways of registering new modules:
13 |     1): the normal way is just calling the register function:
14 |         def foo():
15 |             ...
16 |         some_registry.register("foo_module", foo)
17 |     2): used as a decorator when declaring the module:
18 |         @some_registry.register("foo_module")
19 |         @some_registry.register("foo_module_nickname")
20 |         def foo():
21 |             ...
22 |     Accessing a module is just like using a dictionary, eg:
23 |         f = some_registry["foo_module"]
24 |     """
25 | 
26 |     def __init__(self, *args, **kwargs):
27 |         super(Registry, self).__init__(*args, **kwargs)
28 | 
29 |     def register(self, module_name, module=None):
30 |         # used as a function call
31 |         if module is not None:
32 |             _register_generic(self, module_name, module)
33 |             return
34 | 
35 |         # used as a decorator
36 |         def register_fn(fn):
37 |             _register_generic(self, module_name, fn)
38 |             return fn
39 | 
40 |         return register_fn
41 | 
--------------------------------------------------------------------------------
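# Usage sketch for ssd/utils/registry.py above (hypothetical names; presumably
# the same mechanism ssd/modeling/registry.py builds on, where backbones
# register themselves and are later looked up by the name given in the config).
from ssd.utils.registry import Registry

BACKBONES = Registry()

@BACKBONES.register('tiny')
def build_tiny_backbone(cfg=None):
    return 'tiny backbone'

BACKBONES.register('tiny_alias', build_tiny_backbone)  # plain-call form
assert BACKBONES['tiny']() == 'tiny backbone'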
parser.add_argument("--local_rank", type=int, default=0) 38 | parser.add_argument( 39 | "--ckpt", 40 | help="The path to the checkpoint for test, default is the latest checkpoint.", 41 | default=None, 42 | type=str, 43 | ) 44 | 45 | parser.add_argument("--output_dir", default="eval_results", type=str, help="The directory to store evaluation results.") 46 | 47 | parser.add_argument( 48 | "opts", 49 | help="Modify config options using the command-line", 50 | default=None, 51 | nargs=argparse.REMAINDER, 52 | ) 53 | args = parser.parse_args() 54 | 55 | num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 56 | distributed = num_gpus > 1 57 | 58 | if torch.cuda.is_available(): 59 | # This flag allows you to enable the inbuilt cudnn auto-tuner to 60 | # find the best algorithm to use for your hardware. 61 | torch.backends.cudnn.benchmark = True 62 | if distributed: 63 | torch.cuda.set_device(args.local_rank) 64 | torch.distributed.init_process_group(backend="nccl", init_method="env://") 65 | synchronize() 66 | 67 | cfg.merge_from_file(args.config_file) 68 | cfg.merge_from_list(args.opts) 69 | cfg.freeze() 70 | 71 | logger = setup_logger("SSD", dist_util.get_rank(), cfg.OUTPUT_DIR) 72 | logger.info("Using {} GPUs".format(num_gpus)) 73 | logger.info(args) 74 | 75 | logger.info("Loaded configuration file {}".format(args.config_file)) 76 | with open(args.config_file, "r") as cf: 77 | config_str = "\n" + cf.read() 78 | logger.info(config_str) 79 | logger.info("Running with config:\n{}".format(cfg)) 80 | evaluation(cfg, ckpt=args.ckpt, distributed=distributed) 81 | 82 | 83 | if __name__ == '__main__': 84 | main() 85 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import os 4 | 5 | import torch 6 | import torch.distributed as dist 7 | 8 | from ssd.engine.inference import do_evaluation 9 | from ssd.config import cfg 10 | from ssd.data.build import make_data_loader 11 | from ssd.engine.trainer import do_train 12 | from ssd.modeling.detector import build_detection_model 13 | from ssd.solver.build import make_optimizer, make_lr_scheduler 14 | from ssd.utils import dist_util, mkdir 15 | from ssd.utils.checkpoint import CheckPointer 16 | from ssd.utils.dist_util import synchronize 17 | from ssd.utils.logger import setup_logger 18 | from ssd.utils.misc import str2bool 19 | 20 | 21 | def train(cfg, args): 22 | logger = logging.getLogger('SSD.trainer') 23 | model = build_detection_model(cfg) 24 | device = torch.device(cfg.MODEL.DEVICE) 25 | model.to(device) 26 | if args.distributed: 27 | model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank) 28 | 29 | lr = cfg.SOLVER.LR * args.num_gpus # scale by num gpus 30 | optimizer = make_optimizer(cfg, model, lr) 31 | 32 | milestones = [step // args.num_gpus for step in cfg.SOLVER.LR_STEPS] 33 | scheduler = make_lr_scheduler(cfg, optimizer, milestones) 34 | 35 | arguments = {"iteration": 0} 36 | save_to_disk = dist_util.get_rank() == 0 37 | checkpointer = CheckPointer(model, optimizer, scheduler, cfg.OUTPUT_DIR, save_to_disk, logger) 38 | extra_checkpoint_data = checkpointer.load() 39 | arguments.update(extra_checkpoint_data) 40 | 41 | max_iter = cfg.SOLVER.MAX_ITER // args.num_gpus 42 | train_loader = make_data_loader(cfg, is_train=True, distributed=args.distributed, max_iter=max_iter, 
/train.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import logging
3 | import os
4 | 
5 | import torch
6 | import torch.distributed as dist
7 | 
8 | from ssd.engine.inference import do_evaluation
9 | from ssd.config import cfg
10 | from ssd.data.build import make_data_loader
11 | from ssd.engine.trainer import do_train
12 | from ssd.modeling.detector import build_detection_model
13 | from ssd.solver.build import make_optimizer, make_lr_scheduler
14 | from ssd.utils import dist_util, mkdir
15 | from ssd.utils.checkpoint import CheckPointer
16 | from ssd.utils.dist_util import synchronize
17 | from ssd.utils.logger import setup_logger
18 | from ssd.utils.misc import str2bool
19 | 
20 | 
21 | def train(cfg, args):
22 |     logger = logging.getLogger('SSD.trainer')
23 |     model = build_detection_model(cfg)
24 |     device = torch.device(cfg.MODEL.DEVICE)
25 |     model.to(device)
26 |     if args.distributed:
27 |         model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank)
28 | 
29 |     lr = cfg.SOLVER.LR * args.num_gpus  # scale the learning rate by the number of GPUs
30 |     optimizer = make_optimizer(cfg, model, lr)
31 | 
32 |     milestones = [step // args.num_gpus for step in cfg.SOLVER.LR_STEPS]
33 |     scheduler = make_lr_scheduler(cfg, optimizer, milestones)
34 | 
35 |     arguments = {"iteration": 0}
36 |     save_to_disk = dist_util.get_rank() == 0
37 |     checkpointer = CheckPointer(model, optimizer, scheduler, cfg.OUTPUT_DIR, save_to_disk, logger)
38 |     extra_checkpoint_data = checkpointer.load()
39 |     arguments.update(extra_checkpoint_data)
40 | 
41 |     max_iter = cfg.SOLVER.MAX_ITER // args.num_gpus
42 |     train_loader = make_data_loader(cfg, is_train=True, distributed=args.distributed, max_iter=max_iter,
43 |                                     start_iter=arguments['iteration'])
44 | 
45 |     model = do_train(cfg, model, train_loader, optimizer, scheduler, checkpointer, device, arguments, args)
46 |     return model
47 | 
48 | 
49 | def main():
50 |     parser = argparse.ArgumentParser(description='Single Shot MultiBox Detector Training With PyTorch')
51 |     parser.add_argument(
52 |         "--config-file",
53 |         default="",
54 |         metavar="FILE",
55 |         help="path to config file",
56 |         type=str,
57 |     )
58 |     parser.add_argument("--local_rank", type=int, default=0)
59 |     parser.add_argument('--log_step', default=10, type=int, help='Print logs every log_step iterations')
60 |     parser.add_argument('--save_step', default=2500, type=int, help='Save a checkpoint every save_step iterations')
61 |     parser.add_argument('--eval_step', default=2500, type=int, help='Evaluate every eval_step iterations; disabled when eval_step < 0')
62 |     parser.add_argument('--use_tensorboard', default=True, type=str2bool)
63 |     parser.add_argument(
64 |         "--skip-test",
65 |         dest="skip_test",
66 |         help="Do not test the final model",
67 |         action="store_true",
68 |     )
69 |     parser.add_argument(
70 |         "opts",
71 |         help="Modify config options using the command-line",
72 |         default=None,
73 |         nargs=argparse.REMAINDER,
74 |     )
75 |     args = parser.parse_args()
76 |     num_gpus = int(os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1
77 |     args.distributed = num_gpus > 1
78 |     args.num_gpus = num_gpus
79 | 
80 |     if torch.cuda.is_available():
81 |         # This flag allows you to enable the inbuilt cudnn auto-tuner to
82 |         # find the best algorithm to use for your hardware.
83 |         torch.backends.cudnn.benchmark = True
84 |     if args.distributed:
85 |         torch.cuda.set_device(args.local_rank)
86 |         torch.distributed.init_process_group(backend="nccl", init_method="env://")
87 |         synchronize()
88 | 
89 |     cfg.merge_from_file(args.config_file)
90 |     cfg.merge_from_list(args.opts)
91 |     cfg.freeze()
92 | 
93 |     if cfg.OUTPUT_DIR:
94 |         mkdir(cfg.OUTPUT_DIR)
95 | 
96 |     logger = setup_logger("SSD", dist_util.get_rank(), cfg.OUTPUT_DIR)
97 |     logger.info("Using {} GPUs".format(num_gpus))
98 |     logger.info(args)
99 | 
100 |     logger.info("Loaded configuration file {}".format(args.config_file))
101 |     with open(args.config_file, "r") as cf:
102 |         config_str = "\n" + cf.read()
103 |         logger.info(config_str)
104 |     logger.info("Running with config:\n{}".format(cfg))
105 | 
106 |     model = train(cfg, args)
107 | 
108 |     if not args.skip_test:
109 |         logger.info('Start evaluating...')
110 |         torch.cuda.empty_cache()  # free cached GPU memory before evaluating the trained model
111 |         do_evaluation(cfg, model, distributed=args.distributed)
112 | 
113 | 
114 | if __name__ == '__main__':
115 |     main()
--------------------------------------------------------------------------------
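# Worked sketch of the multi-GPU scaling in train() above (hypothetical solver
# values): the learning rate is multiplied by the GPU count while the
# iteration-based schedule is divided by it, so the total number of images
# processed stays roughly constant.
num_gpus = 4
base_lr, max_iter, lr_steps = 1e-3, 120000, [80000, 100000]
lr = base_lr * num_gpus                         # 0.004
milestones = [s // num_gpus for s in lr_steps]  # [20000, 25000]
max_iter = max_iter // num_gpus                 # 30000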