├── .gitignore ├── .idea ├── .gitignore ├── SSD.Pytorch.iml ├── inspectionProfiles │ └── profiles_settings.xml ├── misc.xml ├── modules.xml └── vcs.xml ├── LICENSE ├── README.md ├── __pycache__ └── ssd.cpython-37.pyc ├── data ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-37.pyc │ ├── coco.cpython-37.pyc │ ├── config.cpython-37.pyc │ └── voc0712.cpython-37.pyc ├── coco.py ├── coco_labels.txt ├── config.py ├── scripts │ ├── COCO2014.sh │ ├── VOC2007.sh │ └── VOC2012.sh ├── vgg_truncated.txt └── voc0712.py ├── demo.py ├── demo ├── .ipynb_checkpoints │ └── demo-checkpoint.ipynb ├── __init__.py ├── demo.ipynb └── live.py ├── efficientnet_pytorch ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-37.pyc │ ├── fire_smoke_model.cpython-37.pyc │ ├── model.cpython-37.pyc │ └── utils.cpython-37.pyc ├── fire_smoke_model.py ├── model.py └── utils.py ├── eval.py ├── img ├── SSDplate.jpeg ├── Screenshot from 2020-02-15 20-09-16.png ├── image-20200215220618684.png ├── loss.png ├── map_epoch.png └── resut.jpg ├── layers ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-37.pyc │ └── box_utils.cpython-37.pyc ├── box_utils.py ├── functions │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ ├── detection.cpython-37.pyc │ │ └── prior_box.cpython-37.pyc │ ├── detection.py │ └── prior_box.py └── modules │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-37.pyc │ ├── l2norm.cpython-37.pyc │ └── multibox_loss.cpython-37.pyc │ ├── l2norm.py │ └── multibox_loss.py ├── ssd.py ├── test.py ├── test └── COCO_train2014_000000000659.jpg ├── train.py └── utils ├── __init__.py ├── __pycache__ ├── __init__.cpython-37.pyc └── augmentations.cpython-37.pyc └── augmentations.py /.gitignore: -------------------------------------------------------------------------------- 1 | weights/*.pth 2 | venv/ 3 | ssd300_120000/ 4 | data/VOCdevkit/ 5 | loss.pkl 6 | vgg16weights.py 7 | efficientnetb4_truncated.py 8 | vgg16weights.py 9 | result245.jpg 10 | demo/ 11 | __pycache__/ 12 | loss_visual.py 13 | map_visual.py 14 | result1k.txt 15 | weights/Final_M2Det_VOC_size320_netvgg16.pth 16 | weights/vgg16_reducedfc.pth 17 | weights/efficientnet_b4_truncated.pth 18 | weights/efficientnet-b4-6ed6700e.pth 19 | -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /workspace.xml 3 | -------------------------------------------------------------------------------- /.idea/SSD.Pytorch.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 midasklr 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # SSD.Pytorch 4 | 5 | PyTorch implementation of [SSD (Single Shot MultiBox Detector)](https://arxiv.org/abs/1512.02325). 6 | 7 | This repository is heavily based on the implementation [ssd.pytorch](https://github.com/amdegroot/ssd.pytorch). Since the original code is too old to run on recent versions of PyTorch, I made some changes, fixed some bugs, and added SSD512 code. 8 | 9 | ## Environment 10 | 11 | Python 3.7 (other Python 3 versions should also work) 12 | 13 | PyTorch 1.3 14 | 15 | OpenCV 16 | 17 | ## Dataset 18 | 19 | Currently I have only trained on the Pascal VOC dataset and my own plate dataset. 20 | 21 | You can build your own dataset in VOC format and train your own SSD model. 22 | 23 | Datasets go under ./data; you should change the path accordingly in voc0712.py. 
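If you want to sanity-check that a VOC-format dataset is parsed correctly before launching a full training run, a minimal sketch could look like the one below (the ./data/VOCdevkit path, the ('2007', 'trainval') split and the 300 px input size are only example assumptions; adjust them to your setup):

```python
# Minimal sanity check of the VOC-format data pipeline; run from the repo root.
# The dataset path, split and input size below are example assumptions.
from torch.utils.data import DataLoader

from data import (VOCDetection, VOCAnnotationTransform, BaseTransform,
                  MEANS, detection_collate)

dataset = VOCDetection(root='./data/VOCdevkit/',
                       image_sets=[('2007', 'trainval')],
                       transform=BaseTransform(300, MEANS),
                       target_transform=VOCAnnotationTransform())

loader = DataLoader(dataset, batch_size=4, shuffle=True,
                    collate_fn=detection_collate)

images, targets = next(iter(loader))
print(images.shape)      # torch.Size([4, 3, 300, 300])
print(targets[0].shape)  # (num_objects, 5): normalized xmin, ymin, xmax, ymax, label index
```

BaseTransform only resizes the image and subtracts the channel means, which is enough to verify that paths and labels are read correctly; the training script applies its own augmentation pipeline (see utils/augmentations.py).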
24 | 25 | ## Train 26 | 27 | First download the fc-reduced [VGG-16](https://arxiv.org/abs/1409.1556) PyTorch base network weights at: https://s3.amazonaws.com/amdegroot-models/vgg16_reducedfc.pth 28 | 29 | By default, we assume you have downloaded the file into the ./weights dir: 30 | 31 | ```shell 32 | mkdir weights 33 | cd weights 34 | wget https://s3.amazonaws.com/amdegroot-models/vgg16_reducedfc.pth 35 | ``` 36 | 37 | To train on VOC or your own dataset, simply run: 38 | 39 | ```shell 40 | CUDA_VISIBLE_DEVICES=0 python train.py --input 512 --dataset_root ./data/VOCdevkit --num_class 21 --num_epoch 300 --lr 0.001 --batch_size 16 41 | ``` 42 | 43 | or you can resume training from a checkpoint under the ./weights/ dir: 44 | 45 | ```shell 46 | CUDA_VISIBLE_DEVICES=0 python train.py --input 512 --dataset_root ./data/VOCdevkit --num_class 21 --num_epoch 300 --lr 0.001 --batch_size 16 --resume ./weights/ssd512_VOC_12000.pth 47 | ``` 48 | 49 | ## Evaluation 50 | 51 | Use eval.py to evaluate your model: 52 | 53 | ``` 54 | python eval.py --input 512 --trained_model weights/ssd512_VOC_73000_mAP79.80.pth 55 | ``` 56 | 57 | and you will get results like the following: 58 | 59 | AP for aeroplane = 0.8861 60 | AP for bicycle = 0.8694 61 | AP for bird = 0.8078 62 | AP for boat = 0.7698 63 | AP for bottle = 0.6407 64 | AP for bus = 0.8625 65 | AP for car = 0.8825 66 | AP for cat = 0.8671 67 | AP for chair = 0.6424 68 | AP for cow = 0.8712 69 | AP for diningtable = 0.6781 70 | AP for dog = 0.8572 71 | AP for horse = 0.8781 72 | AP for motorbike = 0.8531 73 | AP for person = 0.8091 74 | AP for pottedplant = 0.5479 75 | AP for sheep = 0.8327 76 | AP for sofa = 0.7562 77 | AP for train = 0.8654 78 | AP for tvmonitor = 0.7824 79 | Mean AP = 0.7980 80 | 81 | ## Demo 82 | 83 | You can test a single image with demo.py; just edit the image_path, weight_path and model_input variables at the top of demo.py. 84 | 85 | ![](./img/resut.jpg) 86 | 87 | 88 | 89 | ## Results 90 | 91 | VOC2007 test mAP (IoU threshold 0.5): 92 | 93 | | model | paper | this implementation | 94 | | ------ | ----- | ------------------- | 95 | | SSD300 | 77.2 | 77.43 | 96 | | SSD512 | 79.8 | 79.80 | 97 | 98 | SSD300 and SSD512 model weights trained on VOC: https://pan.baidu.com/s/1DxlkOQzkFkkdYdNYsDx_MQ (extraction code: dd7m) 99 | ![](./img/map_epoch.png) 100 | 101 | ![](./img/loss.png) 102 | 103 | 104 | ## Train with a Custom Dataset 105 | 106 | I trained a plate detector with SSD and it works pretty well, though inference is a bit slow. 107 | ![avatar](./img/SSDplate.jpeg) 108 | 109 | To train on your own dataset: 110 | 111 | **1) Make your dataset in VOC format and put it in the ./data/ folder. The dataset path can be arranged as follows:** 112 | 113 | ![image-20200215220618684](img/image-20200215220618684.png) 114 | 115 | 116 | 117 | The JPEGImages folder holds all your images, Annotations holds all your xml labels, and you create your own trainval.txt and test.txt under ImageSets/Main, just following the VOC format. Above is my own dataset CityDet. 
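In case the screenshot is hard to read, this is the layout that the loader in ./data/voc0712.py expects (CityDet is only an example name; the inner folder must be named "VOC" plus the year you list in image_sets):

```
data/CityDet/
└── VOC2007/                  # "VOC" + year, matching image_sets=[('2007', 'trainval')]
    ├── Annotations/          # one VOC-style .xml file per image
    ├── ImageSets/
    │   └── Main/
    │       ├── trainval.txt  # image ids, one per line, without file extension
    │       └── test.txt
    └── JPEGImages/           # all images as .jpg
```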
118 | 119 | **2) Change the dataset parser code ./data/voc0712.py:** 120 | 121 | Change: 122 | 123 | ```python 124 | VOC_CLASSES = ( # always index 0 125 | 'your', 'own', 'class', 'names') 126 | ``` 127 | 128 | Change: 129 | 130 | ```python 131 | VOC_ROOT = osp.join('./', "data/VOCdevkit/") 132 | ``` 133 | 134 | to your own dataset directory: 135 | 136 | ```python 137 | VOC_ROOT = osp.join('./', "data/CityDet/") 138 | ``` 139 | 140 | Change: 141 | 142 | ```python 143 | image_sets=[('2007', 'trainval'), ('2012', 'trainval')] 144 | ``` 145 | 146 | to: 147 | 148 | ```python 149 | image_sets=[('2007', 'trainval')] 150 | ``` 151 | 152 | **3) Train with VGG pretrained weights** 153 | 154 | Download the fc-reduced [VGG-16](https://arxiv.org/abs/1409.1556) PyTorch base network weights at: https://s3.amazonaws.com/amdegroot-models/vgg16_reducedfc.pth 155 | 156 | By default, we assume you have downloaded the file into the ./weights dir: 157 | 158 | ```shell 159 | mkdir weights 160 | cd weights 161 | wget https://s3.amazonaws.com/amdegroot-models/vgg16_reducedfc.pth 162 | ``` 163 | 164 | Run (note that --num_class includes the background class, e.g. 21 for the 20 VOC classes): 165 | 166 | ```shell 167 | CUDA_VISIBLE_DEVICES=0 python train.py --input 512 --dataset_root ${your dataset dir} --num_class ${your number of classes} --num_epoch 300 --lr 0.001 --batch_size 16 168 | ``` 169 | 170 | For my CityDet dataset: 171 | 172 | ```shell 173 | CUDA_VISIBLE_DEVICES=0 python train.py --input 512 --dataset_root ./data/CityDet/ --num_class 22 --num_epoch 300 --lr 0.001 --batch_size 16 174 | ``` 175 | 176 | and training of your own SSD detector will start: 177 | 178 | ![sR](./img/Screenshot%20from%202020-02-15%2020-09-16.png) 179 | -------------------------------------------------------------------------------- /__pycache__/ssd.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/midasklr/SSD.Pytorch/33ec443c0a7f3facbaa0643f4c04d4c3dda3cf53/__pycache__/ssd.cpython-37.pyc -------------------------------------------------------------------------------- /data/__init__.py: -------------------------------------------------------------------------------- 1 | from .voc0712 import VOCDetection, VOCAnnotationTransform, VOC_CLASSES, VOC_ROOT 2 | 3 | from .coco import COCODetection, COCOAnnotationTransform, COCO_CLASSES, COCO_ROOT, get_label_map 4 | from .config import * 5 | import torch 6 | import cv2 7 | import numpy as np 8 | 9 | def detection_collate(batch): 10 | """Custom collate fn for dealing with batches of images that have a different 11 | number of associated object annotations (bounding boxes). 
12 | 13 | Arguments: 14 | batch: (tuple) A tuple of tensor images and lists of annotations 15 | 16 | Return: 17 | A tuple containing: 18 | 1) (tensor) batch of images stacked on their 0 dim 19 | 2) (list of tensors) annotations for a given image are stacked on 20 | 0 dim 21 | """ 22 | targets = [] 23 | imgs = [] 24 | for sample in batch: 25 | imgs.append(sample[0]) 26 | targets.append(torch.FloatTensor(sample[1])) 27 | return torch.stack(imgs, 0), targets 28 | 29 | 30 | def base_transform(image, size, mean): 31 | x = cv2.resize(image, (size, size)).astype(np.float32) 32 | x -= mean 33 | x = x.astype(np.float32) 34 | return x 35 | 36 | 37 | class BaseTransform: 38 | def __init__(self, size, mean): 39 | self.size = size 40 | self.mean = np.array(mean, dtype=np.float32) 41 | 42 | def __call__(self, image, boxes=None, labels=None): 43 | return base_transform(image, self.size, self.mean), boxes, labels 44 | -------------------------------------------------------------------------------- /data/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/midasklr/SSD.Pytorch/33ec443c0a7f3facbaa0643f4c04d4c3dda3cf53/data/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /data/__pycache__/coco.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/midasklr/SSD.Pytorch/33ec443c0a7f3facbaa0643f4c04d4c3dda3cf53/data/__pycache__/coco.cpython-37.pyc -------------------------------------------------------------------------------- /data/__pycache__/config.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/midasklr/SSD.Pytorch/33ec443c0a7f3facbaa0643f4c04d4c3dda3cf53/data/__pycache__/config.cpython-37.pyc -------------------------------------------------------------------------------- /data/__pycache__/voc0712.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/midasklr/SSD.Pytorch/33ec443c0a7f3facbaa0643f4c04d4c3dda3cf53/data/__pycache__/voc0712.cpython-37.pyc -------------------------------------------------------------------------------- /data/coco.py: -------------------------------------------------------------------------------- 1 | from .config import HOME 2 | import os 3 | import os.path as osp 4 | import sys 5 | import torch 6 | import torch.utils.data as data 7 | import torchvision.transforms as transforms 8 | import cv2 9 | import numpy as np 10 | 11 | COCO_ROOT = osp.join('./', 'data/') 12 | IMAGES = 'images' 13 | ANNOTATIONS = 'annotations' 14 | COCO_API = 'PythonAPI' 15 | INSTANCES_SET = 'instances_{}.json' 16 | COCO_CLASSES = ('person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 17 | 'train', 'truck', 'boat', 'traffic light', 'fire', 'hydrant', 18 | 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 19 | 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 20 | 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 21 | 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 22 | 'kite', 'baseball bat', 'baseball glove', 'skateboard', 23 | 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 24 | 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 25 | 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 26 | 'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 
27 | 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 28 | 'keyboard', 'cell phone', 'microwave oven', 'toaster', 'sink', 29 | 'refrigerator', 'book', 'clock', 'vase', 'scissors', 30 | 'teddy bear', 'hair drier', 'toothbrush') 31 | 32 | 33 | def get_label_map(label_file): 34 | label_map = {} 35 | labels = open(label_file, 'r') 36 | for line in labels: 37 | ids = line.split(',') 38 | label_map[int(ids[0])] = int(ids[1]) 39 | return label_map 40 | 41 | 42 | class COCOAnnotationTransform(object): 43 | """Transforms a COCO annotation into a Tensor of bbox coords and label index 44 | Initilized with a dictionary lookup of classnames to indexes 45 | """ 46 | def __init__(self): 47 | self.label_map = get_label_map(osp.join(COCO_ROOT, 'coco_labels.txt')) 48 | 49 | def __call__(self, target, width, height): 50 | """ 51 | Args: 52 | target (dict): COCO target json annotation as a python dict 53 | height (int): height 54 | width (int): width 55 | Returns: 56 | a list containing lists of bounding boxes [bbox coords, class idx] 57 | """ 58 | scale = np.array([width, height, width, height]) 59 | res = [] 60 | for obj in target: 61 | if 'bbox' in obj: 62 | bbox = obj['bbox'] 63 | bbox[2] += bbox[0] 64 | bbox[3] += bbox[1] 65 | label_idx = self.label_map[obj['category_id']] - 1 66 | final_box = list(np.array(bbox)/scale) 67 | final_box.append(label_idx) 68 | res += [final_box] # [xmin, ymin, xmax, ymax, label_idx] 69 | else: 70 | print("no bbox problem!") 71 | 72 | return res # [[xmin, ymin, xmax, ymax, label_idx], ... ] 73 | 74 | 75 | class COCODetection(data.Dataset): 76 | """`MS Coco Detection `_ Dataset. 77 | Args: 78 | root (string): Root directory where images are downloaded to. 79 | set_name (string): Name of the specific set of COCO images. 80 | transform (callable, optional): A function/transform that augments the 81 | raw images` 82 | target_transform (callable, optional): A function/transform that takes 83 | in the target (bbox) and transforms it. 84 | """ 85 | 86 | def __init__(self, root, image_set='trainval35k', transform=None, 87 | target_transform=COCOAnnotationTransform(), dataset_name='MS COCO'): 88 | sys.path.append(osp.join(root, COCO_API)) 89 | from pycocotools.coco import COCO 90 | self.root = osp.join(root, IMAGES, image_set) 91 | self.coco = COCO(osp.join(root, ANNOTATIONS, 92 | INSTANCES_SET.format(image_set))) 93 | self.ids = list(self.coco.imgToAnns.keys()) 94 | self.transform = transform 95 | self.target_transform = target_transform 96 | self.name = dataset_name 97 | 98 | def __getitem__(self, index): 99 | """ 100 | Args: 101 | index (int): Index 102 | Returns: 103 | tuple: Tuple (image, target). 104 | target is the object returned by ``coco.loadAnns``. 105 | """ 106 | im, gt, h, w = self.pull_item(index) 107 | return im, gt 108 | 109 | def __len__(self): 110 | return len(self.ids) 111 | 112 | def pull_item(self, index): 113 | """ 114 | Args: 115 | index (int): Index 116 | Returns: 117 | tuple: Tuple (image, target, height, width). 118 | target is the object returned by ``coco.loadAnns``. 
119 | """ 120 | img_id = self.ids[index] 121 | target = self.coco.imgToAnns[img_id] 122 | ann_ids = self.coco.getAnnIds(imgIds=img_id) 123 | 124 | target = self.coco.loadAnns(ann_ids) 125 | path = osp.join(self.root, self.coco.loadImgs(img_id)[0]['file_name']) 126 | assert osp.exists(path), 'Image path does not exist: {}'.format(path) 127 | img = cv2.imread(osp.join(self.root, path)) 128 | height, width, _ = img.shape 129 | if self.target_transform is not None: 130 | target = self.target_transform(target, width, height) 131 | if self.transform is not None: 132 | target = np.array(target) 133 | img, boxes, labels = self.transform(img, target[:, :4], 134 | target[:, 4]) 135 | # to rgb 136 | img = img[:, :, (2, 1, 0)] 137 | 138 | target = np.hstack((boxes, np.expand_dims(labels, axis=1))) 139 | return torch.from_numpy(img).permute(2, 0, 1), target, height, width 140 | 141 | def pull_image(self, index): 142 | '''Returns the original image object at index in PIL form 143 | 144 | Note: not using self.__getitem__(), as any transformations passed in 145 | could mess up this functionality. 146 | 147 | Argument: 148 | index (int): index of img to show 149 | Return: 150 | cv2 img 151 | ''' 152 | img_id = self.ids[index] 153 | path = self.coco.loadImgs(img_id)[0]['file_name'] 154 | return cv2.imread(osp.join(self.root, path), cv2.IMREAD_COLOR) 155 | 156 | def pull_anno(self, index): 157 | '''Returns the original annotation of image at index 158 | 159 | Note: not using self.__getitem__(), as any transformations passed in 160 | could mess up this functionality. 161 | 162 | Argument: 163 | index (int): index of img to get annotation of 164 | Return: 165 | list: [img_id, [(label, bbox coords),...]] 166 | eg: ('001718', [('dog', (96, 13, 438, 332))]) 167 | ''' 168 | img_id = self.ids[index] 169 | ann_ids = self.coco.getAnnIds(imgIds=img_id) 170 | return self.coco.loadAnns(ann_ids) 171 | 172 | def __repr__(self): 173 | fmt_str = 'Dataset ' + self.__class__.__name__ + '\n' 174 | fmt_str += ' Number of datapoints: {}\n'.format(self.__len__()) 175 | fmt_str += ' Root Location: {}\n'.format(self.root) 176 | tmp = ' Transforms (if any): ' 177 | fmt_str += '{0}{1}\n'.format(tmp, self.transform.__repr__().replace('\n', '\n' + ' ' * len(tmp))) 178 | tmp = ' Target Transforms (if any): ' 179 | fmt_str += '{0}{1}'.format(tmp, self.target_transform.__repr__().replace('\n', '\n' + ' ' * len(tmp))) 180 | return fmt_str 181 | -------------------------------------------------------------------------------- /data/coco_labels.txt: -------------------------------------------------------------------------------- 1 | 1,1,person 2 | 2,2,bicycle 3 | 3,3,car 4 | 4,4,motorcycle 5 | 5,5,airplane 6 | 6,6,bus 7 | 7,7,train 8 | 8,8,truck 9 | 9,9,boat 10 | 10,10,traffic light 11 | 11,11,fire hydrant 12 | 13,12,stop sign 13 | 14,13,parking meter 14 | 15,14,bench 15 | 16,15,bird 16 | 17,16,cat 17 | 18,17,dog 18 | 19,18,horse 19 | 20,19,sheep 20 | 21,20,cow 21 | 22,21,elephant 22 | 23,22,bear 23 | 24,23,zebra 24 | 25,24,giraffe 25 | 27,25,backpack 26 | 28,26,umbrella 27 | 31,27,handbag 28 | 32,28,tie 29 | 33,29,suitcase 30 | 34,30,frisbee 31 | 35,31,skis 32 | 36,32,snowboard 33 | 37,33,sports ball 34 | 38,34,kite 35 | 39,35,baseball bat 36 | 40,36,baseball glove 37 | 41,37,skateboard 38 | 42,38,surfboard 39 | 43,39,tennis racket 40 | 44,40,bottle 41 | 46,41,wine glass 42 | 47,42,cup 43 | 48,43,fork 44 | 49,44,knife 45 | 50,45,spoon 46 | 51,46,bowl 47 | 52,47,banana 48 | 53,48,apple 49 | 54,49,sandwich 50 | 55,50,orange 51 | 56,51,broccoli 
52 | 57,52,carrot 53 | 58,53,hot dog 54 | 59,54,pizza 55 | 60,55,donut 56 | 61,56,cake 57 | 62,57,chair 58 | 63,58,couch 59 | 64,59,potted plant 60 | 65,60,bed 61 | 67,61,dining table 62 | 70,62,toilet 63 | 72,63,tv 64 | 73,64,laptop 65 | 74,65,mouse 66 | 75,66,remote 67 | 76,67,keyboard 68 | 77,68,cell phone 69 | 78,69,microwave 70 | 79,70,oven 71 | 80,71,toaster 72 | 81,72,sink 73 | 82,73,refrigerator 74 | 84,74,book 75 | 85,75,clock 76 | 86,76,vase 77 | 87,77,scissors 78 | 88,78,teddy bear 79 | 89,79,hair drier 80 | 90,80,toothbrush 81 | -------------------------------------------------------------------------------- /data/config.py: -------------------------------------------------------------------------------- 1 | # config.py 2 | import os.path 3 | 4 | # gets home dir cross platform 5 | HOME = os.path.expanduser("~") 6 | 7 | # for making bounding boxes pretty 8 | COLORS = ((255, 0, 0, 128), (0, 255, 0, 128), (0, 0, 255, 128), 9 | (0, 255, 255, 128), (255, 0, 255, 128), (255, 255, 0, 128)) 10 | 11 | MEANS = (104, 117, 123) 12 | 13 | # SSD300 CONFIGS 14 | voc = { 15 | 'SSD300':{ 16 | 'num_classes': 21, 17 | 'lr_steps': (100, 180, 250), 18 | 'max_iter': 520000, 19 | 'feature_maps': [38, 19, 10, 5, 3, 1], 20 | 'min_dim': 300, 21 | 'steps': [8, 16, 32, 64, 100, 300], 22 | 'min_sizes': [30, 60, 111, 162, 213, 264], 23 | 'max_sizes': [60, 111, 162, 213, 264, 315], 24 | 'aspect_ratios': [[2], [2, 3], [2, 3], [2, 3], [2], [2]], 25 | 'variance': [0.1, 0.2], 26 | 'clip': True, 27 | 'name': 'VOC' 28 | }, 29 | 'SSD512':{ 30 | 'num_classes': 21, 31 | 'lr_steps': (100, 200, 300), 32 | 'max_iter': 120000, 33 | 'feature_maps': [64, 32, 16, 8, 4, 2, 1], 34 | 'min_dim': 512, 35 | 'steps': [8, 16, 32, 64, 100, 300, 512], 36 | 'min_sizes': [30, 60, 111, 162, 213, 264, 315], 37 | 'max_sizes': [60, 111, 162, 213, 264, 315, 366], 38 | 'aspect_ratios': [[2], [2, 3], [2, 3], [2, 3], [2], [2],[2]], 39 | 'variance': [0.1, 0.2], 40 | 'clip': True, 41 | 'name': 'VOC', 42 | } 43 | 44 | } 45 | 46 | coco = { 47 | 'num_classes': 201, 48 | 'lr_steps': (280000, 360000, 400000), 49 | 'max_iter': 400000, 50 | 'feature_maps': [38, 19, 10, 5, 3, 1], 51 | 'min_dim': 300, 52 | 'steps': [8, 16, 32, 64, 100, 300], 53 | 'min_sizes': [21, 45, 99, 153, 207, 261], 54 | 'max_sizes': [45, 99, 153, 207, 261, 315], 55 | 'aspect_ratios': [[2], [2, 3], [2, 3], [2, 3], [2], [2]], 56 | 'variance': [0.1, 0.2], 57 | 'clip': True, 58 | 'name': 'COCO', 59 | } 60 | -------------------------------------------------------------------------------- /data/scripts/COCO2014.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | start=`date +%s` 4 | 5 | # handle optional download dir 6 | if [ -z "$1" ] 7 | then 8 | # navigate to ~/data 9 | echo "navigating to ~/data/ ..." 10 | mkdir -p ~/data 11 | cd ~/data/ 12 | mkdir -p ./coco 13 | cd ./coco 14 | mkdir -p ./images 15 | mkdir -p ./annotations 16 | else 17 | # check if specified dir is valid 18 | if [ ! -d $1 ]; then 19 | echo $1 " is not a valid directory" 20 | exit 0 21 | fi 22 | echo "navigating to " $1 " ..." 23 | cd $1 24 | fi 25 | 26 | if [ ! -d images ] 27 | then 28 | mkdir -p ./images 29 | fi 30 | 31 | # Download the image data. 32 | cd ./images 33 | echo "Downloading MSCOCO train images ..." 34 | curl -LO http://images.cocodataset.org/zips/train2014.zip 35 | echo "Downloading MSCOCO val images ..." 36 | curl -LO http://images.cocodataset.org/zips/val2014.zip 37 | 38 | cd ../ 39 | if [ ! 
-d annotations] 40 | then 41 | mkdir -p ./annotations 42 | fi 43 | 44 | # Download the annotation data. 45 | cd ./annotations 46 | echo "Downloading MSCOCO train/val annotations ..." 47 | curl -LO http://images.cocodataset.org/annotations/annotations_trainval2014.zip 48 | echo "Finished downloading. Now extracting ..." 49 | 50 | # Unzip data 51 | echo "Extracting train images ..." 52 | unzip ../images/train2014.zip -d ../images 53 | echo "Extracting val images ..." 54 | unzip ../images/val2014.zip -d ../images 55 | echo "Extracting annotations ..." 56 | unzip ./annotations_trainval2014.zip 57 | 58 | echo "Removing zip files ..." 59 | rm ../images/train2014.zip 60 | rm ../images/val2014.zip 61 | rm ./annotations_trainval2014.zip 62 | 63 | echo "Creating trainval35k dataset..." 64 | 65 | # Download annotations json 66 | echo "Downloading trainval35k annotations from S3" 67 | curl -LO https://s3.amazonaws.com/amdegroot-datasets/instances_trainval35k.json.zip 68 | 69 | # combine train and val 70 | echo "Combining train and val images" 71 | mkdir ../images/trainval35k 72 | cd ../images/train2014 73 | find -maxdepth 1 -name '*.jpg' -exec cp -t ../trainval35k {} + # dir too large for cp 74 | cd ../val2014 75 | find -maxdepth 1 -name '*.jpg' -exec cp -t ../trainval35k {} + 76 | 77 | 78 | end=`date +%s` 79 | runtime=$((end-start)) 80 | 81 | echo "Completed in " $runtime " seconds" 82 | -------------------------------------------------------------------------------- /data/scripts/VOC2007.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Ellis Brown 3 | 4 | start=`date +%s` 5 | 6 | # handle optional download dir 7 | if [ -z "$1" ] 8 | then 9 | # navigate to ~/data 10 | echo "navigating to ~/data/ ..." 11 | mkdir -p ~/data 12 | cd ~/data/ 13 | else 14 | # check if is valid directory 15 | if [ ! -d $1 ]; then 16 | echo $1 "is not a valid directory" 17 | exit 0 18 | fi 19 | echo "navigating to" $1 "..." 20 | cd $1 21 | fi 22 | 23 | echo "Downloading VOC2007 trainval ..." 24 | # Download the data. 25 | curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar 26 | echo "Downloading VOC2007 test data ..." 27 | curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar 28 | echo "Done downloading." 29 | 30 | # Extract data 31 | echo "Extracting trainval ..." 32 | tar -xvf VOCtrainval_06-Nov-2007.tar 33 | echo "Extracting test ..." 34 | tar -xvf VOCtest_06-Nov-2007.tar 35 | echo "removing tars ..." 36 | rm VOCtrainval_06-Nov-2007.tar 37 | rm VOCtest_06-Nov-2007.tar 38 | 39 | end=`date +%s` 40 | runtime=$((end-start)) 41 | 42 | echo "Completed in" $runtime "seconds" -------------------------------------------------------------------------------- /data/scripts/VOC2012.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Ellis Brown 3 | 4 | start=`date +%s` 5 | 6 | # handle optional download dir 7 | if [ -z "$1" ] 8 | then 9 | # navigate to ~/data 10 | echo "navigating to ~/data/ ..." 11 | mkdir -p ~/data 12 | cd ~/data/ 13 | else 14 | # check if is valid directory 15 | if [ ! -d $1 ]; then 16 | echo $1 "is not a valid directory" 17 | exit 0 18 | fi 19 | echo "navigating to" $1 "..." 20 | cd $1 21 | fi 22 | 23 | echo "Downloading VOC2012 trainval ..." 24 | # Download the data. 25 | curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar 26 | echo "Done downloading." 
27 | 28 | 29 | # Extract data 30 | echo "Extracting trainval ..." 31 | tar -xvf VOCtrainval_11-May-2012.tar 32 | echo "removing tar ..." 33 | rm VOCtrainval_11-May-2012.tar 34 | 35 | end=`date +%s` 36 | runtime=$((end-start)) 37 | 38 | echo "Completed in" $runtime "seconds" -------------------------------------------------------------------------------- /data/vgg_truncated.txt: -------------------------------------------------------------------------------- 1 | vgg truncated layers: 2 | [Conv2d(1024, 256, kernel_size=(1, 1), stride=(1, 1)), 3 | Conv2d(256, 512, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)), 4 | Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1)), 5 | Conv2d(128, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)), 6 | Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1)), 7 | Conv2d(128, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)), Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1)), Conv2d(128, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)), Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1)), Conv2d(128, 256, kernel_size=(4, 4), stride=(1, 1))] 8 | 9 | -------------------------------------------------------------------------------- /data/voc0712.py: -------------------------------------------------------------------------------- 1 | """VOC Dataset Classes 2 | 3 | Original author: Francisco Massa 4 | https://github.com/fmassa/vision/blob/voc_dataset/torchvision/datasets/voc.py 5 | 6 | Updated by: Ellis Brown, Max deGroot 7 | """ 8 | from .config import HOME 9 | import os.path as osp 10 | import sys 11 | import torch 12 | import torch.utils.data as data 13 | import cv2 14 | import numpy as np 15 | if sys.version_info[0] == 2: 16 | import xml.etree.cElementTree as ET 17 | else: 18 | import xml.etree.ElementTree as ET 19 | 20 | VOC_CLASSES = ( # always index 0 21 | 'aeroplane', 'bicycle', 'bird', 'boat', 22 | 'bottle', 'bus', 'car', 'cat', 'chair', 23 | 'cow', 'diningtable', 'dog', 'horse', 24 | 'motorbike', 'person', 'pottedplant', 25 | 'sheep', 'sofa', 'train', 'tvmonitor') 26 | 27 | # note: if you used our download scripts, this should be right 28 | VOC_ROOT = osp.join('./', "data/VOCdevkit/") 29 | 30 | 31 | class VOCAnnotationTransform(object): 32 | """Transforms a VOC annotation into a Tensor of bbox coords and label index 33 | Initilized with a dictionary lookup of classnames to indexes 34 | 35 | Arguments: 36 | class_to_ind (dict, optional): dictionary lookup of classnames -> indexes 37 | (default: alphabetic indexing of VOC's 20 classes) 38 | keep_difficult (bool, optional): keep difficult instances or not 39 | (default: False) 40 | height (int): height 41 | width (int): width 42 | """ 43 | 44 | def __init__(self, class_to_ind=None, keep_difficult=False): 45 | self.class_to_ind = class_to_ind or dict( 46 | zip(VOC_CLASSES, range(len(VOC_CLASSES)))) 47 | self.keep_difficult = keep_difficult 48 | 49 | def __call__(self, target, width, height): 50 | """ 51 | Arguments: 52 | target (annotation) : the target annotation to be made usable 53 | will be an ET.Element 54 | Returns: 55 | a list containing lists of bounding boxes [bbox coords, class name] 56 | """ 57 | res = [] 58 | for obj in target.iter('object'): 59 | difficult = int(obj.find('difficult').text) == 1 60 | if not self.keep_difficult and difficult: 61 | continue 62 | name = obj.find('name').text.lower().strip() 63 | bbox = obj.find('bndbox') 64 | 65 | pts = ['xmin', 'ymin', 'xmax', 'ymax'] 66 | bndbox = [] 67 | for i, pt in enumerate(pts): 68 | cur_pt = int(bbox.find(pt).text) - 1 69 
| # scale height or width 70 | cur_pt = cur_pt / width if i % 2 == 0 else cur_pt / height 71 | bndbox.append(cur_pt) 72 | label_idx = self.class_to_ind[name] 73 | bndbox.append(label_idx) 74 | res += [bndbox] # [xmin, ymin, xmax, ymax, label_ind] 75 | # img_id = target.find('filename').text[:-4] 76 | 77 | return res # [[xmin, ymin, xmax, ymax, label_ind], ... ] 78 | 79 | 80 | class VOCDetection(data.Dataset): 81 | """VOC Detection Dataset Object 82 | 83 | input is image, target is annotation 84 | 85 | Arguments: 86 | root (string): filepath to VOCdevkit folder. 87 | image_set (string): imageset to use (eg. 'train', 'val', 'test') 88 | transform (callable, optional): transformation to perform on the 89 | input image 90 | target_transform (callable, optional): transformation to perform on the 91 | target `annotation` 92 | (eg: take in caption string, return tensor of word indices) 93 | dataset_name (string, optional): which dataset to load 94 | (default: 'VOC2007') 95 | """ 96 | 97 | def __init__(self, root, 98 | image_sets=[('2007', 'trainval'), ('2012', 'trainval')], 99 | transform=None, target_transform=VOCAnnotationTransform(), 100 | dataset_name='VOC0712'): 101 | self.root = root 102 | self.image_set = image_sets 103 | self.transform = transform 104 | self.target_transform = target_transform 105 | self.name = dataset_name 106 | self._annopath = osp.join('%s', 'Annotations', '%s.xml') 107 | self._imgpath = osp.join('%s', 'JPEGImages', '%s.jpg') 108 | self.ids = list() 109 | for (year, name) in image_sets: 110 | rootpath = osp.join(self.root, 'VOC' + year) 111 | for line in open(osp.join(rootpath, 'ImageSets', 'Main', name + '.txt')): 112 | self.ids.append((rootpath, line.strip())) 113 | 114 | def __getitem__(self, index): 115 | im, gt, h, w = self.pull_item(index) 116 | 117 | return im, gt 118 | 119 | def __len__(self): 120 | return len(self.ids) 121 | 122 | def pull_item(self, index): 123 | img_id = self.ids[index] 124 | 125 | target = ET.parse(self._annopath % img_id).getroot() 126 | img = cv2.imread(self._imgpath % img_id) 127 | height, width, channels = img.shape 128 | 129 | if self.target_transform is not None: 130 | target = self.target_transform(target, width, height) 131 | 132 | if self.transform is not None: 133 | target = np.array(target) 134 | img, boxes, labels = self.transform(img, target[:, :4], target[:, 4]) 135 | # to rgb 136 | img = img[:, :, (2, 1, 0)] 137 | # img = img.transpose(2, 0, 1) 138 | target = np.hstack((boxes, np.expand_dims(labels, axis=1))) 139 | return torch.from_numpy(img).permute(2, 0, 1), target, height, width 140 | # return torch.from_numpy(img), target, height, width 141 | 142 | def pull_image(self, index): 143 | '''Returns the original image object at index in PIL form 144 | 145 | Note: not using self.__getitem__(), as any transformations passed in 146 | could mess up this functionality. 147 | 148 | Argument: 149 | index (int): index of img to show 150 | Return: 151 | PIL img 152 | ''' 153 | img_id = self.ids[index] 154 | return cv2.imread(self._imgpath % img_id, cv2.IMREAD_COLOR) 155 | 156 | def pull_anno(self, index): 157 | '''Returns the original annotation of image at index 158 | 159 | Note: not using self.__getitem__(), as any transformations passed in 160 | could mess up this functionality. 
161 | 162 | Argument: 163 | index (int): index of img to get annotation of 164 | Return: 165 | list: [img_id, [(label, bbox coords),...]] 166 | eg: ('001718', [('dog', (96, 13, 438, 332))]) 167 | ''' 168 | img_id = self.ids[index] 169 | anno = ET.parse(self._annopath % img_id).getroot() 170 | gt = self.target_transform(anno, 1, 1) 171 | return img_id[1], gt 172 | 173 | def pull_tensor(self, index): 174 | '''Returns the original image at an index in tensor form 175 | 176 | Note: not using self.__getitem__(), as any transformations passed in 177 | could mess up this functionality. 178 | 179 | Argument: 180 | index (int): index of img to show 181 | Return: 182 | tensorized version of img, squeezed 183 | ''' 184 | return torch.Tensor(self.pull_image(index)).unsqueeze_(0) 185 | -------------------------------------------------------------------------------- /demo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # -*- coding: utf-8 -*- 3 | ''' 4 | @File: demo.py 5 | @Author:kong 6 | @Time: 2020年01月21日09时40分 7 | @Description: 8 | ''' 9 | import os 10 | import sys 11 | module_path = os.path.abspath(os.path.join('..')) 12 | if module_path not in sys.path: 13 | sys.path.append(module_path) 14 | from matplotlib import pyplot as plt 15 | from data import VOCDetection, VOC_ROOT, VOCAnnotationTransform 16 | import torch 17 | import torch.nn as nn 18 | import torch.backends.cudnn as cudnn 19 | from torch.autograd import Variable 20 | import numpy as np 21 | import cv2 22 | if torch.cuda.is_available(): 23 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 24 | from data import VOC_CLASSES as labels 25 | from ssd import build_ssd 26 | 27 | image_path = './test/example.jpg' 28 | weight_path = './weights/ssd300_VOC_100000.pth' 29 | model_input = 300 30 | 31 | net = build_ssd('test', model_input, 21) # initialize SSD 32 | net.load_weights(weight_path) 33 | image = cv2.imread(image_path, cv2.IMREAD_COLOR) # uncomment if dataset not downloaded 34 | rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) 35 | x = cv2.resize(image, (model_input, model_input)).astype(np.float32) 36 | x -= (104.0, 117.0, 123.0) 37 | x = x.astype(np.float32) 38 | x = x[:, :, ::-1].copy() 39 | x = torch.from_numpy(x).permute(2, 0, 1) 40 | 41 | xx = Variable(x.unsqueeze(0)) # wrap tensor in Variable 42 | if torch.cuda.is_available(): 43 | xx = xx.cuda() 44 | y = net(xx) 45 | 46 | top_k=10 47 | detections = y.data 48 | # scale each detection back up to the image 49 | scale = torch.Tensor(rgb_image.shape[1::-1]).repeat(2) #4个尺度的缩放系数 50 | for i in range(detections.size(1)): #遍历num_class 51 | j = 0 52 | while detections[0,i,j,0] >= 0.2: 53 | score = detections[0,i,j,0] 54 | label_name = labels[i-1] 55 | display_txt = '%s: %.2f'%(label_name, score) 56 | pt = (detections[0,i,j,1:]*scale).cpu().numpy() 57 | j+=1 58 | image = cv2.rectangle(image,(pt[0],pt[1]),(pt[2],pt[3]),(255,0,0),2) 59 | image = cv2.putText(image,display_txt,(pt[2],pt[1]),cv2.FONT_HERSHEY_COMPLEX,1,(255,0,0),2) 60 | cv2.imwrite('./test/resut.jpg',image) 61 | -------------------------------------------------------------------------------- /demo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/midasklr/SSD.Pytorch/33ec443c0a7f3facbaa0643f4c04d4c3dda3cf53/demo/__init__.py -------------------------------------------------------------------------------- /demo/demo.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Object Detection with SSD\n", 8 | "### Here we demostrate detection on example images using SSD with PyTorch" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "import os\n", 18 | "import sys\n", 19 | "module_path = os.path.abspath(os.path.join('..'))\n", 20 | "if module_path not in sys.path:\n", 21 | " sys.path.append(module_path)\n", 22 | "\n", 23 | "import torch\n", 24 | "import torch.nn as nn\n", 25 | "import torch.backends.cudnn as cudnn\n", 26 | "from torch.autograd import Variable\n", 27 | "import numpy as np\n", 28 | "import cv2\n", 29 | "if torch.cuda.is_available():\n", 30 | " torch.set_default_tensor_type('torch.cuda.FloatTensor')\n", 31 | "\n", 32 | "from ssd import build_ssd" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "## Build SSD300 in Test Phase\n", 40 | "1. Build the architecture, specifyingsize of the input image (300),\n", 41 | " and number of object classes to score (21 for VOC dataset)\n", 42 | "2. Next we load pretrained weights on the VOC0712 trainval dataset " 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": { 49 | "scrolled": false 50 | }, 51 | "outputs": [], 52 | "source": [ 53 | "net = build_ssd('test', 300, 21) # initialize SSD\n", 54 | "net.load_weights('../weights/ssd300_VOC_28000.pth')" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": {}, 60 | "source": [ 61 | "## Load Image \n", 62 | "### Here we just load a sample image from the VOC07 dataset " 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "# image = cv2.imread('./data/example.jpg', cv2.IMREAD_COLOR) # uncomment if dataset not downloaded\n", 72 | "%matplotlib inline\n", 73 | "from matplotlib import pyplot as plt\n", 74 | "from data import VOCDetection, VOC_ROOT, VOCAnnotationTransform\n", 75 | "# here we specify year (07 or 12) and dataset ('test', 'val', 'train') \n", 76 | "testset = VOCDetection(VOC_ROOT, [('2007', 'val')], None, VOCAnnotationTransform())\n", 77 | "img_id = 60\n", 78 | "image = testset.pull_image(img_id)\n", 79 | "rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)\n", 80 | "# View the sampled input image before transform\n", 81 | "plt.figure(figsize=(10,10))\n", 82 | "plt.imshow(rgb_image)\n", 83 | "plt.show()" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "## Pre-process the input. \n", 91 | "#### Using the torchvision package, we can create a Compose of multiple built-in transorm ops to apply \n", 92 | "For SSD, at test time we use a custom BaseTransform callable to\n", 93 | "resize our image to 300x300, subtract the dataset's mean rgb values, \n", 94 | "and swap the color channels for input to SSD300." 
95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "x = cv2.resize(image, (300, 300)).astype(np.float32)\n", 104 | "x -= (104.0, 117.0, 123.0)\n", 105 | "x = x.astype(np.float32)\n", 106 | "x = x[:, :, ::-1].copy()\n", 107 | "plt.imshow(x)\n", 108 | "x = torch.from_numpy(x).permute(2, 0, 1)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "## SSD Forward Pass\n", 116 | "### Now just wrap the image in a Variable so it is recognized by PyTorch autograd" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": { 123 | "scrolled": true 124 | }, 125 | "outputs": [], 126 | "source": [ 127 | "xx = Variable(x.unsqueeze(0)) # wrap tensor in Variable\n", 128 | "if torch.cuda.is_available():\n", 129 | " xx = xx.cuda()\n", 130 | "y = net(xx)" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "## Parse the Detections and View Results\n", 138 | "Filter outputs with confidence scores lower than a threshold \n", 139 | "Here we choose 60% " 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "from data import VOC_CLASSES as labels\n", 149 | "top_k=10\n", 150 | "\n", 151 | "plt.figure(figsize=(10,10))\n", 152 | "colors = plt.cm.hsv(np.linspace(0, 1, 21)).tolist()\n", 153 | "plt.imshow(rgb_image) # plot the image for matplotlib\n", 154 | "currentAxis = plt.gca()\n", 155 | "\n", 156 | "detections = y.data\n", 157 | "# scale each detection back up to the image\n", 158 | "scale = torch.Tensor(rgb_image.shape[1::-1]).repeat(2)\n", 159 | "for i in range(detections.size(1)):\n", 160 | " j = 0\n", 161 | " while detections[0,i,j,0] >= 0.6:\n", 162 | " score = detections[0,i,j,0]\n", 163 | " label_name = labels[i-1]\n", 164 | " display_txt = '%s: %.2f'%(label_name, score)\n", 165 | " pt = (detections[0,i,j,1:]*scale).cpu().numpy()\n", 166 | " coords = (pt[0], pt[1]), pt[2]-pt[0]+1, pt[3]-pt[1]+1\n", 167 | " color = colors[i]\n", 168 | " currentAxis.add_patch(plt.Rectangle(*coords, fill=False, edgecolor=color, linewidth=2))\n", 169 | " currentAxis.text(pt[0], pt[1], display_txt, bbox={'facecolor':color, 'alpha':0.5})\n", 170 | " j+=1" 171 | ] 172 | } 173 | ], 174 | "metadata": { 175 | "anaconda-cloud": {}, 176 | "kernelspec": { 177 | "display_name": "Python 3", 178 | "language": "python", 179 | "name": "python3" 180 | }, 181 | "language_info": { 182 | "codemirror_mode": { 183 | "name": "ipython", 184 | "version": 3 185 | }, 186 | "file_extension": ".py", 187 | "mimetype": "text/x-python", 188 | "name": "python", 189 | "nbconvert_exporter": "python", 190 | "pygments_lexer": "ipython3", 191 | "version": "3.7.3" 192 | } 193 | }, 194 | "nbformat": 4, 195 | "nbformat_minor": 1 196 | } 197 | -------------------------------------------------------------------------------- /demo/live.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import torch 3 | from torch.autograd import Variable 4 | import cv2 5 | import time 6 | from imutils.video import FPS, WebcamVideoStream 7 | import argparse 8 | 9 | parser = argparse.ArgumentParser(description='Single Shot MultiBox Detection') 10 | parser.add_argument('--weights', default='weights/ssd_300_VOC0712.pth', 11 | type=str, help='Trained state_dict file path') 12 | 
parser.add_argument('--cuda', default=False, type=bool, 13 | help='Use cuda in live demo') 14 | args = parser.parse_args() 15 | 16 | COLORS = [(255, 0, 0), (0, 255, 0), (0, 0, 255)] 17 | FONT = cv2.FONT_HERSHEY_SIMPLEX 18 | 19 | 20 | def cv2_demo(net, transform): 21 | def predict(frame): 22 | height, width = frame.shape[:2] 23 | x = torch.from_numpy(transform(frame)[0]).permute(2, 0, 1) 24 | x = Variable(x.unsqueeze(0)) 25 | y = net(x) # forward pass 26 | detections = y.data 27 | # scale each detection back up to the image 28 | scale = torch.Tensor([width, height, width, height]) 29 | for i in range(detections.size(1)): 30 | j = 0 31 | while detections[0, i, j, 0] >= 0.6: 32 | pt = (detections[0, i, j, 1:] * scale).cpu().numpy() 33 | cv2.rectangle(frame, 34 | (int(pt[0]), int(pt[1])), 35 | (int(pt[2]), int(pt[3])), 36 | COLORS[i % 3], 2) 37 | cv2.putText(frame, labelmap[i - 1], (int(pt[0]), int(pt[1])), 38 | FONT, 2, (255, 255, 255), 2, cv2.LINE_AA) 39 | j += 1 40 | return frame 41 | 42 | # start video stream thread, allow buffer to fill 43 | print("[INFO] starting threaded video stream...") 44 | stream = WebcamVideoStream(src=0).start() # default camera 45 | time.sleep(1.0) 46 | # start fps timer 47 | # loop over frames from the video file stream 48 | while True: 49 | # grab next frame 50 | frame = stream.read() 51 | key = cv2.waitKey(1) & 0xFF 52 | 53 | # update FPS counter 54 | fps.update() 55 | frame = predict(frame) 56 | 57 | # keybindings for display 58 | if key == ord('p'): # pause 59 | while True: 60 | key2 = cv2.waitKey(1) or 0xff 61 | cv2.imshow('frame', frame) 62 | if key2 == ord('p'): # resume 63 | break 64 | cv2.imshow('frame', frame) 65 | if key == 27: # exit 66 | break 67 | 68 | 69 | if __name__ == '__main__': 70 | import sys 71 | from os import path 72 | sys.path.append(path.dirname(path.dirname(path.abspath(__file__)))) 73 | 74 | from data import BaseTransform, VOC_CLASSES as labelmap 75 | from ssd import build_ssd 76 | 77 | net = build_ssd('test', 300, 21) # initialize SSD 78 | net.load_state_dict(torch.load(args.weights)) 79 | transform = BaseTransform(net.size, (104/256.0, 117/256.0, 123/256.0)) 80 | 81 | fps = FPS().start() 82 | cv2_demo(net.eval(), transform) 83 | # stop the timer and display FPS information 84 | fps.stop() 85 | 86 | print("[INFO] elasped time: {:.2f}".format(fps.elapsed())) 87 | print("[INFO] approx. 
FPS: {:.2f}".format(fps.fps())) 88 | 89 | # cleanup 90 | cv2.destroyAllWindows() 91 | stream.stop() 92 | -------------------------------------------------------------------------------- /efficientnet_pytorch/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.5.1" 2 | from .fire_smoke_model import FireSmokeEfficientNet 3 | from .model import EfficientNet 4 | from .utils import ( 5 | GlobalParams, 6 | BlockArgs, 7 | BlockDecoder, 8 | efficientnet, 9 | get_model_params, 10 | ) 11 | 12 | -------------------------------------------------------------------------------- /efficientnet_pytorch/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/midasklr/SSD.Pytorch/33ec443c0a7f3facbaa0643f4c04d4c3dda3cf53/efficientnet_pytorch/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /efficientnet_pytorch/__pycache__/fire_smoke_model.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/midasklr/SSD.Pytorch/33ec443c0a7f3facbaa0643f4c04d4c3dda3cf53/efficientnet_pytorch/__pycache__/fire_smoke_model.cpython-37.pyc -------------------------------------------------------------------------------- /efficientnet_pytorch/__pycache__/model.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/midasklr/SSD.Pytorch/33ec443c0a7f3facbaa0643f4c04d4c3dda3cf53/efficientnet_pytorch/__pycache__/model.cpython-37.pyc -------------------------------------------------------------------------------- /efficientnet_pytorch/__pycache__/utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/midasklr/SSD.Pytorch/33ec443c0a7f3facbaa0643f4c04d4c3dda3cf53/efficientnet_pytorch/__pycache__/utils.cpython-37.pyc -------------------------------------------------------------------------------- /efficientnet_pytorch/fire_smoke_model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # -*- coding: utf-8 -*- 3 | ''' 4 | @File: fire_smoke_model.py 5 | @Author:konglingran 6 | @Time: 2020年01月02日15时52分 7 | @Description: 8 | ''' 9 | import torch 10 | from torch import nn 11 | from torch.nn import functional as F 12 | 13 | from .utils import ( 14 | round_filters, 15 | round_repeats, 16 | drop_connect, 17 | get_same_padding_conv2d, 18 | get_model_params, 19 | efficientnet_params, 20 | load_pretrained_weights, 21 | Swish, 22 | MemoryEfficientSwish, 23 | ) 24 | 25 | 26 | class MBConvBlock(nn.Module): 27 | """ 28 | Mobile Inverted Residual Bottleneck Block 29 | 30 | Args: 31 | block_args (namedtuple): BlockArgs, see above 32 | global_params (namedtuple): GlobalParam, see above 33 | 34 | Attributes: 35 | has_se (bool): Whether the block contains a Squeeze and Excitation layer. 
36 | """ 37 | 38 | def __init__(self, block_args, global_params): 39 | super().__init__() 40 | self._block_args = block_args 41 | self._bn_mom = 1 - global_params.batch_norm_momentum 42 | self._bn_eps = global_params.batch_norm_epsilon 43 | self.has_se = (self._block_args.se_ratio is not None) and (0 < self._block_args.se_ratio <= 1) 44 | self.id_skip = block_args.id_skip # skip connection and drop connect 45 | 46 | # Get static or dynamic convolution depending on image size 47 | Conv2d = get_same_padding_conv2d(image_size=global_params.image_size) 48 | 49 | # Expansion phase 50 | inp = self._block_args.input_filters # number of input channels 51 | oup = self._block_args.input_filters * self._block_args.expand_ratio # number of output channels 52 | if self._block_args.expand_ratio != 1: 53 | self._expand_conv = Conv2d(in_channels=inp, out_channels=oup, kernel_size=1, bias=False) 54 | self._bn0 = nn.BatchNorm2d(num_features=oup, momentum=self._bn_mom, eps=self._bn_eps) 55 | 56 | # Depthwise convolution phase 57 | k = self._block_args.kernel_size 58 | s = self._block_args.stride 59 | self._depthwise_conv = Conv2d( 60 | in_channels=oup, out_channels=oup, groups=oup, # groups makes it depthwise 61 | kernel_size=k, stride=s, bias=False) 62 | self._bn1 = nn.BatchNorm2d(num_features=oup, momentum=self._bn_mom, eps=self._bn_eps) 63 | 64 | # Squeeze and Excitation layer, if desired 65 | if self.has_se: 66 | num_squeezed_channels = max(1, int(self._block_args.input_filters * self._block_args.se_ratio)) 67 | self._se_reduce = Conv2d(in_channels=oup, out_channels=num_squeezed_channels, kernel_size=1) 68 | self._se_expand = Conv2d(in_channels=num_squeezed_channels, out_channels=oup, kernel_size=1) 69 | 70 | # Output phase 71 | final_oup = self._block_args.output_filters 72 | self._project_conv = Conv2d(in_channels=oup, out_channels=final_oup, kernel_size=1, bias=False) 73 | self._bn2 = nn.BatchNorm2d(num_features=final_oup, momentum=self._bn_mom, eps=self._bn_eps) 74 | self._swish = MemoryEfficientSwish() 75 | 76 | def forward(self, inputs, drop_connect_rate=None): 77 | """ 78 | :param inputs: input tensor 79 | :param drop_connect_rate: drop connect rate (float, between 0 and 1) 80 | :return: output of block 81 | """ 82 | 83 | # Expansion and Depthwise Convolution 84 | x = inputs 85 | if self._block_args.expand_ratio != 1: 86 | x = self._swish(self._bn0(self._expand_conv(inputs))) 87 | x = self._swish(self._bn1(self._depthwise_conv(x))) 88 | 89 | # Squeeze and Excitation 90 | if self.has_se: 91 | x_squeezed = F.adaptive_avg_pool2d(x, 1) 92 | x_squeezed = self._se_expand(self._swish(self._se_reduce(x_squeezed))) 93 | x = torch.sigmoid(x_squeezed) * x 94 | 95 | x = self._bn2(self._project_conv(x)) 96 | 97 | # Skip connection and drop connect 98 | input_filters, output_filters = self._block_args.input_filters, self._block_args.output_filters 99 | if self.id_skip and self._block_args.stride == 1 and input_filters == output_filters: 100 | if drop_connect_rate: 101 | x = drop_connect(x, p=drop_connect_rate, training=self.training) 102 | x = x + inputs # skip connection 103 | return x 104 | 105 | def set_swish(self, memory_efficient=True): 106 | """Sets swish function as memory efficient (for training) or standard (for export)""" 107 | self._swish = MemoryEfficientSwish() if memory_efficient else Swish() 108 | 109 | 110 | class FireSmokeEfficientNet(nn.Module): 111 | """ 112 | An EfficientNet model. 
Most easily loaded with the .from_name or .from_pretrained methods 113 | 114 | Args: 115 | blocks_args (list): A list of BlockArgs to construct blocks 116 | global_params (namedtuple): A set of GlobalParams shared between blocks 117 | 118 | Example: 119 | model = EfficientNet.from_pretrained('efficientnet-b0') 120 | 121 | """ 122 | 123 | def __init__(self, blocks_args=None, global_params=None): 124 | super().__init__() 125 | assert isinstance(blocks_args, list), 'blocks_args should be a list' 126 | assert len(blocks_args) > 0, 'block args must be greater than 0' 127 | self._global_params = global_params 128 | self._blocks_args = blocks_args 129 | 130 | # Get static or dynamic convolution depending on image size 131 | Conv2d = get_same_padding_conv2d(image_size=global_params.image_size) 132 | 133 | # Batch norm parameters 134 | bn_mom = 1 - self._global_params.batch_norm_momentum 135 | bn_eps = self._global_params.batch_norm_epsilon 136 | 137 | # Stem 138 | in_channels = 3 # rgb 139 | out_channels = round_filters(32, self._global_params) # number of output channels 140 | self._conv_stem = Conv2d(in_channels, out_channels, kernel_size=3, stride=2, bias=False) 141 | self._bn0 = nn.BatchNorm2d(num_features=out_channels, momentum=bn_mom, eps=bn_eps) 142 | 143 | # Build blocks 144 | self._blocks = nn.ModuleList([]) 145 | for block_args in self._blocks_args: 146 | 147 | # Update block input and output filters based on depth multiplier. 148 | block_args = block_args._replace( 149 | input_filters=round_filters(block_args.input_filters, self._global_params), 150 | output_filters=round_filters(block_args.output_filters, self._global_params), 151 | num_repeat=round_repeats(block_args.num_repeat, self._global_params) 152 | ) 153 | 154 | # The first block needs to take care of stride and filter size increase. 
155 | self._blocks.append(MBConvBlock(block_args, self._global_params)) 156 | if block_args.num_repeat > 1: 157 | block_args = block_args._replace(input_filters=block_args.output_filters, stride=1) 158 | for _ in range(block_args.num_repeat - 1): 159 | self._blocks.append(MBConvBlock(block_args, self._global_params)) 160 | 161 | # Head 162 | in_channels = block_args.output_filters # output of final block 163 | out_channels = round_filters(1280, self._global_params) 164 | self._conv_head = Conv2d(in_channels, out_channels, kernel_size=1, bias=False) 165 | self._bn1 = nn.BatchNorm2d(num_features=out_channels, momentum=bn_mom, eps=bn_eps) 166 | 167 | # Final linear layer 168 | self._avg_pooling = nn.AdaptiveAvgPool2d(1) 169 | self._dropout = nn.Dropout(self._global_params.dropout_rate) 170 | self._fc = nn.Linear(out_channels, self._global_params.num_classes) 171 | self._swish = MemoryEfficientSwish() 172 | 173 | def set_swish(self, memory_efficient=True): 174 | """Sets swish function as memory efficient (for training) or standard (for export)""" 175 | self._swish = MemoryEfficientSwish() if memory_efficient else Swish() 176 | for block in self._blocks: 177 | block.set_swish(memory_efficient) 178 | 179 | def extract_features(self, inputs): 180 | """ Returns output of the final convolution layer """ 181 | 182 | # Stem 183 | x = self._swish(self._bn0(self._conv_stem(inputs))) 184 | 185 | # Blocks 186 | for idx, block in enumerate(self._blocks): 187 | drop_connect_rate = self._global_params.drop_connect_rate 188 | if drop_connect_rate: 189 | drop_connect_rate *= float(idx) / len(self._blocks) 190 | x = block(x, drop_connect_rate=drop_connect_rate) 191 | 192 | # Head 193 | x = self._swish(self._bn1(self._conv_head(x))) 194 | 195 | return x 196 | 197 | def forward(self, inputs): 198 | """ Calls extract_features to extract features, applies final linear layer, and returns logits. 
""" 199 | bs = inputs.size(0) 200 | # Convolution layers 201 | x = self.extract_features(inputs) 202 | 203 | # Pooling and final linear layer 204 | x = self._avg_pooling(x) 205 | x = x.view(bs, -1) 206 | x = self._dropout(x) 207 | x = self._fc(x) 208 | return x 209 | 210 | @classmethod 211 | def from_name(cls, model_name, override_params=None): 212 | cls._check_model_name_is_valid(model_name) 213 | blocks_args, global_params = get_model_params(model_name, override_params) 214 | return cls(blocks_args, global_params) 215 | 216 | @classmethod 217 | def from_arch(cls, model_name, override_params=None): 218 | cls._check_model_name_is_valid(model_name) 219 | blocks_args, global_params = get_model_params(model_name, override_params) 220 | return cls(blocks_args, global_params) 221 | 222 | @classmethod 223 | def from_pretrained(cls, args, num_classes=1000, in_channels=3): 224 | model = cls.from_name(args.arch, override_params={'num_classes': num_classes}) 225 | load_pretrained_weights(model, args.arch, load_fc=(num_classes == 1000)) 226 | if in_channels != 3: 227 | Conv2d = get_same_padding_conv2d(image_size=model._global_params.image_size) 228 | out_channels = round_filters(32, model._global_params) 229 | model._conv_stem = Conv2d(in_channels, out_channels, kernel_size=3, stride=2, bias=False) 230 | out_channels = round_filters(1280, model._global_params) 231 | model._fc = nn.Linear(out_channels, args.num_cls) 232 | return model 233 | 234 | @classmethod 235 | def from_pretrained(cls, args, num_classes=1000): 236 | model = cls.from_name(args.arch, override_params={'num_classes': num_classes}) 237 | load_pretrained_weights(model, args.arch, load_fc=(num_classes == 1000)) 238 | out_channels = round_filters(1280, model._global_params) 239 | model._fc = nn.Linear(out_channels, args.num_cls) 240 | return model 241 | 242 | @classmethod 243 | def get_image_size(cls, model_name): 244 | cls._check_model_name_is_valid(model_name) 245 | _, _, res, _ = efficientnet_params(model_name) 246 | return res 247 | 248 | @classmethod 249 | def _check_model_name_is_valid(cls, model_name, also_need_pretrained_weights=False): 250 | """ Validates model name. None that pretrained weights are only available for 251 | the first four models (efficientnet-b{i} for i in 0,1,2,3) at the moment. """ 252 | num_models = 4 if also_need_pretrained_weights else 8 253 | valid_models = ['efficientnet-b' + str(i) for i in range(num_models)] 254 | if model_name not in valid_models: 255 | raise ValueError('model_name should be one of: ' + ', '.join(valid_models)) 256 | -------------------------------------------------------------------------------- /efficientnet_pytorch/model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from torch.nn import functional as F 4 | 5 | from .utils import ( 6 | round_filters, 7 | round_repeats, 8 | drop_connect, 9 | get_same_padding_conv2d, 10 | get_model_params, 11 | efficientnet_params, 12 | load_pretrained_weights, 13 | Swish, 14 | MemoryEfficientSwish, 15 | ) 16 | 17 | class MBConvBlock(nn.Module): 18 | """ 19 | Mobile Inverted Residual Bottleneck Block 20 | 21 | Args: 22 | block_args (namedtuple): BlockArgs, see above 23 | global_params (namedtuple): GlobalParam, see above 24 | 25 | Attributes: 26 | has_se (bool): Whether the block contains a Squeeze and Excitation layer. 
27 | """ 28 | 29 | def __init__(self, block_args, global_params): 30 | super().__init__() 31 | self._block_args = block_args 32 | self._bn_mom = 1 - global_params.batch_norm_momentum 33 | self._bn_eps = global_params.batch_norm_epsilon 34 | self.has_se = (self._block_args.se_ratio is not None) and (0 < self._block_args.se_ratio <= 1) 35 | self.id_skip = block_args.id_skip # skip connection and drop connect 36 | 37 | # Get static or dynamic convolution depending on image size 38 | Conv2d = get_same_padding_conv2d(image_size=global_params.image_size) 39 | 40 | # Expansion phase 41 | inp = self._block_args.input_filters # number of input channels 42 | oup = self._block_args.input_filters * self._block_args.expand_ratio # number of output channels 43 | if self._block_args.expand_ratio != 1: 44 | self._expand_conv = Conv2d(in_channels=inp, out_channels=oup, kernel_size=1, bias=False) 45 | self._bn0 = nn.BatchNorm2d(num_features=oup, momentum=self._bn_mom, eps=self._bn_eps) 46 | 47 | # Depthwise convolution phase 48 | k = self._block_args.kernel_size 49 | s = self._block_args.stride 50 | self._depthwise_conv = Conv2d( 51 | in_channels=oup, out_channels=oup, groups=oup, # groups makes it depthwise 52 | kernel_size=k, stride=s, bias=False) 53 | self._bn1 = nn.BatchNorm2d(num_features=oup, momentum=self._bn_mom, eps=self._bn_eps) 54 | 55 | # Squeeze and Excitation layer, if desired 56 | if self.has_se: 57 | num_squeezed_channels = max(1, int(self._block_args.input_filters * self._block_args.se_ratio)) 58 | self._se_reduce = Conv2d(in_channels=oup, out_channels=num_squeezed_channels, kernel_size=1) 59 | self._se_expand = Conv2d(in_channels=num_squeezed_channels, out_channels=oup, kernel_size=1) 60 | 61 | # Output phase 62 | final_oup = self._block_args.output_filters 63 | self._project_conv = Conv2d(in_channels=oup, out_channels=final_oup, kernel_size=1, bias=False) 64 | self._bn2 = nn.BatchNorm2d(num_features=final_oup, momentum=self._bn_mom, eps=self._bn_eps) 65 | self._swish = MemoryEfficientSwish() 66 | 67 | def forward(self, inputs, drop_connect_rate=None): 68 | """ 69 | :param inputs: input tensor 70 | :param drop_connect_rate: drop connect rate (float, between 0 and 1) 71 | :return: output of block 72 | """ 73 | 74 | # Expansion and Depthwise Convolution 75 | x = inputs 76 | if self._block_args.expand_ratio != 1: 77 | x = self._swish(self._bn0(self._expand_conv(inputs))) 78 | x = self._swish(self._bn1(self._depthwise_conv(x))) 79 | 80 | # Squeeze and Excitation 81 | if self.has_se: 82 | x_squeezed = F.adaptive_avg_pool2d(x, 1) 83 | x_squeezed = self._se_expand(self._swish(self._se_reduce(x_squeezed))) 84 | x = torch.sigmoid(x_squeezed) * x 85 | 86 | x = self._bn2(self._project_conv(x)) 87 | 88 | # Skip connection and drop connect 89 | input_filters, output_filters = self._block_args.input_filters, self._block_args.output_filters 90 | if self.id_skip and self._block_args.stride == 1 and input_filters == output_filters: 91 | if drop_connect_rate: 92 | x = drop_connect(x, p=drop_connect_rate, training=self.training) 93 | x = x + inputs # skip connection 94 | return x 95 | 96 | def set_swish(self, memory_efficient=True): 97 | """Sets swish function as memory efficient (for training) or standard (for export)""" 98 | self._swish = MemoryEfficientSwish() if memory_efficient else Swish() 99 | 100 | 101 | class EfficientNet(nn.Module): 102 | """ 103 | An EfficientNet model. 
Most easily loaded with the .from_name or .from_pretrained methods 104 | 105 | Args: 106 | blocks_args (list): A list of BlockArgs to construct blocks 107 | global_params (namedtuple): A set of GlobalParams shared between blocks 108 | 109 | Example: 110 | model = EfficientNet.from_pretrained('efficientnet-b0') 111 | 112 | """ 113 | 114 | def __init__(self, blocks_args=None, global_params=None): 115 | super().__init__() 116 | assert isinstance(blocks_args, list), 'blocks_args should be a list' 117 | assert len(blocks_args) > 0, 'block args must be greater than 0' 118 | self._global_params = global_params 119 | self._blocks_args = blocks_args 120 | 121 | # Get static or dynamic convolution depending on image size 122 | Conv2d = get_same_padding_conv2d(image_size=global_params.image_size) 123 | 124 | # Batch norm parameters 125 | bn_mom = 1 - self._global_params.batch_norm_momentum 126 | bn_eps = self._global_params.batch_norm_epsilon 127 | 128 | # Stem 129 | in_channels = 3 # rgb 130 | out_channels = round_filters(32, self._global_params) # number of output channels 131 | self._conv_stem = Conv2d(in_channels, out_channels, kernel_size=3, stride=2, bias=False) 132 | self._bn0 = nn.BatchNorm2d(num_features=out_channels, momentum=bn_mom, eps=bn_eps) 133 | 134 | # Build blocks 135 | self._blocks = nn.ModuleList([]) 136 | for block_args in self._blocks_args: 137 | 138 | # Update block input and output filters based on depth multiplier. 139 | block_args = block_args._replace( 140 | input_filters=round_filters(block_args.input_filters, self._global_params), 141 | output_filters=round_filters(block_args.output_filters, self._global_params), 142 | num_repeat=round_repeats(block_args.num_repeat, self._global_params) 143 | ) 144 | 145 | # The first block needs to take care of stride and filter size increase. 
146 | self._blocks.append(MBConvBlock(block_args, self._global_params)) 147 | if block_args.num_repeat > 1: 148 | block_args = block_args._replace(input_filters=block_args.output_filters, stride=1) 149 | for _ in range(block_args.num_repeat - 1): 150 | self._blocks.append(MBConvBlock(block_args, self._global_params)) 151 | 152 | # Head 153 | in_channels = block_args.output_filters # output of final block 154 | out_channels = round_filters(1280, self._global_params) 155 | self._conv_head = Conv2d(in_channels, out_channels, kernel_size=1, bias=False) 156 | self._bn1 = nn.BatchNorm2d(num_features=out_channels, momentum=bn_mom, eps=bn_eps) 157 | 158 | # Final linear layer 159 | self._avg_pooling = nn.AdaptiveAvgPool2d(1) 160 | self._dropout = nn.Dropout(self._global_params.dropout_rate) 161 | self._fc = nn.Linear(out_channels, self._global_params.num_classes) 162 | self._swish = MemoryEfficientSwish() 163 | 164 | def set_swish(self, memory_efficient=True): 165 | """Sets swish function as memory efficient (for training) or standard (for export)""" 166 | self._swish = MemoryEfficientSwish() if memory_efficient else Swish() 167 | for block in self._blocks: 168 | block.set_swish(memory_efficient) 169 | 170 | 171 | def extract_features(self, inputs): 172 | """ Returns output of the final convolution layer """ 173 | 174 | # Stem 175 | x = self._swish(self._bn0(self._conv_stem(inputs))) 176 | 177 | # Blocks 178 | for idx, block in enumerate(self._blocks): 179 | drop_connect_rate = self._global_params.drop_connect_rate 180 | if drop_connect_rate: 181 | drop_connect_rate *= float(idx) / len(self._blocks) 182 | x = block(x, drop_connect_rate=drop_connect_rate) 183 | 184 | # Head 185 | x = self._swish(self._bn1(self._conv_head(x))) 186 | 187 | return x 188 | 189 | def forward(self, inputs): 190 | """ Calls extract_features to extract features, applies final linear layer, and returns logits. 
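        Example (a minimal sketch; the input resolution should match get_image_size(model_name)):

            model = EfficientNet.from_name('efficientnet-b0')
            logits = model(torch.randn(2, 3, 224, 224))   # shape [2, 1000]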
""" 191 | bs = inputs.size(0) 192 | # Convolution layers 193 | x = self.extract_features(inputs) 194 | 195 | # Pooling and final linear layer 196 | x = self._avg_pooling(x) 197 | x = x.view(bs, -1) 198 | x = self._dropout(x) 199 | x = self._fc(x) 200 | return x 201 | 202 | @classmethod 203 | def from_name(cls, model_name, override_params=None): 204 | cls._check_model_name_is_valid(model_name) 205 | blocks_args, global_params = get_model_params(model_name, override_params) 206 | return cls(blocks_args, global_params) 207 | 208 | @classmethod 209 | def from_pretrained(cls, model_name, num_classes=1000, in_channels = 3): 210 | model = cls.from_name(model_name, override_params={'num_classes': num_classes}) 211 | load_pretrained_weights(model, model_name, load_fc=(num_classes == 1000)) 212 | if in_channels != 3: 213 | Conv2d = get_same_padding_conv2d(image_size = model._global_params.image_size) 214 | out_channels = round_filters(32, model._global_params) 215 | model._conv_stem = Conv2d(in_channels, out_channels, kernel_size=3, stride=2, bias=False) 216 | return model 217 | 218 | @classmethod 219 | def from_pretrained(cls, model_name, num_classes=1000): 220 | model = cls.from_name(model_name, override_params={'num_classes': num_classes}) 221 | load_pretrained_weights(model, model_name, load_fc=(num_classes == 1000)) 222 | 223 | return model 224 | 225 | @classmethod 226 | def get_image_size(cls, model_name): 227 | cls._check_model_name_is_valid(model_name) 228 | _, _, res, _ = efficientnet_params(model_name) 229 | return res 230 | 231 | @classmethod 232 | def _check_model_name_is_valid(cls, model_name, also_need_pretrained_weights=False): 233 | """ Validates model name. None that pretrained weights are only available for 234 | the first four models (efficientnet-b{i} for i in 0,1,2,3) at the moment. """ 235 | num_models = 4 if also_need_pretrained_weights else 8 236 | valid_models = ['efficientnet-b'+str(i) for i in range(num_models)] 237 | if model_name not in valid_models: 238 | raise ValueError('model_name should be one of: ' + ', '.join(valid_models)) 239 | -------------------------------------------------------------------------------- /efficientnet_pytorch/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains helper functions for building the model and for loading model parameters. 3 | These helper functions are built to mirror those in the official TensorFlow implementation. 
4 | """ 5 | 6 | import re 7 | import math 8 | import collections 9 | from functools import partial 10 | import torch 11 | from torch import nn 12 | from torch.nn import functional as F 13 | from torch.utils import model_zoo 14 | 15 | ######################################################################## 16 | ############### HELPERS FUNCTIONS FOR MODEL ARCHITECTURE ############### 17 | ######################################################################## 18 | 19 | 20 | # Parameters for the entire model (stem, all blocks, and head) 21 | GlobalParams = collections.namedtuple('GlobalParams', [ 22 | 'batch_norm_momentum', 'batch_norm_epsilon', 'dropout_rate', 23 | 'num_classes', 'width_coefficient', 'depth_coefficient', 24 | 'depth_divisor', 'min_depth', 'drop_connect_rate', 'image_size']) 25 | 26 | # Parameters for an individual model block 27 | BlockArgs = collections.namedtuple('BlockArgs', [ 28 | 'kernel_size', 'num_repeat', 'input_filters', 'output_filters', 29 | 'expand_ratio', 'id_skip', 'stride', 'se_ratio']) 30 | 31 | # Change namedtuple defaults 32 | GlobalParams.__new__.__defaults__ = (None,) * len(GlobalParams._fields) 33 | BlockArgs.__new__.__defaults__ = (None,) * len(BlockArgs._fields) 34 | 35 | 36 | class SwishImplementation(torch.autograd.Function): 37 | @staticmethod 38 | def forward(ctx, i): 39 | result = i * torch.sigmoid(i) 40 | ctx.save_for_backward(i) 41 | return result 42 | 43 | @staticmethod 44 | def backward(ctx, grad_output): 45 | i = ctx.saved_variables[0] 46 | sigmoid_i = torch.sigmoid(i) 47 | return grad_output * (sigmoid_i * (1 + i * (1 - sigmoid_i))) 48 | 49 | 50 | class MemoryEfficientSwish(nn.Module): 51 | def forward(self, x): 52 | return SwishImplementation.apply(x) 53 | 54 | class Swish(nn.Module): 55 | def forward(self, x): 56 | return x * torch.sigmoid(x) 57 | 58 | 59 | def round_filters(filters, global_params): 60 | """ Calculate and round number of filters based on depth multiplier. """ 61 | multiplier = global_params.width_coefficient 62 | if not multiplier: 63 | return filters 64 | divisor = global_params.depth_divisor 65 | min_depth = global_params.min_depth 66 | filters *= multiplier 67 | min_depth = min_depth or divisor 68 | new_filters = max(min_depth, int(filters + divisor / 2) // divisor * divisor) 69 | if new_filters < 0.9 * filters: # prevent rounding by more than 10% 70 | new_filters += divisor 71 | return int(new_filters) 72 | 73 | 74 | def round_repeats(repeats, global_params): 75 | """ Round number of filters based on depth multiplier. """ 76 | multiplier = global_params.depth_coefficient 77 | if not multiplier: 78 | return repeats 79 | return int(math.ceil(multiplier * repeats)) 80 | 81 | 82 | def drop_connect(inputs, p, training): 83 | """ Drop connect. """ 84 | if not training: return inputs 85 | batch_size = inputs.shape[0] 86 | keep_prob = 1 - p 87 | random_tensor = keep_prob 88 | random_tensor += torch.rand([batch_size, 1, 1, 1], dtype=inputs.dtype, device=inputs.device) 89 | binary_tensor = torch.floor(random_tensor) 90 | output = inputs / keep_prob * binary_tensor 91 | return output 92 | 93 | 94 | def get_same_padding_conv2d(image_size=None): 95 | """ Chooses static padding if you have specified an image size, and dynamic padding otherwise. 96 | Static padding is necessary for ONNX exporting of models. 
""" 97 | if image_size is None: 98 | return Conv2dDynamicSamePadding 99 | else: 100 | return partial(Conv2dStaticSamePadding, image_size=image_size) 101 | 102 | 103 | class Conv2dDynamicSamePadding(nn.Conv2d): 104 | """ 2D Convolutions like TensorFlow, for a dynamic image size """ 105 | 106 | def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, groups=1, bias=True): 107 | super().__init__(in_channels, out_channels, kernel_size, stride, 0, dilation, groups, bias) 108 | self.stride = self.stride if len(self.stride) == 2 else [self.stride[0]] * 2 109 | 110 | def forward(self, x): 111 | ih, iw = x.size()[-2:] 112 | kh, kw = self.weight.size()[-2:] 113 | sh, sw = self.stride 114 | oh, ow = math.ceil(ih / sh), math.ceil(iw / sw) 115 | pad_h = max((oh - 1) * self.stride[0] + (kh - 1) * self.dilation[0] + 1 - ih, 0) 116 | pad_w = max((ow - 1) * self.stride[1] + (kw - 1) * self.dilation[1] + 1 - iw, 0) 117 | if pad_h > 0 or pad_w > 0: 118 | x = F.pad(x, [pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2]) 119 | return F.conv2d(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups) 120 | 121 | 122 | class Conv2dStaticSamePadding(nn.Conv2d): 123 | """ 2D Convolutions like TensorFlow, for a fixed image size""" 124 | 125 | def __init__(self, in_channels, out_channels, kernel_size, image_size=None, **kwargs): 126 | super().__init__(in_channels, out_channels, kernel_size, **kwargs) 127 | self.stride = self.stride if len(self.stride) == 2 else [self.stride[0]] * 2 128 | 129 | # Calculate padding based on image size and save it 130 | assert image_size is not None 131 | ih, iw = image_size if type(image_size) == list else [image_size, image_size] 132 | kh, kw = self.weight.size()[-2:] 133 | sh, sw = self.stride 134 | oh, ow = math.ceil(ih / sh), math.ceil(iw / sw) 135 | pad_h = max((oh - 1) * self.stride[0] + (kh - 1) * self.dilation[0] + 1 - ih, 0) 136 | pad_w = max((ow - 1) * self.stride[1] + (kw - 1) * self.dilation[1] + 1 - iw, 0) 137 | if pad_h > 0 or pad_w > 0: 138 | self.static_padding = nn.ZeroPad2d((pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2)) 139 | else: 140 | self.static_padding = Identity() 141 | 142 | def forward(self, x): 143 | x = self.static_padding(x) 144 | x = F.conv2d(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups) 145 | return x 146 | 147 | 148 | class Identity(nn.Module): 149 | def __init__(self, ): 150 | super(Identity, self).__init__() 151 | 152 | def forward(self, input): 153 | return input 154 | 155 | 156 | ######################################################################## 157 | ############## HELPERS FUNCTIONS FOR LOADING MODEL PARAMS ############## 158 | ######################################################################## 159 | 160 | 161 | def efficientnet_params(model_name): 162 | """ Map EfficientNet model name to parameter coefficients. 
""" 163 | params_dict = { 164 | # Coefficients: width,depth,res,dropout 165 | 'efficientnet-b0': (1.0, 1.0, 224, 0.2), 166 | 'efficientnet-b1': (1.0, 1.1, 240, 0.2), 167 | 'efficientnet-b2': (1.1, 1.2, 260, 0.3), 168 | 'efficientnet-b3': (1.2, 1.4, 300, 0.3), 169 | 'efficientnet-b4': (1.4, 1.8, 380, 0.4), 170 | 'efficientnet-b5': (1.6, 2.2, 456, 0.4), 171 | 'efficientnet-b6': (1.8, 2.6, 528, 0.5), 172 | 'efficientnet-b7': (2.0, 3.1, 600, 0.5), 173 | } 174 | return params_dict[model_name] 175 | 176 | 177 | class BlockDecoder(object): 178 | """ Block Decoder for readability, straight from the official TensorFlow repository """ 179 | 180 | @staticmethod 181 | def _decode_block_string(block_string): 182 | """ Gets a block through a string notation of arguments. """ 183 | assert isinstance(block_string, str) 184 | 185 | ops = block_string.split('_') 186 | options = {} 187 | for op in ops: 188 | splits = re.split(r'(\d.*)', op) 189 | if len(splits) >= 2: 190 | key, value = splits[:2] 191 | options[key] = value 192 | 193 | # Check stride 194 | assert (('s' in options and len(options['s']) == 1) or 195 | (len(options['s']) == 2 and options['s'][0] == options['s'][1])) 196 | 197 | return BlockArgs( 198 | kernel_size=int(options['k']), 199 | num_repeat=int(options['r']), 200 | input_filters=int(options['i']), 201 | output_filters=int(options['o']), 202 | expand_ratio=int(options['e']), 203 | id_skip=('noskip' not in block_string), 204 | se_ratio=float(options['se']) if 'se' in options else None, 205 | stride=[int(options['s'][0])]) 206 | 207 | @staticmethod 208 | def _encode_block_string(block): 209 | """Encodes a block to a string.""" 210 | args = [ 211 | 'r%d' % block.num_repeat, 212 | 'k%d' % block.kernel_size, 213 | 's%d%d' % (block.strides[0], block.strides[1]), 214 | 'e%s' % block.expand_ratio, 215 | 'i%d' % block.input_filters, 216 | 'o%d' % block.output_filters 217 | ] 218 | if 0 < block.se_ratio <= 1: 219 | args.append('se%s' % block.se_ratio) 220 | if block.id_skip is False: 221 | args.append('noskip') 222 | return '_'.join(args) 223 | 224 | @staticmethod 225 | def decode(string_list): 226 | """ 227 | Decodes a list of string notations to specify blocks inside the network. 228 | 229 | :param string_list: a list of strings, each string is a notation of block 230 | :return: a list of BlockArgs namedtuples of block args 231 | """ 232 | assert isinstance(string_list, list) 233 | blocks_args = [] 234 | for block_string in string_list: 235 | blocks_args.append(BlockDecoder._decode_block_string(block_string)) 236 | return blocks_args 237 | 238 | @staticmethod 239 | def encode(blocks_args): 240 | """ 241 | Encodes a list of BlockArgs to a list of strings. 242 | 243 | :param blocks_args: a list of BlockArgs namedtuples of block args 244 | :return: a list of strings, each string is a notation of block 245 | """ 246 | block_strings = [] 247 | for block in blocks_args: 248 | block_strings.append(BlockDecoder._encode_block_string(block)) 249 | return block_strings 250 | 251 | 252 | def efficientnet(width_coefficient=None, depth_coefficient=None, dropout_rate=0.2, 253 | drop_connect_rate=0.2, image_size=None, num_classes=1000): 254 | """ Creates a efficientnet model. 
""" 255 | 256 | blocks_args = [ 257 | 'r1_k3_s11_e1_i32_o16_se0.25', 'r2_k3_s22_e6_i16_o24_se0.25', 258 | 'r2_k5_s22_e6_i24_o40_se0.25', 'r3_k3_s22_e6_i40_o80_se0.25', 259 | 'r3_k5_s11_e6_i80_o112_se0.25', 'r4_k5_s22_e6_i112_o192_se0.25', 260 | 'r1_k3_s11_e6_i192_o320_se0.25', 261 | ] 262 | blocks_args = BlockDecoder.decode(blocks_args) 263 | 264 | global_params = GlobalParams( 265 | batch_norm_momentum=0.99, 266 | batch_norm_epsilon=1e-3, 267 | dropout_rate=dropout_rate, 268 | drop_connect_rate=drop_connect_rate, 269 | # data_format='channels_last', # removed, this is always true in PyTorch 270 | num_classes=num_classes, 271 | width_coefficient=width_coefficient, 272 | depth_coefficient=depth_coefficient, 273 | depth_divisor=8, 274 | min_depth=None, 275 | image_size=image_size, 276 | ) 277 | 278 | return blocks_args, global_params 279 | 280 | 281 | def get_model_params(model_name, override_params): 282 | """ Get the block args and global params for a given model """ 283 | if model_name.startswith('efficientnet'): 284 | w, d, s, p = efficientnet_params(model_name) 285 | # note: all models have drop connect rate = 0.2 286 | blocks_args, global_params = efficientnet( 287 | width_coefficient=w, depth_coefficient=d, dropout_rate=p, image_size=s) 288 | else: 289 | raise NotImplementedError('model name is not pre-defined: %s' % model_name) 290 | if override_params: 291 | # ValueError will be raised here if override_params has fields not included in global_params. 292 | global_params = global_params._replace(**override_params) 293 | return blocks_args, global_params 294 | 295 | 296 | url_map = { 297 | 'efficientnet-b0': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b0-355c32eb.pth', 298 | 'efficientnet-b1': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b1-f1951068.pth', 299 | 'efficientnet-b2': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b2-8bb594d6.pth', 300 | 'efficientnet-b3': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b3-5fb5a3c3.pth', 301 | 'efficientnet-b4': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b4-6ed6700e.pth', 302 | 'efficientnet-b5': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b5-b6417697.pth', 303 | 'efficientnet-b6': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b6-c76e70fd.pth', 304 | 'efficientnet-b7': 'http://storage.googleapis.com/public-models/efficientnet/efficientnet-b7-dcc49843.pth', 305 | } 306 | 307 | 308 | def load_pretrained_weights(model, model_name, load_fc=True): 309 | """ Loads pretrained weights, and downloads if loading for the first time. 
""" 310 | state_dict = model_zoo.load_url(url_map[model_name]) 311 | if load_fc: 312 | model.load_state_dict(state_dict) 313 | else: 314 | state_dict.pop('_fc.weight') 315 | state_dict.pop('_fc.bias') 316 | res = model.load_state_dict(state_dict, strict=False) 317 | assert set(res.missing_keys) == set(['_fc.weight', '_fc.bias']), 'issue loading pretrained weights' 318 | print('Loaded pretrained weights for {}'.format(model_name)) 319 | -------------------------------------------------------------------------------- /eval.py: -------------------------------------------------------------------------------- 1 | """Adapted from: 2 | @longcw faster_rcnn_pytorch: https://github.com/longcw/faster_rcnn_pytorch 3 | @rbgirshick py-faster-rcnn https://github.com/rbgirshick/py-faster-rcnn 4 | Licensed under The MIT License [see LICENSE for details] 5 | """ 6 | 7 | from __future__ import print_function 8 | import torch 9 | import torch.nn as nn 10 | import torch.backends.cudnn as cudnn 11 | from torch.autograd import Variable 12 | from data import VOC_ROOT, VOCAnnotationTransform, VOCDetection, BaseTransform 13 | from data import VOC_CLASSES as labelmap 14 | import torch.utils.data as data 15 | 16 | from ssd import build_ssd 17 | 18 | import sys 19 | import os 20 | import time 21 | import argparse 22 | import numpy as np 23 | import pickle 24 | import cv2 25 | 26 | if sys.version_info[0] == 2: 27 | import xml.etree.cElementTree as ET 28 | else: 29 | import xml.etree.ElementTree as ET 30 | 31 | 32 | def str2bool(v): 33 | return v.lower() in ("yes", "true", "t", "1") 34 | 35 | 36 | parser = argparse.ArgumentParser( 37 | description='Single Shot MultiBox Detector Evaluation') 38 | parser.add_argument('--trained_model', 39 | default='/home/hwits/Documents/SSD.Pytorch/weights/ssd300_VOC_146000.pth', type=str, 40 | help='Trained state_dict file path to open') 41 | parser.add_argument('--input',default=512, type=int, choices=[300, 512], help='ssd input size, currently support ssd300 and ssd512') 42 | parser.add_argument('--save_folder', default='eval/', type=str, 43 | help='File path to save results') 44 | parser.add_argument('--confidence_threshold', default=0.01, type=float, 45 | help='Detection confidence threshold') 46 | parser.add_argument('--top_k', default=5, type=int, 47 | help='Further restrict the number of predictions to parse') 48 | parser.add_argument('--cuda', default=True, type=str2bool, 49 | help='Use cuda to train model') 50 | parser.add_argument('--voc_root', default='./data/VOCdevkit', 51 | help='Location of VOC root directory') 52 | parser.add_argument('--cleanup', default=True, type=str2bool, 53 | help='Cleanup and remove results files following eval') 54 | 55 | args = parser.parse_args() 56 | 57 | if not os.path.exists(args.save_folder): 58 | os.mkdir(args.save_folder) 59 | 60 | if torch.cuda.is_available(): 61 | if args.cuda: 62 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 63 | if not args.cuda: 64 | print("WARNING: It looks like you have a CUDA device, but aren't using \ 65 | CUDA. 
Run with --cuda for optimal eval speed.") 66 | torch.set_default_tensor_type('torch.FloatTensor') 67 | else: 68 | torch.set_default_tensor_type('torch.FloatTensor') 69 | 70 | annopath = os.path.join(args.voc_root, 'VOC2007', 'Annotations', '%s.xml') 71 | imgpath = os.path.join(args.voc_root, 'VOC2007', 'JPEGImages', '%s.jpg') 72 | imgsetpath = os.path.join(args.voc_root, 'VOC2007', 'ImageSets', 73 | 'Main', '{:s}.txt') 74 | YEAR = '2007' 75 | devkit_path = args.voc_root + 'VOC' + YEAR 76 | dataset_mean = (104, 117, 123) 77 | set_type = 'test' 78 | 79 | 80 | class Timer(object): 81 | """A simple timer.""" 82 | def __init__(self): 83 | self.total_time = 0. 84 | self.calls = 0 85 | self.start_time = 0. 86 | self.diff = 0. 87 | self.average_time = 0. 88 | 89 | def tic(self): 90 | # using time.time instead of time.clock because time time.clock 91 | # does not normalize for multithreading 92 | self.start_time = time.time() 93 | 94 | def toc(self, average=True): 95 | self.diff = time.time() - self.start_time 96 | self.total_time += self.diff 97 | self.calls += 1 98 | self.average_time = self.total_time / self.calls 99 | if average: 100 | return self.average_time 101 | else: 102 | return self.diff 103 | 104 | 105 | def parse_rec(filename): 106 | """ Parse a PASCAL VOC xml file """ 107 | tree = ET.parse(filename) 108 | objects = [] 109 | for obj in tree.findall('object'): 110 | obj_struct = {} 111 | obj_struct['name'] = obj.find('name').text 112 | obj_struct['pose'] = obj.find('pose').text 113 | obj_struct['truncated'] = int(obj.find('truncated').text) 114 | obj_struct['difficult'] = int(obj.find('difficult').text) 115 | bbox = obj.find('bndbox') 116 | obj_struct['bbox'] = [int(bbox.find('xmin').text) - 1, 117 | int(bbox.find('ymin').text) - 1, 118 | int(bbox.find('xmax').text) - 1, 119 | int(bbox.find('ymax').text) - 1] 120 | objects.append(obj_struct) 121 | 122 | return objects 123 | 124 | 125 | def get_output_dir(name, phase): 126 | """Return the directory where experimental artifacts are placed. 127 | If the directory does not exist, it is created. 128 | A canonical path is built using the name from an imdb and a network 129 | (if not None). 130 | """ 131 | filedir = os.path.join(name, phase) 132 | if not os.path.exists(filedir): 133 | os.makedirs(filedir) 134 | return filedir 135 | 136 | 137 | def get_voc_results_file_template(image_set, cls): 138 | # VOCdevkit/VOC2007/results/det_test_aeroplane.txt 139 | filename = 'det_' + image_set + '_%s.txt' % (cls) 140 | filedir = os.path.join(devkit_path, 'results') 141 | if not os.path.exists(filedir): 142 | os.makedirs(filedir) 143 | path = os.path.join(filedir, filename) 144 | return path 145 | 146 | 147 | def write_voc_results_file(all_boxes, dataset): 148 | for cls_ind, cls in enumerate(labelmap): 149 | print('Writing {:s} VOC results file'.format(cls)) 150 | filename = get_voc_results_file_template(set_type, cls) 151 | with open(filename, 'wt') as f: 152 | for im_ind, index in enumerate(dataset.ids): 153 | dets = all_boxes[cls_ind+1][im_ind] 154 | if dets == []: 155 | continue 156 | # the VOCdevkit expects 1-based indices 157 | for k in range(dets.shape[0]): 158 | f.write('{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n'. 
159 | format(index[1], dets[k, -1], 160 | dets[k, 0] + 1, dets[k, 1] + 1, 161 | dets[k, 2] + 1, dets[k, 3] + 1)) 162 | 163 | 164 | def do_python_eval(output_dir='output', use_07=True): 165 | cachedir = os.path.join(devkit_path, 'annotations_cache') 166 | aps = [] 167 | # The PASCAL VOC metric changed in 2010 168 | use_07_metric = use_07 169 | print('VOC07 metric? ' + ('Yes' if use_07_metric else 'No')) 170 | if not os.path.isdir(output_dir): 171 | os.mkdir(output_dir) 172 | for i, cls in enumerate(labelmap): 173 | filename = get_voc_results_file_template(set_type, cls) 174 | rec, prec, ap = voc_eval( 175 | filename, annopath, imgsetpath.format(set_type), cls, cachedir, 176 | ovthresh=0.5, use_07_metric=use_07_metric) 177 | aps += [ap] 178 | print('AP for {} = {:.4f}'.format(cls, ap)) 179 | with open(os.path.join(output_dir, cls + '_pr.pkl'), 'wb') as f: 180 | pickle.dump({'rec': rec, 'prec': prec, 'ap': ap}, f) 181 | print('Mean AP = {:.4f}'.format(np.mean(aps))) 182 | print('~~~~~~~~') 183 | print('Results:') 184 | for ap in aps: 185 | print('{:.3f}'.format(ap)) 186 | print('{:.3f}'.format(np.mean(aps))) 187 | print('~~~~~~~~') 188 | print('') 189 | print('--------------------------------------------------------------') 190 | print('Results computed with the **unofficial** Python eval code.') 191 | print('Results should be very close to the official MATLAB eval code.') 192 | print('--------------------------------------------------------------') 193 | 194 | 195 | def voc_ap(rec, prec, use_07_metric=True): 196 | """ ap = voc_ap(rec, prec, [use_07_metric]) 197 | Compute VOC AP given precision and recall. 198 | If use_07_metric is true, uses the 199 | VOC 07 11 point method (default:True). 200 | """ 201 | if use_07_metric: 202 | # 11 point metric 203 | ap = 0. 204 | for t in np.arange(0., 1.1, 0.1): 205 | if np.sum(rec >= t) == 0: 206 | p = 0 207 | else: 208 | p = np.max(prec[rec >= t]) 209 | ap = ap + p / 11. 210 | else: 211 | # correct AP calculation 212 | # first append sentinel values at the end 213 | mrec = np.concatenate(([0.], rec, [1.])) 214 | mpre = np.concatenate(([0.], prec, [0.])) 215 | 216 | # compute the precision envelope 217 | for i in range(mpre.size - 1, 0, -1): 218 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 219 | 220 | # to calculate area under PR curve, look for points 221 | # where X axis (recall) changes value 222 | i = np.where(mrec[1:] != mrec[:-1])[0] 223 | 224 | # and sum (\Delta recall) * prec 225 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 226 | return ap 227 | 228 | 229 | def voc_eval(detpath, 230 | annopath, 231 | imagesetfile, 232 | classname, 233 | cachedir, 234 | ovthresh=0.5, 235 | use_07_metric=True): 236 | """rec, prec, ap = voc_eval(detpath, 237 | annopath, 238 | imagesetfile, 239 | classname, 240 | [ovthresh], 241 | [use_07_metric]) 242 | Top level function that does the PASCAL VOC evaluation. 243 | detpath: Path to detections 244 | detpath.format(classname) should produce the detection results file. 245 | annopath: Path to annotations 246 | annopath.format(imagename) should be the xml annotations file. 247 | imagesetfile: Text file containing the list of images, one image per line. 
248 | classname: Category name (duh) 249 | cachedir: Directory for caching the annotations 250 | [ovthresh]: Overlap threshold (default = 0.5) 251 | [use_07_metric]: Whether to use VOC07's 11 point AP computation 252 | (default True) 253 | """ 254 | # assumes detections are in detpath.format(classname) 255 | # assumes annotations are in annopath.format(imagename) 256 | # assumes imagesetfile is a text file with each line an image name 257 | # cachedir caches the annotations in a pickle file 258 | # first load gt 259 | if not os.path.isdir(cachedir): 260 | os.mkdir(cachedir) 261 | cachefile = os.path.join(cachedir, 'annots.pkl') 262 | # read list of images 263 | with open(imagesetfile, 'r') as f: 264 | lines = f.readlines() 265 | imagenames = [x.strip() for x in lines] 266 | if not os.path.isfile(cachefile): 267 | # load annots 268 | recs = {} 269 | for i, imagename in enumerate(imagenames): 270 | recs[imagename] = parse_rec(annopath % (imagename)) 271 | if i % 100 == 0: 272 | print('Reading annotation for {:d}/{:d}'.format( 273 | i + 1, len(imagenames))) 274 | # save 275 | print('Saving cached annotations to {:s}'.format(cachefile)) 276 | with open(cachefile, 'wb') as f: 277 | pickle.dump(recs, f) 278 | else: 279 | # load 280 | with open(cachefile, 'rb') as f: 281 | recs = pickle.load(f) 282 | 283 | # extract gt objects for this class 284 | class_recs = {} 285 | npos = 0 286 | for imagename in imagenames: 287 | R = [obj for obj in recs[imagename] if obj['name'] == classname] 288 | bbox = np.array([x['bbox'] for x in R]) 289 | difficult = np.array([x['difficult'] for x in R]).astype(np.bool) 290 | det = [False] * len(R) 291 | npos = npos + sum(~difficult) 292 | class_recs[imagename] = {'bbox': bbox, 293 | 'difficult': difficult, 294 | 'det': det} 295 | 296 | # read dets 297 | detfile = detpath.format(classname) 298 | with open(detfile, 'r') as f: 299 | lines = f.readlines() 300 | if any(lines) == 1: 301 | 302 | splitlines = [x.strip().split(' ') for x in lines] 303 | image_ids = [x[0] for x in splitlines] 304 | confidence = np.array([float(x[1]) for x in splitlines]) 305 | BB = np.array([[float(z) for z in x[2:]] for x in splitlines]) 306 | 307 | # sort by confidence 308 | sorted_ind = np.argsort(-confidence) 309 | sorted_scores = np.sort(-confidence) 310 | BB = BB[sorted_ind, :] 311 | image_ids = [image_ids[x] for x in sorted_ind] 312 | 313 | # go down dets and mark TPs and FPs 314 | nd = len(image_ids) 315 | tp = np.zeros(nd) 316 | fp = np.zeros(nd) 317 | for d in range(nd): 318 | R = class_recs[image_ids[d]] 319 | bb = BB[d, :].astype(float) 320 | ovmax = -np.inf 321 | BBGT = R['bbox'].astype(float) 322 | if BBGT.size > 0: 323 | # compute overlaps 324 | # intersection 325 | ixmin = np.maximum(BBGT[:, 0], bb[0]) 326 | iymin = np.maximum(BBGT[:, 1], bb[1]) 327 | ixmax = np.minimum(BBGT[:, 2], bb[2]) 328 | iymax = np.minimum(BBGT[:, 3], bb[3]) 329 | iw = np.maximum(ixmax - ixmin, 0.) 330 | ih = np.maximum(iymax - iymin, 0.) 331 | inters = iw * ih 332 | uni = ((bb[2] - bb[0]) * (bb[3] - bb[1]) + 333 | (BBGT[:, 2] - BBGT[:, 0]) * 334 | (BBGT[:, 3] - BBGT[:, 1]) - inters) 335 | overlaps = inters / uni 336 | ovmax = np.max(overlaps) 337 | jmax = np.argmax(overlaps) 338 | 339 | if ovmax > ovthresh: 340 | if not R['difficult'][jmax]: 341 | if not R['det'][jmax]: 342 | tp[d] = 1. 343 | R['det'][jmax] = 1 344 | else: 345 | fp[d] = 1. 346 | else: 347 | fp[d] = 1. 
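        # At this point tp/fp are per-detection 0/1 indicators ordered by descending confidence;
        # the cumulative sums below turn them into a running precision/recall curve:
        #     rec[k]  = TP(k) / npos
        #     prec[k] = TP(k) / (TP(k) + FP(k))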
348 | 349 | # compute precision recall 350 | fp = np.cumsum(fp) 351 | tp = np.cumsum(tp) 352 | rec = tp / float(npos) 353 | # avoid divide by zero in case the first detection matches a difficult 354 | # ground truth 355 | prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) 356 | ap = voc_ap(rec, prec, use_07_metric) 357 | else: 358 | rec = -1. 359 | prec = -1. 360 | ap = -1. 361 | 362 | return rec, prec, ap 363 | 364 | 365 | def test_net(save_folder, net, cuda, dataset, transform, top_k, 366 | im_size=300, thresh=0.05): 367 | num_images = len(dataset) 368 | # all detections are collected into: 369 | # all_boxes[cls][image] = N x 5 array of detections in 370 | # (x1, y1, x2, y2, score) 371 | all_boxes = [[[] for _ in range(num_images)] 372 | for _ in range(len(labelmap)+1)] 373 | 374 | # timers 375 | _t = {'im_detect': Timer(), 'misc': Timer()} 376 | output_dir = get_output_dir('ssd300_120000', set_type) 377 | det_file = os.path.join(output_dir, 'detections.pkl') 378 | 379 | for i in range(num_images): 380 | im, gt, h, w = dataset.pull_item(i) 381 | 382 | x = Variable(im.unsqueeze(0)) 383 | if args.cuda: 384 | x = x.cuda() 385 | _t['im_detect'].tic() 386 | detections = net(x).data 387 | detect_time = _t['im_detect'].toc(average=False) 388 | 389 | # skip j = 0, because it's the background class 390 | for j in range(1, detections.size(1)): 391 | dets = detections[0, j, :] 392 | mask = dets[:, 0].gt(0.).expand(5, dets.size(0)).t() 393 | dets = torch.masked_select(dets, mask).view(-1, 5) 394 | if dets.size(0) == 0: 395 | continue 396 | boxes = dets[:, 1:] 397 | boxes[:, 0] *= w 398 | boxes[:, 2] *= w 399 | boxes[:, 1] *= h 400 | boxes[:, 3] *= h 401 | scores = dets[:, 0].cpu().numpy() 402 | cls_dets = np.hstack((boxes.cpu().numpy(), 403 | scores[:, np.newaxis])).astype(np.float32, 404 | copy=False) 405 | all_boxes[j][i] = cls_dets 406 | 407 | print('im_detect: {:d}/{:d} {:.3f}s'.format(i + 1, 408 | num_images, detect_time)) 409 | 410 | with open(det_file, 'wb') as f: 411 | pickle.dump(all_boxes, f, pickle.HIGHEST_PROTOCOL) 412 | 413 | print('Evaluating detections') 414 | evaluate_detections(all_boxes, output_dir, dataset) 415 | 416 | 417 | def evaluate_detections(box_list, output_dir, dataset): 418 | write_voc_results_file(box_list, dataset) 419 | do_python_eval(output_dir) 420 | 421 | 422 | if __name__ == '__main__': 423 | # load net 424 | num_classes = len(labelmap) + 1 # +1 for background 425 | net = build_ssd('test', args.input, num_classes) # initialize SSD 426 | net.load_state_dict(torch.load(args.trained_model)) 427 | net.eval() 428 | print('Finished loading model!') 429 | # load data 430 | dataset = VOCDetection(args.voc_root, [('2007', set_type)], 431 | BaseTransform(args.input, dataset_mean), 432 | VOCAnnotationTransform()) 433 | if args.cuda: 434 | net = net.cuda() 435 | cudnn.benchmark = True 436 | # evaluation 437 | test_net(args.save_folder, net, args.cuda, dataset, 438 | BaseTransform(net.size, dataset_mean), args.top_k, args.input, 439 | thresh=args.confidence_threshold) 440 | -------------------------------------------------------------------------------- /img/SSDplate.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/midasklr/SSD.Pytorch/33ec443c0a7f3facbaa0643f4c04d4c3dda3cf53/img/SSDplate.jpeg -------------------------------------------------------------------------------- /img/Screenshot from 2020-02-15 20-09-16.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/midasklr/SSD.Pytorch/33ec443c0a7f3facbaa0643f4c04d4c3dda3cf53/img/Screenshot from 2020-02-15 20-09-16.png -------------------------------------------------------------------------------- /img/image-20200215220618684.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/midasklr/SSD.Pytorch/33ec443c0a7f3facbaa0643f4c04d4c3dda3cf53/img/image-20200215220618684.png -------------------------------------------------------------------------------- /img/loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/midasklr/SSD.Pytorch/33ec443c0a7f3facbaa0643f4c04d4c3dda3cf53/img/loss.png -------------------------------------------------------------------------------- /img/map_epoch.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/midasklr/SSD.Pytorch/33ec443c0a7f3facbaa0643f4c04d4c3dda3cf53/img/map_epoch.png -------------------------------------------------------------------------------- /img/resut.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/midasklr/SSD.Pytorch/33ec443c0a7f3facbaa0643f4c04d4c3dda3cf53/img/resut.jpg -------------------------------------------------------------------------------- /layers/__init__.py: -------------------------------------------------------------------------------- 1 | from .functions import * 2 | from .modules import * 3 | -------------------------------------------------------------------------------- /layers/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/midasklr/SSD.Pytorch/33ec443c0a7f3facbaa0643f4c04d4c3dda3cf53/layers/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /layers/__pycache__/box_utils.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/midasklr/SSD.Pytorch/33ec443c0a7f3facbaa0643f4c04d4c3dda3cf53/layers/__pycache__/box_utils.cpython-37.pyc -------------------------------------------------------------------------------- /layers/box_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import torch 3 | 4 | 5 | def point_form(boxes): 6 | """ Convert prior_boxes to (xmin, ymin, xmax, ymax) 7 | representation for comparison to point form ground truth data. 8 | Args: 9 | boxes: (tensor) center-size default boxes from priorbox layers. 10 | Return: 11 | boxes: (tensor) Converted xmin, ymin, xmax, ymax form of boxes. 12 | """ 13 | return torch.cat((boxes[:, :2] - boxes[:, 2:]/2, # xmin, ymin 14 | boxes[:, :2] + boxes[:, 2:]/2), 1) # xmax, ymax 15 | 16 | 17 | def center_size(boxes): 18 | """ Convert prior_boxes to (cx, cy, w, h) 19 | representation for comparison to center-size form ground truth data. 20 | Args: 21 | boxes: (tensor) point_form boxes 22 | Return: 23 | boxes: (tensor) Converted xmin, ymin, xmax, ymax form of boxes. 
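        Note: the returned boxes are actually in (cx, cy, w, h) form, and torch.cat needs the two
        halves wrapped in a single tuple; a corrected sketch of the call below:

            return torch.cat(((boxes[:, 2:] + boxes[:, :2]) / 2,    # cx, cy
                              boxes[:, 2:] - boxes[:, :2]), 1)      # w, h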
24 | """ 25 | return torch.cat((boxes[:, 2:] + boxes[:, :2])/2, # cx, cy 26 | boxes[:, 2:] - boxes[:, :2], 1) # w, h 27 | 28 | 29 | def intersect(box_a, box_b): 30 | """ We resize both tensors to [A,B,2] without new malloc: 31 | [A,2] -> [A,1,2] -> [A,B,2] 32 | [B,2] -> [1,B,2] -> [A,B,2] 33 | Then we compute the area of intersect between box_a and box_b. 34 | Args: 35 | box_a: (tensor) bounding boxes, Shape: [A,4]. 36 | box_b: (tensor) bounding boxes, Shape: [B,4]. 37 | Return: 38 | (tensor) intersection area, Shape: [A,B]. 39 | """ 40 | A = box_a.size(0) 41 | B = box_b.size(0) 42 | max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2), 43 | box_b[:, 2:].unsqueeze(0).expand(A, B, 2)) 44 | min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2), 45 | box_b[:, :2].unsqueeze(0).expand(A, B, 2)) 46 | inter = torch.clamp((max_xy - min_xy), min=0) 47 | return inter[:, :, 0] * inter[:, :, 1] 48 | 49 | 50 | def jaccard(box_a, box_b): 51 | """Compute the jaccard overlap of two sets of boxes. The jaccard overlap 52 | is simply the intersection over union of two boxes. Here we operate on 53 | ground truth boxes and default boxes. 54 | E.g.: 55 | A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B) 56 | Args: 57 | box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4] 58 | box_b: (tensor) Prior boxes from priorbox layers, Shape: [num_priors,4] 59 | Return: 60 | jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)] 61 | """ 62 | inter = intersect(box_a, box_b) 63 | area_a = ((box_a[:, 2]-box_a[:, 0]) * 64 | (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B] 65 | area_b = ((box_b[:, 2]-box_b[:, 0]) * 66 | (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B] 67 | union = area_a + area_b - inter 68 | return inter / union # [A,B] 69 | 70 | 71 | def match(threshold, truths, priors, variances, labels, loc_t, conf_t, idx): 72 | """Match each prior box with the ground truth box of the highest jaccard 73 | overlap, encode the bounding boxes, then return the matched indices 74 | corresponding to both confidence and location preds. 75 | Args: 76 | threshold: (float) The overlap threshold used when mathing boxes. 77 | truths: (tensor) Ground truth boxes, Shape: [num_obj, num_priors]. 78 | priors: (tensor) Prior boxes from priorbox layers, Shape: [n_priors,4]. 79 | variances: (tensor) Variances corresponding to each prior coord, 80 | Shape: [num_priors, 4]. 81 | labels: (tensor) All the class labels for the image, Shape: [num_obj]. 82 | loc_t: (tensor) Tensor to be filled w/ endcoded location targets. 83 | conf_t: (tensor) Tensor to be filled w/ matched indices for conf preds. 84 | idx: (int) current batch index 85 | Return: 86 | The matched indices corresponding to 1)location and 2)confidence preds. 
87 | """ 88 | # jaccard index 89 | overlaps = jaccard( 90 | truths, 91 | point_form(priors) 92 | ) 93 | # (Bipartite Matching) 94 | # [1,num_objects] best prior for each ground truth 95 | best_prior_overlap, best_prior_idx = overlaps.max(1, keepdim=True) 96 | # [1,num_priors] best ground truth for each prior 97 | best_truth_overlap, best_truth_idx = overlaps.max(0, keepdim=True) 98 | best_truth_idx.squeeze_(0) 99 | best_truth_overlap.squeeze_(0) 100 | best_prior_idx.squeeze_(1) 101 | best_prior_overlap.squeeze_(1) 102 | best_truth_overlap.index_fill_(0, best_prior_idx, 2) # ensure best prior 103 | # TODO refactor: index best_prior_idx with long tensor 104 | # ensure every gt matches with its prior of max overlap 105 | for j in range(best_prior_idx.size(0)): 106 | best_truth_idx[best_prior_idx[j]] = j 107 | matches = truths[best_truth_idx] # Shape: [num_priors,4] 108 | conf = labels[best_truth_idx] + 1 # Shape: [num_priors] 109 | conf[best_truth_overlap < threshold] = 0 # label as background 110 | loc = encode(matches, priors, variances) 111 | loc_t[idx] = loc # [num_priors,4] encoded offsets to learn 112 | conf_t[idx] = conf # [num_priors] top class label for each prior 113 | 114 | 115 | def encode(matched, priors, variances): 116 | """Encode the variances from the priorbox layers into the ground truth boxes 117 | we have matched (based on jaccard overlap) with the prior boxes. 118 | Args: 119 | matched: (tensor) Coords of ground truth for each prior in point-form 120 | Shape: [num_priors, 4]. 121 | priors: (tensor) Prior boxes in center-offset form 122 | Shape: [num_priors,4]. 123 | variances: (list[float]) Variances of priorboxes 124 | Return: 125 | encoded boxes (tensor), Shape: [num_priors, 4] 126 | """ 127 | 128 | # dist b/t match center and prior's center 129 | g_cxcy = (matched[:, :2] + matched[:, 2:])/2 - priors[:, :2] 130 | # encode variance 131 | g_cxcy /= (variances[0] * priors[:, 2:]) 132 | # match wh / prior wh 133 | g_wh = (matched[:, 2:] - matched[:, :2]) / priors[:, 2:] 134 | g_wh = torch.log(g_wh) / variances[1] 135 | # return target for smooth_l1_loss 136 | return torch.cat([g_cxcy, g_wh], 1) # [num_priors,4] 137 | 138 | 139 | # Adapted from https://github.com/Hakuyume/chainer-ssd 140 | def decode(loc, priors, variances): 141 | """Decode locations from predictions using priors to undo 142 | the encoding we did for offset regression at train time. 143 | Args: 144 | loc (tensor): location predictions for loc layers, 145 | Shape: [num_priors,4] 146 | priors (tensor): Prior boxes in center-offset form. 147 | Shape: [num_priors,4]. 148 | variances: (list[float]) Variances of priorboxes 149 | Return: 150 | decoded bounding box predictions 151 | """ 152 | 153 | boxes = torch.cat(( 154 | priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:], 155 | priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1) 156 | boxes[:, :2] -= boxes[:, 2:] / 2 157 | boxes[:, 2:] += boxes[:, :2] 158 | return boxes 159 | 160 | 161 | def log_sum_exp(x): 162 | """Utility function for computing log_sum_exp while determining 163 | This will be used to determine unaveraged confidence loss across 164 | all examples in a batch. 
165 | Args: 166 | x (Variable(tensor)): conf_preds from conf layers 167 | """ 168 | x_max = x.data.max() 169 | return torch.log(torch.sum(torch.exp(x-x_max), 1, keepdim=True)) + x_max 170 | 171 | 172 | # Original author: Francisco Massa: 173 | # https://github.com/fmassa/object-detection.torch 174 | # Ported to PyTorch by Max deGroot (02/01/2017) 175 | def nms(boxes, scores, overlap=0.5, top_k=200): 176 | """Apply non-maximum suppression at test time to avoid detecting too many 177 | overlapping bounding boxes for a given object. 178 | Args: 179 | boxes: (tensor) The location preds for the img, Shape: [num_priors,4]. 180 | scores: (tensor) The class predscores for the img, Shape:[num_priors]. 181 | overlap: (float) The overlap thresh for suppressing unnecessary boxes. 182 | top_k: (int) The Maximum number of box preds to consider. 183 | Return: 184 | The indices of the kept boxes with respect to num_priors. 185 | """ 186 | 187 | keep = scores.new(scores.size(0)).zero_().long() 188 | if boxes.numel() == 0: 189 | return keep 190 | x1 = boxes[:, 0] 191 | y1 = boxes[:, 1] 192 | x2 = boxes[:, 2] 193 | y2 = boxes[:, 3] 194 | area = torch.mul(x2 - x1, y2 - y1) 195 | v, idx = scores.sort(0) # sort in ascending order 196 | # I = I[v >= 0.01] 197 | idx = idx[-top_k:] # indices of the top-k largest vals 198 | xx1 = boxes.new() 199 | yy1 = boxes.new() 200 | xx2 = boxes.new() 201 | yy2 = boxes.new() 202 | w = boxes.new() 203 | h = boxes.new() 204 | 205 | # keep = torch.Tensor() 206 | count = 0 207 | while idx.numel() > 0: 208 | i = idx[-1] # index of current largest val 209 | # keep.append(i) 210 | keep[count] = i 211 | count += 1 212 | if idx.size(0) == 1: 213 | break 214 | idx = idx[:-1] # remove kept element from view 215 | # load bboxes of next highest vals 216 | torch.index_select(x1, 0, idx, out=xx1) 217 | torch.index_select(y1, 0, idx, out=yy1) 218 | torch.index_select(x2, 0, idx, out=xx2) 219 | torch.index_select(y2, 0, idx, out=yy2) 220 | # store element-wise max with next highest score 221 | xx1 = torch.clamp(xx1, min=x1[i]) 222 | yy1 = torch.clamp(yy1, min=y1[i]) 223 | xx2 = torch.clamp(xx2, max=x2[i]) 224 | yy2 = torch.clamp(yy2, max=y2[i]) 225 | w.resize_as_(xx2) 226 | h.resize_as_(yy2) 227 | w = xx2 - xx1 228 | h = yy2 - yy1 229 | # check sizes of xx1 and xx2.. 
after each iteration 230 | w = torch.clamp(w, min=0.0) 231 | h = torch.clamp(h, min=0.0) 232 | inter = w*h 233 | # IoU = i / (area(a) + area(b) - i) 234 | rem_areas = torch.index_select(area, 0, idx) # load remaining areas) 235 | union = (rem_areas - inter) + area[i] 236 | IoU = inter/union # store result in iou 237 | # keep only elements with an IoU <= overlap 238 | idx = idx[IoU.le(overlap)] 239 | return keep, count 240 | -------------------------------------------------------------------------------- /layers/functions/__init__.py: -------------------------------------------------------------------------------- 1 | from .detection import Detect 2 | from .prior_box import PriorBox 3 | 4 | 5 | __all__ = ['Detect', 'PriorBox'] 6 | -------------------------------------------------------------------------------- /layers/functions/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/midasklr/SSD.Pytorch/33ec443c0a7f3facbaa0643f4c04d4c3dda3cf53/layers/functions/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /layers/functions/__pycache__/detection.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/midasklr/SSD.Pytorch/33ec443c0a7f3facbaa0643f4c04d4c3dda3cf53/layers/functions/__pycache__/detection.cpython-37.pyc -------------------------------------------------------------------------------- /layers/functions/__pycache__/prior_box.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/midasklr/SSD.Pytorch/33ec443c0a7f3facbaa0643f4c04d4c3dda3cf53/layers/functions/__pycache__/prior_box.cpython-37.pyc -------------------------------------------------------------------------------- /layers/functions/detection.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Function 3 | from ..box_utils import decode, nms 4 | from data import voc as cfg 5 | 6 | 7 | class Detect(Function): 8 | """At test time, Detect is the final layer of SSD. Decode location preds, 9 | apply non-maximum suppression to location predictions based on conf 10 | scores and threshold to a top_k number of output predictions for both 11 | confidence score and locations. 12 | """ 13 | def __init__(self, num_classes, size, bkg_label, top_k, conf_thresh, nms_thresh): 14 | self.num_classes = num_classes 15 | self.background_label = bkg_label 16 | self.top_k = top_k 17 | # Parameters used in nms. 
18 | self.nms_thresh = nms_thresh 19 | if nms_thresh <= 0: 20 | raise ValueError('nms_threshold must be non negative.') 21 | self.conf_thresh = conf_thresh 22 | self.variance = cfg['SSD{}'.format(size)]['variance'] 23 | 24 | def forward(self, loc_data, conf_data, prior_data): 25 | """ 26 | Args: 27 | loc_data: (tensor) Loc preds from loc layers 28 | Shape: [batch,num_priors*4] 29 | conf_data: (tensor) Shape: Conf preds from conf layers 30 | Shape: [batch*num_priors,num_classes] 31 | prior_data: (tensor) Prior boxes and variances from priorbox layers 32 | Shape: [1,num_priors,4] 33 | """ 34 | num = loc_data.size(0) # batch size 35 | num_priors = prior_data.size(0) 36 | output = torch.zeros(num, self.num_classes, self.top_k, 5) 37 | print('conf_data size:',conf_data.size()) 38 | conf_preds = conf_data.transpose(2,1) 39 | conf_preds = conf_data.view(num, num_priors, 40 | self.num_classes).transpose(2, 1) 41 | print('conf_preds size:',conf_preds.size()) 42 | 43 | # Decode predictions into bboxes. 44 | for i in range(num): 45 | decoded_boxes = decode(loc_data[i], prior_data, self.variance) 46 | # For each class, perform nms 47 | conf_scores = conf_preds[i].clone() 48 | 49 | for cl in range(1, self.num_classes): 50 | c_mask = conf_scores[cl].gt(self.conf_thresh) 51 | scores = conf_scores[cl][c_mask] 52 | if scores.size(0) == 0: 53 | continue 54 | l_mask = c_mask.unsqueeze(1).expand_as(decoded_boxes) 55 | boxes = decoded_boxes[l_mask].view(-1, 4) 56 | # idx of highest scoring and non-overlapping boxes per class 57 | ids, count = nms(boxes, scores, self.nms_thresh, self.top_k) 58 | output[i, cl, :count] = \ 59 | torch.cat((scores[ids[:count]].unsqueeze(1), 60 | boxes[ids[:count]]), 1) 61 | flt = output.contiguous().view(num, -1, 5) 62 | _, idx = flt[:, :, 0].sort(1, descending=True) 63 | _, rank = idx.sort(1) 64 | flt[(rank < self.top_k).unsqueeze(-1).expand_as(flt)].fill_(0) 65 | return output 66 | -------------------------------------------------------------------------------- /layers/functions/prior_box.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from math import sqrt as sqrt 3 | from itertools import product as product 4 | import torch 5 | 6 | 7 | class PriorBox(object): 8 | """Compute priorbox coordinates in center-offset form for each source 9 | feature map. 
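    Example (a minimal sketch, assuming the voc config dict from data/config.py, e.g. voc['SSD300']):

        priorbox = PriorBox(voc['SSD300'])
        priors = priorbox.forward()   # [num_priors, 4] boxes in (cx, cy, w, h) form, clipped to [0, 1] if cfg['clip'] is set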
10 | """ 11 | def __init__(self, cfg): 12 | super(PriorBox, self).__init__() 13 | self.image_size = cfg['min_dim'] 14 | # number of priors for feature map location (either 4 or 6) 15 | self.num_priors = len(cfg['aspect_ratios']) 16 | self.variance = cfg['variance'] or [0.1] 17 | self.feature_maps = cfg['feature_maps'] 18 | self.min_sizes = cfg['min_sizes'] 19 | self.max_sizes = cfg['max_sizes'] 20 | self.steps = cfg['steps'] 21 | self.aspect_ratios = cfg['aspect_ratios'] 22 | self.clip = cfg['clip'] 23 | self.version = cfg['name'] 24 | for v in self.variance: 25 | if v <= 0: 26 | raise ValueError('Variances must be greater than 0') 27 | 28 | def forward(self): 29 | mean = [] 30 | for k, f in enumerate(self.feature_maps): 31 | for i, j in product(range(f), repeat=2): 32 | f_k = self.image_size / self.steps[k] 33 | # unit center x,y 34 | cx = (j + 0.5) / f_k 35 | cy = (i + 0.5) / f_k 36 | 37 | # aspect_ratio: 1 38 | # rel size: min_size 39 | s_k = self.min_sizes[k]/self.image_size 40 | mean += [cx, cy, s_k, s_k] 41 | 42 | # aspect_ratio: 1 43 | # rel size: sqrt(s_k * s_(k+1)) 44 | s_k_prime = sqrt(s_k * (self.max_sizes[k]/self.image_size)) 45 | mean += [cx, cy, s_k_prime, s_k_prime] 46 | 47 | # rest of aspect ratios 48 | for ar in self.aspect_ratios[k]: 49 | mean += [cx, cy, s_k*sqrt(ar), s_k/sqrt(ar)] 50 | mean += [cx, cy, s_k/sqrt(ar), s_k*sqrt(ar)] 51 | # back to torch land 52 | output = torch.Tensor(mean).view(-1, 4) 53 | if self.clip: 54 | output.clamp_(max=1, min=0) 55 | return output 56 | -------------------------------------------------------------------------------- /layers/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .l2norm import L2Norm 2 | from .multibox_loss import MultiBoxLoss 3 | 4 | __all__ = ['L2Norm', 'MultiBoxLoss'] 5 | -------------------------------------------------------------------------------- /layers/modules/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/midasklr/SSD.Pytorch/33ec443c0a7f3facbaa0643f4c04d4c3dda3cf53/layers/modules/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /layers/modules/__pycache__/l2norm.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/midasklr/SSD.Pytorch/33ec443c0a7f3facbaa0643f4c04d4c3dda3cf53/layers/modules/__pycache__/l2norm.cpython-37.pyc -------------------------------------------------------------------------------- /layers/modules/__pycache__/multibox_loss.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/midasklr/SSD.Pytorch/33ec443c0a7f3facbaa0643f4c04d4c3dda3cf53/layers/modules/__pycache__/multibox_loss.cpython-37.pyc -------------------------------------------------------------------------------- /layers/modules/l2norm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.autograd import Function 4 | from torch.autograd import Variable 5 | import torch.nn.init as init 6 | 7 | class L2Norm(nn.Module): 8 | def __init__(self,n_channels, scale): 9 | super(L2Norm,self).__init__() 10 | self.n_channels = n_channels 11 | self.gamma = scale or None 12 | self.eps = 1e-10 13 | self.weight = nn.Parameter(torch.Tensor(self.n_channels)) 14 | 
self.reset_parameters() 15 | 16 | def reset_parameters(self): 17 | init.constant_(self.weight,self.gamma) 18 | 19 | def forward(self, x): 20 | norm = x.pow(2).sum(dim=1, keepdim=True).sqrt()+self.eps 21 | #x /= norm 22 | x = torch.div(x,norm) 23 | out = self.weight.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as(x) * x 24 | return out 25 | -------------------------------------------------------------------------------- /layers/modules/multibox_loss.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from torch.autograd import Variable 6 | from data import coco as cfg 7 | from ..box_utils import match, log_sum_exp 8 | 9 | 10 | class MultiBoxLoss(nn.Module): 11 | """SSD Weighted Loss Function 12 | Compute Targets: 13 | 1) Produce Confidence Target Indices by matching ground truth boxes 14 | with (default) 'priorboxes' that have jaccard index > threshold parameter 15 | (default threshold: 0.5). 16 | 2) Produce localization target by 'encoding' variance into offsets of ground 17 | truth boxes and their matched 'priorboxes'. 18 | 3) Hard negative mining to filter the excessive number of negative examples 19 | that comes with using a large number of default bounding boxes. 20 | (default negative:positive ratio 3:1) 21 | Objective Loss: 22 | L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N 23 | Where, Lconf is the CrossEntropy Loss and Lloc is the SmoothL1 Loss 24 | weighted by α which is set to 1 by cross val. 25 | Args: 26 | c: class confidences, 27 | l: predicted boxes, 28 | g: ground truth boxes 29 | N: number of matched default boxes 30 | See: https://arxiv.org/pdf/1512.02325.pdf for more details. 31 | """ 32 | 33 | def __init__(self, num_classes, overlap_thresh, prior_for_matching, 34 | bkg_label, neg_mining, neg_pos, neg_overlap, encode_target, 35 | use_gpu=True): 36 | super(MultiBoxLoss, self).__init__() 37 | self.use_gpu = use_gpu 38 | self.num_classes = num_classes 39 | self.threshold = overlap_thresh 40 | self.background_label = bkg_label 41 | self.encode_target = encode_target 42 | self.use_prior_for_matching = prior_for_matching 43 | self.do_neg_mining = neg_mining 44 | self.negpos_ratio = neg_pos 45 | self.neg_overlap = neg_overlap 46 | self.variance = cfg['variance'] 47 | 48 | def forward(self, predictions, targets): 49 | """Multibox Loss 50 | Args: 51 | predictions (tuple): A tuple containing loc preds, conf preds, 52 | and prior boxes from SSD net. 53 | conf shape: torch.size(batch_size,num_priors,num_classes) 54 | loc shape: torch.size(batch_size,num_priors,4) 55 | priors shape: torch.size(num_priors,4) 56 | 57 | targets (tensor): Ground truth boxes and labels for a batch, 58 | shape: [batch_size,num_objs,5] (last idx is the label). 
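Shape example (editor's note, assuming a batch of 16 VOC images): loc preds are
[16, num_priors, 4] and conf preds are [16, num_priors, 21]; with the default 3:1
negative:positive ratio, an image with 100 matched priors keeps at most 300
hard-negative priors for the confidence loss.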
59 | """ 60 | loc_data, conf_data, priors = predictions 61 | num = loc_data.size(0) 62 | priors = priors[:loc_data.size(1), :] 63 | num_priors = (priors.size(0)) 64 | num_classes = self.num_classes 65 | 66 | # match priors (default boxes) and ground truth boxes 67 | loc_t = torch.Tensor(num, num_priors, 4) 68 | conf_t = torch.LongTensor(num, num_priors) 69 | for idx in range(num): 70 | truths = targets[idx][:, :-1].data 71 | labels = targets[idx][:, -1].data 72 | defaults = priors.data 73 | match(self.threshold, truths, defaults, self.variance, labels, 74 | loc_t, conf_t, idx) 75 | if self.use_gpu: 76 | loc_t = loc_t.cuda() 77 | conf_t = conf_t.cuda() 78 | # wrap targets 79 | loc_t = Variable(loc_t, requires_grad=False) 80 | conf_t = Variable(conf_t, requires_grad=False) 81 | 82 | pos = conf_t > 0 83 | num_pos = pos.sum(dim=1, keepdim=True) 84 | 85 | # Localization Loss (Smooth L1) 86 | # Shape: [batch,num_priors,4] 87 | pos_idx = pos.unsqueeze(pos.dim()).expand_as(loc_data) 88 | loc_p = loc_data[pos_idx].view(-1, 4) 89 | loc_t = loc_t[pos_idx].view(-1, 4) 90 | loss_l = F.smooth_l1_loss(loc_p, loc_t, reduction='sum') 91 | 92 | # Compute max conf across batch for hard negative mining 93 | batch_conf = conf_data.view(-1, self.num_classes) 94 | loss_c = log_sum_exp(batch_conf) - batch_conf.gather(1, conf_t.view(-1, 1)) 95 | loss_c = loss_c.view(pos.size()[0], pos.size()[1]) 96 | # Hard Negative Mining 97 | loss_c[pos] = 0 # filter out pos boxes for now 98 | loss_c = loss_c.view(num, -1) 99 | _, loss_idx = loss_c.sort(1, descending=True) 100 | _, idx_rank = loss_idx.sort(1) 101 | num_pos = pos.long().sum(1, keepdim=True) 102 | num_neg = torch.clamp(self.negpos_ratio*num_pos, max=pos.size(1)-1) 103 | neg = idx_rank < num_neg.expand_as(idx_rank) 104 | 105 | # Confidence Loss Including Positive and Negative Examples 106 | pos_idx = pos.unsqueeze(2).expand_as(conf_data) 107 | neg_idx = neg.unsqueeze(2).expand_as(conf_data) 108 | conf_p = conf_data[(pos_idx+neg_idx).gt(0)].view(-1, self.num_classes) 109 | targets_weighted = conf_t[(pos+neg).gt(0)] 110 | loss_c = F.cross_entropy(conf_p, targets_weighted, reduction='sum') 111 | 112 | # Sum of losses: L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N 113 | 114 | N = num_pos.data.sum() 115 | loss_l /= N 116 | loss_c /= N 117 | return loss_l, loss_c 118 | -------------------------------------------------------------------------------- /ssd.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | from layers import * 6 | from data import voc, coco 7 | import os 8 | from efficientnet_pytorch import EfficientNet 9 | 10 | 11 | class SSD(nn.Module): 12 | """Single Shot Multibox Architecture 13 | The network is composed of a base VGG network followed by the 14 | added multibox conv layers. Each multibox layer branches into 15 | 1) conv2d for class conf scores 16 | 2) conv2d for localization predictions 17 | 3) associated priorbox layer to produce default bounding 18 | boxes specific to the layer's feature map size. 19 | See: https://arxiv.org/pdf/1512.02325.pdf for more details. 
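Editor's note: for the VGG16 backbone at size 300 the six source maps are conv4_3
(L2-normalised), conv7/fc7, and the four extra feature layers, with
mbox['300'] = [4, 6, 6, 6, 4, 4] default boxes predicted per location on each map.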
20 | 21 | Args: 22 | phase: (string) Can be "test" or "train" 23 | size: input image size 24 | base: VGG16 layers for input, size of either 300 or 512 25 | extras: extra layers that feed to multibox loc and conf layers 26 | head: "multibox head" consists of loc and conf conv layers 27 | """ 28 | 29 | def __init__(self, phase, size, base, extras, head, num_classes): 30 | super(SSD, self).__init__() 31 | self.phase = phase 32 | self.num_classes = num_classes 33 | self.cfg = voc['SSD{}'.format(size)] 34 | self.priorbox = PriorBox(self.cfg) 35 | with torch.no_grad(): 36 | self.priors = Variable(self.priorbox.forward()) 37 | self.size = size 38 | 39 | # SSD network 40 | self.base = nn.ModuleList(base) 41 | # Layer learns to scale the l2 normalized features from conv4_3 42 | self.L2Norm = L2Norm(512, 20) 43 | self.extras = nn.ModuleList(extras) 44 | 45 | self.loc = nn.ModuleList(head[0]) 46 | self.conf = nn.ModuleList(head[1]) 47 | 48 | if phase == 'test': 49 | self.softmax = nn.Softmax(dim=-1) 50 | self.detect = Detect(num_classes, size, 0, 200, 0.01, 0.45) 51 | 52 | def forward(self, x): 53 | """Applies network layers and ops on input image(s) x. 54 | 55 | Args: 56 | x: input image or batch of images. Shape: [batch,3,300,300]. 57 | 58 | Return: 59 | Depending on phase: 60 | test: 61 | Variable(tensor) of output class label predictions, 62 | confidence score, and corresponding location predictions for 63 | each object detected. Shape: [batch,topk,7] 64 | 65 | train: 66 | list of concat outputs from: 67 | 1: confidence layers, Shape: [batch*num_priors,num_classes] 68 | 2: localization layers, Shape: [batch,num_priors*4] 69 | 3: priorbox layers, Shape: [2,num_priors*4] 70 | """ 71 | sources = list() 72 | loc = list() 73 | conf = list() 74 | 75 | # apply vgg up to conv4_3 relu 76 | for k in range(23): 77 | x = self.base[k](x) 78 | 79 | s = self.L2Norm(x) 80 | sources.append(s) 81 | 82 | # apply vgg up to fc7 83 | for k in range(23, len(self.base)): 84 | x = self.base[k](x) 85 | sources.append(x) 86 | 87 | # apply extra layers and cache source layer outputs 88 | for k, v in enumerate(self.extras): 89 | x = F.relu(v(x), inplace=True) 90 | if k % 2 == 1: 91 | sources.append(x) 92 | 93 | # apply multibox head to source layers 94 | for (x, l, c) in zip(sources, self.loc, self.conf): 95 | loc.append(l(x).permute(0, 2, 3, 1).contiguous()) 96 | conf.append(c(x).permute(0, 2, 3, 1).contiguous()) 97 | 98 | loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1) 99 | conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1) 100 | if self.phase == "test": 101 | output = self.detect( 102 | loc.view(loc.size(0), -1, 4), # loc preds 103 | self.softmax(conf.view(conf.size(0), -1, 104 | self.num_classes)), # conf preds 105 | self.priors.type(type(x.data)) # default boxes 106 | ) 107 | else: 108 | output = ( 109 | loc.view(loc.size(0), -1, 4), 110 | conf.view(conf.size(0), -1, self.num_classes), 111 | self.priors 112 | ) 113 | return output 114 | 115 | def load_weights(self, base_file): 116 | other, ext = os.path.splitext(base_file) 117 | if ext == '.pkl' or '.pth': 118 | print('Begin loading weights into state dict...') 119 | self.load_state_dict(torch.load(base_file, 120 | map_location=lambda storage, loc: storage)) 121 | print('Finished!') 122 | else: 123 | print('Sorry only .pth and .pkl files supported.') 124 | 125 | 126 | # This function is derived from torchvision VGG make_layers() 127 | # https://github.com/pytorch/vision/blob/master/torchvision/models/vgg.py 128 | def vgg(cfg, i = 3, batch_norm=False): 
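# Editor's note: cfg is one of the lists in the base dict further down, e.g.
# base['300'] = [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M', 512, 512, 512];
# integers are Conv2d output channels (each followed by ReLU), 'M' inserts MaxPool2d(2, 2),
# 'C' inserts a ceil_mode max-pool, and i is the number of input channels (3 for RGB).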
129 | layers = [] 130 | in_channels = i 131 | for v in cfg: 132 | if v == 'M': 133 | layers += [nn.MaxPool2d(kernel_size=2, stride=2)] 134 | elif v == 'C': 135 | layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)] 136 | else: 137 | conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1) 138 | if batch_norm: 139 | layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)] 140 | else: 141 | layers += [conv2d, nn.ReLU(inplace=True)] 142 | in_channels = v 143 | pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1) 144 | conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6) 145 | conv7 = nn.Conv2d(1024, 1024, kernel_size=1) 146 | layers += [pool5, conv6, 147 | nn.ReLU(inplace=True), conv7, nn.ReLU(inplace=True)] 148 | print('VGG base:',layers) 149 | return layers 150 | 151 | def efficientnet_base(batch_norm=False): 152 | base_model = EfficientNet.from_name('efficientnet-b4') 153 | layer1 = [base_model._conv_stem, base_model._bn0] 154 | layer2 = [base_model._blocks[0],base_model._blocks[1],base_model._blocks[2]] 155 | layer3 = [base_model._blocks[3],base_model._blocks[4],base_model._blocks[5],base_model._blocks[6]] 156 | layer4 = [base_model._blocks[7],base_model._blocks[8],base_model._blocks[9],base_model._blocks[10]] 157 | layer5 = [base_model._blocks[11],base_model._blocks[12],base_model._blocks[13],base_model._blocks[14],base_model._blocks[15],base_model._blocks[16],base_model._blocks[17],base_model._blocks[18],base_model._blocks[19],base_model._blocks[20],base_model._blocks[21],base_model._blocks[22]] 158 | print('base network:', layer1 + layer2 + layer3 + layer4 + layer5) 159 | return layer1 + layer2 + layer3 + layer4 + layer5 160 | 161 | 162 | def add_extras(cfg, i, batch_norm=False): 163 | # Extra layers added to VGG for feature scaling 164 | layers = [] 165 | in_channels = i 166 | flag = False 167 | for k, v in enumerate(cfg): 168 | if in_channels != 'S': 169 | if v == 'S': 170 | layers += [nn.Conv2d(in_channels, cfg[k + 1], 171 | kernel_size=(1, 3)[flag], stride=2, padding=1)] 172 | else: 173 | layers += [nn.Conv2d(in_channels, v, kernel_size=(1, 3)[flag])] 174 | flag = not flag 175 | in_channels = v 176 | if len(cfg) == 13: 177 | print('input channels:',in_channels) 178 | layers += [nn.Conv2d(in_channels, 256, kernel_size=4,padding=1)] # Fix padding to match Caffe version (pad=1). 
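# Editor's note on the extras cfg scheme used above: plain integers alternate between 1x1 and 3x3
# convs (the (1, 3)[flag] kernel toggle), 'S' marks a stride-2, padding-1 conv whose output channel
# count comes from the next cfg entry, and the 4x4 conv appended just above is only added for the
# 13-entry SSD512 cfg. Starting from the 1024-channel fc7 output, extras['300'] therefore yields
# eight convs: 1024->256 (1x1), 256->512 (3x3, s2), 512->128 (1x1), 128->256 (3x3, s2),
# 256->128 (1x1), 128->256 (3x3), 256->128 (1x1), 128->256 (3x3).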
179 | print('extras layers:',layers) 180 | return layers 181 | 182 | def add_efficientnet_extras(cfg, i = 272, batch_norm=False): 183 | # Extra layers added to EfficientNet for feature scaling 184 | layers = [] 185 | in_channels = i 186 | flag = False 187 | for k, v in enumerate(cfg): 188 | if in_channels != 'S': 189 | if v == 'S': 190 | layers += [nn.Conv2d(in_channels, cfg[k + 1], 191 | kernel_size=(1, 3)[flag], stride=2, padding=1)] 192 | else: 193 | layers += [nn.Conv2d(in_channels, v, kernel_size=(1, 3)[flag])] 194 | flag = not flag 195 | in_channels = v 196 | print('extras layers:',layers) 197 | return layers 198 | 199 | def multibox(vgg, extra_layers, cfg, num_classes): 200 | loc_layers = [] 201 | conf_layers = [] 202 | vgg_source = [21, -2] #Conv4_3 Conv7 203 | print('VGG16 output size:',len(vgg)) 204 | print('extra layer size:', len(extra_layers)) 205 | for i, layer in enumerate(extra_layers): 206 | print('extra layer {} : {}'.format(i, layer)) 207 | for k, v in enumerate(vgg_source): 208 | loc_layers += [nn.Conv2d(vgg[v].out_channels, 209 | cfg[k] * 4, kernel_size=3, padding=1)] 210 | conf_layers += [nn.Conv2d(vgg[v].out_channels, 211 | cfg[k] * num_classes, kernel_size=3, padding=1)] 212 | for k, v in enumerate(extra_layers[1::2], 2): 213 | loc_layers += [nn.Conv2d(v.out_channels, cfg[k] 214 | * 4, kernel_size=3, padding=1)] 215 | conf_layers += [nn.Conv2d(v.out_channels, cfg[k] 216 | * num_classes, kernel_size=3, padding=1)] 217 | return vgg, extra_layers, (loc_layers, conf_layers) 218 | 219 | def efficientnet_multibox(efficientnet, extra_layers, cfg, num_classes): 220 | loc_layers = [] 221 | conf_layers = [] 222 | efficientnet_source = [9, 13, -1] #P3-p7 223 | print('EfficientNet output size:',len(efficientnet_source)) 224 | print('extra layer size:', len(extra_layers)) 225 | # print('efficientnet',efficientnet[9]) 226 | for i, layer in enumerate(extra_layers): 227 | print('extra layer {} : {}'.format(i, layer)) 228 | for k, v in enumerate(efficientnet_source): 229 | loc_layers += [nn.Conv2d(efficientnet[v]._project_conv.weight.size()[0], 230 | cfg[k] * 4, kernel_size=3, padding=1)] 231 | conf_layers += [nn.Conv2d(efficientnet[v]._project_conv.weight.size()[0], 232 | cfg[k] * num_classes, kernel_size=3, padding=1)] 233 | for k, v in enumerate(extra_layers[1::2], 2): 234 | loc_layers += [nn.Conv2d(v.out_channels, cfg[k] 235 | * 4, kernel_size=3, padding=1)] 236 | conf_layers += [nn.Conv2d(v.out_channels, cfg[k] 237 | * num_classes, kernel_size=3, padding=1)] 238 | return efficientnet, extra_layers, (loc_layers, conf_layers) 239 | 240 | base = { 241 | '300': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M', 242 | 512, 512, 512], 243 | '512': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M', 244 | 512, 512, 512], 245 | } 246 | extras = { 247 | '300': [256, 'S', 512, 128, 'S', 256, 128, 256, 128, 256], 248 | '512': [256, 'S', 512, 128, 'S', 256, 128, 'S', 256, 128, 'S', 256, 128], 249 | } 250 | mbox = { 251 | '300': [4, 6, 6, 6, 4, 4], # number of boxes per feature map location 252 | '512': [4, 6, 6, 6, 4, 4, 4], 253 | } 254 | efficientnet_mbox = [4, 6, 6, 6, 4, 4] 255 | efficientnet_axtras = [128, 'S', 256, 128, 256, 128, 256] 256 | 257 | 258 | def build_ssd(phase, size=300, num_classes=21): 259 | if phase != "test" and phase != "train": 260 | print("ERROR: Phase: " + phase + " not recognized") 261 | return 262 | if size not in [300, 512] : 263 | print("ERROR: You specified size " + repr(size) + ". 
However, " + 264 | "currently only SSD300 and SSD512 is supported!") 265 | return 266 | base_, extras_, head_ = multibox(vgg(base[str(size)], 3), 267 | add_extras(extras[str(size)], 1024), 268 | mbox[str(size)], num_classes) 269 | print('Begin to build SSD-VGG...\n') 270 | return SSD(phase, size, base_, extras_, head_, num_classes) 271 | 272 | def build_ssd_efficientnet(phase, size=300, num_classes=21): 273 | if phase != "test" and phase != "train": 274 | print("ERROR: Phase: " + phase + " not recognized") 275 | return 276 | if size not in [300, 512] : 277 | print("ERROR: You specified size " + repr(size) + ". However, " + 278 | "currently only SSD300 and SSD512 is supported!") 279 | return 280 | base_, extras_, head_ = efficientnet_multibox(efficientnet_base(), 281 | add_efficientnet_extras(efficientnet_axtras), 282 | efficientnet_mbox, num_classes) 283 | print('Begin to build SSD-EfficientNet...') 284 | return SSD(phase, size, base_, extras_, head_, num_classes) -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import sys 3 | import os 4 | import argparse 5 | import torch 6 | import torch.nn as nn 7 | import torch.backends.cudnn as cudnn 8 | import torchvision.transforms as transforms 9 | from torch.autograd import Variable 10 | from data import VOC_ROOT, VOC_CLASSES as labelmap 11 | from PIL import Image 12 | from data import VOCAnnotationTransform, VOCDetection, BaseTransform, VOC_CLASSES 13 | import torch.utils.data as data 14 | from ssd import build_ssd 15 | 16 | parser = argparse.ArgumentParser(description='Single Shot MultiBox Detection') 17 | parser.add_argument('--trained_model', default='weights/ssd_300_VOC0712.pth', 18 | type=str, help='Trained state_dict file path to open') 19 | parser.add_argument('--save_folder', default='eval/', type=str, 20 | help='Dir to save results') 21 | parser.add_argument('--visual_threshold', default=0.6, type=float, 22 | help='Final confidence threshold') 23 | parser.add_argument('--cuda', default=True, type=bool, 24 | help='Use cuda to train model') 25 | parser.add_argument('--voc_root', default=VOC_ROOT, help='Location of VOC root directory') 26 | parser.add_argument('-f', default=None, type=str, help="Dummy arg so we can load in Jupyter Notebooks") 27 | args = parser.parse_args() 28 | 29 | if args.cuda and torch.cuda.is_available(): 30 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 31 | else: 32 | torch.set_default_tensor_type('torch.FloatTensor') 33 | 34 | if not os.path.exists(args.save_folder): 35 | os.mkdir(args.save_folder) 36 | 37 | 38 | def test_net(save_folder, net, cuda, testset, transform, thresh): 39 | # dump predictions and assoc. 
ground truth to text file for now 40 | filename = save_folder+'test1.txt' 41 | num_images = len(testset) 42 | for i in range(num_images): 43 | print('Testing image {:d}/{:d}....'.format(i+1, num_images)) 44 | img = testset.pull_image(i) 45 | img_id, annotation = testset.pull_anno(i) 46 | x = torch.from_numpy(transform(img)[0]).permute(2, 0, 1) 47 | x = Variable(x.unsqueeze(0)) 48 | 49 | with open(filename, mode='a') as f: 50 | f.write('\nGROUND TRUTH FOR: '+img_id+'\n') 51 | for box in annotation: 52 | f.write('label: '+' || '.join(str(b) for b in box)+'\n') 53 | if cuda: 54 | x = x.cuda() 55 | 56 | y = net(x) # forward pass 57 | detections = y.data 58 | # scale each detection back up to the image 59 | scale = torch.Tensor([img.shape[1], img.shape[0], 60 | img.shape[1], img.shape[0]]) 61 | pred_num = 0 62 | for i in range(detections.size(1)): 63 | j = 0 64 | while detections[0, i, j, 0] >= 0.6: 65 | if pred_num == 0: 66 | with open(filename, mode='a') as f: 67 | f.write('PREDICTIONS: '+'\n') 68 | score = detections[0, i, j, 0] 69 | label_name = labelmap[i-1] 70 | pt = (detections[0, i, j, 1:]*scale).cpu().numpy() 71 | coords = (pt[0], pt[1], pt[2], pt[3]) 72 | pred_num += 1 73 | with open(filename, mode='a') as f: 74 | f.write(str(pred_num)+' label: '+label_name+' score: ' + 75 | str(score) + ' '+' || '.join(str(c) for c in coords) + '\n') 76 | j += 1 77 | 78 | 79 | def test_voc(): 80 | # load net 81 | num_classes = len(VOC_CLASSES) + 1 # +1 background 82 | net = build_ssd('test', 300, num_classes) # initialize SSD 83 | net.load_state_dict(torch.load(args.trained_model)) 84 | net.eval() 85 | print('Finished loading model!') 86 | # load data 87 | testset = VOCDetection(args.voc_root, [('2007', 'test')], None, VOCAnnotationTransform()) 88 | if args.cuda: 89 | net = net.cuda() 90 | cudnn.benchmark = True 91 | # evaluation 92 | test_net(args.save_folder, net, args.cuda, testset, 93 | BaseTransform(net.size, (104, 117, 123)), 94 | thresh=args.visual_threshold) 95 | 96 | if __name__ == '__main__': 97 | test_voc() 98 | -------------------------------------------------------------------------------- /test/COCO_train2014_000000000659.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/midasklr/SSD.Pytorch/33ec443c0a7f3facbaa0643f4c04d4c3dda3cf53/test/COCO_train2014_000000000659.jpg -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | from data import * 2 | from utils.augmentations import SSDAugmentation 3 | from layers.modules import MultiBoxLoss 4 | from ssd import build_ssd, build_ssd_efficientnet 5 | import os 6 | import sys 7 | import time 8 | import torch 9 | from torch.autograd import Variable 10 | import torch.nn as nn 11 | import torch.optim as optim 12 | import torch.backends.cudnn as cudnn 13 | import torch.nn.init as init 14 | import torch.utils.data as data 15 | import numpy as np 16 | import argparse 17 | import pickle 18 | import math 19 | 20 | 21 | def str2bool(v): 22 | return v.lower() in ("yes", "true", "t", "1") 23 | 24 | 25 | parser = argparse.ArgumentParser( 26 | description='Single Shot MultiBox Detector Training With Pytorch') 27 | train_set = parser.add_mutually_exclusive_group() 28 | parser.add_argument('--input',default=300, type=int, choices=[300, 512], help='ssd input size, currently support ssd300 and ssd512') 29 | parser.add_argument('--dataset', default='VOC', choices=['VOC', 
'COCO'], 30 | type=str, help='VOC or COCO') 31 | parser.add_argument('--num_class', default=21, type=int, help='number of class in ur dataset') 32 | parser.add_argument('--dataset_root', default='./data/VOCdevkit', 33 | help='Dataset root directory path') 34 | parser.add_argument('--basenet', default='vgg16_reducedfc.pth', type=str, choices=['vgg16_reducedfc.pth', 'efficientnet_b4_truncated.pth'], 35 | help='Pretrained base model') 36 | parser.add_argument('--num_epoch', default=300, type=int, help='number of epochs to train') 37 | parser.add_argument('--batch_size', default=16, type=int, 38 | help='Batch size for training') 39 | parser.add_argument('--resume', default=None, type=str, 40 | help='Checkpoint state_dict file to resume training from') 41 | parser.add_argument('--start_epoch', default=0, type=int, 42 | help='Resume training at this epoch') 43 | parser.add_argument('--num_workers', default=6, type=int, 44 | help='Number of workers used in dataloading') 45 | parser.add_argument('--cuda', default=True, type=str2bool, 46 | help='Use CUDA to train model') 47 | parser.add_argument('--lr', '--learning-rate', default=1e-3, type=float, 48 | help='initial learning rate') 49 | parser.add_argument('--momentum', default=0.9, type=float, 50 | help='Momentum value for optim') 51 | parser.add_argument('--weight_decay', default=1e-8, type=float, 52 | help='Weight decay for SGD') 53 | parser.add_argument('--gamma', default=0.1, type=float, 54 | help='Gamma update for SGD') 55 | parser.add_argument('--visdom', default=False, type=str2bool, 56 | help='Use visdom for loss visualization') 57 | parser.add_argument('--save_folder', default='weights/', 58 | help='Directory for saving checkpoint models') 59 | args = parser.parse_args() 60 | 61 | 62 | if torch.cuda.is_available(): 63 | if args.cuda: 64 | torch.set_default_tensor_type('torch.cuda.FloatTensor') 65 | if not args.cuda: 66 | print("WARNING: It looks like you have a CUDA device, but aren't " + 67 | "using CUDA.\nRun with --cuda for optimal training speed.") 68 | torch.set_default_tensor_type('torch.FloatTensor') 69 | else: 70 | torch.set_default_tensor_type('torch.FloatTensor') 71 | 72 | if not os.path.exists(args.save_folder): 73 | os.mkdir(args.save_folder) 74 | 75 | 76 | def train(): 77 | if args.dataset == 'COCO': 78 | if args.dataset_root == VOC_ROOT: 79 | if not os.path.exists(COCO_ROOT): 80 | parser.error('Must specify dataset_root if specifying dataset') 81 | print("WARNING: Using default COCO dataset_root because " + 82 | "--dataset_root was not specified.") 83 | args.dataset_root = COCO_ROOT 84 | cfg = coco 85 | dataset = COCODetection(root=args.dataset_root, 86 | transform=SSDAugmentation(cfg['min_dim'], 87 | MEANS)) 88 | elif args.dataset == 'VOC': 89 | if args.dataset_root == VOC_ROOT: 90 | parser.error('Must specify dataset if specifying dataset_root') 91 | cfg = voc 92 | dataset = VOCDetection(root=args.dataset_root, 93 | transform=SSDAugmentation(args.input, 94 | MEANS)) 95 | 96 | if args.visdom: 97 | import visdom 98 | viz = visdom.Visdom() 99 | if args.basenet == 'vgg16_reducedfc.pth': 100 | ssd_net = build_ssd('train', args.input, args.num_class) 101 | elif args.basenet == 'efficientnet_b4_truncated.pth': 102 | ssd_net = build_ssd_efficientnet('train', args.input, args.num_class) 103 | net = ssd_net 104 | 105 | if args.cuda: 106 | net = torch.nn.DataParallel(ssd_net) 107 | cudnn.benchmark = True 108 | 109 | if args.resume: 110 | print('Resuming training, loading {}...'.format(args.resume)) 111 | 
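# Editor's note: checkpoints saved by the training loop below follow the pattern
# weights/ssd{input}_VOC_{iteration}.pth, so a resume run looks roughly like
#   python train.py --input 300 --resume weights/ssd300_VOC_5000.pth --start_epoch 50
# (the checkpoint name and epoch here are illustrative, not files shipped with the repo).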
ssd_net.load_weights(args.resume) 112 | else: 113 | if args.basenet == 'vgg16_reducedfc.pth': 114 | vgg_weights = torch.load(args.save_folder + args.basenet) 115 | print('Loading base network weights from %s\n'%(args.save_folder + args.basenet)) 116 | ssd_net.base.load_state_dict(vgg_weights) 117 | elif args.basenet == 'efficientnet_b4_truncated.pth': 118 | efficientnet_weights = torch.load(args.save_folder + args.basenet) 119 | print('Loading base network weights from %s\n' % (args.save_folder + args.basenet)) 120 | print('ssd_net.base:',ssd_net.base) 121 | ssd_net.base.load_state_dict(efficientnet_weights) 122 | 123 | if args.cuda: 124 | net = net.cuda() 125 | 126 | if not args.resume: 127 | print('Initializing weights...') 128 | # initialize newly added layers' weights with xavier method 129 | 130 | ssd_net.extras.apply(weights_init) 131 | ssd_net.loc.apply(weights_init) 132 | ssd_net.conf.apply(weights_init) 133 | 134 | optimizer = optim.AdamW(net.parameters(), lr=args.lr) 135 | criterion = MultiBoxLoss(args.num_class, 0.5, True, 0, True, 3, 0.5, 136 | False, args.cuda) 137 | 138 | net.train() 139 | # loss counters 140 | loc_loss = 0 141 | conf_loss = 0 142 | iteration = 1 143 | loss_total = [] 144 | loss_loc = [] 145 | loss_cls = [] 146 | print('Loading the dataset...') 147 | 148 | epoch_size = math.ceil(len(dataset) / args.batch_size) 149 | print('iteration per epoch:',epoch_size) 150 | print('Training SSD on:', dataset.name) 151 | print('Using the specified args:') 152 | print(args) 153 | step_index = 0 154 | if args.visdom: 155 | vis_title = 'SSD.PyTorch on ' + dataset.name 156 | vis_legend = ['Loc Loss', 'Conf Loss', 'Total Loss'] 157 | iter_plot = create_vis_plot('Iteration', 'Loss', vis_title, vis_legend) 158 | epoch_plot = create_vis_plot('Epoch', 'Loss', vis_title, vis_legend) 159 | 160 | data_loader = data.DataLoader(dataset, args.batch_size, 161 | num_workers=args.num_workers, 162 | shuffle=True, collate_fn=detection_collate, 163 | pin_memory=True) 164 | # create batch iterator 165 | # batch_iterator = iter(data_loader) 166 | for epoch in range(args.start_epoch, args.num_epoch): 167 | print('\n'+'-'*70+'Epoch: {}'.format(epoch)+'-'*70+'\n') 168 | if args.visdom and epoch != 0 and (iteration % epoch_size == 0): 169 | update_vis_plot(epoch, loc_loss, conf_loss, epoch_plot, None, 170 | 'append', epoch_size) 171 | # reset epoch loss counters 172 | loc_loss = 0 173 | conf_loss = 0 174 | epoch += 1 175 | if epoch in cfg['SSD{}'.format(args.input)]['lr_steps']: 176 | step_index += 1 177 | adjust_learning_rate(optimizer, args.gamma, step_index) 178 | if epoch <= 5: 179 | warmup_learning_rate(optimizer,epoch) 180 | for images, targets in data_loader: # load train data 181 | # if iteration % 100 == 0: 182 | for param in optimizer.param_groups: 183 | if 'lr' in param.keys(): 184 | cur_lr = param['lr'] 185 | if args.cuda: 186 | images = Variable(images.cuda()) 187 | targets = [Variable(ann.cuda()) for ann in targets] 188 | else: 189 | images = Variable(images) 190 | targets = [Variable(ann) for ann in targets] 191 | # forward 192 | t0 = time.time() 193 | out = net(images) 194 | # backprop 195 | optimizer.zero_grad() 196 | loss_l, loss_c = criterion(out, targets) 197 | loss = loss_l + loss_c 198 | loss.backward() 199 | optimizer.step() 200 | t1 = time.time() 201 | loc_loss += loss_l.item() 202 | conf_loss += loss_c.item() 203 | 204 | if iteration % 10 == 0: 205 | print('Epoch '+repr(epoch)+'|| iter ' + repr(iteration % epoch_size)+'/'+repr(epoch_size) +'|| Total iter '+repr(iteration)+ ' 
|| Total Loss: %.4f || Loc Loss: %.4f || Cls Loss: %.4f || LR: %f || timer: %.4f sec.\n' % (loss.item(),loss_l.item(),loss_c.item(),cur_lr,(t1 - t0)), end=' ') 206 | loss_cls.append(loss_c.item()) 207 | loss_loc.append(loss_l.item()) 208 | loss_total.append(loss.item()) 209 | loss_dic = {'loss':loss_total, 'loss_cls':loss_cls, 'loss_loc':loss_loc} 210 | 211 | if args.visdom: 212 | update_vis_plot(iteration, loss_l.item(), loss_c.item(), 213 | iter_plot, epoch_plot, 'append') 214 | 215 | if iteration != 0 and iteration % 5000 == 0: 216 | print('Saving state, iter:', iteration) 217 | torch.save(ssd_net.state_dict(), 'weights/ssd{}_VOC_'.format(args.input) + 218 | repr(iteration) + '.pth') 219 | with open('loss.pkl', 'wb') as f: 220 | pickle.dump(loss_dic, f, pickle.HIGHEST_PROTOCOL) 221 | iteration += 1 222 | torch.save(ssd_net.state_dict(), 223 | args.save_folder + '' + args.dataset + '.pth') 224 | 225 | 226 | def adjust_learning_rate(optimizer, gamma, step): 227 | """Sets the learning rate to the initial LR decayed by 10 at every 228 | specified step 229 | # Adapted from PyTorch Imagenet example: 230 | # https://github.com/pytorch/examples/blob/master/imagenet/main.py 231 | """ 232 | lr = args.lr * (gamma ** (step)) 233 | print('Now we change lr ...') 234 | for param_group in optimizer.param_groups: 235 | param_group['lr'] = lr 236 | 237 | def warmup_learning_rate(optimizer,epoch): 238 | lr_ini = 0.0001 239 | print('lr warmup...') 240 | for param_group in optimizer.param_groups: 241 | param_group['lr'] = lr_ini+(args.lr - lr_ini)*epoch/5 242 | 243 | def xavier(param): 244 | init.xavier_uniform_(param) 245 | 246 | 247 | def weights_init(m): 248 | if isinstance(m, nn.Conv2d): 249 | xavier(m.weight.data) 250 | m.bias.data.zero_() 251 | 252 | 253 | def create_vis_plot(_xlabel, _ylabel, _title, _legend): 254 | return viz.line( 255 | X=torch.zeros((1,)).cpu(), 256 | Y=torch.zeros((1, 3)).cpu(), 257 | opts=dict( 258 | xlabel=_xlabel, 259 | ylabel=_ylabel, 260 | title=_title, 261 | legend=_legend 262 | ) 263 | ) 264 | 265 | 266 | def update_vis_plot(iteration, loc, conf, window1, window2, update_type, 267 | epoch_size=1): 268 | viz.line( 269 | X=torch.ones((1, 3)).cpu() * iteration, 270 | Y=torch.Tensor([loc, conf, loc + conf]).unsqueeze(0).cpu() / epoch_size, 271 | win=window1, 272 | update=update_type 273 | ) 274 | # initialize epoch plot on first iteration 275 | if iteration == 0: 276 | viz.line( 277 | X=torch.zeros((1, 3)).cpu(), 278 | Y=torch.Tensor([loc, conf, loc + conf]).unsqueeze(0).cpu(), 279 | win=window2, 280 | update=True 281 | ) 282 | 283 | 284 | if __name__ == '__main__': 285 | train() 286 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .augmentations import SSDAugmentation -------------------------------------------------------------------------------- /utils/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/midasklr/SSD.Pytorch/33ec443c0a7f3facbaa0643f4c04d4c3dda3cf53/utils/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /utils/__pycache__/augmentations.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/midasklr/SSD.Pytorch/33ec443c0a7f3facbaa0643f4c04d4c3dda3cf53/utils/__pycache__/augmentations.cpython-37.pyc -------------------------------------------------------------------------------- /utils/augmentations.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchvision import transforms 3 | import cv2 4 | import numpy as np 5 | import types 6 | from numpy import random 7 | 8 | 9 | def intersect(box_a, box_b): 10 | max_xy = np.minimum(box_a[:, 2:], box_b[2:]) 11 | min_xy = np.maximum(box_a[:, :2], box_b[:2]) 12 | inter = np.clip((max_xy - min_xy), a_min=0, a_max=np.inf) 13 | return inter[:, 0] * inter[:, 1] 14 | 15 | 16 | def jaccard_numpy(box_a, box_b): 17 | """Compute the jaccard overlap of two sets of boxes. The jaccard overlap 18 | is simply the intersection over union of two boxes. 19 | E.g.: 20 | A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B) 21 | Args: 22 | box_a: Multiple bounding boxes, Shape: [num_boxes,4] 23 | box_b: Single bounding box, Shape: [4] 24 | Return: 25 | jaccard overlap: Shape: [box_a.shape[0], box_a.shape[1]] 26 | """ 27 | inter = intersect(box_a, box_b) 28 | area_a = ((box_a[:, 2]-box_a[:, 0]) * 29 | (box_a[:, 3]-box_a[:, 1])) # [A,B] 30 | area_b = ((box_b[2]-box_b[0]) * 31 | (box_b[3]-box_b[1])) # [A,B] 32 | union = area_a + area_b - inter 33 | return inter / union # [A,B] 34 | 35 | 36 | class Compose(object): 37 | """Composes several augmentations together. 38 | Args: 39 | transforms (List[Transform]): list of transforms to compose. 40 | Example: 41 | >>> augmentations.Compose([ 42 | >>> transforms.CenterCrop(10), 43 | >>> transforms.ToTensor(), 44 | >>> ]) 45 | """ 46 | 47 | def __init__(self, transforms): 48 | self.transforms = transforms 49 | 50 | def __call__(self, img, boxes=None, labels=None): 51 | for t in self.transforms: 52 | img, boxes, labels = t(img, boxes, labels) 53 | return img, boxes, labels 54 | 55 | 56 | class Lambda(object): 57 | """Applies a lambda as a transform.""" 58 | 59 | def __init__(self, lambd): 60 | assert isinstance(lambd, types.LambdaType) 61 | self.lambd = lambd 62 | 63 | def __call__(self, img, boxes=None, labels=None): 64 | return self.lambd(img, boxes, labels) 65 | 66 | 67 | class ConvertFromInts(object): 68 | def __call__(self, image, boxes=None, labels=None): 69 | return image.astype(np.float32), boxes, labels 70 | 71 | 72 | class SubtractMeans(object): 73 | def __init__(self, mean): 74 | self.mean = np.array(mean, dtype=np.float32) 75 | 76 | def __call__(self, image, boxes=None, labels=None): 77 | image = image.astype(np.float32) 78 | image -= self.mean 79 | return image.astype(np.float32), boxes, labels 80 | 81 | 82 | class ToAbsoluteCoords(object): 83 | def __call__(self, image, boxes=None, labels=None): 84 | height, width, channels = image.shape 85 | boxes[:, 0] *= width 86 | boxes[:, 2] *= width 87 | boxes[:, 1] *= height 88 | boxes[:, 3] *= height 89 | 90 | return image, boxes, labels 91 | 92 | 93 | class ToPercentCoords(object): 94 | def __call__(self, image, boxes=None, labels=None): 95 | height, width, channels = image.shape 96 | boxes[:, 0] /= width 97 | boxes[:, 2] /= width 98 | boxes[:, 1] /= height 99 | boxes[:, 3] /= height 100 | 101 | return image, boxes, labels 102 | 103 | 104 | class Resize(object): 105 | def __init__(self, size=300): 106 | self.size = size 107 | 108 | def __call__(self, image, boxes=None, labels=None): 109 | image = cv2.resize(image, (self.size, 110 | self.size)) 111 | return image, boxes, 
labels 112 | 113 | 114 | class RandomSaturation(object): 115 | def __init__(self, lower=0.5, upper=1.5): 116 | self.lower = lower 117 | self.upper = upper 118 | assert self.upper >= self.lower, "contrast upper must be >= lower." 119 | assert self.lower >= 0, "contrast lower must be non-negative." 120 | 121 | def __call__(self, image, boxes=None, labels=None): 122 | if random.randint(2): 123 | image[:, :, 1] *= random.uniform(self.lower, self.upper) 124 | 125 | return image, boxes, labels 126 | 127 | 128 | class RandomHue(object): 129 | def __init__(self, delta=18.0): 130 | assert delta >= 0.0 and delta <= 360.0 131 | self.delta = delta 132 | 133 | def __call__(self, image, boxes=None, labels=None): 134 | if random.randint(2): 135 | image[:, :, 0] += random.uniform(-self.delta, self.delta) 136 | image[:, :, 0][image[:, :, 0] > 360.0] -= 360.0 137 | image[:, :, 0][image[:, :, 0] < 0.0] += 360.0 138 | return image, boxes, labels 139 | 140 | 141 | class RandomLightingNoise(object): 142 | def __init__(self): 143 | self.perms = ((0, 1, 2), (0, 2, 1), 144 | (1, 0, 2), (1, 2, 0), 145 | (2, 0, 1), (2, 1, 0)) 146 | 147 | def __call__(self, image, boxes=None, labels=None): 148 | if random.randint(2): 149 | swap = self.perms[random.randint(len(self.perms))] 150 | shuffle = SwapChannels(swap) # shuffle channels 151 | image = shuffle(image) 152 | return image, boxes, labels 153 | 154 | 155 | class ConvertColor(object): 156 | def __init__(self, current='BGR', transform='HSV'): 157 | self.transform = transform 158 | self.current = current 159 | 160 | def __call__(self, image, boxes=None, labels=None): 161 | if self.current == 'BGR' and self.transform == 'HSV': 162 | image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV) 163 | elif self.current == 'HSV' and self.transform == 'BGR': 164 | image = cv2.cvtColor(image, cv2.COLOR_HSV2BGR) 165 | else: 166 | raise NotImplementedError 167 | return image, boxes, labels 168 | 169 | 170 | class RandomContrast(object): 171 | def __init__(self, lower=0.5, upper=1.5): 172 | self.lower = lower 173 | self.upper = upper 174 | assert self.upper >= self.lower, "contrast upper must be >= lower." 175 | assert self.lower >= 0, "contrast lower must be non-negative." 
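# Editor's note: on roughly half of the calls (gated by random.randint(2) below) the float image
# is scaled in place by a factor drawn uniformly from [lower, upper]; e.g. with the defaults,
#   RandomContrast()(img.astype(np.float32), boxes, labels)
# multiplies pixel values by some alpha in [0.5, 1.5].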
176 | 177 | # expects float image 178 | def __call__(self, image, boxes=None, labels=None): 179 | if random.randint(2): 180 | alpha = random.uniform(self.lower, self.upper) 181 | image *= alpha 182 | return image, boxes, labels 183 | 184 | 185 | class RandomBrightness(object): 186 | def __init__(self, delta=32): 187 | assert delta >= 0.0 188 | assert delta <= 255.0 189 | self.delta = delta 190 | 191 | def __call__(self, image, boxes=None, labels=None): 192 | if random.randint(2): 193 | delta = random.uniform(-self.delta, self.delta) 194 | image += delta 195 | return image, boxes, labels 196 | 197 | 198 | class ToCV2Image(object): 199 | def __call__(self, tensor, boxes=None, labels=None): 200 | return tensor.cpu().numpy().astype(np.float32).transpose((1, 2, 0)), boxes, labels 201 | 202 | 203 | class ToTensor(object): 204 | def __call__(self, cvimage, boxes=None, labels=None): 205 | return torch.from_numpy(cvimage.astype(np.float32)).permute(2, 0, 1), boxes, labels 206 | 207 | 208 | class RandomSampleCrop(object): 209 | """Crop 210 | Arguments: 211 | img (Image): the image being input during training 212 | boxes (Tensor): the original bounding boxes in pt form 213 | labels (Tensor): the class labels for each bbox 214 | mode (float tuple): the min and max jaccard overlaps 215 | Return: 216 | (img, boxes, classes) 217 | img (Image): the cropped image 218 | boxes (Tensor): the adjusted bounding boxes in pt form 219 | labels (Tensor): the class labels for each bbox 220 | """ 221 | def __init__(self): 222 | self.sample_options = ( 223 | # using entire original input image 224 | None, 225 | # sample a patch s.t. MIN jaccard w/ obj in .1,.3,.4,.7,.9 226 | (0.1, None), 227 | (0.3, None), 228 | (0.7, None), 229 | (0.9, None), 230 | # randomly sample a patch 231 | (None, None), 232 | ) 233 | 234 | def __call__(self, image, boxes=None, labels=None): 235 | height, width, _ = image.shape 236 | while True: 237 | # randomly choose a mode 238 | mode = random.choice(self.sample_options) 239 | if mode is None: 240 | return image, boxes, labels 241 | 242 | min_iou, max_iou = mode 243 | if min_iou is None: 244 | min_iou = float('-inf') 245 | if max_iou is None: 246 | max_iou = float('inf') 247 | 248 | # max trails (50) 249 | for _ in range(50): 250 | current_image = image 251 | 252 | w = random.uniform(0.3 * width, width) 253 | h = random.uniform(0.3 * height, height) 254 | 255 | # aspect ratio constraint b/t .5 & 2 256 | if h / w < 0.5 or h / w > 2: 257 | continue 258 | 259 | left = random.uniform(width - w) 260 | top = random.uniform(height - h) 261 | 262 | # convert to integer rect x1,y1,x2,y2 263 | rect = np.array([int(left), int(top), int(left+w), int(top+h)]) 264 | 265 | # calculate IoU (jaccard overlap) b/t the cropped and gt boxes 266 | overlap = jaccard_numpy(boxes, rect) 267 | 268 | # is min and max overlap constraint satisfied? 
if not try again 269 | if overlap.min() < min_iou and max_iou < overlap.max(): 270 | # if overlap.max() < min_iou: #????righ 271 | continue 272 | 273 | # cut the crop from the image 274 | current_image = current_image[rect[1]:rect[3], rect[0]:rect[2], 275 | :] 276 | 277 | # keep overlap with gt box IF center in sampled patch 278 | centers = (boxes[:, :2] + boxes[:, 2:]) / 2.0 279 | 280 | # mask in all gt boxes that above and to the left of centers 281 | m1 = (rect[0] < centers[:, 0]) * (rect[1] < centers[:, 1]) 282 | 283 | # mask in all gt boxes that under and to the right of centers 284 | m2 = (rect[2] > centers[:, 0]) * (rect[3] > centers[:, 1]) 285 | 286 | # mask in that both m1 and m2 are true 287 | mask = m1 * m2 288 | 289 | # have any valid boxes? try again if not 290 | if not mask.any(): 291 | continue 292 | 293 | # take only matching gt boxes 294 | current_boxes = boxes[mask, :].copy() 295 | 296 | # take only matching gt labels 297 | current_labels = labels[mask] 298 | 299 | # should we use the box left and top corner or the crop's 300 | current_boxes[:, :2] = np.maximum(current_boxes[:, :2], 301 | rect[:2]) 302 | # adjust to crop (by substracting crop's left,top) 303 | current_boxes[:, :2] -= rect[:2] 304 | 305 | current_boxes[:, 2:] = np.minimum(current_boxes[:, 2:], 306 | rect[2:]) 307 | # adjust to crop (by substracting crop's left,top) 308 | current_boxes[:, 2:] -= rect[:2] 309 | 310 | return current_image, current_boxes, current_labels 311 | 312 | 313 | class Expand(object): 314 | def __init__(self, mean): 315 | self.mean = mean 316 | 317 | def __call__(self, image, boxes, labels): 318 | if random.randint(2): 319 | return image, boxes, labels 320 | 321 | height, width, depth = image.shape 322 | ratio = random.uniform(1, 4) 323 | left = random.uniform(0, width*ratio - width) 324 | top = random.uniform(0, height*ratio - height) 325 | 326 | expand_image = np.zeros( 327 | (int(height*ratio), int(width*ratio), depth), 328 | dtype=image.dtype) 329 | expand_image[:, :, :] = self.mean 330 | expand_image[int(top):int(top + height), 331 | int(left):int(left + width)] = image 332 | image = expand_image 333 | 334 | boxes = boxes.copy() 335 | boxes[:, :2] += (int(left), int(top)) 336 | boxes[:, 2:] += (int(left), int(top)) 337 | 338 | return image, boxes, labels 339 | 340 | 341 | class RandomMirror(object): 342 | def __call__(self, image, boxes, classes): 343 | _, width, _ = image.shape 344 | if random.randint(2): 345 | image = image[:, ::-1] 346 | boxes = boxes.copy() 347 | boxes[:, 0::2] = width - boxes[:, 2::-2] 348 | return image, boxes, classes 349 | 350 | 351 | class SwapChannels(object): 352 | """Transforms a tensorized image by swapping the channels in the order 353 | specified in the swap tuple. 
354 | Args: 355 | swaps (int triple): final order of channels 356 | eg: (2, 1, 0) 357 | """ 358 | 359 | def __init__(self, swaps): 360 | self.swaps = swaps 361 | 362 | def __call__(self, image): 363 | """ 364 | Args: 365 | image (Tensor): image tensor to be transformed 366 | Return: 367 | a tensor with channels swapped according to swap 368 | """ 369 | # if torch.is_tensor(image): 370 | # image = image.data.cpu().numpy() 371 | # else: 372 | # image = np.array(image) 373 | image = image[:, :, self.swaps] 374 | return image 375 | 376 | 377 | class PhotometricDistort(object): 378 | def __init__(self): 379 | self.pd = [ 380 | RandomContrast(), 381 | ConvertColor(transform='HSV'), 382 | RandomSaturation(), 383 | RandomHue(), 384 | ConvertColor(current='HSV', transform='BGR'), 385 | RandomContrast() 386 | ] 387 | self.rand_brightness = RandomBrightness() 388 | self.rand_light_noise = RandomLightingNoise() 389 | 390 | def __call__(self, image, boxes, labels): 391 | im = image.copy() 392 | im, boxes, labels = self.rand_brightness(im, boxes, labels) 393 | if random.randint(2): 394 | distort = Compose(self.pd[:-1]) 395 | else: 396 | distort = Compose(self.pd[1:]) 397 | im, boxes, labels = distort(im, boxes, labels) 398 | return self.rand_light_noise(im, boxes, labels) 399 | 400 | 401 | class SSDAugmentation(object): 402 | def __init__(self, size=300, mean=(104, 117, 123)): 403 | self.mean = mean 404 | self.size = size 405 | self.augment = Compose([ 406 | ConvertFromInts(), 407 | ToAbsoluteCoords(), 408 | PhotometricDistort(), 409 | Expand(self.mean), 410 | RandomSampleCrop(), 411 | RandomMirror(), 412 | ToPercentCoords(), 413 | Resize(self.size), 414 | SubtractMeans(self.mean) 415 | ]) 416 | 417 | def __call__(self, img, boxes, labels): 418 | return self.augment(img, boxes, labels) 419 | --------------------------------------------------------------------------------
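Editor's note: a minimal inference sketch showing how the pieces above fit together, following the preprocessing used in test.py; the checkpoint path is a placeholder and assumes a model trained with this repo (21 VOC classes, 300x300 input).

import cv2
import numpy as np
import torch
from ssd import build_ssd

net = build_ssd('test', size=300, num_classes=21)      # test phase attaches softmax + Detect
net.load_weights('weights/ssd300_VOC_120000.pth')      # placeholder checkpoint path
net.eval()

img = cv2.imread('test/COCO_train2014_000000000659.jpg')   # sample image shipped in ./test
x = cv2.resize(img, (300, 300)).astype(np.float32)
x -= np.array((104.0, 117.0, 123.0), dtype=np.float32)     # BGR mean subtraction, as in test.py
x = torch.from_numpy(x).permute(2, 0, 1).unsqueeze(0)      # [1, 3, 300, 300]
with torch.no_grad():
    detections = net(x)    # [1, num_classes, top_k, 5]: score, xmin, ymin, xmax, ymax in [0, 1]

scale = torch.Tensor([img.shape[1], img.shape[0], img.shape[1], img.shape[0]])
for cl in range(1, detections.size(1)):                 # class 0 is background
    for j in range(detections.size(2)):
        score = detections[0, cl, j, 0]
        if score < 0.6:                                 # same visual threshold as test.py's default
            break
        box = (detections[0, cl, j, 1:] * scale).cpu().numpy()
        print(cl, float(score), box)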