├── .gitignore ├── README.md ├── data ├── __init__.py ├── coco.py ├── scripts │ ├── COCO2017.sh │ ├── VOC2007.sh │ └── VOC2012.sh ├── transforms.py └── voc.py ├── eval.py ├── evaluator ├── coco_evaluator.py └── voc_evaluator.py ├── models ├── conv.py ├── fcos.py ├── fcos_rt.py └── resnet.py ├── test.py ├── train.py ├── train_fcos.sh ├── train_fcos_rt.sh └── utils ├── __init__.py ├── box_ops.py ├── com_flops_params.py ├── create_labels.py ├── distributed_utils.py ├── loss.py └── misc.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pt 2 | *.pth 3 | *.pkl 4 | *.txt 5 | __pycache__ 6 | .vscode -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Update: 2022-04-11 2 | I have reproduced FCOS. If you are still interested at `FCOS`, please try the following project: 3 | 4 | https://github.com/yjh0410/DetLAB 5 | 6 | You can delete this my old FCOS project. 7 | -------------------------------------------------------------------------------- /data/__init__.py: -------------------------------------------------------------------------------- 1 | from .voc import VOCDetection, VOC_CLASSES 2 | from .coco import COCODataset, coco_class_labels, coco_class_index 3 | -------------------------------------------------------------------------------- /data/coco.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import random 4 | 5 | import torch 6 | from torch.utils.data import Dataset 7 | import cv2 8 | from pycocotools.coco import COCO 9 | 10 | 11 | coco_class_labels = ('background', 12 | 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 13 | 'boat', 'traffic light', 'fire hydrant', 'street sign', 'stop sign', 14 | 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 15 | 'elephant', 'bear', 'zebra', 'giraffe', 'hat', 'backpack', 'umbrella', 16 | 'shoe', 'eye glasses', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 17 | 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 18 | 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'plate', 'wine glass', 19 | 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 20 | 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 21 | 'couch', 'potted plant', 'bed', 'mirror', 'dining table', 'window', 'desk', 22 | 'toilet', 'door', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 23 | 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'blender', 'book', 24 | 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush') 25 | 26 | coco_class_index = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 20, 27 | 21, 22, 23, 24, 25, 27, 28, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 28 | 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 67, 29 | 70, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, 87, 88, 89, 90] 30 | 31 | 32 | class COCODataset(Dataset): 33 | """ 34 | COCO dataset class. 35 | """ 36 | def __init__(self, 37 | data_dir=None, 38 | image_set='train2017', 39 | transform=None): 40 | """ 41 | COCO dataset initialization. Annotation data are read into memory by COCO API. 42 | Args: 43 | data_dir (str): dataset root directory 44 | json_file (str): COCO json file name 45 | name (str): COCO data name (e.g. 
'train2017' or 'val2017') 46 | img_size (int): target image size after pre-processing 47 | min_size (int): bounding boxes smaller than this are ignored 48 | debug (bool): if True, only one data id is selected from the dataset 49 | """ 50 | if image_set == 'train2017': 51 | self.json_file = 'instances_train2017.json' 52 | elif image_set == 'val2017': 53 | self.json_file = 'instances_val2017.json' 54 | elif image_set == 'test2017': 55 | self.json_file == 'image_info_test-dev2017.json' 56 | self.image_set = image_set 57 | self.data_dir = data_dir 58 | self.coco = COCO(os.path.join(self.data_dir, 'annotations', self.json_file)) 59 | self.ids = self.coco.getImgIds() 60 | self.class_ids = sorted(self.coco.getCatIds()) 61 | # augmentation 62 | self.transform = transform 63 | 64 | 65 | def __len__(self): 66 | return len(self.ids) 67 | 68 | 69 | def __getitem__(self, index): 70 | img, target = self.pull_item(index) 71 | return img, target 72 | 73 | 74 | def pull_image(self, index): 75 | id_ = self.ids[index] 76 | img_file = os.path.join(self.data_dir, self.image_set, 77 | '{:012}'.format(id_) + '.jpg') 78 | img = cv2.imread(img_file) 79 | 80 | if self.json_file == 'instances_val5k.json' and img is None: 81 | img_file = os.path.join(self.data_dir, 'train2017', 82 | '{:012}'.format(id_) + '.jpg') 83 | img = cv2.imread(img_file) 84 | 85 | return img, id_ 86 | 87 | 88 | def pull_anno(self, index): 89 | id_ = self.ids[index] 90 | 91 | anno_ids = self.coco.getAnnIds(imgIds=[int(id_)], iscrowd=None) 92 | annotations = self.coco.loadAnns(anno_ids) 93 | 94 | target = [] 95 | for anno in annotations: 96 | if 'bbox' in anno: 97 | xmin = np.max((0, anno['bbox'][0])) 98 | ymin = np.max((0, anno['bbox'][1])) 99 | xmax = xmin + anno['bbox'][2] 100 | ymax = ymin + anno['bbox'][3] 101 | 102 | if anno['area'] > 0 and xmax >= xmin and ymax >= ymin: 103 | label_ind = anno['category_id'] 104 | cls_id = self.class_ids.index(label_ind) 105 | 106 | target.append([xmin, ymin, xmax, ymax, cls_id]) # [xmin, ymin, xmax, ymax, label_ind] 107 | else: 108 | print('No bbox !!') 109 | return target 110 | 111 | 112 | def load_image_annotation(self, index): 113 | anno_ids = self.coco.getAnnIds(imgIds=[int(index)], iscrowd=None) 114 | annotations = self.coco.loadAnns(anno_ids) 115 | 116 | # load an image 117 | img_file = os.path.join(self.data_dir, self.image_set, 118 | '{:012}'.format(index) + '.jpg') 119 | img = cv2.imread(img_file) 120 | 121 | if self.json_file == 'instances_val5k.json' and img is None: 122 | img_file = os.path.join(self.data_dir, 'train2017', 123 | '{:012}'.format(index) + '.jpg') 124 | img = cv2.imread(img_file) 125 | 126 | assert img is not None 127 | 128 | height, width, channels = img.shape 129 | 130 | # load an annotation 131 | annotation = [] 132 | for anno in annotations: 133 | if 'bbox' in anno and anno['area'] > 0: 134 | xmin = np.max((0, anno['bbox'][0])) 135 | ymin = np.max((0, anno['bbox'][1])) 136 | xmax = np.min((width - 1, xmin + np.max((0, anno['bbox'][2] - 1)))) 137 | ymax = np.min((height - 1, ymin + np.max((0, anno['bbox'][3] - 1)))) 138 | if xmax > xmin and ymax > ymin: 139 | label_ind = anno['category_id'] 140 | cls_id = self.class_ids.index(label_ind) 141 | 142 | annotation.append([xmin, ymin, xmax, ymax, cls_id]) # [xmin, ymin, xmax, ymax, label_ind] 143 | else: 144 | print('No bbox !!!') 145 | # end here . 
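# Each entry appended above is [xmin, ymin, xmax, ymax, cls_id]: the box is
# clipped to the image bounds, and cls_id is the contiguous index in 0..79
# returned by self.class_ids.index(category_id), not the raw COCO category id
# (1..90); coco_class_index maps the contiguous index back to the COCO id when
# class names are looked up or detection results are written.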
146 | 147 | return img, annotation, height, width 148 | 149 | 150 | def pull_item(self, index): 151 | id_ = self.ids[index] 152 | img, anno, height, width = self.load_image_annotation(id_) 153 | # check anno 154 | if len(anno) == 0: 155 | anno = np.zeros([1, 5]) 156 | else: 157 | anno = np.array(anno) 158 | 159 | # transform 160 | target = {'boxes': anno[:, :4], 161 | 'labels': anno[:, 4], 162 | 'orig_size': [height, width]} 163 | img, target = self.transform(img, target) 164 | 165 | return img, target 166 | 167 | 168 | if __name__ == "__main__": 169 | from transforms import TrainTransforms, ValTransforms 170 | img_size = 512 171 | dataset = COCODataset( 172 | data_dir='/mnt/share/ssd2/dataset/COCO', 173 | transform=TrainTransforms(img_size), 174 | image_set='train') 175 | 176 | np.random.seed(0) 177 | class_colors = [(np.random.randint(255), 178 | np.random.randint(255), 179 | np.random.randint(255)) for _ in range(80)] 180 | rgb_mean = np.array(dataset.transform.mean) 181 | rgb_std = np.array(dataset.transform.std) 182 | print('Data length: ', len(dataset)) 183 | for i in range(1000): 184 | # load an image 185 | img, target = dataset.pull_item(i) 186 | img = img.permute(1,2,0).numpy() 187 | img = (img*rgb_std + rgb_mean) * 255 188 | # from rgb to bgr 189 | img = img[:, :, (2, 1, 0)] 190 | img = img.astype(np.uint8).copy() 191 | # load a target 192 | cls_gt = target['labels'].tolist() 193 | box_gt = target['boxes'].tolist() 194 | for i in range(len(cls_gt)): 195 | cls_id = int(cls_gt[i]) 196 | cx, cy, bw, bh = box_gt[i] 197 | x1 = int((cx - bw / 2) * img_size) 198 | y1 = int((cy - bh / 2) * img_size) 199 | x2 = int((cx + bw / 2) * img_size) 200 | y2 = int((cy + bh / 2) * img_size) 201 | img = cv2.rectangle(img, (x1, y1), (x2, y2), (0,0,255), 2) 202 | color = class_colors[cls_id] 203 | cls_name = coco_class_labels[coco_class_index[cls_id]] 204 | mess = '%s' % (cls_name) 205 | cv2.putText(img, mess, (int(x1), int(y1 - 5)), 0, 0.5, color, 1, lineType=cv2.LINE_AA) 206 | cv2.imshow('gt', img) 207 | cv2.waitKey(0) 208 | -------------------------------------------------------------------------------- /data/scripts/COCO2017.sh: -------------------------------------------------------------------------------- 1 | mkdir COCO 2 | cd COCO 3 | 4 | wget http://images.cocodataset.org/zips/train2017.zip 5 | wget http://images.cocodataset.org/zips/val2017.zip 6 | wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip 7 | 8 | unzip train2017.zip 9 | unzip val2017.zip 10 | unzip annotations_trainval2017.zip 11 | 12 | rm -f train2017.zip 13 | rm -f val2017.zip 14 | rm -f annotations_trainval2017.zip 15 | -------------------------------------------------------------------------------- /data/scripts/VOC2007.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Ellis Brown 3 | 4 | start=`date +%s` 5 | 6 | # handle optional download dir 7 | if [ -z "$1" ] 8 | then 9 | # navigate to ~/data 10 | echo "navigating to ~/data/ ..." 11 | mkdir -p ~/data 12 | cd ~/data/ 13 | else 14 | # check if is valid directory 15 | if [ ! -d $1 ]; then 16 | echo $1 "is not a valid directory" 17 | exit 0 18 | fi 19 | echo "navigating to" $1 "..." 20 | cd $1 21 | fi 22 | 23 | echo "Downloading VOC2007 trainval ..." 24 | # Download the data. 25 | curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtrainval_06-Nov-2007.tar 26 | echo "Downloading VOC2007 test data ..." 
27 | curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2007/VOCtest_06-Nov-2007.tar 28 | echo "Done downloading." 29 | 30 | # Extract data 31 | echo "Extracting trainval ..." 32 | tar -xvf VOCtrainval_06-Nov-2007.tar 33 | echo "Extracting test ..." 34 | tar -xvf VOCtest_06-Nov-2007.tar 35 | echo "removing tars ..." 36 | rm VOCtrainval_06-Nov-2007.tar 37 | rm VOCtest_06-Nov-2007.tar 38 | 39 | end=`date +%s` 40 | runtime=$((end-start)) 41 | 42 | echo "Completed in" $runtime "seconds" -------------------------------------------------------------------------------- /data/scripts/VOC2012.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Ellis Brown 3 | 4 | start=`date +%s` 5 | 6 | # handle optional download dir 7 | if [ -z "$1" ] 8 | then 9 | # navigate to ~/data 10 | echo "navigating to ~/data/ ..." 11 | mkdir -p ~/data 12 | cd ~/data/ 13 | else 14 | # check if is valid directory 15 | if [ ! -d $1 ]; then 16 | echo $1 "is not a valid directory" 17 | exit 0 18 | fi 19 | echo "navigating to" $1 "..." 20 | cd $1 21 | fi 22 | 23 | echo "Downloading VOC2012 trainval ..." 24 | # Download the data. 25 | curl -LO http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar 26 | echo "Done downloading." 27 | 28 | 29 | # Extract data 30 | echo "Extracting trainval ..." 31 | tar -xvf VOCtrainval_11-May-2012.tar 32 | echo "removing tar ..." 33 | rm VOCtrainval_11-May-2012.tar 34 | 35 | end=`date +%s` 36 | runtime=$((end-start)) 37 | 38 | echo "Completed in" $runtime "seconds" -------------------------------------------------------------------------------- /data/transforms.py: -------------------------------------------------------------------------------- 1 | import random 2 | import torch 3 | import torchvision.transforms.functional as F 4 | 5 | 6 | 7 | class Compose(object): 8 | """Composes several augmentations together. 9 | Args: 10 | transforms (List[Transform]): list of transforms to compose. 
11 | Example: 12 | >>> augmentations.Compose([ 13 | >>> transforms.CenterCrop(10), 14 | >>> transforms.ToTensor(), 15 | >>> ]) 16 | """ 17 | 18 | def __init__(self, transforms): 19 | self.transforms = transforms 20 | 21 | def __call__(self, image, target=None): 22 | for t in self.transforms: 23 | image, target = t(image, target) 24 | return image, target 25 | 26 | 27 | class ToTensor(object): 28 | def __call__(self, image, target=None): 29 | # to rgb 30 | image = image[..., (2, 1, 0)] 31 | image = F.to_tensor(image) 32 | if target is not None: 33 | target["boxes"] = torch.as_tensor(target["boxes"]).float() 34 | target["labels"] = torch.as_tensor(target["labels"]).long() 35 | return image, target 36 | 37 | 38 | class Normalize(object): 39 | def __init__(self, mean, std): 40 | self.mean = mean 41 | self.std = std 42 | 43 | def __call__(self, image, target=None): 44 | image = F.normalize(image, mean=self.mean, std=self.std) 45 | if target is not None: 46 | h, w = target["orig_size"] 47 | if "boxes" in target: 48 | boxes = target["boxes"].clone() 49 | # normalize 50 | boxes = boxes / torch.tensor([w, h, w, h], dtype=torch.float32) 51 | # [x1, y1, x2, y2] -> [cx, cy, w, h] 52 | boxes_ = boxes.clone() 53 | boxes_[:, :2] = (boxes[:, 2:] + boxes[:, :2]) / 2.0 54 | boxes_[:, 2:] = boxes[:, 2:] - boxes[:, :2] 55 | target["boxes"] = boxes_ 56 | 57 | return image, target 58 | 59 | 60 | class Resize(object): 61 | def __init__(self, size=640): 62 | self.size = size 63 | 64 | def __call__(self, image, target=None): 65 | # resize 66 | size = (self.size, self.size) 67 | image = F.resize(image, size) 68 | 69 | return image, target 70 | 71 | 72 | class RandomHorizontalFlip(object): 73 | def __init__(self, p=0.5): 74 | self.p = p 75 | 76 | def __call__(self, image, target=None): 77 | if random.random() < self.p: 78 | image = F.hflip(image) 79 | if target is not None: 80 | h, w = target["orig_size"] 81 | if "boxes" in target: 82 | boxes = target["boxes"].clone() 83 | boxes[..., [0, 2]] = w - boxes[..., [2, 0]] 84 | target["boxes"] = boxes 85 | 86 | return image, target 87 | 88 | 89 | # TrainTransform 90 | class TrainTransforms(object): 91 | def __init__(self, size=512, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)): 92 | self.size = size 93 | self.mean = mean 94 | self.std = std 95 | self.transforms = Compose([ 96 | ToTensor(), 97 | RandomHorizontalFlip(), 98 | Resize(size), 99 | Normalize(mean, std) 100 | ]) 101 | 102 | def __call__(self, image, target): 103 | return self.transforms(image, target) 104 | 105 | 106 | # ValTransform 107 | class ValTransforms(object): 108 | def __init__(self, size=512, mean=(0.406, 0.456, 0.485), std=(0.225, 0.224, 0.229)): 109 | self.size = size 110 | self.mean = mean 111 | self.std = std 112 | self.transforms = Compose([ 113 | ToTensor(), 114 | Resize(size), 115 | Normalize(mean, std) 116 | ]) 117 | 118 | 119 | def __call__(self, image, target=None): 120 | return self.transforms(image, target) 121 | -------------------------------------------------------------------------------- /data/voc.py: -------------------------------------------------------------------------------- 1 | """VOC Dataset Classes 2 | 3 | Original author: Francisco Massa 4 | https://github.com/fmassa/vision/blob/voc_dataset/torchvision/datasets/voc.py 5 | 6 | Updated by: Ellis Brown, Max deGroot 7 | """ 8 | import os.path as osp 9 | import sys 10 | import torch 11 | import torch.utils.data as data 12 | import cv2 13 | import numpy as np 14 | import random 15 | 16 | import xml.etree.ElementTree as ET 17 
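The transforms above are what both datasets use to turn a raw BGR `cv2` image and absolute `[x1, y1, x2, y2]` boxes into a normalized RGB tensor plus normalized `[cx, cy, w, h]` targets. A minimal usage sketch, assuming the repository root is on `PYTHONPATH` so `data.transforms` is importable; the image and box values are made up for illustration:

```python
import numpy as np
from data.transforms import TrainTransforms

# dummy 300x400 BGR image, as cv2.imread would return it
image = np.random.randint(0, 255, size=(300, 400, 3), dtype=np.uint8)
target = {
    'boxes': np.array([[50., 30., 250., 200.]]),  # absolute [x1, y1, x2, y2]
    'labels': np.array([7]),
    'orig_size': [300, 400],                      # [height, width]
}

transform = TrainTransforms(size=512)
img_t, target_t = transform(image, target)

print(img_t.shape)        # torch.Size([3, 512, 512]), RGB, ImageNet-normalized
print(target_t['boxes'])  # normalized [cx, cy, w, h], e.g. tensor([[0.3750, 0.3833, 0.5000, 0.5667]])
                          # (cx becomes 0.6250 when the random horizontal flip fires)
```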
| 18 | 19 | VOC_CLASSES = ( # always index 0 20 | 'aeroplane', 'bicycle', 'bird', 'boat', 21 | 'bottle', 'bus', 'car', 'cat', 'chair', 22 | 'cow', 'diningtable', 'dog', 'horse', 23 | 'motorbike', 'person', 'pottedplant', 24 | 'sheep', 'sofa', 'train', 'tvmonitor') 25 | 26 | 27 | 28 | class VOCAnnotationTransform(object): 29 | """Transforms a VOC annotation into a Tensor of bbox coords and label index 30 | Initilized with a dictionary lookup of classnames to indexes 31 | 32 | Arguments: 33 | class_to_ind (dict, optional): dictionary lookup of classnames -> indexes 34 | (default: alphabetic indexing of VOC's 20 classes) 35 | keep_difficult (bool, optional): keep difficult instances or not 36 | (default: False) 37 | height (int): height 38 | width (int): width 39 | """ 40 | 41 | def __init__(self, class_to_ind=None, keep_difficult=False): 42 | self.class_to_ind = class_to_ind or dict( 43 | zip(VOC_CLASSES, range(len(VOC_CLASSES)))) 44 | self.keep_difficult = keep_difficult 45 | 46 | def __call__(self, target): 47 | """ 48 | Arguments: 49 | target (annotation) : the target annotation to be made usable 50 | will be an ET.Element 51 | Returns: 52 | a list containing lists of bounding boxes [bbox coords, class name] 53 | """ 54 | res = [] 55 | for obj in target.iter('object'): 56 | difficult = int(obj.find('difficult').text) == 1 57 | if not self.keep_difficult and difficult: 58 | continue 59 | name = obj.find('name').text.lower().strip() 60 | bbox = obj.find('bndbox') 61 | 62 | pts = ['xmin', 'ymin', 'xmax', 'ymax'] 63 | bndbox = [] 64 | for i, pt in enumerate(pts): 65 | cur_pt = int(bbox.find(pt).text) - 1 66 | # scale height or width 67 | cur_pt = cur_pt if i % 2 == 0 else cur_pt 68 | bndbox.append(cur_pt) 69 | label_idx = self.class_to_ind[name] 70 | bndbox.append(label_idx) 71 | res += [bndbox] # [x1, y1, x2, y2, label_ind] 72 | # img_id = target.find('filename').text[:-4] 73 | 74 | return res # [[x1, y1, x2, y2, label_ind], ... ] 75 | 76 | 77 | class VOCDetection(data.Dataset): 78 | """VOC Detection Dataset Object 79 | 80 | input is image, target is annotation 81 | 82 | Arguments: 83 | root (string): filepath to VOCdevkit folder. 84 | image_set (string): imageset to use (eg. 
'train', 'val', 'test') 85 | transform (callable, optional): transformation to perform on the 86 | input image 87 | target_transform (callable, optional): transformation to perform on the 88 | target `annotation` 89 | (eg: take in caption string, return tensor of word indices) 90 | dataset_name (string, optional): which dataset to load 91 | (default: 'VOC2007') 92 | """ 93 | 94 | def __init__(self, 95 | data_dir=None, 96 | image_sets=[('2007', 'trainval'), ('2012', 'trainval')], 97 | transform=None, 98 | target_transform=VOCAnnotationTransform() 99 | ): 100 | self.root = data_dir 101 | self.image_set = image_sets 102 | self.target_transform = target_transform 103 | self._annopath = osp.join('%s', 'Annotations', '%s.xml') 104 | self._imgpath = osp.join('%s', 'JPEGImages', '%s.jpg') 105 | self.ids = list() 106 | for (year, name) in image_sets: 107 | rootpath = osp.join(self.root, 'VOC' + year) 108 | for line in open(osp.join(rootpath, 'ImageSets', 'Main', name + '.txt')): 109 | self.ids.append((rootpath, line.strip())) 110 | # augmentation 111 | self.transform = transform 112 | 113 | 114 | def __getitem__(self, index): 115 | img, target = self.pull_item(index) 116 | return img, target 117 | 118 | 119 | def __len__(self): 120 | return len(self.ids) 121 | 122 | 123 | def load_image_annotation(self, img_id): 124 | # load an image 125 | img = cv2.imread(self._imgpath % img_id) 126 | height, width, channels = img.shape 127 | # load an annotation 128 | anno = ET.parse(self._annopath % img_id).getroot() 129 | if self.target_transform is not None: 130 | anno = self.target_transform(anno) 131 | 132 | return img, anno, height, width 133 | 134 | 135 | def pull_item(self, index): 136 | img_id = self.ids[index] 137 | img, anno, height, width = self.load_image_annotation(img_id) 138 | if len(anno) == 0: 139 | anno = np.zeros([1, 5]) 140 | else: 141 | anno = np.array(anno) 142 | 143 | # transform 144 | target = {'boxes': anno[:, :4], 145 | 'labels': anno[:, 4], 146 | 'orig_size': [height, width]} 147 | img, target = self.transform(img, target) 148 | 149 | return img, target 150 | 151 | 152 | def pull_image(self, index): 153 | '''Returns the original image object at index in PIL form 154 | 155 | Note: not using self.__getitem__(), as any transformations passed in 156 | could mess up this functionality. 157 | 158 | Argument: 159 | index (int): index of img to show 160 | Return: 161 | PIL img 162 | ''' 163 | img_id = self.ids[index] 164 | return cv2.imread(self._imgpath % img_id, cv2.IMREAD_COLOR), img_id 165 | 166 | 167 | def pull_anno(self, index): 168 | '''Returns the original annotation of image at index 169 | 170 | Note: not using self.__getitem__(), as any transformations passed in 171 | could mess up this functionality. 
172 | 173 | Argument: 174 | index (int): index of img to get annotation of 175 | Return: 176 | list: [img_id, [(label, bbox coords),...]] 177 | eg: ('001718', [('dog', (96, 13, 438, 332))]) 178 | ''' 179 | img_id = self.ids[index] 180 | anno = ET.parse(self._annopath % img_id).getroot() 181 | gt = self.target_transform(anno, 1, 1) 182 | return img_id[1], gt 183 | 184 | 185 | if __name__ == "__main__": 186 | from transforms import TrainTransforms, ValTransforms 187 | img_size = 512 188 | dataset = VOCDetection( 189 | data_dir='/mnt/share/ssd2/dataset/VOCdevkit', 190 | transform=TrainTransforms(img_size)) 191 | 192 | np.random.seed(0) 193 | class_colors = [(np.random.randint(255), 194 | np.random.randint(255), 195 | np.random.randint(255)) for _ in range(20)] 196 | rgb_mean = np.array(dataset.transform.mean) 197 | rgb_std = np.array(dataset.transform.std) 198 | print('Data length: ', len(dataset)) 199 | for i in range(1000): 200 | # load an image 201 | img, target = dataset.pull_item(i) 202 | img = img.permute(1,2,0).numpy() 203 | img = (img*rgb_std + rgb_mean) * 255 204 | # from rgb to bgr 205 | img = img[:, :, (2, 1, 0)] 206 | img = img.astype(np.uint8).copy() 207 | # load a target 208 | cls_gt = target['labels'].tolist() 209 | box_gt = target['boxes'].tolist() 210 | for i in range(len(cls_gt)): 211 | cls_id = int(cls_gt[i]) 212 | cx, cy, bw, bh = box_gt[i] 213 | x1 = int((cx - bw / 2) * img_size) 214 | y1 = int((cy - bh / 2) * img_size) 215 | x2 = int((cx + bw / 2) * img_size) 216 | y2 = int((cy + bh / 2) * img_size) 217 | img = cv2.rectangle(img, (x1, y1), (x2, y2), (0,0,255), 2) 218 | cls_name = VOC_CLASSES[cls_id] 219 | mess = '%s' % (cls_name) 220 | color = class_colors[cls_id] 221 | cv2.putText(img, mess, (int(x1), int(y1 - 5)), 0, 0.5, color, 1, lineType=cv2.LINE_AA) 222 | cv2.imshow('gt', img) 223 | cv2.waitKey(0) 224 | -------------------------------------------------------------------------------- /eval.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | import torch 5 | 6 | from evaluator.voc_evaluator import VOCAPIEvaluator 7 | from evaluator.coco_evaluator import COCOAPIEvaluator 8 | 9 | from data.transforms import ValTransforms 10 | 11 | from utils.misc import TestTimeAugmentation 12 | 13 | 14 | parser = argparse.ArgumentParser(description='FCOS-RT Evaluation') 15 | # basic 16 | parser.add_argument('-size', '--img_size', default=512, type=int, 17 | help='img_size') 18 | parser.add_argument('--cuda', action='store_true', default=False, 19 | help='Use cuda') 20 | # model 21 | parser.add_argument('-v', '--version', default='fcos_rt', 22 | help='fcos_rt') 23 | parser.add_argument('--trained_model', type=str, 24 | default='weights/', 25 | help='Trained state_dict file path to open') 26 | parser.add_argument('--conf_thresh', default=0.001, type=float, 27 | help='NMS threshold') 28 | parser.add_argument('--nms_thresh', default=0.6, type=float, 29 | help='NMS threshold') 30 | # dataset 31 | parser.add_argument('--root', default='/mnt/share/ssd2/dataset', 32 | help='data root') 33 | parser.add_argument('-d', '--dataset', default='coco-val', 34 | help='voc, coco-val, coco-test.') 35 | # TTA 36 | parser.add_argument('-tta', '--test_aug', action='store_true', default=False, 37 | help='use test augmentation.') 38 | 39 | args = parser.parse_args() 40 | 41 | 42 | def voc_test(model, data_dir, device, img_size): 43 | evaluator = VOCAPIEvaluator(data_root=data_dir, 44 | img_size=img_size, 45 | device=device, 46 | 
transform=ValTransforms(img_size), 47 | display=True 48 | ) 49 | 50 | # VOC evaluation 51 | evaluator.evaluate(model) 52 | 53 | 54 | def coco_test(model, data_dir, device, img_size, test=False): 55 | if test: 56 | # test-dev 57 | print('test on test-dev 2017') 58 | evaluator = COCOAPIEvaluator( 59 | data_dir=data_dir, 60 | img_size=img_size, 61 | device=device, 62 | testset=True, 63 | transform=ValTransforms(img_size) 64 | ) 65 | 66 | else: 67 | # eval 68 | evaluator = COCOAPIEvaluator( 69 | data_dir=data_dir, 70 | img_size=img_size, 71 | device=device, 72 | testset=False, 73 | transform=ValTransforms(img_size) 74 | ) 75 | 76 | # COCO evaluation 77 | evaluator.evaluate(model) 78 | 79 | 80 | if __name__ == '__main__': 81 | # dataset 82 | if args.dataset == 'voc': 83 | print('eval on voc ...') 84 | num_classes = 20 85 | data_dir = os.path.join(args.root, 'VOCdevkit') 86 | elif args.dataset == 'coco-val': 87 | print('eval on coco-val ...') 88 | num_classes = 80 89 | data_dir = os.path.join(args.root, 'COCO') 90 | elif args.dataset == 'coco-test': 91 | print('eval on coco-test-dev ...') 92 | num_classes = 80 93 | data_dir = os.path.join(args.root, 'COCO') 94 | else: 95 | print('unknow dataset !! we only support voc, coco-val, coco-test !!!') 96 | exit(0) 97 | 98 | # cuda 99 | if args.cuda: 100 | print('use cuda') 101 | torch.backends.cudnn.benchmark = True 102 | device = torch.device("cuda") 103 | else: 104 | device = torch.device("cpu") 105 | 106 | # input size 107 | input_size = args.input_size 108 | 109 | # model 110 | model_name = args.version 111 | print('Model: ', model_name) 112 | 113 | # load model and config file 114 | if model_name == 'fcos_rt': 115 | from models.fcos_rt import FCOS_RT 116 | backbone = args.backbone 117 | # model 118 | model = FCOS_RT(device=device, 119 | img_size=input_size, 120 | num_classes=num_classes, 121 | trainable=False, 122 | conf_thresh=args.conf_thresh, 123 | nms_thresh=args.nms_thresh, 124 | bk=backbone) 125 | else: 126 | print('Unknown model name...') 127 | exit(0) 128 | 129 | 130 | # load weight 131 | model.load_state_dict(torch.load(args.trained_model, map_location=device)) 132 | model.to(device).eval() 133 | print('Finished loading model!') 134 | 135 | # TTA 136 | test_aug = TestTimeAugmentation(num_classes=num_classes) if args.test_aug else None 137 | 138 | # evaluation 139 | with torch.no_grad(): 140 | if args.dataset == 'voc': 141 | voc_test(model, data_dir, device, args.img_size) 142 | elif args.dataset == 'coco-val': 143 | coco_test(model, data_dir, device, args.img_size, test=False) 144 | elif args.dataset == 'coco-test': 145 | coco_test(model, data_dir, device, args.img_size, test=True) 146 | -------------------------------------------------------------------------------- /evaluator/coco_evaluator.py: -------------------------------------------------------------------------------- 1 | import json 2 | import tempfile 3 | import torch 4 | from data.coco import * 5 | from pycocotools.cocoeval import COCOeval 6 | 7 | 8 | class COCOAPIEvaluator(): 9 | """ 10 | COCO AP Evaluation class. 11 | All the data in the val2017 dataset are processed \ 12 | and evaluated by COCO API. 13 | """ 14 | def __init__(self, data_dir, device, testset=False, transform=None): 15 | """ 16 | Args: 17 | data_dir (str): dataset root directory 18 | img_size (int): image size after preprocess. images are resized \ 19 | to squares whose shape is (img_size, img_size). 
20 | confthre (float): 21 | confidence threshold ranging from 0 to 1, \ 22 | which is defined in the config file. 23 | nmsthre (float): 24 | IoU threshold of non-max supression ranging from 0 to 1. 25 | """ 26 | self.testset = testset 27 | self.dataset = COCODataset( 28 | data_dir=data_dir, 29 | image_set='val2017' if not testset else 'test2017', 30 | transform=None) 31 | self.transform = transform 32 | self.device = device 33 | 34 | self.map = 0. 35 | self.ap50_95 = 0. 36 | self.ap50 = 0. 37 | 38 | def evaluate(self, model): 39 | """ 40 | COCO average precision (AP) Evaluation. Iterate inference on the test dataset 41 | and the results are evaluated by COCO API. 42 | Args: 43 | model : model object 44 | Returns: 45 | ap50_95 (float) : calculated COCO AP for IoU=50:95 46 | ap50 (float) : calculated COCO AP for IoU=50 47 | """ 48 | model.eval() 49 | ids = [] 50 | data_dict = [] 51 | num_images = len(self.dataset) 52 | print('total number of images: %d' % (num_images)) 53 | 54 | # start testing 55 | for index in range(num_images): # all the data in val2017 56 | if index % 500 == 0: 57 | print('[Eval: %d / %d]'%(index, num_images)) 58 | 59 | # load an image 60 | img, id_ = self.dataset.pull_image(index) 61 | h, w, _ = img.shape 62 | scale = np.array([[w, h, w, h]]) 63 | 64 | # preprocess 65 | x = self.transform(img)[0] 66 | x = x.unsqueeze(0).to(self.device) 67 | 68 | id_ = int(id_) 69 | ids.append(id_) 70 | # inference 71 | with torch.no_grad(): 72 | outputs = model(x) 73 | bboxes, scores, cls_inds = outputs 74 | # rescale 75 | bboxes *= scale 76 | 77 | for i, box in enumerate(bboxes): 78 | x1 = float(box[0]) 79 | y1 = float(box[1]) 80 | x2 = float(box[2]) 81 | y2 = float(box[3]) 82 | label = self.dataset.class_ids[int(cls_inds[i])] 83 | 84 | bbox = [x1, y1, x2 - x1, y2 - y1] 85 | score = float(scores[i]) # object score * class score 86 | A = {"image_id": id_, "category_id": label, "bbox": bbox, 87 | "score": score} # COCO json format 88 | data_dict.append(A) 89 | 90 | annType = ['segm', 'bbox', 'keypoints'] 91 | 92 | # Evaluate the Dt (detection) json comparing with the ground truth 93 | if len(data_dict) > 0: 94 | print('evaluating ......') 95 | cocoGt = self.dataset.coco 96 | # workaround: temporarily write data to json file because pycocotools can't process dict in py36. 
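# Each element of data_dict is a standard COCO detection-result record, which is
# what pycocotools' loadRes() consumes, e.g.
#   {"image_id": 139, "category_id": 62, "bbox": [x, y, w, h], "score": 0.87}
# where category_id is the original COCO id (1..90) recovered from
# self.dataset.class_ids and bbox is [xmin, ymin, width, height] in pixels of
# the original image.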
97 | if self.testset: 98 | json.dump(data_dict, open('coco_test-dev.json', 'w')) 99 | cocoDt = cocoGt.loadRes('coco_test-dev.json') 100 | return -1, -1 101 | else: 102 | _, tmp = tempfile.mkstemp() 103 | json.dump(data_dict, open(tmp, 'w')) 104 | cocoDt = cocoGt.loadRes(tmp) 105 | cocoEval = COCOeval(self.dataset.coco, cocoDt, annType[1]) 106 | cocoEval.params.imgIds = ids 107 | cocoEval.evaluate() 108 | cocoEval.accumulate() 109 | cocoEval.summarize() 110 | 111 | ap50_95, ap50 = cocoEval.stats[0], cocoEval.stats[1] 112 | print('ap50_95 : ', ap50_95) 113 | print('ap50 : ', ap50) 114 | self.map = ap50_95 115 | self.ap50_95 = ap50_95 116 | self.ap50 = ap50 117 | 118 | return ap50, ap50_95 119 | else: 120 | return 0, 0 121 | 122 | -------------------------------------------------------------------------------- /evaluator/voc_evaluator.py: -------------------------------------------------------------------------------- 1 | """Adapted from: 2 | @longcw faster_rcnn_pytorch: https://github.com/longcw/faster_rcnn_pytorch 3 | @rbgirshick py-faster-rcnn https://github.com/rbgirshick/py-faster-rcnn 4 | Licensed under The MIT License [see LICENSE for details] 5 | """ 6 | 7 | from data.voc import VOCDetection, VOC_CLASSES 8 | import sys 9 | import os 10 | import time 11 | import numpy as np 12 | import pickle 13 | import xml.etree.ElementTree as ET 14 | 15 | 16 | class VOCAPIEvaluator(): 17 | """ VOC AP Evaluation class """ 18 | def __init__(self, 19 | data_dir, 20 | device, 21 | transform, 22 | set_type='test', 23 | year='2007', 24 | display=False): 25 | self.data_dir = data_dir 26 | self.device = device 27 | self.transform = transform 28 | self.labelmap = VOC_CLASSES 29 | self.set_type = set_type 30 | self.year = year 31 | self.display = display 32 | 33 | # path 34 | self.devkit_path = os.path.join(data_dir, 'VOC' + year) 35 | self.annopath = os.path.join(data_dir, 'VOC2007', 'Annotations', '%s.xml') 36 | self.imgpath = os.path.join(data_dir, 'VOC2007', 'JPEGImages', '%s.jpg') 37 | self.imgsetpath = os.path.join(data_dir, 'VOC2007', 'ImageSets', 'Main', set_type+'.txt') 38 | self.output_dir = self.get_output_dir('voc_eval/', self.set_type) 39 | 40 | # dataset 41 | self.dataset = VOCDetection(data_dir=data_dir, 42 | image_sets=[('2007', set_type)], 43 | transform=transform) 44 | 45 | def evaluate(self, net): 46 | net.eval() 47 | num_images = len(self.dataset) 48 | # all detections are collected into: 49 | # all_boxes[cls][image] = N x 5 array of detections in 50 | # (x1, y1, x2, y2, score) 51 | self.all_boxes = [[[] for _ in range(num_images)] 52 | for _ in range(len(self.labelmap))] 53 | 54 | # timers 55 | det_file = os.path.join(self.output_dir, 'detections.pkl') 56 | 57 | for i in range(num_images): 58 | im, _ = self.dataset.pull_image(i) 59 | h, w, _ = im.shape 60 | scale = np.array([[w, h, w, h]]) 61 | 62 | # preprocess 63 | x = self.transform(im)[0] 64 | x = x.unsqueeze(0).to(self.device) 65 | 66 | t0 = time.time() 67 | # forward 68 | bboxes, scores, cls_inds = net(x) 69 | detect_time = time.time() - t0 70 | # rescale 71 | bboxes *= scale 72 | 73 | for j in range(len(self.labelmap)): 74 | inds = np.where(cls_inds == j)[0] 75 | if len(inds) == 0: 76 | self.all_boxes[j][i] = np.empty([0, 5], dtype=np.float32) 77 | continue 78 | c_bboxes = bboxes[inds] 79 | c_scores = scores[inds] 80 | c_dets = np.hstack((c_bboxes, 81 | c_scores[:, np.newaxis])).astype(np.float32, 82 | copy=False) 83 | self.all_boxes[j][i] = c_dets 84 | 85 | if i % 500 == 0: 86 | print('im_detect: {:d}/{:d} {:.3f}s'.format(i + 1, 
num_images, detect_time)) 87 | 88 | with open(det_file, 'wb') as f: 89 | pickle.dump(self.all_boxes, f, pickle.HIGHEST_PROTOCOL) 90 | 91 | print('Evaluating detections') 92 | self.evaluate_detections(self.all_boxes) 93 | 94 | print('Mean AP: ', self.map) 95 | 96 | 97 | def parse_rec(self, filename): 98 | """ Parse a PASCAL VOC xml file """ 99 | tree = ET.parse(filename) 100 | objects = [] 101 | for obj in tree.findall('object'): 102 | obj_struct = {} 103 | obj_struct['name'] = obj.find('name').text 104 | obj_struct['pose'] = obj.find('pose').text 105 | obj_struct['truncated'] = int(obj.find('truncated').text) 106 | obj_struct['difficult'] = int(obj.find('difficult').text) 107 | bbox = obj.find('bndbox') 108 | obj_struct['bbox'] = [int(bbox.find('xmin').text), 109 | int(bbox.find('ymin').text), 110 | int(bbox.find('xmax').text), 111 | int(bbox.find('ymax').text)] 112 | objects.append(obj_struct) 113 | 114 | return objects 115 | 116 | 117 | def get_output_dir(self, name, phase): 118 | """Return the directory where experimental artifacts are placed. 119 | If the directory does not exist, it is created. 120 | A canonical path is built using the name from an imdb and a network 121 | (if not None). 122 | """ 123 | filedir = os.path.join(name, phase) 124 | if not os.path.exists(filedir): 125 | os.makedirs(filedir) 126 | return filedir 127 | 128 | 129 | def get_voc_results_file_template(self, cls): 130 | # VOCdevkit/VOC2007/results/det_test_aeroplane.txt 131 | filename = 'det_' + self.set_type + '_%s.txt' % (cls) 132 | filedir = os.path.join(self.devkit_path, 'results') 133 | if not os.path.exists(filedir): 134 | os.makedirs(filedir) 135 | path = os.path.join(filedir, filename) 136 | return path 137 | 138 | 139 | def write_voc_results_file(self, all_boxes): 140 | for cls_ind, cls in enumerate(self.labelmap): 141 | if self.display: 142 | print('Writing {:s} VOC results file'.format(cls)) 143 | filename = self.get_voc_results_file_template(cls) 144 | with open(filename, 'wt') as f: 145 | for im_ind, index in enumerate(self.dataset.ids): 146 | dets = all_boxes[cls_ind][im_ind] 147 | if dets == []: 148 | continue 149 | # the VOCdevkit expects 1-based indices 150 | for k in range(dets.shape[0]): 151 | f.write('{:s} {:.3f} {:.1f} {:.1f} {:.1f} {:.1f}\n'. 152 | format(index[1], dets[k, -1], 153 | dets[k, 0] + 1, dets[k, 1] + 1, 154 | dets[k, 2] + 1, dets[k, 3] + 1)) 155 | 156 | 157 | def do_python_eval(self, use_07=True): 158 | cachedir = os.path.join(self.devkit_path, 'annotations_cache') 159 | aps = [] 160 | # The PASCAL VOC metric changed in 2010 161 | use_07_metric = use_07 162 | print('VOC07 metric? 
' + ('Yes' if use_07_metric else 'No')) 163 | if not os.path.isdir(self.output_dir): 164 | os.mkdir(self.output_dir) 165 | for i, cls in enumerate(self.labelmap): 166 | filename = self.get_voc_results_file_template(cls) 167 | rec, prec, ap = self.voc_eval(detpath=filename, 168 | classname=cls, 169 | cachedir=cachedir, 170 | ovthresh=0.5, 171 | use_07_metric=use_07_metric 172 | ) 173 | aps += [ap] 174 | print('AP for {} = {:.4f}'.format(cls, ap)) 175 | with open(os.path.join(self.output_dir, cls + '_pr.pkl'), 'wb') as f: 176 | pickle.dump({'rec': rec, 'prec': prec, 'ap': ap}, f) 177 | if self.display: 178 | self.map = np.mean(aps) 179 | print('Mean AP = {:.4f}'.format(np.mean(aps))) 180 | print('~~~~~~~~') 181 | print('Results:') 182 | for ap in aps: 183 | print('{:.3f}'.format(ap)) 184 | print('{:.3f}'.format(np.mean(aps))) 185 | print('~~~~~~~~') 186 | print('') 187 | print('--------------------------------------------------------------') 188 | print('Results computed with the **unofficial** Python eval code.') 189 | print('Results should be very close to the official MATLAB eval code.') 190 | print('--------------------------------------------------------------') 191 | else: 192 | self.map = np.mean(aps) 193 | print('Mean AP = {:.4f}'.format(np.mean(aps))) 194 | 195 | 196 | def voc_ap(self, rec, prec, use_07_metric=True): 197 | """ ap = voc_ap(rec, prec, [use_07_metric]) 198 | Compute VOC AP given precision and recall. 199 | If use_07_metric is true, uses the 200 | VOC 07 11 point method (default:True). 201 | """ 202 | if use_07_metric: 203 | # 11 point metric 204 | ap = 0. 205 | for t in np.arange(0., 1.1, 0.1): 206 | if np.sum(rec >= t) == 0: 207 | p = 0 208 | else: 209 | p = np.max(prec[rec >= t]) 210 | ap = ap + p / 11. 211 | else: 212 | # correct AP calculation 213 | # first append sentinel values at the end 214 | mrec = np.concatenate(([0.], rec, [1.])) 215 | mpre = np.concatenate(([0.], prec, [0.])) 216 | 217 | # compute the precision envelope 218 | for i in range(mpre.size - 1, 0, -1): 219 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 220 | 221 | # to calculate area under PR curve, look for points 222 | # where X axis (recall) changes value 223 | i = np.where(mrec[1:] != mrec[:-1])[0] 224 | 225 | # and sum (\Delta recall) * prec 226 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 227 | return ap 228 | 229 | 230 | def voc_eval(self, detpath, classname, cachedir, ovthresh=0.5, use_07_metric=True): 231 | if not os.path.isdir(cachedir): 232 | os.mkdir(cachedir) 233 | cachefile = os.path.join(cachedir, 'annots.pkl') 234 | # read list of images 235 | with open(self.imgsetpath, 'r') as f: 236 | lines = f.readlines() 237 | imagenames = [x.strip() for x in lines] 238 | if not os.path.isfile(cachefile): 239 | # load annots 240 | recs = {} 241 | for i, imagename in enumerate(imagenames): 242 | recs[imagename] = self.parse_rec(self.annopath % (imagename)) 243 | if i % 100 == 0 and self.display: 244 | print('Reading annotation for {:d}/{:d}'.format( 245 | i + 1, len(imagenames))) 246 | # save 247 | if self.display: 248 | print('Saving cached annotations to {:s}'.format(cachefile)) 249 | with open(cachefile, 'wb') as f: 250 | pickle.dump(recs, f) 251 | else: 252 | # load 253 | with open(cachefile, 'rb') as f: 254 | recs = pickle.load(f) 255 | 256 | # extract gt objects for this class 257 | class_recs = {} 258 | npos = 0 259 | for imagename in imagenames: 260 | R = [obj for obj in recs[imagename] if obj['name'] == classname] 261 | bbox = np.array([x['bbox'] for x in R]) 262 | difficult 
= np.array([x['difficult'] for x in R]).astype(np.bool) 263 | det = [False] * len(R) 264 | npos = npos + sum(~difficult) 265 | class_recs[imagename] = {'bbox': bbox, 266 | 'difficult': difficult, 267 | 'det': det} 268 | 269 | # read dets 270 | detfile = detpath.format(classname) 271 | with open(detfile, 'r') as f: 272 | lines = f.readlines() 273 | if any(lines) == 1: 274 | 275 | splitlines = [x.strip().split(' ') for x in lines] 276 | image_ids = [x[0] for x in splitlines] 277 | confidence = np.array([float(x[1]) for x in splitlines]) 278 | BB = np.array([[float(z) for z in x[2:]] for x in splitlines]) 279 | 280 | # sort by confidence 281 | sorted_ind = np.argsort(-confidence) 282 | sorted_scores = np.sort(-confidence) 283 | BB = BB[sorted_ind, :] 284 | image_ids = [image_ids[x] for x in sorted_ind] 285 | 286 | # go down dets and mark TPs and FPs 287 | nd = len(image_ids) 288 | tp = np.zeros(nd) 289 | fp = np.zeros(nd) 290 | for d in range(nd): 291 | R = class_recs[image_ids[d]] 292 | bb = BB[d, :].astype(float) 293 | ovmax = -np.inf 294 | BBGT = R['bbox'].astype(float) 295 | if BBGT.size > 0: 296 | # compute overlaps 297 | # intersection 298 | ixmin = np.maximum(BBGT[:, 0], bb[0]) 299 | iymin = np.maximum(BBGT[:, 1], bb[1]) 300 | ixmax = np.minimum(BBGT[:, 2], bb[2]) 301 | iymax = np.minimum(BBGT[:, 3], bb[3]) 302 | iw = np.maximum(ixmax - ixmin, 0.) 303 | ih = np.maximum(iymax - iymin, 0.) 304 | inters = iw * ih 305 | uni = ((bb[2] - bb[0]) * (bb[3] - bb[1]) + 306 | (BBGT[:, 2] - BBGT[:, 0]) * 307 | (BBGT[:, 3] - BBGT[:, 1]) - inters) 308 | overlaps = inters / uni 309 | ovmax = np.max(overlaps) 310 | jmax = np.argmax(overlaps) 311 | 312 | if ovmax > ovthresh: 313 | if not R['difficult'][jmax]: 314 | if not R['det'][jmax]: 315 | tp[d] = 1. 316 | R['det'][jmax] = 1 317 | else: 318 | fp[d] = 1. 319 | else: 320 | fp[d] = 1. 321 | 322 | # compute precision recall 323 | fp = np.cumsum(fp) 324 | tp = np.cumsum(tp) 325 | rec = tp / float(npos) 326 | # avoid divide by zero in case the first detection matches a difficult 327 | # ground truth 328 | prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) 329 | ap = self.voc_ap(rec, prec, use_07_metric) 330 | else: 331 | rec = -1. 332 | prec = -1. 333 | ap = -1. 
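# Recap of the matching above: detections are visited in descending score order;
# a detection is a true positive when its best IoU with a non-difficult,
# not-yet-matched ground-truth box exceeds ovthresh, a duplicate match or an IoU
# below the threshold counts as a false positive, and matches to "difficult"
# objects are ignored. rec/prec are the cumulative recall/precision curves, and
# voc_ap() reduces them to a single AP: the VOC07 metric averages the precision
# sampled at the 11 recall points 0.0, 0.1, ..., 1.0, while the newer metric
# integrates the area under the monotone precision envelope.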
334 | 335 | return rec, prec, ap 336 | 337 | 338 | def evaluate_detections(self, box_list): 339 | self.write_voc_results_file(box_list) 340 | self.do_python_eval() 341 | 342 | 343 | if __name__ == '__main__': 344 | pass -------------------------------------------------------------------------------- /models/conv.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | 3 | 4 | class Conv(nn.Module): 5 | def __init__(self, in_ch, out_ch, k=1, p=0, s=1, d=1, g=1, act=True, bias=False): 6 | super(Conv, self).__init__() 7 | if act: 8 | self.convs = nn.Sequential( 9 | nn.Conv2d(in_ch, out_ch, k, stride=s, padding=p, dilation=d, groups=g, bias=bias), 10 | nn.BatchNorm2d(out_ch), 11 | nn.ReLU(inplace=True) 12 | ) 13 | else: 14 | self.convs = nn.Sequential( 15 | nn.Conv2d(in_ch, out_ch, k, stride=s, padding=p, dilation=d, groups=g, bias=bias), 16 | nn.BatchNorm2d(out_ch) 17 | ) 18 | self.init_weight() 19 | 20 | def init_weight(self): 21 | for m in self.modules(): 22 | if isinstance(m, nn.Conv2d): 23 | nn.init.normal_(m.weight, mean=0, std=0.01) 24 | if hasattr(m, 'bias') and m.bias is not None: 25 | nn.init.constant_(m.bias, 0) 26 | 27 | def forward(self, x): 28 | return self.convs(x) 29 | -------------------------------------------------------------------------------- /models/fcos.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | 6 | from .resnet import build_backbone 7 | from .conv import Conv 8 | 9 | from utils import box_ops 10 | from utils import loss 11 | 12 | 13 | class FCOS(nn.Module): 14 | def __init__(self, 15 | device, 16 | img_size, 17 | num_classes=80, 18 | trainable=False, 19 | conf_thresh=0.05, 20 | nms_thresh=0.5, 21 | bk='r18', 22 | freeze_bn=False): 23 | super(FCOS, self).__init__() 24 | self.device = device 25 | self.img_size = img_size 26 | self.num_classes = num_classes 27 | self.trainable = trainable 28 | self.conf_thresh = conf_thresh 29 | self.nms_thresh = nms_thresh 30 | self.freeze_bn = freeze_bn 31 | self.strides = [8, 16, 32, 64, 128] 32 | self.grid_cell = self.create_grid(img_size) 33 | 34 | # backbone 35 | self.backbone, feature_channels = build_backbone(pretrained=trainable, freeze=trainable, model=bk) 36 | c3, c4, c5 = feature_channels 37 | 38 | # latter layers 39 | self.latter_1 = nn.Conv2d(c3, 256, kernel_size=1) 40 | self.latter_2 = nn.Conv2d(c4, 256, kernel_size=1) 41 | self.latter_3 = nn.Conv2d(c5, 256, kernel_size=1) 42 | self.latter_4 = nn.Conv2d(256, 256, kernel_size=3, padding=1, stride=2) 43 | self.latter_5 = nn.Conv2d(256, 256, kernel_size=3, padding=1, stride=2) 44 | 45 | # smooth layers 46 | self.smooth_1 = nn.Conv2d(256, 256, kernel_size=3, padding=1) 47 | self.smooth_2 = nn.Conv2d(256, 256, kernel_size=3, padding=1) 48 | self.smooth_3 = nn.Conv2d(256, 256, kernel_size=3, padding=1) 49 | 50 | # head 51 | self.cls_head = nn.Sequential( 52 | Conv(256, 256, k=3, p=1), 53 | Conv(256, 256, k=3, p=1), 54 | Conv(256, 256, k=3, p=1), 55 | Conv(256, 256, k=3, p=1) 56 | ) 57 | self.reg_head = nn.Sequential( 58 | Conv(256, 256, k=3, p=1), 59 | Conv(256, 256, k=3, p=1), 60 | Conv(256, 256, k=3, p=1), 61 | Conv(256, 256, k=3, p=1) 62 | ) 63 | 64 | # det 65 | self.cls_det = nn.Conv2d(256, self.num_classes, kernel_size=1) 66 | self.reg_det = nn.Conv2d(256, 4, kernel_size=1) 67 | self.ctn_det = nn.Conv2d(256, 1, kernel_size=1) 68 | 69 | # init weight 70 | self.init_weight() 
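# Layout of the model built above: latter_1..latter_3 are the 1x1 lateral convs
# of a standard FPN over the backbone's C3-C5 features, smooth_1..smooth_3 are
# the 3x3 output convs, and latter_4/latter_5 are stride-2 3x3 convs that add P6
# and P7, so the five pyramid levels P3-P7 line up with
# self.strides = [8, 16, 32, 64, 128]. cls_head and reg_head are stacks of four
# 3x3 conv blocks shared across all levels; cls_det, reg_det and ctn_det are 1x1
# convs producing class scores, the 4-d box offsets and centerness. In forward()
# a box is decoded per grid cell as
#   x1y1 = (cell_xy - exp(offsets[:2])) * stride
#   x2y2 = (cell_xy + exp(offsets[2:])) * stride
# e.g. a cell at (10, 8) on the stride-8 level with exp(offsets) = [3, 2, 4, 5]
# decodes to the box (56, 48, 112, 104) in input-image pixels.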
71 | 72 | 73 | def init_weight(self): 74 | for m in [self.latter_1, self.latter_2, self.latter_3, self.latter_4, self.latter_5]: 75 | if isinstance(m, nn.Conv2d): 76 | nn.init.normal_(m.weight, mean=0, std=0.01) 77 | if hasattr(m, 'bias') and m.bias is not None: 78 | nn.init.constant_(m.bias, 0) 79 | 80 | for m in [self.smooth_1, self.smooth_2, self.smooth_3]: 81 | if isinstance(m, nn.Conv2d): 82 | nn.init.normal_(m.weight, mean=0, std=0.01) 83 | if hasattr(m, 'bias') and m.bias is not None: 84 | nn.init.constant_(m.bias, 0) 85 | 86 | # init weight of cls_pred 87 | init_prob = 0.01 88 | bias_value = -torch.log(torch.tensor((1. - init_prob) / init_prob)) 89 | nn.init.constant_(self.cls_det.bias, bias_value) 90 | 91 | 92 | def create_grid(self, img_size): 93 | total_grid_xy = [] 94 | w = h = img_size 95 | for s in self.strides: 96 | # generate grid cells 97 | ws, hs = w // s, h // s 98 | grid_y, grid_x = torch.meshgrid([torch.arange(hs), torch.arange(ws)]) 99 | # [H, W, 2] -> [HW, 2] 100 | grid_xy = torch.stack([grid_x, grid_y], dim=-1).float().view(-1, 2) 101 | # [1, H*W, 2] 102 | grid_xy = grid_xy[None, :, :].to(self.device) 103 | 104 | total_grid_xy.append(grid_xy) 105 | 106 | return total_grid_xy 107 | 108 | 109 | def set_grid(self, img_size): 110 | self.img_size = img_size 111 | self.grid_cell = self.create_grid(img_size) 112 | 113 | 114 | def nms(self, dets, scores): 115 | """"Pure Python NMS baseline.""" 116 | x1 = dets[:, 0] #xmin 117 | y1 = dets[:, 1] #ymin 118 | x2 = dets[:, 2] #xmax 119 | y2 = dets[:, 3] #ymax 120 | 121 | areas = (x2 - x1) * (y2 - y1) 122 | order = scores.argsort()[::-1] 123 | 124 | keep = [] 125 | while order.size > 0: 126 | i = order[0] 127 | keep.append(i) 128 | # compute iou 129 | xx1 = np.maximum(x1[i], x1[order[1:]]) 130 | yy1 = np.maximum(y1[i], y1[order[1:]]) 131 | xx2 = np.minimum(x2[i], x2[order[1:]]) 132 | yy2 = np.minimum(y2[i], y2[order[1:]]) 133 | 134 | w = np.maximum(1e-28, xx2 - xx1) 135 | h = np.maximum(1e-28, yy2 - yy1) 136 | inter = w * h 137 | 138 | ovr = inter / (areas[i] + areas[order[1:]] - inter + 1e-10) 139 | #reserve all the boundingbox whose ovr less than thresh 140 | inds = np.where(ovr <= self.nms_thresh)[0] 141 | order = order[inds + 1] 142 | 143 | return keep 144 | 145 | 146 | def postprocess(self, bboxes, scores): 147 | """ 148 | bboxes: (HxW, 4), bsize = 1 149 | scores: (HxW, num_classes), bsize = 1 150 | """ 151 | 152 | cls_inds = np.argmax(scores, axis=1) 153 | scores = scores[(np.arange(scores.shape[0]), cls_inds)] 154 | 155 | # threshold 156 | keep = np.where(scores >= self.conf_thresh) 157 | bboxes = bboxes[keep] 158 | scores = scores[keep] 159 | cls_inds = cls_inds[keep] 160 | 161 | # NMS 162 | keep = np.zeros(len(bboxes), dtype=np.int) 163 | for i in range(self.num_classes): 164 | inds = np.where(cls_inds == i)[0] 165 | if len(inds) == 0: 166 | continue 167 | c_bboxes = bboxes[inds] 168 | c_scores = scores[inds] 169 | c_keep = self.nms(c_bboxes, c_scores) 170 | keep[inds[c_keep]] = 1 171 | 172 | keep = np.where(keep > 0) 173 | bboxes = bboxes[keep] 174 | scores = scores[keep] 175 | cls_inds = cls_inds[keep] 176 | 177 | return bboxes, scores, cls_inds 178 | 179 | 180 | def forward(self, x, targets=None): 181 | B = x.size(0) 182 | C = self.num_classes 183 | # backbone 184 | c3, c4, c5 = self.backbone(x) 185 | 186 | # fpn 187 | p5 = self.latter_3(c5) 188 | p5_up = F.interpolate(p5, scale_factor=2) 189 | p5 = self.smooth_3(p5) 190 | 191 | p4 = self.latter_2(c4) + p5_up 192 | p4_up = F.interpolate(p4, scale_factor=2) 193 | p4 = 
self.smooth_2(p4) 194 | 195 | p3 = self.smooth_1(self.latter_1(c3) + p4_up) 196 | # p5 -> p6, p6 -> p7 197 | p6 = self.latter_4(p5) 198 | p7 = self.latter_5(p6) 199 | 200 | features = [p3, p4, p5, p6, p7] 201 | 202 | cls_pred = [] 203 | reg_pred = [] 204 | ctn_pred = [] 205 | # head 206 | for i, p in enumerate(features): 207 | cls_feat = self.cls_head(p) 208 | reg_feat = self.reg_head(p) 209 | # [B, C, H, W] -> [B, H*W, C] 210 | cls_pred_i = self.cls_det(cls_feat).permute(0, 2, 3, 1).reshape(B, -1, C) 211 | # [B, 4, H, W] -> [B, H*W, 4] 212 | reg_pred_i = self.reg_det(reg_feat).permute(0, 2, 3, 1).reshape(B, -1, 4) 213 | x1y1_pred_i = (self.grid_cell[i] - reg_pred_i[..., :2].exp()) * self.strides[i] # x1y1 214 | x2y2_pred_i = (self.grid_cell[i] + reg_pred_i[..., 2:].exp()) * self.strides[i] # x2y2 215 | box_pred_i = torch.cat([x1y1_pred_i, x2y2_pred_i], dim=-1) 216 | # [B, 1, H, W] -> [B, H*W, 1] 217 | ctn_det_i = self.ctn_det(reg_feat).permute(0, 2, 3, 1).reshape(B, -1, 1) 218 | 219 | cls_pred.append(cls_pred_i) 220 | reg_pred.append(box_pred_i) 221 | ctn_pred.append(ctn_det_i) 222 | 223 | cls_pred = torch.cat(cls_pred, dim=1) # [B, N, C] 224 | reg_pred = torch.cat(reg_pred, dim=1) # [B, N, 4] 225 | ctn_pred = torch.cat(ctn_pred, dim=1) # [B, N, 1] 226 | 227 | # train 228 | if self.trainable: 229 | # compute giou between pred bboxes and gt bboxes 230 | x1y1x2y2_pred = (reg_pred / self.img_size).reshape(-1, 4) 231 | x1y1x2y2_gt = targets[:, :, -5:-1].reshape(-1, 4) 232 | 233 | # giou 234 | giou_pred = box_ops.giou_score(x1y1x2y2_pred, x1y1x2y2_gt, batch_size=B) 235 | 236 | # compute loss 237 | cls_loss, reg_loss, ctn_loss, total_loss = loss.loss( 238 | pred_cls=cls_pred, 239 | pred_giou=giou_pred, 240 | pred_ctn=ctn_pred, 241 | label=targets, 242 | num_classes=self.num_classes 243 | ) 244 | 245 | return cls_loss, reg_loss, ctn_loss, total_loss 246 | 247 | # test 248 | else: 249 | with torch.no_grad(): 250 | # batch size = 1 251 | scores = torch.sqrt(cls_pred.sigmoid() * ctn_pred.sigmoid())[0] 252 | bboxes = torch.clamp(reg_pred / self.img_size, 0, 1)[0] 253 | 254 | # to cpu 255 | scores = scores.cpu().numpy() 256 | bboxes = bboxes.cpu().numpy() 257 | 258 | # postprocess 259 | bboxes, scores, cls_inds = self.postprocess(bboxes, scores) 260 | 261 | return bboxes, scores, cls_inds 262 | -------------------------------------------------------------------------------- /models/fcos_rt.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | 6 | from .resnet import build_backbone 7 | from .conv import Conv 8 | 9 | from utils import box_ops 10 | from utils import loss 11 | 12 | 13 | class FCOS_RT(nn.Module): 14 | def __init__(self, 15 | device, 16 | img_size=640, 17 | num_classes=80, 18 | trainable=False, 19 | conf_thresh=0.03, 20 | nms_thresh=0.6, 21 | bk='r18'): 22 | super(FCOS_RT, self).__init__() 23 | self.device = device 24 | self.img_size = img_size 25 | self.num_classes = num_classes 26 | self.trainable = trainable 27 | self.conf_thresh = conf_thresh 28 | self.nms_thresh = nms_thresh 29 | self.strides = [8, 16, 32] 30 | self.grid_cell = self.create_grid(img_size) 31 | 32 | # backbone 33 | self.backbone, feature_channels = build_backbone(pretrained=trainable, freeze=trainable, model=bk) 34 | c3, c4, c5 = feature_channels 35 | 36 | # latter layers 37 | self.latter_1 = nn.Conv2d(c3, 256, kernel_size=1) 38 | self.latter_2 = nn.Conv2d(c4, 256, kernel_size=1) 39 | 
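# Only three lateral 1x1 convs are built here: FCOS_RT keeps the P3-P5 pyramid
# (self.strides = [8, 16, 32]) and, unlike FCOS above, has no latter_4/latter_5
# convs and hence no P6/P7 levels; the heads and the exp-based box decoding are
# otherwise identical, which is what makes this the lighter real-time variant.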
self.latter_3 = nn.Conv2d(c5, 256, kernel_size=1) 40 | 41 | # smooth layers 42 | self.smooth_1 = nn.Conv2d(256, 256, kernel_size=3, padding=1) 43 | self.smooth_2 = nn.Conv2d(256, 256, kernel_size=3, padding=1) 44 | self.smooth_3 = nn.Conv2d(256, 256, kernel_size=3, padding=1) 45 | 46 | # head 47 | self.cls_head = nn.Sequential( 48 | Conv(256, 256, k=3, p=1), 49 | Conv(256, 256, k=3, p=1), 50 | Conv(256, 256, k=3, p=1), 51 | Conv(256, 256, k=3, p=1) 52 | ) 53 | self.reg_head = nn.Sequential( 54 | Conv(256, 256, k=3, p=1), 55 | Conv(256, 256, k=3, p=1), 56 | Conv(256, 256, k=3, p=1), 57 | Conv(256, 256, k=3, p=1) 58 | ) 59 | 60 | # det 61 | self.cls_det = nn.Conv2d(256, self.num_classes, kernel_size=1) 62 | self.reg_det = nn.Conv2d(256, 4, kernel_size=1) 63 | self.ctn_det = nn.Conv2d(256, 1, kernel_size=1) 64 | 65 | # init weight 66 | self.init_weight() 67 | 68 | 69 | def init_weight(self): 70 | for m in [self.latter_1, self.latter_2, self.latter_3]: 71 | if isinstance(m, nn.Conv2d): 72 | nn.init.normal_(m.weight, mean=0, std=0.01) 73 | if hasattr(m, 'bias') and m.bias is not None: 74 | nn.init.constant_(m.bias, 0) 75 | 76 | for m in [self.smooth_1, self.smooth_2, self.smooth_3]: 77 | if isinstance(m, nn.Conv2d): 78 | nn.init.normal_(m.weight, mean=0, std=0.01) 79 | if hasattr(m, 'bias') and m.bias is not None: 80 | nn.init.constant_(m.bias, 0) 81 | 82 | # init weight of cls_pred 83 | init_prob = 0.01 84 | bias_value = -torch.log(torch.tensor((1. - init_prob) / init_prob)) 85 | nn.init.constant_(self.cls_det.bias, bias_value) 86 | 87 | 88 | def create_grid(self, img_size): 89 | total_grid_xy = [] 90 | w = h = img_size 91 | for s in self.strides: 92 | # generate grid cells 93 | ws, hs = w // s, h // s 94 | grid_y, grid_x = torch.meshgrid([torch.arange(hs), torch.arange(ws)]) 95 | # [H, W, 2] -> [HW, 2] 96 | grid_xy = torch.stack([grid_x, grid_y], dim=-1).float().view(-1, 2) 97 | # [1, H*W, 2] 98 | grid_xy = grid_xy[None, :, :].to(self.device) 99 | 100 | total_grid_xy.append(grid_xy) 101 | 102 | return total_grid_xy 103 | 104 | 105 | def set_grid(self, img_size): 106 | self.img_size = img_size 107 | self.grid_cell = self.create_grid(img_size) 108 | 109 | 110 | def nms(self, dets, scores): 111 | """"Pure Python NMS baseline.""" 112 | x1 = dets[:, 0] #xmin 113 | y1 = dets[:, 1] #ymin 114 | x2 = dets[:, 2] #xmax 115 | y2 = dets[:, 3] #ymax 116 | 117 | areas = (x2 - x1) * (y2 - y1) 118 | order = scores.argsort()[::-1] 119 | 120 | keep = [] 121 | while order.size > 0: 122 | i = order[0] 123 | keep.append(i) 124 | # compute iou 125 | xx1 = np.maximum(x1[i], x1[order[1:]]) 126 | yy1 = np.maximum(y1[i], y1[order[1:]]) 127 | xx2 = np.minimum(x2[i], x2[order[1:]]) 128 | yy2 = np.minimum(y2[i], y2[order[1:]]) 129 | 130 | w = np.maximum(1e-28, xx2 - xx1) 131 | h = np.maximum(1e-28, yy2 - yy1) 132 | inter = w * h 133 | 134 | ovr = inter / (areas[i] + areas[order[1:]] - inter + 1e-10) 135 | #reserve all the boundingbox whose ovr less than thresh 136 | inds = np.where(ovr <= self.nms_thresh)[0] 137 | order = order[inds + 1] 138 | 139 | return keep 140 | 141 | 142 | def postprocess(self, bboxes, scores): 143 | """ 144 | bboxes: (HxW, 4), bsize = 1 145 | scores: (HxW, num_classes), bsize = 1 146 | """ 147 | 148 | cls_inds = np.argmax(scores, axis=1) 149 | scores = scores[(np.arange(scores.shape[0]), cls_inds)] 150 | 151 | # threshold 152 | keep = np.where(scores >= self.conf_thresh) 153 | bboxes = bboxes[keep] 154 | scores = scores[keep] 155 | cls_inds = cls_inds[keep] 156 | 157 | # NMS 158 | keep = 
np.zeros(len(bboxes), dtype=np.int) 159 | for i in range(self.num_classes): 160 | inds = np.where(cls_inds == i)[0] 161 | if len(inds) == 0: 162 | continue 163 | c_bboxes = bboxes[inds] 164 | c_scores = scores[inds] 165 | c_keep = self.nms(c_bboxes, c_scores) 166 | keep[inds[c_keep]] = 1 167 | 168 | keep = np.where(keep > 0) 169 | bboxes = bboxes[keep] 170 | scores = scores[keep] 171 | cls_inds = cls_inds[keep] 172 | 173 | return bboxes, scores, cls_inds 174 | 175 | 176 | def forward(self, x, targets=None): 177 | B = x.size(0) 178 | C = self.num_classes 179 | # backbone 180 | c3, c4, c5 = self.backbone(x) 181 | 182 | # fpn 183 | p5 = self.latter_3(c5) 184 | p5_up = F.interpolate(p5, scale_factor=2) 185 | p5 = self.smooth_3(p5) 186 | 187 | p4 = self.latter_2(c4) + p5_up 188 | p4_up = F.interpolate(p4, scale_factor=2) 189 | p4 = self.smooth_2(p4) 190 | 191 | p3 = self.smooth_1(self.latter_1(c3) + p4_up) 192 | 193 | features = [p3, p4, p5] 194 | 195 | cls_pred = [] 196 | reg_pred = [] 197 | ctn_pred = [] 198 | # head 199 | for i, p in enumerate(features): 200 | cls_feat = self.cls_head(p) 201 | reg_feat = self.reg_head(p) 202 | # [B, C, H, W] -> [B, H*W, C] 203 | cls_pred_i = self.cls_det(cls_feat).permute(0, 2, 3, 1).contiguous().view(B, -1, C) 204 | # [B, 4, H, W] -> [B, H*W, 4] 205 | reg_pred_i = self.reg_det(reg_feat).permute(0, 2, 3, 1).contiguous().view(B, -1, 4) 206 | x1y1_pred_i = (self.grid_cell[i] - reg_pred_i[..., :2].exp()) * self.strides[i] # x1y1 207 | x2y2_pred_i = (self.grid_cell[i] + reg_pred_i[..., 2:].exp()) * self.strides[i] # x2y2 208 | box_pred_i = torch.cat([x1y1_pred_i, x2y2_pred_i], dim=-1) 209 | # [B, 1, H, W] -> [B, H*W, 1] 210 | ctn_det_i = self.ctn_det(reg_feat).permute(0, 2, 3, 1).contiguous().view(B, -1, 1) 211 | 212 | cls_pred.append(cls_pred_i) 213 | reg_pred.append(box_pred_i) 214 | ctn_pred.append(ctn_det_i) 215 | 216 | cls_pred = torch.cat(cls_pred, dim=1) # [B, N, C] 217 | reg_pred = torch.cat(reg_pred, dim=1) # [B, N, 4] 218 | ctn_pred = torch.cat(ctn_pred, dim=1) # [B, N, 1] 219 | 220 | # train 221 | if self.trainable: 222 | # compute giou between pred bboxes and gt bboxes 223 | x1y1x2y2_pred = (reg_pred / self.img_size).view(-1, 4) 224 | x1y1x2y2_gt = targets[:, :, -5:-1].view(-1, 4) 225 | 226 | # giou 227 | giou_pred = box_ops.giou_score(x1y1x2y2_pred, x1y1x2y2_gt, batch_size=B) 228 | 229 | # compute loss 230 | cls_loss, reg_loss, ctn_loss, total_loss = loss.loss( 231 | pred_cls=cls_pred, 232 | pred_giou=giou_pred, 233 | pred_ctn=ctn_pred, 234 | target=targets, 235 | num_classes=self.num_classes) 236 | 237 | return cls_loss, reg_loss, ctn_loss, total_loss 238 | 239 | # test 240 | else: 241 | with torch.no_grad(): 242 | # batch size = 1 243 | scores = torch.sqrt(cls_pred.sigmoid() * ctn_pred.sigmoid())[0] 244 | bboxes = torch.clamp(reg_pred / self.img_size, 0, 1)[0] 245 | 246 | # to cpu 247 | scores = scores.cpu().numpy() 248 | bboxes = bboxes.cpu().numpy() 249 | 250 | # postprocess 251 | bboxes, scores, cls_inds = self.postprocess(bboxes, scores) 252 | 253 | return bboxes, scores, cls_inds 254 | -------------------------------------------------------------------------------- /models/resnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.utils.model_zoo as model_zoo 4 | 5 | 6 | __all__ = ['ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 7 | 'resnet152'] 8 | 9 | 10 | model_urls = { 11 | 'resnet18': 
'https://download.pytorch.org/models/resnet18-5c106cde.pth', 12 | 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth', 13 | 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth', 14 | 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth', 15 | 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth', 16 | } 17 | 18 | 19 | def conv3x3(in_planes, out_planes, stride=1): 20 | """3x3 convolution with padding""" 21 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, 22 | padding=1, bias=False) 23 | 24 | def conv1x1(in_planes, out_planes, stride=1): 25 | """1x1 convolution""" 26 | return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False) 27 | 28 | class BasicBlock(nn.Module): 29 | expansion = 1 30 | 31 | def __init__(self, inplanes, planes, stride=1, downsample=None): 32 | super(BasicBlock, self).__init__() 33 | self.conv1 = conv3x3(inplanes, planes, stride) 34 | self.bn1 = nn.BatchNorm2d(planes) 35 | self.relu = nn.ReLU(inplace=True) 36 | self.conv2 = conv3x3(planes, planes) 37 | self.bn2 = nn.BatchNorm2d(planes) 38 | self.downsample = downsample 39 | self.stride = stride 40 | 41 | def forward(self, x): 42 | identity = x 43 | 44 | out = self.conv1(x) 45 | out = self.bn1(out) 46 | out = self.relu(out) 47 | 48 | out = self.conv2(out) 49 | out = self.bn2(out) 50 | 51 | if self.downsample is not None: 52 | identity = self.downsample(x) 53 | 54 | out += identity 55 | out = self.relu(out) 56 | 57 | return out 58 | 59 | class Bottleneck(nn.Module): 60 | expansion = 4 61 | 62 | def __init__(self, inplanes, planes, stride=1, downsample=None): 63 | super(Bottleneck, self).__init__() 64 | self.conv1 = conv1x1(inplanes, planes) 65 | self.bn1 = nn.BatchNorm2d(planes) 66 | self.conv2 = conv3x3(planes, planes, stride) 67 | self.bn2 = nn.BatchNorm2d(planes) 68 | self.conv3 = conv1x1(planes, planes * self.expansion) 69 | self.bn3 = nn.BatchNorm2d(planes * self.expansion) 70 | self.relu = nn.ReLU(inplace=True) 71 | self.downsample = downsample 72 | self.stride = stride 73 | 74 | def forward(self, x): 75 | identity = x 76 | 77 | out = self.conv1(x) 78 | out = self.bn1(out) 79 | out = self.relu(out) 80 | 81 | out = self.conv2(out) 82 | out = self.bn2(out) 83 | out = self.relu(out) 84 | 85 | out = self.conv3(out) 86 | out = self.bn3(out) 87 | 88 | if self.downsample is not None: 89 | identity = self.downsample(x) 90 | 91 | out += identity 92 | out = self.relu(out) 93 | 94 | return out 95 | 96 | class ResNet(nn.Module): 97 | 98 | def __init__(self, block, layers, zero_init_residual=False): 99 | super(ResNet, self).__init__() 100 | self.inplanes = 64 101 | self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, 102 | bias=False) 103 | self.bn1 = nn.BatchNorm2d(64) 104 | self.relu = nn.ReLU(inplace=True) 105 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 106 | self.layer1 = self._make_layer(block, 64, layers[0]) 107 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2) 108 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2) 109 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2) 110 | 111 | for m in self.modules(): 112 | if isinstance(m, nn.Conv2d): 113 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 114 | elif isinstance(m, nn.BatchNorm2d): 115 | nn.init.constant_(m.weight, 1) 116 | nn.init.constant_(m.bias, 0) 117 | 118 | # Zero-initialize the last BN in each residual branch, 119 | # so that the residual 
branch starts with zeros, and each residual block behaves like an identity. 120 | # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677 121 | if zero_init_residual: 122 | for m in self.modules(): 123 | if isinstance(m, Bottleneck): 124 | nn.init.constant_(m.bn3.weight, 0) 125 | elif isinstance(m, BasicBlock): 126 | nn.init.constant_(m.bn2.weight, 0) 127 | 128 | 129 | def _make_layer(self, block, planes, blocks, stride=1): 130 | downsample = None 131 | if stride != 1 or self.inplanes != planes * block.expansion: 132 | downsample = nn.Sequential( 133 | conv1x1(self.inplanes, planes * block.expansion, stride), 134 | nn.BatchNorm2d(planes * block.expansion), 135 | ) 136 | 137 | layers = [] 138 | layers.append(block(self.inplanes, planes, stride, downsample)) 139 | self.inplanes = planes * block.expansion 140 | for _ in range(1, blocks): 141 | layers.append(block(self.inplanes, planes)) 142 | 143 | return nn.Sequential(*layers) 144 | 145 | def freeze_bn(self): 146 | '''Freeze BatchNorm layers.''' 147 | for m in self.modules(): 148 | if isinstance(m, nn.BatchNorm2d): 149 | m.eval() 150 | 151 | def freeze_stage(self): 152 | # freeze stage = 1 153 | for p in self.conv1.parameters(): 154 | p.requires_grad = False 155 | for p in self.bn1.parameters(): 156 | p.requires_grad = False 157 | # for p in self.layer1.parameters(): 158 | # p.requires_grad = False 159 | 160 | def forward(self, x): 161 | c1 = self.conv1(x) 162 | c1 = self.bn1(c1) 163 | c1 = self.relu(c1) 164 | c1 = self.maxpool(c1) 165 | 166 | c2 = self.layer1(c1) 167 | c3 = self.layer2(c2) 168 | c4 = self.layer3(c3) 169 | c5 = self.layer4(c4) 170 | 171 | return c3, c4, c5 172 | 173 | def resnet18(pretrained=False, freeze_bn=False, **kwargs): 174 | """Constructs a ResNet-18 model. 175 | 176 | Args: 177 | pretrained (bool): If True, returns a model pre-trained on ImageNet 178 | """ 179 | model = ResNet(BasicBlock, [2, 2, 2, 2], **kwargs) 180 | if pretrained: 181 | # strict = False as we don't need fc layer params. 182 | model.load_state_dict(model_zoo.load_url(model_urls['resnet18']), strict=False) 183 | # freeze bn 184 | if freeze_bn: 185 | print('freeze bn ...') 186 | model.freeze_bn() 187 | print('freeze stage 1') 188 | model.freeze_stage() 189 | 190 | return model 191 | 192 | def resnet34(pretrained=False, freeze_bn=False, **kwargs): 193 | """Constructs a ResNet-34 model. 194 | 195 | Args: 196 | pretrained (bool): If True, returns a model pre-trained on ImageNet 197 | """ 198 | model = ResNet(BasicBlock, [3, 4, 6, 3], **kwargs) 199 | if pretrained: 200 | model.load_state_dict(model_zoo.load_url(model_urls['resnet34']), strict=False) 201 | # freeze bn 202 | if freeze_bn: 203 | print('freeze bn ...') 204 | model.freeze_bn() 205 | print('freeze stage 1') 206 | model.freeze_stage() 207 | 208 | return model 209 | 210 | def resnet50(pretrained=False, freeze_bn=False, **kwargs): 211 | """Constructs a ResNet-50 model. 212 | 213 | Args: 214 | pretrained (bool): If True, returns a model pre-trained on ImageNet 215 | """ 216 | model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs) 217 | if pretrained: 218 | model.load_state_dict(model_zoo.load_url(model_urls['resnet50']), strict=False) 219 | # freeze bn 220 | if freeze_bn: 221 | print('freeze bn ...') 222 | model.freeze_bn() 223 | print('freeze stage 1') 224 | model.freeze_stage() 225 | 226 | return model 227 | 228 | def resnet101(pretrained=False, freeze_bn=False, **kwargs): 229 | """Constructs a ResNet-101 model. 
230 | 231 |     Args: 232 |         pretrained (bool): If True, returns a model pre-trained on ImageNet 233 |     """ 234 |     model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs) 235 |     if pretrained: 236 |         model.load_state_dict(model_zoo.load_url(model_urls['resnet101']), strict=False) 237 |     # freeze bn 238 |     if freeze_bn: 239 |         print('freeze bn ...') 240 |         model.freeze_bn() 241 |         print('freeze stage 1') 242 |         model.freeze_stage() 243 | 244 |     return model 245 | 246 | def resnet152(pretrained=False, freeze_bn=False, **kwargs): 247 |     """Constructs a ResNet-152 model. 248 | 249 |     Args: 250 |         pretrained (bool): If True, returns a model pre-trained on ImageNet 251 |     """ 252 |     model = ResNet(Bottleneck, [3, 8, 36, 3], **kwargs) 253 |     if pretrained: 254 |         model.load_state_dict(model_zoo.load_url(model_urls['resnet152']), strict=False)  # strict=False: the pretrained fc layer is not part of this backbone 255 |     # freeze bn 256 |     if freeze_bn: 257 |         print('freeze bn ...') 258 |         model.freeze_bn() 259 |         print('freeze stage 1') 260 |         model.freeze_stage() 261 | 262 |     return model 263 | 264 | 265 | def build_backbone(pretrained=False, freeze=False, model='r18'): 266 |     if model == 'r18': 267 |         return resnet18(pretrained=pretrained, freeze_bn=freeze), [128, 256, 512] 268 |     elif model == 'r34': 269 |         return resnet34(pretrained=pretrained, freeze_bn=freeze), [128, 256, 512] 270 |     elif model == 'r50': 271 |         return resnet50(pretrained=pretrained, freeze_bn=freeze), [512, 1024, 2048] 272 |     elif model == 'r101': 273 |         return resnet101(pretrained=pretrained, freeze_bn=freeze), [512, 1024, 2048] 274 | 275 | 276 | if __name__=='__main__': 277 |     #model = torchvision.models.resnet50() 278 |     print("found ", torch.cuda.device_count(), " GPU(s)") 279 |     device = torch.device("cuda") 280 |     model = resnet101().to(device) 281 |     print(model) 282 | 283 |     input = torch.randn(1, 3, 512, 512).to(device) 284 |     output = model(input) -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import cv2 3 | import os 4 | import time 5 | import numpy as np 6 | import torch 7 | import torch.backends.cudnn as cudnn 8 | 9 | from data.voc import VOC_CLASSES, VOCDetection 10 | from data.coco import coco_class_index, coco_class_labels, COCODataset 11 | # from data import config  # unused import; data/config.py is not present in this repo 12 | from data.transforms import ValTransforms 13 | 14 | from utils.misc import TestTimeAugmentation 15 | 16 | 17 | 18 | parser = argparse.ArgumentParser(description='FCOS-RT Detection') 19 | # basic 20 | parser.add_argument('-size', '--img_size', default=512, type=int, 21 |                     help='img_size') 22 | parser.add_argument('--show', action='store_true', default=False, 23 |                     help='show the visualization results.') 24 | parser.add_argument('-vs', '--visual_threshold', default=0.5, type=float, 25 |                     help='Final confidence threshold') 26 | parser.add_argument('--cuda', action='store_true', default=False, 27 |                     help='use cuda.') 28 | parser.add_argument('--save_folder', default='det_results/', type=str, 29 |                     help='Dir to save results') 30 | # model 31 | parser.add_argument('-v', '--version', default='fcos_rt', 32 |                     help='fcos_rt') 33 | parser.add_argument('-bk', '--backbone', default='r18', 34 |                     help='r18, r50, r101') 35 | parser.add_argument('--trained_model', default='weight/', 36 |                     type=str, help='Trained state_dict file path to open') 37 | parser.add_argument('--conf_thresh', default=0.1, type=float, 38 |                     help='confidence threshold') 39 | parser.add_argument('--nms_thresh', default=0.45, type=float, 40 |                     help='NMS threshold') 41 | # 
dataset 42 | parser.add_argument('--root', default='/mnt/share/ssd2/dataset', 43 |                     help='data root') 44 | parser.add_argument('-d', '--dataset', default='coco', 45 |                     help='coco.') 46 | # TTA 47 | parser.add_argument('-tta', '--test_aug', action='store_true', default=False, 48 |                     help='use test augmentation.') 49 | 50 | args = parser.parse_args() 51 | 52 | 53 | def plot_bbox_labels(img, bbox, label=None, cls_color=None, text_scale=0.4): 54 |     x1, y1, x2, y2 = bbox 55 |     x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2) 56 |     t_size = cv2.getTextSize(label, 0, fontScale=1, thickness=2)[0] 57 |     # plot bbox 58 |     cv2.rectangle(img, (x1, y1), (x2, y2), cls_color, 2) 59 |     60 |     if label is not None: 61 |         # plot title bbox 62 |         cv2.rectangle(img, (x1, y1-t_size[1]), (int(x1 + t_size[0] * text_scale), y1), cls_color, -1) 63 |         # put the text on the title bbox 64 |         cv2.putText(img, label, (int(x1), int(y1 - 5)), 0, text_scale, (0, 0, 0), 1, lineType=cv2.LINE_AA) 65 | 66 |     return img 67 | 68 | 69 | def visualize(img, 70 |               bboxes, 71 |               scores, 72 |               cls_inds, 73 |               vis_thresh, 74 |               class_colors, 75 |               class_names, 76 |               class_indexs=None, 77 |               dataset_name='voc'): 78 |     ts = 0.4 79 |     for i, bbox in enumerate(bboxes): 80 |         if scores[i] > vis_thresh: 81 |             cls_id = int(cls_inds[i]) 82 |             if dataset_name == 'coco': 83 |                 cls_color = class_colors[cls_id] 84 |                 cls_id = class_indexs[cls_id] 85 |             else: 86 |                 cls_color = class_colors[cls_id] 87 | 88 |             if len(class_names) > 1: 89 |                 mess = '%s: %.2f' % (class_names[cls_id], scores[i]) 90 |             else: 91 |                 cls_color = [255, 0, 0] 92 |                 mess = None 93 |             img = plot_bbox_labels(img, bbox, mess, cls_color, text_scale=ts) 94 | 95 |     return img 96 | 97 | 98 | def test(args, 99 |          net, 100 |          device, 101 |          dataset, 102 |          transforms=None, 103 |          vis_thresh=0.4, 104 |          class_colors=None, 105 |          class_names=None, 106 |          class_indexs=None, 107 |          show=False, 108 |          test_aug=None, 109 |          dataset_name='coco'): 110 |     num_images = len(dataset) 111 |     save_path = os.path.join('det_results/', args.dataset, args.version) 112 |     os.makedirs(save_path, exist_ok=True) 113 | 114 |     for index in range(num_images): 115 |         print('Testing image {:d}/{:d}....'.format(index+1, num_images)) 116 |         image, _ = dataset.pull_image(index) 117 | 118 |         h, w, _ = image.shape 119 |         scale = np.array([[w, h, w, h]]) 120 | 121 |         # prepare 122 |         x = transforms(image)[0] 123 |         x = x.unsqueeze(0).to(device) 124 | 125 |         t0 = time.time() 126 |         # forward 127 |         # test augmentation: 128 |         if test_aug is not None: 129 |             bboxes, scores, cls_inds = test_aug(x, net) 130 |         else: 131 |             # inference 132 |             bboxes, scores, cls_inds = net(x) 133 |         print("detection time used ", time.time() - t0, "s") 134 | 135 |         # rescale 136 |         bboxes *= scale 137 | 138 |         # vis detection 139 |         img_processed = visualize( 140 |             img=image, 141 |             bboxes=bboxes, 142 |             scores=scores, 143 |             cls_inds=cls_inds, 144 |             vis_thresh=vis_thresh, 145 |             class_colors=class_colors, 146 |             class_names=class_names, 147 |             class_indexs=class_indexs, 148 |             dataset_name=dataset_name 149 |         ) 150 |         if show: 151 |             cv2.imshow('detection', img_processed) 152 |             cv2.waitKey(0) 153 |         # save result 154 |         cv2.imwrite(os.path.join(save_path, str(index).zfill(6) +'.jpg'), img_processed) 155 | 156 | 157 | if __name__ == '__main__': 158 |     # cuda 159 |     if args.cuda: 160 |         print('use cuda') 161 |         cudnn.benchmark = True 162 |         device = torch.device("cuda") 163 |     else: 164 |         device = torch.device("cpu") 165 | 166 |     # input size 167 |     input_size = args.img_size 168 | 169 |     # dataset and evaluator 170 |     if 
args.dataset == 'voc': 171 |         data_dir = os.path.join(args.root, 'VOCdevkit') 172 |         class_names = VOC_CLASSES 173 |         class_indexs = None 174 |         num_classes = 20 175 |         dataset = VOCDetection( 176 |                         data_dir=data_dir, 177 |                         image_sets=[('2007', 'test')]) 178 | 179 |     elif args.dataset == 'coco': 180 |         data_dir = os.path.join(args.root, 'COCO') 181 |         class_names = coco_class_labels 182 |         class_indexs = coco_class_index 183 |         num_classes = 80 184 |         dataset = COCODataset( 185 |                     data_dir=data_dir, 186 |                     image_set='val2017') 187 | 188 |     class_colors = [(np.random.randint(255), 189 |                      np.random.randint(255), 190 |                      np.random.randint(255)) for _ in range(num_classes)] 191 | 192 |     # model 193 |     model_name = args.version 194 |     print('Model: ', model_name) 195 | 196 |     # load model and config file 197 |     if model_name == 'fcos_rt': 198 |         from models.fcos_rt import FCOS_RT 199 |         backbone = args.backbone 200 | 201 |     else: 202 |         print('Unknown model name...') 203 |         exit(0) 204 | 205 |     # model 206 |     model = FCOS_RT(device=device, 207 |                     img_size=input_size, 208 |                     num_classes=num_classes, 209 |                     trainable=False, 210 |                     conf_thresh=args.conf_thresh, 211 |                     nms_thresh=args.nms_thresh, 212 |                     bk=backbone) 213 | 214 | 215 |     # load weight 216 |     model.load_state_dict(torch.load(args.trained_model, map_location=device), strict=False) 217 |     model.to(device).eval() 218 |     print('Finished loading model!') 219 | 220 |     # TTA 221 |     test_aug = TestTimeAugmentation(num_classes=num_classes) if args.test_aug else None 222 | 223 |     # run 224 |     test(args=args, 225 |          net=model, 226 |          device=device, 227 |          dataset=dataset, 228 |          transforms=ValTransforms(args.img_size), 229 |          vis_thresh=args.visual_threshold, 230 |          class_colors=class_colors, 231 |          class_names=class_names, 232 |          class_indexs=class_indexs, 233 |          show=args.show, 234 |          test_aug=test_aug, 235 |          dataset_name=args.dataset) 236 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import os 4 | import argparse 5 | import time 6 | import random 7 | import numpy as np 8 | import cv2 9 | 10 | import torch 11 | import torch.optim as optim 12 | import torch.backends.cudnn as cudnn 13 | import torch.distributed as dist 14 | from torch.nn.parallel import DistributedDataParallel as DDP 15 | 16 | from data.voc import VOCDetection 17 | from data.coco import COCODataset 18 | from data.transforms import TrainTransforms, ValTransforms 19 | 20 | from utils import distributed_utils 21 | from utils import create_labels 22 | from utils.misc import ModelEMA, detection_collate 23 | from utils.com_flops_params import FLOPs_and_Params 24 | 25 | from evaluator.coco_evaluator import COCOAPIEvaluator 26 | from evaluator.voc_evaluator import VOCAPIEvaluator 27 | 28 | 29 | def parse_args(): 30 |     parser = argparse.ArgumentParser(description='FCOS-RT Detection') 31 |     # basic 32 |     parser.add_argument('--cuda', action='store_true', default=False, 33 |                         help='use cuda.') 34 |     parser.add_argument('--batch_size', default=16, type=int, 35 |                         help='Batch size for training') 36 |     parser.add_argument('--img_size', default=512, type=int, 37 |                         help='input image size for training') 38 |     parser.add_argument('--max_epoch', type=int, default=12, 39 |                         help='max training epoch (scaled by --schedule)') 40 |     parser.add_argument('--lr_epoch', nargs='+', default=[8, 10], type=int, 41 |                         help='lr epoch to decay') 42 |     parser.add_argument('--lr', default=0.01, type=float, 43 |                         help='learning rate') 
44 |     parser.add_argument('--schedule', default=1, type=int, 45 |                         help='Schedule for training: 1x, 2x, 3x, 4x.') 46 |     parser.add_argument('--start_iter', type=int, default=0, 47 |                         help='start iteration to train') 48 |     parser.add_argument('-r', '--resume', default=None, type=str, 49 |                         help='keep training') 50 |     parser.add_argument('--num_workers', default=8, type=int, 51 |                         help='Number of workers used in dataloading') 52 |     parser.add_argument('--num_gpu', default=1, type=int, 53 |                         help='Number of GPUs.') 54 |     parser.add_argument('--start_epoch', type=int, 55 |                         default=0, help='the start epoch to train') 56 |     parser.add_argument('--eval_epoch', type=int, 57 |                         default=2, help='interval between evaluations') 58 |     parser.add_argument('--tfboard', action='store_true', default=False, 59 |                         help='use tensorboard') 60 |     parser.add_argument('--save_folder', default='weights/', type=str, 61 |                         help='directory to save model checkpoints') 62 |     parser.add_argument('--vis', action='store_true', default=False, 63 |                         help='visualize target.') 64 | 65 |     # model 66 |     parser.add_argument('-v', '--version', default='fcos_rt', 67 |                         help='fcos_rt, fcos') 68 |     parser.add_argument('-bk', '--backbone', default='r18', 69 |                         help='r18, r50, r101') 70 | 71 |     # dataset 72 |     parser.add_argument('--root', default='/mnt/share/ssd2/dataset', 73 |                         help='data root') 74 |     parser.add_argument('-d', '--dataset', default='coco', 75 |                         help='voc, coco.') 76 | 77 |     # train trick 78 |     parser.add_argument('--ema', action='store_true', default=False, 79 |                         help='use ema training trick') 80 |     parser.add_argument('--multi_scale', action='store_true', default=False, 81 |                         help='use multi scale training trick') 82 |     parser.add_argument('--no_warmup', action='store_true', default=False, 83 |                         help='do not use warmup') 84 |     parser.add_argument('--wp_epoch', type=int, 85 |                         default=1, help='warm-up epoch') 86 | 87 |     # train DDP 88 |     parser.add_argument('-dist', '--distributed', action='store_true', default=False, 89 |                         help='distributed training') 90 |     parser.add_argument('--local_rank', type=int, default=0, 91 |                         help='local_rank') 92 |     parser.add_argument('--sybn', action='store_true', default=False, 93 |                         help='use SyncBatchNorm.') 94 | 95 | 96 |     return parser.parse_args() 97 | 98 | 99 | def train(): 100 |     args = parse_args() 101 |     print("Setting Arguments.. 
: ", args) 102 | print("----------------------------------------------------------") 103 | # model name 104 | model_name = args.version 105 | print('Model: ', model_name) 106 | 107 | # config 108 | if args.version == 'fcos_rt': 109 | scale_range = [[0, 64], [64, 128], [128, 1e5]] 110 | elif args.version == 'fcos': 111 | scale_range = [[0, 64], [64, 128], [128, 256], [256, 512], [512, 1e5]] 112 | 113 | # set distributed 114 | local_rank = 0 115 | if args.distributed: 116 | dist.init_process_group(backend="nccl", init_method="env://") 117 | local_rank = torch.distributed.get_rank() 118 | print(local_rank) 119 | torch.cuda.set_device(local_rank) 120 | 121 | # cuda 122 | if args.cuda: 123 | print('use cuda') 124 | cudnn.benchmark = True 125 | device = torch.device("cuda") 126 | else: 127 | device = torch.device("cpu") 128 | 129 | # path to save model 130 | path_to_save = os.path.join(args.save_folder, args.dataset, args.version) 131 | os.makedirs(path_to_save, exist_ok=True) 132 | 133 | # input size 134 | train_size = args.img_size 135 | val_size = args.img_size 136 | 137 | # EMA trick 138 | if args.ema: 139 | print('use EMA trick ...') 140 | 141 | # dataset and evaluator 142 | dataset, evaluator, num_classes = build_dataset(args, train_size, val_size, device) 143 | # dataloader 144 | dataloader = build_dataloader(args, dataset, detection_collate) 145 | 146 | print('Training model on:', args.dataset) 147 | print('The dataset size:', len(dataset)) 148 | print("----------------------------------------------------------") 149 | 150 | # buile model and config file 151 | if model_name == 'fcos_rt': 152 | from models.fcos_rt import FCOS_RT 153 | backbone = args.backbone 154 | # model 155 | net = FCOS_RT(device=device, 156 | img_size=train_size, 157 | num_classes=num_classes, 158 | trainable=True, 159 | bk=backbone 160 | ) 161 | 162 | elif model_name == 'fcos': 163 | from models.fcos import FCOS 164 | backbone = args.backbone 165 | # model 166 | net = FCOS(device=device, 167 | img_size=train_size, 168 | num_classes=num_classes, 169 | trainable=True, 170 | bk=backbone 171 | ) 172 | else: 173 | print('Unknown model name...') 174 | exit(0) 175 | 176 | model = net 177 | model = model.to(device).train() 178 | 179 | # SyncBatchNorm 180 | if args.distributed and args.sybn and args.cuda and args.num_gpu > 1: 181 | print('use SyncBatchNorm ...') 182 | model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) 183 | 184 | if local_rank == 0: 185 | # compute FLOPs and Params 186 | model.trainable = False 187 | model = model.eval() 188 | FLOPs_and_Params(model=model, size=train_size) 189 | model.trainable = True 190 | model = model.train() 191 | 192 | # keep training 193 | if args.resume is not None: 194 | print('keep training model: %s' % (args.resume)) 195 | if args.distributed: 196 | model.module.load_state_dict(torch.load(args.resume, map_location=device)) 197 | else: 198 | model.load_state_dict(torch.load(args.resume, map_location=device)) 199 | 200 | # EMA 201 | ema = ModelEMA(model) if args.ema else None 202 | 203 | # use tfboard 204 | tblogger = None 205 | if args.tfboard: 206 | print('use tensorboard') 207 | from torch.utils.tensorboard import SummaryWriter 208 | c_time = time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())) 209 | log_path = os.path.join('log/', args.dataset, c_time) 210 | os.makedirs(log_path, exist_ok=True) 211 | 212 | tblogger = SummaryWriter(log_path) 213 | 214 | # basic 215 | batch_size = args.batch_size 216 | warmup = not args.no_warmup 217 | max_epoch = args.max_epoch 
* args.schedule 218 | lr_epoch = [e * args.schedule for e in args.lr_epoch] 219 | epoch_size = len(dataset) // (batch_size * args.num_gpu) 220 | print('Schedule: %dx' % args.schedule) 221 | print('Max epoch: ', max_epoch) 222 | print('Lr step:', lr_epoch) 223 | 224 | # build optimizer 225 | base_lr = args.lr 226 | tmp_lr = base_lr 227 | optimizer = optim.SGD(model.parameters(), 228 | lr=tmp_lr, 229 | momentum=0.9, 230 | weight_decay=1e-4 231 | ) 232 | 233 | best_map = 0. 234 | t0 = time.time() 235 | epoch = 0 236 | # start to train 237 | for epoch in range(args.start_epoch, max_epoch): 238 | # set epoch if DDP 239 | if args.distributed: 240 | dataloader.sampler.set_epoch(epoch) 241 | 242 | # use step lr 243 | if epoch in lr_epoch: 244 | tmp_lr = tmp_lr * 0.1 245 | set_lr(optimizer, tmp_lr) 246 | 247 | # load a batch 248 | for iter_i, (images, targets) in enumerate(dataloader): 249 | ni = iter_i + epoch * epoch_size 250 | # warmup 251 | if epoch < args.wp_epoch and warmup: 252 | nw = args.wp_epoch * epoch_size 253 | tmp_lr = base_lr * pow(ni / nw, 4) 254 | set_lr(optimizer, tmp_lr) 255 | 256 | elif epoch == args.wp_epoch and iter_i == 0 and warmup: 257 | # warmup is over 258 | warmup = False 259 | tmp_lr = base_lr 260 | set_lr(optimizer, tmp_lr) 261 | 262 | # multi-scale trick 263 | if iter_i % 10 == 0 and iter_i > 0 and args.multi_scale: 264 | # randomly choose a new size 265 | train_size = random.randint(10, args.img_size // 32) * 32 266 | model.set_grid(train_size) 267 | if args.multi_scale: 268 | # interpolate 269 | images = torch.nn.functional.interpolate( 270 | input=images, 271 | size=train_size, 272 | mode='bilinear', 273 | align_corners=False) 274 | 275 | # make labels 276 | if args.vis: 277 | vis_data(images, targets, train_size) 278 | continue 279 | targets = create_labels.gt_creator( 280 | img_size=train_size, 281 | num_classes=num_classes, 282 | strides=net.strides, 283 | scale_range=scale_range, 284 | targets=targets) 285 | 286 | # to device 287 | images = images.to(device) 288 | targets = targets.to(device) 289 | 290 | # forward 291 | cls_loss, reg_loss, ctn_loss, total_loss = model(images, targets=targets) 292 | 293 | loss_dict = dict( 294 | cls_loss=cls_loss, 295 | reg_loss=reg_loss, 296 | ctn_loss=ctn_loss, 297 | total_loss=total_loss 298 | ) 299 | loss_dict_reduced = distributed_utils.reduce_loss_dict(loss_dict) 300 | 301 | # check NAN 302 | if torch.isnan(total_loss): 303 | continue 304 | 305 | # backprop 306 | total_loss.backward() 307 | optimizer.step() 308 | optimizer.zero_grad() 309 | 310 | # ema 311 | if args.ema: 312 | ema.update(model) 313 | 314 | # display 315 | if iter_i % 10 == 0: 316 | if args.tfboard: 317 | # viz loss 318 | tblogger.add_scalar('cls loss', loss_dict_reduced['cls_loss'].item(), iter_i) 319 | tblogger.add_scalar('reg loss', loss_dict_reduced['reg_loss'].item(), iter_i) 320 | tblogger.add_scalar('ctn loss', loss_dict_reduced['ctn_loss'].item(), iter_i) 321 | 322 | t1 = time.time() 323 | print('[Epoch %d/%d][Iter %d/%d][lr %.6f][Loss: cls %.2f || reg %.2f || ctn %.2f || size %d || time: %.2f]' 324 | % (epoch+1, 325 | max_epoch, 326 | iter_i, 327 | epoch_size, 328 | tmp_lr, 329 | loss_dict_reduced['cls_loss'].item(), 330 | loss_dict_reduced['reg_loss'].item(), 331 | loss_dict_reduced['ctn_loss'].item(), 332 | train_size, 333 | t1-t0), 334 | flush=True) 335 | 336 | t0 = time.time() 337 | # update iter_i 338 | iter_i += 1 339 | 340 | # evaluate 341 | if (epoch + 1) % args.eval_epoch == 0 or epoch + 1 == max_epoch: 342 | if args.ema: 343 | model_eval 
= ema.ema 344 | else: 345 | model_eval = model.module if args.distributed else model 346 | 347 | # check evaluator 348 | if evaluator is None: 349 | print('No evaluator ... save model and go on training.') 350 | print('Saving state, epoch:', epoch + 1) 351 | if local_rank == 0: 352 | torch.save(model_eval.state_dict(), os.path.join(path_to_save, 353 | args.version + '_' + args.backbone + '_' + repr(epoch + 1) + '.pth')) 354 | else: 355 | print('eval ...') 356 | 357 | # set eval mode 358 | model_eval.trainable = False 359 | model_eval.set_grid(val_size) 360 | model_eval.eval() 361 | 362 | # we only do evaluation on local_rank-0. 363 | if local_rank == 0: 364 | # evaluate 365 | evaluator.evaluate(model_eval) 366 | 367 | cur_map = evaluator.map 368 | if cur_map > best_map: 369 | # update best-map 370 | best_map = cur_map 371 | # save model 372 | print('Saving state, epoch:', epoch + 1) 373 | torch.save(model_eval.state_dict(), os.path.join(path_to_save, 374 | args.version + '_' + args.backbone + '_' + repr(epoch + 1) + '_' + str(round(best_map, 2)) + '.pth')) 375 | 376 | if args.tfboard: 377 | if args.dataset == 'voc': 378 | tblogger.add_scalar('07test/mAP', evaluator.map, epoch) 379 | elif args.dataset == 'coco': 380 | tblogger.add_scalar('val/AP50_95', evaluator.ap50_95, epoch) 381 | tblogger.add_scalar('val/AP50', evaluator.ap50, epoch) 382 | 383 | # wait for all processes to synchronize 384 | if args.distributed: 385 | dist.barrier() 386 | 387 | # set train mode. 388 | model_eval.trainable = True 389 | model_eval.set_grid(train_size) 390 | model_eval.train() 391 | 392 | if args.tfboard: 393 | tblogger.close() 394 | 395 | 396 | def build_dataset(args, train_size, val_size, device): 397 | if args.dataset == 'voc': 398 | data_dir = os.path.join(args.root, 'VOCdevkit') 399 | num_classes = 20 400 | dataset = VOCDetection( 401 | data_dir=data_dir, 402 | transform=TrainTransforms(train_size)) 403 | 404 | evaluator = VOCAPIEvaluator( 405 | data_dir=data_dir, 406 | device=device, 407 | transform=ValTransforms(val_size)) 408 | 409 | elif args.dataset == 'coco': 410 | data_dir = os.path.join(args.root, 'COCO') 411 | num_classes = 80 412 | dataset = COCODataset( 413 | data_dir=data_dir, 414 | transform=TrainTransforms(train_size)) 415 | 416 | evaluator = COCOAPIEvaluator( 417 | data_dir=data_dir, 418 | device=device, 419 | transform=ValTransforms(val_size)) 420 | 421 | else: 422 | print('unknow dataset !! 
Only support voc and coco !!') 423 | exit(0) 424 | 425 | return dataset, evaluator, num_classes 426 | 427 | 428 | def build_dataloader(args, dataset, collate_fn=None): 429 | # distributed 430 | if args.distributed and args.num_gpu > 1: 431 | # dataloader 432 | dataloader = torch.utils.data.DataLoader( 433 | dataset=dataset, 434 | batch_size=args.batch_size, 435 | collate_fn=collate_fn, 436 | num_workers=args.num_workers, 437 | pin_memory=True, 438 | sampler=torch.utils.data.distributed.DistributedSampler(dataset) 439 | ) 440 | 441 | else: 442 | # dataloader 443 | dataloader = torch.utils.data.DataLoader( 444 | dataset=dataset, 445 | shuffle=True, 446 | batch_size=args.batch_size, 447 | collate_fn=collate_fn, 448 | num_workers=args.num_workers, 449 | pin_memory=True 450 | ) 451 | return dataloader 452 | 453 | 454 | def set_lr(optimizer, lr): 455 | for param_group in optimizer.param_groups: 456 | param_group['lr'] = lr 457 | 458 | 459 | def vis_data(images, targets, input_size, num_classes): 460 | B = images.size(0) 461 | # vis data 462 | mean=(0.406, 0.456, 0.485) 463 | std=(0.225, 0.224, 0.229) 464 | mean = np.array(mean, dtype=np.float32) 465 | std = np.array(std, dtype=np.float32) 466 | 467 | for bi in range(B): 468 | img = images[bi].permute(1, 2, 0).cpu().numpy()[:, :, ::-1] 469 | img = ((img * std + mean)*255).astype(np.uint8) 470 | cv2.imwrite('1.jpg', img) 471 | 472 | img_ = cv2.imread('1.jpg') 473 | target_i = targets[bi] # [N, C] 474 | bboxes = target_i['boxes'] 475 | labels = target_i['labels'] 476 | for box, cls_id in zip(bboxes, labels): 477 | xmin, ymin, xmax, ymax = box 478 | cls_id = int(cls_id) 479 | xmin *= input_size 480 | ymin *= input_size 481 | xmax *= input_size 482 | ymax *= input_size 483 | cv2.rectangle(img_, (int(xmin), int(ymin)), (int(xmax), int(ymax)), (0, 0, 255), 2) 484 | 485 | cv2.imshow('img', img_) 486 | cv2.waitKey(0) 487 | 488 | 489 | if __name__ == '__main__': 490 | train() 491 | -------------------------------------------------------------------------------- /train_fcos.sh: -------------------------------------------------------------------------------- 1 | python train.py \ 2 | --cuda \ 3 | -v fcos \ 4 | -bk r50 \ 5 | --img_size 640 \ 6 | --lr 0.01 \ 7 | --batch_size 16 \ 8 | --schedule 1 \ 9 | --no_warmup 10 | -------------------------------------------------------------------------------- /train_fcos_rt.sh: -------------------------------------------------------------------------------- 1 | python train.py \ 2 | --cuda \ 3 | -d coco \ 4 | -v fcos_rt \ 5 | -bk r50 \ 6 | --img_size 512 \ 7 | --lr 0.01 \ 8 | --batch_size 16 \ 9 | --schedule 4 \ 10 | --multi_scale 11 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yjh0410/FCOS-RT_PyTorch/d81eb389e12f6e05ae75bfa69a56447b8fa2a02f/utils/__init__.py -------------------------------------------------------------------------------- /utils/box_ops.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | def iou_score(bboxes_a, bboxes_b, batch_size): 4 | """ 5 | Input:\n 6 | bboxes_a : [B*N, 4] = [x1, y1, x2, y2] \n 7 | bboxes_b : [B*N, 4] = [x1, y1, x2, y2] \n 8 | 9 | Output:\n 10 | iou : [B, N] = [iou, ...] 
\n 11 | """ 12 | tl = torch.max(bboxes_a[:, :2], bboxes_b[:, :2]) 13 | br = torch.min(bboxes_a[:, 2:], bboxes_b[:, 2:]) 14 | area_a = torch.prod(bboxes_a[:, 2:] - bboxes_a[:, :2], 1) 15 | area_b = torch.prod(bboxes_b[:, 2:] - bboxes_b[:, :2], 1) 16 | 17 | en = (tl < br).type(tl.type()).prod(dim=1) 18 | area_i = torch.prod(br - tl, 1) * en # * ((tl < br).all()) 19 | iou = area_i / (area_a + area_b - area_i + 1e-14) 20 | 21 | return iou.view(batch_size, -1) 22 | 23 | 24 | def giou_score(bboxes_a, bboxes_b, batch_size): 25 | """ 26 | bbox_1 : [B*N, 4] = [x1, y1, x2, y2] 27 | bbox_2 : [B*N, 4] = [x1, y1, x2, y2] 28 | """ 29 | # iou 30 | tl = torch.max(bboxes_a[:, :2], bboxes_b[:, :2]) 31 | br = torch.min(bboxes_a[:, 2:], bboxes_b[:, 2:]) 32 | area_a = torch.prod(bboxes_a[:, 2:] - bboxes_a[:, :2], 1) 33 | area_b = torch.prod(bboxes_b[:, 2:] - bboxes_b[:, :2], 1) 34 | 35 | en = (tl < br).type(tl.type()).prod(dim=1) 36 | area_i = torch.prod(br - tl, 1) * en # * ((tl < br).all()) 37 | iou = (area_i / (area_a + area_b - area_i + 1e-14)).clamp(0) 38 | 39 | # giou 40 | tl = torch.min(bboxes_a[:, :2], bboxes_b[:, :2]) 41 | br = torch.max(bboxes_a[:, 2:], bboxes_b[:, 2:]) 42 | en = (tl < br).type(tl.type()).prod(dim=1) 43 | area_c = torch.prod(br - tl, 1) * en # * ((tl < br).all()) 44 | 45 | giou = (iou - (area_c - area_i) / (area_c + 1e-14)) 46 | 47 | return giou.view(batch_size, -1) 48 | 49 | 50 | if __name__ == '__main__': 51 | box1 = torch.tensor([[10, 10, 20, 20]]) 52 | box2 = torch.tensor([[15, 15, 25, 25]]) 53 | iou = iou_score(box1, box2) 54 | print(iou) 55 | giou = giou_score(box1, box2) 56 | print(giou) 57 | -------------------------------------------------------------------------------- /utils/com_flops_params.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from thop import profile 3 | 4 | 5 | 6 | 7 | def FLOPs_and_Params(model, size): 8 | device = model.device 9 | x = torch.randn(1, 3, size, size).to(device) 10 | model.trainable = False 11 | model.eval() 12 | 13 | flops, params = profile(model, inputs=(x, )) 14 | print('FLOPs : ', flops / 1e9, ' B') 15 | print('Params : ', params / 1e6, ' M') 16 | 17 | model.trainable = True 18 | model.train() 19 | 20 | 21 | if __name__ == "__main__": 22 | pass 23 | -------------------------------------------------------------------------------- /utils/create_labels.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | 5 | def gt_creator(img_size, num_classes, strides, scale_range, targets): 6 | batch_size = len(targets) 7 | w = h = img_size 8 | gt_tensor = [] 9 | 10 | # empty gt tensor 11 | for s in strides: 12 | gt_tensor.append(np.zeros([batch_size, h//s, w//s, num_classes + 4 + 1])) 13 | 14 | # generate gt datas 15 | for bi in range(batch_size): 16 | target = targets[bi] 17 | bboxes = target['boxes'].tolist() 18 | labels = target['labels'].tolist() 19 | for box, cls_id in zip(bboxes, labels): 20 | x1, y1, x2, y2 = box 21 | cls_id = int(cls_id) 22 | 23 | # compute the center, width and height 24 | xc = (x2 + x1) / 2 * w 25 | yc = (y2 + y1) / 2 * h 26 | bw = (x2 - x1) * w 27 | bh = (y2 - y1) * h 28 | 29 | if bw < 1. 
or bh < 1.: 30 |                 # print('A dirty data !!!') 31 |                 continue 32 | 33 |             for si, s in enumerate(strides): 34 |                 hs, ws = h // s, w // s 35 |                 x1_s, x2_s = x1 * ws, x2 * ws 36 |                 y1_s, y2_s = y1 * hs, y2 * hs 37 |                 xc_s = xc / s 38 |                 yc_s = yc / s 39 |                 sr = scale_range[si] 40 | 41 |                 gridx = int(xc_s) 42 |                 gridy = int(yc_s) 43 | 44 |                 # By default, we only consider the 3x3 neighborhood of the center point 45 |                 for i in range(gridx - 1, gridx + 2): 46 |                     for j in range(gridy - 1, gridy + 2): 47 |                         if (j >= 0 and j < gt_tensor[si].shape[1]) and (i >= 0 and i < gt_tensor[si].shape[2]): 48 |                             t = j - y1_s 49 |                             b = y2_s - j 50 |                             l = i - x1_s 51 |                             r = x2_s - i 52 |                             if min(t, b, l, r) > 0: 53 |                                 if max(t, b, l, r) >= (sr[0]/s) and max(t, b, l, r) < (sr[1]/s): 54 |                                     gt_tensor[si][bi, j, i, cls_id] = 1.0 55 |                                     gt_tensor[si][bi, j, i, num_classes:num_classes + 4] = np.array([x1, y1, x2, y2]) 56 |                                     gt_tensor[si][bi, j, i, num_classes + 4] = np.sqrt(min(l, r) / max(l, r) * \ 57 |                                                                                 min(t, b) / max(t, b)) 58 | 59 |     gt_tensor = [gt.reshape(batch_size, -1, num_classes + 4 + 1) for gt in gt_tensor] 60 |     gt_tensor = np.concatenate(gt_tensor, axis=1) 61 | 62 |     return torch.from_numpy(gt_tensor).float() 63 | 64 | 65 | if __name__ == "__main__": 66 |     pass -------------------------------------------------------------------------------- /utils/distributed_utils.py: -------------------------------------------------------------------------------- 1 | # from github: https://github.com/ruinmessi/ASFF/blob/master/utils/distributed_util.py 2 | 3 | import time  # needed by synchronize() below 4 | import torch 5 | 6 | def get_world_size(): 7 |     if not torch.distributed.is_initialized(): 8 |         return 1 9 |     return torch.distributed.get_world_size() 10 | 11 | 12 | def get_rank(): 13 |     if not torch.distributed.is_initialized(): 14 |         return 0 15 |     return torch.distributed.get_rank() 16 | 17 | 18 | def is_main_process(): 19 |     if not torch.distributed.is_initialized(): 20 |         return True 21 |     return torch.distributed.get_rank() == 0 22 | 23 | 24 | def synchronize(): 25 |     """ 26 |     Helper function to synchronize between multiple processes when 27 |     using distributed training 28 |     """ 29 |     if not torch.distributed.is_initialized(): 30 |         return 31 |     world_size = torch.distributed.get_world_size() 32 |     rank = torch.distributed.get_rank() 33 |     if world_size == 1: 34 |         return 35 | 36 |     def _send_and_wait(r): 37 |         if rank == r: 38 |             tensor = torch.tensor(0, device="cuda") 39 |         else: 40 |             tensor = torch.tensor(1, device="cuda") 41 |         torch.distributed.broadcast(tensor, r) 42 |         while tensor.item() == 1: 43 |             time.sleep(1) 44 | 45 |     _send_and_wait(0) 46 |     # now sync on the main process 47 |     _send_and_wait(1) 48 | 49 | 50 | def reduce_loss_dict(loss_dict): 51 |     """ 52 |     Reduce the loss dictionary from all processes so that process with rank 53 |     0 has the averaged results. Returns a dict with the same fields as 54 |     loss_dict, after reduction. 
55 | """ 56 | world_size = get_world_size() 57 | if world_size < 2: 58 | return loss_dict 59 | with torch.no_grad(): 60 | loss_names = [] 61 | all_losses = [] 62 | for k in sorted(loss_dict.keys()): 63 | loss_names.append(k) 64 | all_losses.append(loss_dict[k]) 65 | all_losses = torch.stack(all_losses, dim=0) 66 | torch.distributed.reduce(all_losses, dst=0) 67 | if torch.distributed.get_rank() == 0: 68 | # only main process gets accumulated, so only divide by 69 | # world_size in this case 70 | all_losses /= world_size 71 | reduced_losses = {k: v for k, v in zip(loss_names, all_losses)} 72 | return reduced_losses -------------------------------------------------------------------------------- /utils/loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class FocalWithLogitsLoss(nn.Module): 7 | def __init__(self, reduction='mean', gamma=2.0, alpha=0.25): 8 | super(FocalWithLogitsLoss, self).__init__() 9 | self.reduction = reduction 10 | self.gamma = gamma 11 | self.alpha = alpha 12 | 13 | def forward(self, logits, targets): 14 | p = torch.sigmoid(logits) 15 | ce_loss = F.binary_cross_entropy_with_logits(input=logits, 16 | target=targets, 17 | reduction="none" 18 | ) 19 | p_t = p * targets + (1.0 - p) * (1.0 - targets) 20 | loss = ce_loss * ((1.0 - p_t) ** self.gamma) 21 | 22 | if self.alpha >= 0: 23 | alpha_t = self.alpha * targets + (1.0 - self.alpha) * (1.0 - targets) 24 | loss = alpha_t * loss 25 | 26 | if self.reduction == "mean": 27 | batch_size = logits.size(0) 28 | pos_inds = (targets == 1.0).float() 29 | # [B, H*W, C] -> [B,] 30 | num_pos = pos_inds.sum([1, 2]).clamp(1) 31 | loss = loss.sum([1, 2]) 32 | 33 | loss = (loss / num_pos).sum() / batch_size 34 | 35 | elif self.reduction == "sum": 36 | loss = torch.sum(loss) 37 | 38 | return loss 39 | 40 | 41 | def loss(pred_cls, pred_giou, pred_ctn, target, num_classes): 42 | # create loss_f 43 | cls_loss_function = FocalWithLogitsLoss(reduction='mean') 44 | ctn_loss_function = nn.BCELoss(reduction='none') 45 | 46 | # groundtruth 47 | gt_cls = target[..., :num_classes] 48 | gt_ctn = target[..., -1] 49 | gt_pos = (gt_ctn > 0.).float() 50 | num_pos = gt_pos.sum(-1, keepdim=True).clamp(1) 51 | 52 | batch_size = pred_cls.size(0) 53 | # cls loss 54 | cls_loss = cls_loss_function(pred_cls, gt_cls) 55 | 56 | # reg loss 57 | reg_loss = ((1. 
- pred_giou) * gt_pos / num_pos).sum() / batch_size 58 | 59 | # ctn loss 60 | ctn_loss = (ctn_loss_function(pred_ctn[..., 0].sigmoid(), gt_ctn) * gt_pos / num_pos).sum() / batch_size 61 | 62 | # total loss 63 | total_loss = cls_loss + reg_loss + ctn_loss 64 | 65 | return cls_loss, reg_loss, ctn_loss, total_loss 66 | 67 | 68 | if __name__ == "__main__": 69 | pass -------------------------------------------------------------------------------- /utils/misc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | import math 5 | from copy import deepcopy 6 | 7 | 8 | def nms(dets, scores, nms_thresh=0.4): 9 | """"Pure Python NMS baseline.""" 10 | x1 = dets[:, 0] #xmin 11 | y1 = dets[:, 1] #ymin 12 | x2 = dets[:, 2] #xmax 13 | y2 = dets[:, 3] #ymax 14 | 15 | areas = (x2 - x1) * (y2 - y1) 16 | order = scores.argsort()[::-1] 17 | 18 | keep = [] 19 | while order.size > 0: 20 | i = order[0] 21 | keep.append(i) 22 | xx1 = np.maximum(x1[i], x1[order[1:]]) 23 | yy1 = np.maximum(y1[i], y1[order[1:]]) 24 | xx2 = np.minimum(x2[i], x2[order[1:]]) 25 | yy2 = np.minimum(y2[i], y2[order[1:]]) 26 | 27 | w = np.maximum(1e-28, xx2 - xx1) 28 | h = np.maximum(1e-28, yy2 - yy1) 29 | inter = w * h 30 | 31 | # Cross Area / (bbox + particular area - Cross Area) 32 | ovr = inter / (areas[i] + areas[order[1:]] - inter + 1e-10) 33 | #reserve all the boundingbox whose ovr less than thresh 34 | inds = np.where(ovr <= nms_thresh)[0] 35 | order = order[inds + 1] 36 | 37 | return keep 38 | 39 | 40 | def is_parallel(model): 41 | # Returns True if model is of type DP or DDP 42 | return type(model) in (nn.parallel.DataParallel, nn.parallel.DistributedDataParallel) 43 | 44 | 45 | def detection_collate(batch): 46 | """Custom collate fn for dealing with batches of images that have a different 47 | number of associated object annotations (bounding boxes). 48 | 49 | Arguments: 50 | batch: (tuple) A tuple of tensor images and lists of annotations 51 | 52 | Return: 53 | A tuple containing: 54 | 1) (tensor) batch of images stacked on their 0 dim 55 | 2) (list of tensors) annotations for a given image are stacked on 56 | 0 dim 57 | """ 58 | targets = [] 59 | imgs = [] 60 | for sample in batch: 61 | imgs.append(sample[0]) 62 | targets.append(sample[1]) 63 | return torch.stack(imgs, 0), targets 64 | 65 | 66 | # Model EMA 67 | class ModelEMA(object): 68 | def __init__(self, model, decay=0.9999, updates=0): 69 | # create EMA 70 | self.ema = deepcopy(model.module if is_parallel(model) else model).eval() # FP32 EMA 71 | self.updates = updates 72 | self.decay = lambda x: decay * (1 - math.exp(-x / 2000.)) 73 | for p in self.ema.parameters(): 74 | p.requires_grad_(False) 75 | 76 | def update(self, model): 77 | # Update EMA parameters 78 | with torch.no_grad(): 79 | self.updates += 1 80 | d = self.decay(self.updates) 81 | 82 | msd = model.module.state_dict() if is_parallel(model) else model.state_dict() # model state_dict 83 | for k, v in self.ema.state_dict().items(): 84 | if v.dtype.is_floating_point: 85 | v *= d 86 | v += (1. 
- d) * msd[k].detach() 87 | 88 | 89 | # test augmentation 90 | class TestTimeAugmentation(object): 91 |     def __init__(self, num_classes=80, nms_thresh=0.4, scale_range=[320, 640, 32]): 92 |         self.nms = nms 93 |         self.num_classes = num_classes 94 |         self.nms_thresh = nms_thresh 95 |         self.scales = np.arange(scale_range[0], scale_range[1]+1, scale_range[2]) 96 | 97 |     def __call__(self, x, model): 98 |         # x: Tensor -> [B, C, H, W] 99 |         bboxes_list = [] 100 |         scores_list = [] 101 |         labels_list = [] 102 | 103 |         # multi scale 104 |         for s in self.scales: 105 |             if x.size(-1) == s and x.size(-2) == s: 106 |                 x_scale = x 107 |             else: 108 |                 x_scale = torch.nn.functional.interpolate( 109 |                                     input=x, 110 |                                     size=(s, s), 111 |                                     mode='bilinear', 112 |                                     align_corners=False) 113 |             model.set_grid(s) 114 |             bboxes, scores, labels = model(x_scale) 115 |             bboxes_list.append(bboxes) 116 |             scores_list.append(scores) 117 |             labels_list.append(labels) 118 | 119 |             # Flip 120 |             x_flip = torch.flip(x_scale, [-1]) 121 |             bboxes, scores, labels = model(x_flip) 122 |             bboxes = bboxes.copy() 123 |             bboxes[:, 0::2] = 1.0 - bboxes[:, 2::-2] 124 |             bboxes_list.append(bboxes) 125 |             scores_list.append(scores) 126 |             labels_list.append(labels) 127 | 128 |         bboxes = np.concatenate(bboxes_list) 129 |         scores = np.concatenate(scores_list) 130 |         labels = np.concatenate(labels_list) 131 | 132 |         # nms 133 |         keep = np.zeros(len(bboxes), dtype=int) 134 |         for i in range(self.num_classes): 135 |             inds = np.where(labels == i)[0] 136 |             if len(inds) == 0: 137 |                 continue 138 |             c_bboxes = bboxes[inds] 139 |             c_scores = scores[inds] 140 |             c_keep = self.nms(c_bboxes, c_scores, self.nms_thresh) 141 |             keep[inds[c_keep]] = 1 142 | 143 |         keep = np.where(keep > 0) 144 |         bboxes = bboxes[keep] 145 |         scores = scores[keep] 146 |         labels = labels[keep] 147 | 148 |         return bboxes, scores, labels 149 | --------------------------------------------------------------------------------
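A minimal usage sketch (not part of the repository) for the pure-Python nms defined in utils/misc.py above. It assumes the project root is on PYTHONPATH; the toy boxes, scores, and threshold are made up for illustration, and boxes use the [xmin, ymin, xmax, ymax] layout expected by the detector's post-processing:

import numpy as np
from utils.misc import nms

# two heavily overlapping boxes and one isolated box, already sorted by score
dets = np.array([[10., 10., 60., 60.],
                 [12., 12., 62., 62.],
                 [100., 100., 150., 150.]])
scores = np.array([0.9, 0.8, 0.7])

# indices of the boxes kept after class-agnostic suppression at IoU 0.4
keep = nms(dets, scores, nms_thresh=0.4)
print(keep)  # keeps boxes 0 and 2; the lower-scoring duplicate of box 0 is suppressed

TestTimeAugmentation.__call__ above runs this routine once per class and keeps the surviving indices; FCOS_RT.postprocess follows the same per-class pattern with its own copy of the function.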