├── lib ├── __init__.py ├── nms │ ├── __init__.py │ ├── src │ │ ├── nms_cuda.h │ │ ├── nms.h │ │ ├── cuda │ │ │ ├── nms_kernel.h │ │ │ └── nms_kernel.cu │ │ ├── nms_cuda.c │ │ └── nms.c │ ├── build.py │ └── pth_nms.py └── build.sh ├── images ├── 1.jpg ├── 3.jpg ├── 4.jpg ├── 5.jpg ├── 6.jpg ├── 7.jpg └── 8.jpg ├── .gitignore ├── coco_eval.py ├── visualize.py ├── anchors.py ├── utils.py ├── README.md ├── losses.py ├── train.py ├── csv_eval.py ├── oid_dataset.py ├── LICENSE ├── model.py └── dataloader.py /lib/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lib/nms/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /images/1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/pytorch-retinanet/master/images/1.jpg -------------------------------------------------------------------------------- /images/3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/pytorch-retinanet/master/images/3.jpg -------------------------------------------------------------------------------- /images/4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/pytorch-retinanet/master/images/4.jpg -------------------------------------------------------------------------------- /images/5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/pytorch-retinanet/master/images/5.jpg -------------------------------------------------------------------------------- /images/6.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/pytorch-retinanet/master/images/6.jpg -------------------------------------------------------------------------------- /images/7.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/pytorch-retinanet/master/images/7.jpg -------------------------------------------------------------------------------- /images/8.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/pytorch-retinanet/master/images/8.jpg -------------------------------------------------------------------------------- /lib/nms/src/nms_cuda.h: -------------------------------------------------------------------------------- 1 | int gpu_nms(THLongTensor * keep_out, THLongTensor* num_out, THCudaTensor * boxes, float nms_overlap_thresh); -------------------------------------------------------------------------------- /lib/nms/src/nms.h: -------------------------------------------------------------------------------- 1 | int cpu_nms(THLongTensor * keep_out, THLongTensor * num_out, THFloatTensor * boxes, THLongTensor * order, THFloatTensor * areas, float nms_overlap_thresh); -------------------------------------------------------------------------------- /lib/nms/src/cuda/nms_kernel.h: -------------------------------------------------------------------------------- 1 | #ifndef _NMS_KERNEL 2 | #define _NMS_KERNEL 3 | 4 | #ifdef __cplusplus 5 | extern "C" { 6 | #endif 7 | 
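/*
 * DIVUP(m, n) below is integer ceiling division: the number of n-sized
 * blocks needed to cover m items.
 * threadsPerBlock is 8 * sizeof(unsigned long long) = 64, because the NMS
 * kernel packs the suppression decisions for one 64-box block into the bits
 * of a single unsigned long long mask word.
 * _nms launches that pairwise-IoU kernel to fill mask_dev; the final greedy
 * selection over the mask is done on the host side (see nms_cuda.c).
 */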
8 | #define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0)) 9 | int const threadsPerBlock = sizeof(unsigned long long) * 8; 10 | 11 | void _nms(int boxes_num, float * boxes_dev, 12 | unsigned long long * mask_dev, float nms_overlap_thresh); 13 | 14 | #ifdef __cplusplus 15 | } 16 | #endif 17 | 18 | #endif 19 | 20 | -------------------------------------------------------------------------------- /lib/build.sh: -------------------------------------------------------------------------------- 1 | CUDA_ARCH="-gencode arch=compute_30,code=sm_30 \ 2 | -gencode arch=compute_35,code=sm_35 \ 3 | -gencode arch=compute_50,code=sm_50 \ 4 | -gencode arch=compute_52,code=sm_52 \ 5 | -gencode arch=compute_60,code=sm_60 \ 6 | -gencode arch=compute_61,code=sm_61" 7 | 8 | 9 | # Build NMS 10 | cd nms/src/cuda 11 | echo "Compiling nms kernels by nvcc..." 12 | /usr/local/cuda/bin/nvcc -c -o nms_kernel.cu.o nms_kernel.cu -x cu -Xcompiler -fPIC $CUDA_ARCH 13 | cd ../../ 14 | python build.py 15 | cd ../ 16 | -------------------------------------------------------------------------------- /lib/nms/build.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from torch.utils.ffi import create_extension 4 | 5 | 6 | sources = ['src/nms.c'] 7 | headers = ['src/nms.h'] 8 | defines = [] 9 | with_cuda = False 10 | 11 | if torch.cuda.is_available(): 12 | print('Including CUDA code.') 13 | sources += ['src/nms_cuda.c'] 14 | headers += ['src/nms_cuda.h'] 15 | defines += [('WITH_CUDA', None)] 16 | with_cuda = True 17 | 18 | this_file = os.path.dirname(os.path.realpath(__file__)) 19 | print(this_file) 20 | extra_objects = ['src/cuda/nms_kernel.cu.o'] 21 | extra_objects = [os.path.join(this_file, fname) for fname in extra_objects] 22 | 23 | ffi = create_extension( 24 | '_ext.nms', 25 | headers=headers, 26 | sources=sources, 27 | define_macros=defines, 28 | relative_to=__file__, 29 | with_cuda=with_cuda, 30 | extra_objects=extra_objects, 31 | extra_compile_args=['-std=c99'] 32 | ) 33 | 34 | if __name__ == '__main__': 35 | ffi.build() 36 | -------------------------------------------------------------------------------- /lib/nms/pth_nms.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from ._ext import nms 3 | import numpy as np 4 | 5 | def pth_nms(dets, thresh): 6 | """ 7 | dets has to be a tensor 8 | """ 9 | if not dets.is_cuda: 10 | x1 = dets[:, 0] 11 | y1 = dets[:, 1] 12 | x2 = dets[:, 2] 13 | y2 = dets[:, 3] 14 | scores = dets[:, 4] 15 | 16 | areas = (x2 - x1 + 1) * (y2 - y1 + 1) 17 | order = scores.sort(0, descending=True)[1] 18 | # order = torch.from_numpy(np.ascontiguousarray(scores.numpy().argsort()[::-1])).long() 19 | 20 | keep = torch.LongTensor(dets.size(0)) 21 | num_out = torch.LongTensor(1) 22 | nms.cpu_nms(keep, num_out, dets, order, areas, thresh) 23 | 24 | return keep[:num_out[0]] 25 | else: 26 | x1 = dets[:, 0] 27 | y1 = dets[:, 1] 28 | x2 = dets[:, 2] 29 | y2 = dets[:, 3] 30 | scores = dets[:, 4] 31 | 32 | areas = (x2 - x1 + 1) * (y2 - y1 + 1) 33 | order = scores.sort(0, descending=True)[1] 34 | # order = torch.from_numpy(np.ascontiguousarray(scores.cpu().numpy().argsort()[::-1])).long().cuda() 35 | 36 | dets = dets[order].contiguous() 37 | 38 | keep = torch.LongTensor(dets.size(0)) 39 | num_out = torch.LongTensor(1) 40 | # keep = torch.cuda.LongTensor(dets.size(0)) 41 | # num_out = torch.cuda.LongTensor(1) 42 | nms.gpu_nms(keep, num_out, dets, thresh) 43 | 44 | return 
order[keep[:num_out[0]].cuda()].contiguous() 45 | # return order[keep[:num_out[0]]].contiguous() 46 | 47 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *.cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # Jupyter Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # SageMath parsed files 79 | *.sage.py 80 | 81 | # dotenv 82 | .env 83 | 84 | # virtualenv 85 | .venv 86 | venv/ 87 | ENV/ 88 | 89 | # Spyder project settings 90 | .spyderproject 91 | .spyproject 92 | 93 | # Rope project settings 94 | .ropeproject 95 | 96 | # mkdocs documentation 97 | /site 98 | 99 | # mypy 100 | .mypy_cache/ 101 | 102 | *.zip 103 | *.pt 104 | -------------------------------------------------------------------------------- /lib/nms/src/nms_cuda.c: -------------------------------------------------------------------------------- 1 | // ------------------------------------------------------------------ 2 | // Faster R-CNN 3 | // Copyright (c) 2015 Microsoft 4 | // Licensed under The MIT License [see fast-rcnn/LICENSE for details] 5 | // Written by Shaoqing Ren 6 | // ------------------------------------------------------------------ 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include "cuda/nms_kernel.h" 13 | 14 | 15 | extern THCState *state; 16 | 17 | int gpu_nms(THLongTensor * keep, THLongTensor* num_out, THCudaTensor * boxes, float nms_overlap_thresh) { 18 | // boxes has to be sorted 19 | THArgCheck(THLongTensor_isContiguous(keep), 0, "boxes must be contiguous"); 20 | THArgCheck(THCudaTensor_isContiguous(state, boxes), 2, "boxes must be contiguous"); 21 | // Number of ROIs 22 | int boxes_num = THCudaTensor_size(state, boxes, 0); 23 | int boxes_dim = THCudaTensor_size(state, boxes, 1); 24 | 25 | float* boxes_flat = THCudaTensor_data(state, boxes); 26 | 27 | const int col_blocks = DIVUP(boxes_num, threadsPerBlock); 28 | THCudaLongTensor * mask = THCudaLongTensor_newWithSize2d(state, boxes_num, col_blocks); 29 | unsigned long long* mask_flat = THCudaLongTensor_data(state, mask); 30 | 31 | _nms(boxes_num, boxes_flat, mask_flat, nms_overlap_thresh); 32 | 33 | THLongTensor * mask_cpu = THLongTensor_newWithSize2d(boxes_num, col_blocks); 34 | THLongTensor_copyCuda(state, mask_cpu, mask); 35 | 
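/* At this point the per-box suppression bitmasks computed on the GPU have
 * been copied into mask_cpu. The device-side mask is freed below, and the
 * remaining work is a greedy CPU pass: walk the boxes in score order, skip
 * any box whose bit is already set in remv_cpu, otherwise keep it and OR its
 * row of the mask into remv_cpu. */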
THCudaLongTensor_free(state, mask); 36 | 37 | unsigned long long * mask_cpu_flat = THLongTensor_data(mask_cpu); 38 | 39 | THLongTensor * remv_cpu = THLongTensor_newWithSize1d(col_blocks); 40 | unsigned long long* remv_cpu_flat = THLongTensor_data(remv_cpu); 41 | THLongTensor_fill(remv_cpu, 0); 42 | 43 | long * keep_flat = THLongTensor_data(keep); 44 | long num_to_keep = 0; 45 | 46 | int i, j; 47 | for (i = 0; i < boxes_num; i++) { 48 | int nblock = i / threadsPerBlock; 49 | int inblock = i % threadsPerBlock; 50 | 51 | if (!(remv_cpu_flat[nblock] & (1ULL << inblock))) { 52 | keep_flat[num_to_keep++] = i; 53 | unsigned long long *p = &mask_cpu_flat[0] + i * col_blocks; 54 | for (j = nblock; j < col_blocks; j++) { 55 | remv_cpu_flat[j] |= p[j]; 56 | } 57 | } 58 | } 59 | 60 | long * num_out_flat = THLongTensor_data(num_out); 61 | * num_out_flat = num_to_keep; 62 | 63 | THLongTensor_free(mask_cpu); 64 | THLongTensor_free(remv_cpu); 65 | 66 | return 1; 67 | } 68 | -------------------------------------------------------------------------------- /lib/nms/src/nms.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | int cpu_nms(THLongTensor * keep_out, THLongTensor * num_out, THFloatTensor * boxes, THLongTensor * order, THFloatTensor * areas, float nms_overlap_thresh) { 5 | // boxes has to be sorted 6 | THArgCheck(THLongTensor_isContiguous(keep_out), 0, "keep_out must be contiguous"); 7 | THArgCheck(THLongTensor_isContiguous(boxes), 2, "boxes must be contiguous"); 8 | THArgCheck(THLongTensor_isContiguous(order), 3, "order must be contiguous"); 9 | THArgCheck(THLongTensor_isContiguous(areas), 4, "areas must be contiguous"); 10 | // Number of ROIs 11 | long boxes_num = THFloatTensor_size(boxes, 0); 12 | long boxes_dim = THFloatTensor_size(boxes, 1); 13 | 14 | long * keep_out_flat = THLongTensor_data(keep_out); 15 | float * boxes_flat = THFloatTensor_data(boxes); 16 | long * order_flat = THLongTensor_data(order); 17 | float * areas_flat = THFloatTensor_data(areas); 18 | 19 | THByteTensor* suppressed = THByteTensor_newWithSize1d(boxes_num); 20 | THByteTensor_fill(suppressed, 0); 21 | unsigned char * suppressed_flat = THByteTensor_data(suppressed); 22 | 23 | // nominal indices 24 | int i, j; 25 | // sorted indices 26 | int _i, _j; 27 | // temp variables for box i's (the box currently under consideration) 28 | float ix1, iy1, ix2, iy2, iarea; 29 | // variables for computing overlap with box j (lower scoring box) 30 | float xx1, yy1, xx2, yy2; 31 | float w, h; 32 | float inter, ovr; 33 | 34 | long num_to_keep = 0; 35 | for (_i=0; _i < boxes_num; ++_i) { 36 | i = order_flat[_i]; 37 | if (suppressed_flat[i] == 1) { 38 | continue; 39 | } 40 | keep_out_flat[num_to_keep++] = i; 41 | ix1 = boxes_flat[i * boxes_dim]; 42 | iy1 = boxes_flat[i * boxes_dim + 1]; 43 | ix2 = boxes_flat[i * boxes_dim + 2]; 44 | iy2 = boxes_flat[i * boxes_dim + 3]; 45 | iarea = areas_flat[i]; 46 | for (_j = _i + 1; _j < boxes_num; ++_j) { 47 | j = order_flat[_j]; 48 | if (suppressed_flat[j] == 1) { 49 | continue; 50 | } 51 | xx1 = fmaxf(ix1, boxes_flat[j * boxes_dim]); 52 | yy1 = fmaxf(iy1, boxes_flat[j * boxes_dim + 1]); 53 | xx2 = fminf(ix2, boxes_flat[j * boxes_dim + 2]); 54 | yy2 = fminf(iy2, boxes_flat[j * boxes_dim + 3]); 55 | w = fmaxf(0.0, xx2 - xx1 + 1); 56 | h = fmaxf(0.0, yy2 - yy1 + 1); 57 | inter = w * h; 58 | ovr = inter / (iarea + areas_flat[j] - inter); 59 | if (ovr >= nms_overlap_thresh) { 60 | suppressed_flat[j] = 1; 61 | } 62 | } 63 | } 64 | 65 | long 
*num_out_flat = THLongTensor_data(num_out); 66 | *num_out_flat = num_to_keep; 67 | THByteTensor_free(suppressed); 68 | return 1; 69 | } -------------------------------------------------------------------------------- /coco_eval.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from pycocotools.coco import COCO 4 | from pycocotools.cocoeval import COCOeval 5 | 6 | import numpy as np 7 | import json 8 | import os 9 | 10 | import torch 11 | 12 | def evaluate_coco(dataset, model, threshold=0.05): 13 | 14 | model.eval() 15 | 16 | with torch.no_grad(): 17 | 18 | # start collecting results 19 | results = [] 20 | image_ids = [] 21 | 22 | for index in range(len(dataset)): 23 | data = dataset[index] 24 | scale = data['scale'] 25 | 26 | # run network 27 | scores, labels, boxes = model(data['img'].permute(2, 0, 1).cuda().float().unsqueeze(dim=0)) 28 | scores = scores.cpu() 29 | labels = labels.cpu() 30 | boxes = boxes.cpu() 31 | 32 | # correct boxes for image scale 33 | boxes /= scale 34 | 35 | if boxes.shape[0] > 0: 36 | # change to (x, y, w, h) (MS COCO standard) 37 | boxes[:, 2] -= boxes[:, 0] 38 | boxes[:, 3] -= boxes[:, 1] 39 | 40 | # compute predicted labels and scores 41 | #for box, score, label in zip(boxes[0], scores[0], labels[0]): 42 | for box_id in range(boxes.shape[0]): 43 | score = float(scores[box_id]) 44 | label = int(labels[box_id]) 45 | box = boxes[box_id, :] 46 | 47 | # scores are sorted, so we can break 48 | if score < threshold: 49 | break 50 | 51 | # append detection for each positively labeled class 52 | image_result = { 53 | 'image_id' : dataset.image_ids[index], 54 | 'category_id' : dataset.label_to_coco_label(label), 55 | 'score' : float(score), 56 | 'bbox' : box.tolist(), 57 | } 58 | 59 | # append detection to results 60 | results.append(image_result) 61 | 62 | # append image to list of processed images 63 | image_ids.append(dataset.image_ids[index]) 64 | 65 | # print progress 66 | print('{}/{}'.format(index, len(dataset)), end='\r') 67 | 68 | if not len(results): 69 | return 70 | 71 | # write output 72 | json.dump(results, open('{}_bbox_results.json'.format(dataset.set_name), 'w'), indent=4) 73 | 74 | # load results in COCO evaluation tool 75 | coco_true = dataset.coco 76 | coco_pred = coco_true.loadRes('{}_bbox_results.json'.format(dataset.set_name)) 77 | 78 | # run COCO evaluation 79 | coco_eval = COCOeval(coco_true, coco_pred, 'bbox') 80 | coco_eval.params.imgIds = image_ids 81 | coco_eval.evaluate() 82 | coco_eval.accumulate() 83 | coco_eval.summarize() 84 | 85 | model.train() 86 | 87 | return 88 | -------------------------------------------------------------------------------- /lib/nms/src/cuda/nms_kernel.cu: -------------------------------------------------------------------------------- 1 | // ------------------------------------------------------------------ 2 | // Faster R-CNN 3 | // Copyright (c) 2015 Microsoft 4 | // Licensed under The MIT License [see fast-rcnn/LICENSE for details] 5 | // Written by Shaoqing Ren 6 | // ------------------------------------------------------------------ 7 | #ifdef __cplusplus 8 | extern "C" { 9 | #endif 10 | 11 | #include 12 | #include 13 | #include 14 | #include "nms_kernel.h" 15 | 16 | __device__ inline float devIoU(float const * const a, float const * const b) { 17 | float left = fmaxf(a[0], b[0]), right = fminf(a[2], b[2]); 18 | float top = fmaxf(a[1], b[1]), bottom = fminf(a[3], b[3]); 19 | float width = fmaxf(right - left + 1, 0.f), height = 
fmaxf(bottom - top + 1, 0.f); 20 | float interS = width * height; 21 | float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); 22 | float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1); 23 | return interS / (Sa + Sb - interS); 24 | } 25 | 26 | __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh, 27 | const float *dev_boxes, unsigned long long *dev_mask) { 28 | const int row_start = blockIdx.y; 29 | const int col_start = blockIdx.x; 30 | 31 | // if (row_start > col_start) return; 32 | 33 | const int row_size = 34 | fminf(n_boxes - row_start * threadsPerBlock, threadsPerBlock); 35 | const int col_size = 36 | fminf(n_boxes - col_start * threadsPerBlock, threadsPerBlock); 37 | 38 | __shared__ float block_boxes[threadsPerBlock * 5]; 39 | if (threadIdx.x < col_size) { 40 | block_boxes[threadIdx.x * 5 + 0] = 41 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0]; 42 | block_boxes[threadIdx.x * 5 + 1] = 43 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1]; 44 | block_boxes[threadIdx.x * 5 + 2] = 45 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2]; 46 | block_boxes[threadIdx.x * 5 + 3] = 47 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3]; 48 | block_boxes[threadIdx.x * 5 + 4] = 49 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4]; 50 | } 51 | __syncthreads(); 52 | 53 | if (threadIdx.x < row_size) { 54 | const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; 55 | const float *cur_box = dev_boxes + cur_box_idx * 5; 56 | int i = 0; 57 | unsigned long long t = 0; 58 | int start = 0; 59 | if (row_start == col_start) { 60 | start = threadIdx.x + 1; 61 | } 62 | for (i = start; i < col_size; i++) { 63 | if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { 64 | t |= 1ULL << i; 65 | } 66 | } 67 | const int col_blocks = DIVUP(n_boxes, threadsPerBlock); 68 | dev_mask[cur_box_idx * col_blocks + col_start] = t; 69 | } 70 | } 71 | 72 | 73 | void _nms(int boxes_num, float * boxes_dev, 74 | unsigned long long * mask_dev, float nms_overlap_thresh) { 75 | 76 | dim3 blocks(DIVUP(boxes_num, threadsPerBlock), 77 | DIVUP(boxes_num, threadsPerBlock)); 78 | dim3 threads(threadsPerBlock); 79 | nms_kernel<<>>(boxes_num, 80 | nms_overlap_thresh, 81 | boxes_dev, 82 | mask_dev); 83 | } 84 | 85 | #ifdef __cplusplus 86 | } 87 | #endif 88 | -------------------------------------------------------------------------------- /visualize.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torchvision 3 | import time 4 | import os 5 | import copy 6 | import pdb 7 | import time 8 | import argparse 9 | 10 | import sys 11 | import cv2 12 | 13 | import torch 14 | from torch.utils.data import Dataset, DataLoader 15 | from torchvision import datasets, models, transforms 16 | 17 | from dataloader import CocoDataset, CSVDataset, collater, Resizer, AspectRatioBasedSampler, Augmenter, UnNormalizer, Normalizer 18 | 19 | 20 | assert torch.__version__.split('.')[1] == '4' 21 | 22 | print('CUDA available: {}'.format(torch.cuda.is_available())) 23 | 24 | 25 | def main(args=None): 26 | parser = argparse.ArgumentParser(description='Simple training script for training a RetinaNet network.') 27 | 28 | parser.add_argument('--dataset', help='Dataset type, must be one of csv or coco.') 29 | parser.add_argument('--coco_path', help='Path to COCO directory') 30 | parser.add_argument('--csv_classes', help='Path to file containing class list (see readme)') 31 | 
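# Note: the CSV branch below constructs CSVDataset from parser.csv_train, but
# no --csv_train argument is defined in this script; to visualize a CSV
# dataset you will need to add that argument (or pass your annotation file in
# some other way).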
parser.add_argument('--csv_val', help='Path to file containing validation annotations (optional, see readme)') 32 | 33 | parser.add_argument('--model', help='Path to model (.pt) file.') 34 | 35 | parser = parser.parse_args(args) 36 | 37 | if parser.dataset == 'coco': 38 | dataset_val = CocoDataset(parser.coco_path, set_name='val2017', transform=transforms.Compose([Normalizer(), Resizer()])) 39 | elif parser.dataset == 'csv': 40 | dataset_val = CSVDataset(train_file=parser.csv_train, class_list=parser.csv_classes, transform=transforms.Compose([Normalizer(), Resizer()])) 41 | else: 42 | raise ValueError('Dataset type not understood (must be csv or coco), exiting.') 43 | 44 | sampler_val = AspectRatioBasedSampler(dataset_val, batch_size=1, drop_last=False) 45 | dataloader_val = DataLoader(dataset_val, num_workers=1, collate_fn=collater, batch_sampler=sampler_val) 46 | 47 | retinanet = torch.load(parser.model) 48 | 49 | use_gpu = True 50 | 51 | if use_gpu: 52 | retinanet = retinanet.cuda() 53 | 54 | retinanet.eval() 55 | 56 | unnormalize = UnNormalizer() 57 | 58 | def draw_caption(image, box, caption): 59 | 60 | b = np.array(box).astype(int) 61 | cv2.putText(image, caption, (b[0], b[1] - 10), cv2.FONT_HERSHEY_PLAIN, 1, (0, 0, 0), 2) 62 | cv2.putText(image, caption, (b[0], b[1] - 10), cv2.FONT_HERSHEY_PLAIN, 1, (255, 255, 255), 1) 63 | 64 | for idx, data in enumerate(dataloader_val): 65 | 66 | with torch.no_grad(): 67 | st = time.time() 68 | scores, classification, transformed_anchors = retinanet(data['img'].cuda().float()) 69 | print('Elapsed time: {}'.format(time.time()-st)) 70 | idxs = np.where(scores>0.5) 71 | img = np.array(255 * unnormalize(data['img'][0, :, :, :])).copy() 72 | 73 | img[img<0] = 0 74 | img[img>255] = 255 75 | 76 | img = np.transpose(img, (1, 2, 0)) 77 | 78 | img = cv2.cvtColor(img.astype(np.uint8), cv2.COLOR_BGR2RGB) 79 | 80 | for j in range(idxs[0].shape[0]): 81 | bbox = transformed_anchors[idxs[0][j], :] 82 | x1 = int(bbox[0]) 83 | y1 = int(bbox[1]) 84 | x2 = int(bbox[2]) 85 | y2 = int(bbox[3]) 86 | label_name = dataset_val.labels[int(classification[idxs[0][j]])] 87 | draw_caption(img, (x1, y1, x2, y2), label_name) 88 | 89 | cv2.rectangle(img, (x1, y1), (x2, y2), color=(0, 0, 255), thickness=2) 90 | print(label_name) 91 | 92 | cv2.imshow('img', img) 93 | cv2.waitKey(0) 94 | 95 | 96 | 97 | if __name__ == '__main__': 98 | main() -------------------------------------------------------------------------------- /anchors.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | 5 | 6 | class Anchors(nn.Module): 7 | def __init__(self, pyramid_levels=None, strides=None, sizes=None, ratios=None, scales=None): 8 | super(Anchors, self).__init__() 9 | 10 | if pyramid_levels is None: 11 | self.pyramid_levels = [3, 4, 5, 6, 7] 12 | if strides is None: 13 | self.strides = [2 ** x for x in self.pyramid_levels] 14 | if sizes is None: 15 | self.sizes = [2 ** (x + 2) for x in self.pyramid_levels] 16 | if ratios is None: 17 | self.ratios = np.array([0.5, 1, 2]) 18 | if scales is None: 19 | self.scales = np.array([2 ** 0, 2 ** (1.0 / 3.0), 2 ** (2.0 / 3.0)]) 20 | 21 | def forward(self, image): 22 | 23 | image_shape = image.shape[2:] 24 | image_shape = np.array(image_shape) 25 | image_shapes = [(image_shape + 2 ** x - 1) // (2 ** x) for x in self.pyramid_levels] 26 | 27 | # compute anchors over all pyramid levels 28 | all_anchors = np.zeros((0, 4)).astype(np.float32) 29 | 30 | for idx, p in 
enumerate(self.pyramid_levels): 31 | anchors = generate_anchors(base_size=self.sizes[idx], ratios=self.ratios, scales=self.scales) 32 | shifted_anchors = shift(image_shapes[idx], self.strides[idx], anchors) 33 | all_anchors = np.append(all_anchors, shifted_anchors, axis=0) 34 | 35 | all_anchors = np.expand_dims(all_anchors, axis=0) 36 | 37 | return torch.from_numpy(all_anchors.astype(np.float32)).cuda() 38 | 39 | def generate_anchors(base_size=16, ratios=None, scales=None): 40 | """ 41 | Generate anchor (reference) windows by enumerating aspect ratios X 42 | scales w.r.t. a reference window. 43 | """ 44 | 45 | if ratios is None: 46 | ratios = np.array([0.5, 1, 2]) 47 | 48 | if scales is None: 49 | scales = np.array([2 ** 0, 2 ** (1.0 / 3.0), 2 ** (2.0 / 3.0)]) 50 | 51 | num_anchors = len(ratios) * len(scales) 52 | 53 | # initialize output anchors 54 | anchors = np.zeros((num_anchors, 4)) 55 | 56 | # scale base_size 57 | anchors[:, 2:] = base_size * np.tile(scales, (2, len(ratios))).T 58 | 59 | # compute areas of anchors 60 | areas = anchors[:, 2] * anchors[:, 3] 61 | 62 | # correct for ratios 63 | anchors[:, 2] = np.sqrt(areas / np.repeat(ratios, len(scales))) 64 | anchors[:, 3] = anchors[:, 2] * np.repeat(ratios, len(scales)) 65 | 66 | # transform from (x_ctr, y_ctr, w, h) -> (x1, y1, x2, y2) 67 | anchors[:, 0::2] -= np.tile(anchors[:, 2] * 0.5, (2, 1)).T 68 | anchors[:, 1::2] -= np.tile(anchors[:, 3] * 0.5, (2, 1)).T 69 | 70 | return anchors 71 | 72 | def compute_shape(image_shape, pyramid_levels): 73 | """Compute shapes based on pyramid levels. 74 | 75 | :param image_shape: 76 | :param pyramid_levels: 77 | :return: 78 | """ 79 | image_shape = np.array(image_shape[:2]) 80 | image_shapes = [(image_shape + 2 ** x - 1) // (2 ** x) for x in pyramid_levels] 81 | return image_shapes 82 | 83 | 84 | def anchors_for_shape( 85 | image_shape, 86 | pyramid_levels=None, 87 | ratios=None, 88 | scales=None, 89 | strides=None, 90 | sizes=None, 91 | shapes_callback=None, 92 | ): 93 | 94 | image_shapes = compute_shape(image_shape, pyramid_levels) 95 | 96 | # compute anchors over all pyramid levels 97 | all_anchors = np.zeros((0, 4)) 98 | for idx, p in enumerate(pyramid_levels): 99 | anchors = generate_anchors(base_size=sizes[idx], ratios=ratios, scales=scales) 100 | shifted_anchors = shift(image_shapes[idx], strides[idx], anchors) 101 | all_anchors = np.append(all_anchors, shifted_anchors, axis=0) 102 | 103 | return all_anchors 104 | 105 | 106 | def shift(shape, stride, anchors): 107 | shift_x = (np.arange(0, shape[1]) + 0.5) * stride 108 | shift_y = (np.arange(0, shape[0]) + 0.5) * stride 109 | 110 | shift_x, shift_y = np.meshgrid(shift_x, shift_y) 111 | 112 | shifts = np.vstack(( 113 | shift_x.ravel(), shift_y.ravel(), 114 | shift_x.ravel(), shift_y.ravel() 115 | )).transpose() 116 | 117 | # add A anchors (1, A, 4) to 118 | # cell K shifts (K, 1, 4) to get 119 | # shift anchors (K, A, 4) 120 | # reshape to (K*A, 4) shifted anchors 121 | A = anchors.shape[0] 122 | K = shifts.shape[0] 123 | all_anchors = (anchors.reshape((1, A, 4)) + shifts.reshape((1, K, 4)).transpose((1, 0, 2))) 124 | all_anchors = all_anchors.reshape((K * A, 4)) 125 | 126 | return all_anchors 127 | 128 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | 5 | def conv3x3(in_planes, out_planes, stride=1): 6 | """3x3 convolution with padding""" 7 | 
return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, 8 | padding=1, bias=False) 9 | 10 | class BasicBlock(nn.Module): 11 | expansion = 1 12 | 13 | def __init__(self, inplanes, planes, stride=1, downsample=None): 14 | super(BasicBlock, self).__init__() 15 | self.conv1 = conv3x3(inplanes, planes, stride) 16 | self.bn1 = nn.BatchNorm2d(planes) 17 | self.relu = nn.ReLU(inplace=True) 18 | self.conv2 = conv3x3(planes, planes) 19 | self.bn2 = nn.BatchNorm2d(planes) 20 | self.downsample = downsample 21 | self.stride = stride 22 | 23 | def forward(self, x): 24 | residual = x 25 | 26 | out = self.conv1(x) 27 | out = self.bn1(out) 28 | out = self.relu(out) 29 | 30 | out = self.conv2(out) 31 | out = self.bn2(out) 32 | 33 | if self.downsample is not None: 34 | residual = self.downsample(x) 35 | 36 | out += residual 37 | out = self.relu(out) 38 | 39 | return out 40 | 41 | 42 | class Bottleneck(nn.Module): 43 | expansion = 4 44 | 45 | def __init__(self, inplanes, planes, stride=1, downsample=None): 46 | super(Bottleneck, self).__init__() 47 | self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) 48 | self.bn1 = nn.BatchNorm2d(planes) 49 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, 50 | padding=1, bias=False) 51 | self.bn2 = nn.BatchNorm2d(planes) 52 | self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) 53 | self.bn3 = nn.BatchNorm2d(planes * 4) 54 | self.relu = nn.ReLU(inplace=True) 55 | self.downsample = downsample 56 | self.stride = stride 57 | 58 | def forward(self, x): 59 | residual = x 60 | 61 | out = self.conv1(x) 62 | out = self.bn1(out) 63 | out = self.relu(out) 64 | 65 | out = self.conv2(out) 66 | out = self.bn2(out) 67 | out = self.relu(out) 68 | 69 | out = self.conv3(out) 70 | out = self.bn3(out) 71 | 72 | if self.downsample is not None: 73 | residual = self.downsample(x) 74 | 75 | out += residual 76 | out = self.relu(out) 77 | 78 | return out 79 | 80 | class BBoxTransform(nn.Module): 81 | 82 | def __init__(self, mean=None, std=None): 83 | super(BBoxTransform, self).__init__() 84 | if mean is None: 85 | self.mean = torch.from_numpy(np.array([0, 0, 0, 0]).astype(np.float32)).cuda() 86 | else: 87 | self.mean = mean 88 | if std is None: 89 | self.std = torch.from_numpy(np.array([0.1, 0.1, 0.2, 0.2]).astype(np.float32)).cuda() 90 | else: 91 | self.std = std 92 | 93 | def forward(self, boxes, deltas): 94 | 95 | widths = boxes[:, :, 2] - boxes[:, :, 0] 96 | heights = boxes[:, :, 3] - boxes[:, :, 1] 97 | ctr_x = boxes[:, :, 0] + 0.5 * widths 98 | ctr_y = boxes[:, :, 1] + 0.5 * heights 99 | 100 | dx = deltas[:, :, 0] * self.std[0] + self.mean[0] 101 | dy = deltas[:, :, 1] * self.std[1] + self.mean[1] 102 | dw = deltas[:, :, 2] * self.std[2] + self.mean[2] 103 | dh = deltas[:, :, 3] * self.std[3] + self.mean[3] 104 | 105 | pred_ctr_x = ctr_x + dx * widths 106 | pred_ctr_y = ctr_y + dy * heights 107 | pred_w = torch.exp(dw) * widths 108 | pred_h = torch.exp(dh) * heights 109 | 110 | pred_boxes_x1 = pred_ctr_x - 0.5 * pred_w 111 | pred_boxes_y1 = pred_ctr_y - 0.5 * pred_h 112 | pred_boxes_x2 = pred_ctr_x + 0.5 * pred_w 113 | pred_boxes_y2 = pred_ctr_y + 0.5 * pred_h 114 | 115 | pred_boxes = torch.stack([pred_boxes_x1, pred_boxes_y1, pred_boxes_x2, pred_boxes_y2], dim=2) 116 | 117 | return pred_boxes 118 | 119 | 120 | class ClipBoxes(nn.Module): 121 | 122 | def __init__(self, width=None, height=None): 123 | super(ClipBoxes, self).__init__() 124 | 125 | def forward(self, boxes, img): 126 | 127 | batch_size, num_channels, height, 
width = img.shape 128 | 129 | boxes[:, :, 0] = torch.clamp(boxes[:, :, 0], min=0) 130 | boxes[:, :, 1] = torch.clamp(boxes[:, :, 1], min=0) 131 | 132 | boxes[:, :, 2] = torch.clamp(boxes[:, :, 2], max=width) 133 | boxes[:, :, 3] = torch.clamp(boxes[:, :, 3], max=height) 134 | 135 | return boxes 136 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pytorch-retinanet 2 | 3 | ![img3](https://github.com/yhenon/pytorch-retinanet/blob/master/images/3.jpg) 4 | ![img5](https://github.com/yhenon/pytorch-retinanet/blob/master/images/5.jpg) 5 | 6 | Pytorch implementation of RetinaNet object detection as described in [Focal Loss for Dense Object Detection](https://arxiv.org/abs/1708.02002) by Tsung-Yi Lin, Priya Goyal, Ross Girshick, Kaiming He and Piotr Dollár. 7 | 8 | This implementation is primarily designed to be easy to read and simple to modify. 9 | 10 | ## Results 11 | Currently, this repo achieves 33.7% mAP at 600px resolution with a Resnet-50 backbone. The published result is 34.0% mAP. The difference is likely due to the use of Adam optimizer instead of SGD with weight decay. 12 | 13 | ## Installation 14 | 15 | 1) Clone this repo 16 | 17 | 2) Install the required packages: 18 | 19 | ``` 20 | apt-get install tk-dev python-tk 21 | ``` 22 | 23 | 3) Install the python packages: 24 | 25 | ``` 26 | pip install cffi 27 | 28 | pip install pandas 29 | 30 | pip install pycocotools 31 | 32 | pip install cython 33 | 34 | pip install pycocotools 35 | 36 | pip install opencv-python 37 | 38 | pip install requests 39 | 40 | ``` 41 | 42 | 4) Build the NMS extension. 43 | 44 | ``` 45 | cd pytorch-retinanet/lib 46 | bash build.sh 47 | cd ../ 48 | ``` 49 | 50 | Note that you may have to edit line 14 of `build.sh` if you want to change which version of python you are building the extension for. 51 | 52 | ## Training 53 | 54 | The network can be trained using the `train.py` script. Currently, two dataloaders are available: COCO and CSV. For training on coco, use 55 | 56 | ``` 57 | python train.py --dataset coco --coco_path ../coco --depth 50 58 | ``` 59 | 60 | For training using a custom dataset, with annotations in CSV format (see below), use 61 | 62 | ``` 63 | python train.py --dataset csv --csv_train --csv_classes --csv_val 64 | ``` 65 | 66 | Note that the --csv_val argument is optional, in which case no validation will be performed. 67 | 68 | ## Pre-trained model 69 | 70 | A pre-trained model is available at: 71 | - https://drive.google.com/open?id=1yLmjq3JtXi841yXWBxst0coAgR26MNBS (this is a pytorch state dict) 72 | - https://drive.google.com/open?id=1hCtM35R_t6T8RJVSd74K4gB-A1MR-TxC (this is a pytorch model serialized via `torch.save()`) 73 | 74 | The state dict model can be loaded using: 75 | 76 | ``` 77 | retinanet = model.resnet50(num_classes=dataset_train.num_classes(),) 78 | retinanet.load_state_dict(torch.load(PATH_TO_WEIGHTS)) 79 | ``` 80 | 81 | The pytorch model can be loaded directly using: 82 | 83 | ``` 84 | retinanet = torch.load(PATH_TO_MODEL) 85 | ``` 86 | 87 | ## Visualization 88 | 89 | To visualize the network detection, use `visualize.py`: 90 | 91 | ``` 92 | python visualize.py --dataset coco --coco_path ../coco --model 93 | ``` 94 | This will visualize bounding boxes on the validation set. 
To visualise with a CSV dataset, use: 95 | 96 | ``` 97 | python visualize.py --dataset csv --csv_classes --csv_val --model 98 | ``` 99 | 100 | ## Model 101 | 102 | The retinanet model uses a resnet backbone. You can set the depth of the resnet model using the --depth argument. Depth must be one of 18, 34, 50, 101 or 152. Note that deeper models are more accurate but are slower and use more memory. 103 | 104 | ## CSV datasets 105 | The `CSVGenerator` provides an easy way to define your own datasets. 106 | It uses two CSV files: one file containing annotations and one file containing a class name to ID mapping. 107 | 108 | ### Annotations format 109 | The CSV file with annotations should contain one annotation per line. 110 | Images with multiple bounding boxes should use one row per bounding box. 111 | Note that indexing for pixel values starts at 0. 112 | The expected format of each line is: 113 | ``` 114 | path/to/image.jpg,x1,y1,x2,y2,class_name 115 | ``` 116 | 117 | Some images may not contain any labeled objects. 118 | To add these images to the dataset as negative examples, 119 | add an annotation where `x1`, `y1`, `x2`, `y2` and `class_name` are all empty: 120 | ``` 121 | path/to/image.jpg,,,,, 122 | ``` 123 | 124 | A full example: 125 | ``` 126 | /data/imgs/img_001.jpg,837,346,981,456,cow 127 | /data/imgs/img_002.jpg,215,312,279,391,cat 128 | /data/imgs/img_002.jpg,22,5,89,84,bird 129 | /data/imgs/img_003.jpg,,,,, 130 | ``` 131 | 132 | This defines a dataset with 3 images. 133 | `img_001.jpg` contains a cow. 134 | `img_002.jpg` contains a cat and a bird. 135 | `img_003.jpg` contains no interesting objects/animals. 136 | 137 | 138 | ### Class mapping format 139 | The class name to ID mapping file should contain one mapping per line. 140 | Each line should use the following format: 141 | ``` 142 | class_name,id 143 | ``` 144 | 145 | Indexing for classes starts at 0. 146 | Do not include a background class as it is implicit. 
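As a reference, here is a minimal sketch of how such a mapping file can be read in Python (the `load_class_mapping` helper below is purely illustrative and is not part of this repository):

```
import csv

def load_class_mapping(path):
    # illustrative helper: read `class_name,id` rows into a name -> id dict
    name_to_id = {}
    with open(path) as f:
        for row in csv.reader(f):
            if not row:  # skip empty trailing lines
                continue
            name_to_id[row[0]] = int(row[1])
    return name_to_id
```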
147 | 148 | For example: 149 | ``` 150 | cow,0 151 | cat,1 152 | bird,2 153 | ``` 154 | 155 | ## Acknowledgements 156 | 157 | - Significant amounts of code are borrowed from the [keras retinanet implementation](https://github.com/fizyr/keras-retinanet) 158 | - The NMS module used is from the [pytorch faster-rcnn implementation](https://github.com/ruotianluo/pytorch-faster-rcnn) 159 | 160 | ## Examples 161 | 162 | ![img1](https://github.com/yhenon/pytorch-retinanet/blob/master/images/1.jpg) 163 | ![img2](https://github.com/yhenon/pytorch-retinanet/blob/master/images/2.jpg) 164 | ![img4](https://github.com/yhenon/pytorch-retinanet/blob/master/images/4.jpg) 165 | ![img6](https://github.com/yhenon/pytorch-retinanet/blob/master/images/6.jpg) 166 | ![img7](https://github.com/yhenon/pytorch-retinanet/blob/master/images/7.jpg) 167 | ![img8](https://github.com/yhenon/pytorch-retinanet/blob/master/images/8.jpg) 168 | -------------------------------------------------------------------------------- /losses.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | 5 | def calc_iou(a, b): 6 | area = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1]) 7 | 8 | iw = torch.min(torch.unsqueeze(a[:, 2], dim=1), b[:, 2]) - torch.max(torch.unsqueeze(a[:, 0], 1), b[:, 0]) 9 | ih = torch.min(torch.unsqueeze(a[:, 3], dim=1), b[:, 3]) - torch.max(torch.unsqueeze(a[:, 1], 1), b[:, 1]) 10 | 11 | iw = torch.clamp(iw, min=0) 12 | ih = torch.clamp(ih, min=0) 13 | 14 | ua = torch.unsqueeze((a[:, 2] - a[:, 0]) * (a[:, 3] - a[:, 1]), dim=1) + area - iw * ih 15 | 16 | ua = torch.clamp(ua, min=1e-8) 17 | 18 | intersection = iw * ih 19 | 20 | IoU = intersection / ua 21 | 22 | return IoU 23 | 24 | class FocalLoss(nn.Module): 25 | #def __init__(self): 26 | 27 | def forward(self, classifications, regressions, anchors, annotations): 28 | alpha = 0.25 29 | gamma = 2.0 30 | batch_size = classifications.shape[0] 31 | classification_losses = [] 32 | regression_losses = [] 33 | 34 | anchor = anchors[0, :, :] 35 | 36 | anchor_widths = anchor[:, 2] - anchor[:, 0] 37 | anchor_heights = anchor[:, 3] - anchor[:, 1] 38 | anchor_ctr_x = anchor[:, 0] + 0.5 * anchor_widths 39 | anchor_ctr_y = anchor[:, 1] + 0.5 * anchor_heights 40 | 41 | for j in range(batch_size): 42 | 43 | classification = classifications[j, :, :] 44 | regression = regressions[j, :, :] 45 | 46 | bbox_annotation = annotations[j, :, :] 47 | bbox_annotation = bbox_annotation[bbox_annotation[:, 4] != -1] 48 | 49 | if bbox_annotation.shape[0] == 0: 50 | regression_losses.append(torch.tensor(0).float().cuda()) 51 | classification_losses.append(torch.tensor(0).float().cuda()) 52 | 53 | continue 54 | 55 | classification = torch.clamp(classification, 1e-4, 1.0 - 1e-4) 56 | 57 | IoU = calc_iou(anchors[0, :, :], bbox_annotation[:, :4]) # num_anchors x num_annotations 58 | 59 | IoU_max, IoU_argmax = torch.max(IoU, dim=1) # num_anchors x 1 60 | 61 | #import pdb 62 | #pdb.set_trace() 63 | 64 | # compute the loss for classification 65 | targets = torch.ones(classification.shape) * -1 66 | targets = targets.cuda() 67 | 68 | targets[torch.lt(IoU_max, 0.4), :] = 0 69 | 70 | positive_indices = torch.ge(IoU_max, 0.5) 71 | 72 | num_positive_anchors = positive_indices.sum() 73 | 74 | assigned_annotations = bbox_annotation[IoU_argmax, :] 75 | 76 | targets[positive_indices, :] = 0 77 | targets[positive_indices, assigned_annotations[positive_indices, 4].long()] = 1 78 | 79 | alpha_factor = 
torch.ones(targets.shape).cuda() * alpha 80 | 81 | alpha_factor = torch.where(torch.eq(targets, 1.), alpha_factor, 1. - alpha_factor) 82 | focal_weight = torch.where(torch.eq(targets, 1.), 1. - classification, classification) 83 | focal_weight = alpha_factor * torch.pow(focal_weight, gamma) 84 | 85 | bce = -(targets * torch.log(classification) + (1.0 - targets) * torch.log(1.0 - classification)) 86 | 87 | # cls_loss = focal_weight * torch.pow(bce, gamma) 88 | cls_loss = focal_weight * bce 89 | 90 | cls_loss = torch.where(torch.ne(targets, -1.0), cls_loss, torch.zeros(cls_loss.shape).cuda()) 91 | 92 | classification_losses.append(cls_loss.sum()/torch.clamp(num_positive_anchors.float(), min=1.0)) 93 | 94 | # compute the loss for regression 95 | 96 | if positive_indices.sum() > 0: 97 | assigned_annotations = assigned_annotations[positive_indices, :] 98 | 99 | anchor_widths_pi = anchor_widths[positive_indices] 100 | anchor_heights_pi = anchor_heights[positive_indices] 101 | anchor_ctr_x_pi = anchor_ctr_x[positive_indices] 102 | anchor_ctr_y_pi = anchor_ctr_y[positive_indices] 103 | 104 | gt_widths = assigned_annotations[:, 2] - assigned_annotations[:, 0] 105 | gt_heights = assigned_annotations[:, 3] - assigned_annotations[:, 1] 106 | gt_ctr_x = assigned_annotations[:, 0] + 0.5 * gt_widths 107 | gt_ctr_y = assigned_annotations[:, 1] + 0.5 * gt_heights 108 | 109 | # clip widths to 1 110 | gt_widths = torch.clamp(gt_widths, min=1) 111 | gt_heights = torch.clamp(gt_heights, min=1) 112 | 113 | targets_dx = (gt_ctr_x - anchor_ctr_x_pi) / anchor_widths_pi 114 | targets_dy = (gt_ctr_y - anchor_ctr_y_pi) / anchor_heights_pi 115 | targets_dw = torch.log(gt_widths / anchor_widths_pi) 116 | targets_dh = torch.log(gt_heights / anchor_heights_pi) 117 | 118 | targets = torch.stack((targets_dx, targets_dy, targets_dw, targets_dh)) 119 | targets = targets.t() 120 | 121 | targets = targets/torch.Tensor([[0.1, 0.1, 0.2, 0.2]]).cuda() 122 | 123 | 124 | negative_indices = 1 - positive_indices 125 | 126 | regression_diff = torch.abs(targets - regression[positive_indices, :]) 127 | 128 | regression_loss = torch.where( 129 | torch.le(regression_diff, 1.0 / 9.0), 130 | 0.5 * 9.0 * torch.pow(regression_diff, 2), 131 | regression_diff - 0.5 / 9.0 132 | ) 133 | regression_losses.append(regression_loss.mean()) 134 | else: 135 | regression_losses.append(torch.tensor(0).float().cuda()) 136 | 137 | return torch.stack(classification_losses).mean(dim=0, keepdim=True), torch.stack(regression_losses).mean(dim=0, keepdim=True) 138 | 139 | 140 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import time 2 | import os 3 | import copy 4 | import argparse 5 | import pdb 6 | import collections 7 | import sys 8 | 9 | import numpy as np 10 | 11 | import torch 12 | import torch.nn as nn 13 | import torch.optim as optim 14 | from torch.optim import lr_scheduler 15 | from torch.autograd import Variable 16 | from torchvision import datasets, models, transforms 17 | import torchvision 18 | 19 | import model 20 | from anchors import Anchors 21 | import losses 22 | from dataloader import CocoDataset, CSVDataset, collater, Resizer, AspectRatioBasedSampler, Augmenter, UnNormalizer, Normalizer 23 | from torch.utils.data import Dataset, DataLoader 24 | 25 | import coco_eval 26 | import csv_eval 27 | 28 | assert torch.__version__.split('.')[1] == '4' 29 | 30 | print('CUDA available: {}'.format(torch.cuda.is_available())) 
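# Note: the version assert above ties this script to the torch 0.4.x API; the
# NMS extension under lib/nms is built with torch.utils.ffi, which was removed
# in later PyTorch releases, so newer versions are not expected to work
# unmodified.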
31 | 32 | 33 | def main(args=None): 34 | 35 | parser = argparse.ArgumentParser(description='Simple training script for training a RetinaNet network.') 36 | 37 | parser.add_argument('--dataset', help='Dataset type, must be one of csv or coco.') 38 | parser.add_argument('--coco_path', help='Path to COCO directory') 39 | parser.add_argument('--csv_train', help='Path to file containing training annotations (see readme)') 40 | parser.add_argument('--csv_classes', help='Path to file containing class list (see readme)') 41 | parser.add_argument('--csv_val', help='Path to file containing validation annotations (optional, see readme)') 42 | 43 | parser.add_argument('--depth', help='Resnet depth, must be one of 18, 34, 50, 101, 152', type=int, default=50) 44 | parser.add_argument('--epochs', help='Number of epochs', type=int, default=100) 45 | 46 | parser = parser.parse_args(args) 47 | 48 | # Create the data loaders 49 | if parser.dataset == 'coco': 50 | 51 | if parser.coco_path is None: 52 | raise ValueError('Must provide --coco_path when training on COCO,') 53 | 54 | dataset_train = CocoDataset(parser.coco_path, set_name='train2017', transform=transforms.Compose([Normalizer(), Augmenter(), Resizer()])) 55 | dataset_val = CocoDataset(parser.coco_path, set_name='val2017', transform=transforms.Compose([Normalizer(), Resizer()])) 56 | 57 | elif parser.dataset == 'csv': 58 | 59 | if parser.csv_train is None: 60 | raise ValueError('Must provide --csv_train when training on COCO,') 61 | 62 | if parser.csv_classes is None: 63 | raise ValueError('Must provide --csv_classes when training on COCO,') 64 | 65 | 66 | dataset_train = CSVDataset(train_file=parser.csv_train, class_list=parser.csv_classes, transform=transforms.Compose([Normalizer(), Augmenter(), Resizer()])) 67 | 68 | if parser.csv_val is None: 69 | dataset_val = None 70 | print('No validation annotations provided.') 71 | else: 72 | dataset_val = CSVDataset(train_file=parser.csv_val, class_list=parser.csv_classes, transform=transforms.Compose([Normalizer(), Resizer()])) 73 | 74 | else: 75 | raise ValueError('Dataset type not understood (must be csv or coco), exiting.') 76 | 77 | sampler = AspectRatioBasedSampler(dataset_train, batch_size=2, drop_last=False) 78 | dataloader_train = DataLoader(dataset_train, num_workers=3, collate_fn=collater, batch_sampler=sampler) 79 | 80 | if dataset_val is not None: 81 | sampler_val = AspectRatioBasedSampler(dataset_val, batch_size=1, drop_last=False) 82 | dataloader_val = DataLoader(dataset_val, num_workers=3, collate_fn=collater, batch_sampler=sampler_val) 83 | 84 | # Create the model 85 | if parser.depth == 18: 86 | retinanet = model.resnet18(num_classes=dataset_train.num_classes(), pretrained=True) 87 | elif parser.depth == 34: 88 | retinanet = model.resnet34(num_classes=dataset_train.num_classes(), pretrained=True) 89 | elif parser.depth == 50: 90 | retinanet = model.resnet50(num_classes=dataset_train.num_classes(), pretrained=True) 91 | elif parser.depth == 101: 92 | retinanet = model.resnet101(num_classes=dataset_train.num_classes(), pretrained=True) 93 | elif parser.depth == 152: 94 | retinanet = model.resnet152(num_classes=dataset_train.num_classes(), pretrained=True) 95 | else: 96 | raise ValueError('Unsupported model depth, must be one of 18, 34, 50, 101, 152') 97 | 98 | use_gpu = True 99 | 100 | if use_gpu: 101 | retinanet = retinanet.cuda() 102 | 103 | retinanet = torch.nn.DataParallel(retinanet).cuda() 104 | 105 | retinanet.training = True 106 | 107 | optimizer = optim.Adam(retinanet.parameters(), 
lr=1e-5) 108 | 109 | scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, verbose=True) 110 | 111 | loss_hist = collections.deque(maxlen=500) 112 | 113 | retinanet.train() 114 | retinanet.module.freeze_bn() 115 | 116 | print('Num training images: {}'.format(len(dataset_train))) 117 | 118 | for epoch_num in range(parser.epochs): 119 | 120 | retinanet.train() 121 | retinanet.module.freeze_bn() 122 | 123 | epoch_loss = [] 124 | 125 | for iter_num, data in enumerate(dataloader_train): 126 | try: 127 | optimizer.zero_grad() 128 | 129 | classification_loss, regression_loss = retinanet([data['img'].cuda().float(), data['annot']]) 130 | 131 | classification_loss = classification_loss.mean() 132 | regression_loss = regression_loss.mean() 133 | 134 | loss = classification_loss + regression_loss 135 | 136 | if bool(loss == 0): 137 | continue 138 | 139 | loss.backward() 140 | 141 | torch.nn.utils.clip_grad_norm_(retinanet.parameters(), 0.1) 142 | 143 | optimizer.step() 144 | 145 | loss_hist.append(float(loss)) 146 | 147 | epoch_loss.append(float(loss)) 148 | 149 | print('Epoch: {} | Iteration: {} | Classification loss: {:1.5f} | Regression loss: {:1.5f} | Running loss: {:1.5f}'.format(epoch_num, iter_num, float(classification_loss), float(regression_loss), np.mean(loss_hist))) 150 | 151 | del classification_loss 152 | del regression_loss 153 | except Exception as e: 154 | print(e) 155 | continue 156 | 157 | if parser.dataset == 'coco': 158 | 159 | print('Evaluating dataset') 160 | 161 | coco_eval.evaluate_coco(dataset_val, retinanet) 162 | 163 | elif parser.dataset == 'csv' and parser.csv_val is not None: 164 | 165 | print('Evaluating dataset') 166 | 167 | mAP = csv_eval.evaluate(dataset_val, retinanet) 168 | 169 | 170 | scheduler.step(np.mean(epoch_loss)) 171 | 172 | torch.save(retinanet.module, '{}_retinanet_{}.pt'.format(parser.dataset, epoch_num)) 173 | 174 | retinanet.eval() 175 | 176 | torch.save(retinanet, 'model_final.pt'.format(epoch_num)) 177 | 178 | if __name__ == '__main__': 179 | main() 180 | -------------------------------------------------------------------------------- /csv_eval.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import json 5 | import os 6 | 7 | import torch 8 | 9 | 10 | 11 | def compute_overlap(a, b): 12 | """ 13 | Parameters 14 | ---------- 15 | a: (N, 4) ndarray of float 16 | b: (K, 4) ndarray of float 17 | Returns 18 | ------- 19 | overlaps: (N, K) ndarray of overlap between boxes and query_boxes 20 | """ 21 | area = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1]) 22 | 23 | iw = np.minimum(np.expand_dims(a[:, 2], axis=1), b[:, 2]) - np.maximum(np.expand_dims(a[:, 0], 1), b[:, 0]) 24 | ih = np.minimum(np.expand_dims(a[:, 3], axis=1), b[:, 3]) - np.maximum(np.expand_dims(a[:, 1], 1), b[:, 1]) 25 | 26 | iw = np.maximum(iw, 0) 27 | ih = np.maximum(ih, 0) 28 | 29 | ua = np.expand_dims((a[:, 2] - a[:, 0]) * (a[:, 3] - a[:, 1]), axis=1) + area - iw * ih 30 | 31 | ua = np.maximum(ua, np.finfo(float).eps) 32 | 33 | intersection = iw * ih 34 | 35 | return intersection / ua 36 | 37 | 38 | def _compute_ap(recall, precision): 39 | """ Compute the average precision, given the recall and precision curves. 40 | Code originally from https://github.com/rbgirshick/py-faster-rcnn. 41 | # Arguments 42 | recall: The recall curve (list). 43 | precision: The precision curve (list). 44 | # Returns 45 | The average precision as computed in py-faster-rcnn. 
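    Concretely, the implementation below first makes the precision curve
    monotonically non-increasing (the precision "envelope") and then sums
    (recall[i+1] - recall[i]) * precision[i+1] over the points where recall
    changes, i.e. the area under the interpolated precision-recall curve.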
46 | """ 47 | # correct AP calculation 48 | # first append sentinel values at the end 49 | mrec = np.concatenate(([0.], recall, [1.])) 50 | mpre = np.concatenate(([0.], precision, [0.])) 51 | 52 | # compute the precision envelope 53 | for i in range(mpre.size - 1, 0, -1): 54 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 55 | 56 | # to calculate area under PR curve, look for points 57 | # where X axis (recall) changes value 58 | i = np.where(mrec[1:] != mrec[:-1])[0] 59 | 60 | # and sum (\Delta recall) * prec 61 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 62 | return ap 63 | 64 | 65 | def _get_detections(dataset, retinanet, score_threshold=0.05, max_detections=100, save_path=None): 66 | """ Get the detections from the retinanet using the generator. 67 | The result is a list of lists such that the size is: 68 | all_detections[num_images][num_classes] = detections[num_detections, 4 + num_classes] 69 | # Arguments 70 | dataset : The generator used to run images through the retinanet. 71 | retinanet : The retinanet to run on the images. 72 | score_threshold : The score confidence threshold to use. 73 | max_detections : The maximum number of detections to use per image. 74 | save_path : The path to save the images with visualized detections to. 75 | # Returns 76 | A list of lists containing the detections for each image in the generator. 77 | """ 78 | all_detections = [[None for i in range(dataset.num_classes())] for j in range(len(dataset))] 79 | 80 | retinanet.eval() 81 | 82 | with torch.no_grad(): 83 | 84 | for index in range(len(dataset)): 85 | data = dataset[index] 86 | scale = data['scale'] 87 | 88 | # run network 89 | scores, labels, boxes = retinanet(data['img'].permute(2, 0, 1).cuda().float().unsqueeze(dim=0)) 90 | scores = scores.cpu().numpy() 91 | labels = labels.cpu().numpy() 92 | boxes = boxes.cpu().numpy() 93 | 94 | # correct boxes for image scale 95 | boxes /= scale 96 | 97 | # select indices which have a score above the threshold 98 | indices = np.where(scores > score_threshold)[0] 99 | if indices.shape[0] > 0: 100 | # select those scores 101 | scores = scores[indices] 102 | 103 | # find the order with which to sort the scores 104 | scores_sort = np.argsort(-scores)[:max_detections] 105 | 106 | # select detections 107 | image_boxes = boxes[indices[scores_sort], :] 108 | image_scores = scores[scores_sort] 109 | image_labels = labels[indices[scores_sort]] 110 | image_detections = np.concatenate([image_boxes, np.expand_dims(image_scores, axis=1), np.expand_dims(image_labels, axis=1)], axis=1) 111 | 112 | # copy detections to all_detections 113 | for label in range(dataset.num_classes()): 114 | all_detections[index][label] = image_detections[image_detections[:, -1] == label, :-1] 115 | else: 116 | # copy detections to all_detections 117 | for label in range(dataset.num_classes()): 118 | all_detections[index][label] = np.zeros((0, 5)) 119 | 120 | print('{}/{}'.format(index + 1, len(dataset)), end='\r') 121 | 122 | return all_detections 123 | 124 | 125 | def _get_annotations(generator): 126 | """ Get the ground truth annotations from the generator. 127 | The result is a list of lists such that the size is: 128 | all_detections[num_images][num_classes] = annotations[num_detections, 5] 129 | # Arguments 130 | generator : The generator used to retrieve ground truth annotations. 131 | # Returns 132 | A list of lists containing the annotations for each image in the generator. 
133 | """ 134 | all_annotations = [[None for i in range(generator.num_classes())] for j in range(len(generator))] 135 | 136 | for i in range(len(generator)): 137 | # load the annotations 138 | annotations = generator.load_annotations(i) 139 | 140 | # copy detections to all_annotations 141 | for label in range(generator.num_classes()): 142 | all_annotations[i][label] = annotations[annotations[:, 4] == label, :4].copy() 143 | 144 | print('{}/{}'.format(i + 1, len(generator)), end='\r') 145 | 146 | return all_annotations 147 | 148 | 149 | def evaluate( 150 | generator, 151 | retinanet, 152 | iou_threshold=0.5, 153 | score_threshold=0.05, 154 | max_detections=100, 155 | save_path=None 156 | ): 157 | """ Evaluate a given dataset using a given retinanet. 158 | # Arguments 159 | generator : The generator that represents the dataset to evaluate. 160 | retinanet : The retinanet to evaluate. 161 | iou_threshold : The threshold used to consider when a detection is positive or negative. 162 | score_threshold : The score confidence threshold to use for detections. 163 | max_detections : The maximum number of detections to use per image. 164 | save_path : The path to save images with visualized detections to. 165 | # Returns 166 | A dict mapping class names to mAP scores. 167 | """ 168 | 169 | 170 | 171 | # gather all detections and annotations 172 | 173 | all_detections = _get_detections(generator, retinanet, score_threshold=score_threshold, max_detections=max_detections, save_path=save_path) 174 | all_annotations = _get_annotations(generator) 175 | 176 | average_precisions = {} 177 | 178 | for label in range(generator.num_classes()): 179 | false_positives = np.zeros((0,)) 180 | true_positives = np.zeros((0,)) 181 | scores = np.zeros((0,)) 182 | num_annotations = 0.0 183 | 184 | for i in range(len(generator)): 185 | detections = all_detections[i][label] 186 | annotations = all_annotations[i][label] 187 | num_annotations += annotations.shape[0] 188 | detected_annotations = [] 189 | 190 | for d in detections: 191 | scores = np.append(scores, d[4]) 192 | 193 | if annotations.shape[0] == 0: 194 | false_positives = np.append(false_positives, 1) 195 | true_positives = np.append(true_positives, 0) 196 | continue 197 | 198 | overlaps = compute_overlap(np.expand_dims(d, axis=0), annotations) 199 | assigned_annotation = np.argmax(overlaps, axis=1) 200 | max_overlap = overlaps[0, assigned_annotation] 201 | 202 | if max_overlap >= iou_threshold and assigned_annotation not in detected_annotations: 203 | false_positives = np.append(false_positives, 0) 204 | true_positives = np.append(true_positives, 1) 205 | detected_annotations.append(assigned_annotation) 206 | else: 207 | false_positives = np.append(false_positives, 1) 208 | true_positives = np.append(true_positives, 0) 209 | 210 | # no annotations -> AP for this class is 0 (is this correct?) 
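# (Reporting AP = 0 for a class that has no ground-truth boxes is a
# convention; some evaluation scripts skip such classes instead so they do
# not drag the mean down. The value stored below is the tuple
# (AP, num_annotations), so callers can filter on the annotation count.)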
211 | if num_annotations == 0: 212 | average_precisions[label] = 0, 0 213 | continue 214 | 215 | # sort by score 216 | indices = np.argsort(-scores) 217 | false_positives = false_positives[indices] 218 | true_positives = true_positives[indices] 219 | 220 | # compute false positives and true positives 221 | false_positives = np.cumsum(false_positives) 222 | true_positives = np.cumsum(true_positives) 223 | 224 | # compute recall and precision 225 | recall = true_positives / num_annotations 226 | precision = true_positives / np.maximum(true_positives + false_positives, np.finfo(np.float64).eps) 227 | 228 | # compute average precision 229 | average_precision = _compute_ap(recall, precision) 230 | average_precisions[label] = average_precision, num_annotations 231 | 232 | print('\nmAP:') 233 | for label in range(generator.num_classes()): 234 | label_name = generator.label_to_name(label) 235 | print('{}: {}'.format(label_name, average_precisions[label][0])) 236 | 237 | return average_precisions 238 | 239 | -------------------------------------------------------------------------------- /oid_dataset.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | 3 | import csv 4 | import json 5 | import os 6 | import warnings 7 | 8 | import numpy as np 9 | import skimage 10 | import skimage.color 11 | import skimage.io 12 | import skimage.transform 13 | from PIL import Image 14 | from torch.utils.data import Dataset 15 | 16 | 17 | def get_labels(metadata_dir, version='v4'): 18 | if version == 'v4' or version == 'challenge2018': 19 | csv_file = 'class-descriptions-boxable.csv' if version == 'v4' else 'challenge-2018-class-descriptions-500.csv' 20 | 21 | boxable_classes_descriptions = os.path.join(metadata_dir, csv_file) 22 | id_to_labels = {} 23 | cls_index = {} 24 | 25 | i = 0 26 | with open(boxable_classes_descriptions) as f: 27 | for row in csv.reader(f): 28 | # make sure the csv row is not empty (usually the last one) 29 | if len(row): 30 | label = row[0] 31 | description = row[1].replace("\"", "").replace("'", "").replace('`', '') 32 | 33 | id_to_labels[i] = description 34 | cls_index[label] = i 35 | 36 | i += 1 37 | else: 38 | trainable_classes_path = os.path.join(metadata_dir, 'classes-bbox-trainable.txt') 39 | description_path = os.path.join(metadata_dir, 'class-descriptions.csv') 40 | 41 | description_table = {} 42 | with open(description_path) as f: 43 | for row in csv.reader(f): 44 | # make sure the csv row is not empty (usually the last one) 45 | if len(row): 46 | description_table[row[0]] = row[1].replace("\"", "").replace("'", "").replace('`', '') 47 | 48 | with open(trainable_classes_path, 'rb') as f: 49 | trainable_classes = f.read().split('\n') 50 | 51 | id_to_labels = dict([(i, description_table[c]) for i, c in enumerate(trainable_classes)]) 52 | cls_index = dict([(c, i) for i, c in enumerate(trainable_classes)]) 53 | 54 | return id_to_labels, cls_index 55 | 56 | 57 | def generate_images_annotations_json(main_dir, metadata_dir, subset, cls_index, version='v4'): 58 | validation_image_ids = {} 59 | 60 | if version == 'v4': 61 | annotations_path = os.path.join(metadata_dir, subset, '{}-annotations-bbox.csv'.format(subset)) 62 | elif version == 'challenge2018': 63 | validation_image_ids_path = os.path.join(metadata_dir, 'challenge-2018-image-ids-valset-od.csv') 64 | 65 | with open(validation_image_ids_path, 'r') as csv_file: 66 | reader = csv.DictReader(csv_file, fieldnames=['ImageID']) 67 | reader.next() 68 
| for line, row in enumerate(reader): 69 | image_id = row['ImageID'] 70 | validation_image_ids[image_id] = True 71 | 72 | annotations_path = os.path.join(metadata_dir, 'challenge-2018-train-annotations-bbox.csv') 73 | else: 74 | annotations_path = os.path.join(metadata_dir, subset, 'annotations-human-bbox.csv') 75 | 76 | fieldnames = ['ImageID', 'Source', 'LabelName', 'Confidence', 77 | 'XMin', 'XMax', 'YMin', 'YMax', 78 | 'IsOccluded', 'IsTruncated', 'IsGroupOf', 'IsDepiction', 'IsInside'] 79 | 80 | id_annotations = dict() 81 | with open(annotations_path, 'r') as csv_file: 82 | reader = csv.DictReader(csv_file, fieldnames=fieldnames) 83 | next(reader) 84 | 85 | images_sizes = {} 86 | for line, row in enumerate(reader): 87 | frame = row['ImageID'] 88 | 89 | if version == 'challenge2018': 90 | if subset == 'train': 91 | if frame in validation_image_ids: 92 | continue 93 | elif subset == 'validation': 94 | if frame not in validation_image_ids: 95 | continue 96 | else: 97 | raise NotImplementedError('This generator handles only the train and validation subsets') 98 | 99 | class_name = row['LabelName'] 100 | 101 | if class_name not in cls_index: 102 | continue 103 | 104 | cls_id = cls_index[class_name] 105 | 106 | if version == 'challenge2018': 107 | # We recommend participants to use the provided subset of the training set as a validation set. 108 | # This is preferable over using the V4 val/test sets, as the training set is more densely annotated. 109 | img_path = os.path.join(main_dir, 'images', 'train', frame + '.jpg') 110 | else: 111 | img_path = os.path.join(main_dir, 'images', subset, frame + '.jpg') 112 | 113 | if frame in images_sizes: 114 | width, height = images_sizes[frame] 115 | else: 116 | try: 117 | with Image.open(img_path) as img: 118 | width, height = img.width, img.height 119 | images_sizes[frame] = (width, height) 120 | except Exception as ex: 121 | if version == 'challenge2018': 122 | raise ex 123 | continue 124 | 125 | x1 = float(row['XMin']) 126 | x2 = float(row['XMax']) 127 | y1 = float(row['YMin']) 128 | y2 = float(row['YMax']) 129 | 130 | x1_int = int(round(x1 * width)) 131 | x2_int = int(round(x2 * width)) 132 | y1_int = int(round(y1 * height)) 133 | y2_int = int(round(y2 * height)) 134 | 135 | # Check that the bounding box is valid. 
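# --- Illustrative aside (not part of oid_dataset.py) --------------------------
# Open Images boxes are stored normalized to [0, 1]; the loop above scales them
# by the image size, and the checks just below reject invalid boxes and drop
# those that collapse to zero width or height after rounding. Made-up values:
width, height = 1024, 768
x_min, x_max, y_min, y_max = 0.1000, 0.1005, 0.25, 0.50   # hypothetical CSV row

x1_px = int(round(x_min * width))    # 102
x2_px = int(round(x_max * width))    # 103 -> survives rounding (1 px wide)
y1_px = int(round(y_min * height))   # 192
y2_px = int(round(y_max * height))   # 384
keep = (x2_px > x1_px) and (y2_px > y1_px)
print((x1_px, y1_px, x2_px, y2_px), keep)
# --- end of aside -------------------------------------------------------------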
136 | if x2 <= x1: 137 | raise ValueError('line {}: x2 ({}) must be higher than x1 ({})'.format(line, x2, x1)) 138 | if y2 <= y1: 139 | raise ValueError('line {}: y2 ({}) must be higher than y1 ({})'.format(line, y2, y1)) 140 | 141 | if y2_int == y1_int: 142 | warnings.warn('filtering line {}: rounding y2 ({}) and y1 ({}) makes them equal'.format(line, y2, y1)) 143 | continue 144 | 145 | if x2_int == x1_int: 146 | warnings.warn('filtering line {}: rounding x2 ({}) and x1 ({}) makes them equal'.format(line, x2, x1)) 147 | continue 148 | 149 | img_id = row['ImageID'] 150 | annotation = {'cls_id': cls_id, 'x1': x1, 'x2': x2, 'y1': y1, 'y2': y2} 151 | 152 | if img_id in id_annotations: 153 | annotations = id_annotations[img_id] 154 | annotations['boxes'].append(annotation) 155 | else: 156 | id_annotations[img_id] = {'w': width, 'h': height, 'boxes': [annotation]} 157 | return id_annotations 158 | 159 | 160 | class OidDataset(Dataset): 161 | """Oid dataset.""" 162 | 163 | def __init__(self, main_dir, subset, version='v4', annotation_cache_dir='.', transform=None): 164 | if version == 'v4': 165 | metadata = '2018_04' 166 | elif version == 'challenge2018': 167 | metadata = 'challenge2018' 168 | elif version == 'v3': 169 | metadata = '2017_11' 170 | else: 171 | raise NotImplementedError('There is currently no implementation for versions older than v3') 172 | 173 | self.transform = transform 174 | 175 | if version == 'challenge2018': 176 | self.base_dir = os.path.join(main_dir, 'images', 'train') 177 | else: 178 | self.base_dir = os.path.join(main_dir, 'images', subset) 179 | 180 | metadata_dir = os.path.join(main_dir, metadata) 181 | annotation_cache_json = os.path.join(annotation_cache_dir, subset + '.json') 182 | 183 | self.id_to_labels, cls_index = get_labels(metadata_dir, version=version) 184 | 185 | if os.path.exists(annotation_cache_json): 186 | with open(annotation_cache_json, 'r') as f: 187 | self.annotations = json.loads(f.read()) 188 | else: 189 | self.annotations = generate_images_annotations_json(main_dir, metadata_dir, subset, cls_index, 190 | version=version) 191 | json.dump(self.annotations, open(annotation_cache_json, "w")) 192 | 193 | self.id_to_image_id = dict([(i, k) for i, k in enumerate(self.annotations)]) 194 | 195 | # (label -> name) 196 | self.labels = self.id_to_labels 197 | 198 | def __len__(self): 199 | return len(self.annotations) 200 | 201 | def __getitem__(self, idx): 202 | 203 | img = self.load_image(idx) 204 | annot = self.load_annotations(idx) 205 | sample = {'img': img, 'annot': annot} 206 | if self.transform: 207 | sample = self.transform(sample) 208 | 209 | return sample 210 | 211 | def image_path(self, image_index): 212 | path = os.path.join(self.base_dir, self.id_to_image_id[image_index] + '.jpg') 213 | return path 214 | 215 | def load_image(self, image_index): 216 | path = self.image_path(image_index) 217 | img = skimage.io.imread(path) 218 | 219 | if len(img.shape) == 1: 220 | img = img[0] 221 | 222 | if len(img.shape) == 2: 223 | img = skimage.color.gray2rgb(img) 224 | 225 | try: 226 | return img.astype(np.float32) / 255.0 227 | except Exception: 228 | print (path) 229 | exit(0) 230 | 231 | def load_annotations(self, image_index): 232 | # get ground truth annotations 233 | image_annotations = self.annotations[self.id_to_image_id[image_index]] 234 | 235 | labels = image_annotations['boxes'] 236 | height, width = image_annotations['h'], image_annotations['w'] 237 | 238 | boxes = np.zeros((len(labels), 5)) 239 | for idx, ann in enumerate(labels): 240 | cls_id 
= ann['cls_id'] 241 | x1 = ann['x1'] * width 242 | x2 = ann['x2'] * width 243 | y1 = ann['y1'] * height 244 | y2 = ann['y2'] * height 245 | 246 | boxes[idx, 0] = x1 247 | boxes[idx, 1] = y1 248 | boxes[idx, 2] = x2 249 | boxes[idx, 3] = y2 250 | boxes[idx, 4] = cls_id 251 | 252 | return boxes 253 | 254 | def image_aspect_ratio(self, image_index): 255 | img_annotations = self.annotations[self.id_to_image_id[image_index]] 256 | height, width = img_annotations['h'], img_annotations['w'] 257 | return float(width) / float(height) 258 | 259 | def num_classes(self): 260 | return len(self.id_to_labels) 261 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch 3 | import math 4 | import time 5 | import torch.utils.model_zoo as model_zoo 6 | from utils import BasicBlock, Bottleneck, BBoxTransform, ClipBoxes 7 | from anchors import Anchors 8 | import losses 9 | from lib.nms.pth_nms import pth_nms 10 | 11 | def nms(dets, thresh): 12 | "Dispatch to either CPU or GPU NMS implementations.\ 13 | Accept dets as tensor""" 14 | return pth_nms(dets, thresh) 15 | 16 | model_urls = { 17 | 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth', 18 | 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth', 19 | 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth', 20 | 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth', 21 | 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth', 22 | } 23 | 24 | class PyramidFeatures(nn.Module): 25 | def __init__(self, C3_size, C4_size, C5_size, feature_size=256): 26 | super(PyramidFeatures, self).__init__() 27 | 28 | # upsample C5 to get P5 from the FPN paper 29 | self.P5_1 = nn.Conv2d(C5_size, feature_size, kernel_size=1, stride=1, padding=0) 30 | self.P5_upsampled = nn.Upsample(scale_factor=2, mode='nearest') 31 | self.P5_2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, stride=1, padding=1) 32 | 33 | # add P5 elementwise to C4 34 | self.P4_1 = nn.Conv2d(C4_size, feature_size, kernel_size=1, stride=1, padding=0) 35 | self.P4_upsampled = nn.Upsample(scale_factor=2, mode='nearest') 36 | self.P4_2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, stride=1, padding=1) 37 | 38 | # add P4 elementwise to C3 39 | self.P3_1 = nn.Conv2d(C3_size, feature_size, kernel_size=1, stride=1, padding=0) 40 | self.P3_2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, stride=1, padding=1) 41 | 42 | # "P6 is obtained via a 3x3 stride-2 conv on C5" 43 | self.P6 = nn.Conv2d(C5_size, feature_size, kernel_size=3, stride=2, padding=1) 44 | 45 | # "P7 is computed by applying ReLU followed by a 3x3 stride-2 conv on P6" 46 | self.P7_1 = nn.ReLU() 47 | self.P7_2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, stride=2, padding=1) 48 | 49 | def forward(self, inputs): 50 | 51 | C3, C4, C5 = inputs 52 | 53 | P5_x = self.P5_1(C5) 54 | P5_upsampled_x = self.P5_upsampled(P5_x) 55 | P5_x = self.P5_2(P5_x) 56 | 57 | P4_x = self.P4_1(C4) 58 | P4_x = P5_upsampled_x + P4_x 59 | P4_upsampled_x = self.P4_upsampled(P4_x) 60 | P4_x = self.P4_2(P4_x) 61 | 62 | P3_x = self.P3_1(C3) 63 | P3_x = P3_x + P4_upsampled_x 64 | P3_x = self.P3_2(P3_x) 65 | 66 | P6_x = self.P6(C5) 67 | 68 | P7_x = self.P7_1(P6_x) 69 | P7_x = self.P7_2(P7_x) 70 | 71 | return [P3_x, P4_x, P5_x, P6_x, P7_x] 72 | 73 | 74 | class RegressionModel(nn.Module): 75 | def __init__(self, num_features_in, num_anchors=9, feature_size=256): 76 | super(RegressionModel, self).__init__() 77 | 78 | self.conv1 = nn.Conv2d(num_features_in, feature_size, kernel_size=3, padding=1) 79 | self.act1 = nn.ReLU() 80 | 81 | self.conv2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1) 82 | self.act2 = nn.ReLU() 83 | 84 | self.conv3 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1) 85 | self.act3 = nn.ReLU() 86 | 87 | self.conv4 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1) 88 | self.act4 = nn.ReLU() 89 | 90 | self.output = 
nn.Conv2d(feature_size, num_anchors*4, kernel_size=3, padding=1) 91 | 92 | def forward(self, x): 93 | 94 | out = self.conv1(x) 95 | out = self.act1(out) 96 | 97 | out = self.conv2(out) 98 | out = self.act2(out) 99 | 100 | out = self.conv3(out) 101 | out = self.act3(out) 102 | 103 | out = self.conv4(out) 104 | out = self.act4(out) 105 | 106 | out = self.output(out) 107 | 108 | # out is B x C x W x H, with C = 4*num_anchors 109 | out = out.permute(0, 2, 3, 1) 110 | 111 | return out.contiguous().view(out.shape[0], -1, 4) 112 | 113 | class ClassificationModel(nn.Module): 114 | def __init__(self, num_features_in, num_anchors=9, num_classes=80, prior=0.01, feature_size=256): 115 | super(ClassificationModel, self).__init__() 116 | 117 | self.num_classes = num_classes 118 | self.num_anchors = num_anchors 119 | 120 | self.conv1 = nn.Conv2d(num_features_in, feature_size, kernel_size=3, padding=1) 121 | self.act1 = nn.ReLU() 122 | 123 | self.conv2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1) 124 | self.act2 = nn.ReLU() 125 | 126 | self.conv3 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1) 127 | self.act3 = nn.ReLU() 128 | 129 | self.conv4 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1) 130 | self.act4 = nn.ReLU() 131 | 132 | self.output = nn.Conv2d(feature_size, num_anchors*num_classes, kernel_size=3, padding=1) 133 | self.output_act = nn.Sigmoid() 134 | 135 | def forward(self, x): 136 | 137 | out = self.conv1(x) 138 | out = self.act1(out) 139 | 140 | out = self.conv2(out) 141 | out = self.act2(out) 142 | 143 | out = self.conv3(out) 144 | out = self.act3(out) 145 | 146 | out = self.conv4(out) 147 | out = self.act4(out) 148 | 149 | out = self.output(out) 150 | out = self.output_act(out) 151 | 152 | # out is B x C x W x H, with C = n_classes + n_anchors 153 | out1 = out.permute(0, 2, 3, 1) 154 | 155 | batch_size, width, height, channels = out1.shape 156 | 157 | out2 = out1.view(batch_size, width, height, self.num_anchors, self.num_classes) 158 | 159 | return out2.contiguous().view(x.shape[0], -1, self.num_classes) 160 | 161 | class ResNet(nn.Module): 162 | 163 | def __init__(self, num_classes, block, layers): 164 | self.inplanes = 64 165 | super(ResNet, self).__init__() 166 | self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False) 167 | self.bn1 = nn.BatchNorm2d(64) 168 | self.relu = nn.ReLU(inplace=True) 169 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 170 | self.layer1 = self._make_layer(block, 64, layers[0]) 171 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2) 172 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2) 173 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2) 174 | 175 | if block == BasicBlock: 176 | fpn_sizes = [self.layer2[layers[1]-1].conv2.out_channels, self.layer3[layers[2]-1].conv2.out_channels, self.layer4[layers[3]-1].conv2.out_channels] 177 | elif block == Bottleneck: 178 | fpn_sizes = [self.layer2[layers[1]-1].conv3.out_channels, self.layer3[layers[2]-1].conv3.out_channels, self.layer4[layers[3]-1].conv3.out_channels] 179 | 180 | self.fpn = PyramidFeatures(fpn_sizes[0], fpn_sizes[1], fpn_sizes[2]) 181 | 182 | self.regressionModel = RegressionModel(256) 183 | self.classificationModel = ClassificationModel(256, num_classes=num_classes) 184 | 185 | self.anchors = Anchors() 186 | 187 | self.regressBoxes = BBoxTransform() 188 | 189 | self.clipBoxes = ClipBoxes() 190 | 191 | self.focalLoss = losses.FocalLoss() 192 | 193 | for m in 
self.modules(): 194 | if isinstance(m, nn.Conv2d): 195 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 196 | m.weight.data.normal_(0, math.sqrt(2. / n)) 197 | elif isinstance(m, nn.BatchNorm2d): 198 | m.weight.data.fill_(1) 199 | m.bias.data.zero_() 200 | 201 | prior = 0.01 202 | 203 | self.classificationModel.output.weight.data.fill_(0) 204 | self.classificationModel.output.bias.data.fill_(-math.log((1.0-prior)/prior)) 205 | 206 | self.regressionModel.output.weight.data.fill_(0) 207 | self.regressionModel.output.bias.data.fill_(0) 208 | 209 | self.freeze_bn() 210 | 211 | def _make_layer(self, block, planes, blocks, stride=1): 212 | downsample = None 213 | if stride != 1 or self.inplanes != planes * block.expansion: 214 | downsample = nn.Sequential( 215 | nn.Conv2d(self.inplanes, planes * block.expansion, 216 | kernel_size=1, stride=stride, bias=False), 217 | nn.BatchNorm2d(planes * block.expansion), 218 | ) 219 | 220 | layers = [] 221 | layers.append(block(self.inplanes, planes, stride, downsample)) 222 | self.inplanes = planes * block.expansion 223 | for i in range(1, blocks): 224 | layers.append(block(self.inplanes, planes)) 225 | 226 | return nn.Sequential(*layers) 227 | 228 | def freeze_bn(self): 229 | '''Freeze BatchNorm layers.''' 230 | for layer in self.modules(): 231 | if isinstance(layer, nn.BatchNorm2d): 232 | layer.eval() 233 | 234 | def forward(self, inputs): 235 | 236 | if self.training: 237 | img_batch, annotations = inputs 238 | else: 239 | img_batch = inputs 240 | 241 | x = self.conv1(img_batch) 242 | x = self.bn1(x) 243 | x = self.relu(x) 244 | x = self.maxpool(x) 245 | 246 | x1 = self.layer1(x) 247 | x2 = self.layer2(x1) 248 | x3 = self.layer3(x2) 249 | x4 = self.layer4(x3) 250 | 251 | features = self.fpn([x2, x3, x4]) 252 | 253 | regression = torch.cat([self.regressionModel(feature) for feature in features], dim=1) 254 | 255 | classification = torch.cat([self.classificationModel(feature) for feature in features], dim=1) 256 | 257 | anchors = self.anchors(img_batch) 258 | 259 | if self.training: 260 | return self.focalLoss(classification, regression, anchors, annotations) 261 | else: 262 | transformed_anchors = self.regressBoxes(anchors, regression) 263 | transformed_anchors = self.clipBoxes(transformed_anchors, img_batch) 264 | 265 | scores = torch.max(classification, dim=2, keepdim=True)[0] 266 | 267 | scores_over_thresh = (scores>0.05)[0, :, 0] 268 | 269 | if scores_over_thresh.sum() == 0: 270 | # no boxes to NMS, just return 271 | return [torch.zeros(0), torch.zeros(0), torch.zeros(0, 4)] 272 | 273 | classification = classification[:, scores_over_thresh, :] 274 | transformed_anchors = transformed_anchors[:, scores_over_thresh, :] 275 | scores = scores[:, scores_over_thresh, :] 276 | 277 | anchors_nms_idx = nms(torch.cat([transformed_anchors, scores], dim=2)[0, :, :], 0.5) 278 | 279 | nms_scores, nms_class = classification[0, anchors_nms_idx, :].max(dim=1) 280 | 281 | return [nms_scores, nms_class, transformed_anchors[0, anchors_nms_idx, :]] 282 | 283 | 284 | 285 | def resnet18(num_classes, pretrained=False, **kwargs): 286 | """Constructs a ResNet-18 model. 
287 | Args: 288 | pretrained (bool): If True, returns a model pre-trained on ImageNet 289 | """ 290 | model = ResNet(num_classes, BasicBlock, [2, 2, 2, 2], **kwargs) 291 | if pretrained: 292 | model.load_state_dict(model_zoo.load_url(model_urls['resnet18'], model_dir='.'), strict=False) 293 | return model 294 | 295 | 296 | def resnet34(num_classes, pretrained=False, **kwargs): 297 | """Constructs a ResNet-34 model. 298 | Args: 299 | pretrained (bool): If True, returns a model pre-trained on ImageNet 300 | """ 301 | model = ResNet(num_classes, BasicBlock, [3, 4, 6, 3], **kwargs) 302 | if pretrained: 303 | model.load_state_dict(model_zoo.load_url(model_urls['resnet34'], model_dir='.'), strict=False) 304 | return model 305 | 306 | 307 | def resnet50(num_classes, pretrained=False, **kwargs): 308 | """Constructs a ResNet-50 model. 309 | Args: 310 | pretrained (bool): If True, returns a model pre-trained on ImageNet 311 | """ 312 | model = ResNet(num_classes, Bottleneck, [3, 4, 6, 3], **kwargs) 313 | if pretrained: 314 | model.load_state_dict(model_zoo.load_url(model_urls['resnet50'], model_dir='.'), strict=False) 315 | return model 316 | 317 | def resnet101(num_classes, pretrained=False, **kwargs): 318 | """Constructs a ResNet-101 model. 319 | Args: 320 | pretrained (bool): If True, returns a model pre-trained on ImageNet 321 | """ 322 | model = ResNet(num_classes, Bottleneck, [3, 4, 23, 3], **kwargs) 323 | if pretrained: 324 | model.load_state_dict(model_zoo.load_url(model_urls['resnet101'], model_dir='.'), strict=False) 325 | return model 326 | 327 | 328 | def resnet152(num_classes, pretrained=False, **kwargs): 329 | """Constructs a ResNet-152 model. 330 | Args: 331 | pretrained (bool): If True, returns a model pre-trained on ImageNet 332 | """ 333 | model = ResNet(num_classes, Bottleneck, [3, 8, 36, 3], **kwargs) 334 | if pretrained: 335 | model.load_state_dict(model_zoo.load_url(model_urls['resnet152'], model_dir='.'), strict=False) 336 | return model -------------------------------------------------------------------------------- /dataloader.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import sys 3 | import os 4 | import torch 5 | import numpy as np 6 | import random 7 | import csv 8 | 9 | from torch.utils.data import Dataset, DataLoader 10 | from torchvision import transforms, utils 11 | from torch.utils.data.sampler import Sampler 12 | 13 | from pycocotools.coco import COCO 14 | 15 | import skimage.io 16 | import skimage.transform 17 | import skimage.color 18 | import skimage 19 | 20 | from PIL import Image 21 | 22 | 23 | class CocoDataset(Dataset): 24 | """Coco dataset.""" 25 | 26 | def __init__(self, root_dir, set_name='train2017', transform=None): 27 | """ 28 | Args: 29 | root_dir (string): COCO directory. 30 | transform (callable, optional): Optional transform to be applied 31 | on a sample. 
32 | """ 33 | self.root_dir = root_dir 34 | self.set_name = set_name 35 | self.transform = transform 36 | 37 | self.coco = COCO(os.path.join(self.root_dir, 'annotations', 'instances_' + self.set_name + '.json')) 38 | self.image_ids = self.coco.getImgIds() 39 | 40 | self.load_classes() 41 | 42 | def load_classes(self): 43 | # load class names (name -> label) 44 | categories = self.coco.loadCats(self.coco.getCatIds()) 45 | categories.sort(key=lambda x: x['id']) 46 | 47 | self.classes = {} 48 | self.coco_labels = {} 49 | self.coco_labels_inverse = {} 50 | for c in categories: 51 | self.coco_labels[len(self.classes)] = c['id'] 52 | self.coco_labels_inverse[c['id']] = len(self.classes) 53 | self.classes[c['name']] = len(self.classes) 54 | 55 | # also load the reverse (label -> name) 56 | self.labels = {} 57 | for key, value in self.classes.items(): 58 | self.labels[value] = key 59 | 60 | def __len__(self): 61 | return len(self.image_ids) 62 | 63 | def __getitem__(self, idx): 64 | 65 | img = self.load_image(idx) 66 | annot = self.load_annotations(idx) 67 | sample = {'img': img, 'annot': annot} 68 | if self.transform: 69 | sample = self.transform(sample) 70 | 71 | return sample 72 | 73 | def load_image(self, image_index): 74 | image_info = self.coco.loadImgs(self.image_ids[image_index])[0] 75 | path = os.path.join(self.root_dir, 'images', self.set_name, image_info['file_name']) 76 | img = skimage.io.imread(path) 77 | 78 | if len(img.shape) == 2: 79 | img = skimage.color.gray2rgb(img) 80 | 81 | return img.astype(np.float32)/255.0 82 | 83 | def load_annotations(self, image_index): 84 | # get ground truth annotations 85 | annotations_ids = self.coco.getAnnIds(imgIds=self.image_ids[image_index], iscrowd=False) 86 | annotations = np.zeros((0, 5)) 87 | 88 | # some images appear to miss annotations (like image with id 257034) 89 | if len(annotations_ids) == 0: 90 | return annotations 91 | 92 | # parse annotations 93 | coco_annotations = self.coco.loadAnns(annotations_ids) 94 | for idx, a in enumerate(coco_annotations): 95 | 96 | # some annotations have basically no width / height, skip them 97 | if a['bbox'][2] < 1 or a['bbox'][3] < 1: 98 | continue 99 | 100 | annotation = np.zeros((1, 5)) 101 | annotation[0, :4] = a['bbox'] 102 | annotation[0, 4] = self.coco_label_to_label(a['category_id']) 103 | annotations = np.append(annotations, annotation, axis=0) 104 | 105 | # transform from [x, y, w, h] to [x1, y1, x2, y2] 106 | annotations[:, 2] = annotations[:, 0] + annotations[:, 2] 107 | annotations[:, 3] = annotations[:, 1] + annotations[:, 3] 108 | 109 | return annotations 110 | 111 | def coco_label_to_label(self, coco_label): 112 | return self.coco_labels_inverse[coco_label] 113 | 114 | 115 | def label_to_coco_label(self, label): 116 | return self.coco_labels[label] 117 | 118 | def image_aspect_ratio(self, image_index): 119 | image = self.coco.loadImgs(self.image_ids[image_index])[0] 120 | return float(image['width']) / float(image['height']) 121 | 122 | def num_classes(self): 123 | return 80 124 | 125 | 126 | class CSVDataset(Dataset): 127 | """CSV dataset.""" 128 | 129 | def __init__(self, train_file, class_list, transform=None): 130 | """ 131 | Args: 132 | train_file (string): CSV file with training annotations 133 | annotations (string): CSV file with class list 134 | test_file (string, optional): CSV file with testing annotations 135 | """ 136 | self.train_file = train_file 137 | self.class_list = class_list 138 | self.transform = transform 139 | 140 | # parse the provided class file 141 | try: 142 | 
with self._open_for_csv(self.class_list) as file: 143 | self.classes = self.load_classes(csv.reader(file, delimiter=',')) 144 | except ValueError as e: 145 | raise_from(ValueError('invalid CSV class file: {}: {}'.format(self.class_list, e)), None) 146 | 147 | self.labels = {} 148 | for key, value in self.classes.items(): 149 | self.labels[value] = key 150 | 151 | # csv with img_path, x1, y1, x2, y2, class_name 152 | try: 153 | with self._open_for_csv(self.train_file) as file: 154 | self.image_data = self._read_annotations(csv.reader(file, delimiter=','), self.classes) 155 | except ValueError as e: 156 | raise_from(ValueError('invalid CSV annotations file: {}: {}'.format(self.train_file, e)), None) 157 | self.image_names = list(self.image_data.keys()) 158 | 159 | def _parse(self, value, function, fmt): 160 | """ 161 | Parse a string into a value, and format a nice ValueError if it fails. 162 | Returns `function(value)`. 163 | Any `ValueError` raised is catched and a new `ValueError` is raised 164 | with message `fmt.format(e)`, where `e` is the caught `ValueError`. 165 | """ 166 | try: 167 | return function(value) 168 | except ValueError as e: 169 | raise_from(ValueError(fmt.format(e)), None) 170 | 171 | def _open_for_csv(self, path): 172 | """ 173 | Open a file with flags suitable for csv.reader. 174 | This is different for python2 it means with mode 'rb', 175 | for python3 this means 'r' with "universal newlines". 176 | """ 177 | if sys.version_info[0] < 3: 178 | return open(path, 'rb') 179 | else: 180 | return open(path, 'r', newline='') 181 | 182 | 183 | def load_classes(self, csv_reader): 184 | result = {} 185 | 186 | for line, row in enumerate(csv_reader): 187 | line += 1 188 | 189 | try: 190 | class_name, class_id = row 191 | except ValueError: 192 | raise_from(ValueError('line {}: format should be \'class_name,class_id\''.format(line)), None) 193 | class_id = self._parse(class_id, int, 'line {}: malformed class ID: {{}}'.format(line)) 194 | 195 | if class_name in result: 196 | raise ValueError('line {}: duplicate class name: \'{}\''.format(line, class_name)) 197 | result[class_name] = class_id 198 | return result 199 | 200 | 201 | def __len__(self): 202 | return len(self.image_names) 203 | 204 | def __getitem__(self, idx): 205 | 206 | img = self.load_image(idx) 207 | annot = self.load_annotations(idx) 208 | sample = {'img': img, 'annot': annot} 209 | if self.transform: 210 | sample = self.transform(sample) 211 | 212 | return sample 213 | 214 | def load_image(self, image_index): 215 | img = skimage.io.imread(self.image_names[image_index]) 216 | 217 | if len(img.shape) == 2: 218 | img = skimage.color.gray2rgb(img) 219 | 220 | return img.astype(np.float32)/255.0 221 | 222 | def load_annotations(self, image_index): 223 | # get ground truth annotations 224 | annotation_list = self.image_data[self.image_names[image_index]] 225 | annotations = np.zeros((0, 5)) 226 | 227 | # some images appear to miss annotations (like image with id 257034) 228 | if len(annotation_list) == 0: 229 | return annotations 230 | 231 | # parse annotations 232 | for idx, a in enumerate(annotation_list): 233 | # some annotations have basically no width / height, skip them 234 | x1 = a['x1'] 235 | x2 = a['x2'] 236 | y1 = a['y1'] 237 | y2 = a['y2'] 238 | 239 | if (x2-x1) < 1 or (y2-y1) < 1: 240 | continue 241 | 242 | annotation = np.zeros((1, 5)) 243 | 244 | annotation[0, 0] = x1 245 | annotation[0, 1] = y1 246 | annotation[0, 2] = x2 247 | annotation[0, 3] = y2 248 | 249 | annotation[0, 4] = 
self.name_to_label(a['class']) 250 | annotations = np.append(annotations, annotation, axis=0) 251 | 252 | return annotations 253 | 254 | def _read_annotations(self, csv_reader, classes): 255 | result = {} 256 | for line, row in enumerate(csv_reader): 257 | line += 1 258 | 259 | try: 260 | img_file, x1, y1, x2, y2, class_name = row[:6] 261 | except ValueError: 262 | raise_from(ValueError('line {}: format should be \'img_file,x1,y1,x2,y2,class_name\' or \'img_file,,,,,\''.format(line)), None) 263 | 264 | if img_file not in result: 265 | result[img_file] = [] 266 | 267 | # If a row contains only an image path, it's an image without annotations. 268 | if (x1, y1, x2, y2, class_name) == ('', '', '', '', ''): 269 | continue 270 | 271 | x1 = self._parse(x1, int, 'line {}: malformed x1: {{}}'.format(line)) 272 | y1 = self._parse(y1, int, 'line {}: malformed y1: {{}}'.format(line)) 273 | x2 = self._parse(x2, int, 'line {}: malformed x2: {{}}'.format(line)) 274 | y2 = self._parse(y2, int, 'line {}: malformed y2: {{}}'.format(line)) 275 | 276 | # Check that the bounding box is valid. 277 | if x2 <= x1: 278 | raise ValueError('line {}: x2 ({}) must be higher than x1 ({})'.format(line, x2, x1)) 279 | if y2 <= y1: 280 | raise ValueError('line {}: y2 ({}) must be higher than y1 ({})'.format(line, y2, y1)) 281 | 282 | # check if the current class name is correctly present 283 | if class_name not in classes: 284 | raise ValueError('line {}: unknown class name: \'{}\' (classes: {})'.format(line, class_name, classes)) 285 | 286 | result[img_file].append({'x1': x1, 'x2': x2, 'y1': y1, 'y2': y2, 'class': class_name}) 287 | return result 288 | 289 | def name_to_label(self, name): 290 | return self.classes[name] 291 | 292 | def label_to_name(self, label): 293 | return self.labels[label] 294 | 295 | def num_classes(self): 296 | return max(self.classes.values()) + 1 297 | 298 | def image_aspect_ratio(self, image_index): 299 | image = Image.open(self.image_names[image_index]) 300 | return float(image.width) / float(image.height) 301 | 302 | 303 | def collater(data): 304 | 305 | imgs = [s['img'] for s in data] 306 | annots = [s['annot'] for s in data] 307 | scales = [s['scale'] for s in data] 308 | 309 | widths = [int(s.shape[0]) for s in imgs] 310 | heights = [int(s.shape[1]) for s in imgs] 311 | batch_size = len(imgs) 312 | 313 | max_width = np.array(widths).max() 314 | max_height = np.array(heights).max() 315 | 316 | padded_imgs = torch.zeros(batch_size, max_width, max_height, 3) 317 | 318 | for i in range(batch_size): 319 | img = imgs[i] 320 | padded_imgs[i, :int(img.shape[0]), :int(img.shape[1]), :] = img 321 | 322 | max_num_annots = max(annot.shape[0] for annot in annots) 323 | 324 | if max_num_annots > 0: 325 | 326 | annot_padded = torch.ones((len(annots), max_num_annots, 5)) * -1 327 | 328 | if max_num_annots > 0: 329 | for idx, annot in enumerate(annots): 330 | #print(annot.shape) 331 | if annot.shape[0] > 0: 332 | annot_padded[idx, :annot.shape[0], :] = annot 333 | else: 334 | annot_padded = torch.ones((len(annots), 1, 5)) * -1 335 | 336 | 337 | padded_imgs = padded_imgs.permute(0, 3, 1, 2) 338 | 339 | return {'img': padded_imgs, 'annot': annot_padded, 'scale': scales} 340 | 341 | class Resizer(object): 342 | """Convert ndarrays in sample to Tensors.""" 343 | 344 | def __call__(self, sample, min_side=608, max_side=1024): 345 | image, annots = sample['img'], sample['annot'] 346 | 347 | rows, cols, cns = image.shape 348 | 349 | smallest_side = min(rows, cols) 350 | 351 | # rescale the image so the smallest 
side is min_side 352 | scale = min_side / smallest_side 353 | 354 | # check if the largest side is now greater than max_side, which can happen 355 | # when images have a large aspect ratio 356 | largest_side = max(rows, cols) 357 | 358 | if largest_side * scale > max_side: 359 | scale = max_side / largest_side 360 | 361 | # resize the image with the computed scale 362 | image = skimage.transform.resize(image, (int(round(rows*scale)), int(round((cols*scale))))) 363 | rows, cols, cns = image.shape 364 | 365 | pad_w = 32 - rows%32 366 | pad_h = 32 - cols%32 367 | 368 | new_image = np.zeros((rows + pad_w, cols + pad_h, cns)).astype(np.float32) 369 | new_image[:rows, :cols, :] = image.astype(np.float32) 370 | 371 | annots[:, :4] *= scale 372 | 373 | return {'img': torch.from_numpy(new_image), 'annot': torch.from_numpy(annots), 'scale': scale} 374 | 375 | 376 | class Augmenter(object): 377 | """Convert ndarrays in sample to Tensors.""" 378 | 379 | def __call__(self, sample, flip_x=0.5): 380 | 381 | if np.random.rand() < flip_x: 382 | image, annots = sample['img'], sample['annot'] 383 | image = image[:, ::-1, :] 384 | 385 | rows, cols, channels = image.shape 386 | 387 | x1 = annots[:, 0].copy() 388 | x2 = annots[:, 2].copy() 389 | 390 | x_tmp = x1.copy() 391 | 392 | annots[:, 0] = cols - x2 393 | annots[:, 2] = cols - x_tmp 394 | 395 | sample = {'img': image, 'annot': annots} 396 | 397 | return sample 398 | 399 | 400 | class Normalizer(object): 401 | 402 | def __init__(self): 403 | self.mean = np.array([[[0.485, 0.456, 0.406]]]) 404 | self.std = np.array([[[0.229, 0.224, 0.225]]]) 405 | 406 | def __call__(self, sample): 407 | 408 | image, annots = sample['img'], sample['annot'] 409 | 410 | return {'img':((image.astype(np.float32)-self.mean)/self.std), 'annot': annots} 411 | 412 | class UnNormalizer(object): 413 | def __init__(self, mean=None, std=None): 414 | if mean == None: 415 | self.mean = [0.485, 0.456, 0.406] 416 | else: 417 | self.mean = mean 418 | if std == None: 419 | self.std = [0.229, 0.224, 0.225] 420 | else: 421 | self.std = std 422 | 423 | def __call__(self, tensor): 424 | """ 425 | Args: 426 | tensor (Tensor): Tensor image of size (C, H, W) to be normalized. 427 | Returns: 428 | Tensor: Normalized image. 429 | """ 430 | for t, m, s in zip(tensor, self.mean, self.std): 431 | t.mul_(s).add_(m) 432 | return tensor 433 | 434 | 435 | class AspectRatioBasedSampler(Sampler): 436 | 437 | def __init__(self, data_source, batch_size, drop_last): 438 | self.data_source = data_source 439 | self.batch_size = batch_size 440 | self.drop_last = drop_last 441 | self.groups = self.group_images() 442 | 443 | def __iter__(self): 444 | random.shuffle(self.groups) 445 | for group in self.groups: 446 | yield group 447 | 448 | def __len__(self): 449 | if self.drop_last: 450 | return len(self.sampler) // self.batch_size 451 | else: 452 | return (len(self.sampler) + self.batch_size - 1) // self.batch_size 453 | 454 | def group_images(self): 455 | # determine the order of the images 456 | order = list(range(len(self.data_source))) 457 | order.sort(key=lambda x: self.data_source.image_aspect_ratio(x)) 458 | 459 | # divide into groups, one group = one batch 460 | return [[order[x % len(order)] for x in range(i, i + self.batch_size)] for i in range(0, len(order), self.batch_size)] 461 | --------------------------------------------------------------------------------
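The Resizer transform in dataloader.py scales the shorter image side to min_side (capped so the longer side stays within max_side) and then zero-pads both dimensions up to the next multiple of 32 for the FPN strides. A small worked example with made-up dimensions follows; note that pad = 32 - dim % 32 adds a full extra 32 pixels when a side is already divisible by 32.

# worked example of the Resizer scaling above (made-up input size)
rows, cols = 500, 800                       # original H x W
min_side, max_side = 608, 1024

scale = min_side / min(rows, cols)          # 608 / 500 = 1.216
if max(rows, cols) * scale > max_side:      # 800 * 1.216 = 972.8 <= 1024 -> no cap
    scale = max_side / max(rows, cols)

new_rows = int(round(rows * scale))         # 608
new_cols = int(round(cols * scale))         # 973
pad_w = 32 - new_rows % 32                  # 32 (608 is already a multiple of 32)
pad_h = 32 - new_cols % 32                  # 19
print(scale, (new_rows + pad_w, new_cols + pad_h))   # 1.216, (640, 992)
# ground-truth boxes are multiplied by the same scale before being returned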
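Putting the pieces together, the sketch below shows roughly how the dataset, sampler, collater, model, and evaluator are wired up. It is illustrative only and not part of the repository: the CSV paths are placeholders, the training loop itself is omitted (see train.py), and a CUDA device is assumed, as in csv_eval.

# usage_sketch.py -- illustrative only; file names below are placeholders
import torch
from torch.utils.data import DataLoader
from torchvision import transforms

import csv_eval
import model
from dataloader import (AspectRatioBasedSampler, Augmenter, CSVDataset,
                        Normalizer, Resizer, collater)

dataset_train = CSVDataset(train_file='annotations_train.csv', class_list='classes.csv',
                           transform=transforms.Compose([Normalizer(), Augmenter(), Resizer()]))
dataset_val = CSVDataset(train_file='annotations_val.csv', class_list='classes.csv',
                         transform=transforms.Compose([Normalizer(), Resizer()]))

# Batches group images of similar aspect ratio; collater pads them to a common size.
sampler = AspectRatioBasedSampler(dataset_train, batch_size=2, drop_last=False)
dataloader_train = DataLoader(dataset_train, num_workers=3, collate_fn=collater, batch_sampler=sampler)

# RetinaNet with a ResNet-50 backbone initialised from ImageNet weights.
retinanet = model.resnet50(num_classes=dataset_train.num_classes(), pretrained=True).cuda()

# ... training loop over dataloader_train goes here (see train.py) ...

# Single-image inference: in eval mode the forward pass returns
# (scores, class indices, boxes) after score thresholding and NMS.
retinanet.eval()
with torch.no_grad():
    sample = dataset_val[0]
    scores, labels, boxes = retinanet(sample['img'].permute(2, 0, 1).cuda().float().unsqueeze(dim=0))

# Class-wise average precision on the validation set.
average_precisions = csv_eval.evaluate(dataset_val, retinanet, iou_threshold=0.5)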