├── README.md ├── __init__ ├── config ├── cifar100_config.json └── coco_config.json ├── dataloaders.py ├── lib ├── README.md ├── build │ └── temp.linux-x86_64-3.6 │ │ └── nms │ │ ├── gpu_nms.o │ │ └── nms_kernel.o ├── nms │ ├── __init__.py │ ├── __pycache__ │ │ └── __init__.cpython-36.pyc │ ├── gpu_nms.cpp │ ├── gpu_nms.cpython-36m-x86_64-linux-gnu.so │ ├── gpu_nms.hpp │ ├── gpu_nms.pyx │ ├── nms_kernel.cu │ ├── nums_py.py │ ├── nums_py1.c │ ├── nums_py1.pyx │ ├── nums_py2.c │ ├── nums_py2.pyx │ ├── setup1.py │ └── setup2.py ├── setup3.py └── test_num.py ├── losses.py ├── model ├── BottleneckBlock.py ├── __init__.py ├── anchors.py ├── attentionConv2d.py ├── gpu_nms.pyx ├── losses.py ├── retinanet.py └── wideresnet.py ├── pytorch-retinanet ├── .gitignore ├── LICENSE ├── README.md ├── anchors.py ├── attentionConv2d.py ├── coco_eval.py ├── csv_eval.py ├── dataloader.py ├── images │ ├── 1.jpg │ ├── 3.jpg │ ├── 4.jpg │ ├── 5.jpg │ ├── 6.jpg │ ├── 7.jpg │ └── 8.jpg ├── lib │ ├── README.md │ ├── nms │ │ ├── __init__.py │ │ ├── gpu_nms.cpp │ │ ├── gpu_nms.hpp │ │ ├── gpu_nms.pyx │ │ ├── nms_kernel.cu │ │ ├── nums_py.py │ │ ├── nums_py1.c │ │ ├── nums_py1.pyx │ │ ├── nums_py2.c │ │ ├── nums_py2.pyx │ │ ├── setup1.py │ │ └── setup2.py │ ├── setup3.py │ └── test_num.py ├── losses.py ├── model.py ├── oid_dataset.py ├── train.py ├── utils.py └── visualize.py ├── train.py └── utils └── utils.py /README.md: -------------------------------------------------------------------------------- 1 | # pytorch-attention-augmented-convolution 2 | A pytorch implementation of https://arxiv.org/abs/1904.09925 3 | -------------------------------------------------------------------------------- /__init__: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sebastiani/pytorch-attention-augmented-convolution/4f0eb899714f22a88a1b6a602ee2dfb20f59a4b6/__init__ -------------------------------------------------------------------------------- /config/cifar100_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "cifar100_net", 3 | "model": "AttentionWideResNet", 4 | "checkpoint_dir": "wgts/attconv_cifar100/", 5 | "tb_logdir": "cifar100net_logs", 6 | "batch_size": 16, 7 | "epochs": 500, 8 | "lr": 0.01, 9 | "momentum": 0.9, 10 | "log_interval": 200 11 | } 12 | -------------------------------------------------------------------------------- /config/coco_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "coco_net", 3 | "model": "AttentionRetinaNet", 4 | "checkpoint_dir": "wgts/attnconv_coco/", 5 | "tb_logdir": "coconet_logs", 6 | "batch_size": 16, 7 | "epochs": 500, 8 | "lr": 0.01, 9 | "momentum": 0.9, 10 | "log_interval": 200 11 | } -------------------------------------------------------------------------------- /dataloaders.py: -------------------------------------------------------------------------------- 1 | import torch.utils.data as data 2 | from PIL import Image 3 | import os 4 | import os.path 5 | import numpy as np 6 | 7 | 8 | class CocoDetection(data.Dataset): 9 | """`MS Coco Detection `_ Dataset. 10 | 11 | Args: 12 | root (string): Root directory where images are downloaded to. 13 | annFile (string): Path to json annotation file. 14 | transform (callable, optional): A function/transform that takes in an PIL image 15 | and returns a transformed version. 
E.g, ``transforms.ToTensor`` 16 | target_transform (callable, optional): A function/transform that takes in the 17 | target and transforms it. 18 | """ 19 | 20 | def __init__(self, root, annFile, transform=None, img_and_target_transform=None, target_transform=None): 21 | from pycocotools.coco import COCO 22 | self.root = root 23 | self.coco = COCO(annFile) 24 | self.ids = list(self.coco.imgs.keys()) 25 | self.transform = transform 26 | self.target_transform = target_transform 27 | self.img_and_target_transform=img_and_target_transform 28 | 29 | 30 | def __getitem__(self, index): 31 | """ 32 | Args: 33 | index (int): Index 34 | 35 | Returns: 36 | tuple: Tuple (image, target). target is the object returned by ``coco.loadAnns``. 37 | """ 38 | coco = self.coco 39 | img_id = self.ids[index] 40 | ann_ids = coco.getAnnIds(imgIds=img_id) 41 | target = coco.loadAnns(ann_ids) 42 | 43 | path = coco.loadImgs(img_id)[0]['file_name'] 44 | 45 | img = Image.open(os.path.join(self.root, path)).convert('RGB') 46 | print(np.array(img).shape) 47 | 48 | if self.img_and_target_transform is not None: 49 | img, target, _ = self.img_and_target_transform(img, target) 50 | 51 | if self.transform is not None: 52 | img = self.transform(img) 53 | 54 | return img, target 55 | 56 | def __len__(self): 57 | return len(self.ids) 58 | 59 | def __repr__(self): 60 | fmt_str = 'Dataset ' + self.__class__.__name__ + '\n' 61 | fmt_str += ' Number of datapoints: {}\n'.format(self.__len__()) 62 | fmt_str += ' Root Location: {}\n'.format(self.root) 63 | tmp = ' Transforms (if any): ' 64 | fmt_str += '{0}{1}\n'.format(tmp, self.transform.__repr__().replace('\n', '\n' + ' ' * len(tmp))) 65 | tmp = ' Target Transforms (if any): ' 66 | fmt_str += '{0}{1}'.format(tmp, self.target_transform.__repr__().replace('\n', '\n' + ' ' * len(tmp))) 67 | return fmt_str -------------------------------------------------------------------------------- /lib/README.md: -------------------------------------------------------------------------------- 1 | # NMS 2 | the comparison of nms in speed 3 | 4 | method 1: 5 | thresh=0.7, time wastes:0.0287 6 | thresh=0.8, time wastes:0.1057 7 | thresh=0.9, time wastes:0.4204 8 | 9 | method 2: 10 | thresh=0.7, time wastes:0.0272 11 | thresh=0.8, time wastes:0.1038 12 | thresh=0.9, time wastes:0.4184 13 | 14 | method 3: 15 | thresh=0.7, time wastes:0.0019 16 | thresh=0.8, time wastes:0.0028 17 | thresh=0.9, time wastes:0.0036 18 | 19 | method 4: 20 | thresh=0.7, time wastes:0.0120 21 | thresh=0.8, time wastes:0.0063 22 | thresh=0.9, time wastes:0.0071 23 | 24 | Reference: 25 | py-faster-rcnn: https://github.com/rbgirshick/py-faster-rcnn/tree/master/lib/nms 26 | -------------------------------------------------------------------------------- /lib/build/temp.linux-x86_64-3.6/nms/gpu_nms.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sebastiani/pytorch-attention-augmented-convolution/4f0eb899714f22a88a1b6a602ee2dfb20f59a4b6/lib/build/temp.linux-x86_64-3.6/nms/gpu_nms.o -------------------------------------------------------------------------------- /lib/build/temp.linux-x86_64-3.6/nms/nms_kernel.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sebastiani/pytorch-attention-augmented-convolution/4f0eb899714f22a88a1b6a602ee2dfb20f59a4b6/lib/build/temp.linux-x86_64-3.6/nms/nms_kernel.o -------------------------------------------------------------------------------- /lib/nms/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/sebastiani/pytorch-attention-augmented-convolution/4f0eb899714f22a88a1b6a602ee2dfb20f59a4b6/lib/nms/__init__.py -------------------------------------------------------------------------------- /lib/nms/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sebastiani/pytorch-attention-augmented-convolution/4f0eb899714f22a88a1b6a602ee2dfb20f59a4b6/lib/nms/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /lib/nms/gpu_nms.cpython-36m-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sebastiani/pytorch-attention-augmented-convolution/4f0eb899714f22a88a1b6a602ee2dfb20f59a4b6/lib/nms/gpu_nms.cpython-36m-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /lib/nms/gpu_nms.hpp: -------------------------------------------------------------------------------- 1 | void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, 2 | int boxes_dim, float nms_overlap_thresh, int device_id); 3 | -------------------------------------------------------------------------------- /lib/nms/gpu_nms.pyx: -------------------------------------------------------------------------------- 1 | 2 | # -------------------------------------------------------- 3 | # Faster R-CNN 4 | # Copyright (c) 2015 Microsoft 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # Written by Ross Girshick 7 | # -------------------------------------------------------- 8 | 9 | 10 | import numpy as np 11 | cimport numpy as np 12 | 13 | assert sizeof(int) == sizeof(np.int32_t) 14 | 15 | cdef extern from "gpu_nms.hpp": 16 | void _nms(np.int32_t*, int*, np.float32_t*, int, int, float, int) 17 | 18 | def gpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh, 19 | np.int32_t device_id=0): 20 | dets = dets.numpy() 21 | cdef int boxes_num = dets.shape[0] 22 | cdef int boxes_dim = dets.shape[1] 23 | cdef int num_out 24 | cdef np.ndarray[np.int32_t, ndim=1] \ 25 | keep = np.zeros(boxes_num, dtype=np.int32) 26 | cdef np.ndarray[np.float32_t, ndim=1] \ 27 | scores = dets[:, 4] 28 | cdef np.ndarray[np.int_t, ndim=1] \ 29 | order = scores.argsort()[::-1] 30 | cdef np.ndarray[np.float32_t, ndim=2] \ 31 | sorted_dets = dets[order, :] 32 | _nms(&keep[0], &num_out, &sorted_dets[0, 0], boxes_num, boxes_dim, thresh, device_id) 33 | keep = keep[:num_out] 34 | return list(order[keep]) 35 | 36 | 37 | -------------------------------------------------------------------------------- /lib/nms/nms_kernel.cu: -------------------------------------------------------------------------------- 1 | // ------------------------------------------------------------------ 2 | // Faster R-CNN 3 | // Copyright (c) 2015 Microsoft 4 | // Licensed under The MIT License [see fast-rcnn/LICENSE for details] 5 | // Written by Shaoqing Ren 6 | // ------------------------------------------------------------------ 7 | 8 | 9 | #include "gpu_nms.hpp" 10 | #include 11 | #include 12 | 13 | #define CUDA_CHECK(condition) \ 14 | /* Code block avoids redefinition of cudaError_t error */ \ 15 | do { \ 16 | cudaError_t error = condition; \ 17 | if (error != cudaSuccess) { \ 18 | std::cout << cudaGetErrorString(error) << std::endl; \ 19 | } \ 20 | } while 
(0) 21 | 22 | #define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0)) 23 | int const threadsPerBlock = sizeof(unsigned long long) * 8; 24 | 25 | __device__ inline float devIoU(float const * const a, float const * const b) { 26 | float left = max(a[0], b[0]), right = min(a[2], b[2]); 27 | float top = max(a[1], b[1]), bottom = min(a[3], b[3]); 28 | float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); 29 | float interS = width * height; 30 | float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); 31 | float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1); 32 | return interS / (Sa + Sb - interS); 33 | } 34 | 35 | __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh, 36 | const float *dev_boxes, unsigned long long *dev_mask) { 37 | const int row_start = blockIdx.y; 38 | const int col_start = blockIdx.x; 39 | 40 | // if (row_start > col_start) return; 41 | 42 | const int row_size = 43 | min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); 44 | const int col_size = 45 | min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); 46 | 47 | __shared__ float block_boxes[threadsPerBlock * 5]; 48 | if (threadIdx.x < col_size) { 49 | block_boxes[threadIdx.x * 5 + 0] = 50 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0]; 51 | block_boxes[threadIdx.x * 5 + 1] = 52 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1]; 53 | block_boxes[threadIdx.x * 5 + 2] = 54 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2]; 55 | block_boxes[threadIdx.x * 5 + 3] = 56 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3]; 57 | block_boxes[threadIdx.x * 5 + 4] = 58 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4]; 59 | } 60 | __syncthreads(); 61 | 62 | if (threadIdx.x < row_size) { 63 | const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; 64 | const float *cur_box = dev_boxes + cur_box_idx * 5; 65 | int i = 0; 66 | unsigned long long t = 0; 67 | int start = 0; 68 | if (row_start == col_start) { 69 | start = threadIdx.x + 1; 70 | } 71 | for (i = start; i < col_size; i++) { 72 | if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { 73 | t |= 1ULL << i; 74 | } 75 | } 76 | const int col_blocks = DIVUP(n_boxes, threadsPerBlock); 77 | dev_mask[cur_box_idx * col_blocks + col_start] = t; 78 | } 79 | } 80 | 81 | void _set_device(int device_id) { 82 | int current_device; 83 | CUDA_CHECK(cudaGetDevice(¤t_device)); 84 | if (current_device == device_id) { 85 | return; 86 | } 87 | // The call to cudaSetDevice must come before any calls to Get, which 88 | // may perform initialization using the GPU. 
89 | CUDA_CHECK(cudaSetDevice(device_id)); 90 | } 91 | 92 | void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, 93 | int boxes_dim, float nms_overlap_thresh, int device_id) { 94 | _set_device(device_id); 95 | 96 | float* boxes_dev = NULL; 97 | unsigned long long* mask_dev = NULL; 98 | 99 | const int col_blocks = DIVUP(boxes_num, threadsPerBlock); 100 | 101 | CUDA_CHECK(cudaMalloc(&boxes_dev, 102 | boxes_num * boxes_dim * sizeof(float))); 103 | CUDA_CHECK(cudaMemcpy(boxes_dev, 104 | boxes_host, 105 | boxes_num * boxes_dim * sizeof(float), 106 | cudaMemcpyHostToDevice)); 107 | 108 | CUDA_CHECK(cudaMalloc(&mask_dev, 109 | boxes_num * col_blocks * sizeof(unsigned long long))); 110 | 111 | dim3 blocks(DIVUP(boxes_num, threadsPerBlock), 112 | DIVUP(boxes_num, threadsPerBlock)); 113 | dim3 threads(threadsPerBlock); 114 | nms_kernel<<>>(boxes_num, 115 | nms_overlap_thresh, 116 | boxes_dev, 117 | mask_dev); 118 | 119 | std::vector mask_host(boxes_num * col_blocks); 120 | CUDA_CHECK(cudaMemcpy(&mask_host[0], 121 | mask_dev, 122 | sizeof(unsigned long long) * boxes_num * col_blocks, 123 | cudaMemcpyDeviceToHost)); 124 | 125 | std::vector remv(col_blocks); 126 | memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); 127 | 128 | int num_to_keep = 0; 129 | for (int i = 0; i < boxes_num; i++) { 130 | int nblock = i / threadsPerBlock; 131 | int inblock = i % threadsPerBlock; 132 | 133 | if (!(remv[nblock] & (1ULL << inblock))) { 134 | keep_out[num_to_keep++] = i; 135 | unsigned long long *p = &mask_host[0] + i * col_blocks; 136 | for (int j = nblock; j < col_blocks; j++) { 137 | remv[j] |= p[j]; 138 | } 139 | } 140 | } 141 | *num_out = num_to_keep; 142 | 143 | CUDA_CHECK(cudaFree(boxes_dev)); 144 | CUDA_CHECK(cudaFree(mask_dev)); 145 | } 146 | -------------------------------------------------------------------------------- /lib/nms/nums_py.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon May 7 21:45:37 2018 5 | 6 | @author: lps 7 | """ 8 | import numpy as np 9 | 10 | 11 | boxes=np.array([[100,100,210,210,0.72], 12 | [250,250,420,420,0.8], 13 | [220,220,320,330,0.92], 14 | [100,100,210,210,0.72], 15 | [230,240,325,330,0.81], 16 | [220,230,315,340,0.9]]) 17 | 18 | 19 | def py_cpu_nms(dets, thresh): 20 | # dets:(m,5) thresh:scaler 21 | 22 | x1 = dets[:,0] 23 | y1 = dets[:,1] 24 | x2 = dets[:,2] 25 | y2 = dets[:,3] 26 | 27 | areas = (y2-y1+1) * (x2-x1+1) 28 | scores = dets[:,4] 29 | keep = [] 30 | 31 | index = scores.argsort()[::-1] 32 | 33 | while index.size >0: 34 | 35 | i = index[0] # every time the first is the biggst, and add it directly 36 | keep.append(i) 37 | 38 | x11 = np.maximum(x1[i], x1[index[1:]]) # calculate the points of overlap 39 | y11 = np.maximum(y1[i], y1[index[1:]]) 40 | x22 = np.minimum(x2[i], x2[index[1:]]) 41 | y22 = np.minimum(y2[i], y2[index[1:]]) 42 | 43 | w = np.maximum(0, x22-x11+1) # the weights of overlap 44 | h = np.maximum(0, y22-y11+1) # the height of overlap 45 | 46 | overlaps = w*h 47 | 48 | ious = overlaps / (areas[i]+areas[index[1:]] - overlaps) 49 | 50 | idx = np.where(ious<=thresh)[0] 51 | 52 | index = index[idx+1] # because index start from 1 53 | 54 | return keep 55 | 56 | 57 | import matplotlib.pyplot as plt 58 | def plot_bbox(dets, c='k'): 59 | 60 | x1 = dets[:,0] 61 | y1 = dets[:,1] 62 | x2 = dets[:,2] 63 | y2 = dets[:,3] 64 | 65 | 66 | plt.plot([x1,x2], [y1,y1], c) 67 | plt.plot([x1,x1], [y1,y2], c) 68 | 
plt.plot([x1,x2], [y2,y2], c) 69 | plt.plot([x2,x2], [y1,y2], c) 70 | plt.title("after nms") 71 | 72 | #plot_bbox(boxes,'k') # before nms 73 | # 74 | #keep = py_cpu_nms(boxes, thresh=0.7) 75 | #plot_bbox(boxes[keep], 'r')# after nms 76 | # 77 | 78 | -------------------------------------------------------------------------------- /lib/nms/nums_py1.pyx: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | boxes=np.array([[100,100,210,210,0.72], 5 | [250,250,420,420,0.8], 6 | [220,220,320,330,0.92], 7 | [100,100,210,210,0.72], 8 | [230,240,325,330,0.81], 9 | [220,230,315,340,0.9]]) 10 | 11 | 12 | def py_cpu_nms(dets, thresh): 13 | # dets:(m,5) thresh:scaler 14 | 15 | x1 = dets[:,0] 16 | y1 = dets[:,1] 17 | x2 = dets[:,2] 18 | y2 = dets[:,3] 19 | 20 | areas = (y2-y1+1) * (x2-x1+1) 21 | scores = dets[:,4] 22 | keep = [] 23 | 24 | index = scores.argsort()[::-1] 25 | 26 | j=0 27 | while index.size >0: 28 | 29 | j = j+1 30 | i = index[0] # every time the first is the biggst, and add it directly 31 | keep.append(i) 32 | 33 | x11 = np.maximum(x1[i], x1[index[1:]]) # calculate the points of overlap 34 | y11 = np.maximum(y1[i], y1[index[1:]]) 35 | x22 = np.minimum(x2[i], x2[index[1:]]) 36 | y22 = np.minimum(y2[i], y2[index[1:]]) 37 | 38 | w = np.maximum(0, x22-x11+1) # the weights of overlap 39 | h = np.maximum(0, y22-y11+1) # the height of overlap 40 | 41 | overlaps = w*h 42 | 43 | ious = overlaps / (areas[i]+areas[index[1:]] - overlaps) 44 | 45 | idx = np.where(ious<=thresh)[0] 46 | 47 | index = index[idx+1] # because index starts with 1 48 | 49 | return keep,j 50 | 51 | import matplotlib.pyplot as plt 52 | def plot_bbox(dets, c='k'): 53 | 54 | x1 = dets[:,0] 55 | y1 = dets[:,1] 56 | x2 = dets[:,2] 57 | y2 = dets[:,3] 58 | 59 | plt.plot([x1,x2], [y1,y1], c) 60 | plt.plot([x1,x1], [y1,y2], c) 61 | plt.plot([x1,x2], [y2,y2], c) 62 | plt.plot([x2,x2], [y1,y2], c) 63 | 64 | #plot_bbox(boxes,'k') # before nms 65 | 66 | #keep = py_cpu_nms(boxes, thresh=0.7) 67 | #plot_bbox(boxes[keep], 'r')# after nms 68 | 69 | 70 | 71 | -------------------------------------------------------------------------------- /lib/nms/nums_py2.pyx: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | cimport numpy as np 3 | # 4 | #boxes=np.array([[100,100,210,210,0.72], 5 | # [250,250,420,420,0.8], 6 | # [220,220,320,330,0.92], 7 | # [100,100,210,210,0.72], 8 | # [230,240,325,330,0.81], 9 | # [220,230,315,340,0.9]]) 10 | # 11 | 12 | 13 | cdef inline np.float32_t max(np.float32_t a, np.float32_t b): 14 | return a if a >= b else b 15 | 16 | cdef inline np.float32_t min(np.float32_t a, np.float32_t b): 17 | return a if a <= b else b 18 | 19 | def py_cpu_nms(np.ndarray[np.float32_t,ndim=2] dets, np.float thresh): 20 | # dets:(m,5) thresh:scaler 21 | 22 | cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:,0] 23 | cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:,1] 24 | cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:,2] 25 | cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:,3] 26 | cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4] 27 | 28 | cdef np.ndarray[np.float32_t, ndim=1] areas = (y2-y1+1) * (x2-x1+1) 29 | cdef np.ndarray[np.int_t, ndim=1] index = scores.argsort()[::-1] # can be rewriten 30 | keep = [] 31 | 32 | cdef int ndets = dets.shape[0] 33 | cdef np.ndarray[np.int_t, ndim=1] suppressed = np.zeros(ndets, dtype=np.int) 34 | 35 | cdef int _i, _j 36 | 37 | cdef int i, j 38 | 39 | cdef np.float32_t ix1, 
iy1, ix2, iy2, iarea 40 | cdef np.float32_t w, h 41 | cdef np.float32_t overlap, ious 42 | 43 | j=0 44 | 45 | for _i in range(ndets): 46 | i = index[_i] 47 | 48 | if suppressed[i] == 1: 49 | continue 50 | keep.append(i) 51 | 52 | ix1 = x1[i] 53 | iy1 = y1[i] 54 | ix2 = x2[i] 55 | iy2 = y2[i] 56 | 57 | iarea = areas[i] 58 | 59 | for _j in range(_i+1, ndets): 60 | j = index[_j] 61 | if suppressed[j] == 1: 62 | continue 63 | xx1 = max(ix1, x1[j]) 64 | yy1 = max(iy1, y1[j]) 65 | xx2 = min(ix2, x2[j]) 66 | yy2 = min(iy2, y2[j]) 67 | 68 | w = max(0.0, xx2-xx1+1) 69 | h = max(0.0, yy2-yy1+1) 70 | 71 | overlap = w*h 72 | ious = overlap / (iarea + areas[j] - overlap) 73 | if ious>thresh: 74 | suppressed[j] = 1 75 | 76 | return keep 77 | 78 | import matplotlib.pyplot as plt 79 | def plot_bbox(dets, c='k'): 80 | 81 | x1 = dets[:,0] 82 | y1 = dets[:,1] 83 | x2 = dets[:,2] 84 | y2 = dets[:,3] 85 | 86 | plt.plot([x1,x2], [y1,y1], c) 87 | plt.plot([x1,x1], [y1,y2], c) 88 | plt.plot([x1,x2], [y2,y2], c) 89 | plt.plot([x2,x2], [y1,y2], c) 90 | 91 | 92 | #plot_bbox(boxes,'k') # before nms 93 | # 94 | #keep = py_cpu_nms(boxes, thresh=0.7) 95 | #plot_bbox(boxes[keep], 'r')# after nms 96 | 97 | 98 | -------------------------------------------------------------------------------- /lib/nms/setup1.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | from Cython.Build import cythonize 3 | 4 | setup( 5 | name = 'nms_module', 6 | ext_modules = cythonize('nums_py1.pyx'), 7 | ) 8 | -------------------------------------------------------------------------------- /lib/nms/setup2.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | from Cython.Build import cythonize 3 | 4 | setup( 5 | name = 'nms_module', 6 | ext_modules = cythonize('nums_py2.pyx'), 7 | ) 8 | -------------------------------------------------------------------------------- /lib/setup3.py: -------------------------------------------------------------------------------- 1 | 2 | # -------------------------------------------------------- 3 | # Faster R-CNN 4 | # Copyright (c) 2015 Microsoft 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # Written by Ross Girshick 7 | # -------------------------------------------------------- 8 | 9 | 10 | from distutils.core import setup 11 | from Cython.Build import cythonize 12 | from distutils.extension import Extension 13 | from Cython.Distutils import build_ext 14 | import subprocess 15 | import numpy as np 16 | import os 17 | from os.path import join as pjoin 18 | 19 | 20 | def find_in_path(name, path): 21 | "Find a file in a search path" 22 | # Adapted from 23 | # http://code.activestate.com/recipes/52224-find-a-file-given-a-search-path/ 24 | for dir in path.split(os.pathsep): 25 | binpath = pjoin(dir, name) 26 | if os.path.exists(binpath): 27 | return os.path.abspath(binpath) 28 | return None 29 | 30 | def locate_cuda(): 31 | """Locate the CUDA environment on the system 32 | Returns a dict with keys 'home', 'nvcc', 'include', and 'lib64' 33 | and values giving the absolute path to each directory. 34 | Starts by looking for the CUDAHOME env variable. If not found, everything 35 | is based on finding 'nvcc' in the PATH.
36 | """ 37 | 38 | # first check if the CUDAHOME env variable is in use 39 | if 'CUDAHOME' in os.environ: 40 | home = os.environ['CUDAHOME'] 41 | nvcc = pjoin(home, 'bin', 'nvcc') 42 | else: 43 | # otherwise, search the PATH for NVCC 44 | default_path = pjoin(os.sep, 'usr', 'local', 'cuda', 'bin') 45 | nvcc = find_in_path('nvcc', os.environ['PATH'] + os.pathsep + default_path) 46 | if nvcc is None: 47 | raise EnvironmentError('The nvcc binary could not be ' 48 | 'located in your $PATH. Either add it to your path, or set $CUDAHOME') 49 | home = os.path.dirname(os.path.dirname(nvcc)) 50 | 51 | cudaconfig = {'home':home, 'nvcc':nvcc, 52 | 'include': pjoin(home, 'include'), 53 | 'lib64': pjoin(home, 'lib64')} 54 | for k, v in cudaconfig.items(): 55 | if not os.path.exists(v): 56 | raise EnvironmentError('The CUDA %s path could not be located in %s' % (k, v)) 57 | 58 | return cudaconfig 59 | CUDA = locate_cuda() 60 | 61 | try: 62 | numpy_include = np.get_include() 63 | except AttributeError: 64 | numpy_include = np.get_numpy_include() 65 | 66 | 67 | def customize_compiler_for_nvcc(self): 68 | """inject deep into distutils to customize how the dispatch 69 | to gcc/nvcc works. 70 | If you subclass UnixCCompiler, it's not trivial to get your subclass 71 | injected in, and still have the right customizations (i.e. 72 | distutils.sysconfig.customize_compiler) run on it. So instead of going 73 | the OO route, I have this. Note, it's kindof like a wierd functional 74 | subclassing going on.""" 75 | 76 | # tell the compiler it can processes .cu 77 | self.src_extensions.append('.cu') 78 | 79 | # save references to the default compiler_so and _comple methods 80 | default_compiler_so = self.compiler_so 81 | super = self._compile 82 | 83 | # now redefine the _compile method. This gets executed for each 84 | # object but distutils doesn't have the ability to change compilers 85 | # based on source extension: we add it. 
86 | def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts): 87 | if os.path.splitext(src)[1] == '.cu': 88 | # use the cuda for .cu files 89 | self.set_executable('compiler_so', CUDA['nvcc']) 90 | # use only a subset of the extra_postargs, which are 1-1 translated 91 | # from the extra_compile_args in the Extension class 92 | postargs = extra_postargs['nvcc'] 93 | else: 94 | postargs = extra_postargs['gcc'] 95 | 96 | super(obj, src, ext, cc_args, postargs, pp_opts) 97 | # reset the default compiler_so, which we might have changed for cuda 98 | self.compiler_so = default_compiler_so 99 | 100 | # inject our redefined _compile method into the class 101 | self._compile = _compile 102 | 103 | 104 | # run the customize_compiler 105 | class custom_build_ext(build_ext): 106 | def build_extensions(self): 107 | customize_compiler_for_nvcc(self.compiler) 108 | build_ext.build_extensions(self) 109 | 110 | ext_modules = [Extension('nms.gpu_nms', 111 | ['nms/nms_kernel.cu', 'nms/gpu_nms.pyx'], 112 | library_dirs=[CUDA['lib64']], 113 | libraries=['cudart'], 114 | language='c++', 115 | runtime_library_dirs=[CUDA['lib64']], 116 | # this syntax is specific to this build system 117 | # we're only going to use certain compiler args with nvcc and not with 118 | # gcc the implementation of this trick is in customize_compiler() below 119 | extra_compile_args={'gcc': ["-Wno-unused-function"], 120 | 'nvcc': ['-arch=sm_35', 121 | '--ptxas-options=-v', 122 | '-c', 123 | '--compiler-options', 124 | "'-fPIC'"]}, 125 | include_dirs = [numpy_include, CUDA['include']] 126 | )] 127 | 128 | setup( 129 | name='fast_rcnn', 130 | ext_modules=ext_modules, 131 | # inject our custom trigger 132 | cmdclass={'build_ext': custom_build_ext}, 133 | ) 134 | 135 | -------------------------------------------------------------------------------- /lib/test_num.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | #from nms.nums_py2 import py_cpu_nms # for cpu 4 | from nms.gpu_nms import gpu_nms # for gpu 5 | 6 | 7 | np.random.seed( 1 ) # keep fixed 8 | num_rois = 6000 9 | minxy = np.random.randint(50,145,size=(num_rois ,2)) 10 | maxxy = np.random.randint(150,200,size=(num_rois ,2)) 11 | score = 0.8*np.random.random_sample((num_rois ,1))+0.2 12 | 13 | boxes_new = np.concatenate((minxy,maxxy,score), axis=1).astype(np.float32) 14 | 15 | def nms_test_time(boxes_new): 16 | 17 | thresh = [0.7,0.8,0.9] 18 | T = 50 19 | for i in range(len(thresh)): 20 | since = time.time() 21 | for t in range(T): 22 | 23 | # keep = py_cpu_nms(boxes_new, thresh=thresh[i]) # for cpu 24 | keep = gpu_nms(boxes_new, thresh=thresh[i]) # for gpu 25 | print("thresh={:.1f}, time wastes:{:.4f}".format(thresh[i], (time.time()-since)/T)) 26 | 27 | return keep 28 | 29 | 30 | if __name__ =="__main__": 31 | nms_test_time(boxes_new) 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /losses.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | 5 | def calc_iou(a, b): 6 | area = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1]) 7 | 8 | iw = torch.min(torch.unsqueeze(a[:, 2], dim=1), b[:, 2]) - torch.max(torch.unsqueeze(a[:, 0], 1), b[:, 0]) 9 | ih = torch.min(torch.unsqueeze(a[:, 3], dim=1), b[:, 3]) - torch.max(torch.unsqueeze(a[:, 1], 1), b[:, 1]) 10 | 11 | iw = torch.clamp(iw, min=0) 12 | ih = torch.clamp(ih, min=0) 13 | 14 | ua = torch.unsqueeze((a[:, 2] - 
a[:, 0]) * (a[:, 3] - a[:, 1]), dim=1) + area - iw * ih 15 | 16 | ua = torch.clamp(ua, min=1e-8) 17 | 18 | intersection = iw * ih 19 | 20 | IoU = intersection / ua 21 | 22 | return IoU 23 | 24 | class FocalLoss(nn.Module): 25 | #def __init__(self): 26 | 27 | def forward(self, classifications, regressions, anchors, annotations): 28 | alpha = 0.25 29 | gamma = 2.0 30 | batch_size = classifications.shape[0] 31 | classification_losses = [] 32 | regression_losses = [] 33 | 34 | anchor = anchors[0, :, :] 35 | 36 | anchor_widths = anchor[:, 2] - anchor[:, 0] 37 | anchor_heights = anchor[:, 3] - anchor[:, 1] 38 | anchor_ctr_x = anchor[:, 0] + 0.5 * anchor_widths 39 | anchor_ctr_y = anchor[:, 1] + 0.5 * anchor_heights 40 | 41 | for j in range(batch_size): 42 | 43 | classification = classifications[j, :, :] 44 | regression = regressions[j, :, :] 45 | 46 | bbox_annotation = annotations[j, :, :] 47 | bbox_annotation = bbox_annotation[bbox_annotation[:, 4] != -1] 48 | 49 | if bbox_annotation.shape[0] == 0: 50 | regression_losses.append(torch.tensor(0).float().cuda()) 51 | classification_losses.append(torch.tensor(0).float().cuda()) 52 | 53 | continue 54 | 55 | classification = torch.clamp(classification, 1e-4, 1.0 - 1e-4) 56 | 57 | IoU = calc_iou(anchors[0, :, :], bbox_annotation[:, :4]) # num_anchors x num_annotations 58 | 59 | IoU_max, IoU_argmax = torch.max(IoU, dim=1) # num_anchors x 1 60 | 61 | #import pdb 62 | #pdb.set_trace() 63 | 64 | # compute the loss for classification 65 | targets = torch.ones(classification.shape) * -1 66 | targets = targets.cuda() 67 | 68 | targets[torch.lt(IoU_max, 0.4), :] = 0 69 | 70 | positive_indices = torch.ge(IoU_max, 0.5) 71 | 72 | num_positive_anchors = positive_indices.sum() 73 | 74 | assigned_annotations = bbox_annotation[IoU_argmax, :] 75 | 76 | targets[positive_indices, :] = 0 77 | targets[positive_indices, assigned_annotations[positive_indices, 4].long()] = 1 78 | 79 | alpha_factor = torch.ones(targets.shape).cuda() * alpha 80 | 81 | alpha_factor = torch.where(torch.eq(targets, 1.), alpha_factor, 1. - alpha_factor) 82 | focal_weight = torch.where(torch.eq(targets, 1.), 1. 
- classification, classification) 83 | focal_weight = alpha_factor * torch.pow(focal_weight, gamma) 84 | 85 | bce = -(targets * torch.log(classification) + (1.0 - targets) * torch.log(1.0 - classification)) 86 | 87 | # cls_loss = focal_weight * torch.pow(bce, gamma) 88 | cls_loss = focal_weight * bce 89 | 90 | cls_loss = torch.where(torch.ne(targets, -1.0), cls_loss, torch.zeros(cls_loss.shape).cuda()) 91 | 92 | classification_losses.append(cls_loss.sum()/torch.clamp(num_positive_anchors.float(), min=1.0)) 93 | 94 | # compute the loss for regression 95 | 96 | if positive_indices.sum() > 0: 97 | assigned_annotations = assigned_annotations[positive_indices, :] 98 | 99 | anchor_widths_pi = anchor_widths[positive_indices] 100 | anchor_heights_pi = anchor_heights[positive_indices] 101 | anchor_ctr_x_pi = anchor_ctr_x[positive_indices] 102 | anchor_ctr_y_pi = anchor_ctr_y[positive_indices] 103 | 104 | gt_widths = assigned_annotations[:, 2] - assigned_annotations[:, 0] 105 | gt_heights = assigned_annotations[:, 3] - assigned_annotations[:, 1] 106 | gt_ctr_x = assigned_annotations[:, 0] + 0.5 * gt_widths 107 | gt_ctr_y = assigned_annotations[:, 1] + 0.5 * gt_heights 108 | 109 | # clip widths to 1 110 | gt_widths = torch.clamp(gt_widths, min=1) 111 | gt_heights = torch.clamp(gt_heights, min=1) 112 | 113 | targets_dx = (gt_ctr_x - anchor_ctr_x_pi) / anchor_widths_pi 114 | targets_dy = (gt_ctr_y - anchor_ctr_y_pi) / anchor_heights_pi 115 | targets_dw = torch.log(gt_widths / anchor_widths_pi) 116 | targets_dh = torch.log(gt_heights / anchor_heights_pi) 117 | 118 | targets = torch.stack((targets_dx, targets_dy, targets_dw, targets_dh)) 119 | targets = targets.t() 120 | 121 | targets = targets/torch.Tensor([[0.1, 0.1, 0.2, 0.2]]).cuda() 122 | 123 | 124 | negative_indices = 1 - positive_indices 125 | 126 | regression_diff = torch.abs(targets - regression[positive_indices, :]) 127 | 128 | regression_loss = torch.where( 129 | torch.le(regression_diff, 1.0 / 9.0), 130 | 0.5 * 9.0 * torch.pow(regression_diff, 2), 131 | regression_diff - 0.5 / 9.0 132 | ) 133 | regression_losses.append(regression_loss.mean()) 134 | else: 135 | regression_losses.append(torch.tensor(0).float().cuda()) 136 | 137 | return torch.stack(classification_losses).mean(dim=0, keepdim=True), torch.stack(regression_losses).mean(dim=0, keepdim=True) 138 | 139 | 140 | -------------------------------------------------------------------------------- /model/BottleneckBlock.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from .attentionConv2d import AttentionConv2d 4 | from ..utils.utils import comptue_dim 5 | 6 | 7 | class BottleneckBlock(nn.Module): 8 | expansion = 4 9 | def __init__(self, input_dim, output_dim, stride=1, downsample_shortcut=None, attention=False, expansion=4, 10 | kappa=None, nu=None, num_heads=None, H=None, W=None): 11 | super(BottleneckBlock, self).__init__() 12 | self.expansion = expansion 13 | 14 | self.relu = nn.ReLU(inplace=True) 15 | self.conv1 = nn.Conv2d(input_dim, output_dim, kernel_size=1, bias=False) 16 | self.bn1 = nn.BatchNorm2d(output_dim) 17 | 18 | self.conv2 = nn.Conv2d(output_dim, output_dim, kernel_size=3, stride=stride, padding=1, bias=False) 19 | self.bn2 = nn.BatchNorm2d(output_dim) 20 | 21 | expansion_dim = expansion * output_dim 22 | self.conv3 = None 23 | if attention: 24 | dk = round(kappa * expansion_dim) 25 | dv = round(nu * expansion_dim) 26 | h = comptue_dim(H, 1, 3, stride) 27 | w = comptue_dim(W, 1, 3, stride) 
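# dk/dv: key and value depths of the attention branch, set as the fractions kappa and nu
# of the block's expanded output channels (expansion * output_dim).
# h/w: spatial size after the stride-`stride` 3x3 conv above; AttentionConv2d uses them
# to size its relative position embeddings (key_rel_h / key_rel_w).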
28 | self.conv3 = AttentionConv2d(output_dim, expansion_dim, dk, dv, num_heads, kernel_size=1, padding=0, 29 | height=h, width=w) 30 | 31 | else: 32 | self.conv3 = nn.Conv2d(output_dim, expansion_dim, kernel_size=1, bias=False) 33 | 34 | self.bn3 = nn.BatchNorm2d(expansion * output_dim) 35 | self.downsample_shortcut = downsample_shortcut 36 | 37 | def forward(self, x): 38 | residual = x 39 | out = self.conv1(x) 40 | out = self.bn1(out) 41 | out = self.relu(out) 42 | 43 | out = self.conv2(out) 44 | out = self.bn2(out) 45 | out = self.relu(out) 46 | 47 | out = self.conv3(out) 48 | out = self.bn3(out) 49 | 50 | if self.downsample_shortcut is not None: 51 | residual = self.downsample_shortcut(x) 52 | 53 | out += residual 54 | out = self.relu(out) 55 | 56 | return out 57 | -------------------------------------------------------------------------------- /model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sebastiani/pytorch-attention-augmented-convolution/4f0eb899714f22a88a1b6a602ee2dfb20f59a4b6/model/__init__.py -------------------------------------------------------------------------------- /model/anchors.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | 5 | 6 | class Anchors(nn.Module): 7 | def __init__(self, pyramid_levels=None, strides=None, sizes=None, ratios=None, scales=None): 8 | super(Anchors, self).__init__() 9 | 10 | if pyramid_levels is None: 11 | self.pyramid_levels = [3, 4, 5, 6, 7] 12 | if strides is None: 13 | self.strides = [2 ** x for x in self.pyramid_levels] 14 | if sizes is None: 15 | self.sizes = [2 ** (x + 2) for x in self.pyramid_levels] 16 | if ratios is None: 17 | self.ratios = np.array([0.5, 1, 2]) 18 | if scales is None: 19 | self.scales = np.array([2 ** 0, 2 ** (1.0 / 3.0), 2 ** (2.0 / 3.0)]) 20 | 21 | def forward(self, image): 22 | 23 | image_shape = image.shape[2:] 24 | image_shape = np.array(image_shape) 25 | image_shapes = [(image_shape + 2 ** x - 1) // (2 ** x) for x in self.pyramid_levels] 26 | 27 | # compute anchors over all pyramid levels 28 | all_anchors = np.zeros((0, 4)).astype(np.float32) 29 | 30 | for idx, p in enumerate(self.pyramid_levels): 31 | anchors = generate_anchors(base_size=self.sizes[idx], ratios=self.ratios, scales=self.scales) 32 | shifted_anchors = shift(image_shapes[idx], self.strides[idx], anchors) 33 | all_anchors = np.append(all_anchors, shifted_anchors, axis=0) 34 | 35 | all_anchors = np.expand_dims(all_anchors, axis=0) 36 | 37 | return torch.from_numpy(all_anchors.astype(np.float32)).cuda() 38 | 39 | def generate_anchors(base_size=16, ratios=None, scales=None): 40 | """ 41 | Generate anchor (reference) windows by enumerating aspect ratios X 42 | scales w.r.t. a reference window. 
43 | """ 44 | 45 | if ratios is None: 46 | ratios = np.array([0.5, 1, 2]) 47 | 48 | if scales is None: 49 | scales = np.array([2 ** 0, 2 ** (1.0 / 3.0), 2 ** (2.0 / 3.0)]) 50 | 51 | num_anchors = len(ratios) * len(scales) 52 | 53 | # initialize output anchors 54 | anchors = np.zeros((num_anchors, 4)) 55 | 56 | # scale base_size 57 | anchors[:, 2:] = base_size * np.tile(scales, (2, len(ratios))).T 58 | 59 | # compute areas of anchors 60 | areas = anchors[:, 2] * anchors[:, 3] 61 | 62 | # correct for ratios 63 | anchors[:, 2] = np.sqrt(areas / np.repeat(ratios, len(scales))) 64 | anchors[:, 3] = anchors[:, 2] * np.repeat(ratios, len(scales)) 65 | 66 | # transform from (x_ctr, y_ctr, w, h) -> (x1, y1, x2, y2) 67 | anchors[:, 0::2] -= np.tile(anchors[:, 2] * 0.5, (2, 1)).T 68 | anchors[:, 1::2] -= np.tile(anchors[:, 3] * 0.5, (2, 1)).T 69 | 70 | return anchors 71 | 72 | def compute_shape(image_shape, pyramid_levels): 73 | """Compute shapes based on pyramid levels. 74 | 75 | :param image_shape: 76 | :param pyramid_levels: 77 | :return: 78 | """ 79 | image_shape = np.array(image_shape[:2]) 80 | image_shapes = [(image_shape + 2 ** x - 1) // (2 ** x) for x in pyramid_levels] 81 | return image_shapes 82 | 83 | 84 | def anchors_for_shape( 85 | image_shape, 86 | pyramid_levels=None, 87 | ratios=None, 88 | scales=None, 89 | strides=None, 90 | sizes=None, 91 | shapes_callback=None, 92 | ): 93 | 94 | image_shapes = compute_shape(image_shape, pyramid_levels) 95 | 96 | # compute anchors over all pyramid levels 97 | all_anchors = np.zeros((0, 4)) 98 | for idx, p in enumerate(pyramid_levels): 99 | anchors = generate_anchors(base_size=sizes[idx], ratios=ratios, scales=scales) 100 | shifted_anchors = shift(image_shapes[idx], strides[idx], anchors) 101 | all_anchors = np.append(all_anchors, shifted_anchors, axis=0) 102 | 103 | return all_anchors 104 | 105 | 106 | def shift(shape, stride, anchors): 107 | shift_x = (np.arange(0, shape[1]) + 0.5) * stride 108 | shift_y = (np.arange(0, shape[0]) + 0.5) * stride 109 | 110 | shift_x, shift_y = np.meshgrid(shift_x, shift_y) 111 | 112 | shifts = np.vstack(( 113 | shift_x.ravel(), shift_y.ravel(), 114 | shift_x.ravel(), shift_y.ravel() 115 | )).transpose() 116 | 117 | # add A anchors (1, A, 4) to 118 | # cell K shifts (K, 1, 4) to get 119 | # shift anchors (K, A, 4) 120 | # reshape to (K*A, 4) shifted anchors 121 | A = anchors.shape[0] 122 | K = shifts.shape[0] 123 | all_anchors = (anchors.reshape((1, A, 4)) + shifts.reshape((1, K, 4)).transpose((1, 0, 2))) 124 | all_anchors = all_anchors.reshape((K * A, 4)) 125 | 126 | return all_anchors 127 | 128 | -------------------------------------------------------------------------------- /model/attentionConv2d.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch import einsum 5 | 6 | 7 | class AttentionConv2d(nn.Module): 8 | def __init__(self, input_dim, output_dim, dk, dv, num_heads, kernel_size, padding, rel_encoding=True, height=None, width=None): 9 | super(AttentionConv2d, self).__init__() 10 | self.input_dim = input_dim 11 | self.output_dim = output_dim 12 | self.dk = dk 13 | self.dv = dv 14 | self.num_heads = num_heads 15 | self.kernel_size = kernel_size 16 | self.dkh = self.dk // self.num_heads 17 | if rel_encoding and not height: 18 | raise("Cannot use relative encoding without specifying input's height and width") 19 | self.H = height 20 | self.W = width 21 | 22 | self.conv_qkv = 
nn.Conv2d(input_dim, 2*dk + dv, 1) 23 | self.conv_attn = nn.Conv2d(dv, dv, 1) 24 | self.conv_out = nn.Conv2d(input_dim, output_dim - dv, kernel_size, padding=padding) 25 | self.softmax = nn.Softmax(dim=-1) 26 | self.key_rel_w = nn.Parameter(self.dkh**-0.5 + torch.rand(2*width-1, self.dkh), requires_grad=True) 27 | self.key_rel_h = nn.Parameter(self.dkh**-0.5 + torch.rand(2*height-1, self.dkh), requires_grad=True) 28 | self.relative_encoding = rel_encoding 29 | 30 | 31 | def forward(self, input): 32 | conv_out = self.conv_out(input) 33 | 34 | qkv = self.conv_qkv(input) # batch_size, 2*dk+dv, H, W 35 | 36 | q, k, v = torch.split(qkv, [self.dk, self.dk, self.dv], dim=1) 37 | 38 | batch_size, _, H, W = q.size() 39 | 40 | q = q.view([batch_size, self.num_heads, self.dk // self.num_heads, H*W]) 41 | k = k.view([batch_size, self.num_heads, self.dk // self.num_heads, H*W]) 42 | v = v.view([batch_size, self.num_heads, self.dv // self.num_heads, H*W]) 43 | 44 | q *= self.dkh ** -0.5 45 | logits = einsum('ijkl, ijkm -> ijlm', q, k) 46 | if self.relative_encoding: 47 | h_rel_logits, w_rel_logits = self._relative_logits(q) 48 | logits += h_rel_logits 49 | logits += w_rel_logits 50 | 51 | weights = self.softmax(logits) 52 | attn_out = einsum('ijkl, ijfl -> ijfk', weights, v) 53 | attn_out = attn_out.contiguous().view(batch_size, self.dv, H, W) 54 | attn_out = self.conv_attn(attn_out) 55 | output = torch.cat([conv_out, attn_out], dim=1) 56 | return output 57 | 58 | def _relative_logits(self, q): 59 | b, nh, dkh, _ = q.size() 60 | q = q.view(b, nh, dkh, self.H, self.W) 61 | 62 | rel_logits_w = self._relative_logits1d(q, self.key_rel_w, self.H, self.W, nh, [0, 1, 2, 4, 3, 5]) 63 | rel_logits_h = self._relative_logits1d(q.permute(0, 1, 2, 4, 3), self.key_rel_h, self.W, self.H, nh, [0, 1, 4, 2, 5, 3]) 64 | return rel_logits_h, rel_logits_w 65 | 66 | def _relative_logits1d(self, q, rel_k, H, W, Nh, transpose_mask): 67 | rel_logits = einsum('bhdxy, md -> bhxym', q, rel_k) 68 | 69 | rel_logits = rel_logits.view([-1, Nh*H, W, 2*W-1]) 70 | rel_logits = self._rel_to_abs(rel_logits) 71 | rel_logits = rel_logits.view([-1, Nh, H, W, W]).unsqueeze(dim=3).repeat([1,1,1,H,1,1]) 72 | rel_logits = rel_logits.permute(*transpose_mask) 73 | rel_logits = rel_logits.contiguous().view(-1, Nh, H*W, H*W) 74 | return rel_logits 75 | 76 | def _rel_to_abs(self, x): 77 | b, nh, l, _ = x.size() 78 | 79 | 80 | x = F.pad(x, (0,1), 'constant', 0) 81 | flat_x = x.view([b, nh, l*(2*l)]); 82 | flat_x_padded = F.pad(flat_x, (0, l-1), 'constant', 0) 83 | 84 | final_x = flat_x_padded.view([b, nh, l+1, 2*l-1]) 85 | final_x = final_x[:, :, :l, l-1:] 86 | 87 | return final_x 88 | -------------------------------------------------------------------------------- /model/gpu_nms.pyx: -------------------------------------------------------------------------------- 1 | 2 | # -------------------------------------------------------- 3 | # Faster R-CNN 4 | # Copyright (c) 2015 Microsoft 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # Written by Ross Girshick 7 | # -------------------------------------------------------- 8 | 9 | 10 | import numpy as np 11 | cimport numpy as np 12 | 13 | assert sizeof(int) == sizeof(np.int32_t) 14 | 15 | cdef extern from "gpu_nms.hpp": 16 | void _nms(np.int32_t*, int*, np.float32_t*, int, int, float, int) 17 | 18 | def gpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh, 19 | np.int32_t device_id=0): 20 | cdef int boxes_num = dets.shape[0] 21 | cdef int boxes_dim = dets.shape[1] 22 | cdef int 
num_out 23 | cdef np.ndarray[np.int32_t, ndim=1] \ 24 | keep = np.zeros(boxes_num, dtype=np.int32) 25 | cdef np.ndarray[np.float32_t, ndim=1] \ 26 | scores = dets[:, 4] 27 | cdef np.ndarray[np.int_t, ndim=1] \ 28 | order = scores.argsort()[::-1] 29 | cdef np.ndarray[np.float32_t, ndim=2] \ 30 | sorted_dets = dets[order, :] 31 | _nms(&keep[0], &num_out, &sorted_dets[0, 0], boxes_num, boxes_dim, thresh, device_id) 32 | keep = keep[:num_out] 33 | return list(order[keep]) 34 | 35 | 36 | -------------------------------------------------------------------------------- /model/losses.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | 5 | def calc_iou(a, b): 6 | area = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1]) 7 | 8 | iw = torch.min(torch.unsqueeze(a[:, 2], dim=1), b[:, 2]) - torch.max(torch.unsqueeze(a[:, 0], 1), b[:, 0]) 9 | ih = torch.min(torch.unsqueeze(a[:, 3], dim=1), b[:, 3]) - torch.max(torch.unsqueeze(a[:, 1], 1), b[:, 1]) 10 | 11 | iw = torch.clamp(iw, min=0) 12 | ih = torch.clamp(ih, min=0) 13 | 14 | ua = torch.unsqueeze((a[:, 2] - a[:, 0]) * (a[:, 3] - a[:, 1]), dim=1) + area - iw * ih 15 | 16 | ua = torch.clamp(ua, min=1e-8) 17 | 18 | intersection = iw * ih 19 | 20 | IoU = intersection / ua 21 | 22 | return IoU 23 | 24 | class FocalLoss(nn.Module): 25 | #def __init__(self): 26 | 27 | def forward(self, classifications, regressions, anchors, annotations): 28 | alpha = 0.25 29 | gamma = 2.0 30 | batch_size = classifications.shape[0] 31 | classification_losses = [] 32 | regression_losses = [] 33 | 34 | anchor = anchors[0, :, :] 35 | 36 | anchor_widths = anchor[:, 2] - anchor[:, 0] 37 | anchor_heights = anchor[:, 3] - anchor[:, 1] 38 | anchor_ctr_x = anchor[:, 0] + 0.5 * anchor_widths 39 | anchor_ctr_y = anchor[:, 1] + 0.5 * anchor_heights 40 | 41 | for j in range(batch_size): 42 | 43 | classification = classifications[j, :, :] 44 | regression = regressions[j, :, :] 45 | 46 | bbox_annotation = annotations[j, :, :] 47 | bbox_annotation = bbox_annotation[bbox_annotation[:, 4] != -1] 48 | 49 | if bbox_annotation.shape[0] == 0: 50 | regression_losses.append(torch.tensor(0).float().cuda()) 51 | classification_losses.append(torch.tensor(0).float().cuda()) 52 | 53 | continue 54 | 55 | classification = torch.clamp(classification, 1e-4, 1.0 - 1e-4) 56 | 57 | IoU = calc_iou(anchors[0, :, :], bbox_annotation[:, :4]) # num_anchors x num_annotations 58 | 59 | IoU_max, IoU_argmax = torch.max(IoU, dim=1) # num_anchors x 1 60 | 61 | #import pdb 62 | #pdb.set_trace() 63 | 64 | # compute the loss for classification 65 | targets = torch.ones(classification.shape) * -1 66 | targets = targets.cuda() 67 | 68 | targets[torch.lt(IoU_max, 0.4), :] = 0 69 | 70 | positive_indices = torch.ge(IoU_max, 0.5) 71 | 72 | num_positive_anchors = positive_indices.sum() 73 | 74 | assigned_annotations = bbox_annotation[IoU_argmax, :] 75 | 76 | targets[positive_indices, :] = 0 77 | targets[positive_indices, assigned_annotations[positive_indices, 4].long()] = 1 78 | 79 | alpha_factor = torch.ones(targets.shape).cuda() * alpha 80 | 81 | alpha_factor = torch.where(torch.eq(targets, 1.), alpha_factor, 1. - alpha_factor) 82 | focal_weight = torch.where(torch.eq(targets, 1.), 1. 
- classification, classification) 83 | focal_weight = alpha_factor * torch.pow(focal_weight, gamma) 84 | 85 | bce = -(targets * torch.log(classification) + (1.0 - targets) * torch.log(1.0 - classification)) 86 | 87 | # cls_loss = focal_weight * torch.pow(bce, gamma) 88 | cls_loss = focal_weight * bce 89 | 90 | cls_loss = torch.where(torch.ne(targets, -1.0), cls_loss, torch.zeros(cls_loss.shape).cuda()) 91 | 92 | classification_losses.append(cls_loss.sum()/torch.clamp(num_positive_anchors.float(), min=1.0)) 93 | 94 | # compute the loss for regression 95 | 96 | if positive_indices.sum() > 0: 97 | assigned_annotations = assigned_annotations[positive_indices, :] 98 | 99 | anchor_widths_pi = anchor_widths[positive_indices] 100 | anchor_heights_pi = anchor_heights[positive_indices] 101 | anchor_ctr_x_pi = anchor_ctr_x[positive_indices] 102 | anchor_ctr_y_pi = anchor_ctr_y[positive_indices] 103 | 104 | gt_widths = assigned_annotations[:, 2] - assigned_annotations[:, 0] 105 | gt_heights = assigned_annotations[:, 3] - assigned_annotations[:, 1] 106 | gt_ctr_x = assigned_annotations[:, 0] + 0.5 * gt_widths 107 | gt_ctr_y = assigned_annotations[:, 1] + 0.5 * gt_heights 108 | 109 | # clip widths to 1 110 | gt_widths = torch.clamp(gt_widths, min=1) 111 | gt_heights = torch.clamp(gt_heights, min=1) 112 | 113 | targets_dx = (gt_ctr_x - anchor_ctr_x_pi) / anchor_widths_pi 114 | targets_dy = (gt_ctr_y - anchor_ctr_y_pi) / anchor_heights_pi 115 | targets_dw = torch.log(gt_widths / anchor_widths_pi) 116 | targets_dh = torch.log(gt_heights / anchor_heights_pi) 117 | 118 | targets = torch.stack((targets_dx, targets_dy, targets_dw, targets_dh)) 119 | targets = targets.t() 120 | 121 | targets = targets/torch.Tensor([[0.1, 0.1, 0.2, 0.2]]).cuda() 122 | 123 | 124 | negative_indices = 1 - positive_indices 125 | 126 | regression_diff = torch.abs(targets - regression[positive_indices, :]) 127 | 128 | regression_loss = torch.where( 129 | torch.le(regression_diff, 1.0 / 9.0), 130 | 0.5 * 9.0 * torch.pow(regression_diff, 2), 131 | regression_diff - 0.5 / 9.0 132 | ) 133 | regression_losses.append(regression_loss.mean()) 134 | else: 135 | regression_losses.append(torch.tensor(0).float().cuda()) 136 | 137 | return torch.stack(classification_losses).mean(dim=0, keepdim=True), torch.stack(regression_losses).mean(dim=0, keepdim=True) 138 | 139 | 140 | -------------------------------------------------------------------------------- /model/retinanet.py: -------------------------------------------------------------------------------- 1 | """Original from from https://github.com/yhenon/pytorch-retinanet""" 2 | import sys 3 | #sys.path.append('../') 4 | import torch.nn as nn 5 | import torch 6 | import math 7 | import time 8 | import torch.utils.model_zoo as model_zoo 9 | from ..utils.utils import BasicBlock, Bottleneck, BBoxTransform, ClipBoxes 10 | from .BottleneckBlock import BottleneckBlock 11 | from .anchors import Anchors 12 | from .losses import FocalLoss 13 | from ..lib.nms.gpu_nms import gpu_nms 14 | 15 | 16 | def nms(dets, thresh): 17 | """Dispatch to either CPU or GPU NMS implementations.\ 18 | Accept dets as tensor""" 19 | return gpu_nms(dets, thresh) 20 | 21 | 22 | model_urls = { 23 | 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth', 24 | 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth', 25 | 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth', 26 | 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth', 27 | 'resnet152': 
'https://download.pytorch.org/models/resnet152-b121ed2d.pth', 28 | } 29 | 30 | 31 | class PyramidFeatures(nn.Module): 32 | def __init__(self, C3_size, C4_size, C5_size, feature_size=256): 33 | super(PyramidFeatures, self).__init__() 34 | 35 | # upsample C5 to get P5 from the FPN paper 36 | self.P5_1 = nn.Conv2d(C5_size, feature_size, kernel_size=1, stride=1, padding=0) 37 | self.P5_upsampled = nn.Upsample(scale_factor=2, mode='nearest') 38 | self.P5_2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, stride=1, padding=1) 39 | 40 | # add P5 elementwise to C4 41 | self.P4_1 = nn.Conv2d(C4_size, feature_size, kernel_size=1, stride=1, padding=0) 42 | self.P4_upsampled = nn.Upsample(scale_factor=2, mode='nearest') 43 | self.P4_2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, stride=1, padding=1) 44 | 45 | # add P4 elementwise to C3 46 | self.P3_1 = nn.Conv2d(C3_size, feature_size, kernel_size=1, stride=1, padding=0) 47 | self.P3_2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, stride=1, padding=1) 48 | 49 | # "P6 is obtained via a 3x3 stride-2 conv on C5" 50 | self.P6 = nn.Conv2d(C5_size, feature_size, kernel_size=3, stride=2, padding=1) 51 | 52 | # "P7 is computed by applying ReLU followed by a 3x3 stride-2 conv on P6" 53 | self.P7_1 = nn.ReLU() 54 | self.P7_2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, stride=2, padding=1) 55 | 56 | def forward(self, inputs): 57 | C3, C4, C5 = inputs 58 | 59 | P5_x = self.P5_1(C5) 60 | P5_upsampled_x = self.P5_upsampled(P5_x) 61 | P5_x = self.P5_2(P5_x) 62 | 63 | P4_x = self.P4_1(C4) 64 | P4_x = P5_upsampled_x + P4_x 65 | P4_upsampled_x = self.P4_upsampled(P4_x) 66 | P4_x = self.P4_2(P4_x) 67 | 68 | P3_x = self.P3_1(C3) 69 | P3_x = P3_x + P4_upsampled_x 70 | P3_x = self.P3_2(P3_x) 71 | 72 | P6_x = self.P6(C5) 73 | 74 | P7_x = self.P7_1(P6_x) 75 | P7_x = self.P7_2(P7_x) 76 | 77 | return [P3_x, P4_x, P5_x, P6_x, P7_x] 78 | 79 | 80 | class RegressionModel(nn.Module): 81 | def __init__(self, num_features_in, num_anchors=9, feature_size=256): 82 | super(RegressionModel, self).__init__() 83 | 84 | self.conv1 = nn.Conv2d(num_features_in, feature_size, kernel_size=3, padding=1) 85 | self.act1 = nn.ReLU() 86 | 87 | self.conv2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1) 88 | self.act2 = nn.ReLU() 89 | 90 | self.conv3 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1) 91 | self.act3 = nn.ReLU() 92 | 93 | self.conv4 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1) 94 | self.act4 = nn.ReLU() 95 | 96 | self.output = nn.Conv2d(feature_size, num_anchors * 4, kernel_size=3, padding=1) 97 | 98 | def forward(self, x): 99 | out = self.conv1(x) 100 | out = self.act1(out) 101 | 102 | out = self.conv2(out) 103 | out = self.act2(out) 104 | 105 | out = self.conv3(out) 106 | out = self.act3(out) 107 | 108 | out = self.conv4(out) 109 | out = self.act4(out) 110 | 111 | out = self.output(out) 112 | 113 | # out is B x C x W x H, with C = 4*num_anchors 114 | out = out.permute(0, 2, 3, 1) 115 | 116 | return out.contiguous().view(out.shape[0], -1, 4) 117 | 118 | 119 | class ClassificationModel(nn.Module): 120 | def __init__(self, num_features_in, num_anchors=9, num_classes=80, prior=0.01, feature_size=256): 121 | super(ClassificationModel, self).__init__() 122 | 123 | self.num_classes = num_classes 124 | self.num_anchors = num_anchors 125 | 126 | self.conv1 = nn.Conv2d(num_features_in, feature_size, kernel_size=3, padding=1) 127 | self.act1 = nn.ReLU() 128 | 129 | self.conv2 = nn.Conv2d(feature_size, 
feature_size, kernel_size=3, padding=1) 130 | self.act2 = nn.ReLU() 131 | 132 | self.conv3 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1) 133 | self.act3 = nn.ReLU() 134 | 135 | self.conv4 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1) 136 | self.act4 = nn.ReLU() 137 | 138 | self.output = nn.Conv2d(feature_size, num_anchors * num_classes, kernel_size=3, padding=1) 139 | self.output_act = nn.Sigmoid() 140 | 141 | def forward(self, x): 142 | out = self.conv1(x) 143 | out = self.act1(out) 144 | 145 | out = self.conv2(out) 146 | out = self.act2(out) 147 | 148 | out = self.conv3(out) 149 | out = self.act3(out) 150 | 151 | out = self.conv4(out) 152 | out = self.act4(out) 153 | 154 | out = self.output(out) 155 | out = self.output_act(out) 156 | 157 | # out is B x C x W x H, with C = n_classes + n_anchors 158 | out1 = out.permute(0, 2, 3, 1) 159 | 160 | batch_size, width, height, channels = out1.shape 161 | 162 | out2 = out1.view(batch_size, width, height, self.num_anchors, self.num_classes) 163 | 164 | return out2.contiguous().view(x.shape[0], -1, self.num_classes) 165 | 166 | 167 | class ResNet(nn.Module): 168 | 169 | def __init__(self, num_classes, block, layers, attention=False, input_size=None): 170 | self.inplanes = 64 171 | super(ResNet, self).__init__() 172 | self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False) 173 | self.bn1 = nn.BatchNorm2d(64) 174 | self.relu = nn.ReLU(inplace=True) 175 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 176 | 177 | dummy = torch.rand((1, *input_size)) 178 | sizes = self.compute_sizes(self.conv1, dummy) 179 | 180 | self.layer1 = self._make_layer(block, 64, layers[0], attention, h=sizes[2], w=sizes[3]) 181 | dummy = torch.rand(sizes) 182 | sizes = self.compute_sizes(self.layer1, dummy) 183 | 184 | self.layer2 = self._make_layer(block, 128, layers[1], attention, stride=2, h=sizes[2], w=sizes[3]) 185 | dummy = torch.rand(sizes) 186 | sizes = self.compute_sizes(self.layer2, dummy) 187 | 188 | self.layer3 = self._make_layer(block, 256, layers[2], attention, stride=2, h=sizes[2], w=sizes[3]) 189 | dummy = torch.rand(sizes) 190 | sizes = self.compute_sizes(self.layer3, dummy) 191 | 192 | self.layer4 = self._make_layer(block, 512, layers[3], attention, stride=2, h=sizes[2], w=sizes[3]) 193 | 194 | if block == BasicBlock: 195 | fpn_sizes = [self.layer2[layers[1] - 1].conv2.out_channels, self.layer3[layers[2] - 1].conv2.out_channels, 196 | self.layer4[layers[3] - 1].conv2.out_channels] 197 | elif block == Bottleneck or block == BottleneckBlock: 198 | fpn_sizes = [self.layer2[layers[1] - 1].conv3.out_channels, self.layer3[layers[2] - 1].conv3.out_channels, 199 | self.layer4[layers[3] - 1].conv3.out_channels] 200 | 201 | self.fpn = PyramidFeatures(fpn_sizes[0], fpn_sizes[1], fpn_sizes[2]) 202 | 203 | self.regressionModel = RegressionModel(256) 204 | self.classificationModel = ClassificationModel(256, num_classes=num_classes) 205 | 206 | self.anchors = Anchors() 207 | 208 | self.regressBoxes = BBoxTransform() 209 | 210 | self.clipBoxes = ClipBoxes() 211 | 212 | self.focalLoss = FocalLoss() 213 | 214 | for m in self.modules(): 215 | if isinstance(m, nn.Conv2d): 216 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 217 | m.weight.data.normal_(0, math.sqrt(2. 
/ n)) 218 | elif isinstance(m, nn.BatchNorm2d): 219 | m.weight.data.fill_(1) 220 | m.bias.data.zero_() 221 | 222 | prior = 0.01 223 | 224 | self.classificationModel.output.weight.data.fill_(0) 225 | self.classificationModel.output.bias.data.fill_(-math.log((1.0 - prior) / prior)) 226 | 227 | self.regressionModel.output.weight.data.fill_(0) 228 | self.regressionModel.output.bias.data.fill_(0) 229 | 230 | self.freeze_bn() 231 | 232 | def compute_sizes(self, layer, dummy_input): 233 | dummy_input = layer(dummy_input) 234 | return dummy_input.size() 235 | 236 | def _make_layer(self, block, planes, blocks, attention, stride=1, h=None, w=None): 237 | downsample = None 238 | if stride != 1 or self.inplanes != planes * block.expansion: 239 | downsample = nn.Sequential( 240 | nn.Conv2d(self.inplanes, planes * block.expansion, 241 | kernel_size=1, stride=stride, bias=False), 242 | nn.BatchNorm2d(planes * block.expansion), 243 | ) 244 | 245 | layers = [] 246 | if block == BottleneckBlock: 247 | layers.append(block(self.inplanes, planes, stride, downsample, attention, kappa=0.1, nu=0.05, num_heads=4, H=h, W=w)) #how to determine heights and widths =X 248 | self.inplanes = planes * block.expansion 249 | for i in range(1, blocks): 250 | layers.append(block(self.inplanes, planes)) 251 | 252 | return nn.Sequential(*layers) 253 | 254 | def freeze_bn(self): 255 | '''Freeze BatchNorm layers.''' 256 | for layer in self.modules(): 257 | if isinstance(layer, nn.BatchNorm2d): 258 | layer.eval() 259 | 260 | def forward(self, inputs): 261 | 262 | if self.training: 263 | img_batch, annotations = inputs 264 | else: 265 | img_batch = inputs 266 | 267 | x = self.conv1(img_batch) 268 | x = self.bn1(x) 269 | x = self.relu(x) 270 | x = self.maxpool(x) 271 | 272 | x1 = self.layer1(x) 273 | x2 = self.layer2(x1) 274 | x3 = self.layer3(x2) 275 | x4 = self.layer4(x3) 276 | 277 | features = self.fpn([x2, x3, x4]) 278 | 279 | regression = torch.cat([self.regressionModel(feature) for feature in features], dim=1) 280 | 281 | classification = torch.cat([self.classificationModel(feature) for feature in features], dim=1) 282 | 283 | anchors = self.anchors(img_batch) 284 | 285 | if self.training: 286 | return self.focalLoss(classification, regression, anchors, annotations) 287 | else: 288 | transformed_anchors = self.regressBoxes(anchors, regression) 289 | transformed_anchors = self.clipBoxes(transformed_anchors, img_batch) 290 | 291 | scores = torch.max(classification, dim=2, keepdim=True)[0] 292 | 293 | scores_over_thresh = (scores > 0.05)[0, :, 0] 294 | 295 | if scores_over_thresh.sum() == 0: 296 | # no boxes to NMS, just return 297 | return [torch.zeros(0), torch.zeros(0), torch.zeros(0, 4)] 298 | 299 | classification = classification[:, scores_over_thresh, :] 300 | transformed_anchors = transformed_anchors[:, scores_over_thresh, :] 301 | scores = scores[:, scores_over_thresh, :] 302 | 303 | anchors_nms_idx = nms(torch.cat([transformed_anchors, scores], dim=2)[0, :, :], 0.5) 304 | 305 | nms_scores, nms_class = classification[0, anchors_nms_idx, :].max(dim=1) 306 | 307 | return [nms_scores, nms_class, transformed_anchors[0, anchors_nms_idx, :]] 308 | 309 | 310 | def AttentionRetinaNet(num_classes, input_size): 311 | model = ResNet(num_classes, BottleneckBlock, [3, 4, 6, 3], attention=True, input_size=input_size) 312 | return model 313 | 314 | def resnet18(num_classes, pretrained=False, **kwargs): 315 | """Constructs a ResNet-18 model. 
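The classification bias initialisation above implements the focal-loss trick of starting every anchor at a foreground probability of `prior`. A quick numerical check (pure math, no model required):

```
import math

prior = 0.01
bias = -math.log((1.0 - prior) / prior)            # value written into the output bias
print(round(bias, 3))                              # -4.595
# With zero weights, the head's sigmoid output equals the prior:
print(round(1.0 / (1.0 + math.exp(-bias)), 3))     # 0.01
```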
316 | Args: 317 | pretrained (bool): If True, returns a model pre-trained on ImageNet 318 | """ 319 | model = ResNet(num_classes, BasicBlock, [2, 2, 2, 2], **kwargs) 320 | if pretrained: 321 | model.load_state_dict(model_zoo.load_url(model_urls['resnet18'], model_dir='.'), strict=False) 322 | return model 323 | 324 | 325 | def resnet34(num_classes, pretrained=False, **kwargs): 326 | """Constructs a ResNet-34 model. 327 | Args: 328 | pretrained (bool): If True, returns a model pre-trained on ImageNet 329 | """ 330 | model = ResNet(num_classes, BasicBlock, [3, 4, 6, 3], **kwargs) 331 | if pretrained: 332 | model.load_state_dict(model_zoo.load_url(model_urls['resnet34'], model_dir='.'), strict=False) 333 | return model 334 | 335 | 336 | def resnet50(num_classes, pretrained=False, **kwargs): 337 | """Constructs a ResNet-50 model. 338 | Args: 339 | pretrained (bool): If True, returns a model pre-trained on ImageNet 340 | """ 341 | model = ResNet(num_classes, Bottleneck, [3, 4, 6, 3], **kwargs) 342 | if pretrained: 343 | model.load_state_dict(model_zoo.load_url(model_urls['resnet50'], model_dir='.'), strict=False) 344 | return model 345 | 346 | def resnet50_attn(num_classes, pretrained=False): 347 | model = ResNet(num_classes, BottleneckBlock, [3, 4, 6, 3], attention=True, input_size=(3, )) 348 | 349 | def resnet101(num_classes, pretrained=False, **kwargs): 350 | """Constructs a ResNet-101 model. 351 | Args: 352 | pretrained (bool): If True, returns a model pre-trained on ImageNet 353 | """ 354 | model = ResNet(num_classes, Bottleneck, [3, 4, 23, 3], **kwargs) 355 | if pretrained: 356 | model.load_state_dict(model_zoo.load_url(model_urls['resnet101'], model_dir='.'), strict=False) 357 | return model 358 | 359 | 360 | def resnet152(num_classes, pretrained=False, **kwargs): 361 | """Constructs a ResNet-152 model. 
362 | Args: 363 | pretrained (bool): If True, returns a model pre-trained on ImageNet 364 | """ 365 | model = ResNet(num_classes, BottleneckBlock, [3, 8, 36, 3], attention=True) 366 | if pretrained: 367 | model.load_state_dict(model_zoo.load_url(model_urls['resnet152'], model_dir='.'), strict=False) 368 | return model 369 | -------------------------------------------------------------------------------- /model/wideresnet.py: -------------------------------------------------------------------------------- 1 | import math 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | from .attentionConv2d import AttentionConv2d 6 | 7 | 8 | class BasicAttentionBlock(nn.Module): 9 | def __init__(self, in_planes, out_planes, stride, height, width, dk, dv, dropRate=0.0): 10 | super(BasicAttentionBlock, self).__init__() 11 | self.bn1 = nn.BatchNorm2d(in_planes) 12 | self.relu1 = nn.ReLU(inplace=True) 13 | 14 | #self.conv1 = nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False) 15 | self.conv1 = AttentionConv2d(in_planes, out_planes, height, width, dk, dv, num_heads=8, kernel_size=3, padding=1) 16 | self.bn2 = nn.BatchNorm2d(out_planes) 17 | self.relu2 = nn.ReLU(inplace=True) 18 | self.conv2 = nn.Conv2d(out_planes, out_planes, kernel_size=3, stride=1, 19 | padding=1, bias=False) 20 | self.droprate = dropRate 21 | self.equalInOut = (in_planes == out_planes) 22 | self.convShortcut = (not self.equalInOut) and nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=1, 23 | padding=0, bias=False) or None 24 | 25 | def forward(self, x): 26 | if not self.equalInOut: 27 | x = self.relu1(self.bn1(x)) 28 | else: 29 | out = self.relu1(self.bn1(x)) 30 | out = self.relu2(self.bn2(self.conv1(out if self.equalInOut else x))) 31 | if self.droprate > 0: 32 | out = F.dropout(out, p=self.droprate, training=self.training) 33 | out = self.conv2(out) 34 | return torch.add(x if self.equalInOut else self.convShortcut(x), out) 35 | 36 | class NetworkBlock(nn.Module): 37 | def __init__(self, nb_layers, in_planes, out_planes, block, stride, height, width, dropRate=0.0): 38 | super(NetworkBlock, self).__init__() 39 | self.layer = self._make_layer(block, in_planes, out_planes, nb_layers, stride, height, width, dropRate) 40 | 41 | def _make_layer(self, block, in_planes, out_planes, nb_layers, stride, height, width, dropRate): 42 | layers = [] 43 | dk = int(0.1 * out_planes) 44 | dv = int(0.2 * out_planes) 45 | 46 | for i in range(int(nb_layers)): 47 | layers.append(block(i == 0 and in_planes or out_planes, out_planes, i == 0 and stride or 1, height, width, dk, dv, dropRate)) 48 | return nn.Sequential(*layers) 49 | 50 | def forward(self, x): 51 | return self.layer(x) 52 | 53 | class AttentionWideResNet(nn.Module): 54 | def __init__(self, depth, num_classes, widen_factor=1, input_dim=(32, 32), dropRate=0.0): 55 | super(AttentionWideResNet, self).__init__() 56 | nChannels = [16, 16*widen_factor, 32*widen_factor, 64*widen_factor] 57 | 58 | height, width = input_dim 59 | assert((depth - 4) % 6 == 0) 60 | n = (depth - 4) / 6 61 | block = BasicAttentionBlock 62 | # 1st conv before any network block 63 | self.conv1 = nn.Conv2d(3, nChannels[0], kernel_size=3, stride=1, 64 | padding=1, bias=False) 65 | # 1st block 66 | self.block1 = NetworkBlock(n, nChannels[0], nChannels[1], block, 1, height, width, dropRate) 67 | # 2nd block 68 | self.block2 = NetworkBlock(n, nChannels[1], nChannels[2], block, 2, height, width, dropRate) 69 | # 3rd block 70 | self.block3 = NetworkBlock(n, 
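For the wide ResNet being built here, `depth` must satisfy (depth − 4) % 6 == 0, giving n blocks per group and channel widths [16, 16k, 32k, 64k] for widen factor k; each block's attention key/value depths are then derived from the output width. A small sketch of the arithmetic for a WRN-28-10-style configuration (these particular numbers are illustrative, not taken from the repo's config):

```
depth, widen_factor = 28, 10
assert (depth - 4) % 6 == 0
n = (depth - 4) // 6                                   # blocks per NetworkBlock
channels = [16, 16 * widen_factor, 32 * widen_factor, 64 * widen_factor]
dk, dv = int(0.1 * channels[1]), int(0.2 * channels[1])  # as computed in NetworkBlock._make_layer
print(n, channels, dk, dv)                             # 4 [16, 160, 320, 640] 16 32
```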
nChannels[2], nChannels[3], block, 2, height, width, dropRate) 71 | # global average pooling and classifier 72 | self.bn1 = nn.BatchNorm2d(nChannels[3]) 73 | self.relu = nn.ReLU(inplace=True) 74 | self.fc = nn.Linear(nChannels[3], num_classes) 75 | self.nChannels = nChannels[3] 76 | 77 | for m in self.modules(): 78 | if isinstance(m, nn.Conv2d): 79 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 80 | m.weight.data.normal_(0, math.sqrt(2. / n)) 81 | elif isinstance(m, nn.BatchNorm2d): 82 | m.weight.data.fill_(1) 83 | m.bias.data.zero_() 84 | elif isinstance(m, nn.Linear): 85 | m.bias.data.zero_() 86 | 87 | def forward(self, x): 88 | out = self.conv1(x) 89 | out = self.block1(out) 90 | out = self.block2(out) 91 | out = self.block3(out) 92 | out = self.relu(self.bn1(out)) 93 | 94 | out = F.avg_pool2d(out, 32) 95 | out = out.view(-1, self.nChannels) 96 | #print(out.size()) 97 | return self.fc(out) 98 | -------------------------------------------------------------------------------- /pytorch-retinanet/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *.cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # Jupyter Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # SageMath parsed files 79 | *.sage.py 80 | 81 | # dotenv 82 | .env 83 | 84 | # virtualenv 85 | .venv 86 | venv/ 87 | ENV/ 88 | 89 | # Spyder project settings 90 | .spyderproject 91 | .spyproject 92 | 93 | # Rope project settings 94 | .ropeproject 95 | 96 | # mkdocs documentation 97 | /site 98 | 99 | # mypy 100 | .mypy_cache/ 101 | 102 | *.zip 103 | *.pt 104 | -------------------------------------------------------------------------------- /pytorch-retinanet/LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /pytorch-retinanet/README.md: -------------------------------------------------------------------------------- 1 | # pytorch-retinanet 2 | 3 | ![img3](https://github.com/yhenon/pytorch-retinanet/blob/master/images/3.jpg) 4 | ![img5](https://github.com/yhenon/pytorch-retinanet/blob/master/images/5.jpg) 5 | 6 | Pytorch implementation of RetinaNet object detection as described in [Focal Loss for Dense Object Detection](https://arxiv.org/abs/1708.02002) by Tsung-Yi Lin, Priya Goyal, Ross Girshick, Kaiming He and Piotr Dollár. 7 | 8 | This implementation is primarily designed to be easy to read and simple to modify. 9 | 10 | ## Results 11 | Currently, this repo achieves 33.7% mAP at 600px resolution with a Resnet-50 backbone. The published result is 34.0% mAP. The difference is likely due to the use of Adam optimizer instead of SGD with weight decay. 12 | 13 | ## Installation 14 | 15 | 1) Clone this repo 16 | 17 | 2) Install the required packages: 18 | 19 | ``` 20 | apt-get install tk-dev python-tk 21 | ``` 22 | 23 | 3) Install the python packages: 24 | 25 | ``` 26 | pip install cffi 27 | 28 | pip install pandas 29 | 30 | pip install pycocotools 31 | 32 | pip install cython 33 | 34 | pip install opencv-python 35 | 36 | pip install requests 37 | 38 | ``` 39 | 40 | 4) Build the NMS extension. 41 | 42 | ``` 43 | cd pytorch-retinanet/lib 44 | bash build.sh 45 | cd ../ 46 | ``` 47 | 48 | Note that you may have to edit line 14 of `build.sh` if you want to change which version of python you are building the extension for. 49 | 50 | ## Training 51 | 52 | The network can be trained using the `train.py` script. Currently, two dataloaders are available: COCO and CSV. For training on coco, use 53 | 54 | ``` 55 | python train.py --dataset coco --coco_path ../coco --depth 50 56 | ``` 57 | 58 | For training using a custom dataset, with annotations in CSV format (see below), use 59 | 60 | ``` 61 | python train.py --dataset csv --csv_train --csv_classes --csv_val 62 | ``` 63 | 64 | Note that the --csv_val argument is optional, in which case no validation will be performed. 65 | 66 | ## Pre-trained model 67 | 68 | A pre-trained model is available at: 69 | - https://drive.google.com/open?id=1yLmjq3JtXi841yXWBxst0coAgR26MNBS (this is a pytorch state dict) 70 | - https://drive.google.com/open?id=1hCtM35R_t6T8RJVSd74K4gB-A1MR-TxC (this is a pytorch model serialized via `torch.save()`) 71 | 72 | The state dict model can be loaded using: 73 | 74 | ``` 75 | retinanet = model.resnet50(num_classes=dataset_train.num_classes(),) 76 | retinanet.load_state_dict(torch.load(PATH_TO_WEIGHTS)) 77 | ``` 78 | 79 | The pytorch model can be loaded directly using: 80 | 81 | ``` 82 | retinanet = torch.load(PATH_TO_MODEL) 83 | ``` 84 | 85 | ## Visualization 86 | 87 | To visualize the network detection, use `visualize.py`: 88 | 89 | ``` 90 | python visualize.py --dataset coco --coco_path ../coco --model 91 | ``` 92 | This will visualize bounding boxes on the validation set. 
To visualise with a CSV dataset, use: 93 | 94 | ``` 95 | python visualize.py --dataset csv --csv_classes --csv_val --model 96 | ``` 97 | 98 | ## Model 99 | 100 | The retinanet model uses a resnet backbone. You can set the depth of the resnet model using the --depth argument. Depth must be one of 18, 34, 50, 101 or 152. Note that deeper models are more accurate but are slower and use more memory. 101 | 102 | ## CSV datasets 103 | The `CSVGenerator` provides an easy way to define your own datasets. 104 | It uses two CSV files: one file containing annotations and one file containing a class name to ID mapping. 105 | 106 | ### Annotations format 107 | The CSV file with annotations should contain one annotation per line. 108 | Images with multiple bounding boxes should use one row per bounding box. 109 | Note that indexing for pixel values starts at 0. 110 | The expected format of each line is: 111 | ``` 112 | path/to/image.jpg,x1,y1,x2,y2,class_name 113 | ``` 114 | 115 | Some images may not contain any labeled objects. 116 | To add these images to the dataset as negative examples, 117 | add an annotation where `x1`, `y1`, `x2`, `y2` and `class_name` are all empty: 118 | ``` 119 | path/to/image.jpg,,,,, 120 | ``` 121 | 122 | A full example: 123 | ``` 124 | /data/imgs/img_001.jpg,837,346,981,456,cow 125 | /data/imgs/img_002.jpg,215,312,279,391,cat 126 | /data/imgs/img_002.jpg,22,5,89,84,bird 127 | /data/imgs/img_003.jpg,,,,, 128 | ``` 129 | 130 | This defines a dataset with 3 images. 131 | `img_001.jpg` contains a cow. 132 | `img_002.jpg` contains a cat and a bird. 133 | `img_003.jpg` contains no interesting objects/animals. 134 | 135 | 136 | ### Class mapping format 137 | The class name to ID mapping file should contain one mapping per line. 138 | Each line should use the following format: 139 | ``` 140 | class_name,id 141 | ``` 142 | 143 | Indexing for classes starts at 0. 144 | Do not include a background class as it is implicit. 
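As a concrete sketch, the two files described in this section can be written with Python's `csv` module; the paths and file names below are illustrative. The resulting files are then passed to `train.py` via `--csv_train` and `--csv_classes`.

```
import csv

annotations = [
    ('/data/imgs/img_001.jpg', 837, 346, 981, 456, 'cow'),
    ('/data/imgs/img_003.jpg', '', '', '', '', ''),   # negative example: image with no objects
]
classes = [('cow', 0), ('cat', 1), ('bird', 2)]

with open('annotations.csv', 'w', newline='') as f:
    csv.writer(f).writerows(annotations)

with open('classes.csv', 'w', newline='') as f:
    csv.writer(f).writerows(classes)
```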
145 | 146 | For example: 147 | ``` 148 | cow,0 149 | cat,1 150 | bird,2 151 | ``` 152 | 153 | ## Acknowledgements 154 | 155 | - Significant amounts of code are borrowed from the [keras retinanet implementation](https://github.com/fizyr/keras-retinanet) 156 | - The NMS module used is from the [pytorch faster-rcnn implementation](https://github.com/ruotianluo/pytorch-faster-rcnn) 157 | 158 | ## Examples 159 | 160 | ![img1](https://github.com/yhenon/pytorch-retinanet/blob/master/images/1.jpg) 161 | ![img2](https://github.com/yhenon/pytorch-retinanet/blob/master/images/2.jpg) 162 | ![img4](https://github.com/yhenon/pytorch-retinanet/blob/master/images/4.jpg) 163 | ![img6](https://github.com/yhenon/pytorch-retinanet/blob/master/images/6.jpg) 164 | ![img7](https://github.com/yhenon/pytorch-retinanet/blob/master/images/7.jpg) 165 | ![img8](https://github.com/yhenon/pytorch-retinanet/blob/master/images/8.jpg) 166 | -------------------------------------------------------------------------------- /pytorch-retinanet/anchors.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | 5 | 6 | class Anchors(nn.Module): 7 | def __init__(self, pyramid_levels=None, strides=None, sizes=None, ratios=None, scales=None): 8 | super(Anchors, self).__init__() 9 | 10 | if pyramid_levels is None: 11 | self.pyramid_levels = [3, 4, 5, 6, 7] 12 | if strides is None: 13 | self.strides = [2 ** x for x in self.pyramid_levels] 14 | if sizes is None: 15 | self.sizes = [2 ** (x + 2) for x in self.pyramid_levels] 16 | if ratios is None: 17 | self.ratios = np.array([0.5, 1, 2]) 18 | if scales is None: 19 | self.scales = np.array([2 ** 0, 2 ** (1.0 / 3.0), 2 ** (2.0 / 3.0)]) 20 | 21 | def forward(self, image): 22 | 23 | image_shape = image.shape[2:] 24 | image_shape = np.array(image_shape) 25 | image_shapes = [(image_shape + 2 ** x - 1) // (2 ** x) for x in self.pyramid_levels] 26 | 27 | # compute anchors over all pyramid levels 28 | all_anchors = np.zeros((0, 4)).astype(np.float32) 29 | 30 | for idx, p in enumerate(self.pyramid_levels): 31 | anchors = generate_anchors(base_size=self.sizes[idx], ratios=self.ratios, scales=self.scales) 32 | shifted_anchors = shift(image_shapes[idx], self.strides[idx], anchors) 33 | all_anchors = np.append(all_anchors, shifted_anchors, axis=0) 34 | 35 | all_anchors = np.expand_dims(all_anchors, axis=0) 36 | 37 | return torch.from_numpy(all_anchors.astype(np.float32)).cuda() 38 | 39 | def generate_anchors(base_size=16, ratios=None, scales=None): 40 | """ 41 | Generate anchor (reference) windows by enumerating aspect ratios X 42 | scales w.r.t. a reference window. 
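To make the geometry described in this docstring concrete, the sketch below builds the default 3 ratios × 3 scales = 9 anchors for a base size of 32 and checks that each anchor keeps the area of its scale while the aspect ratio varies. The import assumes the function defined here and that the script is run from `pytorch-retinanet/`:

```
import numpy as np
# Assumed import path; generate_anchors is the function defined in this file.
from anchors import generate_anchors

a = generate_anchors(base_size=32)         # 9 anchors as (x1, y1, x2, y2), centred on 0
w, h = a[:, 2] - a[:, 0], a[:, 3] - a[:, 1]
print(np.round(h / w, 2))                  # aspect ratios: 0.5, 0.5, 0.5, 1, 1, 1, 2, 2, 2
print(np.round(w * h).astype(int))         # areas depend only on the scale: (32 * scale)**2
```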
43 | """ 44 | 45 | if ratios is None: 46 | ratios = np.array([0.5, 1, 2]) 47 | 48 | if scales is None: 49 | scales = np.array([2 ** 0, 2 ** (1.0 / 3.0), 2 ** (2.0 / 3.0)]) 50 | 51 | num_anchors = len(ratios) * len(scales) 52 | 53 | # initialize output anchors 54 | anchors = np.zeros((num_anchors, 4)) 55 | 56 | # scale base_size 57 | anchors[:, 2:] = base_size * np.tile(scales, (2, len(ratios))).T 58 | 59 | # compute areas of anchors 60 | areas = anchors[:, 2] * anchors[:, 3] 61 | 62 | # correct for ratios 63 | anchors[:, 2] = np.sqrt(areas / np.repeat(ratios, len(scales))) 64 | anchors[:, 3] = anchors[:, 2] * np.repeat(ratios, len(scales)) 65 | 66 | # transform from (x_ctr, y_ctr, w, h) -> (x1, y1, x2, y2) 67 | anchors[:, 0::2] -= np.tile(anchors[:, 2] * 0.5, (2, 1)).T 68 | anchors[:, 1::2] -= np.tile(anchors[:, 3] * 0.5, (2, 1)).T 69 | 70 | return anchors 71 | 72 | def compute_shape(image_shape, pyramid_levels): 73 | """Compute shapes based on pyramid levels. 74 | 75 | :param image_shape: 76 | :param pyramid_levels: 77 | :return: 78 | """ 79 | image_shape = np.array(image_shape[:2]) 80 | image_shapes = [(image_shape + 2 ** x - 1) // (2 ** x) for x in pyramid_levels] 81 | return image_shapes 82 | 83 | 84 | def anchors_for_shape( 85 | image_shape, 86 | pyramid_levels=None, 87 | ratios=None, 88 | scales=None, 89 | strides=None, 90 | sizes=None, 91 | shapes_callback=None, 92 | ): 93 | 94 | image_shapes = compute_shape(image_shape, pyramid_levels) 95 | 96 | # compute anchors over all pyramid levels 97 | all_anchors = np.zeros((0, 4)) 98 | for idx, p in enumerate(pyramid_levels): 99 | anchors = generate_anchors(base_size=sizes[idx], ratios=ratios, scales=scales) 100 | shifted_anchors = shift(image_shapes[idx], strides[idx], anchors) 101 | all_anchors = np.append(all_anchors, shifted_anchors, axis=0) 102 | 103 | return all_anchors 104 | 105 | 106 | def shift(shape, stride, anchors): 107 | shift_x = (np.arange(0, shape[1]) + 0.5) * stride 108 | shift_y = (np.arange(0, shape[0]) + 0.5) * stride 109 | 110 | shift_x, shift_y = np.meshgrid(shift_x, shift_y) 111 | 112 | shifts = np.vstack(( 113 | shift_x.ravel(), shift_y.ravel(), 114 | shift_x.ravel(), shift_y.ravel() 115 | )).transpose() 116 | 117 | # add A anchors (1, A, 4) to 118 | # cell K shifts (K, 1, 4) to get 119 | # shift anchors (K, A, 4) 120 | # reshape to (K*A, 4) shifted anchors 121 | A = anchors.shape[0] 122 | K = shifts.shape[0] 123 | all_anchors = (anchors.reshape((1, A, 4)) + shifts.reshape((1, K, 4)).transpose((1, 0, 2))) 124 | all_anchors = all_anchors.reshape((K * A, 4)) 125 | 126 | return all_anchors 127 | 128 | -------------------------------------------------------------------------------- /pytorch-retinanet/attentionConv2d.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch import einsum 5 | 6 | 7 | class AttentionConv2d(nn.Module): 8 | def __init__(self, input_dim, output_dim, dk, dv, num_heads, kernel_size, padding, rel_encoding=True, height=None, width=None): 9 | super(AttentionConv2d, self).__init__() 10 | self.input_dim = input_dim 11 | self.output_dim = output_dim 12 | self.dk = dk 13 | self.dv = dv 14 | self.num_heads = num_heads 15 | self.kernel_size = kernel_size 16 | self.dkh = self.dk // self.num_heads 17 | if rel_encoding and not height: 18 | raise("Cannot use relative encoding without specifying input's height and width") 19 | self.H = height 20 | self.W = width 21 | 22 | self.conv_qkv = 
nn.Conv2d(input_dim, 2*dk + dv, 1) 23 | self.conv_attn = nn.Conv2d(dv, dv, 1) 24 | self.conv_out = nn.Conv2d(input_dim, output_dim - dv, kernel_size, padding=padding) 25 | self.softmax = nn.Softmax(dim=-1) 26 | if width is not None: 27 | self.key_rel_w = nn.Parameter(self.dkh**-0.5 + torch.rand(2*width-1, self.dkh), requires_grad=True) 28 | if height is not None: 29 | self.key_rel_h = nn.Parameter(self.dkh**-0.5 + torch.rand(2*height-1, self.dkh), requires_grad=True) 30 | self.relative_encoding = rel_encoding 31 | 32 | 33 | def forward(self, input): 34 | conv_out = self.conv_out(input) 35 | 36 | qkv = self.conv_qkv(input) # batch_size, 2*dk+dv, H, W 37 | 38 | q, k, v = torch.split(qkv, [self.dk, self.dk, self.dv], dim=1) 39 | 40 | batch_size, _, H, W = q.size() 41 | 42 | q = q.view([batch_size, self.num_heads, self.dk // self.num_heads, H*W]) 43 | k = k.view([batch_size, self.num_heads, self.dk // self.num_heads, H*W]) 44 | v = v.view([batch_size, self.num_heads, self.dv // self.num_heads, H*W]) 45 | 46 | q *= self.dkh ** -0.5 47 | logits = einsum('ijkl, ijkm -> ijlm', q, k) 48 | if self.relative_encoding: 49 | h_rel_logits, w_rel_logits = self._relative_logits(q) 50 | logits += h_rel_logits 51 | logits += w_rel_logits 52 | 53 | weights = self.softmax(logits) 54 | attn_out = einsum('ijkl, ijfl -> ijfk', weights, v) 55 | attn_out = attn_out.contiguous().view(batch_size, self.dv, H, W) 56 | attn_out = self.conv_attn(attn_out) 57 | output = torch.cat([conv_out, attn_out], dim=1) 58 | return output 59 | 60 | def _relative_logits(self, q): 61 | b, nh, dkh, _ = q.size() 62 | q = q.view(b, nh, dkh, self.H, self.W) 63 | 64 | rel_logits_w = self._relative_logits1d(q, self.key_rel_w, self.H, self.W, nh, [0, 1, 2, 4, 3, 5]) 65 | rel_logits_h = self._relative_logits1d(q.permute(0, 1, 2, 4, 3), self.key_rel_h, self.W, self.H, nh, [0, 1, 4, 2, 5, 3]) 66 | return rel_logits_h, rel_logits_w 67 | 68 | def _relative_logits1d(self, q, rel_k, H, W, Nh, transpose_mask): 69 | rel_logits = einsum('bhdxy, md -> bhxym', q, rel_k) 70 | 71 | rel_logits = rel_logits.view([-1, Nh*H, W, 2*W-1]) 72 | rel_logits = self._rel_to_abs(rel_logits) 73 | rel_logits = rel_logits.view([-1, Nh, H, W, W]).unsqueeze(dim=3).repeat([1,1,1,H,1,1]) 74 | rel_logits = rel_logits.permute(*transpose_mask) 75 | rel_logits = rel_logits.contiguous().view(-1, Nh, H*W, H*W) 76 | return rel_logits 77 | 78 | def _rel_to_abs(self, x): 79 | b, nh, l, _ = x.size() 80 | 81 | 82 | x = F.pad(x, (0,1), 'constant', 0) 83 | flat_x = x.view([b, nh, l*(2*l)]); 84 | flat_x_padded = F.pad(flat_x, (0, l-1), 'constant', 0) 85 | 86 | final_x = flat_x_padded.view([b, nh, l+1, 2*l-1]) 87 | final_x = final_x[:, :, :l, l-1:] 88 | 89 | return final_x 90 | -------------------------------------------------------------------------------- /pytorch-retinanet/coco_eval.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | from pycocotools.coco import COCO 4 | from pycocotools.cocoeval import COCOeval 5 | 6 | import numpy as np 7 | import json 8 | import os 9 | 10 | import torch 11 | 12 | def evaluate_coco(dataset, model, threshold=0.05): 13 | 14 | model.eval() 15 | 16 | with torch.no_grad(): 17 | 18 | # start collecting results 19 | results = [] 20 | image_ids = [] 21 | 22 | for index in range(len(dataset)): 23 | data = dataset[index] 24 | scale = data['scale'] 25 | 26 | # run network 27 | scores, labels, boxes = model(data['img'].permute(2, 0, 1).cuda().float().unsqueeze(dim=0)) 28 | scores = 
scores.cpu() 29 | labels = labels.cpu() 30 | boxes = boxes.cpu() 31 | 32 | # correct boxes for image scale 33 | boxes /= scale 34 | 35 | if boxes.shape[0] > 0: 36 | # change to (x, y, w, h) (MS COCO standard) 37 | boxes[:, 2] -= boxes[:, 0] 38 | boxes[:, 3] -= boxes[:, 1] 39 | 40 | # compute predicted labels and scores 41 | #for box, score, label in zip(boxes[0], scores[0], labels[0]): 42 | for box_id in range(boxes.shape[0]): 43 | score = float(scores[box_id]) 44 | label = int(labels[box_id]) 45 | box = boxes[box_id, :] 46 | 47 | # scores are sorted, so we can break 48 | if score < threshold: 49 | break 50 | 51 | # append detection for each positively labeled class 52 | image_result = { 53 | 'image_id' : dataset.image_ids[index], 54 | 'category_id' : dataset.label_to_coco_label(label), 55 | 'score' : float(score), 56 | 'bbox' : box.tolist(), 57 | } 58 | 59 | # append detection to results 60 | results.append(image_result) 61 | 62 | # append image to list of processed images 63 | image_ids.append(dataset.image_ids[index]) 64 | 65 | # print progress 66 | print('{}/{}'.format(index, len(dataset)), end='\r') 67 | 68 | if not len(results): 69 | return 70 | 71 | # write output 72 | json.dump(results, open('{}_bbox_results.json'.format(dataset.set_name), 'w'), indent=4) 73 | 74 | # load results in COCO evaluation tool 75 | coco_true = dataset.coco 76 | coco_pred = coco_true.loadRes('{}_bbox_results.json'.format(dataset.set_name)) 77 | 78 | # run COCO evaluation 79 | coco_eval = COCOeval(coco_true, coco_pred, 'bbox') 80 | coco_eval.params.imgIds = image_ids 81 | coco_eval.evaluate() 82 | coco_eval.accumulate() 83 | coco_eval.summarize() 84 | 85 | model.train() 86 | 87 | return 88 | -------------------------------------------------------------------------------- /pytorch-retinanet/csv_eval.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | import json 5 | import os 6 | 7 | import torch 8 | 9 | 10 | 11 | def compute_overlap(a, b): 12 | """ 13 | Parameters 14 | ---------- 15 | a: (N, 4) ndarray of float 16 | b: (K, 4) ndarray of float 17 | Returns 18 | ------- 19 | overlaps: (N, K) ndarray of overlap between boxes and query_boxes 20 | """ 21 | area = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1]) 22 | 23 | iw = np.minimum(np.expand_dims(a[:, 2], axis=1), b[:, 2]) - np.maximum(np.expand_dims(a[:, 0], 1), b[:, 0]) 24 | ih = np.minimum(np.expand_dims(a[:, 3], axis=1), b[:, 3]) - np.maximum(np.expand_dims(a[:, 1], 1), b[:, 1]) 25 | 26 | iw = np.maximum(iw, 0) 27 | ih = np.maximum(ih, 0) 28 | 29 | ua = np.expand_dims((a[:, 2] - a[:, 0]) * (a[:, 3] - a[:, 1]), axis=1) + area - iw * ih 30 | 31 | ua = np.maximum(ua, np.finfo(float).eps) 32 | 33 | intersection = iw * ih 34 | 35 | return intersection / ua 36 | 37 | 38 | def _compute_ap(recall, precision): 39 | """ Compute the average precision, given the recall and precision curves. 40 | Code originally from https://github.com/rbgirshick/py-faster-rcnn. 41 | # Arguments 42 | recall: The recall curve (list). 43 | precision: The precision curve (list). 44 | # Returns 45 | The average precision as computed in py-faster-rcnn. 
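`compute_overlap` below returns the pairwise IoU matrix between N detection boxes and K ground-truth boxes, all in (x1, y1, x2, y2) form. A tiny worked case, assuming the module is imported from `pytorch-retinanet/`:

```
import numpy as np
# Assumed import path (run from pytorch-retinanet/).
from csv_eval import compute_overlap

a = np.array([[0., 0., 10., 10.]])   # one detection
b = np.array([[5., 0., 15., 10.]])   # one ground-truth box, shifted right by half its width
print(compute_overlap(a, b))         # [[0.3333...]]: intersection 50 / union 150
```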
46 | """ 47 | # correct AP calculation 48 | # first append sentinel values at the end 49 | mrec = np.concatenate(([0.], recall, [1.])) 50 | mpre = np.concatenate(([0.], precision, [0.])) 51 | 52 | # compute the precision envelope 53 | for i in range(mpre.size - 1, 0, -1): 54 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 55 | 56 | # to calculate area under PR curve, look for points 57 | # where X axis (recall) changes value 58 | i = np.where(mrec[1:] != mrec[:-1])[0] 59 | 60 | # and sum (\Delta recall) * prec 61 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 62 | return ap 63 | 64 | 65 | def _get_detections(dataset, retinanet, score_threshold=0.05, max_detections=100, save_path=None): 66 | """ Get the detections from the retinanet using the generator. 67 | The result is a list of lists such that the size is: 68 | all_detections[num_images][num_classes] = detections[num_detections, 4 + num_classes] 69 | # Arguments 70 | dataset : The generator used to run images through the retinanet. 71 | retinanet : The retinanet to run on the images. 72 | score_threshold : The score confidence threshold to use. 73 | max_detections : The maximum number of detections to use per image. 74 | save_path : The path to save the images with visualized detections to. 75 | # Returns 76 | A list of lists containing the detections for each image in the generator. 77 | """ 78 | all_detections = [[None for i in range(dataset.num_classes())] for j in range(len(dataset))] 79 | 80 | retinanet.eval() 81 | 82 | with torch.no_grad(): 83 | 84 | for index in range(len(dataset)): 85 | data = dataset[index] 86 | scale = data['scale'] 87 | 88 | # run network 89 | scores, labels, boxes = retinanet(data['img'].permute(2, 0, 1).cuda().float().unsqueeze(dim=0)) 90 | scores = scores.cpu().numpy() 91 | labels = labels.cpu().numpy() 92 | boxes = boxes.cpu().numpy() 93 | 94 | # correct boxes for image scale 95 | boxes /= scale 96 | 97 | # select indices which have a score above the threshold 98 | indices = np.where(scores > score_threshold)[0] 99 | if indices.shape[0] > 0: 100 | # select those scores 101 | scores = scores[indices] 102 | 103 | # find the order with which to sort the scores 104 | scores_sort = np.argsort(-scores)[:max_detections] 105 | 106 | # select detections 107 | image_boxes = boxes[indices[scores_sort], :] 108 | image_scores = scores[scores_sort] 109 | image_labels = labels[indices[scores_sort]] 110 | image_detections = np.concatenate([image_boxes, np.expand_dims(image_scores, axis=1), np.expand_dims(image_labels, axis=1)], axis=1) 111 | 112 | # copy detections to all_detections 113 | for label in range(dataset.num_classes()): 114 | all_detections[index][label] = image_detections[image_detections[:, -1] == label, :-1] 115 | else: 116 | # copy detections to all_detections 117 | for label in range(dataset.num_classes()): 118 | all_detections[index][label] = np.zeros((0, 5)) 119 | 120 | print('{}/{}'.format(index + 1, len(dataset)), end='\r') 121 | 122 | return all_detections 123 | 124 | 125 | def _get_annotations(generator): 126 | """ Get the ground truth annotations from the generator. 127 | The result is a list of lists such that the size is: 128 | all_detections[num_images][num_classes] = annotations[num_detections, 5] 129 | # Arguments 130 | generator : The generator used to retrieve ground truth annotations. 131 | # Returns 132 | A list of lists containing the annotations for each image in the generator. 
133 | """ 134 | all_annotations = [[None for i in range(generator.num_classes())] for j in range(len(generator))] 135 | 136 | for i in range(len(generator)): 137 | # load the annotations 138 | annotations = generator.load_annotations(i) 139 | 140 | # copy detections to all_annotations 141 | for label in range(generator.num_classes()): 142 | all_annotations[i][label] = annotations[annotations[:, 4] == label, :4].copy() 143 | 144 | print('{}/{}'.format(i + 1, len(generator)), end='\r') 145 | 146 | return all_annotations 147 | 148 | 149 | def evaluate( 150 | generator, 151 | retinanet, 152 | iou_threshold=0.5, 153 | score_threshold=0.05, 154 | max_detections=100, 155 | save_path=None 156 | ): 157 | """ Evaluate a given dataset using a given retinanet. 158 | # Arguments 159 | generator : The generator that represents the dataset to evaluate. 160 | retinanet : The retinanet to evaluate. 161 | iou_threshold : The threshold used to consider when a detection is positive or negative. 162 | score_threshold : The score confidence threshold to use for detections. 163 | max_detections : The maximum number of detections to use per image. 164 | save_path : The path to save images with visualized detections to. 165 | # Returns 166 | A dict mapping class names to mAP scores. 167 | """ 168 | 169 | 170 | 171 | # gather all detections and annotations 172 | 173 | all_detections = _get_detections(generator, retinanet, score_threshold=score_threshold, max_detections=max_detections, save_path=save_path) 174 | all_annotations = _get_annotations(generator) 175 | 176 | average_precisions = {} 177 | 178 | for label in range(generator.num_classes()): 179 | false_positives = np.zeros((0,)) 180 | true_positives = np.zeros((0,)) 181 | scores = np.zeros((0,)) 182 | num_annotations = 0.0 183 | 184 | for i in range(len(generator)): 185 | detections = all_detections[i][label] 186 | annotations = all_annotations[i][label] 187 | num_annotations += annotations.shape[0] 188 | detected_annotations = [] 189 | 190 | for d in detections: 191 | scores = np.append(scores, d[4]) 192 | 193 | if annotations.shape[0] == 0: 194 | false_positives = np.append(false_positives, 1) 195 | true_positives = np.append(true_positives, 0) 196 | continue 197 | 198 | overlaps = compute_overlap(np.expand_dims(d, axis=0), annotations) 199 | assigned_annotation = np.argmax(overlaps, axis=1) 200 | max_overlap = overlaps[0, assigned_annotation] 201 | 202 | if max_overlap >= iou_threshold and assigned_annotation not in detected_annotations: 203 | false_positives = np.append(false_positives, 0) 204 | true_positives = np.append(true_positives, 1) 205 | detected_annotations.append(assigned_annotation) 206 | else: 207 | false_positives = np.append(false_positives, 1) 208 | true_positives = np.append(true_positives, 0) 209 | 210 | # no annotations -> AP for this class is 0 (is this correct?) 
211 | if num_annotations == 0: 212 | average_precisions[label] = 0, 0 213 | continue 214 | 215 | # sort by score 216 | indices = np.argsort(-scores) 217 | false_positives = false_positives[indices] 218 | true_positives = true_positives[indices] 219 | 220 | # compute false positives and true positives 221 | false_positives = np.cumsum(false_positives) 222 | true_positives = np.cumsum(true_positives) 223 | 224 | # compute recall and precision 225 | recall = true_positives / num_annotations 226 | precision = true_positives / np.maximum(true_positives + false_positives, np.finfo(np.float64).eps) 227 | 228 | # compute average precision 229 | average_precision = _compute_ap(recall, precision) 230 | average_precisions[label] = average_precision, num_annotations 231 | 232 | print('\nmAP:') 233 | for label in range(generator.num_classes()): 234 | label_name = generator.label_to_name(label) 235 | print('{}: {}'.format(label_name, average_precisions[label][0])) 236 | 237 | return average_precisions 238 | 239 | -------------------------------------------------------------------------------- /pytorch-retinanet/dataloader.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | import sys 3 | import os 4 | import torch 5 | import numpy as np 6 | import random 7 | import csv 8 | 9 | from torch.utils.data import Dataset, DataLoader 10 | from torchvision import transforms, utils 11 | from torch.utils.data.sampler import Sampler 12 | 13 | from pycocotools.coco import COCO 14 | 15 | import skimage.io 16 | import skimage.transform 17 | import skimage.color 18 | import skimage 19 | 20 | from PIL import Image 21 | 22 | 23 | class CocoDataset(Dataset): 24 | """Coco dataset.""" 25 | 26 | def __init__(self, root_dir, set_name='train2017', transform=None): 27 | """ 28 | Args: 29 | root_dir (string): COCO directory. 30 | transform (callable, optional): Optional transform to be applied 31 | on a sample. 
32 | """ 33 | self.root_dir = root_dir 34 | self.set_name = set_name 35 | self.transform = transform 36 | 37 | self.coco = COCO(os.path.join(self.root_dir, 'annotations', 'instances_' + self.set_name + '.json')) 38 | self.image_ids = self.coco.getImgIds() 39 | 40 | self.load_classes() 41 | 42 | def load_classes(self): 43 | # load class names (name -> label) 44 | categories = self.coco.loadCats(self.coco.getCatIds()) 45 | categories.sort(key=lambda x: x['id']) 46 | 47 | self.classes = {} 48 | self.coco_labels = {} 49 | self.coco_labels_inverse = {} 50 | for c in categories: 51 | self.coco_labels[len(self.classes)] = c['id'] 52 | self.coco_labels_inverse[c['id']] = len(self.classes) 53 | self.classes[c['name']] = len(self.classes) 54 | 55 | # also load the reverse (label -> name) 56 | self.labels = {} 57 | for key, value in self.classes.items(): 58 | self.labels[value] = key 59 | 60 | def __len__(self): 61 | return len(self.image_ids) 62 | 63 | def __getitem__(self, idx): 64 | 65 | img = self.load_image(idx) 66 | annot = self.load_annotations(idx) 67 | sample = {'img': img, 'annot': annot} 68 | if self.transform: 69 | sample = self.transform(sample) 70 | 71 | return sample 72 | 73 | def load_image(self, image_index): 74 | image_info = self.coco.loadImgs(self.image_ids[image_index])[0] 75 | path = os.path.join(self.root_dir, 'images', self.set_name, image_info['file_name']) 76 | img = skimage.io.imread(path) 77 | 78 | if len(img.shape) == 2: 79 | img = skimage.color.gray2rgb(img) 80 | 81 | return img.astype(np.float32)/255.0 82 | 83 | def load_annotations(self, image_index): 84 | # get ground truth annotations 85 | annotations_ids = self.coco.getAnnIds(imgIds=self.image_ids[image_index], iscrowd=False) 86 | annotations = np.zeros((0, 5)) 87 | 88 | # some images appear to miss annotations (like image with id 257034) 89 | if len(annotations_ids) == 0: 90 | return annotations 91 | 92 | # parse annotations 93 | coco_annotations = self.coco.loadAnns(annotations_ids) 94 | for idx, a in enumerate(coco_annotations): 95 | 96 | # some annotations have basically no width / height, skip them 97 | if a['bbox'][2] < 1 or a['bbox'][3] < 1: 98 | continue 99 | 100 | annotation = np.zeros((1, 5)) 101 | annotation[0, :4] = a['bbox'] 102 | annotation[0, 4] = self.coco_label_to_label(a['category_id']) 103 | annotations = np.append(annotations, annotation, axis=0) 104 | 105 | # transform from [x, y, w, h] to [x1, y1, x2, y2] 106 | annotations[:, 2] = annotations[:, 0] + annotations[:, 2] 107 | annotations[:, 3] = annotations[:, 1] + annotations[:, 3] 108 | 109 | return annotations 110 | 111 | def coco_label_to_label(self, coco_label): 112 | return self.coco_labels_inverse[coco_label] 113 | 114 | 115 | def label_to_coco_label(self, label): 116 | return self.coco_labels[label] 117 | 118 | def image_aspect_ratio(self, image_index): 119 | image = self.coco.loadImgs(self.image_ids[image_index])[0] 120 | return float(image['width']) / float(image['height']) 121 | 122 | def num_classes(self): 123 | return 80 124 | 125 | 126 | class CSVDataset(Dataset): 127 | """CSV dataset.""" 128 | 129 | def __init__(self, train_file, class_list, transform=None): 130 | """ 131 | Args: 132 | train_file (string): CSV file with training annotations 133 | annotations (string): CSV file with class list 134 | test_file (string, optional): CSV file with testing annotations 135 | """ 136 | self.train_file = train_file 137 | self.class_list = class_list 138 | self.transform = transform 139 | 140 | # parse the provided class file 141 | try: 142 | 
with self._open_for_csv(self.class_list) as file: 143 | self.classes = self.load_classes(csv.reader(file, delimiter=',')) 144 | except ValueError as e: 145 | raise_from(ValueError('invalid CSV class file: {}: {}'.format(self.class_list, e)), None) 146 | 147 | self.labels = {} 148 | for key, value in self.classes.items(): 149 | self.labels[value] = key 150 | 151 | # csv with img_path, x1, y1, x2, y2, class_name 152 | try: 153 | with self._open_for_csv(self.train_file) as file: 154 | self.image_data = self._read_annotations(csv.reader(file, delimiter=','), self.classes) 155 | except ValueError as e: 156 | raise_from(ValueError('invalid CSV annotations file: {}: {}'.format(self.train_file, e)), None) 157 | self.image_names = list(self.image_data.keys()) 158 | 159 | def _parse(self, value, function, fmt): 160 | """ 161 | Parse a string into a value, and format a nice ValueError if it fails. 162 | Returns `function(value)`. 163 | Any `ValueError` raised is catched and a new `ValueError` is raised 164 | with message `fmt.format(e)`, where `e` is the caught `ValueError`. 165 | """ 166 | try: 167 | return function(value) 168 | except ValueError as e: 169 | raise_from(ValueError(fmt.format(e)), None) 170 | 171 | def _open_for_csv(self, path): 172 | """ 173 | Open a file with flags suitable for csv.reader. 174 | This is different for python2 it means with mode 'rb', 175 | for python3 this means 'r' with "universal newlines". 176 | """ 177 | if sys.version_info[0] < 3: 178 | return open(path, 'rb') 179 | else: 180 | return open(path, 'r', newline='') 181 | 182 | 183 | def load_classes(self, csv_reader): 184 | result = {} 185 | 186 | for line, row in enumerate(csv_reader): 187 | line += 1 188 | 189 | try: 190 | class_name, class_id = row 191 | except ValueError: 192 | raise_from(ValueError('line {}: format should be \'class_name,class_id\''.format(line)), None) 193 | class_id = self._parse(class_id, int, 'line {}: malformed class ID: {{}}'.format(line)) 194 | 195 | if class_name in result: 196 | raise ValueError('line {}: duplicate class name: \'{}\''.format(line, class_name)) 197 | result[class_name] = class_id 198 | return result 199 | 200 | 201 | def __len__(self): 202 | return len(self.image_names) 203 | 204 | def __getitem__(self, idx): 205 | 206 | img = self.load_image(idx) 207 | annot = self.load_annotations(idx) 208 | sample = {'img': img, 'annot': annot} 209 | if self.transform: 210 | sample = self.transform(sample) 211 | 212 | return sample 213 | 214 | def load_image(self, image_index): 215 | img = skimage.io.imread(self.image_names[image_index]) 216 | 217 | if len(img.shape) == 2: 218 | img = skimage.color.gray2rgb(img) 219 | 220 | return img.astype(np.float32)/255.0 221 | 222 | def load_annotations(self, image_index): 223 | # get ground truth annotations 224 | annotation_list = self.image_data[self.image_names[image_index]] 225 | annotations = np.zeros((0, 5)) 226 | 227 | # some images appear to miss annotations (like image with id 257034) 228 | if len(annotation_list) == 0: 229 | return annotations 230 | 231 | # parse annotations 232 | for idx, a in enumerate(annotation_list): 233 | # some annotations have basically no width / height, skip them 234 | x1 = a['x1'] 235 | x2 = a['x2'] 236 | y1 = a['y1'] 237 | y2 = a['y2'] 238 | 239 | if (x2-x1) < 1 or (y2-y1) < 1: 240 | continue 241 | 242 | annotation = np.zeros((1, 5)) 243 | 244 | annotation[0, 0] = x1 245 | annotation[0, 1] = y1 246 | annotation[0, 2] = x2 247 | annotation[0, 3] = y2 248 | 249 | annotation[0, 4] = 
self.name_to_label(a['class']) 250 | annotations = np.append(annotations, annotation, axis=0) 251 | 252 | return annotations 253 | 254 | def _read_annotations(self, csv_reader, classes): 255 | result = {} 256 | for line, row in enumerate(csv_reader): 257 | line += 1 258 | 259 | try: 260 | img_file, x1, y1, x2, y2, class_name = row[:6] 261 | except ValueError: 262 | raise_from(ValueError('line {}: format should be \'img_file,x1,y1,x2,y2,class_name\' or \'img_file,,,,,\''.format(line)), None) 263 | 264 | if img_file not in result: 265 | result[img_file] = [] 266 | 267 | # If a row contains only an image path, it's an image without annotations. 268 | if (x1, y1, x2, y2, class_name) == ('', '', '', '', ''): 269 | continue 270 | 271 | x1 = self._parse(x1, int, 'line {}: malformed x1: {{}}'.format(line)) 272 | y1 = self._parse(y1, int, 'line {}: malformed y1: {{}}'.format(line)) 273 | x2 = self._parse(x2, int, 'line {}: malformed x2: {{}}'.format(line)) 274 | y2 = self._parse(y2, int, 'line {}: malformed y2: {{}}'.format(line)) 275 | 276 | # Check that the bounding box is valid. 277 | if x2 <= x1: 278 | raise ValueError('line {}: x2 ({}) must be higher than x1 ({})'.format(line, x2, x1)) 279 | if y2 <= y1: 280 | raise ValueError('line {}: y2 ({}) must be higher than y1 ({})'.format(line, y2, y1)) 281 | 282 | # check if the current class name is correctly present 283 | if class_name not in classes: 284 | raise ValueError('line {}: unknown class name: \'{}\' (classes: {})'.format(line, class_name, classes)) 285 | 286 | result[img_file].append({'x1': x1, 'x2': x2, 'y1': y1, 'y2': y2, 'class': class_name}) 287 | return result 288 | 289 | def name_to_label(self, name): 290 | return self.classes[name] 291 | 292 | def label_to_name(self, label): 293 | return self.labels[label] 294 | 295 | def num_classes(self): 296 | return max(self.classes.values()) + 1 297 | 298 | def image_aspect_ratio(self, image_index): 299 | image = Image.open(self.image_names[image_index]) 300 | return float(image.width) / float(image.height) 301 | 302 | 303 | def collater(data): 304 | 305 | imgs = [s['img'] for s in data] 306 | annots = [s['annot'] for s in data] 307 | scales = [s['scale'] for s in data] 308 | 309 | widths = [int(s.shape[0]) for s in imgs] 310 | heights = [int(s.shape[1]) for s in imgs] 311 | batch_size = len(imgs) 312 | 313 | max_width = np.array(widths).max() 314 | max_height = np.array(heights).max() 315 | 316 | padded_imgs = torch.zeros(batch_size, max_width, max_height, 3) 317 | 318 | for i in range(batch_size): 319 | img = imgs[i] 320 | padded_imgs[i, :int(img.shape[0]), :int(img.shape[1]), :] = img 321 | 322 | max_num_annots = max(annot.shape[0] for annot in annots) 323 | 324 | if max_num_annots > 0: 325 | 326 | annot_padded = torch.ones((len(annots), max_num_annots, 5)) * -1 327 | 328 | if max_num_annots > 0: 329 | for idx, annot in enumerate(annots): 330 | #print(annot.shape) 331 | if annot.shape[0] > 0: 332 | annot_padded[idx, :annot.shape[0], :] = annot 333 | else: 334 | annot_padded = torch.ones((len(annots), 1, 5)) * -1 335 | 336 | 337 | padded_imgs = padded_imgs.permute(0, 3, 1, 2) 338 | 339 | return {'img': padded_imgs, 'annot': annot_padded, 'scale': scales} 340 | 341 | class Resizer(object): 342 | """Convert ndarrays in sample to Tensors.""" 343 | 344 | def __call__(self, sample, min_side=608, max_side=1024): 345 | image, annots = sample['img'], sample['annot'] 346 | 347 | rows, cols, cns = image.shape 348 | 349 | smallest_side = min(rows, cols) 350 | 351 | # rescale the image so the smallest 
side is min_side 352 | scale = min_side / smallest_side 353 | 354 | # check if the largest side is now greater than max_side, which can happen 355 | # when images have a large aspect ratio 356 | largest_side = max(rows, cols) 357 | 358 | if largest_side * scale > max_side: 359 | scale = max_side / largest_side 360 | 361 | # resize the image with the computed scale 362 | image = skimage.transform.resize(image, (int(round(rows*scale)), int(round((cols*scale))))) 363 | rows, cols, cns = image.shape 364 | 365 | pad_w = 32 - rows%32 366 | pad_h = 32 - cols%32 367 | 368 | new_image = np.zeros((rows + pad_w, cols + pad_h, cns)).astype(np.float32) 369 | new_image[:rows, :cols, :] = image.astype(np.float32) 370 | 371 | annots[:, :4] *= scale 372 | 373 | return {'img': torch.from_numpy(new_image), 'annot': torch.from_numpy(annots), 'scale': scale} 374 | 375 | 376 | class Augmenter(object): 377 | """Convert ndarrays in sample to Tensors.""" 378 | 379 | def __call__(self, sample, flip_x=0.5): 380 | 381 | if np.random.rand() < flip_x: 382 | image, annots = sample['img'], sample['annot'] 383 | image = image[:, ::-1, :] 384 | 385 | rows, cols, channels = image.shape 386 | 387 | x1 = annots[:, 0].copy() 388 | x2 = annots[:, 2].copy() 389 | 390 | x_tmp = x1.copy() 391 | 392 | annots[:, 0] = cols - x2 393 | annots[:, 2] = cols - x_tmp 394 | 395 | sample = {'img': image, 'annot': annots} 396 | 397 | return sample 398 | 399 | 400 | class Normalizer(object): 401 | 402 | def __init__(self): 403 | self.mean = np.array([[[0.485, 0.456, 0.406]]]) 404 | self.std = np.array([[[0.229, 0.224, 0.225]]]) 405 | 406 | def __call__(self, sample): 407 | 408 | image, annots = sample['img'], sample['annot'] 409 | 410 | return {'img':((image.astype(np.float32)-self.mean)/self.std), 'annot': annots} 411 | 412 | class UnNormalizer(object): 413 | def __init__(self, mean=None, std=None): 414 | if mean == None: 415 | self.mean = [0.485, 0.456, 0.406] 416 | else: 417 | self.mean = mean 418 | if std == None: 419 | self.std = [0.229, 0.224, 0.225] 420 | else: 421 | self.std = std 422 | 423 | def __call__(self, tensor): 424 | """ 425 | Args: 426 | tensor (Tensor): Tensor image of size (C, H, W) to be normalized. 427 | Returns: 428 | Tensor: Normalized image. 
429 | """ 430 | for t, m, s in zip(tensor, self.mean, self.std): 431 | t.mul_(s).add_(m) 432 | return tensor 433 | 434 | 435 | class AspectRatioBasedSampler(Sampler): 436 | 437 | def __init__(self, data_source, batch_size, drop_last): 438 | self.data_source = data_source 439 | self.batch_size = batch_size 440 | self.drop_last = drop_last 441 | self.groups = self.group_images() 442 | 443 | def __iter__(self): 444 | random.shuffle(self.groups) 445 | for group in self.groups: 446 | yield group 447 | 448 | def __len__(self): 449 | if self.drop_last: 450 | return len(self.data_source) // self.batch_size 451 | else: 452 | return (len(self.data_source) + self.batch_size - 1) // self.batch_size 453 | 454 | def group_images(self): 455 | # determine the order of the images 456 | order = list(range(len(self.data_source))) 457 | order.sort(key=lambda x: self.data_source.image_aspect_ratio(x)) 458 | 459 | # divide into groups, one group = one batch 460 | return [[order[x % len(order)] for x in range(i, i + self.batch_size)] for i in range(0, len(order), self.batch_size)] 461 | -------------------------------------------------------------------------------- /pytorch-retinanet/images/1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sebastiani/pytorch-attention-augmented-convolution/4f0eb899714f22a88a1b6a602ee2dfb20f59a4b6/pytorch-retinanet/images/1.jpg -------------------------------------------------------------------------------- /pytorch-retinanet/images/3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sebastiani/pytorch-attention-augmented-convolution/4f0eb899714f22a88a1b6a602ee2dfb20f59a4b6/pytorch-retinanet/images/3.jpg -------------------------------------------------------------------------------- /pytorch-retinanet/images/4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sebastiani/pytorch-attention-augmented-convolution/4f0eb899714f22a88a1b6a602ee2dfb20f59a4b6/pytorch-retinanet/images/4.jpg -------------------------------------------------------------------------------- /pytorch-retinanet/images/5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sebastiani/pytorch-attention-augmented-convolution/4f0eb899714f22a88a1b6a602ee2dfb20f59a4b6/pytorch-retinanet/images/5.jpg -------------------------------------------------------------------------------- /pytorch-retinanet/images/6.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sebastiani/pytorch-attention-augmented-convolution/4f0eb899714f22a88a1b6a602ee2dfb20f59a4b6/pytorch-retinanet/images/6.jpg -------------------------------------------------------------------------------- /pytorch-retinanet/images/7.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sebastiani/pytorch-attention-augmented-convolution/4f0eb899714f22a88a1b6a602ee2dfb20f59a4b6/pytorch-retinanet/images/7.jpg -------------------------------------------------------------------------------- /pytorch-retinanet/images/8.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sebastiani/pytorch-attention-augmented-convolution/4f0eb899714f22a88a1b6a602ee2dfb20f59a4b6/pytorch-retinanet/images/8.jpg 
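A minimal sketch of how the dataloader components listed above (CSVDataset, the Resizer/Augmenter/Normalizer transforms, collater and AspectRatioBasedSampler) fit together; it mirrors the loader construction in pytorch-retinanet/train.py further below. The CSV file names and the batch size here are placeholders, not files shipped with this repository.

from torch.utils.data import DataLoader
from torchvision import transforms

from dataloader import (CSVDataset, collater, Resizer, Augmenter,
                        Normalizer, AspectRatioBasedSampler)

# annotations.csv rows: img_path,x1,y1,x2,y2,class_name
# classes.csv rows:     class_name,class_id
dataset = CSVDataset(train_file='annotations.csv',
                     class_list='classes.csv',
                     transform=transforms.Compose([Normalizer(), Augmenter(), Resizer()]))

# put images of similar aspect ratio into the same batch; collater then zero-pads
# the images to a common size and pads the annotation tensors with -1
sampler = AspectRatioBasedSampler(dataset, batch_size=2, drop_last=False)
loader = DataLoader(dataset, num_workers=3, collate_fn=collater, batch_sampler=sampler)

batch = next(iter(loader))
imgs, annots, scales = batch['img'], batch['annot'], batch['scale']
# imgs: B x 3 x H x W float tensor; annots: B x max_boxes x 5 (x1, y1, x2, y2, class index)

Sorting the dataset by aspect ratio before batching keeps the padding added by collater small, which is why the sampler is passed as batch_sampler rather than letting DataLoader form batches in arbitrary order.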
-------------------------------------------------------------------------------- /pytorch-retinanet/lib/README.md: -------------------------------------------------------------------------------- 1 | # NMS 2 | the comparison of nms in speed 3 | 4 | method 1: 5 | thresh=0.7, time wastes:0.0287 6 | thresh=0.8, time wastes:0.1057 7 | thresh=0.9, time wastes:0.4204 8 | 9 | method 2: 10 | thresh=0.7, time wastes:0.0272 11 | thresh=0.8, time wastes:0.1038 12 | thresh=0.9, time wastes:0.4184 13 | 14 | method 3: 15 | thresh=0.7, time wastes:0.0019 16 | thresh=0.8, time wastes:0.0028 17 | thresh=0.9, time wastes:0.0036 18 | 19 | method 4: 20 | thresh=0.7, time wastes:0.0120 21 | thresh=0.8, time wastes:0.0063 22 | thresh=0.9, time wastes:0.0071 23 | 24 | Reference: 25 | py-faster-rcnn: https://github.com/rbgirshick/py-faster-rcnn/tree/master/lib/nms 26 | -------------------------------------------------------------------------------- /pytorch-retinanet/lib/nms/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sebastiani/pytorch-attention-augmented-convolution/4f0eb899714f22a88a1b6a602ee2dfb20f59a4b6/pytorch-retinanet/lib/nms/__init__.py -------------------------------------------------------------------------------- /pytorch-retinanet/lib/nms/gpu_nms.hpp: -------------------------------------------------------------------------------- 1 | void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, 2 | int boxes_dim, float nms_overlap_thresh, int device_id); 3 | -------------------------------------------------------------------------------- /pytorch-retinanet/lib/nms/gpu_nms.pyx: -------------------------------------------------------------------------------- 1 | 2 | # -------------------------------------------------------- 3 | # Faster R-CNN 4 | # Copyright (c) 2015 Microsoft 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # Written by Ross Girshick 7 | # -------------------------------------------------------- 8 | 9 | 10 | import numpy as np 11 | cimport numpy as np 12 | 13 | assert sizeof(int) == sizeof(np.int32_t) 14 | 15 | cdef extern from "gpu_nms.hpp": 16 | void _nms(np.int32_t*, int*, np.float32_t*, int, int, float, int) 17 | 18 | def gpu_nms(np.ndarray[np.float32_t, ndim=2] dets, np.float thresh, 19 | np.int32_t device_id=0): 20 | dets = dets.numpy() 21 | cdef int boxes_num = dets.shape[0] 22 | cdef int boxes_dim = dets.shape[1] 23 | cdef int num_out 24 | cdef np.ndarray[np.int32_t, ndim=1] \ 25 | keep = np.zeros(boxes_num, dtype=np.int32) 26 | cdef np.ndarray[np.float32_t, ndim=1] \ 27 | scores = dets[:, 4] 28 | cdef np.ndarray[np.int_t, ndim=1] \ 29 | order = scores.argsort()[::-1] 30 | cdef np.ndarray[np.float32_t, ndim=2] \ 31 | sorted_dets = dets[order, :] 32 | _nms(&keep[0], &num_out, &sorted_dets[0, 0], boxes_num, boxes_dim, thresh, device_id) 33 | keep = keep[:num_out] 34 | return list(order[keep]) 35 | 36 | 37 | -------------------------------------------------------------------------------- /pytorch-retinanet/lib/nms/nms_kernel.cu: -------------------------------------------------------------------------------- 1 | // ------------------------------------------------------------------ 2 | // Faster R-CNN 3 | // Copyright (c) 2015 Microsoft 4 | // Licensed under The MIT License [see fast-rcnn/LICENSE for details] 5 | // Written by Shaoqing Ren 6 | // ------------------------------------------------------------------ 7 | 8 | 9 | #include "gpu_nms.hpp" 10 | #include 11 
| #include 12 | 13 | #define CUDA_CHECK(condition) \ 14 | /* Code block avoids redefinition of cudaError_t error */ \ 15 | do { \ 16 | cudaError_t error = condition; \ 17 | if (error != cudaSuccess) { \ 18 | std::cout << cudaGetErrorString(error) << std::endl; \ 19 | } \ 20 | } while (0) 21 | 22 | #define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0)) 23 | int const threadsPerBlock = sizeof(unsigned long long) * 8; 24 | 25 | __device__ inline float devIoU(float const * const a, float const * const b) { 26 | float left = max(a[0], b[0]), right = min(a[2], b[2]); 27 | float top = max(a[1], b[1]), bottom = min(a[3], b[3]); 28 | float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); 29 | float interS = width * height; 30 | float Sa = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); 31 | float Sb = (b[2] - b[0] + 1) * (b[3] - b[1] + 1); 32 | return interS / (Sa + Sb - interS); 33 | } 34 | 35 | __global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh, 36 | const float *dev_boxes, unsigned long long *dev_mask) { 37 | const int row_start = blockIdx.y; 38 | const int col_start = blockIdx.x; 39 | 40 | // if (row_start > col_start) return; 41 | 42 | const int row_size = 43 | min(n_boxes - row_start * threadsPerBlock, threadsPerBlock); 44 | const int col_size = 45 | min(n_boxes - col_start * threadsPerBlock, threadsPerBlock); 46 | 47 | __shared__ float block_boxes[threadsPerBlock * 5]; 48 | if (threadIdx.x < col_size) { 49 | block_boxes[threadIdx.x * 5 + 0] = 50 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0]; 51 | block_boxes[threadIdx.x * 5 + 1] = 52 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1]; 53 | block_boxes[threadIdx.x * 5 + 2] = 54 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2]; 55 | block_boxes[threadIdx.x * 5 + 3] = 56 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3]; 57 | block_boxes[threadIdx.x * 5 + 4] = 58 | dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4]; 59 | } 60 | __syncthreads(); 61 | 62 | if (threadIdx.x < row_size) { 63 | const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x; 64 | const float *cur_box = dev_boxes + cur_box_idx * 5; 65 | int i = 0; 66 | unsigned long long t = 0; 67 | int start = 0; 68 | if (row_start == col_start) { 69 | start = threadIdx.x + 1; 70 | } 71 | for (i = start; i < col_size; i++) { 72 | if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) { 73 | t |= 1ULL << i; 74 | } 75 | } 76 | const int col_blocks = DIVUP(n_boxes, threadsPerBlock); 77 | dev_mask[cur_box_idx * col_blocks + col_start] = t; 78 | } 79 | } 80 | 81 | void _set_device(int device_id) { 82 | int current_device; 83 | CUDA_CHECK(cudaGetDevice(¤t_device)); 84 | if (current_device == device_id) { 85 | return; 86 | } 87 | // The call to cudaSetDevice must come before any calls to Get, which 88 | // may perform initialization using the GPU. 
89 | CUDA_CHECK(cudaSetDevice(device_id)); 90 | } 91 | 92 | void _nms(int* keep_out, int* num_out, const float* boxes_host, int boxes_num, 93 | int boxes_dim, float nms_overlap_thresh, int device_id) { 94 | _set_device(device_id); 95 | 96 | float* boxes_dev = NULL; 97 | unsigned long long* mask_dev = NULL; 98 | 99 | const int col_blocks = DIVUP(boxes_num, threadsPerBlock); 100 | 101 | CUDA_CHECK(cudaMalloc(&boxes_dev, 102 | boxes_num * boxes_dim * sizeof(float))); 103 | CUDA_CHECK(cudaMemcpy(boxes_dev, 104 | boxes_host, 105 | boxes_num * boxes_dim * sizeof(float), 106 | cudaMemcpyHostToDevice)); 107 | 108 | CUDA_CHECK(cudaMalloc(&mask_dev, 109 | boxes_num * col_blocks * sizeof(unsigned long long))); 110 | 111 | dim3 blocks(DIVUP(boxes_num, threadsPerBlock), 112 | DIVUP(boxes_num, threadsPerBlock)); 113 | dim3 threads(threadsPerBlock); 114 | nms_kernel<<>>(boxes_num, 115 | nms_overlap_thresh, 116 | boxes_dev, 117 | mask_dev); 118 | 119 | std::vector mask_host(boxes_num * col_blocks); 120 | CUDA_CHECK(cudaMemcpy(&mask_host[0], 121 | mask_dev, 122 | sizeof(unsigned long long) * boxes_num * col_blocks, 123 | cudaMemcpyDeviceToHost)); 124 | 125 | std::vector remv(col_blocks); 126 | memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks); 127 | 128 | int num_to_keep = 0; 129 | for (int i = 0; i < boxes_num; i++) { 130 | int nblock = i / threadsPerBlock; 131 | int inblock = i % threadsPerBlock; 132 | 133 | if (!(remv[nblock] & (1ULL << inblock))) { 134 | keep_out[num_to_keep++] = i; 135 | unsigned long long *p = &mask_host[0] + i * col_blocks; 136 | for (int j = nblock; j < col_blocks; j++) { 137 | remv[j] |= p[j]; 138 | } 139 | } 140 | } 141 | *num_out = num_to_keep; 142 | 143 | CUDA_CHECK(cudaFree(boxes_dev)); 144 | CUDA_CHECK(cudaFree(mask_dev)); 145 | } 146 | -------------------------------------------------------------------------------- /pytorch-retinanet/lib/nms/nums_py.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Mon May 7 21:45:37 2018 5 | 6 | @author: lps 7 | """ 8 | import numpy as np 9 | 10 | 11 | boxes=np.array([[100,100,210,210,0.72], 12 | [250,250,420,420,0.8], 13 | [220,220,320,330,0.92], 14 | [100,100,210,210,0.72], 15 | [230,240,325,330,0.81], 16 | [220,230,315,340,0.9]]) 17 | 18 | 19 | def py_cpu_nms(dets, thresh): 20 | # dets:(m,5) thresh:scaler 21 | 22 | x1 = dets[:,0] 23 | y1 = dets[:,1] 24 | x2 = dets[:,2] 25 | y2 = dets[:,3] 26 | 27 | areas = (y2-y1+1) * (x2-x1+1) 28 | scores = dets[:,4] 29 | keep = [] 30 | 31 | index = scores.argsort()[::-1] 32 | 33 | while index.size >0: 34 | 35 | i = index[0] # every time the first is the biggst, and add it directly 36 | keep.append(i) 37 | 38 | x11 = np.maximum(x1[i], x1[index[1:]]) # calculate the points of overlap 39 | y11 = np.maximum(y1[i], y1[index[1:]]) 40 | x22 = np.minimum(x2[i], x2[index[1:]]) 41 | y22 = np.minimum(y2[i], y2[index[1:]]) 42 | 43 | w = np.maximum(0, x22-x11+1) # the weights of overlap 44 | h = np.maximum(0, y22-y11+1) # the height of overlap 45 | 46 | overlaps = w*h 47 | 48 | ious = overlaps / (areas[i]+areas[index[1:]] - overlaps) 49 | 50 | idx = np.where(ious<=thresh)[0] 51 | 52 | index = index[idx+1] # because index start from 1 53 | 54 | return keep 55 | 56 | 57 | import matplotlib.pyplot as plt 58 | def plot_bbox(dets, c='k'): 59 | 60 | x1 = dets[:,0] 61 | y1 = dets[:,1] 62 | x2 = dets[:,2] 63 | y2 = dets[:,3] 64 | 65 | 66 | plt.plot([x1,x2], [y1,y1], c) 67 | plt.plot([x1,x1], [y1,y2], 
c) 68 | plt.plot([x1,x2], [y2,y2], c) 69 | plt.plot([x2,x2], [y1,y2], c) 70 | plt.title("after nms") 71 | 72 | #plot_bbox(boxes,'k') # before nms 73 | # 74 | #keep = py_cpu_nms(boxes, thresh=0.7) 75 | #plot_bbox(boxes[keep], 'r')# after nms 76 | # 77 | 78 | -------------------------------------------------------------------------------- /pytorch-retinanet/lib/nms/nums_py1.pyx: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | boxes=np.array([[100,100,210,210,0.72], 5 | [250,250,420,420,0.8], 6 | [220,220,320,330,0.92], 7 | [100,100,210,210,0.72], 8 | [230,240,325,330,0.81], 9 | [220,230,315,340,0.9]]) 10 | 11 | 12 | def py_cpu_nms(dets, thresh): 13 | # dets:(m,5) thresh:scaler 14 | 15 | x1 = dets[:,0] 16 | y1 = dets[:,1] 17 | x2 = dets[:,2] 18 | y2 = dets[:,3] 19 | 20 | areas = (y2-y1+1) * (x2-x1+1) 21 | scores = dets[:,4] 22 | keep = [] 23 | 24 | index = scores.argsort()[::-1] 25 | 26 | j=0 27 | while index.size >0: 28 | 29 | j = j+1 30 | i = index[0] # every time the first is the biggst, and add it directly 31 | keep.append(i) 32 | 33 | x11 = np.maximum(x1[i], x1[index[1:]]) # calculate the points of overlap 34 | y11 = np.maximum(y1[i], y1[index[1:]]) 35 | x22 = np.minimum(x2[i], x2[index[1:]]) 36 | y22 = np.minimum(y2[i], y2[index[1:]]) 37 | 38 | w = np.maximum(0, x22-x11+1) # the weights of overlap 39 | h = np.maximum(0, y22-y11+1) # the height of overlap 40 | 41 | overlaps = w*h 42 | 43 | ious = overlaps / (areas[i]+areas[index[1:]] - overlaps) 44 | 45 | idx = np.where(ious<=thresh)[0] 46 | 47 | index = index[idx+1] # because index starts with 1 48 | 49 | return keep,j 50 | 51 | import matplotlib.pyplot as plt 52 | def plot_bbox(dets, c='k'): 53 | 54 | x1 = dets[:,0] 55 | y1 = dets[:,1] 56 | x2 = dets[:,2] 57 | y2 = dets[:,3] 58 | 59 | plt.plot([x1,x2], [y1,y1], c) 60 | plt.plot([x1,x1], [y1,y2], c) 61 | plt.plot([x1,x2], [y2,y2], c) 62 | plt.plot([x2,x2], [y1,y2], c) 63 | 64 | #plot_bbox(boxes,'k') # before nms 65 | 66 | #keep = py_cpu_nms(boxes, thresh=0.7) 67 | #plot_bbox(boxes[keep], 'r')# after nms 68 | 69 | 70 | 71 | -------------------------------------------------------------------------------- /pytorch-retinanet/lib/nms/nums_py2.pyx: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | cimport numpy as np 3 | # 4 | #boxes=np.array([[100,100,210,210,0.72], 5 | # [250,250,420,420,0.8], 6 | # [220,220,320,330,0.92], 7 | # [100,100,210,210,0.72], 8 | # [230,240,325,330,0.81], 9 | # [220,230,315,340,0.9]]) 10 | # 11 | 12 | 13 | cdef inline np.float32_t max(np.float32_t a, np.float32_t b): 14 | return a if a >= b else b 15 | 16 | cdef inline np.float32_t min(np.float32_t a, np.float32_t b): 17 | return a if a <= b else b 18 | 19 | def py_cpu_nms(np.ndarray[np.float32_t,ndim=2] dets, np.float thresh): 20 | # dets:(m,5) thresh:scaler 21 | 22 | cdef np.ndarray[np.float32_t, ndim=1] x1 = dets[:,0] 23 | cdef np.ndarray[np.float32_t, ndim=1] y1 = dets[:,1] 24 | cdef np.ndarray[np.float32_t, ndim=1] x2 = dets[:,2] 25 | cdef np.ndarray[np.float32_t, ndim=1] y2 = dets[:,3] 26 | cdef np.ndarray[np.float32_t, ndim=1] scores = dets[:, 4] 27 | 28 | cdef np.ndarray[np.float32_t, ndim=1] areas = (y2-y1+1) * (x2-x1+1) 29 | cdef np.ndarray[np.int_t, ndim=1] index = scores.argsort()[::-1] # can be rewriten 30 | keep = [] 31 | 32 | cdef int ndets = dets.shape[0] 33 | cdef np.ndarray[np.int_t, ndim=1] suppressed = np.zeros(ndets, dtype=np.int) 34 | 35 | cdef int _i, _j 36 | 37 | 
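# statically typed loop indices and box scalars keep the two nested suppression loops below at C speed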
cdef int i, j 38 | 39 | cdef np.float32_t ix1, iy1, ix2, iy2, iarea 40 | cdef np.float32_t w, h 41 | cdef np.float32_t overlap, ious 42 | 43 | j=0 44 | 45 | for _i in range(ndets): 46 | i = index[_i] 47 | 48 | if suppressed[i] == 1: 49 | continue 50 | keep.append(i) 51 | 52 | ix1 = x1[i] 53 | iy1 = y1[i] 54 | ix2 = x2[i] 55 | iy2 = y2[i] 56 | 57 | iarea = areas[i] 58 | 59 | for _j in range(_i+1, ndets): 60 | j = index[_j] 61 | if suppressed[j] == 1: 62 | continue 63 | xx1 = max(ix1, x1[j]) 64 | yy1 = max(iy1, y1[j]) 65 | xx2 = max(ix2, x2[j]) 66 | yy2 = max(iy2, y2[j]) 67 | 68 | w = max(0.0, xx2-xx1+1) 69 | h = max(0.0, yy2-yy1+1) 70 | 71 | overlap = w*h 72 | ious = overlap / (iarea + areas[j] - overlap) 73 | if ious>thresh: 74 | suppressed[j] = 1 75 | 76 | return keep 77 | 78 | import matplotlib.pyplot as plt 79 | def plot_bbox(dets, c='k'): 80 | 81 | x1 = dets[:,0] 82 | y1 = dets[:,1] 83 | x2 = dets[:,2] 84 | y2 = dets[:,3] 85 | 86 | plt.plot([x1,x2], [y1,y1], c) 87 | plt.plot([x1,x1], [y1,y2], c) 88 | plt.plot([x1,x2], [y2,y2], c) 89 | plt.plot([x2,x2], [y1,y2], c) 90 | 91 | 92 | #plot_bbox(boxes,'k') # before nms 93 | # 94 | #keep = py_cpu_nms(boxes, thresh=0.7) 95 | #plot_bbox(boxes[keep], 'r')# after nms 96 | 97 | 98 | -------------------------------------------------------------------------------- /pytorch-retinanet/lib/nms/setup1.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | from Cython.Build import cythonize 3 | 4 | setup( 5 | name = 'nms_module', 6 | ext_modules = cythonize('nums_py1.pyx'), 7 | ) 8 | -------------------------------------------------------------------------------- /pytorch-retinanet/lib/nms/setup2.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | from Cython.Build import cythonize 3 | 4 | setup( 5 | name = 'nms_module', 6 | ext_modules = cythonize('nums_py2.pyx'), 7 | ) 8 | -------------------------------------------------------------------------------- /pytorch-retinanet/lib/setup3.py: -------------------------------------------------------------------------------- 1 | 2 | # -------------------------------------------------------- 3 | # Faster R-CNN 4 | # Copyright (c) 2015 Microsoft 5 | # Licensed under The MIT License [see LICENSE for details] 6 | # Written by Ross Girshick 7 | # -------------------------------------------------------- 8 | 9 | 10 | from distutils.core import setup 11 | from Cython.Build import cythonize 12 | from distutils.extension import Extension 13 | from Cython.Distutils import build_ext 14 | import subprocess 15 | import numpy as np 16 | import os 17 | from os.path import join as pjoin 18 | 19 | 20 | def find_in_path(name, path): 21 | "Find a file in a search path" 22 | # Adapted fom 23 | # http://code.activestate.com/recipes/52224-find-a-file-given-a-search-path/ 24 | for dir in path.split(os.pathsep): 25 | binpath = pjoin(dir, name) 26 | if os.path.exists(binpath): 27 | return os.path.abspath(binpath) 28 | return None 29 | 30 | def locate_cuda(): 31 | """Locate the CUDA environment on the system 32 | Returns a dict with keys 'home', 'nvcc', 'include', and 'lib64' 33 | and values giving the absolute path to each directory. 34 | Starts by looking for the CUDAHOME env variable. If not found, everything 35 | is based on finding 'nvcc' in the PATH. 
36 | """ 37 | 38 | # first check if the CUDAHOME env variable is in use 39 | if 'CUDAHOME' in os.environ: 40 | home = os.environ['CUDAHOME'] 41 | nvcc = pjoin(home, 'bin', 'nvcc') 42 | else: 43 | # otherwise, search the PATH for NVCC 44 | default_path = pjoin(os.sep, 'usr', 'local', 'cuda', 'bin') 45 | nvcc = find_in_path('nvcc', os.environ['PATH'] + os.pathsep + default_path) 46 | if nvcc is None: 47 | raise EnvironmentError('The nvcc binary could not be ' 48 | 'located in your $PATH. Either add it to your path, or set $CUDAHOME') 49 | home = os.path.dirname(os.path.dirname(nvcc)) 50 | 51 | cudaconfig = {'home':home, 'nvcc':nvcc, 52 | 'include': pjoin(home, 'include'), 53 | 'lib64': pjoin(home, 'lib64')} 54 | for k, v in cudaconfig.items(): 55 | if not os.path.exists(v): 56 | raise EnvironmentError('The CUDA %s path could not be located in %s' % (k, v)) 57 | 58 | return cudaconfig 59 | CUDA = locate_cuda() 60 | 61 | try: 62 | numpy_include = np.get_include() 63 | except AttributeError: 64 | numpy_include = np.get_numpy_include() 65 | 66 | 67 | def customize_compiler_for_nvcc(self): 68 | """inject deep into distutils to customize how the dispatch 69 | to gcc/nvcc works. 70 | If you subclass UnixCCompiler, it's not trivial to get your subclass 71 | injected in, and still have the right customizations (i.e. 72 | distutils.sysconfig.customize_compiler) run on it. So instead of going 73 | the OO route, I have this. Note, it's kindof like a wierd functional 74 | subclassing going on.""" 75 | 76 | # tell the compiler it can processes .cu 77 | self.src_extensions.append('.cu') 78 | 79 | # save references to the default compiler_so and _comple methods 80 | default_compiler_so = self.compiler_so 81 | super = self._compile 82 | 83 | # now redefine the _compile method. This gets executed for each 84 | # object but distutils doesn't have the ability to change compilers 85 | # based on source extension: we add it. 
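# extra_postargs arrives here as a dict keyed by compiler ('gcc' / 'nvcc'), matching the
# extra_compile_args dict passed to the Extension below; _compile picks the list that
# matches the source file being compiled.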
86 | def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts): 87 | if os.path.splitext(src)[1] == '.cu': 88 | # use the cuda for .cu files 89 | self.set_executable('compiler_so', CUDA['nvcc']) 90 | # use only a subset of the extra_postargs, which are 1-1 translated 91 | # from the extra_compile_args in the Extension class 92 | postargs = extra_postargs['nvcc'] 93 | else: 94 | postargs = extra_postargs['gcc'] 95 | 96 | super(obj, src, ext, cc_args, postargs, pp_opts) 97 | # reset the default compiler_so, which we might have changed for cuda 98 | self.compiler_so = default_compiler_so 99 | 100 | # inject our redefined _compile method into the class 101 | self._compile = _compile 102 | 103 | 104 | # run the customize_compiler 105 | class custom_build_ext(build_ext): 106 | def build_extensions(self): 107 | customize_compiler_for_nvcc(self.compiler) 108 | build_ext.build_extensions(self) 109 | 110 | ext_modules = [Extension('nms.gpu_nms', 111 | ['nms/nms_kernel.cu', 'nms/gpu_nms.pyx'], 112 | library_dirs=[CUDA['lib64']], 113 | libraries=['cudart'], 114 | language='c++', 115 | runtime_library_dirs=[CUDA['lib64']], 116 | # this syntax is specific to this build system 117 | # we're only going to use certain compiler args with nvcc and not with 118 | # gcc the implementation of this trick is in customize_compiler() below 119 | extra_compile_args={'gcc': ["-Wno-unused-function"], 120 | 'nvcc': ['-arch=sm_35', 121 | '--ptxas-options=-v', 122 | '-c', 123 | '--compiler-options', 124 | "'-fPIC'"]}, 125 | include_dirs = [numpy_include, CUDA['include']] 126 | )] 127 | 128 | setup( 129 | name='fast_rcnn', 130 | ext_modules=ext_modules, 131 | # inject our custom trigger 132 | cmdclass={'build_ext': custom_build_ext}, 133 | ) 134 | 135 | -------------------------------------------------------------------------------- /pytorch-retinanet/lib/test_num.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | #from nms.nums_py2 import py_cpu_nms # for cpu 4 | from nms.gpu_nms import gpu_nms # for gpu 5 | 6 | 7 | np.random.seed( 1 ) # keep fixed 8 | num_rois = 6000 9 | minxy = np.random.randint(50,145,size=(num_rois ,2)) 10 | maxxy = np.random.randint(150,200,size=(num_rois ,2)) 11 | score = 0.8*np.random.random_sample((num_rois ,1))+0.2 12 | 13 | boxes_new = np.concatenate((minxy,maxxy,score), axis=1).astype(np.float32) 14 | 15 | def nms_test_time(boxes_new): 16 | 17 | thresh = [0.7,0.8,0.9] 18 | T = 50 19 | for i in range(len(thresh)): 20 | since = time.time() 21 | for t in range(T): 22 | 23 | # keep = py_cpu_nms(boxes_new, thresh=thresh[i]) # for cpu 24 | keep = gpu_nms(boxes_new, thresh=thresh[i]) # for gpu 25 | print("thresh={:.1f}, time wastes:{:.4f}".format(thresh[i], (time.time()-since)/T)) 26 | 27 | return keep 28 | 29 | 30 | if __name__ =="__main__": 31 | nms_test_time(boxes_new) 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /pytorch-retinanet/losses.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn as nn 4 | 5 | def calc_iou(a, b): 6 | area = (b[:, 2] - b[:, 0]) * (b[:, 3] - b[:, 1]) 7 | 8 | iw = torch.min(torch.unsqueeze(a[:, 2], dim=1), b[:, 2]) - torch.max(torch.unsqueeze(a[:, 0], 1), b[:, 0]) 9 | ih = torch.min(torch.unsqueeze(a[:, 3], dim=1), b[:, 3]) - torch.max(torch.unsqueeze(a[:, 1], 1), b[:, 1]) 10 | 11 | iw = torch.clamp(iw, min=0) 12 | ih = torch.clamp(ih, min=0) 13 | 
14 | ua = torch.unsqueeze((a[:, 2] - a[:, 0]) * (a[:, 3] - a[:, 1]), dim=1) + area - iw * ih 15 | 16 | ua = torch.clamp(ua, min=1e-8) 17 | 18 | intersection = iw * ih 19 | 20 | IoU = intersection / ua 21 | 22 | return IoU 23 | 24 | class FocalLoss(nn.Module): 25 | #def __init__(self): 26 | 27 | def forward(self, classifications, regressions, anchors, annotations): 28 | alpha = 0.25 29 | gamma = 2.0 30 | batch_size = classifications.shape[0] 31 | classification_losses = [] 32 | regression_losses = [] 33 | 34 | anchor = anchors[0, :, :] 35 | 36 | anchor_widths = anchor[:, 2] - anchor[:, 0] 37 | anchor_heights = anchor[:, 3] - anchor[:, 1] 38 | anchor_ctr_x = anchor[:, 0] + 0.5 * anchor_widths 39 | anchor_ctr_y = anchor[:, 1] + 0.5 * anchor_heights 40 | 41 | for j in range(batch_size): 42 | 43 | classification = classifications[j, :, :] 44 | regression = regressions[j, :, :] 45 | 46 | bbox_annotation = annotations[j, :, :] 47 | bbox_annotation = bbox_annotation[bbox_annotation[:, 4] != -1] 48 | 49 | if bbox_annotation.shape[0] == 0: 50 | regression_losses.append(torch.tensor(0).float().cuda()) 51 | classification_losses.append(torch.tensor(0).float().cuda()) 52 | 53 | continue 54 | 55 | classification = torch.clamp(classification, 1e-4, 1.0 - 1e-4) 56 | 57 | IoU = calc_iou(anchors[0, :, :], bbox_annotation[:, :4]) # num_anchors x num_annotations 58 | 59 | IoU_max, IoU_argmax = torch.max(IoU, dim=1) # num_anchors x 1 60 | 61 | #import pdb 62 | #pdb.set_trace() 63 | 64 | # compute the loss for classification 65 | targets = torch.ones(classification.shape) * -1 66 | targets = targets.cuda() 67 | 68 | targets[torch.lt(IoU_max, 0.4), :] = 0 69 | 70 | positive_indices = torch.ge(IoU_max, 0.5) 71 | 72 | num_positive_anchors = positive_indices.sum() 73 | 74 | assigned_annotations = bbox_annotation[IoU_argmax, :] 75 | 76 | targets[positive_indices, :] = 0 77 | targets[positive_indices, assigned_annotations[positive_indices, 4].long()] = 1 78 | 79 | alpha_factor = torch.ones(targets.shape).cuda() * alpha 80 | 81 | alpha_factor = torch.where(torch.eq(targets, 1.), alpha_factor, 1. - alpha_factor) 82 | focal_weight = torch.where(torch.eq(targets, 1.), 1. 
- classification, classification) 83 | focal_weight = alpha_factor * torch.pow(focal_weight, gamma) 84 | 85 | bce = -(targets * torch.log(classification) + (1.0 - targets) * torch.log(1.0 - classification)) 86 | 87 | # cls_loss = focal_weight * torch.pow(bce, gamma) 88 | cls_loss = focal_weight * bce 89 | 90 | cls_loss = torch.where(torch.ne(targets, -1.0), cls_loss, torch.zeros(cls_loss.shape).cuda()) 91 | 92 | classification_losses.append(cls_loss.sum()/torch.clamp(num_positive_anchors.float(), min=1.0)) 93 | 94 | # compute the loss for regression 95 | 96 | if positive_indices.sum() > 0: 97 | assigned_annotations = assigned_annotations[positive_indices, :] 98 | 99 | anchor_widths_pi = anchor_widths[positive_indices] 100 | anchor_heights_pi = anchor_heights[positive_indices] 101 | anchor_ctr_x_pi = anchor_ctr_x[positive_indices] 102 | anchor_ctr_y_pi = anchor_ctr_y[positive_indices] 103 | 104 | gt_widths = assigned_annotations[:, 2] - assigned_annotations[:, 0] 105 | gt_heights = assigned_annotations[:, 3] - assigned_annotations[:, 1] 106 | gt_ctr_x = assigned_annotations[:, 0] + 0.5 * gt_widths 107 | gt_ctr_y = assigned_annotations[:, 1] + 0.5 * gt_heights 108 | 109 | # clip widths to 1 110 | gt_widths = torch.clamp(gt_widths, min=1) 111 | gt_heights = torch.clamp(gt_heights, min=1) 112 | 113 | targets_dx = (gt_ctr_x - anchor_ctr_x_pi) / anchor_widths_pi 114 | targets_dy = (gt_ctr_y - anchor_ctr_y_pi) / anchor_heights_pi 115 | targets_dw = torch.log(gt_widths / anchor_widths_pi) 116 | targets_dh = torch.log(gt_heights / anchor_heights_pi) 117 | 118 | targets = torch.stack((targets_dx, targets_dy, targets_dw, targets_dh)) 119 | targets = targets.t() 120 | 121 | targets = targets/torch.Tensor([[0.1, 0.1, 0.2, 0.2]]).cuda() 122 | 123 | 124 | negative_indices = 1 - positive_indices 125 | 126 | regression_diff = torch.abs(targets - regression[positive_indices, :]) 127 | 128 | regression_loss = torch.where( 129 | torch.le(regression_diff, 1.0 / 9.0), 130 | 0.5 * 9.0 * torch.pow(regression_diff, 2), 131 | regression_diff - 0.5 / 9.0 132 | ) 133 | regression_losses.append(regression_loss.mean()) 134 | else: 135 | regression_losses.append(torch.tensor(0).float().cuda()) 136 | 137 | return torch.stack(classification_losses).mean(dim=0, keepdim=True), torch.stack(regression_losses).mean(dim=0, keepdim=True) 138 | 139 | 140 | -------------------------------------------------------------------------------- /pytorch-retinanet/model.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch 3 | import math 4 | import time 5 | import torch.utils.model_zoo as model_zoo 6 | from utils import BasicBlock, Bottleneck, BBoxTransform, ClipBoxes, AttentionBottleneck 7 | from anchors import Anchors 8 | import losses 9 | from lib.nms.gpu_nms import gpu_nms 10 | 11 | 12 | model_urls = { 13 | 'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth', 14 | 'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth', 15 | 'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth', 16 | 'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth', 17 | 'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth', 18 | } 19 | 20 | class PyramidFeatures(nn.Module): 21 | def __init__(self, C3_size, C4_size, C5_size, feature_size=256): 22 | super(PyramidFeatures, self).__init__() 23 | 24 | # upsample C5 to get P5 from the FPN paper 25 | self.P5_1 = nn.Conv2d(C5_size, feature_size, 
kernel_size=1, stride=1, padding=0) 26 | self.P5_upsampled = nn.Upsample(scale_factor=2, mode='nearest') 27 | self.P5_2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, stride=1, padding=1) 28 | 29 | # add P5 elementwise to C4 30 | self.P4_1 = nn.Conv2d(C4_size, feature_size, kernel_size=1, stride=1, padding=0) 31 | self.P4_upsampled = nn.Upsample(scale_factor=2, mode='nearest') 32 | self.P4_2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, stride=1, padding=1) 33 | 34 | # add P4 elementwise to C3 35 | self.P3_1 = nn.Conv2d(C3_size, feature_size, kernel_size=1, stride=1, padding=0) 36 | self.P3_2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, stride=1, padding=1) 37 | 38 | # "P6 is obtained via a 3x3 stride-2 conv on C5" 39 | self.P6 = nn.Conv2d(C5_size, feature_size, kernel_size=3, stride=2, padding=1) 40 | 41 | # "P7 is computed by applying ReLU followed by a 3x3 stride-2 conv on P6" 42 | self.P7_1 = nn.ReLU() 43 | self.P7_2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, stride=2, padding=1) 44 | 45 | def forward(self, inputs): 46 | 47 | C3, C4, C5 = inputs 48 | 49 | P5_x = self.P5_1(C5) 50 | P5_upsampled_x = self.P5_upsampled(P5_x) 51 | P5_x = self.P5_2(P5_x) 52 | 53 | P4_x = self.P4_1(C4) 54 | P4_x = P5_upsampled_x + P4_x 55 | P4_upsampled_x = self.P4_upsampled(P4_x) 56 | P4_x = self.P4_2(P4_x) 57 | 58 | P3_x = self.P3_1(C3) 59 | P3_x = P3_x + P4_upsampled_x 60 | P3_x = self.P3_2(P3_x) 61 | 62 | P6_x = self.P6(C5) 63 | 64 | P7_x = self.P7_1(P6_x) 65 | P7_x = self.P7_2(P7_x) 66 | 67 | return [P3_x, P4_x, P5_x, P6_x, P7_x] 68 | 69 | 70 | class RegressionModel(nn.Module): 71 | def __init__(self, num_features_in, num_anchors=9, feature_size=256): 72 | super(RegressionModel, self).__init__() 73 | 74 | self.conv1 = nn.Conv2d(num_features_in, feature_size, kernel_size=3, padding=1) 75 | self.act1 = nn.ReLU() 76 | 77 | self.conv2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1) 78 | self.act2 = nn.ReLU() 79 | 80 | self.conv3 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1) 81 | self.act3 = nn.ReLU() 82 | 83 | self.conv4 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1) 84 | self.act4 = nn.ReLU() 85 | 86 | self.output = nn.Conv2d(feature_size, num_anchors*4, kernel_size=3, padding=1) 87 | 88 | def forward(self, x): 89 | 90 | out = self.conv1(x) 91 | out = self.act1(out) 92 | 93 | out = self.conv2(out) 94 | out = self.act2(out) 95 | 96 | out = self.conv3(out) 97 | out = self.act3(out) 98 | 99 | out = self.conv4(out) 100 | out = self.act4(out) 101 | 102 | out = self.output(out) 103 | 104 | # out is B x C x W x H, with C = 4*num_anchors 105 | out = out.permute(0, 2, 3, 1) 106 | 107 | return out.contiguous().view(out.shape[0], -1, 4) 108 | 109 | class ClassificationModel(nn.Module): 110 | def __init__(self, num_features_in, num_anchors=9, num_classes=80, prior=0.01, feature_size=256): 111 | super(ClassificationModel, self).__init__() 112 | 113 | self.num_classes = num_classes 114 | self.num_anchors = num_anchors 115 | 116 | self.conv1 = nn.Conv2d(num_features_in, feature_size, kernel_size=3, padding=1) 117 | self.act1 = nn.ReLU() 118 | 119 | self.conv2 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1) 120 | self.act2 = nn.ReLU() 121 | 122 | self.conv3 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1) 123 | self.act3 = nn.ReLU() 124 | 125 | self.conv4 = nn.Conv2d(feature_size, feature_size, kernel_size=3, padding=1) 126 | self.act4 = nn.ReLU() 127 | 128 | self.output = 
nn.Conv2d(feature_size, num_anchors*num_classes, kernel_size=3, padding=1) 129 | self.output_act = nn.Sigmoid() 130 | 131 | def forward(self, x): 132 | 133 | out = self.conv1(x) 134 | out = self.act1(out) 135 | 136 | out = self.conv2(out) 137 | out = self.act2(out) 138 | 139 | out = self.conv3(out) 140 | out = self.act3(out) 141 | 142 | out = self.conv4(out) 143 | out = self.act4(out) 144 | 145 | out = self.output(out) 146 | out = self.output_act(out) 147 | 148 | # out is B x C x W x H, with C = n_classes + n_anchors 149 | out1 = out.permute(0, 2, 3, 1) 150 | 151 | batch_size, width, height, channels = out1.shape 152 | 153 | out2 = out1.view(batch_size, width, height, self.num_anchors, self.num_classes) 154 | 155 | return out2.contiguous().view(x.shape[0], -1, self.num_classes) 156 | 157 | class ResNet(nn.Module): 158 | 159 | def __init__(self, num_classes, block, layers): 160 | self.inplanes = 64 161 | super(ResNet, self).__init__() 162 | self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False) 163 | self.bn1 = nn.BatchNorm2d(64) 164 | self.relu = nn.ReLU(inplace=True) 165 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 166 | self.layer1 = self._make_layer(block, 64, layers[0]) 167 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2) 168 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2) 169 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2) 170 | 171 | 172 | if block == BasicBlock: 173 | fpn_sizes = [self.layer2[layers[1]-1].conv2.out_channels, self.layer3[layers[2]-1].conv2.out_channels, self.layer4[layers[3]-1].conv2.out_channels] 174 | elif block == Bottleneck or block == AttentionBottleneck: 175 | fpn_sizes = [self.layer2[layers[1]-1].conv3.out_channels, self.layer3[layers[2]-1].conv3.out_channels, self.layer4[layers[3]-1].conv3.out_channels] 176 | 177 | self.fpn = PyramidFeatures(fpn_sizes[0], fpn_sizes[1], fpn_sizes[2]) 178 | 179 | self.regressionModel = RegressionModel(256) 180 | self.classificationModel = ClassificationModel(256, num_classes=num_classes) 181 | 182 | self.anchors = Anchors() 183 | 184 | self.regressBoxes = BBoxTransform() 185 | 186 | self.clipBoxes = ClipBoxes() 187 | 188 | self.focalLoss = losses.FocalLoss() 189 | 190 | for m in self.modules(): 191 | if isinstance(m, nn.Conv2d): 192 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 193 | m.weight.data.normal_(0, math.sqrt(2. 
/ n)) 194 | elif isinstance(m, nn.BatchNorm2d): 195 | m.weight.data.fill_(1) 196 | m.bias.data.zero_() 197 | 198 | prior = 0.01 199 | 200 | self.classificationModel.output.weight.data.fill_(0) 201 | self.classificationModel.output.bias.data.fill_(-math.log((1.0-prior)/prior)) 202 | 203 | self.regressionModel.output.weight.data.fill_(0) 204 | self.regressionModel.output.bias.data.fill_(0) 205 | 206 | self.freeze_bn() 207 | 208 | def _make_layer(self, block, planes, blocks, stride=1): 209 | downsample = None 210 | if stride != 1 or self.inplanes != planes * block.expansion: 211 | downsample = nn.Sequential( 212 | nn.Conv2d(self.inplanes, planes * block.expansion, 213 | kernel_size=1, stride=stride, bias=False), 214 | nn.BatchNorm2d(planes * block.expansion), 215 | ) 216 | 217 | layers = [] 218 | layers.append(block(self.inplanes, planes, stride, downsample)) 219 | self.inplanes = planes * block.expansion 220 | for i in range(1, blocks): 221 | layers.append(block(self.inplanes, planes)) 222 | 223 | return nn.Sequential(*layers) 224 | 225 | def freeze_bn(self): 226 | '''Freeze BatchNorm layers.''' 227 | for layer in self.modules(): 228 | if isinstance(layer, nn.BatchNorm2d): 229 | layer.eval() 230 | 231 | def forward(self, inputs): 232 | 233 | if self.training: 234 | img_batch, annotations = inputs 235 | else: 236 | img_batch = inputs 237 | 238 | x = self.conv1(img_batch) 239 | x = self.bn1(x) 240 | x = self.relu(x) 241 | x = self.maxpool(x) 242 | 243 | x1 = self.layer1(x) 244 | x2 = self.layer2(x1) 245 | x3 = self.layer3(x2) 246 | x4 = self.layer4(x3) 247 | 248 | features = self.fpn([x2, x3, x4]) 249 | 250 | regression = torch.cat([self.regressionModel(feature) for feature in features], dim=1) 251 | 252 | classification = torch.cat([self.classificationModel(feature) for feature in features], dim=1) 253 | 254 | anchors = self.anchors(img_batch) 255 | 256 | if self.training: 257 | #return self.focalLoss(classification, regression, anchors, annotations) 258 | return classification, regression, anchors, annotations 259 | else: 260 | transformed_anchors = self.regressBoxes(anchors, regression) 261 | transformed_anchors = self.clipBoxes(transformed_anchors, img_batch) 262 | 263 | scores = torch.max(classification, dim=2, keepdim=True)[0] 264 | 265 | scores_over_thresh = (scores>0.05)[0, :, 0] 266 | 267 | if scores_over_thresh.sum() == 0: 268 | # no boxes to NMS, just return 269 | return [torch.zeros(0), torch.zeros(0), torch.zeros(0, 4)] 270 | 271 | classification = classification[:, scores_over_thresh, :] 272 | transformed_anchors = transformed_anchors[:, scores_over_thresh, :] 273 | scores = scores[:, scores_over_thresh, :] 274 | dets = torch.cat([transformed_anchors, scores], dim=2)[0, :, :] 275 | anchors_nms_idx = self.nms(dets.cpu().numpy(), 0.5) 276 | 277 | nms_scores, nms_class = classification[0, anchors_nms_idx, :].max(dim=1) 278 | 279 | return [nms_scores, nms_class, transformed_anchors[0, anchors_nms_idx, :]] 280 | 281 | def nms(self, dets, thresh): 282 | """Dispatch to either CPU or GPU NMS implementations. 283 | Accept dets as tensor""" 284 | return gpu_nms(dets, thresh) 285 | 286 | 287 | 288 | def resnet18(num_classes, pretrained=False, **kwargs): 289 | """Constructs a ResNet-18 model. 
290 | Args: 291 | pretrained (bool): If True, returns a model pre-trained on ImageNet 292 | """ 293 | model = ResNet(num_classes, BasicBlock, [2, 2, 2, 2], **kwargs) 294 | if pretrained: 295 | model.load_state_dict(model_zoo.load_url(model_urls['resnet18'], model_dir='.'), strict=False) 296 | return model 297 | 298 | 299 | def resnet34(num_classes, pretrained=False, **kwargs): 300 | """Constructs a ResNet-34 model. 301 | Args: 302 | pretrained (bool): If True, returns a model pre-trained on ImageNet 303 | """ 304 | model = ResNet(num_classes, BasicBlock, [3, 4, 6, 3], **kwargs) 305 | if pretrained: 306 | model.load_state_dict(model_zoo.load_url(model_urls['resnet34'], model_dir='.'), strict=False) 307 | return model 308 | 309 | 310 | def resnet50(num_classes, pretrained=False, **kwargs): 311 | """Constructs a ResNet-50 model. 312 | Args: 313 | pretrained (bool): If True, returns a model pre-trained on ImageNet 314 | """ 315 | model = ResNet(num_classes, Bottleneck, [3, 4, 6, 3], **kwargs) 316 | if pretrained: 317 | model.load_state_dict(model_zoo.load_url(model_urls['resnet50'], model_dir='.'), strict=False) 318 | return model 319 | 320 | 321 | def attention_resnet50(num_classes, pretrained=False, **kwargs): 322 | """ 323 | :param num_classes: 324 | :param pretrained: 325 | :param kwargs: 326 | :return: 327 | """ 328 | 329 | model = ResNet(num_classes, AttentionBottleneck, [3, 4, 6, 3], **kwargs) 330 | return model 331 | 332 | def resnet101(num_classes, pretrained=False, **kwargs): 333 | """Constructs a ResNet-101 model. 334 | Args: 335 | pretrained (bool): If True, returns a model pre-trained on ImageNet 336 | """ 337 | model = ResNet(num_classes, Bottleneck, [3, 4, 23, 3], **kwargs) 338 | if pretrained: 339 | model.load_state_dict(model_zoo.load_url(model_urls['resnet101'], model_dir='.'), strict=False) 340 | return model 341 | 342 | 343 | def resnet152(num_classes, pretrained=False, **kwargs): 344 | """Constructs a ResNet-152 model. 
345 | Args: 346 | pretrained (bool): If True, returns a model pre-trained on ImageNet 347 | """ 348 | model = ResNet(num_classes, Bottleneck, [3, 8, 36, 3], **kwargs) 349 | if pretrained: 350 | model.load_state_dict(model_zoo.load_url(model_urls['resnet152'], model_dir='.'), strict=False) 351 | return model 352 | -------------------------------------------------------------------------------- /pytorch-retinanet/oid_dataset.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, division 2 | 3 | import csv 4 | import json 5 | import os 6 | import warnings 7 | 8 | import numpy as np 9 | import skimage 10 | import skimage.color 11 | import skimage.io 12 | import skimage.transform 13 | from PIL import Image 14 | from torch.utils.data import Dataset 15 | 16 | 17 | def get_labels(metadata_dir, version='v4'): 18 | if version == 'v4' or version == 'challenge2018': 19 | csv_file = 'class-descriptions-boxable.csv' if version == 'v4' else 'challenge-2018-class-descriptions-500.csv' 20 | 21 | boxable_classes_descriptions = os.path.join(metadata_dir, csv_file) 22 | id_to_labels = {} 23 | cls_index = {} 24 | 25 | i = 0 26 | with open(boxable_classes_descriptions) as f: 27 | for row in csv.reader(f): 28 | # make sure the csv row is not empty (usually the last one) 29 | if len(row): 30 | label = row[0] 31 | description = row[1].replace("\"", "").replace("'", "").replace('`', '') 32 | 33 | id_to_labels[i] = description 34 | cls_index[label] = i 35 | 36 | i += 1 37 | else: 38 | trainable_classes_path = os.path.join(metadata_dir, 'classes-bbox-trainable.txt') 39 | description_path = os.path.join(metadata_dir, 'class-descriptions.csv') 40 | 41 | description_table = {} 42 | with open(description_path) as f: 43 | for row in csv.reader(f): 44 | # make sure the csv row is not empty (usually the last one) 45 | if len(row): 46 | description_table[row[0]] = row[1].replace("\"", "").replace("'", "").replace('`', '') 47 | 48 | with open(trainable_classes_path, 'rb') as f: 49 | trainable_classes = f.read().split('\n') 50 | 51 | id_to_labels = dict([(i, description_table[c]) for i, c in enumerate(trainable_classes)]) 52 | cls_index = dict([(c, i) for i, c in enumerate(trainable_classes)]) 53 | 54 | return id_to_labels, cls_index 55 | 56 | 57 | def generate_images_annotations_json(main_dir, metadata_dir, subset, cls_index, version='v4'): 58 | validation_image_ids = {} 59 | 60 | if version == 'v4': 61 | annotations_path = os.path.join(metadata_dir, subset, '{}-annotations-bbox.csv'.format(subset)) 62 | elif version == 'challenge2018': 63 | validation_image_ids_path = os.path.join(metadata_dir, 'challenge-2018-image-ids-valset-od.csv') 64 | 65 | with open(validation_image_ids_path, 'r') as csv_file: 66 | reader = csv.DictReader(csv_file, fieldnames=['ImageID']) 67 | reader.next() 68 | for line, row in enumerate(reader): 69 | image_id = row['ImageID'] 70 | validation_image_ids[image_id] = True 71 | 72 | annotations_path = os.path.join(metadata_dir, 'challenge-2018-train-annotations-bbox.csv') 73 | else: 74 | annotations_path = os.path.join(metadata_dir, subset, 'annotations-human-bbox.csv') 75 | 76 | fieldnames = ['ImageID', 'Source', 'LabelName', 'Confidence', 77 | 'XMin', 'XMax', 'YMin', 'YMax', 78 | 'IsOccluded', 'IsTruncated', 'IsGroupOf', 'IsDepiction', 'IsInside'] 79 | 80 | id_annotations = dict() 81 | with open(annotations_path, 'r') as csv_file: 82 | reader = csv.DictReader(csv_file, fieldnames=fieldnames) 83 | next(reader) 84 | 85 | 
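# cache (width, height) per image so each file is opened at most once while converting the normalized box coordinates to pixels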
images_sizes = {} 86 | for line, row in enumerate(reader): 87 | frame = row['ImageID'] 88 | 89 | if version == 'challenge2018': 90 | if subset == 'train': 91 | if frame in validation_image_ids: 92 | continue 93 | elif subset == 'validation': 94 | if frame not in validation_image_ids: 95 | continue 96 | else: 97 | raise NotImplementedError('This generator handles only the train and validation subsets') 98 | 99 | class_name = row['LabelName'] 100 | 101 | if class_name not in cls_index: 102 | continue 103 | 104 | cls_id = cls_index[class_name] 105 | 106 | if version == 'challenge2018': 107 | # We recommend participants to use the provided subset of the training set as a validation set. 108 | # This is preferable over using the V4 val/test sets, as the training set is more densely annotated. 109 | img_path = os.path.join(main_dir, 'images', 'train', frame + '.jpg') 110 | else: 111 | img_path = os.path.join(main_dir, 'images', subset, frame + '.jpg') 112 | 113 | if frame in images_sizes: 114 | width, height = images_sizes[frame] 115 | else: 116 | try: 117 | with Image.open(img_path) as img: 118 | width, height = img.width, img.height 119 | images_sizes[frame] = (width, height) 120 | except Exception as ex: 121 | if version == 'challenge2018': 122 | raise ex 123 | continue 124 | 125 | x1 = float(row['XMin']) 126 | x2 = float(row['XMax']) 127 | y1 = float(row['YMin']) 128 | y2 = float(row['YMax']) 129 | 130 | x1_int = int(round(x1 * width)) 131 | x2_int = int(round(x2 * width)) 132 | y1_int = int(round(y1 * height)) 133 | y2_int = int(round(y2 * height)) 134 | 135 | # Check that the bounding box is valid. 136 | if x2 <= x1: 137 | raise ValueError('line {}: x2 ({}) must be higher than x1 ({})'.format(line, x2, x1)) 138 | if y2 <= y1: 139 | raise ValueError('line {}: y2 ({}) must be higher than y1 ({})'.format(line, y2, y1)) 140 | 141 | if y2_int == y1_int: 142 | warnings.warn('filtering line {}: rounding y2 ({}) and y1 ({}) makes them equal'.format(line, y2, y1)) 143 | continue 144 | 145 | if x2_int == x1_int: 146 | warnings.warn('filtering line {}: rounding x2 ({}) and x1 ({}) makes them equal'.format(line, x2, x1)) 147 | continue 148 | 149 | img_id = row['ImageID'] 150 | annotation = {'cls_id': cls_id, 'x1': x1, 'x2': x2, 'y1': y1, 'y2': y2} 151 | 152 | if img_id in id_annotations: 153 | annotations = id_annotations[img_id] 154 | annotations['boxes'].append(annotation) 155 | else: 156 | id_annotations[img_id] = {'w': width, 'h': height, 'boxes': [annotation]} 157 | return id_annotations 158 | 159 | 160 | class OidDataset(Dataset): 161 | """Oid dataset.""" 162 | 163 | def __init__(self, main_dir, subset, version='v4', annotation_cache_dir='.', transform=None): 164 | if version == 'v4': 165 | metadata = '2018_04' 166 | elif version == 'challenge2018': 167 | metadata = 'challenge2018' 168 | elif version == 'v3': 169 | metadata = '2017_11' 170 | else: 171 | raise NotImplementedError('There is currently no implementation for versions older than v3') 172 | 173 | self.transform = transform 174 | 175 | if version == 'challenge2018': 176 | self.base_dir = os.path.join(main_dir, 'images', 'train') 177 | else: 178 | self.base_dir = os.path.join(main_dir, 'images', subset) 179 | 180 | metadata_dir = os.path.join(main_dir, metadata) 181 | annotation_cache_json = os.path.join(annotation_cache_dir, subset + '.json') 182 | 183 | self.id_to_labels, cls_index = get_labels(metadata_dir, version=version) 184 | 185 | if os.path.exists(annotation_cache_json): 186 | with open(annotation_cache_json, 'r') as f: 187 | 
self.annotations = json.loads(f.read()) 188 | else: 189 | self.annotations = generate_images_annotations_json(main_dir, metadata_dir, subset, cls_index, 190 | version=version) 191 | json.dump(self.annotations, open(annotation_cache_json, "w")) 192 | 193 | self.id_to_image_id = dict([(i, k) for i, k in enumerate(self.annotations)]) 194 | 195 | # (label -> name) 196 | self.labels = self.id_to_labels 197 | 198 | def __len__(self): 199 | return len(self.annotations) 200 | 201 | def __getitem__(self, idx): 202 | 203 | img = self.load_image(idx) 204 | annot = self.load_annotations(idx) 205 | sample = {'img': img, 'annot': annot} 206 | if self.transform: 207 | sample = self.transform(sample) 208 | 209 | return sample 210 | 211 | def image_path(self, image_index): 212 | path = os.path.join(self.base_dir, self.id_to_image_id[image_index] + '.jpg') 213 | return path 214 | 215 | def load_image(self, image_index): 216 | path = self.image_path(image_index) 217 | img = skimage.io.imread(path) 218 | 219 | if len(img.shape) == 1: 220 | img = img[0] 221 | 222 | if len(img.shape) == 2: 223 | img = skimage.color.gray2rgb(img) 224 | 225 | try: 226 | return img.astype(np.float32) / 255.0 227 | except Exception: 228 | print (path) 229 | exit(0) 230 | 231 | def load_annotations(self, image_index): 232 | # get ground truth annotations 233 | image_annotations = self.annotations[self.id_to_image_id[image_index]] 234 | 235 | labels = image_annotations['boxes'] 236 | height, width = image_annotations['h'], image_annotations['w'] 237 | 238 | boxes = np.zeros((len(labels), 5)) 239 | for idx, ann in enumerate(labels): 240 | cls_id = ann['cls_id'] 241 | x1 = ann['x1'] * width 242 | x2 = ann['x2'] * width 243 | y1 = ann['y1'] * height 244 | y2 = ann['y2'] * height 245 | 246 | boxes[idx, 0] = x1 247 | boxes[idx, 1] = y1 248 | boxes[idx, 2] = x2 249 | boxes[idx, 3] = y2 250 | boxes[idx, 4] = cls_id 251 | 252 | return boxes 253 | 254 | def image_aspect_ratio(self, image_index): 255 | img_annotations = self.annotations[self.id_to_image_id[image_index]] 256 | height, width = img_annotations['h'], img_annotations['w'] 257 | return float(width) / float(height) 258 | 259 | def num_classes(self): 260 | return len(self.id_to_labels) 261 | -------------------------------------------------------------------------------- /pytorch-retinanet/train.py: -------------------------------------------------------------------------------- 1 | import time 2 | import os 3 | import copy 4 | import argparse 5 | import pdb 6 | import collections 7 | import sys 8 | 9 | import numpy as np 10 | 11 | import torch 12 | import torch.nn as nn 13 | import torch.optim as optim 14 | from torch.optim import lr_scheduler 15 | from torch.autograd import Variable 16 | from torchvision import datasets, models, transforms 17 | import torchvision 18 | 19 | import model 20 | from anchors import Anchors 21 | import losses 22 | from dataloader import CocoDataset, CSVDataset, collater, Resizer, AspectRatioBasedSampler, Augmenter, UnNormalizer, Normalizer 23 | from torch.utils.data import Dataset, DataLoader 24 | 25 | import coco_eval 26 | import csv_eval 27 | 28 | #assert torch.__version__.split('.')[1] == '4' 29 | 30 | print('CUDA available: {}'.format(torch.cuda.is_available())) 31 | 32 | 33 | def main(args=None): 34 | 35 | parser = argparse.ArgumentParser(description='Simple training script for training a RetinaNet network.') 36 | 37 | parser.add_argument('--dataset', help='Dataset type, must be one of csv or coco.') 38 | parser.add_argument('--coco_path', help='Path 
to COCO directory') 39 | parser.add_argument('--csv_train', help='Path to file containing training annotations (see readme)') 40 | parser.add_argument('--csv_classes', help='Path to file containing class list (see readme)') 41 | parser.add_argument('--csv_val', help='Path to file containing validation annotations (optional, see readme)') 42 | 43 | parser.add_argument('--depth', help='Resnet depth, must be one of 18, 34, 50, 101, 152', type=int, default=50) 44 | parser.add_argument('--epochs', help='Number of epochs', type=int, default=100) 45 | parser.add_argument('--attention', help='use attention version', action='store_true') 46 | 47 | parser = parser.parse_args(args) 48 | 49 | # Create the data loaders 50 | if parser.dataset == 'coco': 51 | 52 | if parser.coco_path is None: 53 | raise ValueError('Must provide --coco_path when training on COCO,') 54 | 55 | dataset_train = CocoDataset(parser.coco_path, set_name='train2017', transform=transforms.Compose([Normalizer(), Augmenter(), Resizer()])) 56 | dataset_val = CocoDataset(parser.coco_path, set_name='val2017', transform=transforms.Compose([Normalizer(), Resizer()])) 57 | 58 | elif parser.dataset == 'csv': 59 | 60 | if parser.csv_train is None: 61 | raise ValueError('Must provide --csv_train when training on COCO,') 62 | 63 | if parser.csv_classes is None: 64 | raise ValueError('Must provide --csv_classes when training on COCO,') 65 | 66 | 67 | dataset_train = CSVDataset(train_file=parser.csv_train, class_list=parser.csv_classes, transform=transforms.Compose([Normalizer(), Augmenter(), Resizer()])) 68 | 69 | if parser.csv_val is None: 70 | dataset_val = None 71 | print('No validation annotations provided.') 72 | else: 73 | dataset_val = CSVDataset(train_file=parser.csv_val, class_list=parser.csv_classes, transform=transforms.Compose([Normalizer(), Resizer()])) 74 | 75 | else: 76 | raise ValueError('Dataset type not understood (must be csv or coco), exiting.') 77 | 78 | sampler = AspectRatioBasedSampler(dataset_train, batch_size=1, drop_last=False) 79 | dataloader_train = DataLoader(dataset_train, num_workers=3, collate_fn=collater, batch_sampler=sampler) 80 | 81 | if dataset_val is not None: 82 | sampler_val = AspectRatioBasedSampler(dataset_val, batch_size=1, drop_last=False) 83 | dataloader_val = DataLoader(dataset_val, num_workers=3, collate_fn=collater, batch_sampler=sampler_val) 84 | 85 | # Create the model 86 | if parser.depth == 18: 87 | retinanet = model.resnet18(num_classes=dataset_train.num_classes(), pretrained=True) 88 | elif parser.depth == 34: 89 | retinanet = model.resnet34(num_classes=dataset_train.num_classes(), pretrained=True) 90 | elif parser.depth == 50: 91 | if parser.attention: 92 | retinanet = model.attention_resnet50(num_classes=dataset_train.num_classes(), pretrained=True) 93 | else: 94 | retinanet = model.resnet50(num_classes=dataset_train.num_classes(), pretrained=True) 95 | elif parser.depth == 101: 96 | retinanet = model.resnet101(num_classes=dataset_train.num_classes(), pretrained=True) 97 | elif parser.depth == 152: 98 | retinanet = model.resnet152(num_classes=dataset_train.num_classes(), pretrained=True) 99 | else: 100 | raise ValueError('Unsupported model depth, must be one of 18, 34, 50, 101, 152') 101 | 102 | use_gpu = True 103 | 104 | if use_gpu: 105 | retinanet = retinanet.cuda() 106 | 107 | retinanet = torch.nn.DataParallel(retinanet).cuda() 108 | 109 | retinanet.training = True 110 | 111 | optimizer = optim.Adam(retinanet.parameters(), lr=1e-5) 112 | 113 | scheduler = 
optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, verbose=True) 114 | 115 | loss_hist = collections.deque(maxlen=500) 116 | 117 | retinanet.train() 118 | retinanet.module.freeze_bn() 119 | 120 | print('Num training images: {}'.format(len(dataset_train))) 121 | 122 | focalLoss = losses.FocalLoss() 123 | 124 | for epoch_num in range(parser.epochs): 125 | 126 | retinanet.train() 127 | retinanet.module.freeze_bn() 128 | 129 | epoch_loss = [] 130 | 131 | for iter_num, data in enumerate(dataloader_train): 132 | try: 133 | optimizer.zero_grad() 134 | 135 | #classification_loss, regression_loss = retinanet([data['img'].cuda().float(), data['annot']]) 136 | classification, regression, anchors, annotations = retinanet([data['img'].cuda().float(), data['annot']]) 137 | classification_loss, regression_loss = focalLoss(classification, regression, anchors, annotations) 138 | 139 | classification_loss = classification_loss.mean() 140 | regression_loss = regression_loss.mean() 141 | 142 | loss = classification_loss + regression_loss 143 | 144 | if bool(loss == 0): 145 | continue 146 | 147 | loss.backward() 148 | 149 | torch.nn.utils.clip_grad_norm_(retinanet.parameters(), 0.1) 150 | 151 | optimizer.step() 152 | 153 | loss_hist.append(float(loss)) 154 | 155 | epoch_loss.append(float(loss)) 156 | 157 | print('Epoch: {} | Iteration: {} | Classification loss: {:1.5f} | Regression loss: {:1.5f} | Running loss: {:1.5f}'.format(epoch_num, iter_num, float(classification_loss), float(regression_loss), np.mean(loss_hist))) 158 | 159 | del classification_loss 160 | del regression_loss 161 | except Exception as e: 162 | print(e) 163 | continue 164 | 165 | if parser.dataset == 'coco': 166 | 167 | print('Evaluating dataset') 168 | 169 | coco_eval.evaluate_coco(dataset_val, retinanet) 170 | 171 | elif parser.dataset == 'csv' and parser.csv_val is not None: 172 | 173 | print('Evaluating dataset') 174 | 175 | mAP = csv_eval.evaluate(dataset_val, retinanet) 176 | 177 | 178 | scheduler.step(np.mean(epoch_loss)) 179 | 180 | torch.save(retinanet.module, '{}_retinanet_{}.pt'.format(parser.dataset, epoch_num)) 181 | 182 | retinanet.eval() 183 | 184 | torch.save(retinanet, 'model_final.pt'.format(epoch_num)) 185 | 186 | if __name__ == '__main__': 187 | main() 188 | -------------------------------------------------------------------------------- /pytorch-retinanet/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import numpy as np 4 | from attentionConv2d import AttentionConv2d 5 | 6 | 7 | def conv3x3(in_planes, out_planes, stride=1): 8 | """3x3 convolution with padding""" 9 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, 10 | padding=1, bias=False) 11 | 12 | class BasicBlock(nn.Module): 13 | expansion = 1 14 | 15 | def __init__(self, inplanes, planes, stride=1, downsample=None): 16 | super(BasicBlock, self).__init__() 17 | self.conv1 = conv3x3(inplanes, planes, stride) 18 | self.bn1 = nn.BatchNorm2d(planes) 19 | self.relu = nn.ReLU(inplace=True) 20 | self.conv2 = conv3x3(planes, planes) 21 | self.bn2 = nn.BatchNorm2d(planes) 22 | self.downsample = downsample 23 | self.stride = stride 24 | 25 | def forward(self, x): 26 | residual = x 27 | 28 | out = self.conv1(x) 29 | out = self.bn1(out) 30 | out = self.relu(out) 31 | 32 | out = self.conv2(out) 33 | out = self.bn2(out) 34 | 35 | if self.downsample is not None: 36 | residual = self.downsample(x) 37 | 38 | out += residual 39 | out = self.relu(out) 40 | 41 | 
return out 42 | 43 | 44 | class Bottleneck(nn.Module): 45 | expansion = 4 46 | 47 | def __init__(self, inplanes, planes, stride=1, downsample=None): 48 | super(Bottleneck, self).__init__() 49 | self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) 50 | self.bn1 = nn.BatchNorm2d(planes) 51 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, 52 | padding=1, bias=False) 53 | self.bn2 = nn.BatchNorm2d(planes) 54 | self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) 55 | self.bn3 = nn.BatchNorm2d(planes * 4) 56 | self.relu = nn.ReLU(inplace=True) 57 | self.downsample = downsample 58 | self.stride = stride 59 | 60 | def forward(self, x): 61 | residual = x 62 | 63 | out = self.conv1(x) 64 | out = self.bn1(out) 65 | out = self.relu(out) 66 | 67 | out = self.conv2(out) 68 | out = self.bn2(out) 69 | out = self.relu(out) 70 | 71 | out = self.conv3(out) 72 | out = self.bn3(out) 73 | 74 | if self.downsample is not None: 75 | residual = self.downsample(x) 76 | 77 | out += residual 78 | out = self.relu(out) 79 | 80 | return out 81 | 82 | class AttentionBottleneck(nn.Module): 83 | expansion = 4 84 | def __init__(self, inplanes, planes, stride=1, downsample=None): 85 | super(AttentionBottleneck, self).__init__() 86 | n = np.log(planes) // np.log(2) 87 | 88 | dk = int(planes // 2 ** (n-1)) 89 | dv = int(planes // 2 ** (n-2)) 90 | self.conv1 = AttentionConv2d(inplanes, planes, dk, dv, num_heads=4, kernel_size=1, padding=0, 91 | rel_encoding=False, height=None, width=None) 92 | self.bn1 = nn.BatchNorm2d(planes) 93 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False) 94 | self.bn2 = nn.BatchNorm2d(planes) 95 | self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) 96 | self.bn3 = nn.BatchNorm2d(planes * 4) 97 | self.relu = nn.ReLU(inplace=True) 98 | self.downsample = downsample 99 | self.stride = stride 100 | 101 | def forward(self, x): 102 | residual = x 103 | 104 | out = self.conv1(x) 105 | out = self.bn1(out) 106 | out = self.relu(out) 107 | 108 | out = self.conv2(out) 109 | out = self.bn2(out) 110 | out = self.relu(out) 111 | 112 | out = self.conv3(out) 113 | out = self.bn3(out) 114 | 115 | if self.downsample is not None: 116 | residual = self.downsample(x) 117 | 118 | out += residual 119 | out = self.relu(out) 120 | return out 121 | 122 | class BBoxTransform(nn.Module): 123 | 124 | def __init__(self, mean=None, std=None): 125 | super(BBoxTransform, self).__init__() 126 | if mean is None: 127 | self.mean = torch.from_numpy(np.array([0, 0, 0, 0]).astype(np.float32)).cuda() 128 | else: 129 | self.mean = mean 130 | if std is None: 131 | self.std = torch.from_numpy(np.array([0.1, 0.1, 0.2, 0.2]).astype(np.float32)).cuda() 132 | else: 133 | self.std = std 134 | 135 | def forward(self, boxes, deltas): 136 | 137 | widths = boxes[:, :, 2] - boxes[:, :, 0] 138 | heights = boxes[:, :, 3] - boxes[:, :, 1] 139 | ctr_x = boxes[:, :, 0] + 0.5 * widths 140 | ctr_y = boxes[:, :, 1] + 0.5 * heights 141 | 142 | dx = deltas[:, :, 0] * self.std[0] + self.mean[0] 143 | dy = deltas[:, :, 1] * self.std[1] + self.mean[1] 144 | dw = deltas[:, :, 2] * self.std[2] + self.mean[2] 145 | dh = deltas[:, :, 3] * self.std[3] + self.mean[3] 146 | 147 | pred_ctr_x = ctr_x + dx * widths 148 | pred_ctr_y = ctr_y + dy * heights 149 | pred_w = torch.exp(dw) * widths 150 | pred_h = torch.exp(dh) * heights 151 | 152 | pred_boxes_x1 = pred_ctr_x - 0.5 * pred_w 153 | pred_boxes_y1 = pred_ctr_y - 0.5 * pred_h 154 | pred_boxes_x2 = pred_ctr_x + 
0.5 * pred_w 155 | pred_boxes_y2 = pred_ctr_y + 0.5 * pred_h 156 | 157 | pred_boxes = torch.stack([pred_boxes_x1, pred_boxes_y1, pred_boxes_x2, pred_boxes_y2], dim=2) 158 | 159 | return pred_boxes 160 | 161 | 162 | class ClipBoxes(nn.Module): 163 | 164 | def __init__(self, width=None, height=None): 165 | super(ClipBoxes, self).__init__() 166 | 167 | def forward(self, boxes, img): 168 | 169 | batch_size, num_channels, height, width = img.shape 170 | 171 | boxes[:, :, 0] = torch.clamp(boxes[:, :, 0], min=0) 172 | boxes[:, :, 1] = torch.clamp(boxes[:, :, 1], min=0) 173 | 174 | boxes[:, :, 2] = torch.clamp(boxes[:, :, 2], max=width) 175 | boxes[:, :, 3] = torch.clamp(boxes[:, :, 3], max=height) 176 | 177 | return boxes 178 | -------------------------------------------------------------------------------- /pytorch-retinanet/visualize.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torchvision 3 | import time 4 | import os 5 | import copy 6 | import pdb 7 | import time 8 | import argparse 9 | 10 | import sys 11 | import cv2 12 | 13 | import torch 14 | from torch.utils.data import Dataset, DataLoader 15 | from torchvision import datasets, models, transforms 16 | 17 | from dataloader import CocoDataset, CSVDataset, collater, Resizer, AspectRatioBasedSampler, Augmenter, UnNormalizer, Normalizer 18 | 19 | 20 | assert torch.__version__.split('.')[1] == '4' 21 | 22 | print('CUDA available: {}'.format(torch.cuda.is_available())) 23 | 24 | 25 | def main(args=None): 26 | parser = argparse.ArgumentParser(description='Simple training script for training a RetinaNet network.') 27 | 28 | parser.add_argument('--dataset', help='Dataset type, must be one of csv or coco.') 29 | parser.add_argument('--coco_path', help='Path to COCO directory') 30 | parser.add_argument('--csv_classes', help='Path to file containing class list (see readme)') 31 | parser.add_argument('--csv_val', help='Path to file containing validation annotations (optional, see readme)') 32 | 33 | parser.add_argument('--model', help='Path to model (.pt) file.') 34 | 35 | parser = parser.parse_args(args) 36 | 37 | if parser.dataset == 'coco': 38 | dataset_val = CocoDataset(parser.coco_path, set_name='val2017', transform=transforms.Compose([Normalizer(), Resizer()])) 39 | elif parser.dataset == 'csv': 40 | dataset_val = CSVDataset(train_file=parser.csv_train, class_list=parser.csv_classes, transform=transforms.Compose([Normalizer(), Resizer()])) 41 | else: 42 | raise ValueError('Dataset type not understood (must be csv or coco), exiting.') 43 | 44 | sampler_val = AspectRatioBasedSampler(dataset_val, batch_size=1, drop_last=False) 45 | dataloader_val = DataLoader(dataset_val, num_workers=1, collate_fn=collater, batch_sampler=sampler_val) 46 | 47 | retinanet = torch.load(parser.model) 48 | 49 | use_gpu = True 50 | 51 | if use_gpu: 52 | retinanet = retinanet.cuda() 53 | 54 | retinanet.eval() 55 | 56 | unnormalize = UnNormalizer() 57 | 58 | def draw_caption(image, box, caption): 59 | 60 | b = np.array(box).astype(int) 61 | cv2.putText(image, caption, (b[0], b[1] - 10), cv2.FONT_HERSHEY_PLAIN, 1, (0, 0, 0), 2) 62 | cv2.putText(image, caption, (b[0], b[1] - 10), cv2.FONT_HERSHEY_PLAIN, 1, (255, 255, 255), 1) 63 | 64 | for idx, data in enumerate(dataloader_val): 65 | 66 | with torch.no_grad(): 67 | st = time.time() 68 | scores, classification, transformed_anchors = retinanet(data['img'].cuda().float()) 69 | print('Elapsed time: {}'.format(time.time()-st)) 70 | idxs = 
np.where(scores>0.5) 71 | img = np.array(255 * unnormalize(data['img'][0, :, :, :])).copy() 72 | 73 | img[img<0] = 0 74 | img[img>255] = 255 75 | 76 | img = np.transpose(img, (1, 2, 0)) 77 | 78 | img = cv2.cvtColor(img.astype(np.uint8), cv2.COLOR_BGR2RGB) 79 | 80 | for j in range(idxs[0].shape[0]): 81 | bbox = transformed_anchors[idxs[0][j], :] 82 | x1 = int(bbox[0]) 83 | y1 = int(bbox[1]) 84 | x2 = int(bbox[2]) 85 | y2 = int(bbox[3]) 86 | label_name = dataset_val.labels[int(classification[idxs[0][j]])] 87 | draw_caption(img, (x1, y1, x2, y2), label_name) 88 | 89 | cv2.rectangle(img, (x1, y1), (x2, y2), color=(0, 0, 255), thickness=2) 90 | print(label_name) 91 | 92 | cv2.imshow('img', img) 93 | cv2.waitKey(0) 94 | 95 | 96 | 97 | if __name__ == '__main__': 98 | main() -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | import torch.utils.data as data 3 | import torch.optim as optim 4 | 5 | from ignite.engine import Events, create_supervised_evaluator, create_supervised_trainer 6 | from ignite.metrics import Loss, Accuracy 7 | from ignite.contrib.handlers.param_scheduler import CosineAnnealingScheduler 8 | from ignite.handlers.checkpoint import ModelCheckpoint 9 | from torchvision.datasets import CIFAR100 10 | from dataloaders import CocoDetection 11 | from torchvision.transforms import Compose, RandomCrop, RandomHorizontalFlip, Normalize, ToTensor 12 | from .model.wideresnet import AttentionWideResNet 13 | from .model.retinanet import AttentionRetinaNet 14 | from tqdm import tqdm 15 | 16 | from tensorboardX import SummaryWriter 17 | from .utils.utils import Resizer, Augmenter 18 | import argparse 19 | import json 20 | 21 | HOME_PREFIX = '/home/se26956/projects/IRP/pytorch-attention-augmented-convolution/' 22 | 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--config", type=str, help="config path") 25 | 26 | args = parser.parse_args() 27 | 28 | 29 | def create_summary_writer(model, data_loader, log_dir): 30 | writer = SummaryWriter(log_dir=log_dir) 31 | data_loader_iter = iter(data_loader) 32 | x, y = next(data_loader_iter) 33 | 34 | try: 35 | writer.add_graph(model, x) 36 | except Exception as e: 37 | print("Failed to save model graph: {}".format(e)) 38 | return writer 39 | 40 | def get_data_loaders(batch_size): 41 | normalize = Normalize(mean=[0.49137254, 0.48235294, 0.4466667], 42 | std=[0.247058823, 0.24352941, 0.2615686]) 43 | train_transforms = Compose([ 44 | RandomCrop(32), 45 | RandomHorizontalFlip(), 46 | ToTensor(), 47 | normalize 48 | ]) 49 | 50 | test_transform = Compose([ 51 | ToTensor(), 52 | normalize 53 | ]) 54 | 55 | train_dataset = CIFAR100('./data', train=True, download=True, transform=train_transforms) 56 | val_dataset = CIFAR100('./data', train=False, download=True, transform=test_transform) 57 | 58 | train_loader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True) 59 | val_loader = data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True) 60 | 61 | return train_loader, val_loader 62 | 63 | def get_COCO_loaders(batch_size): 64 | normalize = Normalize( 65 | mean=[0.485, 0.456, 0.406], 66 | std=[0.229, 0.224, 0.225] 67 | ) 68 | 69 | 70 | train_transforms = Compose([ 71 | RandomHorizontalFlip(p=0.5), 72 | ToTensor(), 73 | normalize 74 | ]) 75 | 76 | test_transform = Compose([ 77 | ToTensor(), 78 | normalize 79 | ]) 80 | 81 | train_dataset = 
CocoDetection(HOME_PREFIX+'data/coco_detection/train2017', 82 | HOME_PREFIX+'data/coco_detection/annotations/instances_train2017.json', 83 | img_and_target_transform=Resizer(), 84 | transform=train_transforms) 85 | 86 | test_dataset = CocoDetection(HOME_PREFIX+'data/coco_detection/val', 87 | HOME_PREFIX+'data/coco_detection/annotations/instances_val2017.json', 88 | transform=test_transform) 89 | 90 | train_loader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True) 91 | test_loader = data.DataLoader(test_dataset, batch_size=batch_size, shuffle=True) 92 | 93 | for input, label in train_loader: 94 | print(input.size()) 95 | 96 | return train_loader, test_loader 97 | 98 | 99 | def run(config): 100 | 101 | if config['model'] == 'AttentionWideResNet': 102 | train_loader, val_loader = get_data_loaders(config['batch_size']) 103 | model = AttentionWideResNet(28, 100, 10, (32, 32), 0.0) 104 | elif config['model'] == 'AttentionRetinaNet': 105 | train_loader, val_loader = get_COCO_loaders(config['batch_size']) 106 | model = AttentionRetinaNet(num_classes=80, input_size=(5,3)) 107 | writer = create_summary_writer(model, train_loader, config["tb_logdir"]) 108 | model.cuda() 109 | 110 | log_interval = config['log_interval'] 111 | epochs = config['epochs'] 112 | model = nn.DataParallel(model) 113 | 114 | optimizer = optim.SGD(model.parameters(), lr=config['lr'], momentum=config['momentum']) 115 | scheduler = CosineAnnealingScheduler(optimizer, 'lr', 0.1, 0.001, len(train_loader)) 116 | 117 | loss_fn = nn.CrossEntropyLoss().cuda() 118 | 119 | trainer = create_supervised_trainer(model, optimizer, loss_fn, device='cuda') 120 | trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) 121 | trainer_saver = ModelCheckpoint( 122 | config['checkpoint_dir'], 123 | filename_prefix="model_ckpt", 124 | save_interval=1000, 125 | n_saved=10, 126 | atomic=True, 127 | save_as_state_dict=True, 128 | create_dir=True 129 | ) 130 | trainer.add_event_handler(Events.ITERATION_COMPLETED, 131 | trainer_saver, 132 | { 133 | "model": model, 134 | }) 135 | evaluator = create_supervised_evaluator(model, 136 | metrics={"accuracy": Accuracy(), 137 | 'CE': Loss(loss_fn)}, 138 | device="cuda") 139 | 140 | desc = "ITERATION - loss: {:.2f}" 141 | pbar = tqdm( 142 | initial=0, leave=False, total=len(train_loader), 143 | desc=desc.format(0) 144 | ) 145 | 146 | @trainer.on(Events.ITERATION_COMPLETED) 147 | def log_training_loss(engine): 148 | iter = (engine.state.iteration - 1) % len(train_loader) + 1 149 | 150 | if iter % log_interval == 0: 151 | pbar.desc = desc.format(engine.state.output) 152 | pbar.update(log_interval) 153 | 154 | writer.add_scalar("training/loss", engine.state.output, engine.state.iteration) 155 | 156 | @trainer.on(Events.EPOCH_COMPLETED) 157 | def log_training_results(engine): 158 | pbar.refresh() 159 | evaluator.run(train_loader) 160 | metrics = evaluator.state.metrics 161 | avg_accuracy = metrics['accuracy'] 162 | avg_CE = metrics['CE'] 163 | tqdm.write( 164 | "Training Results - Epoch: {} Avg accuracy: {:.2f} Avg loss: {:.2f}".format(engine.state.epoch, 165 | avg_accuracy, 166 | avg_CE) 167 | ) 168 | writer.add_scalar("training/avg_loss", avg_CE, engine.state.epoch) 169 | writer.add_scalar("training/avg_accuracy", avg_accuracy, engine.state.epoch) 170 | 171 | @trainer.on(Events.EPOCH_COMPLETED) 172 | def log_validation_results(engine): 173 | evaluator.run(val_loader) 174 | metrics = evaluator.state.metrics 175 | avg_accuracy = metrics['accuracy'] 176 | avg_CE = metrics['CE'] 177 | tqdm.write( 
178 | "Validation Results - Epoch: {} Avg accuracy {:.2f} Avg loss: {:.2f}".format(engine.state.epoch, 179 | avg_accuracy, 180 | avg_CE) 181 | ) 182 | pbar.n = pbar.last_print_n = 0 183 | 184 | writer.add_scalar("valdation/avg_loss", avg_CE, engine.state.epoch) 185 | writer.add_scalar("valdation/avg_accuracy", avg_accuracy, engine.state.epoch) 186 | 187 | trainer.run(train_loader, max_epochs=epochs) 188 | pbar.close() 189 | writer.close() 190 | 191 | 192 | config_file = args.config 193 | with open(config_file, 'rb') as infile: 194 | config = json.load(infile) 195 | 196 | run(config) 197 | 198 | 199 | -------------------------------------------------------------------------------- /utils/utils.py: -------------------------------------------------------------------------------- 1 | """Original from https://github.com/yhenon/pytorch-retinanet""" 2 | import torch 3 | import torch.nn as nn 4 | import numpy as np 5 | 6 | 7 | import skimage.io 8 | import skimage.transform 9 | import skimage.color 10 | import skimage 11 | 12 | from PIL import Image 13 | 14 | 15 | def comptue_dim(dim, padding, kernel_size, stride): 16 | return np.floor((dim + 2*padding - kernel_size) / stride) + 1 17 | 18 | def conv3x3(in_planes, out_planes, stride=1): 19 | """3x3 convolution with padding""" 20 | return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, 21 | padding=1, bias=False) 22 | 23 | 24 | class BasicBlock(nn.Module): 25 | expansion = 1 26 | 27 | def __init__(self, inplanes, planes, stride=1, downsample=None): 28 | super(BasicBlock, self).__init__() 29 | self.conv1 = conv3x3(inplanes, planes, stride) 30 | self.bn1 = nn.BatchNorm2d(planes) 31 | self.relu = nn.ReLU(inplace=True) 32 | self.conv2 = conv3x3(planes, planes) 33 | self.bn2 = nn.BatchNorm2d(planes) 34 | self.downsample = downsample 35 | self.stride = stride 36 | 37 | def forward(self, x): 38 | residual = x 39 | 40 | out = self.conv1(x) 41 | out = self.bn1(out) 42 | out = self.relu(out) 43 | 44 | out = self.conv2(out) 45 | out = self.bn2(out) 46 | 47 | if self.downsample is not None: 48 | residual = self.downsample(x) 49 | 50 | out += residual 51 | out = self.relu(out) 52 | 53 | return out 54 | 55 | 56 | class Bottleneck(nn.Module): 57 | expansion = 4 58 | 59 | def __init__(self, inplanes, planes, stride=1, downsample=None): 60 | super(Bottleneck, self).__init__() 61 | self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False) 62 | self.bn1 = nn.BatchNorm2d(planes) 63 | self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride, 64 | padding=1, bias=False) 65 | self.bn2 = nn.BatchNorm2d(planes) 66 | self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) 67 | self.bn3 = nn.BatchNorm2d(planes * 4) 68 | self.relu = nn.ReLU(inplace=True) 69 | self.downsample = downsample 70 | self.stride = stride 71 | 72 | def forward(self, x): 73 | residual = x 74 | 75 | out = self.conv1(x) 76 | out = self.bn1(out) 77 | out = self.relu(out) 78 | 79 | out = self.conv2(out) 80 | out = self.bn2(out) 81 | out = self.relu(out) 82 | 83 | out = self.conv3(out) 84 | out = self.bn3(out) 85 | 86 | if self.downsample is not None: 87 | residual = self.downsample(x) 88 | 89 | out += residual 90 | out = self.relu(out) 91 | 92 | return out 93 | 94 | 95 | class BBoxTransform(nn.Module): 96 | 97 | def __init__(self, mean=None, std=None): 98 | super(BBoxTransform, self).__init__() 99 | if mean is None: 100 | self.mean = torch.from_numpy(np.array([0, 0, 0, 0]).astype(np.float32)).cuda() 101 | else: 102 | self.mean = mean 103 | if std is None: 104 
| self.std = torch.from_numpy(np.array([0.1, 0.1, 0.2, 0.2]).astype(np.float32)).cuda() 105 | else: 106 | self.std = std 107 | 108 | def forward(self, boxes, deltas): 109 | 110 | widths = boxes[:, :, 2] - boxes[:, :, 0] 111 | heights = boxes[:, :, 3] - boxes[:, :, 1] 112 | ctr_x = boxes[:, :, 0] + 0.5 * widths 113 | ctr_y = boxes[:, :, 1] + 0.5 * heights 114 | 115 | dx = deltas[:, :, 0] * self.std[0] + self.mean[0] 116 | dy = deltas[:, :, 1] * self.std[1] + self.mean[1] 117 | dw = deltas[:, :, 2] * self.std[2] + self.mean[2] 118 | dh = deltas[:, :, 3] * self.std[3] + self.mean[3] 119 | 120 | pred_ctr_x = ctr_x + dx * widths 121 | pred_ctr_y = ctr_y + dy * heights 122 | pred_w = torch.exp(dw) * widths 123 | pred_h = torch.exp(dh) * heights 124 | 125 | pred_boxes_x1 = pred_ctr_x - 0.5 * pred_w 126 | pred_boxes_y1 = pred_ctr_y - 0.5 * pred_h 127 | pred_boxes_x2 = pred_ctr_x + 0.5 * pred_w 128 | pred_boxes_y2 = pred_ctr_y + 0.5 * pred_h 129 | 130 | pred_boxes = torch.stack([pred_boxes_x1, pred_boxes_y1, pred_boxes_x2, pred_boxes_y2], dim=2) 131 | 132 | return pred_boxes 133 | 134 | 135 | class ClipBoxes(nn.Module): 136 | 137 | def __init__(self, width=None, height=None): 138 | super(ClipBoxes, self).__init__() 139 | 140 | def forward(self, boxes, img): 141 | batch_size, num_channels, height, width = img.shape 142 | 143 | boxes[:, :, 0] = torch.clamp(boxes[:, :, 0], min=0) 144 | boxes[:, :, 1] = torch.clamp(boxes[:, :, 1], min=0) 145 | 146 | boxes[:, :, 2] = torch.clamp(boxes[:, :, 2], max=width) 147 | boxes[:, :, 3] = torch.clamp(boxes[:, :, 3], max=height) 148 | 149 | return boxes 150 | 151 | class Resizer(object): 152 | """Convert ndarrays in sample to Tensors.""" 153 | 154 | def __call__(self, image, annots, min_side=608, max_side=1024): 155 | 156 | image = np.array(image) 157 | annots = np.array([[*annot['bbox'], annot['category_id']] for annot in annots]) 158 | 159 | rows, cols, cns = image.shape 160 | 161 | smallest_side = min(rows, cols) 162 | 163 | # rescale the image so the smallest side is min_side 164 | scale = min_side / smallest_side 165 | 166 | # check if the largest side is now greater than max_side, which can happen 167 | # when images have a large aspect ratio 168 | largest_side = max(rows, cols) 169 | 170 | if largest_side * scale > max_side: 171 | scale = max_side / largest_side 172 | 173 | # resize the image with the computed scale 174 | image = skimage.transform.resize(image, (int(round(rows * scale)), int(round((cols * scale))))) 175 | rows, cols, cns = image.shape 176 | 177 | pad_w = 32 - rows % 32 178 | pad_h = 32 - cols % 32 179 | 180 | new_image = np.zeros((rows + pad_w, cols + pad_h, cns)).astype(np.float32) 181 | new_image[:rows, :cols, :] = image.astype(np.float32) 182 | 183 | annots[:, 4] = annots[:, 4] * scale 184 | 185 | 186 | return Image.fromarray(np.uint8(new_image)), torch.from_numpy(annots), scale 187 | 188 | 189 | class Augmenter(object): 190 | """Convert ndarrays in sample to Tensors.""" 191 | 192 | def __call__(self, sample, flip_x=0.5): 193 | if np.random.rand() < flip_x: 194 | image, annots = sample['img'], sample['annot'] 195 | image = image[:, ::-1, :] 196 | 197 | rows, cols, channels = image.shape 198 | 199 | x1 = annots[:, 0].copy() 200 | x2 = annots[:, 2].copy() 201 | 202 | x_tmp = x1.copy() 203 | 204 | annots[:, 0] = cols - x2 205 | annots[:, 2] = cols - x_tmp 206 | 207 | sample = {'img': image, 'annot': annots} 208 | 209 | return sample 210 | 211 | 212 | class Normalizer(object): 213 | 214 | def __init__(self): 215 | self.mean = 
np.array([[[0.485, 0.456, 0.406]]])
216 |         self.std = np.array([[[0.229, 0.224, 0.225]]])
217 | 
218 |     def __call__(self, sample):
219 |         image, annots = sample['img'], sample['annot']
220 | 
221 |         return {'img': ((image.astype(np.float32) - self.mean) / self.std), 'annot': annots}
222 | 
223 | 
224 | class UnNormalizer(object):
225 |     def __init__(self, mean=None, std=None):
226 |         if mean is None:
227 |             self.mean = [0.485, 0.456, 0.406]
228 |         else:
229 |             self.mean = mean
230 |         if std is None:
231 |             self.std = [0.229, 0.224, 0.225]
232 |         else:
233 |             self.std = std
234 | 
235 |     def __call__(self, tensor):
236 |         """
237 |         Args:
238 |             tensor (Tensor): Tensor image of size (C, H, W) that was normalized with ImageNet statistics.
239 |         Returns:
240 |             Tensor: Un-normalized image (modified in place).
241 |         """
242 |         for t, m, s in zip(tensor, self.mean, self.std):
243 |             t.mul_(s).add_(m)
244 |         return tensor
245 | 
--------------------------------------------------------------------------------
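
As a closing illustration, here is a minimal round-trip sketch for the `Normalizer` and `UnNormalizer` transforms defined in `utils/utils.py` above. The import path, the synthetic image, and the tensor shapes are assumptions made for the example; only the two classes themselves come from the file.

```python
# Sketch only: a random 608x608 RGB image stands in for a real dataloader sample.
import numpy as np
import torch

from utils.utils import Normalizer, UnNormalizer  # assumed import path (repo root on sys.path)

normalizer = Normalizer()
unnormalizer = UnNormalizer()

sample = {
    'img': np.random.rand(608, 608, 3).astype(np.float32),  # HxWx3 image in [0, 1]
    'annot': np.zeros((0, 5), dtype=np.float32),             # annotations are passed through untouched
}

# Normalizer works on HxWxC numpy images: (img - mean) / std per channel.
normalized = normalizer(sample)

# UnNormalizer works on CxHxW tensors and modifies them in place (t * std + mean),
# so permute to channel-first and clone before undoing the normalization.
img_chw = torch.from_numpy(normalized['img']).permute(2, 0, 1).clone()
restored = unnormalizer(img_chw)

print(np.allclose(restored.permute(1, 2, 0).numpy(), sample['img'], atol=1e-5))  # expect True
```

This in-place inverse is exactly what `visualize.py` relies on when it rescales the un-normalized image by 255 before drawing detections.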
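
`utils/utils.py` (and its twin `pytorch-retinanet/utils.py`) also defines `BBoxTransform` and `ClipBoxes`, which decode the (dx, dy, dw, dh) regression deltas predicted by the network back into pixel boxes around the anchors and then clamp them to the image. Below is a small hand-worked sketch of that decoding; the anchor, the deltas, and the image size are made-up values, and mean/std are passed in explicitly so the default `.cuda()` constants are not required.

```python
import math

import torch

from utils.utils import BBoxTransform, ClipBoxes  # assumed import path

# Passing mean/std keeps everything on the CPU for this sketch.
decode = BBoxTransform(mean=torch.zeros(4), std=torch.tensor([0.1, 0.1, 0.2, 0.2]))
clip = ClipBoxes()

anchors = torch.tensor([[[100.0, 100.0, 200.0, 200.0]]])         # (batch, num_anchors, 4) as x1, y1, x2, y2
deltas = torch.tensor([[[0.0, 0.0, math.log(2.0) / 0.2, 0.0]]])  # dw scaled so exp(dw * std) doubles the width
img = torch.zeros(1, 3, 224, 224)                                # only the shape is used for clipping

boxes = decode(anchors, deltas)  # centre unchanged, width doubled -> (50, 100, 250, 200)
boxes = clip(boxes, img)         # x2 clamped to the 224-pixel image width -> (50, 100, 224, 200)
print(boxes)
```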