├── .DS_Store
├── LICENSE
├── README.md
├── SRGAN.py
├── data
    ├── dataset.py
    ├── gan_data.py
    ├── util.py
    └── voc_dataset.py
├── demo.ipynb
├── epochs
    └── netG_epoch_4_100.pth
├── inference.py
├── misc
    ├── convert_caffe_pretrain.py
    ├── demo.jpg
    └── train_fast.py
├── model
    ├── __init__.py
    ├── faster_rcnn.py
    ├── faster_rcnn_vgg16.py
    ├── region_proposal_network.py
    ├── roi_module.py
    ├── sr_gan_network.py
    └── utils
    │   ├── __init__.py
    │   ├── bbox_tools.py
    │   ├── creator_tool.py
    │   ├── nms
    │       ├── __init__.py
    │       ├── _nms_gpu_post.pyx
    │       ├── _nms_gpu_post_py.py
    │       ├── build.py
    │       └── non_maximum_suppression.py
    │   └── roi_cupy.py
├── requirements.txt
├── train.py
├── trainer.py
└── utils
    ├── __init__.py
    ├── array_tool.py
    ├── config.py
    ├── eval_tool.py
    └── vis_tool.py


/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samirsen/small-object-detection/30d402a09a5a01d0f365b8d5b4593544a11e90ec/.DS_Store


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019 samirsen
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # small-object-detection
2 | Faster RCNN for Pascal VOC2007
3 | 
4 | 


--------------------------------------------------------------------------------
/SRGAN.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | 
  3 | import torch.nn.functional as F
  4 | from torch import nn
  5 | 
  6 | 
  7 | class Generator(nn.Module):
  8 |     def __init__(self, scale_factor):
  9 |         upsample_block_num = int(math.log(scale_factor, 2))
 10 | 
 11 |         super(Generator, self).__init__()
 12 |         self.block1 = nn.Sequential(
 13 |             nn.Conv2d(3, 64, kernel_size=9, padding=4),
 14 |             nn.PReLU()
 15 |         )
 16 |         self.block2 = ResidualBlock(64)
 17 |         self.block3 = ResidualBlock(64)
 18 |         self.block4 = ResidualBlock(64)
 19 |         self.block5 = ResidualBlock(64)
 20 |         self.block6 = ResidualBlock(64)
 21 |         self.block7 = nn.Sequential(
 22 |             nn.Conv2d(64, 64, kernel_size=3, padding=1),
 23 |             nn.BatchNorm2d(64)
 24 |         )
 25 |         block8 = [UpsampleBLock(64, 2) for _ in range(upsample_block_num)]
 26 |         block8.append(nn.Conv2d(64, 3, kernel_size=9, padding=4))
 27 |         self.block8 = nn.Sequential(*block8)
 28 | 
 29 |     def forward(self, x):
 30 |         block1 = self.block1(x)
 31 |         block2 = self.block2(block1)
 32 |         block3 = self.block3(block2)
 33 |         block4 = self.block4(block3)
 34 |         block5 = self.block5(block4)
 35 |         block6 = self.block6(block5)
 36 |         block7 = self.block7(block6)
 37 |         block8 = self.block8(block1 + block7)
 38 | 
 39 |         return (F.tanh(block8) + 1) / 2
 40 | 
 41 | 
 42 | class Discriminator(nn.Module):
 43 |     def __init__(self):
 44 |         super(Discriminator, self).__init__()
 45 |         self.net = nn.Sequential(
 46 |             nn.Conv2d(3, 64, kernel_size=3, padding=1),
 47 |             nn.LeakyReLU(0.2),
 48 | 
 49 |             nn.Conv2d(64, 64, kernel_size=3, stride=2, padding=1),
 50 |             nn.BatchNorm2d(64),
 51 |             nn.LeakyReLU(0.2),
 52 | 
 53 |             nn.Conv2d(64, 128, kernel_size=3, padding=1),
 54 |             nn.BatchNorm2d(128),
 55 |             nn.LeakyReLU(0.2),
 56 | 
 57 |             nn.Conv2d(128, 128, kernel_size=3, stride=2, padding=1),
 58 |             nn.BatchNorm2d(128),
 59 |             nn.LeakyReLU(0.2),
 60 | 
 61 |             nn.Conv2d(128, 256, kernel_size=3, padding=1),
 62 |             nn.BatchNorm2d(256),
 63 |             nn.LeakyReLU(0.2),
 64 | 
 65 |             nn.Conv2d(256, 256, kernel_size=3, stride=2, padding=1),
 66 |             nn.BatchNorm2d(256),
 67 |             nn.LeakyReLU(0.2),
 68 | 
 69 |             nn.Conv2d(256, 512, kernel_size=3, padding=1),
 70 |             nn.BatchNorm2d(512),
 71 |             nn.LeakyReLU(0.2),
 72 | 
 73 |             nn.Conv2d(512, 512, kernel_size=3, stride=2, padding=1),
 74 |             nn.BatchNorm2d(512),
 75 |             nn.LeakyReLU(0.2),
 76 | 
 77 |             nn.AdaptiveAvgPool2d(1),
 78 |             nn.Conv2d(512, 1024, kernel_size=1),
 79 |             nn.LeakyReLU(0.2),
 80 |             nn.Conv2d(1024, 1, kernel_size=1)
 81 |         )
 82 | 
 83 |     def forward(self, x):
 84 |         batch_size = x.size(0)
 85 |         return F.sigmoid(self.net(x).view(batch_size))
 86 | 
 87 | 
 88 | class ResidualBlock(nn.Module):
 89 |     def __init__(self, channels):
 90 |         super(ResidualBlock, self).__init__()
 91 |         self.conv1 = nn.Conv2d(channels, channels, kernel_size=3, padding=1)
 92 |         self.bn1 = nn.BatchNorm2d(channels)
 93 |         self.prelu = nn.PReLU()
 94 |         self.conv2 = nn.Conv2d(channels, channels, kernel_size=3, padding=1)
 95 |         self.bn2 = nn.BatchNorm2d(channels)
 96 | 
 97 |     def forward(self, x):
 98 |         residual = self.conv1(x)
 99 |         residual = self.bn1(residual)
100 |         residual = self.prelu(residual)
101 |         residual = self.conv2(residual)
102 |         residual = self.bn2(residual)
103 | 
104 |         return x + residual
105 | 
106 | 
107 | class UpsampleBLock(nn.Module):
108 |     def __init__(self, in_channels, up_scale):
109 |         super(UpsampleBLock, self).__init__()
110 |         self.conv = nn.Conv2d(in_channels, in_channels * up_scale ** 2, kernel_size=3, padding=1)
111 |         self.pixel_shuffle = nn.PixelShuffle(up_scale)
112 |         self.prelu = nn.PReLU()
113 | 
114 |     def forward(self, x):
115 |         x = self.conv(x)
116 |         x = self.pixel_shuffle(x)
117 |         x = self.prelu(x)
118 |         return x
119 | 


--------------------------------------------------------------------------------
/data/dataset.py:
--------------------------------------------------------------------------------
  1 | from __future__ import  absolute_import
  2 | from __future__ import  division
  3 | import torch as t
  4 | from data.voc_dataset import VOCBboxDataset
  5 | from skimage import transform as sktsf
  6 | from torchvision import transforms as tvtsf
  7 | from data import util
  8 | import numpy as np
  9 | from utils.config import opt
 10 | 
 11 | 
 12 | def inverse_normalize(img):
 13 |     if opt.caffe_pretrain:
 14 |         img = img + (np.array([122.7717, 115.9465, 102.9801]).reshape(3, 1, 1))
 15 |         return img[::-1, :, :]
 16 |     # approximate un-normalize for visualize
 17 |     return (img * 0.225 + 0.45).clip(min=0, max=1) * 255
 18 | 
 19 | 
 20 | def pytorch_normalze(img):
 21 |     """
 22 |     https://github.com/pytorch/vision/issues/223
 23 |     return appr -1~1 RGB
 24 |     """
 25 |     normalize = tvtsf.Normalize(mean=[0.485, 0.456, 0.406],
 26 |                                 std=[0.229, 0.224, 0.225])
 27 |     img = normalize(t.from_numpy(img))
 28 |     return img.numpy()
 29 | 
 30 | 
 31 | def caffe_normalize(img):
 32 |     """
 33 |     return appr -125-125 BGR
 34 |     """
 35 |     img = img[[2, 1, 0], :, :]  # RGB-BGR
 36 |     img = img * 255
 37 |     mean = np.array([122.7717, 115.9465, 102.9801]).reshape(3, 1, 1)
 38 |     img = (img - mean).astype(np.float32, copy=True)
 39 |     return img
 40 | 
 41 | 
 42 | def preprocess(img, min_size=600, max_size=1000):
 43 |     """Preprocess an image for feature extraction.
 44 | 
 45 |     The length of the shorter edge is scaled to :obj:`self.min_size`.
 46 |     After the scaling, if the length of the longer edge is longer than
 47 |     :param min_size:
 48 |     :obj:`self.max_size`, the image is scaled to fit the longer edge
 49 |     to :obj:`self.max_size`.
 50 | 
 51 |     After resizing the image, the image is subtracted by a mean image value
 52 |     :obj:`self.mean`.
 53 | 
 54 |     Args:
 55 |         img (~numpy.ndarray): An image. This is in CHW and RGB format.
 56 |             The range of its value is :math:`[0, 255]`.
 57 | 
 58 |     Returns:
 59 |         ~numpy.ndarray: A preprocessed image.
 60 | 
 61 |     """
 62 |     C, H, W = img.shape
 63 |     scale1 = min_size / min(H, W)
 64 |     scale2 = max_size / max(H, W)
 65 |     scale = min(scale1, scale2)
 66 |     img = img / 255.
 67 |     img = sktsf.resize(img, (C, H * scale, W * scale), mode='reflect',anti_aliasing=False)
 68 |     # both the longer and shorter should be less than
 69 |     # max_size and min_size
 70 |     if opt.caffe_pretrain:
 71 |         normalize = caffe_normalize
 72 |     else:
 73 |         normalize = pytorch_normalze
 74 |     return normalize(img)
 75 | 
 76 | 
 77 | class Transform(object):
 78 | 
 79 |     def __init__(self, min_size=600, max_size=1000):
 80 |         self.min_size = min_size
 81 |         self.max_size = max_size
 82 | 
 83 |     def __call__(self, in_data):
 84 |         img, bbox, label = in_data
 85 |         _, H, W = img.shape
 86 |         img = preprocess(img, self.min_size, self.max_size)
 87 |         _, o_H, o_W = img.shape
 88 |         scale = o_H / H
 89 |         bbox = util.resize_bbox(bbox, (H, W), (o_H, o_W))
 90 | 
 91 |         # horizontally flip
 92 |         img, params = util.random_flip(
 93 |             img, x_random=True, return_param=True)
 94 |         bbox = util.flip_bbox(
 95 |             bbox, (o_H, o_W), x_flip=params['x_flip'])
 96 | 
 97 |         return img, bbox, label, scale
 98 | 
 99 | 
100 | class Dataset:
101 |     def __init__(self, opt):
102 |         self.opt = opt
103 |         self.db = VOCBboxDataset(opt.voc_data_dir)
104 |         self.tsf = Transform(opt.min_size, opt.max_size)
105 | 
106 |     def __getitem__(self, idx):
107 |         ori_img, bbox, label, difficult = self.db.get_example(idx)
108 | 
109 |         img, bbox, label, scale = self.tsf((ori_img, bbox, label))
110 |         # TODO: check whose stride is negative to fix this instead copy all
111 |         # some of the strides of a given numpy array are negative.
112 |         return img.copy(), bbox.copy(), label.copy(), scale
113 | 
114 |     def __len__(self):
115 |         return len(self.db)
116 | 
117 | 
118 | class TestDataset:
119 |     def __init__(self, opt, split='test', use_difficult=True):
120 |         self.opt = opt
121 |         self.db = VOCBboxDataset(opt.voc_data_dir, split=split, use_difficult=use_difficult)
122 | 
123 |     def __getitem__(self, idx):
124 |         ori_img, bbox, label, difficult = self.db.get_example(idx)
125 |         img = preprocess(ori_img)
126 |         return img, ori_img.shape[1:], bbox, label, difficult
127 | 
128 |     def __len__(self):
129 |         return len(self.db)
130 | 


--------------------------------------------------------------------------------
/data/gan_data.py:
--------------------------------------------------------------------------------
 1 | from os import listdir
 2 | from os.path import join
 3 | 
 4 | from PIL import Image
 5 | from torch.utils.data.dataset import Dataset
 6 | from torchvision.transforms import Compose, RandomCrop, ToTensor, ToPILImage, CenterCrop, Resize
 7 | 
 8 | 
 9 | def is_image_file(filename):
10 |     return any(filename.endswith(extension) for extension in ['.png', '.jpg', '.jpeg', '.PNG', '.JPG', '.JPEG'])
11 | 
12 | 
13 | def calculate_valid_crop_size(crop_size, upscale_factor):
14 |     return crop_size - (crop_size % upscale_factor)
15 | 
16 | 
17 | def train_hr_transform(crop_size):
18 |     return Compose([
19 |         RandomCrop(crop_size),
20 |         ToTensor(),
21 |     ])
22 | 
23 | 
24 | def train_lr_transform(crop_size, upscale_factor):
25 |     return Compose([
26 |         ToPILImage(),
27 |         Resize(crop_size // upscale_factor, interpolation=Image.BICUBIC),
28 |         ToTensor()
29 |     ])
30 | 
31 | 
32 | def display_transform():
33 |     return Compose([
34 |         ToPILImage(),
35 |         Resize(400),
36 |         CenterCrop(400),
37 |         ToTensor()
38 |     ])
39 | 
40 | 
41 | class TrainDatasetFromFolder(Dataset):
42 |     def __init__(self, dataset_dir, crop_size, upscale_factor):
43 |         super(TrainDatasetFromFolder, self).__init__()
44 |         self.image_filenames = [join(dataset_dir, x) for x in listdir(dataset_dir) if is_image_file(x)]
45 |         crop_size = calculate_valid_crop_size(crop_size, upscale_factor)
46 |         self.hr_transform = train_hr_transform(crop_size)
47 |         self.lr_transform = train_lr_transform(crop_size, upscale_factor)
48 | 
49 |     def __getitem__(self, index):
50 |         hr_image = self.hr_transform(Image.open(self.image_filenames[index]))
51 |         lr_image = self.lr_transform(hr_image)
52 |         return lr_image, hr_image
53 | 
54 |     def __len__(self):
55 |         return len(self.image_filenames)
56 | 
57 | 
58 | class ValDatasetFromFolder(Dataset):
59 |     def __init__(self, dataset_dir, upscale_factor):
60 |         super(ValDatasetFromFolder, self).__init__()
61 |         self.upscale_factor = upscale_factor
62 |         self.image_filenames = [join(dataset_dir, x) for x in listdir(dataset_dir) if is_image_file(x)]
63 | 
64 |     def __getitem__(self, index):
65 |         hr_image = Image.open(self.image_filenames[index])
66 |         w, h = hr_image.size
67 |         crop_size = calculate_valid_crop_size(min(w, h), self.upscale_factor)
68 |         lr_scale = Resize(crop_size // self.upscale_factor, interpolation=Image.BICUBIC)
69 |         hr_scale = Resize(crop_size, interpolation=Image.BICUBIC)
70 |         hr_image = CenterCrop(crop_size)(hr_image)
71 |         lr_image = lr_scale(hr_image)
72 |         hr_restore_img = hr_scale(lr_image)
73 |         return ToTensor()(lr_image), ToTensor()(hr_restore_img), ToTensor()(hr_image)
74 | 
75 |     def __len__(self):
76 |         return len(self.image_filenames)
77 | 
78 | 
79 | class TestDatasetFromFolder(Dataset):
80 |     def __init__(self, dataset_dir, upscale_factor):
81 |         super(TestDatasetFromFolder, self).__init__()
82 |         self.lr_path = dataset_dir + '/SRF_' + str(upscale_factor) + '/data/'
83 |         self.hr_path = dataset_dir + '/SRF_' + str(upscale_factor) + '/target/'
84 |         self.upscale_factor = upscale_factor
85 |         self.lr_filenames = [join(self.lr_path, x) for x in listdir(self.lr_path) if is_image_file(x)]
86 |         self.hr_filenames = [join(self.hr_path, x) for x in listdir(self.hr_path) if is_image_file(x)]
87 | 
88 |     def __getitem__(self, index):
89 |         image_name = self.lr_filenames[index].split('/')[-1]
90 |         lr_image = Image.open(self.lr_filenames[index])
91 |         w, h = lr_image.size
92 |         hr_image = Image.open(self.hr_filenames[index])
93 |         hr_scale = Resize((self.upscale_factor * h, self.upscale_factor * w), interpolation=Image.BICUBIC)
94 |         hr_restore_img = hr_scale(lr_image)
95 |         return image_name, ToTensor()(lr_image), ToTensor()(hr_restore_img), ToTensor()(hr_image)
96 | 
97 |     def __len__(self):
98 |         return len(self.lr_filenames)
99 | 


--------------------------------------------------------------------------------
/data/util.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | from PIL import Image
  3 | import random
  4 | 
  5 | 
  6 | def read_image(path, dtype=np.float32, color=True):
  7 |     """Read an image from a file.
  8 | 
  9 |     This function reads an image from given file. The image is CHW format and
 10 |     the range of its value is :math:`[0, 255]`. If :obj:`color = True`, the
 11 |     order of the channels is RGB.
 12 | 
 13 |     Args:
 14 |         path (str): A path of image file.
 15 |         dtype: The type of array. The default value is :obj:`~numpy.float32`.
 16 |         color (bool): This option determines the number of channels.
 17 |             If :obj:`True`, the number of channels is three. In this case,
 18 |             the order of the channels is RGB. This is the default behaviour.
 19 |             If :obj:`False`, this function returns a grayscale image.
 20 | 
 21 |     Returns:
 22 |         ~numpy.ndarray: An image.
 23 |     """
 24 | 
 25 |     f = Image.open(path)
 26 |     try:
 27 |         if color:
 28 |             img = f.convert('RGB')
 29 |         else:
 30 |             img = f.convert('P')
 31 |         img = np.asarray(img, dtype=dtype)
 32 |     finally:
 33 |         if hasattr(f, 'close'):
 34 |             f.close()
 35 | 
 36 |     if img.ndim == 2:
 37 |         # reshape (H, W) -> (1, H, W)
 38 |         return img[np.newaxis]
 39 |     else:
 40 |         # transpose (H, W, C) -> (C, H, W)
 41 |         return img.transpose((2, 0, 1))
 42 | 
 43 | 
 44 | def resize_bbox(bbox, in_size, out_size):
 45 |     """Resize bounding boxes according to image resize.
 46 | 
 47 |     The bounding boxes are expected to be packed into a two dimensional
 48 |     tensor of shape :math:`(R, 4)`, where :math:`R` is the number of
 49 |     bounding boxes in the image. The second axis represents attributes of
 50 |     the bounding box. They are :math:`(y_{min}, x_{min}, y_{max}, x_{max})`,
 51 |     where the four attributes are coordinates of the top left and the
 52 |     bottom right vertices.
 53 | 
 54 |     Args:
 55 |         bbox (~numpy.ndarray): An array whose shape is :math:`(R, 4)`.
 56 |             :math:`R` is the number of bounding boxes.
 57 |         in_size (tuple): A tuple of length 2. The height and the width
 58 |             of the image before resized.
 59 |         out_size (tuple): A tuple of length 2. The height and the width
 60 |             of the image after resized.
 61 | 
 62 |     Returns:
 63 |         ~numpy.ndarray:
 64 |         Bounding boxes rescaled according to the given image shapes.
 65 | 
 66 |     """
 67 |     bbox = bbox.copy()
 68 |     y_scale = float(out_size[0]) / in_size[0]
 69 |     x_scale = float(out_size[1]) / in_size[1]
 70 |     bbox[:, 0] = y_scale * bbox[:, 0]
 71 |     bbox[:, 2] = y_scale * bbox[:, 2]
 72 |     bbox[:, 1] = x_scale * bbox[:, 1]
 73 |     bbox[:, 3] = x_scale * bbox[:, 3]
 74 |     return bbox
 75 | 
 76 | 
 77 | def flip_bbox(bbox, size, y_flip=False, x_flip=False):
 78 |     """Flip bounding boxes accordingly.
 79 | 
 80 |     The bounding boxes are expected to be packed into a two dimensional
 81 |     tensor of shape :math:`(R, 4)`, where :math:`R` is the number of
 82 |     bounding boxes in the image. The second axis represents attributes of
 83 |     the bounding box. They are :math:`(y_{min}, x_{min}, y_{max}, x_{max})`,
 84 |     where the four attributes are coordinates of the top left and the
 85 |     bottom right vertices.
 86 | 
 87 |     Args:
 88 |         bbox (~numpy.ndarray): An array whose shape is :math:`(R, 4)`.
 89 |             :math:`R` is the number of bounding boxes.
 90 |         size (tuple): A tuple of length 2. The height and the width
 91 |             of the image before resized.
 92 |         y_flip (bool): Flip bounding box according to a vertical flip of
 93 |             an image.
 94 |         x_flip (bool): Flip bounding box according to a horizontal flip of
 95 |             an image.
 96 | 
 97 |     Returns:
 98 |         ~numpy.ndarray:
 99 |         Bounding boxes flipped according to the given flips.
100 | 
101 |     """
102 |     H, W = size
103 |     bbox = bbox.copy()
104 |     if y_flip:
105 |         y_max = H - bbox[:, 0]
106 |         y_min = H - bbox[:, 2]
107 |         bbox[:, 0] = y_min
108 |         bbox[:, 2] = y_max
109 |     if x_flip:
110 |         x_max = W - bbox[:, 1]
111 |         x_min = W - bbox[:, 3]
112 |         bbox[:, 1] = x_min
113 |         bbox[:, 3] = x_max
114 |     return bbox
115 | 
116 | 
117 | def crop_bbox(
118 |         bbox, y_slice=None, x_slice=None,
119 |         allow_outside_center=True, return_param=False):
120 |     """Translate bounding boxes to fit within the cropped area of an image.
121 | 
122 |     This method is mainly used together with image cropping.
123 |     This method translates the coordinates of bounding boxes like
124 |     :func:`data.util.translate_bbox`. In addition,
125 |     this function truncates the bounding boxes to fit within the cropped area.
126 |     If a bounding box does not overlap with the cropped area,
127 |     this bounding box will be removed.
128 | 
129 |     The bounding boxes are expected to be packed into a two dimensional
130 |     tensor of shape :math:`(R, 4)`, where :math:`R` is the number of
131 |     bounding boxes in the image. The second axis represents attributes of
132 |     the bounding box. They are :math:`(y_{min}, x_{min}, y_{max}, x_{max})`,
133 |     where the four attributes are coordinates of the top left and the
134 |     bottom right vertices.
135 | 
136 |     Args:
137 |         bbox (~numpy.ndarray): Bounding boxes to be transformed. The shape is
138 |             :math:`(R, 4)`. :math:`R` is the number of bounding boxes.
139 |         y_slice (slice): The slice of y axis.
140 |         x_slice (slice): The slice of x axis.
141 |         allow_outside_center (bool): If this argument is :obj:`False`,
142 |             bounding boxes whose centers are outside of the cropped area
143 |             are removed. The default value is :obj:`True`.
144 |         return_param (bool): If :obj:`True`, this function returns
145 |             indices of kept bounding boxes.
146 | 
147 |     Returns:
148 |         ~numpy.ndarray or (~numpy.ndarray, dict):
149 | 
150 |         If :obj:`return_param = False`, returns an array :obj:`bbox`.
151 | 
152 |         If :obj:`return_param = True`,
153 |         returns a tuple whose elements are :obj:`bbox, param`.
154 |         :obj:`param` is a dictionary of intermediate parameters whose
155 |         contents are listed below with key, value-type and the description
156 |         of the value.
157 | 
158 |         * **index** (*numpy.ndarray*): An array holding indices of used \
159 |             bounding boxes.
160 | 
161 |     """
162 | 
163 |     t, b = _slice_to_bounds(y_slice)
164 |     l, r = _slice_to_bounds(x_slice)
165 |     crop_bb = np.array((t, l, b, r))
166 | 
167 |     if allow_outside_center:
168 |         mask = np.ones(bbox.shape[0], dtype=bool)
169 |     else:
170 |         center = (bbox[:, :2] + bbox[:, 2:]) / 2.0
171 |         mask = np.logical_and(crop_bb[:2] <= center, center < crop_bb[2:]) \
172 |             .all(axis=1)
173 | 
174 |     bbox = bbox.copy()
175 |     bbox[:, :2] = np.maximum(bbox[:, :2], crop_bb[:2])
176 |     bbox[:, 2:] = np.minimum(bbox[:, 2:], crop_bb[2:])
177 |     bbox[:, :2] -= crop_bb[:2]
178 |     bbox[:, 2:] -= crop_bb[:2]
179 | 
180 |     mask = np.logical_and(mask, (bbox[:, :2] < bbox[:, 2:]).all(axis=1))
181 |     bbox = bbox[mask]
182 | 
183 |     if return_param:
184 |         return bbox, {'index': np.flatnonzero(mask)}
185 |     else:
186 |         return bbox
187 | 
188 | 
189 | def _slice_to_bounds(slice_):
190 |     if slice_ is None:
191 |         return 0, np.inf
192 | 
193 |     if slice_.start is None:
194 |         l = 0
195 |     else:
196 |         l = slice_.start
197 | 
198 |     if slice_.stop is None:
199 |         u = np.inf
200 |     else:
201 |         u = slice_.stop
202 | 
203 |     return l, u
204 | 
205 | 
206 | def translate_bbox(bbox, y_offset=0, x_offset=0):
207 |     """Translate bounding boxes.
208 | 
209 |     This method is mainly used together with image transforms, such as padding
210 |     and cropping, which translates the left top point of the image from
211 |     coordinate :math:`(0, 0)` to coordinate
212 |     :math:`(y, x) = (y_{offset}, x_{offset})`.
213 | 
214 |     The bounding boxes are expected to be packed into a two dimensional
215 |     tensor of shape :math:`(R, 4)`, where :math:`R` is the number of
216 |     bounding boxes in the image. The second axis represents attributes of
217 |     the bounding box. They are :math:`(y_{min}, x_{min}, y_{max}, x_{max})`,
218 |     where the four attributes are coordinates of the top left and the
219 |     bottom right vertices.
220 | 
221 |     Args:
222 |         bbox (~numpy.ndarray): Bounding boxes to be transformed. The shape is
223 |             :math:`(R, 4)`. :math:`R` is the number of bounding boxes.
224 |         y_offset (int or float): The offset along y axis.
225 |         x_offset (int or float): The offset along x axis.
226 | 
227 |     Returns:
228 |         ~numpy.ndarray:
229 |         Bounding boxes translated according to the given offsets.
230 | 
231 |     """
232 | 
233 |     out_bbox = bbox.copy()
234 |     out_bbox[:, :2] += (y_offset, x_offset)
235 |     out_bbox[:, 2:] += (y_offset, x_offset)
236 | 
237 |     return out_bbox
238 | 
239 | 
240 | def random_flip(img, y_random=False, x_random=False,
241 |                 return_param=False, copy=False):
242 |     """Randomly flip an image in vertical or horizontal direction.
243 | 
244 |     Args:
245 |         img (~numpy.ndarray): An array that gets flipped. This is in
246 |             CHW format.
247 |         y_random (bool): Randomly flip in vertical direction.
248 |         x_random (bool): Randomly flip in horizontal direction.
249 |         return_param (bool): Returns information of flip.
250 |         copy (bool): If False, a view of :obj:`img` will be returned.
251 | 
252 |     Returns:
253 |         ~numpy.ndarray or (~numpy.ndarray, dict):
254 | 
255 |         If :obj:`return_param = False`,
256 |         returns an array :obj:`out_img` that is the result of flipping.
257 | 
258 |         If :obj:`return_param = True`,
259 |         returns a tuple whose elements are :obj:`out_img, param`.
260 |         :obj:`param` is a dictionary of intermediate parameters whose
261 |         contents are listed below with key, value-type and the description
262 |         of the value.
263 | 
264 |         * **y_flip** (*bool*): Whether the image was flipped in the\
265 |             vertical direction or not.
266 |         * **x_flip** (*bool*): Whether the image was flipped in the\
267 |             horizontal direction or not.
268 | 
269 |     """
270 |     y_flip, x_flip = False, False
271 |     if y_random:
272 |         y_flip = random.choice([True, False])
273 |     if x_random:
274 |         x_flip = random.choice([True, False])
275 | 
276 |     if y_flip:
277 |         img = img[:, ::-1, :]
278 |     if x_flip:
279 |         img = img[:, :, ::-1]
280 | 
281 |     if copy:
282 |         img = img.copy()
283 | 
284 |     if return_param:
285 |         return img, {'y_flip': y_flip, 'x_flip': x_flip}
286 |     else:
287 |         return img
288 | 


--------------------------------------------------------------------------------
/data/voc_dataset.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import xml.etree.ElementTree as ET
  3 | 
  4 | import numpy as np
  5 | 
  6 | from .util import read_image
  7 | 
  8 | 
  9 | class VOCBboxDataset:
 10 |     """Bounding box dataset for PASCAL `VOC`_.
 11 | 
 12 |     .. _`VOC`: http://host.robots.ox.ac.uk/pascal/VOC/voc2012/
 13 | 
 14 |     The index corresponds to each image.
 15 | 
 16 |     When queried by an index, if :obj:`return_difficult == False`,
 17 |     this dataset returns a corresponding
 18 |     :obj:`img, bbox, label`, a tuple of an image, bounding boxes and labels.
 19 |     This is the default behaviour.
 20 |     If :obj:`return_difficult == True`, this dataset returns corresponding
 21 |     :obj:`img, bbox, label, difficult`. :obj:`difficult` is a boolean array
 22 |     that indicates whether bounding boxes are labeled as difficult or not.
 23 | 
 24 |     The bounding boxes are packed into a two dimensional tensor of shape
 25 |     :math:`(R, 4)`, where :math:`R` is the number of bounding boxes in
 26 |     the image. The second axis represents attributes of the bounding box.
 27 |     They are :math:`(y_{min}, x_{min}, y_{max}, x_{max})`, where the
 28 |     four attributes are coordinates of the top left and the bottom right
 29 |     vertices.
 30 | 
 31 |     The labels are packed into a one dimensional tensor of shape :math:`(R,)`.
 32 |     :math:`R` is the number of bounding boxes in the image.
 33 |     The class name of the label :math:`l` is :math:`l` th element of
 34 |     :obj:`VOC_BBOX_LABEL_NAMES`.
 35 | 
 36 |     The array :obj:`difficult` is a one dimensional boolean array of shape
 37 |     :math:`(R,)`. :math:`R` is the number of bounding boxes in the image.
 38 |     If :obj:`use_difficult` is :obj:`False`, this array is
 39 |     a boolean array with all :obj:`False`.
 40 | 
 41 |     The type of the image, the bounding boxes and the labels are as follows.
 42 | 
 43 |     * :obj:`img.dtype == numpy.float32`
 44 |     * :obj:`bbox.dtype == numpy.float32`
 45 |     * :obj:`label.dtype == numpy.int32`
 46 |     * :obj:`difficult.dtype == numpy.bool`
 47 | 
 48 |     Args:
 49 |         data_dir (string): Path to the root of the training data. 
 50 |             i.e. "/data/image/voc/VOCdevkit/VOC2007/"
 51 |         split ({'train', 'val', 'trainval', 'test'}): Select a split of the
 52 |             dataset. :obj:`test` split is only available for
 53 |             2007 dataset.
 54 |         year ({'2007', '2012'}): Use a dataset prepared for a challenge
 55 |             held in :obj:`year`.
 56 |         use_difficult (bool): If :obj:`True`, use images that are labeled as
 57 |             difficult in the original annotation.
 58 |         return_difficult (bool): If :obj:`True`, this dataset returns
 59 |             a boolean array
 60 |             that indicates whether bounding boxes are labeled as difficult
 61 |             or not. The default value is :obj:`False`.
 62 | 
 63 |     """
 64 | 
 65 |     def __init__(self, data_dir, split='trainval',
 66 |                  use_difficult=False, return_difficult=False,
 67 |                  ):
 68 | 
 69 |         # if split not in ['train', 'trainval', 'val']:
 70 |         #     if not (split == 'test' and year == '2007'):
 71 |         #         warnings.warn(
 72 |         #             'please pick split from \'train\', \'trainval\', \'val\''
 73 |         #             'for 2012 dataset. For 2007 dataset, you can pick \'test\''
 74 |         #             ' in addition to the above mentioned splits.'
 75 |         #         )
 76 |         id_list_file = os.path.join(
 77 |             data_dir, 'ImageSets/Main/{0}.txt'.format(split))
 78 | 
 79 |         self.ids = [id_.strip() for id_ in open(id_list_file)]
 80 |         self.data_dir = data_dir
 81 |         self.use_difficult = use_difficult
 82 |         self.return_difficult = return_difficult
 83 |         self.label_names = VOC_BBOX_LABEL_NAMES
 84 | 
 85 |     def __len__(self):
 86 |         return len(self.ids)
 87 | 
 88 |     def get_example(self, i):
 89 |         """Returns the i-th example.
 90 | 
 91 |         Returns a color image and bounding boxes. The image is in CHW format.
 92 |         The returned image is RGB.
 93 | 
 94 |         Args:
 95 |             i (int): The index of the example.
 96 | 
 97 |         Returns:
 98 |             tuple of an image and bounding boxes
 99 | 
100 |         """
101 |         id_ = self.ids[i]
102 |         anno = ET.parse(
103 |             os.path.join(self.data_dir, 'Annotations', id_ + '.xml'))
104 |         bbox = list()
105 |         label = list()
106 |         difficult = list()
107 |         for obj in anno.findall('object'):
108 |             # when in not using difficult split, and the object is
109 |             # difficult, skipt it.
110 |             if not self.use_difficult and int(obj.find('difficult').text) == 1:
111 |                 continue
112 | 
113 |             difficult.append(int(obj.find('difficult').text))
114 |             bndbox_anno = obj.find('bndbox')
115 |             # subtract 1 to make pixel indexes 0-based
116 |             bbox.append([
117 |                 int(bndbox_anno.find(tag).text) - 1
118 |                 for tag in ('ymin', 'xmin', 'ymax', 'xmax')])
119 |             name = obj.find('name').text.lower().strip()
120 |             label.append(VOC_BBOX_LABEL_NAMES.index(name))
121 |         bbox = np.stack(bbox).astype(np.float32)
122 |         label = np.stack(label).astype(np.int32)
123 |         # When `use_difficult==False`, all elements in `difficult` are False.
124 |         difficult = np.array(difficult, dtype=np.bool).astype(np.uint8)  # PyTorch don't support np.bool
125 | 
126 |         # Load a image
127 |         img_file = os.path.join(self.data_dir, 'JPEGImages', id_ + '.jpg')
128 |         img = read_image(img_file, color=True)
129 | 
130 |         # if self.return_difficult:
131 |         #     return img, bbox, label, difficult
132 |         return img, bbox, label, difficult
133 | 
134 |     __getitem__ = get_example
135 | 
136 | 
137 | VOC_BBOX_LABEL_NAMES = (
138 |     'aeroplane',
139 |     'bicycle',
140 |     'bird',
141 |     'boat',
142 |     'bottle',
143 |     'bus',
144 |     'car',
145 |     'cat',
146 |     'chair',
147 |     'cow',
148 |     'diningtable',
149 |     'dog',
150 |     'horse',
151 |     'motorbike',
152 |     'person',
153 |     'pottedplant',
154 |     'sheep',
155 |     'sofa',
156 |     'train',
157 |     'tvmonitor')
158 | 


--------------------------------------------------------------------------------
/epochs/netG_epoch_4_100.pth:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samirsen/small-object-detection/30d402a09a5a01d0f365b8d5b4593544a11e90ec/epochs/netG_epoch_4_100.pth


--------------------------------------------------------------------------------
/inference.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import argparse
 3 | import time
 4 | 
 5 | import torch
 6 | from PIL import Image
 7 | from torch.autograd import Variable
 8 | from torchvision.transforms import ToTensor, ToPILImage
 9 | 
10 | 
11 | from utils.config import Config
12 | from model import FasterRCNNVGG16
13 | from trainer import FasterRCNNTrainer
14 | from data.util import  read_image
15 | from utils.vis_tool import vis_bbox
16 | from utils import array_tool as at
17 | 
18 | from SRGAN import Generator
19 | 
20 | parser = argparse.ArgumentParser(description='Test Single Image')
21 | parser.add_argument('--upscale_factor', default=4, type=int, help='super resolution upscale factor')
22 | parser.add_argument('--test_mode', default='GPU', type=str, choices=['GPU', 'CPU'], help='using GPU or CPU')
23 | parser.add_argument('--model_name', default='netG_epoch_4_100.pth', type=str, help='generator model epoch name')
24 | opt = parser.parse_args()
25 | 
26 | FasterRCNNOpt = Config()
27 | 
28 | UPSCALE_FACTOR = opt.upscale_factor
29 | TEST_MODE = True if opt.test_mode == 'GPU' else False
30 | MODEL_NAME = opt.model_name
31 | 
32 | gan_model = Generator(UPSCALE_FACTOR).eval()
33 | faster_rcnn = FasterRCNNVGG16()
34 | trainer = FasterRCNNTrainer(faster_rcnn)
35 | 
36 | 
37 | if TEST_MODE:
38 |     gan_model.cuda()
39 |     trainer.cuda()
40 |     gan_model.load_state_dict(torch.load('epochs/' + MODEL_NAME))
41 |     # trainer.load('epochs/samir_fast_rcnn_epoch60.pth')
42 | 
43 | else:
44 |     gan_model.load_state_dict(torch.load('epochs/' + MODEL_NAME, map_location=lambda storage, loc: storage))
45 |     # trainer.load('epochs/samir_fast_rcnn_epoch60.pth')
46 | 
47 | image = read_image('misc/demo.jpg')
48 | image = Variable(ToTensor()(image), volatile=True).unsqueeze(0)
49 | if TEST_MODE:
50 |     image = image.cuda()
51 | 
52 | start = time.clock()
53 | out = gan_model(image)
54 | out_img = ToPILImage()(out[0].data.cpu())
55 | out_img.save('out_srf_' + str(UPSCALE_FACTOR) + '_' + IMAGE_NAME)
56 | 
57 | # _bboxes, _labels, _scores = trainer.faster_rcnn.predict(out_img, visualize=True)
58 | # ax = vis_bbox(at.tonumpy(img[0]),
59 | #          at.tonumpy(_bboxes[0]),
60 | #          at.tonumpy(_labels[0]).reshape(-1),
61 | #          at.tonumpy(_scores[0]).reshape(-1))
62 | #
63 | # plt.show()
64 | 
65 | elapsed = (time.clock() - start)
66 | print('cost' + str(elapsed) + 's')
67 | # out_img = ToPILImage()(out[0].data.cpu())
68 | # out_img.save('out_srf_' + str(UPSCALE_FACTOR) + '_' + IMAGE_NAME)
69 | 


--------------------------------------------------------------------------------
/misc/convert_caffe_pretrain.py:
--------------------------------------------------------------------------------
 1 | # code from ruotian luo
 2 | # https://github.com/ruotianluo/pytorch-faster-rcnn
 3 | import torch
 4 | from torch.utils.model_zoo import load_url
 5 | from torchvision import models
 6 | 
 7 | sd = load_url("https://s3-us-west-2.amazonaws.com/jcjohns-models/vgg16-00b39a1b.pth")
 8 | sd['classifier.0.weight'] = sd['classifier.1.weight']
 9 | sd['classifier.0.bias'] = sd['classifier.1.bias']
10 | del sd['classifier.1.weight']
11 | del sd['classifier.1.bias']
12 | 
13 | sd['classifier.3.weight'] = sd['classifier.4.weight']
14 | sd['classifier.3.bias'] = sd['classifier.4.bias']
15 | del sd['classifier.4.weight']
16 | del sd['classifier.4.bias']
17 | 
18 | import  os
19 | # speicify the path to save
20 | if not os.path.exists('checkpoints'):
21 |     os.makedirs('checkpoints')
22 | torch.save(sd, "checkpoints/vgg16_caffe.pth")


--------------------------------------------------------------------------------
/misc/demo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samirsen/small-object-detection/30d402a09a5a01d0f365b8d5b4593544a11e90ec/misc/demo.jpg


--------------------------------------------------------------------------------
/misc/train_fast.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | 
  3 | import ipdb
  4 | import matplotlib
  5 | from tqdm import tqdm
  6 | 
  7 | from utils.config import opt
  8 | from data.dataset import Dataset, TestDataset
  9 | from model import FasterRCNNVGG16
 10 | from torch.utils import data as data_
 11 | from trainer import FasterRCNNTrainer
 12 | from utils import array_tool as at
 13 | from utils.vis_tool import visdom_bbox
 14 | from utils.eval_tool import eval_detection_voc
 15 | 
 16 | matplotlib.use('agg')
 17 | 
 18 | def eval(dataloader, faster_rcnn, test_num=10000):
 19 |     pred_bboxes, pred_labels, pred_scores = list(), list(), list()
 20 |     gt_bboxes, gt_labels, gt_difficults = list(), list(), list()
 21 |     for ii, (imgs, sizes, gt_bboxes_, gt_labels_, gt_difficults_) in tqdm(enumerate(dataloader)):
 22 |         sizes = [sizes[0][0], sizes[1][0]]
 23 |         pred_bboxes_, pred_labels_, pred_scores_ = faster_rcnn.predict(imgs, [sizes])
 24 |         gt_bboxes += list(gt_bboxes_.numpy())
 25 |         gt_labels += list(gt_labels_.numpy())
 26 |         gt_difficults += list(gt_difficults_.numpy())
 27 |         pred_bboxes += pred_bboxes_
 28 |         pred_labels += pred_labels_
 29 |         pred_scores += pred_scores_
 30 |         if ii == test_num: break
 31 | 
 32 |     result = eval_detection_voc(
 33 |         pred_bboxes, pred_labels, pred_scores,
 34 |         gt_bboxes, gt_labels, gt_difficults,
 35 |         use_07_metric=True)
 36 |     return result
 37 | 
 38 | 
 39 | def train(**kwargs):
 40 |     opt._parse(kwargs)
 41 | 
 42 |     dataset = Dataset(opt)
 43 |     print('load data')
 44 |     dataloader = data_.DataLoader(dataset, \
 45 |                                   batch_size=1, \
 46 |                                   shuffle=True, \
 47 |                                   # pin_memory=True,
 48 |                                   num_workers=opt.num_workers)
 49 |     testset = TestDataset(opt)
 50 |     test_dataloader = data_.DataLoader(testset,
 51 |                                        batch_size=1,
 52 |                                        num_workers=2,
 53 |                                        shuffle=False, \
 54 |                                        # pin_memory=True
 55 |                                        )
 56 |     faster_rcnn = FasterRCNNVGG16()
 57 |     print('model construct completed')
 58 |     trainer = FasterRCNNTrainer(faster_rcnn).cuda()
 59 |     if opt.load_path:
 60 |         trainer.load(opt.load_path)
 61 |         print('load pretrained model from %s' % opt.load_path)
 62 | 
 63 |     trainer.vis.text(dataset.db.label_names, win='labels')
 64 |     best_map = 0
 65 |     for epoch in range(7):
 66 |         trainer.reset_meters()
 67 |         for ii, (img, bbox_, label_, scale, ori_img) in tqdm(enumerate(dataloader)):
 68 |             scale = at.scalar(scale)
 69 |             img, bbox, label = img.cuda().float(), bbox_.cuda(), label_.cuda()
 70 |             losses = trainer.train_step(img, bbox, label, scale)
 71 | 
 72 |             if (ii + 1) % opt.plot_every == 0:
 73 |                 if os.path.exists(opt.debug_file):
 74 |                     ipdb.set_trace()
 75 | 
 76 |                 # plot loss
 77 |                 trainer.vis.plot_many(trainer.get_meter_data())
 78 | 
 79 |                 # plot groud truth bboxes
 80 |                 ori_img_ = (img * 0.225 + 0.45).clamp(min=0, max=1) * 255
 81 |                 gt_img = visdom_bbox(at.tonumpy(ori_img_)[0], 
 82 |                                     at.tonumpy(bbox_)[0], 
 83 |                                     label_[0].numpy())
 84 |                 trainer.vis.img('gt_img', gt_img)
 85 | 
 86 |                 # plot predicti bboxes
 87 |                 _bboxes, _labels, _scores = trainer.faster_rcnn.predict(ori_img,visualize=True)
 88 |                 pred_img = visdom_bbox( at.tonumpy(ori_img[0]), 
 89 |                                         at.tonumpy(_bboxes[0]),
 90 |                                         at.tonumpy(_labels[0]).reshape(-1), 
 91 |                                         at.tonumpy(_scores[0]))
 92 |                 trainer.vis.img('pred_img', pred_img)
 93 | 
 94 |                 # rpn confusion matrix(meter)
 95 |                 trainer.vis.text(str(trainer.rpn_cm.value().tolist()), win='rpn_cm')
 96 |                 # roi confusion matrix
 97 |                 trainer.vis.img('roi_cm', at.totensor(trainer.roi_cm.conf, False).float())
 98 |         if epoch==4:
 99 |             trainer.faster_rcnn.scale_lr(opt.lr_decay)
100 | 
101 |     eval_result = eval(test_dataloader, faster_rcnn, test_num=1e100)
102 |     print('eval_result')
103 |     trainer.save(mAP=eval_result['map'])
104 | 
105 | if __name__ == '__main__':
106 |     import fire
107 | 
108 |     fire.Fire()
109 | 


--------------------------------------------------------------------------------
/model/__init__.py:
--------------------------------------------------------------------------------
1 | from model.faster_rcnn_vgg16 import FasterRCNNVGG16
2 | 


--------------------------------------------------------------------------------
/model/faster_rcnn.py:
--------------------------------------------------------------------------------
  1 | from __future__ import  absolute_import
  2 | from __future__ import division
  3 | import torch as t
  4 | import numpy as np
  5 | import cupy as cp
  6 | from utils import array_tool as at
  7 | from model.utils.bbox_tools import loc2bbox
  8 | from model.utils.nms import non_maximum_suppression
  9 | 
 10 | from torch import nn
 11 | from data.dataset import preprocess
 12 | from torch.nn import functional as F
 13 | from utils.config import opt
 14 | 
 15 | 
 16 | def nograd(f):
 17 |     def new_f(*args,**kwargs):
 18 |         with t.no_grad():
 19 |            return f(*args,**kwargs)
 20 |     return new_f
 21 | 
 22 | class FasterRCNN(nn.Module):
 23 |     """Base class for Faster R-CNN.
 24 | 
 25 |     This is a base class for Faster R-CNN links supporting object detection
 26 |     API [#]_. The following three stages constitute Faster R-CNN.
 27 | 
 28 |     1. **Feature extraction**: Images are taken and their \
 29 |         feature maps are calculated.
 30 |     2. **Region Proposal Networks**: Given the feature maps calculated in \
 31 |         the previous stage, produce set of RoIs around objects.
 32 |     3. **Localization and Classification Heads**: Using feature maps that \
 33 |         belong to the proposed RoIs, classify the categories of the objects \
 34 |         in the RoIs and improve localizations.
 35 | 
 36 |     Each stage is carried out by one of the callable
 37 |     :class:`torch.nn.Module` objects :obj:`feature`, :obj:`rpn` and :obj:`head`.
 38 | 
 39 |     There are two functions :meth:`predict` and :meth:`__call__` to conduct
 40 |     object detection.
 41 |     :meth:`predict` takes images and returns bounding boxes that are converted
 42 |     to image coordinates. This will be useful for a scenario when
 43 |     Faster R-CNN is treated as a black box function, for instance.
 44 |     :meth:`__call__` is provided for a scnerario when intermediate outputs
 45 |     are needed, for instance, for training and debugging.
 46 | 
 47 |     Links that support obejct detection API have method :meth:`predict` with
 48 |     the same interface. Please refer to :meth:`predict` for
 49 |     further details.
 50 | 
 51 |     .. [#] Shaoqing Ren, Kaiming He, Ross Girshick, Jian Sun. \
 52 |     Faster R-CNN: Towards Real-Time Object Detection with \
 53 |     Region Proposal Networks. NIPS 2015.
 54 | 
 55 |     Args:
 56 |         extractor (nn.Module): A module that takes a BCHW image
 57 |             array and returns feature maps.
 58 |         rpn (nn.Module): A module that has the same interface as
 59 |             :class:`model.region_proposal_network.RegionProposalNetwork`.
 60 |             Please refer to the documentation found there.
 61 |         head (nn.Module): A module that takes
 62 |             a BCHW variable, RoIs and batch indices for RoIs. This returns class
 63 |             dependent localization paramters and class scores.
 64 |         loc_normalize_mean (tuple of four floats): Mean values of
 65 |             localization estimates.
 66 |         loc_normalize_std (tupler of four floats): Standard deviation
 67 |             of localization estimates.
 68 | 
 69 |     """
 70 | 
 71 |     def __init__(self, extractor, rpn, head,
 72 |                 loc_normalize_mean = (0., 0., 0., 0.),
 73 |                 loc_normalize_std = (0.1, 0.1, 0.2, 0.2)
 74 |     ):
 75 |         super(FasterRCNN, self).__init__()
 76 |         self.extractor = extractor
 77 |         self.rpn = rpn
 78 |         self.head = head
 79 | 
 80 |         # mean and std
 81 |         self.loc_normalize_mean = loc_normalize_mean
 82 |         self.loc_normalize_std = loc_normalize_std
 83 |         self.use_preset('evaluate')
 84 | 
 85 |     @property
 86 |     def n_class(self):
 87 |         # Total number of classes including the background.
 88 |         return self.head.n_class
 89 | 
 90 |     def forward(self, x, scale=1.):
 91 |         """Forward Faster R-CNN.
 92 | 
 93 |         Scaling paramter :obj:`scale` is used by RPN to determine the
 94 |         threshold to select small objects, which are going to be
 95 |         rejected irrespective of their confidence scores.
 96 | 
 97 |         Here are notations used.
 98 | 
 99 |         * :math:`N` is the number of batch size
100 |         * :math:`R'` is the total number of RoIs produced across batches. \
101 |             Given :math:`R_i` proposed RoIs from the :math:`i` th image, \
102 |             :math:`R' = \\sum _{i=1} ^ N R_i`.
103 |         * :math:`L` is the number of classes excluding the background.
104 | 
105 |         Classes are ordered by the background, the first class, ..., and
106 |         the :math:`L` th class.
107 | 
108 |         Args:
109 |             x (autograd.Variable): 4D image variable.
110 |             scale (float): Amount of scaling applied to the raw image
111 |                 during preprocessing.
112 | 
113 |         Returns:
114 |             Variable, Variable, array, array:
115 |             Returns tuple of four values listed below.
116 | 
117 |             * **roi_cls_locs**: Offsets and scalings for the proposed RoIs. \
118 |                 Its shape is :math:`(R', (L + 1) \\times 4)`.
119 |             * **roi_scores**: Class predictions for the proposed RoIs. \
120 |                 Its shape is :math:`(R', L + 1)`.
121 |             * **rois**: RoIs proposed by RPN. Its shape is \
122 |                 :math:`(R', 4)`.
123 |             * **roi_indices**: Batch indices of RoIs. Its shape is \
124 |                 :math:`(R',)`.
125 | 
126 |         """
127 |         img_size = x.shape[2:]
128 | 
129 |         h = self.extractor(x)
130 |         rpn_locs, rpn_scores, rois, roi_indices, anchor = \
131 |             self.rpn(h, img_size, scale)
132 |         roi_cls_locs, roi_scores = self.head(
133 |             h, rois, roi_indices)
134 |         return roi_cls_locs, roi_scores, rois, roi_indices
135 | 
136 |     def use_preset(self, preset):
137 |         """Use the given preset during prediction.
138 | 
139 |         This method changes values of :obj:`self.nms_thresh` and
140 |         :obj:`self.score_thresh`. These values are a threshold value
141 |         used for non maximum suppression and a threshold value
142 |         to discard low confidence proposals in :meth:`predict`,
143 |         respectively.
144 | 
145 |         If the attributes need to be changed to something
146 |         other than the values provided in the presets, please modify
147 |         them by directly accessing the public attributes.
148 | 
149 |         Args:
150 |             preset ({'visualize', 'evaluate'): A string to determine the
151 |                 preset to use.
152 | 
153 |         """
154 |         if preset == 'visualize':
155 |             self.nms_thresh = 0.3
156 |             self.score_thresh = 0.7
157 |         elif preset == 'evaluate':
158 |             self.nms_thresh = 0.3
159 |             self.score_thresh = 0.05
160 |         else:
161 |             raise ValueError('preset must be visualize or evaluate')
162 | 
163 |     def _suppress(self, raw_cls_bbox, raw_prob):
164 |         bbox = list()
165 |         label = list()
166 |         score = list()
167 |         # skip cls_id = 0 because it is the background class
168 |         for l in range(1, self.n_class):
169 |             cls_bbox_l = raw_cls_bbox.reshape((-1, self.n_class, 4))[:, l, :]
170 |             prob_l = raw_prob[:, l]
171 |             mask = prob_l > self.score_thresh
172 |             cls_bbox_l = cls_bbox_l[mask]
173 |             prob_l = prob_l[mask]
174 |             keep = non_maximum_suppression(
175 |                 cp.array(cls_bbox_l), self.nms_thresh, prob_l)
176 |             keep = cp.asnumpy(keep)
177 |             bbox.append(cls_bbox_l[keep])
178 |             # The labels are in [0, self.n_class - 2].
179 |             label.append((l - 1) * np.ones((len(keep),)))
180 |             score.append(prob_l[keep])
181 |         bbox = np.concatenate(bbox, axis=0).astype(np.float32)
182 |         label = np.concatenate(label, axis=0).astype(np.int32)
183 |         score = np.concatenate(score, axis=0).astype(np.float32)
184 |         return bbox, label, score
185 | 
186 |     @nograd
187 |     def predict(self, imgs,sizes=None,visualize=False):
188 |         """Detect objects from images.
189 | 
190 |         This method predicts objects for each image.
191 | 
192 |         Args:
193 |             imgs (iterable of numpy.ndarray): Arrays holding images.
194 |                 All images are in CHW and RGB format
195 |                 and the range of their value is :math:`[0, 255]`.
196 | 
197 |         Returns:
198 |            tuple of lists:
199 |            This method returns a tuple of three lists,
200 |            :obj:`(bboxes, labels, scores)`.
201 | 
202 |            * **bboxes**: A list of float arrays of shape :math:`(R, 4)`, \
203 |                where :math:`R` is the number of bounding boxes in a image. \
204 |                Each bouding box is organized by \
205 |                :math:`(y_{min}, x_{min}, y_{max}, x_{max})` \
206 |                in the second axis.
207 |            * **labels** : A list of integer arrays of shape :math:`(R,)`. \
208 |                Each value indicates the class of the bounding box. \
209 |                Values are in range :math:`[0, L - 1]`, where :math:`L` is the \
210 |                number of the foreground classes.
211 |            * **scores** : A list of float arrays of shape :math:`(R,)`. \
212 |                Each value indicates how confident the prediction is.
213 | 
214 |         """
215 |         self.eval()
216 |         if visualize:
217 |             self.use_preset('visualize')
218 |             prepared_imgs = list()
219 |             sizes = list()
220 |             for img in imgs:
221 |                 size = img.shape[1:]
222 |                 img = preprocess(at.tonumpy(img))
223 |                 prepared_imgs.append(img)
224 |                 sizes.append(size)
225 |         else:
226 |              prepared_imgs = imgs 
227 |         bboxes = list()
228 |         labels = list()
229 |         scores = list()
230 |         for img, size in zip(prepared_imgs, sizes):
231 |             img = at.totensor(img[None]).float()
232 |             scale = img.shape[3] / size[1]
233 |             roi_cls_loc, roi_scores, rois, _ = self(img, scale=scale)
234 |             # We are assuming that batch size is 1.
235 |             roi_score = roi_scores.data
236 |             roi_cls_loc = roi_cls_loc.data
237 |             roi = at.totensor(rois) / scale
238 | 
239 |             # Convert predictions to bounding boxes in image coordinates.
240 |             # Bounding boxes are scaled to the scale of the input images.
241 |             mean = t.Tensor(self.loc_normalize_mean).cuda(). \
242 |                 repeat(self.n_class)[None]
243 |             std = t.Tensor(self.loc_normalize_std).cuda(). \
244 |                 repeat(self.n_class)[None]
245 | 
246 |             roi_cls_loc = (roi_cls_loc * std + mean)
247 |             roi_cls_loc = roi_cls_loc.view(-1, self.n_class, 4)
248 |             roi = roi.view(-1, 1, 4).expand_as(roi_cls_loc)
249 |             cls_bbox = loc2bbox(at.tonumpy(roi).reshape((-1, 4)),
250 |                                 at.tonumpy(roi_cls_loc).reshape((-1, 4)))
251 |             cls_bbox = at.totensor(cls_bbox)
252 |             cls_bbox = cls_bbox.view(-1, self.n_class * 4)
253 |             # clip bounding box
254 |             cls_bbox[:, 0::2] = (cls_bbox[:, 0::2]).clamp(min=0, max=size[0])
255 |             cls_bbox[:, 1::2] = (cls_bbox[:, 1::2]).clamp(min=0, max=size[1])
256 | 
257 |             prob = at.tonumpy(F.softmax(at.totensor(roi_score), dim=1))
258 | 
259 |             raw_cls_bbox = at.tonumpy(cls_bbox)
260 |             raw_prob = at.tonumpy(prob)
261 | 
262 |             bbox, label, score = self._suppress(raw_cls_bbox, raw_prob)
263 |             bboxes.append(bbox)
264 |             labels.append(label)
265 |             scores.append(score)
266 | 
267 |         self.use_preset('evaluate')
268 |         self.train()
269 |         return bboxes, labels, scores
270 | 
271 |     def get_optimizer(self):
272 |         """
273 |         return optimizer, It could be overwriten if you want to specify 
274 |         special optimizer
275 |         """
276 |         lr = opt.lr
277 |         params = []
278 |         for key, value in dict(self.named_parameters()).items():
279 |             if value.requires_grad:
280 |                 if 'bias' in key:
281 |                     params += [{'params': [value], 'lr': lr * 2, 'weight_decay': 0}]
282 |                 else:
283 |                     params += [{'params': [value], 'lr': lr, 'weight_decay': opt.weight_decay}]
284 |         if opt.use_adam:
285 |             self.optimizer = t.optim.Adam(params)
286 |         else:
287 |             self.optimizer = t.optim.SGD(params, momentum=0.9)
288 |         return self.optimizer
289 | 
290 |     def scale_lr(self, decay=0.1):
291 |         for param_group in self.optimizer.param_groups:
292 |             param_group['lr'] *= decay
293 |         return self.optimizer
294 | 
295 | 
296 | 
297 | 
298 | 


--------------------------------------------------------------------------------
/model/faster_rcnn_vgg16.py:
--------------------------------------------------------------------------------
  1 | from __future__ import  absolute_import
  2 | import torch as t
  3 | from torch import nn
  4 | from torchvision.models import vgg16
  5 | from model.region_proposal_network import RegionProposalNetwork
  6 | from model.faster_rcnn import FasterRCNN
  7 | from model.roi_module import RoIPooling2D
  8 | from utils import array_tool as at
  9 | from utils.config import opt
 10 | 
 11 | 
 12 | def decom_vgg16():
 13 |     # the 30th layer of features is relu of conv5_3
 14 |     if opt.caffe_pretrain:
 15 |         model = vgg16(pretrained=False)
 16 |         if not opt.load_path:
 17 |             model.load_state_dict(t.load(opt.caffe_pretrain_path))
 18 |     else:
 19 |         model = vgg16(not opt.load_path)
 20 | 
 21 |     features = list(model.features)[:30]
 22 |     classifier = model.classifier
 23 | 
 24 |     classifier = list(classifier)
 25 |     del classifier[6]
 26 |     if not opt.use_drop:
 27 |         del classifier[5]
 28 |         del classifier[2]
 29 |     classifier = nn.Sequential(*classifier)
 30 | 
 31 |     # freeze top4 conv
 32 |     for layer in features[:10]:
 33 |         for p in layer.parameters():
 34 |             p.requires_grad = False
 35 | 
 36 |     return nn.Sequential(*features), classifier
 37 | 
 38 | 
 39 | class FasterRCNNVGG16(FasterRCNN):
 40 |     """Faster R-CNN based on VGG-16.
 41 |     For descriptions on the interface of this model, please refer to
 42 |     :class:`model.faster_rcnn.FasterRCNN`.
 43 | 
 44 |     Args:
 45 |         n_fg_class (int): The number of classes excluding the background.
 46 |         ratios (list of floats): This is ratios of width to height of
 47 |             the anchors.
 48 |         anchor_scales (list of numbers): This is areas of anchors.
 49 |             Those areas will be the product of the square of an element in
 50 |             :obj:`anchor_scales` and the original area of the reference
 51 |             window.
 52 | 
 53 |     """
 54 | 
 55 |     feat_stride = 16  # downsample 16x for output of conv5 in vgg16
 56 | 
 57 |     def __init__(self,
 58 |                  n_fg_class=20,
 59 |                  ratios=[0.5, 1, 2],
 60 |                  anchor_scales=[8, 16, 32]
 61 |                  ):
 62 |                  
 63 |         extractor, classifier = decom_vgg16()
 64 | 
 65 |         rpn = RegionProposalNetwork(
 66 |             512, 512,
 67 |             ratios=ratios,
 68 |             anchor_scales=anchor_scales,
 69 |             feat_stride=self.feat_stride,
 70 |         )
 71 | 
 72 |         head = VGG16RoIHead(
 73 |             n_class=n_fg_class + 1,
 74 |             roi_size=7,
 75 |             spatial_scale=(1. / self.feat_stride),
 76 |             classifier=classifier
 77 |         )
 78 | 
 79 |         super(FasterRCNNVGG16, self).__init__(
 80 |             extractor,
 81 |             rpn,
 82 |             head,
 83 |         )
 84 | 
 85 | 
 86 | class VGG16RoIHead(nn.Module):
 87 |     """Faster R-CNN Head for VGG-16 based implementation.
 88 |     This class is used as a head for Faster R-CNN.
 89 |     This outputs class-wise localizations and classification based on feature
 90 |     maps in the given RoIs.
 91 |     
 92 |     Args:
 93 |         n_class (int): The number of classes possibly including the background.
 94 |         roi_size (int): Height and width of the feature maps after RoI-pooling.
 95 |         spatial_scale (float): Scale of the roi is resized.
 96 |         classifier (nn.Module): Two layer Linear ported from vgg16
 97 | 
 98 |     """
 99 | 
100 |     def __init__(self, n_class, roi_size, spatial_scale,
101 |                  classifier):
102 |         # n_class includes the background
103 |         super(VGG16RoIHead, self).__init__()
104 | 
105 |         self.classifier = classifier
106 |         self.cls_loc = nn.Linear(4096, n_class * 4)
107 |         self.score = nn.Linear(4096, n_class)
108 | 
109 |         normal_init(self.cls_loc, 0, 0.001)
110 |         normal_init(self.score, 0, 0.01)
111 | 
112 |         self.n_class = n_class
113 |         self.roi_size = roi_size
114 |         self.spatial_scale = spatial_scale
115 |         self.roi = RoIPooling2D(self.roi_size, self.roi_size, self.spatial_scale)
116 | 
117 |     def forward(self, x, rois, roi_indices):
118 |         """Forward the chain.
119 | 
120 |         We assume that there are :math:`N` batches.
121 | 
122 |         Args:
123 |             x (Variable): 4D image variable.
124 |             rois (Tensor): A bounding box array containing coordinates of
125 |                 proposal boxes.  This is a concatenation of bounding box
126 |                 arrays from multiple images in the batch.
127 |                 Its shape is :math:`(R', 4)`. Given :math:`R_i` proposed
128 |                 RoIs from the :math:`i` th image,
129 |                 :math:`R' = \\sum _{i=1} ^ N R_i`.
130 |             roi_indices (Tensor): An array containing indices of images to
131 |                 which bounding boxes correspond to. Its shape is :math:`(R',)`.
132 | 
133 |         """
134 |         # in case roi_indices is  ndarray
135 |         roi_indices = at.totensor(roi_indices).float()
136 |         rois = at.totensor(rois).float()
137 |         indices_and_rois = t.cat([roi_indices[:, None], rois], dim=1)
138 |         # NOTE: important: yx->xy
139 |         xy_indices_and_rois = indices_and_rois[:, [0, 2, 1, 4, 3]]
140 |         indices_and_rois =  xy_indices_and_rois.contiguous()
141 | 
142 |         pool = self.roi(x, indices_and_rois)
143 |         pool = pool.view(pool.size(0), -1)
144 |         fc7 = self.classifier(pool)
145 |         roi_cls_locs = self.cls_loc(fc7)
146 |         roi_scores = self.score(fc7)
147 |         return roi_cls_locs, roi_scores
148 | 
149 | 
150 | def normal_init(m, mean, stddev, truncated=False):
151 |     """
152 |     weight initalizer: truncated normal and random normal.
153 |     """
154 |     # x is a parameter
155 |     if truncated:
156 |         m.weight.data.normal_().fmod_(2).mul_(stddev).add_(mean)  # not a perfect approximation
157 |     else:
158 |         m.weight.data.normal_(mean, stddev)
159 |         m.bias.data.zero_()
160 | 


--------------------------------------------------------------------------------
/model/region_proposal_network.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | from torch.nn import functional as F
  3 | import torch as t
  4 | from torch import nn
  5 | 
  6 | from model.utils.bbox_tools import generate_anchor_base
  7 | from model.utils.creator_tool import ProposalCreator
  8 | 
  9 | 
 10 | class RegionProposalNetwork(nn.Module):
 11 |     """Region Proposal Network introduced in Faster R-CNN.
 12 | 
 13 |     This is Region Proposal Network introduced in Faster R-CNN [#]_.
 14 |     This takes features extracted from images and propose
 15 |     class agnostic bounding boxes around "objects".
 16 | 
 17 |     .. [#] Shaoqing Ren, Kaiming He, Ross Girshick, Jian Sun. \
 18 |     Faster R-CNN: Towards Real-Time Object Detection with \
 19 |     Region Proposal Networks. NIPS 2015.
 20 | 
 21 |     Args:
 22 |         in_channels (int): The channel size of input.
 23 |         mid_channels (int): The channel size of the intermediate tensor.
 24 |         ratios (list of floats): This is ratios of width to height of
 25 |             the anchors.
 26 |         anchor_scales (list of numbers): This is areas of anchors.
 27 |             Those areas will be the product of the square of an element in
 28 |             :obj:`anchor_scales` and the original area of the reference
 29 |             window.
 30 |         feat_stride (int): Stride size after extracting features from an
 31 |             image.
 32 |         initialW (callable): Initial weight value. If :obj:`None` then this
 33 |             function uses Gaussian distribution scaled by 0.1 to
 34 |             initialize weight.
 35 |             May also be a callable that takes an array and edits its values.
 36 |         proposal_creator_params (dict): Key valued paramters for
 37 |             :class:`model.utils.creator_tools.ProposalCreator`.
 38 | 
 39 |     .. seealso::
 40 |         :class:`~model.utils.creator_tools.ProposalCreator`
 41 | 
 42 |     """
 43 | 
 44 |     def __init__(
 45 |             self, in_channels=512, mid_channels=512, ratios=[0.5, 1, 2],
 46 |             anchor_scales=[8, 16, 32], feat_stride=16,
 47 |             proposal_creator_params=dict(),
 48 |     ):
 49 |         super(RegionProposalNetwork, self).__init__()
 50 |         self.anchor_base = generate_anchor_base(
 51 |             anchor_scales=anchor_scales, ratios=ratios)
 52 |         self.feat_stride = feat_stride
 53 |         self.proposal_layer = ProposalCreator(self, **proposal_creator_params)
 54 |         n_anchor = self.anchor_base.shape[0]
 55 |         self.conv1 = nn.Conv2d(in_channels, mid_channels, 3, 1, 1)
 56 |         self.score = nn.Conv2d(mid_channels, n_anchor * 2, 1, 1, 0)
 57 |         self.loc = nn.Conv2d(mid_channels, n_anchor * 4, 1, 1, 0)
 58 |         normal_init(self.conv1, 0, 0.01)
 59 |         normal_init(self.score, 0, 0.01)
 60 |         normal_init(self.loc, 0, 0.01)
 61 | 
 62 |     def forward(self, x, img_size, scale=1.):
 63 |         """Forward Region Proposal Network.
 64 | 
 65 |         Here are notations.
 66 | 
 67 |         * :math:`N` is batch size.
 68 |         * :math:`C` channel size of the input.
 69 |         * :math:`H` and :math:`W` are height and witdh of the input feature.
 70 |         * :math:`A` is number of anchors assigned to each pixel.
 71 | 
 72 |         Args:
 73 |             x (~torch.autograd.Variable): The Features extracted from images.
 74 |                 Its shape is :math:`(N, C, H, W)`.
 75 |             img_size (tuple of ints): A tuple :obj:`height, width`,
 76 |                 which contains image size after scaling.
 77 |             scale (float): The amount of scaling done to the input images after
 78 |                 reading them from files.
 79 | 
 80 |         Returns:
 81 |             (~torch.autograd.Variable, ~torch.autograd.Variable, array, array, array):
 82 | 
 83 |             This is a tuple of five following values.
 84 | 
 85 |             * **rpn_locs**: Predicted bounding box offsets and scales for \
 86 |                 anchors. Its shape is :math:`(N, H W A, 4)`.
 87 |             * **rpn_scores**:  Predicted foreground scores for \
 88 |                 anchors. Its shape is :math:`(N, H W A, 2)`.
 89 |             * **rois**: A bounding box array containing coordinates of \
 90 |                 proposal boxes.  This is a concatenation of bounding box \
 91 |                 arrays from multiple images in the batch. \
 92 |                 Its shape is :math:`(R', 4)`. Given :math:`R_i` predicted \
 93 |                 bounding boxes from the :math:`i` th image, \
 94 |                 :math:`R' = \\sum _{i=1} ^ N R_i`.
 95 |             * **roi_indices**: An array containing indices of images to \
 96 |                 which RoIs correspond to. Its shape is :math:`(R',)`.
 97 |             * **anchor**: Coordinates of enumerated shifted anchors. \
 98 |                 Its shape is :math:`(H W A, 4)`.
 99 | 
100 |         """
101 |         n, _, hh, ww = x.shape
102 |         anchor = _enumerate_shifted_anchor(
103 |             np.array(self.anchor_base),
104 |             self.feat_stride, hh, ww)
105 | 
106 |         n_anchor = anchor.shape[0] // (hh * ww)
107 |         h = F.relu(self.conv1(x))
108 | 
109 |         rpn_locs = self.loc(h)
110 |         # UNNOTE: check whether need contiguous
111 |         # A: Yes
112 |         rpn_locs = rpn_locs.permute(0, 2, 3, 1).contiguous().view(n, -1, 4)
113 |         rpn_scores = self.score(h)
114 |         rpn_scores = rpn_scores.permute(0, 2, 3, 1).contiguous()
115 |         rpn_softmax_scores = F.softmax(rpn_scores.view(n, hh, ww, n_anchor, 2), dim=4)
116 |         rpn_fg_scores = rpn_softmax_scores[:, :, :, :, 1].contiguous()
117 |         rpn_fg_scores = rpn_fg_scores.view(n, -1)
118 |         rpn_scores = rpn_scores.view(n, -1, 2)
119 | 
120 |         rois = list()
121 |         roi_indices = list()
122 |         for i in range(n):
123 |             roi = self.proposal_layer(
124 |                 rpn_locs[i].cpu().data.numpy(),
125 |                 rpn_fg_scores[i].cpu().data.numpy(),
126 |                 anchor, img_size,
127 |                 scale=scale)
128 |             batch_index = i * np.ones((len(roi),), dtype=np.int32)
129 |             rois.append(roi)
130 |             roi_indices.append(batch_index)
131 | 
132 |         rois = np.concatenate(rois, axis=0)
133 |         roi_indices = np.concatenate(roi_indices, axis=0)
134 |         return rpn_locs, rpn_scores, rois, roi_indices, anchor
135 | 
136 | 
137 | def _enumerate_shifted_anchor(anchor_base, feat_stride, height, width):
138 |     # Enumerate all shifted anchors:
139 |     #
140 |     # add A anchors (1, A, 4) to
141 |     # cell K shifts (K, 1, 4) to get
142 |     # shift anchors (K, A, 4)
143 |     # reshape to (K*A, 4) shifted anchors
144 |     # return (K*A, 4)
145 | 
146 |     # !TODO: add support for torch.CudaTensor
147 |     # xp = cuda.get_array_module(anchor_base)
148 |     # it seems that it can't be boosed using GPU
149 |     import numpy as xp
150 |     shift_y = xp.arange(0, height * feat_stride, feat_stride)
151 |     shift_x = xp.arange(0, width * feat_stride, feat_stride)
152 |     shift_x, shift_y = xp.meshgrid(shift_x, shift_y)
153 |     shift = xp.stack((shift_y.ravel(), shift_x.ravel(),
154 |                       shift_y.ravel(), shift_x.ravel()), axis=1)
155 | 
156 |     A = anchor_base.shape[0]
157 |     K = shift.shape[0]
158 |     anchor = anchor_base.reshape((1, A, 4)) + \
159 |              shift.reshape((1, K, 4)).transpose((1, 0, 2))
160 |     anchor = anchor.reshape((K * A, 4)).astype(np.float32)
161 |     return anchor
162 | 
163 | 
164 | def _enumerate_shifted_anchor_torch(anchor_base, feat_stride, height, width):
165 |     # Enumerate all shifted anchors:
166 |     #
167 |     # add A anchors (1, A, 4) to
168 |     # cell K shifts (K, 1, 4) to get
169 |     # shift anchors (K, A, 4)
170 |     # reshape to (K*A, 4) shifted anchors
171 |     # return (K*A, 4)
172 | 
173 |     # !TODO: add support for torch.CudaTensor
174 |     # xp = cuda.get_array_module(anchor_base)
175 |     import torch as t
176 |     shift_y = t.arange(0, height * feat_stride, feat_stride)
177 |     shift_x = t.arange(0, width * feat_stride, feat_stride)
178 |     shift_x, shift_y = xp.meshgrid(shift_x, shift_y)
179 |     shift = xp.stack((shift_y.ravel(), shift_x.ravel(),
180 |                       shift_y.ravel(), shift_x.ravel()), axis=1)
181 | 
182 |     A = anchor_base.shape[0]
183 |     K = shift.shape[0]
184 |     anchor = anchor_base.reshape((1, A, 4)) + \
185 |              shift.reshape((1, K, 4)).transpose((1, 0, 2))
186 |     anchor = anchor.reshape((K * A, 4)).astype(np.float32)
187 |     return anchor
188 | 
189 | 
190 | def normal_init(m, mean, stddev, truncated=False):
191 |     """
192 |     weight initalizer: truncated normal and random normal.
193 |     """
194 |     # x is a parameter
195 |     if truncated:
196 |         m.weight.data.normal_().fmod_(2).mul_(stddev).add_(mean)  # not a perfect approximation
197 |     else:
198 |         m.weight.data.normal_(mean, stddev)
199 |         m.bias.data.zero_()
200 | 


--------------------------------------------------------------------------------
/model/roi_module.py:
--------------------------------------------------------------------------------
  1 | from collections import namedtuple
  2 | from string import Template
  3 | 
  4 | import cupy, torch
  5 | import cupy as cp
  6 | import torch as t
  7 | from torch.autograd import Function
  8 | 
  9 | from model.utils.roi_cupy import kernel_backward, kernel_forward
 10 | 
 11 | Stream = namedtuple('Stream', ['ptr'])
 12 | 
 13 | 
 14 | @cupy.util.memoize(for_each_device=True)
 15 | def load_kernel(kernel_name, code, **kwargs):
 16 |     cp.cuda.runtime.free(0)
 17 |     code = Template(code).substitute(**kwargs)
 18 |     kernel_code = cupy.cuda.compile_with_cache(code)
 19 |     return kernel_code.get_function(kernel_name)
 20 | 
 21 | 
 22 | CUDA_NUM_THREADS = 1024
 23 | 
 24 | 
 25 | def GET_BLOCKS(N, K=CUDA_NUM_THREADS):
 26 |     return (N + K - 1) // K
 27 | 
 28 | 
 29 | class RoI(Function):
 30 |     def __init__(self, outh, outw, spatial_scale):
 31 |         self.forward_fn = load_kernel('roi_forward', kernel_forward)
 32 |         self.backward_fn = load_kernel('roi_backward', kernel_backward)
 33 |         self.outh, self.outw, self.spatial_scale = outh, outw, spatial_scale
 34 | 
 35 |     def forward(self, x, rois):
 36 |         # NOTE: MAKE SURE input is contiguous too
 37 |         x = x.contiguous()
 38 |         rois = rois.contiguous()
 39 |         self.in_size = B, C, H, W = x.size()
 40 |         self.N = N = rois.size(0)
 41 |         output = t.zeros(N, C, self.outh, self.outw).cuda()
 42 |         self.argmax_data = t.zeros(N, C, self.outh, self.outw).int().cuda()
 43 |         self.rois = rois
 44 |         args = [x.data_ptr(), rois.data_ptr(),
 45 |                 output.data_ptr(),
 46 |                 self.argmax_data.data_ptr(),
 47 |                 self.spatial_scale, C, H, W,
 48 |                 self.outh, self.outw,
 49 |                 output.numel()]
 50 |         stream = Stream(ptr=torch.cuda.current_stream().cuda_stream)
 51 |         self.forward_fn(args=args,
 52 |                         block=(CUDA_NUM_THREADS, 1, 1),
 53 |                         grid=(GET_BLOCKS(output.numel()), 1, 1),
 54 |                         stream=stream)
 55 |         return output
 56 | 
 57 |     def backward(self, grad_output):
 58 |         ##NOTE: IMPORTANT CONTIGUOUS
 59 |         # TODO: input
 60 |         grad_output = grad_output.contiguous()
 61 |         B, C, H, W = self.in_size
 62 |         grad_input = t.zeros(self.in_size).cuda()
 63 |         stream = Stream(ptr=torch.cuda.current_stream().cuda_stream)
 64 |         args = [grad_output.data_ptr(),
 65 |                 self.argmax_data.data_ptr(),
 66 |                 self.rois.data_ptr(),
 67 |                 grad_input.data_ptr(),
 68 |                 self.N, self.spatial_scale, C, H, W, self.outh, self.outw,
 69 |                 grad_input.numel()]
 70 |         self.backward_fn(args=args,
 71 |                          block=(CUDA_NUM_THREADS, 1, 1),
 72 |                          grid=(GET_BLOCKS(grad_input.numel()), 1, 1),
 73 |                          stream=stream
 74 |                          )
 75 |         return grad_input, None
 76 | 
 77 | 
 78 | class RoIPooling2D(t.nn.Module):
 79 | 
 80 |     def __init__(self, outh, outw, spatial_scale):
 81 |         super(RoIPooling2D, self).__init__()
 82 |         self.RoI = RoI(outh, outw, spatial_scale)
 83 | 
 84 |     def forward(self, x, rois):
 85 |         return self.RoI(x, rois)
 86 | 
 87 | 
 88 | def test_roi_module():
 89 |     ## fake data###
 90 |     B, N, C, H, W, PH, PW = 2, 8, 4, 32, 32, 7, 7
 91 | 
 92 |     bottom_data = t.randn(B, C, H, W).cuda()
 93 |     bottom_rois = t.randn(N, 5)
 94 |     bottom_rois[:int(N / 2), 0] = 0
 95 |     bottom_rois[int(N / 2):, 0] = 1
 96 |     bottom_rois[:, 1:] = (t.rand(N, 4) * 100).float()
 97 |     bottom_rois = bottom_rois.cuda()
 98 |     spatial_scale = 1. / 16
 99 |     outh, outw = PH, PW
100 | 
101 |     # pytorch version
102 |     module = RoIPooling2D(outh, outw, spatial_scale)
103 |     x = bottom_data.requires_grad_()
104 |     rois = bottom_rois.detach()
105 | 
106 |     output = module(x, rois)
107 |     output.sum().backward()
108 | 
109 |     def t2c(variable):
110 |         npa = variable.data.cpu().numpy()
111 |         return cp.array(npa)
112 | 
113 |     def test_eq(variable, array, info):
114 |         cc = cp.asnumpy(array)
115 |         neq = (cc != variable.data.cpu().numpy())
116 |         assert neq.sum() == 0, 'test failed: %s' % info
117 | 
118 |     # chainer version,if you're going to run this
119 |     # pip install chainer 
120 |     import chainer.functions as F
121 |     from chainer import Variable
122 |     x_cn = Variable(t2c(x))
123 | 
124 |     o_cn = F.roi_pooling_2d(x_cn, t2c(rois), outh, outw, spatial_scale)
125 |     test_eq(output, o_cn.array, 'forward')
126 |     F.sum(o_cn).backward()
127 |     test_eq(x.grad, x_cn.grad, 'backward')
128 |     print('test pass')
129 | 


--------------------------------------------------------------------------------
/model/sr_gan_network.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | 
  3 | import torch.nn.functional as F
  4 | from torch import nn
  5 | 
  6 | 
  7 | class Generator(nn.Module):
  8 |     def __init__(self, scale_factor):
  9 |         upsample_block_num = int(math.log(scale_factor, 2))
 10 | 
 11 |         super(Generator, self).__init__()
 12 |         self.block1 = nn.Sequential(
 13 |             nn.Conv2d(3, 64, kernel_size=9, padding=4),
 14 |             nn.PReLU()
 15 |         )
 16 |         self.block2 = ResidualBlock(64)
 17 |         self.block3 = ResidualBlock(64)
 18 |         self.block4 = ResidualBlock(64)
 19 |         self.block5 = ResidualBlock(64)
 20 |         self.block6 = ResidualBlock(64)
 21 |         self.block7 = nn.Sequential(
 22 |             nn.Conv2d(64, 64, kernel_size=3, padding=1),
 23 |             nn.BatchNorm2d(64)
 24 |         )
 25 |         block8 = [UpsampleBLock(64, 2) for _ in range(upsample_block_num)]
 26 |         block8.append(nn.Conv2d(64, 3, kernel_size=9, padding=4))
 27 |         self.block8 = nn.Sequential(*block8)
 28 | 
 29 |     def forward(self, x):
 30 |         block1 = self.block1(x)
 31 |         block2 = self.block2(block1)
 32 |         block3 = self.block3(block2)
 33 |         block4 = self.block4(block3)
 34 |         block5 = self.block5(block4)
 35 |         block6 = self.block6(block5)
 36 |         block7 = self.block7(block6)
 37 |         block8 = self.block8(block1 + block7)
 38 | 
 39 |         return (F.tanh(block8) + 1) / 2
 40 | 
 41 | 
 42 | class Discriminator(nn.Module):
 43 |     def __init__(self):
 44 |         super(Discriminator, self).__init__()
 45 |         self.net = nn.Sequential(
 46 |             nn.Conv2d(3, 64, kernel_size=3, padding=1),
 47 |             nn.LeakyReLU(0.2),
 48 | 
 49 |             nn.Conv2d(64, 64, kernel_size=3, stride=2, padding=1),
 50 |             nn.BatchNorm2d(64),
 51 |             nn.LeakyReLU(0.2),
 52 | 
 53 |             nn.Conv2d(64, 128, kernel_size=3, padding=1),
 54 |             nn.BatchNorm2d(128),
 55 |             nn.LeakyReLU(0.2),
 56 | 
 57 |             nn.Conv2d(128, 128, kernel_size=3, stride=2, padding=1),
 58 |             nn.BatchNorm2d(128),
 59 |             nn.LeakyReLU(0.2),
 60 | 
 61 |             nn.Conv2d(128, 256, kernel_size=3, padding=1),
 62 |             nn.BatchNorm2d(256),
 63 |             nn.LeakyReLU(0.2),
 64 | 
 65 |             nn.Conv2d(256, 256, kernel_size=3, stride=2, padding=1),
 66 |             nn.BatchNorm2d(256),
 67 |             nn.LeakyReLU(0.2),
 68 | 
 69 |             nn.Conv2d(256, 512, kernel_size=3, padding=1),
 70 |             nn.BatchNorm2d(512),
 71 |             nn.LeakyReLU(0.2),
 72 | 
 73 |             nn.Conv2d(512, 512, kernel_size=3, stride=2, padding=1),
 74 |             nn.BatchNorm2d(512),
 75 |             nn.LeakyReLU(0.2),
 76 | 
 77 |             nn.AdaptiveAvgPool2d(1),
 78 |             nn.Conv2d(512, 1024, kernel_size=1),
 79 |             nn.LeakyReLU(0.2),
 80 |             nn.Conv2d(1024, 1, kernel_size=1)
 81 |         )
 82 | 
 83 |     def forward(self, x):
 84 |         batch_size = x.size(0)
 85 |         return F.sigmoid(self.net(x).view(batch_size))
 86 | 
 87 | 
 88 | class ResidualBlock(nn.Module):
 89 |     def __init__(self, channels):
 90 |         super(ResidualBlock, self).__init__()
 91 |         self.conv1 = nn.Conv2d(channels, channels, kernel_size=3, padding=1)
 92 |         self.bn1 = nn.BatchNorm2d(channels)
 93 |         self.prelu = nn.PReLU()
 94 |         self.conv2 = nn.Conv2d(channels, channels, kernel_size=3, padding=1)
 95 |         self.bn2 = nn.BatchNorm2d(channels)
 96 | 
 97 |     def forward(self, x):
 98 |         residual = self.conv1(x)
 99 |         residual = self.bn1(residual)
100 |         residual = self.prelu(residual)
101 |         residual = self.conv2(residual)
102 |         residual = self.bn2(residual)
103 | 
104 |         return x + residual
105 | 
106 | 
107 | class UpsampleBLock(nn.Module):
108 |     def __init__(self, in_channels, up_scale):
109 |         super(UpsampleBLock, self).__init__()
110 |         self.conv = nn.Conv2d(in_channels, in_channels * up_scale ** 2, kernel_size=3, padding=1)
111 |         self.pixel_shuffle = nn.PixelShuffle(up_scale)
112 |         self.prelu = nn.PReLU()
113 | 
114 |     def forward(self, x):
115 |         x = self.conv(x)
116 |         x = self.pixel_shuffle(x)
117 |         x = self.prelu(x)
118 |         return x
119 | 


--------------------------------------------------------------------------------
/model/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/samirsen/small-object-detection/30d402a09a5a01d0f365b8d5b4593544a11e90ec/model/utils/__init__.py


--------------------------------------------------------------------------------
/model/utils/bbox_tools.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import numpy as xp
  3 | 
  4 | import six
  5 | from six import __init__
  6 | 
  7 | 
  8 | def loc2bbox(src_bbox, loc):
  9 |     """Decode bounding boxes from bounding box offsets and scales.
 10 | 
 11 |     Given bounding box offsets and scales computed by
 12 |     :meth:`bbox2loc`, this function decodes the representation to
 13 |     coordinates in 2D image coordinates.
 14 | 
 15 |     Given scales and offsets :math:`t_y, t_x, t_h, t_w` and a bounding
 16 |     box whose center is :math:`(y, x) = p_y, p_x` and size :math:`p_h, p_w`,
 17 |     the decoded bounding box's center :math:`\\hat{g}_y`, :math:`\\hat{g}_x`
 18 |     and size :math:`\\hat{g}_h`, :math:`\\hat{g}_w` are calculated
 19 |     by the following formulas.
 20 | 
 21 |     * :math:`\\hat{g}_y = p_h t_y + p_y`
 22 |     * :math:`\\hat{g}_x = p_w t_x + p_x`
 23 |     * :math:`\\hat{g}_h = p_h \\exp(t_h)`
 24 |     * :math:`\\hat{g}_w = p_w \\exp(t_w)`
 25 | 
 26 |     The decoding formulas are used in works such as R-CNN [#]_.
 27 | 
 28 |     The output is same type as the type of the inputs.
 29 | 
 30 |     .. [#] Ross Girshick, Jeff Donahue, Trevor Darrell, Jitendra Malik. \
 31 |     Rich feature hierarchies for accurate object detection and semantic \
 32 |     segmentation. CVPR 2014.
 33 | 
 34 |     Args:
 35 |         src_bbox (array): A coordinates of bounding boxes.
 36 |             Its shape is :math:`(R, 4)`. These coordinates are
 37 |             :math:`p_{ymin}, p_{xmin}, p_{ymax}, p_{xmax}`.
 38 |         loc (array): An array with offsets and scales.
 39 |             The shapes of :obj:`src_bbox` and :obj:`loc` should be same.
 40 |             This contains values :math:`t_y, t_x, t_h, t_w`.
 41 | 
 42 |     Returns:
 43 |         array:
 44 |         Decoded bounding box coordinates. Its shape is :math:`(R, 4)`. \
 45 |         The second axis contains four values \
 46 |         :math:`\\hat{g}_{ymin}, \\hat{g}_{xmin},
 47 |         \\hat{g}_{ymax}, \\hat{g}_{xmax}`.
 48 | 
 49 |     """
 50 | 
 51 |     if src_bbox.shape[0] == 0:
 52 |         return xp.zeros((0, 4), dtype=loc.dtype)
 53 | 
 54 |     src_bbox = src_bbox.astype(src_bbox.dtype, copy=False)
 55 | 
 56 |     src_height = src_bbox[:, 2] - src_bbox[:, 0]
 57 |     src_width = src_bbox[:, 3] - src_bbox[:, 1]
 58 |     src_ctr_y = src_bbox[:, 0] + 0.5 * src_height
 59 |     src_ctr_x = src_bbox[:, 1] + 0.5 * src_width
 60 | 
 61 |     dy = loc[:, 0::4]
 62 |     dx = loc[:, 1::4]
 63 |     dh = loc[:, 2::4]
 64 |     dw = loc[:, 3::4]
 65 | 
 66 |     ctr_y = dy * src_height[:, xp.newaxis] + src_ctr_y[:, xp.newaxis]
 67 |     ctr_x = dx * src_width[:, xp.newaxis] + src_ctr_x[:, xp.newaxis]
 68 |     h = xp.exp(dh) * src_height[:, xp.newaxis]
 69 |     w = xp.exp(dw) * src_width[:, xp.newaxis]
 70 | 
 71 |     dst_bbox = xp.zeros(loc.shape, dtype=loc.dtype)
 72 |     dst_bbox[:, 0::4] = ctr_y - 0.5 * h
 73 |     dst_bbox[:, 1::4] = ctr_x - 0.5 * w
 74 |     dst_bbox[:, 2::4] = ctr_y + 0.5 * h
 75 |     dst_bbox[:, 3::4] = ctr_x + 0.5 * w
 76 | 
 77 |     return dst_bbox
 78 | 
 79 | 
 80 | def bbox2loc(src_bbox, dst_bbox):
 81 |     """Encodes the source and the destination bounding boxes to "loc".
 82 | 
 83 |     Given bounding boxes, this function computes offsets and scales
 84 |     to match the source bounding boxes to the target bounding boxes.
 85 |     Mathematcially, given a bounding box whose center is
 86 |     :math:`(y, x) = p_y, p_x` and
 87 |     size :math:`p_h, p_w` and the target bounding box whose center is
 88 |     :math:`g_y, g_x` and size :math:`g_h, g_w`, the offsets and scales
 89 |     :math:`t_y, t_x, t_h, t_w` can be computed by the following formulas.
 90 | 
 91 |     * :math:`t_y = \\frac{(g_y - p_y)} {p_h}`
 92 |     * :math:`t_x = \\frac{(g_x - p_x)} {p_w}`
 93 |     * :math:`t_h = \\log(\\frac{g_h} {p_h})`
 94 |     * :math:`t_w = \\log(\\frac{g_w} {p_w})`
 95 | 
 96 |     The output is same type as the type of the inputs.
 97 |     The encoding formulas are used in works such as R-CNN [#]_.
 98 | 
 99 |     .. [#] Ross Girshick, Jeff Donahue, Trevor Darrell, Jitendra Malik. \
100 |     Rich feature hierarchies for accurate object detection and semantic \
101 |     segmentation. CVPR 2014.
102 | 
103 |     Args:
104 |         src_bbox (array): An image coordinate array whose shape is
105 |             :math:`(R, 4)`. :math:`R` is the number of bounding boxes.
106 |             These coordinates are
107 |             :math:`p_{ymin}, p_{xmin}, p_{ymax}, p_{xmax}`.
108 |         dst_bbox (array): An image coordinate array whose shape is
109 |             :math:`(R, 4)`.
110 |             These coordinates are
111 |             :math:`g_{ymin}, g_{xmin}, g_{ymax}, g_{xmax}`.
112 | 
113 |     Returns:
114 |         array:
115 |         Bounding box offsets and scales from :obj:`src_bbox` \
116 |         to :obj:`dst_bbox`. \
117 |         This has shape :math:`(R, 4)`.
118 |         The second axis contains four values :math:`t_y, t_x, t_h, t_w`.
119 | 
120 |     """
121 | 
122 |     height = src_bbox[:, 2] - src_bbox[:, 0]
123 |     width = src_bbox[:, 3] - src_bbox[:, 1]
124 |     ctr_y = src_bbox[:, 0] + 0.5 * height
125 |     ctr_x = src_bbox[:, 1] + 0.5 * width
126 | 
127 |     base_height = dst_bbox[:, 2] - dst_bbox[:, 0]
128 |     base_width = dst_bbox[:, 3] - dst_bbox[:, 1]
129 |     base_ctr_y = dst_bbox[:, 0] + 0.5 * base_height
130 |     base_ctr_x = dst_bbox[:, 1] + 0.5 * base_width
131 | 
132 |     eps = xp.finfo(height.dtype).eps
133 |     height = xp.maximum(height, eps)
134 |     width = xp.maximum(width, eps)
135 | 
136 |     dy = (base_ctr_y - ctr_y) / height
137 |     dx = (base_ctr_x - ctr_x) / width
138 |     dh = xp.log(base_height / height)
139 |     dw = xp.log(base_width / width)
140 | 
141 |     loc = xp.vstack((dy, dx, dh, dw)).transpose()
142 |     return loc
143 | 
144 | 
145 | def bbox_iou(bbox_a, bbox_b):
146 |     """Calculate the Intersection of Unions (IoUs) between bounding boxes.
147 | 
148 |     IoU is calculated as a ratio of area of the intersection
149 |     and area of the union.
150 | 
151 |     This function accepts both :obj:`numpy.ndarray` and :obj:`cupy.ndarray` as
152 |     inputs. Please note that both :obj:`bbox_a` and :obj:`bbox_b` need to be
153 |     same type.
154 |     The output is same type as the type of the inputs.
155 | 
156 |     Args:
157 |         bbox_a (array): An array whose shape is :math:`(N, 4)`.
158 |             :math:`N` is the number of bounding boxes.
159 |             The dtype should be :obj:`numpy.float32`.
160 |         bbox_b (array): An array similar to :obj:`bbox_a`,
161 |             whose shape is :math:`(K, 4)`.
162 |             The dtype should be :obj:`numpy.float32`.
163 | 
164 |     Returns:
165 |         array:
166 |         An array whose shape is :math:`(N, K)`. \
167 |         An element at index :math:`(n, k)` contains IoUs between \
168 |         :math:`n` th bounding box in :obj:`bbox_a` and :math:`k` th bounding \
169 |         box in :obj:`bbox_b`.
170 | 
171 |     """
172 |     if bbox_a.shape[1] != 4 or bbox_b.shape[1] != 4:
173 |         raise IndexError
174 | 
175 |     # top left
176 |     tl = xp.maximum(bbox_a[:, None, :2], bbox_b[:, :2])
177 |     # bottom right
178 |     br = xp.minimum(bbox_a[:, None, 2:], bbox_b[:, 2:])
179 | 
180 |     area_i = xp.prod(br - tl, axis=2) * (tl < br).all(axis=2)
181 |     area_a = xp.prod(bbox_a[:, 2:] - bbox_a[:, :2], axis=1)
182 |     area_b = xp.prod(bbox_b[:, 2:] - bbox_b[:, :2], axis=1)
183 |     return area_i / (area_a[:, None] + area_b - area_i)
184 | 
185 | 
186 | def __test():
187 |     pass
188 | 
189 | 
190 | if __name__ == '__main__':
191 |     __test()
192 | 
193 | 
194 | def generate_anchor_base(base_size=16, ratios=[0.5, 1, 2],
195 |                          anchor_scales=[8, 16, 32]):
196 |     """Generate anchor base windows by enumerating aspect ratio and scales.
197 | 
198 |     Generate anchors that are scaled and modified to the given aspect ratios.
199 |     Area of a scaled anchor is preserved when modifying to the given aspect
200 |     ratio.
201 | 
202 |     :obj:`R = len(ratios) * len(anchor_scales)` anchors are generated by this
203 |     function.
204 |     The :obj:`i * len(anchor_scales) + j` th anchor corresponds to an anchor
205 |     generated by :obj:`ratios[i]` and :obj:`anchor_scales[j]`.
206 | 
207 |     For example, if the scale is :math:`8` and the ratio is :math:`0.25`,
208 |     the width and the height of the base window will be stretched by :math:`8`.
209 |     For modifying the anchor to the given aspect ratio,
210 |     the height is halved and the width is doubled.
211 | 
212 |     Args:
213 |         base_size (number): The width and the height of the reference window.
214 |         ratios (list of floats): This is ratios of width to height of
215 |             the anchors.
216 |         anchor_scales (list of numbers): This is areas of anchors.
217 |             Those areas will be the product of the square of an element in
218 |             :obj:`anchor_scales` and the original area of the reference
219 |             window.
220 | 
221 |     Returns:
222 |         ~numpy.ndarray:
223 |         An array of shape :math:`(R, 4)`.
224 |         Each element is a set of coordinates of a bounding box.
225 |         The second axis corresponds to
226 |         :math:`(y_{min}, x_{min}, y_{max}, x_{max})` of a bounding box.
227 | 
228 |     """
229 |     py = base_size / 2.
230 |     px = base_size / 2.
231 | 
232 |     anchor_base = np.zeros((len(ratios) * len(anchor_scales), 4),
233 |                            dtype=np.float32)
234 |     for i in six.moves.range(len(ratios)):
235 |         for j in six.moves.range(len(anchor_scales)):
236 |             h = base_size * anchor_scales[j] * np.sqrt(ratios[i])
237 |             w = base_size * anchor_scales[j] * np.sqrt(1. / ratios[i])
238 | 
239 |             index = i * len(anchor_scales) + j
240 |             anchor_base[index, 0] = py - h / 2.
241 |             anchor_base[index, 1] = px - w / 2.
242 |             anchor_base[index, 2] = py + h / 2.
243 |             anchor_base[index, 3] = px + w / 2.
244 |     return anchor_base
245 | 


--------------------------------------------------------------------------------
/model/utils/creator_tool.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import cupy as cp
  3 | 
  4 | from model.utils.bbox_tools import bbox2loc, bbox_iou, loc2bbox
  5 | from model.utils.nms import non_maximum_suppression
  6 | 
  7 | 
  8 | class ProposalTargetCreator(object):
  9 |     """Assign ground truth bounding boxes to given RoIs.
 10 | 
 11 |     The :meth:`__call__` of this class generates training targets
 12 |     for each object proposal.
 13 |     This is used to train Faster RCNN [#]_.
 14 | 
 15 |     .. [#] Shaoqing Ren, Kaiming He, Ross Girshick, Jian Sun. \
 16 |     Faster R-CNN: Towards Real-Time Object Detection with \
 17 |     Region Proposal Networks. NIPS 2015.
 18 | 
 19 |     Args:
 20 |         n_sample (int): The number of sampled regions.
 21 |         pos_ratio (float): Fraction of regions that is labeled as a
 22 |             foreground.
 23 |         pos_iou_thresh (float): IoU threshold for a RoI to be considered as a
 24 |             foreground.
 25 |         neg_iou_thresh_hi (float): RoI is considered to be the background
 26 |             if IoU is in
 27 |             [:obj:`neg_iou_thresh_hi`, :obj:`neg_iou_thresh_hi`).
 28 |         neg_iou_thresh_lo (float): See above.
 29 | 
 30 |     """
 31 | 
 32 |     def __init__(self,
 33 |                  n_sample=128,
 34 |                  pos_ratio=0.25, pos_iou_thresh=0.5,
 35 |                  neg_iou_thresh_hi=0.5, neg_iou_thresh_lo=0.0
 36 |                  ):
 37 |         self.n_sample = n_sample
 38 |         self.pos_ratio = pos_ratio
 39 |         self.pos_iou_thresh = pos_iou_thresh
 40 |         self.neg_iou_thresh_hi = neg_iou_thresh_hi
 41 |         self.neg_iou_thresh_lo = neg_iou_thresh_lo  # NOTE:default 0.1 in py-faster-rcnn
 42 | 
 43 |     def __call__(self, roi, bbox, label,
 44 |                  loc_normalize_mean=(0., 0., 0., 0.),
 45 |                  loc_normalize_std=(0.1, 0.1, 0.2, 0.2)):
 46 |         """Assigns ground truth to sampled proposals.
 47 | 
 48 |         This function samples total of :obj:`self.n_sample` RoIs
 49 |         from the combination of :obj:`roi` and :obj:`bbox`.
 50 |         The RoIs are assigned with the ground truth class labels as well as
 51 |         bounding box offsets and scales to match the ground truth bounding
 52 |         boxes. As many as :obj:`pos_ratio * self.n_sample` RoIs are
 53 |         sampled as foregrounds.
 54 | 
 55 |         Offsets and scales of bounding boxes are calculated using
 56 |         :func:`model.utils.bbox_tools.bbox2loc`.
 57 |         Also, types of input arrays and output arrays are same.
 58 | 
 59 |         Here are notations.
 60 | 
 61 |         * :math:`S` is the total number of sampled RoIs, which equals \
 62 |             :obj:`self.n_sample`.
 63 |         * :math:`L` is number of object classes possibly including the \
 64 |             background.
 65 | 
 66 |         Args:
 67 |             roi (array): Region of Interests (RoIs) from which we sample.
 68 |                 Its shape is :math:`(R, 4)`
 69 |             bbox (array): The coordinates of ground truth bounding boxes.
 70 |                 Its shape is :math:`(R', 4)`.
 71 |             label (array): Ground truth bounding box labels. Its shape
 72 |                 is :math:`(R',)`. Its range is :math:`[0, L - 1]`, where
 73 |                 :math:`L` is the number of foreground classes.
 74 |             loc_normalize_mean (tuple of four floats): Mean values to normalize
 75 |                 coordinates of bouding boxes.
 76 |             loc_normalize_std (tupler of four floats): Standard deviation of
 77 |                 the coordinates of bounding boxes.
 78 | 
 79 |         Returns:
 80 |             (array, array, array):
 81 | 
 82 |             * **sample_roi**: Regions of interests that are sampled. \
 83 |                 Its shape is :math:`(S, 4)`.
 84 |             * **gt_roi_loc**: Offsets and scales to match \
 85 |                 the sampled RoIs to the ground truth bounding boxes. \
 86 |                 Its shape is :math:`(S, 4)`.
 87 |             * **gt_roi_label**: Labels assigned to sampled RoIs. Its shape is \
 88 |                 :math:`(S,)`. Its range is :math:`[0, L]`. The label with \
 89 |                 value 0 is the background.
 90 | 
 91 |         """
 92 |         n_bbox, _ = bbox.shape
 93 | 
 94 |         roi = np.concatenate((roi, bbox), axis=0)
 95 | 
 96 |         pos_roi_per_image = np.round(self.n_sample * self.pos_ratio)
 97 |         iou = bbox_iou(roi, bbox)
 98 |         gt_assignment = iou.argmax(axis=1)
 99 |         max_iou = iou.max(axis=1)
100 |         # Offset range of classes from [0, n_fg_class - 1] to [1, n_fg_class].
101 |         # The label with value 0 is the background.
102 |         gt_roi_label = label[gt_assignment] + 1
103 | 
104 |         # Select foreground RoIs as those with >= pos_iou_thresh IoU.
105 |         pos_index = np.where(max_iou >= self.pos_iou_thresh)[0]
106 |         pos_roi_per_this_image = int(min(pos_roi_per_image, pos_index.size))
107 |         if pos_index.size > 0:
108 |             pos_index = np.random.choice(
109 |                 pos_index, size=pos_roi_per_this_image, replace=False)
110 | 
111 |         # Select background RoIs as those within
112 |         # [neg_iou_thresh_lo, neg_iou_thresh_hi).
113 |         neg_index = np.where((max_iou < self.neg_iou_thresh_hi) &
114 |                              (max_iou >= self.neg_iou_thresh_lo))[0]
115 |         neg_roi_per_this_image = self.n_sample - pos_roi_per_this_image
116 |         neg_roi_per_this_image = int(min(neg_roi_per_this_image,
117 |                                          neg_index.size))
118 |         if neg_index.size > 0:
119 |             neg_index = np.random.choice(
120 |                 neg_index, size=neg_roi_per_this_image, replace=False)
121 | 
122 |         # The indices that we're selecting (both positive and negative).
123 |         keep_index = np.append(pos_index, neg_index)
124 |         gt_roi_label = gt_roi_label[keep_index]
125 |         gt_roi_label[pos_roi_per_this_image:] = 0  # negative labels --> 0
126 |         sample_roi = roi[keep_index]
127 | 
128 |         # Compute offsets and scales to match sampled RoIs to the GTs.
129 |         gt_roi_loc = bbox2loc(sample_roi, bbox[gt_assignment[keep_index]])
130 |         gt_roi_loc = ((gt_roi_loc - np.array(loc_normalize_mean, np.float32)
131 |                        ) / np.array(loc_normalize_std, np.float32))
132 | 
133 |         return sample_roi, gt_roi_loc, gt_roi_label
134 | 
135 | 
136 | class AnchorTargetCreator(object):
137 |     """Assign the ground truth bounding boxes to anchors.
138 | 
139 |     Assigns the ground truth bounding boxes to anchors for training Region
140 |     Proposal Networks introduced in Faster R-CNN [#]_.
141 | 
142 |     Offsets and scales to match anchors to the ground truth are
143 |     calculated using the encoding scheme of
144 |     :func:`model.utils.bbox_tools.bbox2loc`.
145 | 
146 |     .. [#] Shaoqing Ren, Kaiming He, Ross Girshick, Jian Sun. \
147 |     Faster R-CNN: Towards Real-Time Object Detection with \
148 |     Region Proposal Networks. NIPS 2015.
149 | 
150 |     Args:
151 |         n_sample (int): The number of regions to produce.
152 |         pos_iou_thresh (float): Anchors with IoU above this
153 |             threshold will be assigned as positive.
154 |         neg_iou_thresh (float): Anchors with IoU below this
155 |             threshold will be assigned as negative.
156 |         pos_ratio (float): Ratio of positive regions in the
157 |             sampled regions.
158 | 
159 |     """
160 | 
161 |     def __init__(self,
162 |                  n_sample=256,
163 |                  pos_iou_thresh=0.7, neg_iou_thresh=0.3,
164 |                  pos_ratio=0.5):
165 |         self.n_sample = n_sample
166 |         self.pos_iou_thresh = pos_iou_thresh
167 |         self.neg_iou_thresh = neg_iou_thresh
168 |         self.pos_ratio = pos_ratio
169 | 
170 |     def __call__(self, bbox, anchor, img_size):
171 |         """Assign ground truth supervision to sampled subset of anchors.
172 | 
173 |         Types of input arrays and output arrays are same.
174 | 
175 |         Here are notations.
176 | 
177 |         * :math:`S` is the number of anchors.
178 |         * :math:`R` is the number of bounding boxes.
179 | 
180 |         Args:
181 |             bbox (array): Coordinates of bounding boxes. Its shape is
182 |                 :math:`(R, 4)`.
183 |             anchor (array): Coordinates of anchors. Its shape is
184 |                 :math:`(S, 4)`.
185 |             img_size (tuple of ints): A tuple :obj:`H, W`, which
186 |                 is a tuple of height and width of an image.
187 | 
188 |         Returns:
189 |             (array, array):
190 | 
191 |             #NOTE: it's scale not only  offset
192 |             * **loc**: Offsets and scales to match the anchors to \
193 |                 the ground truth bounding boxes. Its shape is :math:`(S, 4)`.
194 |             * **label**: Labels of anchors with values \
195 |                 :obj:`(1=positive, 0=negative, -1=ignore)`. Its shape \
196 |                 is :math:`(S,)`.
197 | 
198 |         """
199 | 
200 |         img_H, img_W = img_size
201 | 
202 |         n_anchor = len(anchor)
203 |         inside_index = _get_inside_index(anchor, img_H, img_W)
204 |         anchor = anchor[inside_index]
205 |         argmax_ious, label = self._create_label(
206 |             inside_index, anchor, bbox)
207 | 
208 |         # compute bounding box regression targets
209 |         loc = bbox2loc(anchor, bbox[argmax_ious])
210 | 
211 |         # map up to original set of anchors
212 |         label = _unmap(label, n_anchor, inside_index, fill=-1)
213 |         loc = _unmap(loc, n_anchor, inside_index, fill=0)
214 | 
215 |         return loc, label
216 | 
217 |     def _create_label(self, inside_index, anchor, bbox):
218 |         # label: 1 is positive, 0 is negative, -1 is dont care
219 |         label = np.empty((len(inside_index),), dtype=np.int32)
220 |         label.fill(-1)
221 | 
222 |         argmax_ious, max_ious, gt_argmax_ious = \
223 |             self._calc_ious(anchor, bbox, inside_index)
224 | 
225 |         # assign negative labels first so that positive labels can clobber them
226 |         label[max_ious < self.neg_iou_thresh] = 0
227 | 
228 |         # positive label: for each gt, anchor with highest iou
229 |         label[gt_argmax_ious] = 1
230 | 
231 |         # positive label: above threshold IOU
232 |         label[max_ious >= self.pos_iou_thresh] = 1
233 | 
234 |         # subsample positive labels if we have too many
235 |         n_pos = int(self.pos_ratio * self.n_sample)
236 |         pos_index = np.where(label == 1)[0]
237 |         if len(pos_index) > n_pos:
238 |             disable_index = np.random.choice(
239 |                 pos_index, size=(len(pos_index) - n_pos), replace=False)
240 |             label[disable_index] = -1
241 | 
242 |         # subsample negative labels if we have too many
243 |         n_neg = self.n_sample - np.sum(label == 1)
244 |         neg_index = np.where(label == 0)[0]
245 |         if len(neg_index) > n_neg:
246 |             disable_index = np.random.choice(
247 |                 neg_index, size=(len(neg_index) - n_neg), replace=False)
248 |             label[disable_index] = -1
249 | 
250 |         return argmax_ious, label
251 | 
252 |     def _calc_ious(self, anchor, bbox, inside_index):
253 |         # ious between the anchors and the gt boxes
254 |         ious = bbox_iou(anchor, bbox)
255 |         argmax_ious = ious.argmax(axis=1)
256 |         max_ious = ious[np.arange(len(inside_index)), argmax_ious]
257 |         gt_argmax_ious = ious.argmax(axis=0)
258 |         gt_max_ious = ious[gt_argmax_ious, np.arange(ious.shape[1])]
259 |         gt_argmax_ious = np.where(ious == gt_max_ious)[0]
260 | 
261 |         return argmax_ious, max_ious, gt_argmax_ious
262 | 
263 | 
264 | def _unmap(data, count, index, fill=0):
265 |     # Unmap a subset of item (data) back to the original set of items (of
266 |     # size count)
267 | 
268 |     if len(data.shape) == 1:
269 |         ret = np.empty((count,), dtype=data.dtype)
270 |         ret.fill(fill)
271 |         ret[index] = data
272 |     else:
273 |         ret = np.empty((count,) + data.shape[1:], dtype=data.dtype)
274 |         ret.fill(fill)
275 |         ret[index, :] = data
276 |     return ret
277 | 
278 | 
279 | def _get_inside_index(anchor, H, W):
280 |     # Calc indicies of anchors which are located completely inside of the image
281 |     # whose size is speficied.
282 |     index_inside = np.where(
283 |         (anchor[:, 0] >= 0) &
284 |         (anchor[:, 1] >= 0) &
285 |         (anchor[:, 2] <= H) &
286 |         (anchor[:, 3] <= W)
287 |     )[0]
288 |     return index_inside
289 | 
290 | 
291 | class ProposalCreator:
292 |     # unNOTE: I'll make it undifferential
293 |     # unTODO: make sure it's ok
294 |     # It's ok
295 |     """Proposal regions are generated by calling this object.
296 | 
297 |     The :meth:`__call__` of this object outputs object detection proposals by
298 |     applying estimated bounding box offsets
299 |     to a set of anchors.
300 | 
301 |     This class takes parameters to control number of bounding boxes to
302 |     pass to NMS and keep after NMS.
303 |     If the paramters are negative, it uses all the bounding boxes supplied
304 |     or keep all the bounding boxes returned by NMS.
305 | 
306 |     This class is used for Region Proposal Networks introduced in
307 |     Faster R-CNN [#]_.
308 | 
309 |     .. [#] Shaoqing Ren, Kaiming He, Ross Girshick, Jian Sun. \
310 |     Faster R-CNN: Towards Real-Time Object Detection with \
311 |     Region Proposal Networks. NIPS 2015.
312 | 
313 |     Args:
314 |         nms_thresh (float): Threshold value used when calling NMS.
315 |         n_train_pre_nms (int): Number of top scored bounding boxes
316 |             to keep before passing to NMS in train mode.
317 |         n_train_post_nms (int): Number of top scored bounding boxes
318 |             to keep after passing to NMS in train mode.
319 |         n_test_pre_nms (int): Number of top scored bounding boxes
320 |             to keep before passing to NMS in test mode.
321 |         n_test_post_nms (int): Number of top scored bounding boxes
322 |             to keep after passing to NMS in test mode.
323 |         force_cpu_nms (bool): If this is :obj:`True`,
324 |             always use NMS in CPU mode. If :obj:`False`,
325 |             the NMS mode is selected based on the type of inputs.
326 |         min_size (int): A paramter to determine the threshold on
327 |             discarding bounding boxes based on their sizes.
328 | 
329 |     """
330 | 
331 |     def __init__(self,
332 |                  parent_model,
333 |                  nms_thresh=0.7,
334 |                  n_train_pre_nms=12000,
335 |                  n_train_post_nms=2000,
336 |                  n_test_pre_nms=6000,
337 |                  n_test_post_nms=300,
338 |                  min_size=16
339 |                  ):
340 |         self.parent_model = parent_model
341 |         self.nms_thresh = nms_thresh
342 |         self.n_train_pre_nms = n_train_pre_nms
343 |         self.n_train_post_nms = n_train_post_nms
344 |         self.n_test_pre_nms = n_test_pre_nms
345 |         self.n_test_post_nms = n_test_post_nms
346 |         self.min_size = min_size
347 | 
348 |     def __call__(self, loc, score,
349 |                  anchor, img_size, scale=1.):
350 |         """input should  be ndarray
351 |         Propose RoIs.
352 | 
353 |         Inputs :obj:`loc, score, anchor` refer to the same anchor when indexed
354 |         by the same index.
355 | 
356 |         On notations, :math:`R` is the total number of anchors. This is equal
357 |         to product of the height and the width of an image and the number of
358 |         anchor bases per pixel.
359 | 
360 |         Type of the output is same as the inputs.
361 | 
362 |         Args:
363 |             loc (array): Predicted offsets and scaling to anchors.
364 |                 Its shape is :math:`(R, 4)`.
365 |             score (array): Predicted foreground probability for anchors.
366 |                 Its shape is :math:`(R,)`.
367 |             anchor (array): Coordinates of anchors. Its shape is
368 |                 :math:`(R, 4)`.
369 |             img_size (tuple of ints): A tuple :obj:`height, width`,
370 |                 which contains image size after scaling.
371 |             scale (float): The scaling factor used to scale an image after
372 |                 reading it from a file.
373 | 
374 |         Returns:
375 |             array:
376 |             An array of coordinates of proposal boxes.
377 |             Its shape is :math:`(S, 4)`. :math:`S` is less than
378 |             :obj:`self.n_test_post_nms` in test time and less than
379 |             :obj:`self.n_train_post_nms` in train time. :math:`S` depends on
380 |             the size of the predicted bounding boxes and the number of
381 |             bounding boxes discarded by NMS.
382 | 
383 |         """
384 |         # NOTE: when test, remember
385 |         # faster_rcnn.eval()
386 |         # to set self.traing = False
387 |         if self.parent_model.training:
388 |             n_pre_nms = self.n_train_pre_nms
389 |             n_post_nms = self.n_train_post_nms
390 |         else:
391 |             n_pre_nms = self.n_test_pre_nms
392 |             n_post_nms = self.n_test_post_nms
393 | 
394 |         # Convert anchors into proposal via bbox transformations.
395 |         # roi = loc2bbox(anchor, loc)
396 |         roi = loc2bbox(anchor, loc)
397 | 
398 |         # Clip predicted boxes to image.
399 |         roi[:, slice(0, 4, 2)] = np.clip(
400 |             roi[:, slice(0, 4, 2)], 0, img_size[0])
401 |         roi[:, slice(1, 4, 2)] = np.clip(
402 |             roi[:, slice(1, 4, 2)], 0, img_size[1])
403 | 
404 |         # Remove predicted boxes with either height or width < threshold.
405 |         min_size = self.min_size * scale
406 |         hs = roi[:, 2] - roi[:, 0]
407 |         ws = roi[:, 3] - roi[:, 1]
408 |         keep = np.where((hs >= min_size) & (ws >= min_size))[0]
409 |         roi = roi[keep, :]
410 |         score = score[keep]
411 | 
412 |         # Sort all (proposal, score) pairs by score from highest to lowest.
413 |         # Take top pre_nms_topN (e.g. 6000).
414 |         order = score.ravel().argsort()[::-1]
415 |         if n_pre_nms > 0:
416 |             order = order[:n_pre_nms]
417 |         roi = roi[order, :]
418 | 
419 |         # Apply nms (e.g. threshold = 0.7).
420 |         # Take after_nms_topN (e.g. 300).
421 | 
422 |         # unNOTE: somthing is wrong here!
423 |         # TODO: remove cuda.to_gpu
424 |         keep = non_maximum_suppression(
425 |             cp.ascontiguousarray(cp.asarray(roi)),
426 |             thresh=self.nms_thresh)
427 |         if n_post_nms > 0:
428 |             keep = keep[:n_post_nms]
429 |         roi = roi[keep]
430 |         return roi
431 | 


--------------------------------------------------------------------------------
/model/utils/nms/__init__.py:
--------------------------------------------------------------------------------
1 | from model.utils.nms.non_maximum_suppression import non_maximum_suppression


--------------------------------------------------------------------------------
/model/utils/nms/_nms_gpu_post.pyx:
--------------------------------------------------------------------------------
 1 | cimport numpy as np
 2 | from libc.stdint cimport uint64_t
 3 | 
 4 | import numpy as np
 5 | 
 6 | def _nms_gpu_post(np.ndarray[np.uint64_t, ndim=1] mask,
 7 |                   int n_bbox,
 8 |                   int threads_per_block,
 9 |                   int col_blocks
10 |                   ):
11 |     cdef:
12 |         int i, j, nblock, index
13 |         uint64_t inblock
14 |         int n_selection = 0
15 |         uint64_t one_ull = 1
16 |         np.ndarray[np.int32_t, ndim=1] selection
17 |         np.ndarray[np.uint64_t, ndim=1] remv
18 | 
19 |     selection = np.zeros((n_bbox,), dtype=np.int32)
20 |     remv = np.zeros((col_blocks,), dtype=np.uint64)
21 | 
22 |     for i in range(n_bbox):
23 |         nblock = i // threads_per_block
24 |         inblock = i % threads_per_block
25 | 
26 |         if not (remv[nblock] & one_ull << inblock):
27 |             selection[n_selection] = i
28 |             n_selection += 1
29 | 
30 |             index = i * col_blocks
31 |             for j in range(nblock, col_blocks):
32 |                 remv[j] |= mask[index + j]
33 |     return selection, n_selection
34 | 


--------------------------------------------------------------------------------
/model/utils/nms/_nms_gpu_post_py.py:
--------------------------------------------------------------------------------
 1 | 
 2 | import numpy as np
 3 | 
 4 | def _nms_gpu_post( mask,
 5 |                   n_bbox,
 6 |                    threads_per_block,
 7 |                    col_blocks
 8 |                   ):
 9 |     n_selection = 0
10 |     one_ull = np.array([1],dtype=np.uint64)
11 |     selection = np.zeros((n_bbox,), dtype=np.int32)
12 |     remv = np.zeros((col_blocks,), dtype=np.uint64)
13 | 
14 |     for i in range(n_bbox):
15 |         nblock = i // threads_per_block
16 |         inblock = i % threads_per_block
17 | 
18 |         if not (remv[nblock] & one_ull << inblock):
19 |             selection[n_selection] = i
20 |             n_selection += 1
21 | 
22 |             index = i * col_blocks
23 |             for j in range(nblock, col_blocks):
24 |                 remv[j] |= mask[index + j]
25 |     return selection, n_selection
26 | 


--------------------------------------------------------------------------------
/model/utils/nms/build.py:
--------------------------------------------------------------------------------
 1 | from distutils.core import setup
 2 | from distutils.extension import Extension
 3 | from Cython.Distutils import build_ext
 4 | 
 5 | ext_modules = [Extension("_nms_gpu_post", ["_nms_gpu_post.pyx"])]
 6 | setup(
 7 |     name="Hello pyx",
 8 |     cmdclass={'build_ext': build_ext},
 9 |     ext_modules=ext_modules
10 | )
11 | 


--------------------------------------------------------------------------------
/model/utils/nms/non_maximum_suppression.py:
--------------------------------------------------------------------------------
  1 | from __future__ import division
  2 | import numpy as np
  3 | import cupy as cp
  4 | import torch as t
  5 | try:
  6 |     from ._nms_gpu_post import _nms_gpu_post
  7 | except:
  8 |     import warnings
  9 |     warnings.warn('''
 10 |     the python code for non_maximum_suppression is about 2x slow
 11 |     It is strongly recommended to build cython code: 
 12 |     `cd model/utils/nms/; python3 build.py build_ext --inplace''')
 13 |     from ._nms_gpu_post_py import _nms_gpu_post
 14 | 
 15 | 
 16 | @cp.util.memoize(for_each_device=True)
 17 | def _load_kernel(kernel_name, code, options=()):
 18 |     cp.cuda.runtime.free(0)
 19 |     assert isinstance(options, tuple)
 20 |     kernel_code = cp.cuda.compile_with_cache(code, options=options)
 21 |     return kernel_code.get_function(kernel_name)
 22 | 
 23 | 
 24 | def non_maximum_suppression(bbox, thresh, score=None,
 25 |                             limit=None):
 26 |     """Suppress bounding boxes according to their IoUs.
 27 | 
 28 |     This method checks each bounding box sequentially and selects the bounding
 29 |     box if the Intersection over Unions (IoUs) between the bounding box and the
 30 |     previously selected bounding boxes is less than :obj:`thresh`. This method
 31 |     is mainly used as postprocessing of object detection.
 32 |     The bounding boxes are selected from ones with higher scores.
 33 |     If :obj:`score` is not provided as an argument, the bounding box
 34 |     is ordered by its index in ascending order.
 35 | 
 36 |     The bounding boxes are expected to be packed into a two dimensional
 37 |     tensor of shape :math:`(R, 4)`, where :math:`R` is the number of
 38 |     bounding boxes in the image. The second axis represents attributes of
 39 |     the bounding box. They are :math:`(y_{min}, x_{min}, y_{max}, x_{max})`,
 40 |     where the four attributes are coordinates of the top left and the
 41 |     bottom right vertices.
 42 | 
 43 |     :obj:`score` is a float array of shape :math:`(R,)`. Each score indicates
 44 |     confidence of prediction.
 45 | 
 46 |     This function accepts both :obj:`numpy.ndarray` and :obj:`cupy.ndarray` as
 47 |     an input. Please note that both :obj:`bbox` and :obj:`score` need to be
 48 |     the same type.
 49 |     The type of the output is the same as the input.
 50 | 
 51 |     Args:
 52 |         bbox (array): Bounding boxes to be transformed. The shape is
 53 |             :math:`(R, 4)`. :math:`R` is the number of bounding boxes.
 54 |         thresh (float): Threshold of IoUs.
 55 |         score (array): An array of confidences whose shape is :math:`(R,)`.
 56 |         limit (int): The upper bound of the number of the output bounding
 57 |             boxes. If it is not specified, this method selects as many
 58 |             bounding boxes as possible.
 59 | 
 60 |     Returns:
 61 |         array:
 62 |         An array with indices of bounding boxes that are selected. \
 63 |         They are sorted by the scores of bounding boxes in descending \
 64 |         order. \
 65 |         The shape of this array is :math:`(K,)` and its dtype is\
 66 |         :obj:`numpy.int32`. Note that :math:`K \\leq R`.
 67 | 
 68 |     """
 69 | 
 70 |     return _non_maximum_suppression_gpu(bbox, thresh, score, limit)
 71 | 
 72 | 
 73 | def _non_maximum_suppression_gpu(bbox, thresh, score=None, limit=None):
 74 |     if len(bbox) == 0:
 75 |         return cp.zeros((0,), dtype=np.int32)
 76 | 
 77 |     n_bbox = bbox.shape[0]
 78 | 
 79 |     if score is not None:
 80 |         order = score.argsort()[::-1].astype(np.int32)
 81 |     else:
 82 |         order = cp.arange(n_bbox, dtype=np.int32)
 83 | 
 84 |     sorted_bbox = bbox[order, :]
 85 |     selec, n_selec = _call_nms_kernel(
 86 |         sorted_bbox, thresh)
 87 |     selec = selec[:n_selec]
 88 |     selec = order[selec]
 89 |     if limit is not None:
 90 |         selec = selec[:limit]
 91 |     return cp.asnumpy(selec)
 92 | 
 93 | 
 94 | _nms_gpu_code = '''
 95 | #define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
 96 | int const threadsPerBlock = sizeof(unsigned long long) * 8;
 97 | 
 98 | __device__
 99 | inline float devIoU(float const *const bbox_a, float const *const bbox_b) {
100 |   float top = max(bbox_a[0], bbox_b[0]);
101 |   float bottom = min(bbox_a[2], bbox_b[2]);
102 |   float left = max(bbox_a[1], bbox_b[1]);
103 |   float right = min(bbox_a[3], bbox_b[3]);
104 |   float height = max(bottom - top, 0.f);
105 |   float width = max(right - left, 0.f);
106 |   float area_i = height * width;
107 |   float area_a = (bbox_a[2] - bbox_a[0]) * (bbox_a[3] - bbox_a[1]);
108 |   float area_b = (bbox_b[2] - bbox_b[0]) * (bbox_b[3] - bbox_b[1]);
109 |   return area_i / (area_a + area_b - area_i);
110 | }
111 | 
112 | extern "C"
113 | __global__
114 | void nms_kernel(const int n_bbox, const float thresh,
115 |                 const float *dev_bbox,
116 |                 unsigned long long *dev_mask) {
117 |   const int row_start = blockIdx.y;
118 |   const int col_start = blockIdx.x;
119 | 
120 |   const int row_size =
121 |         min(n_bbox - row_start * threadsPerBlock, threadsPerBlock);
122 |   const int col_size =
123 |         min(n_bbox - col_start * threadsPerBlock, threadsPerBlock);
124 | 
125 |   __shared__ float block_bbox[threadsPerBlock * 4];
126 |   if (threadIdx.x < col_size) {
127 |     block_bbox[threadIdx.x * 4 + 0] =
128 |         dev_bbox[(threadsPerBlock * col_start + threadIdx.x) * 4 + 0];
129 |     block_bbox[threadIdx.x * 4 + 1] =
130 |         dev_bbox[(threadsPerBlock * col_start + threadIdx.x) * 4 + 1];
131 |     block_bbox[threadIdx.x * 4 + 2] =
132 |         dev_bbox[(threadsPerBlock * col_start + threadIdx.x) * 4 + 2];
133 |     block_bbox[threadIdx.x * 4 + 3] =
134 |         dev_bbox[(threadsPerBlock * col_start + threadIdx.x) * 4 + 3];
135 |   }
136 |   __syncthreads();
137 | 
138 |   if (threadIdx.x < row_size) {
139 |     const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
140 |     const float *cur_box = dev_bbox + cur_box_idx * 4;
141 |     int i = 0;
142 |     unsigned long long t = 0;
143 |     int start = 0;
144 |     if (row_start == col_start) {
145 |       start = threadIdx.x + 1;
146 |     }
147 |     for (i = start; i < col_size; i++) {
148 |       if (devIoU(cur_box, block_bbox + i * 4) >= thresh) {
149 |         t |= 1ULL << i;
150 |       }
151 |     }
152 |     const int col_blocks = DIVUP(n_bbox, threadsPerBlock);
153 |     dev_mask[cur_box_idx * col_blocks + col_start] = t;
154 |   }
155 | }
156 | '''
157 | 
158 | 
159 | def _call_nms_kernel(bbox, thresh):
160 |     # PyTorch does not support unsigned long Tensor.
161 |     # Doesn't matter,since it returns ndarray finally.
162 |     # So I'll keep it unmodified.
163 |     n_bbox = bbox.shape[0]
164 |     threads_per_block = 64
165 |     col_blocks = np.ceil(n_bbox / threads_per_block).astype(np.int32)
166 |     blocks = (col_blocks, col_blocks, 1)
167 |     threads = (threads_per_block, 1, 1)
168 | 
169 |     mask_dev = cp.zeros((n_bbox * col_blocks,), dtype=np.uint64)
170 |     bbox = cp.ascontiguousarray(bbox, dtype=np.float32)
171 |     kern = _load_kernel('nms_kernel', _nms_gpu_code)
172 |     kern(blocks, threads, args=(cp.int32(n_bbox), cp.float32(thresh),
173 |                                 bbox, mask_dev))
174 | 
175 |     mask_host = mask_dev.get()
176 |     selection, n_selec = _nms_gpu_post(
177 |         mask_host, n_bbox, threads_per_block, col_blocks)
178 |     return selection, n_selec
179 | 


--------------------------------------------------------------------------------
/model/utils/roi_cupy.py:
--------------------------------------------------------------------------------
  1 | kernel_forward = '''
  2 |     extern "C"
  3 |     __global__ void roi_forward(const float* const bottom_data,const float* const bottom_rois,
  4 |                 float* top_data, int* argmax_data,
  5 |                 const double spatial_scale,const int channels,const int height, 
  6 |                 const int width, const int pooled_height, 
  7 |                 const int pooled_width,const int NN
  8 |     ){
  9 |         
 10 |     int idx = blockIdx.x * blockDim.x + threadIdx.x;
 11 |     if(idx>=NN)
 12 |         return;
 13 |     const int pw = idx % pooled_width;
 14 |     const int ph = (idx / pooled_width) % pooled_height;
 15 |     const int c = (idx / pooled_width / pooled_height) % channels;
 16 |     int num = idx / pooled_width / pooled_height / channels;
 17 |     const int roi_batch_ind = bottom_rois[num * 5 + 0];
 18 |     const int roi_start_w = round(bottom_rois[num * 5 + 1] * spatial_scale);
 19 |     const int roi_start_h = round(bottom_rois[num * 5 + 2] * spatial_scale);
 20 |     const int roi_end_w = round(bottom_rois[num * 5 + 3] * spatial_scale);
 21 |     const int roi_end_h = round(bottom_rois[num * 5 + 4] * spatial_scale);
 22 |     // Force malformed ROIs to be 1x1
 23 |     const int roi_width = max(roi_end_w - roi_start_w + 1, 1);
 24 |     const int roi_height = max(roi_end_h - roi_start_h + 1, 1);
 25 |     const float bin_size_h = static_cast<float>(roi_height)
 26 |                     / static_cast<float>(pooled_height);
 27 |     const float bin_size_w = static_cast<float>(roi_width)
 28 |                     / static_cast<float>(pooled_width);
 29 | 
 30 |     int hstart = static_cast<int>(floor(static_cast<float>(ph)
 31 |                                     * bin_size_h));
 32 |         int wstart = static_cast<int>(floor(static_cast<float>(pw)
 33 |                                     * bin_size_w));
 34 |     int hend = static_cast<int>(ceil(static_cast<float>(ph + 1)
 35 |                                 * bin_size_h));
 36 |         int wend = static_cast<int>(ceil(static_cast<float>(pw + 1)
 37 |                                 * bin_size_w));
 38 | 
 39 |     // Add roi offsets and clip to input boundaries
 40 |     hstart = min(max(hstart + roi_start_h, 0), height);
 41 |     hend = min(max(hend + roi_start_h, 0), height);
 42 |     wstart = min(max(wstart + roi_start_w, 0), width);
 43 |     wend = min(max(wend + roi_start_w, 0), width);
 44 |     bool is_empty = (hend <= hstart) || (wend <= wstart);
 45 | 
 46 |     // Define an empty pooling region to be zero
 47 |     float maxval = is_empty ? 0 : -1E+37;
 48 |     // If nothing is pooled, argmax=-1 causes nothing to be backprop'd
 49 |     int maxidx = -1;
 50 |     const int data_offset = (roi_batch_ind * channels + c) * height * width;
 51 |     for (int h = hstart; h < hend; ++h) {
 52 |         for (int w = wstart; w < wend; ++w) {
 53 |             int bottom_index = h * width + w;
 54 |             if (bottom_data[data_offset + bottom_index] > maxval) {
 55 |                 maxval = bottom_data[data_offset + bottom_index];
 56 |                 maxidx = bottom_index;
 57 |             }
 58 |         }
 59 |     }
 60 |     top_data[idx]=maxval;
 61 |     argmax_data[idx]=maxidx;
 62 |     }
 63 | '''
 64 | kernel_backward = '''
 65 |     extern "C"
 66 |     __global__ void roi_backward(const float* const top_diff,
 67 |          const int* const argmax_data,const float* const bottom_rois,
 68 |          float* bottom_diff, const int num_rois,
 69 |          const double spatial_scale, int channels,
 70 |          int height, int width, int pooled_height,
 71 |           int pooled_width,const int NN)
 72 |     {
 73 | 
 74 |     int idx = blockIdx.x * blockDim.x + threadIdx.x;
 75 |     ////Importtan >= instead of >
 76 |     if(idx>=NN)
 77 |         return;
 78 |     int w = idx % width;
 79 |     int h = (idx / width) % height;
 80 |     int c = (idx/ (width * height)) % channels;
 81 |     int num = idx / (width * height * channels);
 82 | 
 83 |     float gradient = 0;
 84 |     // Accumulate gradient over all ROIs that pooled this element
 85 |     for (int roi_n = 0; roi_n < num_rois; ++roi_n) {
 86 |         // Skip if ROI's batch index doesn't match num
 87 |         if (num != static_cast<int>(bottom_rois[roi_n * 5])) {
 88 |             continue;
 89 |         }
 90 | 
 91 |         int roi_start_w = round(bottom_rois[roi_n * 5 + 1]
 92 |                                 * spatial_scale);
 93 |         int roi_start_h = round(bottom_rois[roi_n * 5 + 2]
 94 |                                 * spatial_scale);
 95 |         int roi_end_w = round(bottom_rois[roi_n * 5 + 3]
 96 |                                 * spatial_scale);
 97 |         int roi_end_h = round(bottom_rois[roi_n * 5 + 4]
 98 |                                 * spatial_scale);
 99 | 
100 |         // Skip if ROI doesn't include (h, w)
101 |         const bool in_roi = (w >= roi_start_w && w <= roi_end_w &&
102 |                                 h >= roi_start_h && h <= roi_end_h);
103 |         if (!in_roi) {
104 |             continue;
105 |         }
106 | 
107 |         int offset = (roi_n * channels + c) * pooled_height
108 |                         * pooled_width;
109 | 
110 |         // Compute feasible set of pooled units that could have pooled
111 |         // this bottom unit
112 | 
113 |         // Force malformed ROIs to be 1x1
114 |         int roi_width = max(roi_end_w - roi_start_w + 1, 1);
115 |         int roi_height = max(roi_end_h - roi_start_h + 1, 1);
116 | 
117 |         float bin_size_h = static_cast<float>(roi_height)
118 |                         / static_cast<float>(pooled_height);
119 |         float bin_size_w = static_cast<float>(roi_width)
120 |                         / static_cast<float>(pooled_width);
121 | 
122 |         int phstart = floor(static_cast<float>(h - roi_start_h)
123 |                             / bin_size_h);
124 |         int phend = ceil(static_cast<float>(h - roi_start_h + 1)
125 |                             / bin_size_h);
126 |         int pwstart = floor(static_cast<float>(w - roi_start_w)
127 |                             / bin_size_w);
128 |         int pwend = ceil(static_cast<float>(w - roi_start_w + 1)
129 |                             / bin_size_w);
130 | 
131 |         phstart = min(max(phstart, 0), pooled_height);
132 |         phend = min(max(phend, 0), pooled_height);
133 |         pwstart = min(max(pwstart, 0), pooled_width);
134 |         pwend = min(max(pwend, 0), pooled_width);
135 |         for (int ph = phstart; ph < phend; ++ph) {
136 |             for (int pw = pwstart; pw < pwend; ++pw) {
137 |                 int index_ = ph * pooled_width + pw + offset;
138 |                 if (argmax_data[index_] == (h * width + w)) {
139 |                     gradient += top_diff[index_];
140 |                 }
141 |             }
142 |         }
143 |     }
144 |     bottom_diff[idx] = gradient;
145 |     }
146 | '''
147 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | visdom
 2 | torchvision
 3 | scikit-image
 4 | tqdm
 5 | fire
 6 | pprint
 7 | matplotlib
 8 | ipdb
 9 | cython
10 | git+https://github.com/pytorch/tnt.git@master


--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
  1 | from __future__ import  absolute_import
  2 | # though cupy is not used but without this line, it raise errors...
  3 | import cupy as cp
  4 | import os
  5 | 
  6 | import ipdb
  7 | import matplotlib
  8 | from tqdm import tqdm
  9 | 
 10 | from utils.config import opt
 11 | from data.dataset import Dataset, TestDataset, inverse_normalize
 12 | from model import FasterRCNNVGG16
 13 | from torch.utils import data as data_
 14 | from trainer import FasterRCNNTrainer
 15 | from utils import array_tool as at
 16 | from utils.vis_tool import visdom_bbox
 17 | from utils.eval_tool import eval_detection_voc
 18 | 
 19 | # fix for ulimit
 20 | # https://github.com/pytorch/pytorch/issues/973#issuecomment-346405667
 21 | import resource
 22 | 
 23 | rlimit = resource.getrlimit(resource.RLIMIT_NOFILE)
 24 | resource.setrlimit(resource.RLIMIT_NOFILE, (20480, rlimit[1]))
 25 | 
 26 | matplotlib.use('agg')
 27 | 
 28 | 
 29 | def eval(dataloader, faster_rcnn, test_num=10000):
 30 |     pred_bboxes, pred_labels, pred_scores = list(), list(), list()
 31 |     gt_bboxes, gt_labels, gt_difficults = list(), list(), list()
 32 |     for ii, (imgs, sizes, gt_bboxes_, gt_labels_, gt_difficults_) in tqdm(enumerate(dataloader)):
 33 |         sizes = [sizes[0][0].item(), sizes[1][0].item()]
 34 |         pred_bboxes_, pred_labels_, pred_scores_ = faster_rcnn.predict(imgs, [sizes])
 35 |         gt_bboxes += list(gt_bboxes_.numpy())
 36 |         gt_labels += list(gt_labels_.numpy())
 37 |         gt_difficults += list(gt_difficults_.numpy())
 38 |         pred_bboxes += pred_bboxes_
 39 |         pred_labels += pred_labels_
 40 |         pred_scores += pred_scores_
 41 |         if ii == test_num: break
 42 | 
 43 |     result = eval_detection_voc(
 44 |         pred_bboxes, pred_labels, pred_scores,
 45 |         gt_bboxes, gt_labels, gt_difficults,
 46 |         use_07_metric=True)
 47 |     return result
 48 | 
 49 | 
 50 | def train(**kwargs):
 51 |     opt._parse(kwargs)
 52 | 
 53 |     dataset = Dataset(opt)
 54 |     print('load data')
 55 |     dataloader = data_.DataLoader(dataset, \
 56 |                                   batch_size=1, \
 57 |                                   shuffle=True, \
 58 |                                   # pin_memory=True,
 59 |                                   num_workers=opt.num_workers)
 60 |     testset = TestDataset(opt)
 61 |     test_dataloader = data_.DataLoader(testset,
 62 |                                        batch_size=1,
 63 |                                        num_workers=opt.test_num_workers,
 64 |                                        shuffle=False, \
 65 |                                        pin_memory=True
 66 |                                        )
 67 |     faster_rcnn = FasterRCNNVGG16()
 68 |     print('model construct completed')
 69 |     trainer = FasterRCNNTrainer(faster_rcnn).cuda()
 70 |     if opt.load_path:
 71 |         trainer.load(opt.load_path)
 72 |         print('load pretrained model from %s' % opt.load_path)
 73 |     trainer.vis.text(dataset.db.label_names, win='labels')
 74 |     best_map = 0
 75 |     lr_ = opt.lr
 76 |     for epoch in range(opt.epoch):
 77 |         trainer.reset_meters()
 78 |         for ii, (img, bbox_, label_, scale) in tqdm(enumerate(dataloader)):
 79 |             scale = at.scalar(scale)
 80 |             img, bbox, label = img.cuda().float(), bbox_.cuda(), label_.cuda()
 81 |             trainer.train_step(img, bbox, label, scale)
 82 | 
 83 |             if (ii + 1) % opt.plot_every == 0:
 84 |                 if os.path.exists(opt.debug_file):
 85 |                     ipdb.set_trace()
 86 | 
 87 |                 # plot loss
 88 |                 trainer.vis.plot_many(trainer.get_meter_data())
 89 | 
 90 |                 # plot groud truth bboxes
 91 |                 ori_img_ = inverse_normalize(at.tonumpy(img[0]))
 92 |                 gt_img = visdom_bbox(ori_img_,
 93 |                                      at.tonumpy(bbox_[0]),
 94 |                                      at.tonumpy(label_[0]))
 95 |                 trainer.vis.img('gt_img', gt_img)
 96 | 
 97 |                 # plot predicti bboxes
 98 |                 _bboxes, _labels, _scores = trainer.faster_rcnn.predict([ori_img_], visualize=True)
 99 |                 pred_img = visdom_bbox(ori_img_,
100 |                                        at.tonumpy(_bboxes[0]),
101 |                                        at.tonumpy(_labels[0]).reshape(-1),
102 |                                        at.tonumpy(_scores[0]))
103 |                 trainer.vis.img('pred_img', pred_img)
104 | 
105 |                 # rpn confusion matrix(meter)
106 |                 trainer.vis.text(str(trainer.rpn_cm.value().tolist()), win='rpn_cm')
107 |                 # roi confusion matrix
108 |                 trainer.vis.img('roi_cm', at.totensor(trainer.roi_cm.conf, False).float())
109 |         eval_result = eval(test_dataloader, faster_rcnn, test_num=opt.test_num)
110 |         trainer.vis.plot('test_map', eval_result['map'])
111 |         lr_ = trainer.faster_rcnn.optimizer.param_groups[0]['lr']
112 |         log_info = 'lr:{}, map:{},loss:{}'.format(str(lr_),
113 |                                                   str(eval_result['map']),
114 |                                                   str(trainer.get_meter_data()))
115 |         trainer.vis.log(log_info)
116 | 
117 |         if eval_result['map'] > best_map:
118 |             best_map = eval_result['map']
119 |             best_path = trainer.save(best_map=best_map)
120 |         if epoch == 9:
121 |             trainer.load(best_path)
122 |             trainer.faster_rcnn.scale_lr(opt.lr_decay)
123 |             lr_ = lr_ * opt.lr_decay
124 | 
125 |         if epoch == 13:
126 |             break
127 | 
128 | 
129 | if __name__ == '__main__':
130 |     import fire
131 | 
132 |     fire.Fire()
133 | 


--------------------------------------------------------------------------------
/trainer.py:
--------------------------------------------------------------------------------
  1 | from __future__ import  absolute_import
  2 | import os
  3 | from collections import namedtuple
  4 | import time
  5 | from torch.nn import functional as F
  6 | from model.utils.creator_tool import AnchorTargetCreator, ProposalTargetCreator
  7 | 
  8 | from torch import nn
  9 | import torch as t
 10 | from utils import array_tool as at
 11 | from utils.vis_tool import Visualizer
 12 | 
 13 | from utils.config import opt
 14 | from torchnet.meter import ConfusionMeter, AverageValueMeter
 15 | 
 16 | LossTuple = namedtuple('LossTuple',
 17 |                        ['rpn_loc_loss',
 18 |                         'rpn_cls_loss',
 19 |                         'roi_loc_loss',
 20 |                         'roi_cls_loss',
 21 |                         'total_loss'
 22 |                         ])
 23 | 
 24 | 
 25 | class FasterRCNNTrainer(nn.Module):
 26 |     """wrapper for conveniently training. return losses
 27 | 
 28 |     The losses include:
 29 | 
 30 |     * :obj:`rpn_loc_loss`: The localization loss for \
 31 |         Region Proposal Network (RPN).
 32 |     * :obj:`rpn_cls_loss`: The classification loss for RPN.
 33 |     * :obj:`roi_loc_loss`: The localization loss for the head module.
 34 |     * :obj:`roi_cls_loss`: The classification loss for the head module.
 35 |     * :obj:`total_loss`: The sum of 4 loss above.
 36 | 
 37 |     Args:
 38 |         faster_rcnn (model.FasterRCNN):
 39 |             A Faster R-CNN model that is going to be trained.
 40 |     """
 41 | 
 42 |     def __init__(self, faster_rcnn):
 43 |         super(FasterRCNNTrainer, self).__init__()
 44 | 
 45 |         self.faster_rcnn = faster_rcnn
 46 |         self.rpn_sigma = opt.rpn_sigma
 47 |         self.roi_sigma = opt.roi_sigma
 48 | 
 49 |         # target creator create gt_bbox gt_label etc as training targets.
 50 |         self.anchor_target_creator = AnchorTargetCreator()
 51 |         self.proposal_target_creator = ProposalTargetCreator()
 52 | 
 53 |         self.loc_normalize_mean = faster_rcnn.loc_normalize_mean
 54 |         self.loc_normalize_std = faster_rcnn.loc_normalize_std
 55 | 
 56 |         self.optimizer = self.faster_rcnn.get_optimizer()
 57 |         # visdom wrapper
 58 |         self.vis = Visualizer(env=opt.env)
 59 | 
 60 |         # indicators for training status
 61 |         self.rpn_cm = ConfusionMeter(2)
 62 |         self.roi_cm = ConfusionMeter(21)
 63 |         self.meters = {k: AverageValueMeter() for k in LossTuple._fields}  # average loss
 64 | 
 65 |     def forward(self, imgs, bboxes, labels, scale):
 66 |         """Forward Faster R-CNN and calculate losses.
 67 | 
 68 |         Here are notations used.
 69 | 
 70 |         * :math:`N` is the batch size.
 71 |         * :math:`R` is the number of bounding boxes per image.
 72 | 
 73 |         Currently, only :math:`N=1` is supported.
 74 | 
 75 |         Args:
 76 |             imgs (~torch.autograd.Variable): A variable with a batch of images.
 77 |             bboxes (~torch.autograd.Variable): A batch of bounding boxes.
 78 |                 Its shape is :math:`(N, R, 4)`.
 79 |             labels (~torch.autograd..Variable): A batch of labels.
 80 |                 Its shape is :math:`(N, R)`. The background is excluded from
 81 |                 the definition, which means that the range of the value
 82 |                 is :math:`[0, L - 1]`. :math:`L` is the number of foreground
 83 |                 classes.
 84 |             scale (float): Amount of scaling applied to
 85 |                 the raw image during preprocessing.
 86 | 
 87 |         Returns:
 88 |             namedtuple of 5 losses
 89 |         """
 90 |         n = bboxes.shape[0]
 91 |         if n != 1:
 92 |             raise ValueError('Currently only batch size 1 is supported.')
 93 | 
 94 |         _, _, H, W = imgs.shape
 95 |         img_size = (H, W)
 96 | 
 97 |         features = self.faster_rcnn.extractor(imgs)
 98 | 
 99 |         rpn_locs, rpn_scores, rois, roi_indices, anchor = \
100 |             self.faster_rcnn.rpn(features, img_size, scale)
101 | 
102 |         # Since batch size is one, convert variables to singular form
103 |         bbox = bboxes[0]
104 |         label = labels[0]
105 |         rpn_score = rpn_scores[0]
106 |         rpn_loc = rpn_locs[0]
107 |         roi = rois
108 | 
109 |         # SRGAN on roi
110 |         # TODO: Upsample roi through pretrained SRGAN network
111 | 
112 |         # Sample RoIs and forward
113 |         # it's fine to break the computation graph of rois,
114 |         # consider them as constant input
115 |         sample_roi, gt_roi_loc, gt_roi_label = self.proposal_target_creator(
116 |             roi,
117 |             at.tonumpy(bbox),
118 |             at.tonumpy(label),
119 |             self.loc_normalize_mean,
120 |             self.loc_normalize_std)
121 |         # NOTE it's all zero because now it only support for batch=1 now
122 |         sample_roi_index = t.zeros(len(sample_roi))
123 |         roi_cls_loc, roi_score = self.faster_rcnn.head(
124 |             features,
125 |             sample_roi,
126 |             sample_roi_index)
127 | 
128 |         # ------------------ RPN losses -------------------#
129 |         gt_rpn_loc, gt_rpn_label = self.anchor_target_creator(
130 |             at.tonumpy(bbox),
131 |             anchor,
132 |             img_size)
133 |         gt_rpn_label = at.totensor(gt_rpn_label).long()
134 |         gt_rpn_loc = at.totensor(gt_rpn_loc)
135 |         rpn_loc_loss = _fast_rcnn_loc_loss(
136 |             rpn_loc,
137 |             gt_rpn_loc,
138 |             gt_rpn_label.data,
139 |             self.rpn_sigma)
140 | 
141 |         # NOTE: default value of ignore_index is -100 ...
142 |         rpn_cls_loss = F.cross_entropy(rpn_score, gt_rpn_label.cuda(), ignore_index=-1)
143 |         _gt_rpn_label = gt_rpn_label[gt_rpn_label > -1]
144 |         _rpn_score = at.tonumpy(rpn_score)[at.tonumpy(gt_rpn_label) > -1]
145 |         self.rpn_cm.add(at.totensor(_rpn_score, False), _gt_rpn_label.data.long())
146 | 
147 |         # ------------------ ROI losses (fast rcnn loss) -------------------#
148 |         n_sample = roi_cls_loc.shape[0]
149 |         roi_cls_loc = roi_cls_loc.view(n_sample, -1, 4)
150 |         roi_loc = roi_cls_loc[t.arange(0, n_sample).long().cuda(), \
151 |                               at.totensor(gt_roi_label).long()]
152 |         gt_roi_label = at.totensor(gt_roi_label).long()
153 |         gt_roi_loc = at.totensor(gt_roi_loc)
154 | 
155 |         roi_loc_loss = _fast_rcnn_loc_loss(
156 |             roi_loc.contiguous(),
157 |             gt_roi_loc,
158 |             gt_roi_label.data,
159 |             self.roi_sigma)
160 | 
161 |         roi_cls_loss = nn.CrossEntropyLoss()(roi_score, gt_roi_label.cuda())
162 | 
163 |         self.roi_cm.add(at.totensor(roi_score, False), gt_roi_label.data.long())
164 | 
165 |         losses = [rpn_loc_loss, rpn_cls_loss, roi_loc_loss, roi_cls_loss]
166 |         losses = losses + [sum(losses)]
167 | 
168 |         return LossTuple(*losses)
169 | 
170 |     def train_step(self, imgs, bboxes, labels, scale):
171 |         self.optimizer.zero_grad()
172 |         losses = self.forward(imgs, bboxes, labels, scale)
173 |         losses.total_loss.backward()
174 |         self.optimizer.step()
175 |         self.update_meters(losses)
176 |         return losses
177 | 
178 |     def save(self, save_optimizer=False, save_path=None, **kwargs):
179 |         """serialize models include optimizer and other info
180 |         return path where the model-file is stored.
181 | 
182 |         Args:
183 |             save_optimizer (bool): whether save optimizer.state_dict().
184 |             save_path (string): where to save model, if it's None, save_path
185 |                 is generate using time str and info from kwargs.
186 | 
187 |         Returns:
188 |             save_path(str): the path to save models.
189 |         """
190 |         save_dict = dict()
191 | 
192 |         save_dict['model'] = self.faster_rcnn.state_dict()
193 |         save_dict['config'] = opt._state_dict()
194 |         save_dict['other_info'] = kwargs
195 |         save_dict['vis_info'] = self.vis.state_dict()
196 | 
197 |         if save_optimizer:
198 |             save_dict['optimizer'] = self.optimizer.state_dict()
199 | 
200 |         if save_path is None:
201 |             timestr = time.strftime('%m%d%H%M')
202 |             save_path = 'checkpoints/fasterrcnn_%s' % timestr
203 |             for k_, v_ in kwargs.items():
204 |                 save_path += '_%s' % v_
205 | 
206 |         save_dir = os.path.dirname(save_path)
207 |         if not os.path.exists(save_dir):
208 |             os.makedirs(save_dir)
209 | 
210 |         t.save(save_dict, save_path)
211 |         self.vis.save([self.vis.env])
212 |         return save_path
213 | 
214 |     def load(self, path, load_optimizer=True, parse_opt=False):
215 |         state_dict = t.load(path)
216 |         if 'model' in state_dict:
217 |             self.faster_rcnn.load_state_dict(state_dict['model'])
218 |         else:  # legacy way, for backward compatibility
219 |             self.faster_rcnn.load_state_dict(state_dict)
220 |             return self
221 |         if parse_opt:
222 |             opt._parse(state_dict['config'])
223 |         if 'optimizer' in state_dict and load_optimizer:
224 |             self.optimizer.load_state_dict(state_dict['optimizer'])
225 |         return self
226 | 
227 |     def update_meters(self, losses):
228 |         loss_d = {k: at.scalar(v) for k, v in losses._asdict().items()}
229 |         for key, meter in self.meters.items():
230 |             meter.add(loss_d[key])
231 | 
232 |     def reset_meters(self):
233 |         for key, meter in self.meters.items():
234 |             meter.reset()
235 |         self.roi_cm.reset()
236 |         self.rpn_cm.reset()
237 | 
238 |     def get_meter_data(self):
239 |         return {k: v.value()[0] for k, v in self.meters.items()}
240 | 
241 | 
242 | def _smooth_l1_loss(x, t, in_weight, sigma):
243 |     sigma2 = sigma ** 2
244 |     diff = in_weight * (x - t)
245 |     abs_diff = diff.abs()
246 |     flag = (abs_diff.data < (1. / sigma2)).float()
247 |     y = (flag * (sigma2 / 2.) * (diff ** 2) +
248 |          (1 - flag) * (abs_diff - 0.5 / sigma2))
249 |     return y.sum()
250 | 
251 | 
252 | def _fast_rcnn_loc_loss(pred_loc, gt_loc, gt_label, sigma):
253 |     in_weight = t.zeros(gt_loc.shape).cuda()
254 |     # Localization loss is calculated only for positive rois.
255 |     # NOTE:  unlike origin implementation,
256 |     # we don't need inside_weight and outside_weight, they can calculate by gt_label
257 |     in_weight[(gt_label > 0).view(-1, 1).expand_as(in_weight).cuda()] = 1
258 |     loc_loss = _smooth_l1_loss(pred_loc, gt_loc, in_weight.detach(), sigma)
259 |     # Normalize by total number of negtive and positive rois.
260 |     loc_loss /= ((gt_label >= 0).sum().float()) # ignore gt_label==-1 for rpn_loss
261 |     return loc_loss
262 | 


--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
 1 | #    Copyright 2017 cy
 2 | # 
 3 | #    Licensed under the Apache License, Version 2.0 (the "License");
 4 | #    you may not use this file except in compliance with the License.
 5 | #    You may obtain a copy of the License at
 6 | # 
 7 | #        http://www.apache.org/licenses/LICENSE-2.0
 8 | # 
 9 | #    Unless required by applicable law or agreed to in writing, software
10 | #    distributed under the License is distributed on an "AS IS" BASIS,
11 | #    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | #    See the License for the specific language governing permissions and
13 | #    limitations under the License.
14 | 
15 | 


--------------------------------------------------------------------------------
/utils/array_tool.py:
--------------------------------------------------------------------------------
 1 | """
 2 | tools to convert specified type
 3 | """
 4 | import torch as t
 5 | import numpy as np
 6 | 
 7 | 
 8 | def tonumpy(data):
 9 |     if isinstance(data, np.ndarray):
10 |         return data
11 |     if isinstance(data, t.Tensor):
12 |         return data.detach().cpu().numpy()
13 | 
14 | 
15 | def totensor(data, cuda=True):
16 |     if isinstance(data, np.ndarray):
17 |         tensor = t.from_numpy(data)
18 |     if isinstance(data, t.Tensor):
19 |         tensor = data.detach()
20 |     if cuda:
21 |         tensor = tensor.cuda()
22 |     return tensor
23 | 
24 | 
25 | def scalar(data):
26 |     if isinstance(data, np.ndarray):
27 |         return data.reshape(1)[0]
28 |     if isinstance(data, t.Tensor):
29 |         return data.item()


--------------------------------------------------------------------------------
/utils/config.py:
--------------------------------------------------------------------------------
 1 | from pprint import pprint
 2 | 
 3 | 
 4 | # Default Configs for training
 5 | # NOTE that, config items could be overwriten by passing argument through command line.
 6 | # e.g. --voc-data-dir='./data/'
 7 | 
 8 | class Config:
 9 |     # data
10 |     voc_data_dir = '/home/cy/.chainer/dataset/pfnet/chainercv/voc/VOCdevkit/VOC2007/'
11 |     min_size = 600  # image resize
12 |     max_size = 1000 # image resize
13 |     num_workers = 8
14 |     test_num_workers = 8
15 | 
16 |     # sigma for l1_smooth_loss
17 |     rpn_sigma = 3.
18 |     roi_sigma = 1.
19 | 
20 |     # param for optimizer
21 |     # 0.0005 in origin paper but 0.0001 in tf-faster-rcnn
22 |     weight_decay = 0.0005
23 |     lr_decay = 0.1  # 1e-3 -> 1e-4
24 |     lr = 1e-3
25 | 
26 | 
27 |     # visualization
28 |     env = 'faster-rcnn'  # visdom env
29 |     port = 8097
30 |     plot_every = 40  # vis every N iter
31 | 
32 |     # preset
33 |     data = 'voc'
34 |     pretrained_model = 'vgg16'
35 | 
36 |     # training
37 |     epoch = 14
38 | 
39 | 
40 |     use_adam = False # Use Adam optimizer
41 |     use_chainer = False # try match everything as chainer
42 |     use_drop = False # use dropout in RoIHead
43 |     # debug
44 |     debug_file = '/tmp/debugf'
45 | 
46 |     test_num = 10000
47 |     # model
48 |     load_path = None
49 | 
50 |     caffe_pretrain = False # use caffe pretrained model instead of torchvision
51 |     caffe_pretrain_path = 'checkpoints/vgg16_caffe.pth'
52 | 
53 |     def _parse(self, kwargs):
54 |         state_dict = self._state_dict()
55 |         for k, v in kwargs.items():
56 |             if k not in state_dict:
57 |                 raise ValueError('UnKnown Option: "--%s"' % k)
58 |             setattr(self, k, v)
59 | 
60 |         print('======user config========')
61 |         pprint(self._state_dict())
62 |         print('==========end============')
63 | 
64 |     def _state_dict(self):
65 |         return {k: getattr(self, k) for k, _ in Config.__dict__.items() \
66 |                 if not k.startswith('_')}
67 | 
68 | 
69 | opt = Config()
70 | 


--------------------------------------------------------------------------------
/utils/eval_tool.py:
--------------------------------------------------------------------------------
  1 | from __future__ import division
  2 | 
  3 | from collections import defaultdict
  4 | import itertools
  5 | import numpy as np
  6 | import six
  7 | 
  8 | from model.utils.bbox_tools import bbox_iou
  9 | 
 10 | 
 11 | def eval_detection_voc(
 12 |         pred_bboxes, pred_labels, pred_scores, gt_bboxes, gt_labels,
 13 |         gt_difficults=None,
 14 |         iou_thresh=0.5, use_07_metric=False):
 15 |     """Calculate average precisions based on evaluation code of PASCAL VOC.
 16 | 
 17 |     This function evaluates predicted bounding boxes obtained from a dataset
 18 |     which has :math:`N` images by using average precision for each class.
 19 |     The code is based on the evaluation code used in PASCAL VOC Challenge.
 20 | 
 21 |     Args:
 22 |         pred_bboxes (iterable of numpy.ndarray): An iterable of :math:`N`
 23 |             sets of bounding boxes.
 24 |             Its index corresponds to an index for the base dataset.
 25 |             Each element of :obj:`pred_bboxes` is a set of coordinates
 26 |             of bounding boxes. This is an array whose shape is :math:`(R, 4)`,
 27 |             where :math:`R` corresponds
 28 |             to the number of bounding boxes, which may vary among boxes.
 29 |             The second axis corresponds to
 30 |             :math:`y_{min}, x_{min}, y_{max}, x_{max}` of a bounding box.
 31 |         pred_labels (iterable of numpy.ndarray): An iterable of labels.
 32 |             Similar to :obj:`pred_bboxes`, its index corresponds to an
 33 |             index for the base dataset. Its length is :math:`N`.
 34 |         pred_scores (iterable of numpy.ndarray): An iterable of confidence
 35 |             scores for predicted bounding boxes. Similar to :obj:`pred_bboxes`,
 36 |             its index corresponds to an index for the base dataset.
 37 |             Its length is :math:`N`.
 38 |         gt_bboxes (iterable of numpy.ndarray): An iterable of ground truth
 39 |             bounding boxes
 40 |             whose length is :math:`N`. An element of :obj:`gt_bboxes` is a
 41 |             bounding box whose shape is :math:`(R, 4)`. Note that the number of
 42 |             bounding boxes in each image does not need to be same as the number
 43 |             of corresponding predicted boxes.
 44 |         gt_labels (iterable of numpy.ndarray): An iterable of ground truth
 45 |             labels which are organized similarly to :obj:`gt_bboxes`.
 46 |         gt_difficults (iterable of numpy.ndarray): An iterable of boolean
 47 |             arrays which is organized similarly to :obj:`gt_bboxes`.
 48 |             This tells whether the
 49 |             corresponding ground truth bounding box is difficult or not.
 50 |             By default, this is :obj:`None`. In that case, this function
 51 |             considers all bounding boxes to be not difficult.
 52 |         iou_thresh (float): A prediction is correct if its Intersection over
 53 |             Union with the ground truth is above this value.
 54 |         use_07_metric (bool): Whether to use PASCAL VOC 2007 evaluation metric
 55 |             for calculating average precision. The default value is
 56 |             :obj:`False`.
 57 | 
 58 |     Returns:
 59 |         dict:
 60 | 
 61 |         The keys, value-types and the description of the values are listed
 62 |         below.
 63 | 
 64 |         * **ap** (*numpy.ndarray*): An array of average precisions. \
 65 |             The :math:`l`-th value corresponds to the average precision \
 66 |             for class :math:`l`. If class :math:`l` does not exist in \
 67 |             either :obj:`pred_labels` or :obj:`gt_labels`, the corresponding \
 68 |             value is set to :obj:`numpy.nan`.
 69 |         * **map** (*float*): The average of Average Precisions over classes.
 70 | 
 71 |     """
 72 | 
 73 |     prec, rec = calc_detection_voc_prec_rec(
 74 |         pred_bboxes, pred_labels, pred_scores,
 75 |         gt_bboxes, gt_labels, gt_difficults,
 76 |         iou_thresh=iou_thresh)
 77 | 
 78 |     ap = calc_detection_voc_ap(prec, rec, use_07_metric=use_07_metric)
 79 | 
 80 |     return {'ap': ap, 'map': np.nanmean(ap)}
 81 | 
 82 | 
 83 | def calc_detection_voc_prec_rec(
 84 |         pred_bboxes, pred_labels, pred_scores, gt_bboxes, gt_labels,
 85 |         gt_difficults=None,
 86 |         iou_thresh=0.5):
 87 |     """Calculate precision and recall based on evaluation code of PASCAL VOC.
 88 | 
 89 |     This function calculates precision and recall of
 90 |     predicted bounding boxes obtained from a dataset which has :math:`N`
 91 |     images.
 92 |     The code is based on the evaluation code used in PASCAL VOC Challenge.
 93 | 
 94 |     Args:
 95 |         pred_bboxes (iterable of numpy.ndarray): An iterable of :math:`N`
 96 |             sets of bounding boxes.
 97 |             Its index corresponds to an index for the base dataset.
 98 |             Each element of :obj:`pred_bboxes` is a set of coordinates
 99 |             of bounding boxes. This is an array whose shape is :math:`(R, 4)`,
100 |             where :math:`R` corresponds
101 |             to the number of bounding boxes, which may vary among boxes.
102 |             The second axis corresponds to
103 |             :math:`y_{min}, x_{min}, y_{max}, x_{max}` of a bounding box.
104 |         pred_labels (iterable of numpy.ndarray): An iterable of labels.
105 |             Similar to :obj:`pred_bboxes`, its index corresponds to an
106 |             index for the base dataset. Its length is :math:`N`.
107 |         pred_scores (iterable of numpy.ndarray): An iterable of confidence
108 |             scores for predicted bounding boxes. Similar to :obj:`pred_bboxes`,
109 |             its index corresponds to an index for the base dataset.
110 |             Its length is :math:`N`.
111 |         gt_bboxes (iterable of numpy.ndarray): An iterable of ground truth
112 |             bounding boxes
113 |             whose length is :math:`N`. An element of :obj:`gt_bboxes` is a
114 |             bounding box whose shape is :math:`(R, 4)`. Note that the number of
115 |             bounding boxes in each image does not need to be same as the number
116 |             of corresponding predicted boxes.
117 |         gt_labels (iterable of numpy.ndarray): An iterable of ground truth
118 |             labels which are organized similarly to :obj:`gt_bboxes`.
119 |         gt_difficults (iterable of numpy.ndarray): An iterable of boolean
120 |             arrays which is organized similarly to :obj:`gt_bboxes`.
121 |             This tells whether the
122 |             corresponding ground truth bounding box is difficult or not.
123 |             By default, this is :obj:`None`. In that case, this function
124 |             considers all bounding boxes to be not difficult.
125 |         iou_thresh (float): A prediction is correct if its Intersection over
126 |             Union with the ground truth is above this value..
127 | 
128 |     Returns:
129 |         tuple of two lists:
130 |         This function returns two lists: :obj:`prec` and :obj:`rec`.
131 | 
132 |         * :obj:`prec`: A list of arrays. :obj:`prec[l]` is precision \
133 |             for class :math:`l`. If class :math:`l` does not exist in \
134 |             either :obj:`pred_labels` or :obj:`gt_labels`, :obj:`prec[l]` is \
135 |             set to :obj:`None`.
136 |         * :obj:`rec`: A list of arrays. :obj:`rec[l]` is recall \
137 |             for class :math:`l`. If class :math:`l` that is not marked as \
138 |             difficult does not exist in \
139 |             :obj:`gt_labels`, :obj:`rec[l]` is \
140 |             set to :obj:`None`.
141 | 
142 |     """
143 | 
144 |     pred_bboxes = iter(pred_bboxes)
145 |     pred_labels = iter(pred_labels)
146 |     pred_scores = iter(pred_scores)
147 |     gt_bboxes = iter(gt_bboxes)
148 |     gt_labels = iter(gt_labels)
149 |     if gt_difficults is None:
150 |         gt_difficults = itertools.repeat(None)
151 |     else:
152 |         gt_difficults = iter(gt_difficults)
153 | 
154 |     n_pos = defaultdict(int)
155 |     score = defaultdict(list)
156 |     match = defaultdict(list)
157 | 
158 |     for pred_bbox, pred_label, pred_score, gt_bbox, gt_label, gt_difficult in \
159 |             six.moves.zip(
160 |                 pred_bboxes, pred_labels, pred_scores,
161 |                 gt_bboxes, gt_labels, gt_difficults):
162 | 
163 |         if gt_difficult is None:
164 |             gt_difficult = np.zeros(gt_bbox.shape[0], dtype=bool)
165 | 
166 |         for l in np.unique(np.concatenate((pred_label, gt_label)).astype(int)):
167 |             pred_mask_l = pred_label == l
168 |             pred_bbox_l = pred_bbox[pred_mask_l]
169 |             pred_score_l = pred_score[pred_mask_l]
170 |             # sort by score
171 |             order = pred_score_l.argsort()[::-1]
172 |             pred_bbox_l = pred_bbox_l[order]
173 |             pred_score_l = pred_score_l[order]
174 | 
175 |             gt_mask_l = gt_label == l
176 |             gt_bbox_l = gt_bbox[gt_mask_l]
177 |             gt_difficult_l = gt_difficult[gt_mask_l]
178 | 
179 |             n_pos[l] += np.logical_not(gt_difficult_l).sum()
180 |             score[l].extend(pred_score_l)
181 | 
182 |             if len(pred_bbox_l) == 0:
183 |                 continue
184 |             if len(gt_bbox_l) == 0:
185 |                 match[l].extend((0,) * pred_bbox_l.shape[0])
186 |                 continue
187 | 
188 |             # VOC evaluation follows integer typed bounding boxes.
189 |             pred_bbox_l = pred_bbox_l.copy()
190 |             pred_bbox_l[:, 2:] += 1
191 |             gt_bbox_l = gt_bbox_l.copy()
192 |             gt_bbox_l[:, 2:] += 1
193 | 
194 |             iou = bbox_iou(pred_bbox_l, gt_bbox_l)
195 |             gt_index = iou.argmax(axis=1)
196 |             # set -1 if there is no matching ground truth
197 |             gt_index[iou.max(axis=1) < iou_thresh] = -1
198 |             del iou
199 | 
200 |             selec = np.zeros(gt_bbox_l.shape[0], dtype=bool)
201 |             for gt_idx in gt_index:
202 |                 if gt_idx >= 0:
203 |                     if gt_difficult_l[gt_idx]:
204 |                         match[l].append(-1)
205 |                     else:
206 |                         if not selec[gt_idx]:
207 |                             match[l].append(1)
208 |                         else:
209 |                             match[l].append(0)
210 |                     selec[gt_idx] = True
211 |                 else:
212 |                     match[l].append(0)
213 | 
214 |     for iter_ in (
215 |             pred_bboxes, pred_labels, pred_scores,
216 |             gt_bboxes, gt_labels, gt_difficults):
217 |         if next(iter_, None) is not None:
218 |             raise ValueError('Length of input iterables need to be same.')
219 | 
220 |     n_fg_class = max(n_pos.keys()) + 1
221 |     prec = [None] * n_fg_class
222 |     rec = [None] * n_fg_class
223 | 
224 |     for l in n_pos.keys():
225 |         score_l = np.array(score[l])
226 |         match_l = np.array(match[l], dtype=np.int8)
227 | 
228 |         order = score_l.argsort()[::-1]
229 |         match_l = match_l[order]
230 | 
231 |         tp = np.cumsum(match_l == 1)
232 |         fp = np.cumsum(match_l == 0)
233 | 
234 |         # If an element of fp + tp is 0,
235 |         # the corresponding element of prec[l] is nan.
236 |         prec[l] = tp / (fp + tp)
237 |         # If n_pos[l] is 0, rec[l] is None.
238 |         if n_pos[l] > 0:
239 |             rec[l] = tp / n_pos[l]
240 | 
241 |     return prec, rec
242 | 
243 | 
244 | def calc_detection_voc_ap(prec, rec, use_07_metric=False):
245 |     """Calculate average precisions based on evaluation code of PASCAL VOC.
246 | 
247 |     This function calculates average precisions
248 |     from given precisions and recalls.
249 |     The code is based on the evaluation code used in PASCAL VOC Challenge.
250 | 
251 |     Args:
252 |         prec (list of numpy.array): A list of arrays.
253 |             :obj:`prec[l]` indicates precision for class :math:`l`.
254 |             If :obj:`prec[l]` is :obj:`None`, this function returns
255 |             :obj:`numpy.nan` for class :math:`l`.
256 |         rec (list of numpy.array): A list of arrays.
257 |             :obj:`rec[l]` indicates recall for class :math:`l`.
258 |             If :obj:`rec[l]` is :obj:`None`, this function returns
259 |             :obj:`numpy.nan` for class :math:`l`.
260 |         use_07_metric (bool): Whether to use PASCAL VOC 2007 evaluation metric
261 |             for calculating average precision. The default value is
262 |             :obj:`False`.
263 | 
264 |     Returns:
265 |         ~numpy.ndarray:
266 |         This function returns an array of average precisions.
267 |         The :math:`l`-th value corresponds to the average precision
268 |         for class :math:`l`. If :obj:`prec[l]` or :obj:`rec[l]` is
269 |         :obj:`None`, the corresponding value is set to :obj:`numpy.nan`.
270 | 
271 |     """
272 | 
273 |     n_fg_class = len(prec)
274 |     ap = np.empty(n_fg_class)
275 |     for l in six.moves.range(n_fg_class):
276 |         if prec[l] is None or rec[l] is None:
277 |             ap[l] = np.nan
278 |             continue
279 | 
280 |         if use_07_metric:
281 |             # 11 point metric
282 |             ap[l] = 0
283 |             for t in np.arange(0., 1.1, 0.1):
284 |                 if np.sum(rec[l] >= t) == 0:
285 |                     p = 0
286 |                 else:
287 |                     p = np.max(np.nan_to_num(prec[l])[rec[l] >= t])
288 |                 ap[l] += p / 11
289 |         else:
290 |             # correct AP calculation
291 |             # first append sentinel values at the end
292 |             mpre = np.concatenate(([0], np.nan_to_num(prec[l]), [0]))
293 |             mrec = np.concatenate(([0], rec[l], [1]))
294 | 
295 |             mpre = np.maximum.accumulate(mpre[::-1])[::-1]
296 | 
297 |             # to calculate area under PR curve, look for points
298 |             # where X axis (recall) changes value
299 |             i = np.where(mrec[1:] != mrec[:-1])[0]
300 | 
301 |             # and sum (\Delta recall) * prec
302 |             ap[l] = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])
303 | 
304 |     return ap
305 | 


--------------------------------------------------------------------------------
/utils/vis_tool.py:
--------------------------------------------------------------------------------
  1 | import time
  2 | 
  3 | import numpy as np
  4 | import matplotlib
  5 | import torch as t
  6 | import visdom
  7 | 
  8 | matplotlib.use('Agg')
  9 | from matplotlib import pyplot as plot
 10 | 
 11 | # from data.voc_dataset import VOC_BBOX_LABEL_NAMES
 12 | 
 13 | 
 14 | VOC_BBOX_LABEL_NAMES = (
 15 |     'fly',
 16 |     'bike',
 17 |     'bird',
 18 |     'boat',
 19 |     'pin',
 20 |     'bus',
 21 |     'c',
 22 |     'cat',
 23 |     'chair',
 24 |     'cow',
 25 |     'table',
 26 |     'dog',
 27 |     'horse',
 28 |     'moto',
 29 |     'p',
 30 |     'plant',
 31 |     'shep',
 32 |     'sofa',
 33 |     'train',
 34 |     'tv',
 35 | )
 36 | 
 37 | 
 38 | def vis_image(img, ax=None):
 39 |     """Visualize a color image.
 40 | 
 41 |     Args:
 42 |         img (~numpy.ndarray): An array of shape :math:`(3, height, width)`.
 43 |             This is in RGB format and the range of its value is
 44 |             :math:`[0, 255]`.
 45 |         ax (matplotlib.axes.Axis): The visualization is displayed on this
 46 |             axis. If this is :obj:`None` (default), a new axis is created.
 47 | 
 48 |     Returns:
 49 |         ~matploblib.axes.Axes:
 50 |         Returns the Axes object with the plot for further tweaking.
 51 | 
 52 |     """
 53 | 
 54 |     if ax is None:
 55 |         fig = plot.figure()
 56 |         ax = fig.add_subplot(1, 1, 1)
 57 |     # CHW -> HWC
 58 |     img = img.transpose((1, 2, 0))
 59 |     ax.imshow(img.astype(np.uint8))
 60 |     return ax
 61 | 
 62 | 
 63 | def vis_bbox(img, bbox, label=None, score=None, ax=None):
 64 |     """Visualize bounding boxes inside image.
 65 | 
 66 |     Args:
 67 |         img (~numpy.ndarray): An array of shape :math:`(3, height, width)`.
 68 |             This is in RGB format and the range of its value is
 69 |             :math:`[0, 255]`.
 70 |         bbox (~numpy.ndarray): An array of shape :math:`(R, 4)`, where
 71 |             :math:`R` is the number of bounding boxes in the image.
 72 |             Each element is organized
 73 |             by :math:`(y_{min}, x_{min}, y_{max}, x_{max})` in the second axis.
 74 |         label (~numpy.ndarray): An integer array of shape :math:`(R,)`.
 75 |             The values correspond to id for label names stored in
 76 |             :obj:`label_names`. This is optional.
 77 |         score (~numpy.ndarray): A float array of shape :math:`(R,)`.
 78 |              Each value indicates how confident the prediction is.
 79 |              This is optional.
 80 |         label_names (iterable of strings): Name of labels ordered according
 81 |             to label ids. If this is :obj:`None`, labels will be skipped.
 82 |         ax (matplotlib.axes.Axis): The visualization is displayed on this
 83 |             axis. If this is :obj:`None` (default), a new axis is created.
 84 | 
 85 |     Returns:
 86 |         ~matploblib.axes.Axes:
 87 |         Returns the Axes object with the plot for further tweaking.
 88 | 
 89 |     """
 90 | 
 91 |     label_names = list(VOC_BBOX_LABEL_NAMES) + ['bg']
 92 |     # add for index `-1`
 93 |     if label is not None and not len(bbox) == len(label):
 94 |         raise ValueError('The length of label must be same as that of bbox')
 95 |     if score is not None and not len(bbox) == len(score):
 96 |         raise ValueError('The length of score must be same as that of bbox')
 97 | 
 98 |     # Returns newly instantiated matplotlib.axes.Axes object if ax is None
 99 |     ax = vis_image(img, ax=ax)
100 | 
101 |     # If there is no bounding box to display, visualize the image and exit.
102 |     if len(bbox) == 0:
103 |         return ax
104 | 
105 |     for i, bb in enumerate(bbox):
106 |         xy = (bb[1], bb[0])
107 |         height = bb[2] - bb[0]
108 |         width = bb[3] - bb[1]
109 |         ax.add_patch(plot.Rectangle(
110 |             xy, width, height, fill=False, edgecolor='red', linewidth=2))
111 | 
112 |         caption = list()
113 | 
114 |         if label is not None and label_names is not None:
115 |             lb = label[i]
116 |             if not (-1 <= lb < len(label_names)):  # modfy here to add backgroud
117 |                 raise ValueError('No corresponding name is given')
118 |             caption.append(label_names[lb])
119 |         if score is not None:
120 |             sc = score[i]
121 |             caption.append('{:.2f}'.format(sc))
122 | 
123 |         if len(caption) > 0:
124 |             ax.text(bb[1], bb[0],
125 |                     ': '.join(caption),
126 |                     style='italic',
127 |                     bbox={'facecolor': 'white', 'alpha': 0.5, 'pad': 0})
128 |     return ax
129 | 
130 | 
131 | def fig2data(fig):
132 |     """
133 |     brief Convert a Matplotlib figure to a 4D numpy array with RGBA 
134 |     channels and return it
135 | 
136 |     @param fig: a matplotlib figure
137 |     @return a numpy 3D array of RGBA values
138 |     """
139 |     # draw the renderer
140 |     fig.canvas.draw()
141 | 
142 |     # Get the RGBA buffer from the figure
143 |     w, h = fig.canvas.get_width_height()
144 |     buf = np.fromstring(fig.canvas.tostring_argb(), dtype=np.uint8)
145 |     buf.shape = (w, h, 4)
146 | 
147 |     # canvas.tostring_argb give pixmap in ARGB mode. Roll the ALPHA channel to have it in RGBA mode
148 |     buf = np.roll(buf, 3, axis=2)
149 |     return buf.reshape(h, w, 4)
150 | 
151 | 
152 | def fig4vis(fig):
153 |     """
154 |     convert figure to ndarray
155 |     """
156 |     ax = fig.get_figure()
157 |     img_data = fig2data(ax).astype(np.int32)
158 |     plot.close()
159 |     # HWC->CHW
160 |     return img_data[:, :, :3].transpose((2, 0, 1)) / 255.
161 | 
162 | 
163 | def visdom_bbox(*args, **kwargs):
164 |     fig = vis_bbox(*args, **kwargs)
165 |     data = fig4vis(fig)
166 |     return data
167 | 
168 | 
169 | class Visualizer(object):
170 |     """
171 |     wrapper for visdom
172 |     you can still access naive visdom function by 
173 |     self.line, self.scater,self._send,etc.
174 |     due to the implementation of `__getattr__`
175 |     """
176 | 
177 |     def __init__(self, env='default', **kwargs):
178 |         self.vis = visdom.Visdom(env=env, use_incoming_socket=False, **kwargs)
179 |         self._vis_kw = kwargs
180 | 
181 |         # e.g.('loss',23) the 23th value of loss
182 |         self.index = {}
183 |         self.log_text = ''
184 | 
185 |     def reinit(self, env='default', **kwargs):
186 |         """
187 |         change the config of visdom
188 |         """
189 |         self.vis = visdom.Visdom(env=env, **kwargs)
190 |         return self
191 | 
192 |     def plot_many(self, d):
193 |         """
194 |         plot multi values
195 |         @params d: dict (name,value) i.e. ('loss',0.11)
196 |         """
197 |         for k, v in d.items():
198 |             if v is not None:
199 |                 self.plot(k, v)
200 | 
201 |     def img_many(self, d):
202 |         for k, v in d.items():
203 |             self.img(k, v)
204 | 
205 |     def plot(self, name, y, **kwargs):
206 |         """
207 |         self.plot('loss',1.00)
208 |         """
209 |         x = self.index.get(name, 0)
210 |         self.vis.line(Y=np.array([y]), X=np.array([x]),
211 |                       win=name,
212 |                       opts=dict(title=name),
213 |                       update=None if x == 0 else 'append',
214 |                       **kwargs
215 |                       )
216 |         self.index[name] = x + 1
217 | 
218 |     def img(self, name, img_, **kwargs):
219 |         """
220 |         self.img('input_img',t.Tensor(64,64))
221 |         self.img('input_imgs',t.Tensor(3,64,64))
222 |         self.img('input_imgs',t.Tensor(100,1,64,64))
223 |         self.img('input_imgs',t.Tensor(100,3,64,64),nrows=10)
224 |         !!don't ~~self.img('input_imgs',t.Tensor(100,64,64),nrows=10)~~!!
225 |         """
226 |         self.vis.images(t.Tensor(img_).cpu().numpy(),
227 |                         win=name,
228 |                         opts=dict(title=name),
229 |                         **kwargs
230 |                         )
231 | 
232 |     def log(self, info, win='log_text'):
233 |         """
234 |         self.log({'loss':1,'lr':0.0001})
235 |         """
236 |         self.log_text += ('[{time}] {info} <br>'.format(
237 |             time=time.strftime('%m%d_%H%M%S'), \
238 |             info=info))
239 |         self.vis.text(self.log_text, win)
240 | 
241 |     def __getattr__(self, name):
242 |         return getattr(self.vis, name)
243 | 
244 |     def state_dict(self):
245 |         return {
246 |             'index': self.index,
247 |             'vis_kw': self._vis_kw,
248 |             'log_text': self.log_text,
249 |             'env': self.vis.env
250 |         }
251 | 
252 |     def load_state_dict(self, d):
253 |         self.vis = visdom.Visdom(env=d.get('env', self.vis.env), **(self.d.get('vis_kw')))
254 |         self.log_text = d.get('log_text', '')
255 |         self.index = d.get('index', dict())
256 |         return self
257 | 


--------------------------------------------------------------------------------