├── README.md
├── faster_rcnn.py
├── model.py
├── train.py
└── utils.py

/README.md:
--------------------------------------------------------------------------------
# simple_faster_rcnn
Running this repository only depends on Python 2.7 and PyTorch (0.3.1 or 0.4.1).

English Blog: https://medium.com/@fractaldle/guide-to-build-faster-rcnn-in-pytorch-95b10c273439

Chinese Blog: https://zhuanlan.zhihu.com/p/69250914

# Note: the code currently targets PyTorch 0.4.1. If you run this repository on PyTorch 0.3.1, please check the note in step_6 of train.py:
'''

if your pytorch version is 0.3.1, you must run this:

output = torch.stack(output)

'''
--------------------------------------------------------------------------------
/faster_rcnn.py:
--------------------------------------------------------------------------------
#coding:utf8

import torch
import torchvision
from PIL import Image, ImageDraw
import numpy as np

img_tensor = torch.zeros((1, 3, 800, 800)).float()
print(img_tensor.shape)
#Out: torch.Size([1, 3, 800, 800])

img_var = torch.autograd.Variable(img_tensor)

model = torchvision.models.vgg16(pretrained=False)
fe = list(model.features)
print(fe)  # length is 31 (13 conv + 13 ReLU + 5 max-pool layers)

req_features = []
k = img_var.clone()
for i in fe:
    print i
    k = i(k)
    print k.data.shape
    if k.size()[2] < 800 // 16:
        break
    req_features.append(i)
    out_channels = k.size()[1]
print(len(req_features))  # 30
print(out_channels)  # 512

for f in req_features:
    print f


faster_rcnn_fe_extractor = torch.nn.Sequential(*req_features)
out_map = faster_rcnn_fe_extractor(img_var)
print(out_map.size())
#Out: torch.Size([1, 512, 50, 50])

ratios = [0.5, 1, 2]
anchor_scales = [8, 16, 32]
sub_sample = 16

# Each feature-map cell corresponds to a 16*16-pixel region of the input image.
fe_size = (800 // 16)
# ctr_x, ctr_y: bottom-right corner of the image region covered by each feature cell
ctr_x = np.arange(16, (fe_size + 1) * 16, 16)
ctr_y = np.arange(16, (fe_size + 1) * 16, 16)
print len(ctr_x)  # 50*50 feature cells in total, splitting the image into 50*50=2500 regions


index = 0
# ctr: center of the image region covered by each feature cell
ctr = dict()
for x in range(len(ctr_x)):
    for y in range(len(ctr_y)):
        ctr[index] = [-1, -1]
        ctr[index][1] = ctr_x[x] - 8
        ctr[index][0] = ctr_y[y] - 8
        index += 1
# print ctr
print len(ctr)  # center points of the 50*50=2500 regions


# Initialization: each region gets 9 candidate anchors; each anchor is (y1, x1, y2, x2)
anchors = np.zeros(((fe_size * fe_size * 9), 4))
# (22500, 4)
print anchors.shape
index = 0
# Fill the anchor coordinates
for c in ctr:
    ctr_y, ctr_x = ctr[c]
    for i in range(len(ratios)):
        for j in range(len(anchor_scales)):
            # anchor_scales are defined on the feature map, so multiply by the
            # down-sampling factor "sub_sample"
            h = sub_sample * anchor_scales[j] * np.sqrt(ratios[i])
            w = sub_sample * anchor_scales[j] * np.sqrt(1. / ratios[i])
            anchors[index, 0] = ctr_y - h / 2.
            anchors[index, 1] = ctr_x - w / 2.
            anchors[index, 2] = ctr_y + h / 2.
            anchors[index, 3] = ctr_x + w / 2.
            index += 1
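# Editorial sanity check (not in the original script; it relies on CPython 2's
# iteration order for small consecutive int dict keys): the first feature cell
# is centered at (8, 8), and the first (ratio, scale) pair is (0.5, 8), so
# anchors[0] should be a 16*8*sqrt(0.5)-by-16*8*sqrt(2) box around that center.
h0 = sub_sample * anchor_scales[0] * np.sqrt(ratios[0])
w0 = sub_sample * anchor_scales[0] * np.sqrt(1. / ratios[0])
print np.allclose(anchors[0], [8 - h0 / 2., 8 - w0 / 2., 8 + h0 / 2., 8 + w0 / 2.])  # True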
# (22500, 4)
print(anchors.shape)

img_npy = img_tensor.numpy()
img_npy = np.transpose(img_npy[0], (1, 2, 0)).astype(np.float32)
img = Image.fromarray(np.uint8(img_npy))
# img.show()
draw = ImageDraw.Draw(img)

# for index in range(15000, 15009):
# # for index in range(len(anchors)):
#     draw.rectangle([(anchors[index, 1], anchors[index, 0]), (anchors[index, 3], anchors[index, 2])], outline=(255, 0, 0))
# img.show()

# Assume two ground-truth boxes in the image
bbox = np.asarray([[20, 30, 400, 500], [300, 400, 500, 600]], dtype=np.float32)  # [y1, x1, y2, x2] format
draw.rectangle([(30, 20), (500, 400)], outline=(100, 255, 0))
draw.rectangle([(400, 300), (600, 500)], outline=(100, 255, 0))

# Assume the labels of the two ground-truth boxes
labels = np.asarray([6, 8], dtype=np.int8)  # 0 represents background

# Drop anchors whose coordinates cross the image boundary, keeping only
# the boxes that lie fully inside the image
valid_anchor_index = np.where(
    (anchors[:, 0] >= 0) &
    (anchors[:, 1] >= 0) &
    (anchors[:, 2] <= 800) &
    (anchors[:, 3] <= 800)
)[0]  # np.where returns the indices that satisfy the condition
print valid_anchor_index.shape  # (8940,), i.e. 8940 anchors qualify


# Coordinates of the valid anchors (anchors fully inside the image)
valid_anchor_boxes = anchors[valid_anchor_index]
print(valid_anchor_boxes.shape)  # (8940, 4)


# Compute the IOU between each valid anchor box "valid_anchor_boxes" and each ground-truth box "bbox"
ious = np.empty((len(valid_anchor_boxes), 2), dtype=np.float32)
ious.fill(0)
print(bbox)
for num1, i in enumerate(valid_anchor_boxes):
    ya1, xa1, ya2, xa2 = i
    anchor_area = (ya2 - ya1) * (xa2 - xa1)  # anchor box area
    for num2, j in enumerate(bbox):
        yb1, xb1, yb2, xb2 = j
        box_area = (yb2 - yb1) * (xb2 - xb1)  # ground-truth box area
        inter_x1 = max([xb1, xa1])
        inter_y1 = max([yb1, ya1])
        inter_x2 = min([xb2, xa2])
        inter_y2 = min([yb2, ya2])
        if (inter_x1 < inter_x2) and (inter_y1 < inter_y2):
            iter_area = (inter_y2 - inter_y1) * (inter_x2 - inter_x1)  # intersection of anchor box and ground-truth box
            iou = iter_area / (anchor_area + box_area - iter_area)  # IOU computation
        else:
            iou = 0.

        ious[num1, num2] = iou
print(ious.shape)  # (8940, 2): the IOU of every anchor box with every ground-truth box (2 of them here)
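# Editorial note: the double loop above is easy to read but slow; an equivalent
# vectorized version (a sketch, not used by the rest of the script) computes
# the same (8940, 2) matrix with numpy broadcasting:
def bbox_iou_vectorized(boxes_a, boxes_b):
    # boxes are (N, 4) / (M, 4) arrays in (y1, x1, y2, x2) format
    tl = np.maximum(boxes_a[:, None, :2], boxes_b[None, :, :2])  # intersection top-left
    br = np.minimum(boxes_a[:, None, 2:], boxes_b[None, :, 2:])  # intersection bottom-right
    inter = np.prod(np.clip(br - tl, 0, None), axis=2)           # zero when there is no overlap
    area_a = np.prod(boxes_a[:, 2:] - boxes_a[:, :2], axis=1)
    area_b = np.prod(boxes_b[:, 2:] - boxes_b[:, :2], axis=1)
    return inter / (area_a[:, None] + area_b[None, :] - inter)

# print np.allclose(bbox_iou_vectorized(valid_anchor_boxes, bbox), ious)  # True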
gt_argmax_ious = ious.argmax(axis=0)  # index of the anchor with the highest IOU for each ground-truth box, 2 in total
print(gt_argmax_ious)  # 2 values, matching the number of ground-truth boxes in the image
gt_max_ious = ious[gt_argmax_ious, np.arange(ious.shape[1])]  # the highest IOU value for each ground-truth box, paired with gt_argmax_ious
print(gt_max_ious)  # 2 values, matching the number of ground-truth boxes in the image
argmax_ious = ious.argmax(axis=1)  # index of the ground-truth box with the highest IOU for each anchor, 8940 in total
print(argmax_ious.shape)  # (8940,): every anchor box is matched to one ground-truth box
max_ious = ious[np.arange(len(valid_anchor_index)), argmax_ious]  # the highest IOU value for each anchor, paired with argmax_ious
print(max_ious.shape)  # (8940,): one maximum per anchor box

# Author's question: does ious == gt_max_ious distinguish between the targets?
# (Broadcasting compares each anchor's row against both per-target maxima.)
gt_argmax_ious = np.where(ious == gt_max_ious)[0]  # indices whose IOU equals a per-ground-truth maximum
print gt_argmax_ious.shape  # (18,): 18 in total
# for index in gt_argmax_ious:
#     draw.rectangle([(valid_anchor_boxes[index, 1], valid_anchor_boxes[index, 0]),
#                     (valid_anchor_boxes[index, 3], valid_anchor_boxes[index, 2])], outline=(255, 0, 0))
# img.show()


pos_iou_threshold = 0.7
neg_iou_threshold = 0.3
label = np.empty((len(valid_anchor_index), ), dtype=np.int32)
label.fill(-1)
print label.shape  # (8940,)
label[max_ious < neg_iou_threshold] = 0   # anchors whose highest IOU is below neg_iou_threshold are negatives
label[gt_argmax_ious] = 1                 # anchors holding a global-maximum IOU are positives
label[max_ious >= pos_iou_threshold] = 1  # anchors whose highest IOU is at least pos_iou_threshold are positives



pos_ratio = 0.5
n_sample = 256
n_pos = pos_ratio * n_sample  # number of positive samples

# Randomly keep n_pos positives
pos_index = np.where(label == 1)[0]
if len(pos_index) > n_pos:
    disable_index = np.random.choice(pos_index, size=(len(pos_index) - n_pos), replace=False)
    label[disable_index] = -1

n_neg = n_sample - np.sum(label == 1)
neg_index = np.where(label == 0)[0]

if len(neg_index) > n_neg:
    disable_index = np.random.choice(neg_index, size=(len(neg_index) - n_neg), replace=False)
    label[disable_index] = -1
print np.sum(label == 1)  # 18 positives
print np.sum(label == 0)  # 256-18=238 negatives


# Now assign a location target to every anchor box, using the ground-truth object
# it overlaps most. Note that locs are assigned to all valid anchor boxes regardless
# of their label; a simple mask filters them out later when the loss is computed.
max_iou_bbox = bbox[argmax_ious]  # ground-truth coordinates matched to each valid anchor (8940, 4)
print(max_iou_bbox)
print max_iou_bbox.shape  # (8940, 4): one (y1, x1, y2, x2) target per valid anchor

# Center point and size of the valid anchors: ctr_x, ctr_y, width, height
height = valid_anchor_boxes[:, 2] - valid_anchor_boxes[:, 0]
width = valid_anchor_boxes[:, 3] - valid_anchor_boxes[:, 1]
ctr_y = valid_anchor_boxes[:, 0] + 0.5 * height
ctr_x = valid_anchor_boxes[:, 1] + 0.5 * width
# Center point and size of the matched ground-truth boxes: base_ctr_x, base_ctr_y, base_width, base_height
base_height = max_iou_bbox[:, 2] - max_iou_bbox[:, 0]
base_width = max_iou_bbox[:, 3] - max_iou_bbox[:, 1]
base_ctr_y = max_iou_bbox[:, 0] + 0.5 * base_height
base_ctr_x = max_iou_bbox[:, 1] + 0.5 * base_width

# Coefficients mapping each valid anchor onto its ground-truth box
# (dy, dx translate; dh, dw scale)
eps = np.finfo(height.dtype).eps
height = np.maximum(height, eps)
width = np.maximum(width, eps)
dy = (base_ctr_y - ctr_y) / height
dx = (base_ctr_x - ctr_x) / width
dh = np.log(base_height / height)
dw = np.log(base_width / width)
anchor_locs = np.vstack((dy, dx, dh, dw)).transpose()
# print anchor_locs
print(anchor_locs.shape)
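# Editorial round-trip check (not in the original script): decoding the
# coefficients must recover the matched ground-truth boxes, which confirms
# the (dy, dx, dh, dw) parameterization is self-consistent.
dec_ctr_y = ctr_y + dy * height
dec_ctr_x = ctr_x + dx * width
dec_h = height * np.exp(dh)
dec_w = width * np.exp(dw)
decoded = np.vstack((dec_ctr_y - 0.5 * dec_h, dec_ctr_x - 0.5 * dec_w,
                     dec_ctr_y + 0.5 * dec_h, dec_ctr_x + 0.5 * dec_w)).transpose()
assert np.allclose(decoded, max_iou_bbox)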

# anchor_labels: the label of every anchor box
# (-1: invalid anchor, 0: valid negative anchor, 1: valid positive anchor)
anchor_labels = np.empty((len(anchors),), dtype=label.dtype)
anchor_labels.fill(-1)
anchor_labels[valid_anchor_index] = label

# anchor_locations: the anchor-to-ground-truth coefficients of every valid anchor box
anchor_locations = np.empty((len(anchors),) + anchors.shape[1:], dtype=anchor_locs.dtype)
anchor_locations.fill(0)
anchor_locations[valid_anchor_index, :] = anchor_locs


# Region Proposal Network (RPN)
import torch.nn as nn
mid_channels = 512
in_channels = 512  # depends on the output feature map; in vgg16 it is equal to 512
n_anchor = 9  # number of anchors at each location
conv1 = nn.Conv2d(in_channels, mid_channels, 3, 1, 1)
reg_layer = nn.Conv2d(mid_channels, n_anchor * 4, 1, 1, 0)
cls_layer = nn.Conv2d(mid_channels, n_anchor * 2, 1, 1, 0)  # softmax is used here; sigmoid works equally well if you replace 2 with 1

# conv sliding layer
conv1.weight.data.normal_(0, 0.01)
conv1.bias.data.zero_()

# Regression layer
reg_layer.weight.data.normal_(0, 0.01)
reg_layer.bias.data.zero_()

# classification layer
cls_layer.weight.data.normal_(0, 0.01)
cls_layer.bias.data.zero_()

x = conv1(out_map)  # out_map is obtained in section 1
pred_anchor_locs = reg_layer(x)  # regression head: the four anchor-to-box coefficients
pred_cls_scores = cls_layer(x)   # classification head: whether the anchor captures an object

print(pred_cls_scores.shape, pred_anchor_locs.shape)  # ((1L, 18L, 50L, 50L), (1L, 36L, 50L, 50L))

pred_anchor_locs = pred_anchor_locs.permute(0, 2, 3, 1).contiguous().view(1, -1, 4)
print(pred_anchor_locs.shape)
#Out: torch.Size([1, 22500, 4])

pred_cls_scores = pred_cls_scores.permute(0, 2, 3, 1).contiguous()
print(pred_cls_scores.shape)
#Out: torch.Size([1, 50, 50, 18])

objectness_score = pred_cls_scores.view(1, 50, 50, 9, 2)[:, :, :, :, 1].contiguous().view(1, -1)
print(objectness_score.shape)
#Out: torch.Size([1, 22500])

pred_cls_scores = pred_cls_scores.view(1, -1, 2)
print(pred_cls_scores.shape)
#Out: torch.Size([1, 22500, 2])
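# Editorial note: the two channels per anchor are treated as (background,
# foreground) logits, and the script ranks proposals by the raw foreground
# logit (channel 1). If calibrated probabilities are wanted instead, a softmax
# over the last dimension would give them (a sketch, not used below):
# fg_prob = torch.nn.functional.softmax(pred_cls_scores.view(-1, 2), dim=1)[:, 1]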

# Generating proposals to feed the Fast R-CNN network
n_train_pre_nms = 12000
n_train_post_nms = 2000
n_test_pre_nms = 6000
n_test_post_nms = 300
min_size = 16

# Convert the anchor format from (y1, x1, y2, x2) to (ctr_x, ctr_y, h, w):
anc_height = anchors[:, 2] - anchors[:, 0]
anc_width = anchors[:, 3] - anchors[:, 1]
anc_ctr_y = anchors[:, 0] + 0.5 * anc_height
anc_ctr_x = anchors[:, 1] + 0.5 * anc_width


# Using the four predicted coefficients, translate and scale the anchors into predicted boxes
pred_anchor_locs_numpy = pred_anchor_locs[0].data.numpy()
objectness_score_numpy = objectness_score[0].data.numpy()
dy = pred_anchor_locs_numpy[:, 0::4]
dx = pred_anchor_locs_numpy[:, 1::4]
dh = pred_anchor_locs_numpy[:, 2::4]
dw = pred_anchor_locs_numpy[:, 3::4]
ctr_y = dy * anc_height[:, np.newaxis] + anc_ctr_y[:, np.newaxis]
ctr_x = dx * anc_width[:, np.newaxis] + anc_ctr_x[:, np.newaxis]
h = np.exp(dh) * anc_height[:, np.newaxis]
w = np.exp(dw) * anc_width[:, np.newaxis]

# Convert the predicted boxes to [y1, x1, y2, x2] format
roi = np.zeros(pred_anchor_locs_numpy.shape, dtype=pred_anchor_locs_numpy.dtype)
roi[:, 0::4] = ctr_y - 0.5 * h
roi[:, 1::4] = ctr_x - 0.5 * w
roi[:, 2::4] = ctr_y + 0.5 * h
roi[:, 3::4] = ctr_x + 0.5 * w


# Clip the predicted boxes to the image
img_size = (800, 800)  # Image size
roi[:, slice(0, 4, 2)] = np.clip(roi[:, slice(0, 4, 2)], 0, img_size[0])
roi[:, slice(1, 4, 2)] = np.clip(roi[:, slice(1, 4, 2)], 0, img_size[1])
print(roi.shape)  # (22500, 4)

# Drop predicted boxes with height or width < min_size
# (open question from the author: might this discard small objects?)
hs = roi[:, 2] - roi[:, 0]
ws = roi[:, 3] - roi[:, 1]
keep = np.where((hs >= min_size) & (ws >= min_size))[0]
roi = roi[keep, :]
score = objectness_score_numpy[keep]

# Sort all (proposal, score) pairs by score, highest first
order = score.ravel().argsort()[::-1]
print(order.shape)  # (22500,)

# Keep the top pre_nms_topN proposals (12000 at train time, 6000 at test time)
order = order[:n_train_pre_nms]
roi = roi[order, :]
print(roi.shape)  # (12000, 4)

# NMS (non-maximum suppression): drop every box whose IOU with a higher-scoring
# box exceeds 0.7, keeping high-scoring, mostly non-overlapping boxes
nms_thresh = 0.7
y1 = roi[:, 0]
x1 = roi[:, 1]
y2 = roi[:, 2]
x2 = roi[:, 3]

areas = (x2 - x1 + 1) * (y2 - y1 + 1)

score = score[order]
order = score.argsort()[::-1]
print order  # [11999 3996 4005 ... 7995 7994 0]
keep = []
while order.size > 0:
    i = order[0]
    keep.append(i)
    xx1 = np.maximum(x1[i], x1[order[1:]])
    yy1 = np.maximum(y1[i], y1[order[1:]])
    xx2 = np.minimum(x2[i], x2[order[1:]])
    yy2 = np.minimum(y2[i], y2[order[1:]])

    w = np.maximum(0.0, xx2 - xx1 + 1)
    h = np.maximum(0.0, yy2 - yy1 + 1)
    inter = w * h
    ovr = inter / (areas[i] + areas[order[1:]] - inter)

    inds = np.where(ovr <= nms_thresh)[0]
    order = order[inds + 1]
    # print ovr
    # print order

keep = keep[:n_train_post_nms]  # while training/testing, use accordingly
roi = roi[keep]  # the final region proposals (i.e. the predicted boxes)
print roi.shape  # (1758, 4)
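
# Editorial mini-demo of the greedy rule above, on standalone toy data: box B
# overlaps box A (IOU ~ 0.82 > 0.7) and scores lower, so it is suppressed,
# while the distant box C survives. All demo_* names are local to this example.
demo_box = np.array([[0, 0, 100, 100],       # A
                     [5, 5, 105, 105],       # B, heavily overlaps A
                     [200, 200, 300, 300]],  # C, disjoint from both
                    dtype=np.float32)
demo_score = np.array([0.9, 0.8, 0.7], dtype=np.float32)
demo_order = demo_score.argsort()[::-1]
demo_areas = (demo_box[:, 2] - demo_box[:, 0] + 1) * (demo_box[:, 3] - demo_box[:, 1] + 1)
demo_keep = []
while demo_order.size > 0:
    di = demo_order[0]
    demo_keep.append(di)
    dyy1 = np.maximum(demo_box[di, 0], demo_box[demo_order[1:], 0])
    dxx1 = np.maximum(demo_box[di, 1], demo_box[demo_order[1:], 1])
    dyy2 = np.minimum(demo_box[di, 2], demo_box[demo_order[1:], 2])
    dxx2 = np.minimum(demo_box[di, 3], demo_box[demo_order[1:], 3])
    d_inter = np.maximum(0.0, dxx2 - dxx1 + 1) * np.maximum(0.0, dyy2 - dyy1 + 1)
    d_ovr = d_inter / (demo_areas[di] + demo_areas[demo_order[1:]] - d_inter)
    demo_order = demo_order[np.where(d_ovr <= nms_thresh)[0] + 1]
print demo_keep  # [0, 2]: A and C survive, B is suppressed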

# Proposal targets
n_sample = 128
pos_ratio = 0.25
pos_iou_thresh = 0.5
neg_iou_thresh_hi = 0.5
neg_iou_thresh_lo = 0.0

# Compute the IOU of each ground-truth box with every region proposal (predicted box)
ious = np.empty((len(roi), 2), dtype=np.float32)
ious.fill(0)
for num1, i in enumerate(roi):
    ya1, xa1, ya2, xa2 = i
    anchor_area = (ya2 - ya1) * (xa2 - xa1)
    for num2, j in enumerate(bbox):
        yb1, xb1, yb2, xb2 = j
        box_area = (yb2 - yb1) * (xb2 - xb1)

        inter_x1 = max([xb1, xa1])
        inter_y1 = max([yb1, ya1])
        inter_x2 = min([xb2, xa2])
        inter_y2 = min([yb2, ya2])

        if (inter_x1 < inter_x2) and (inter_y1 < inter_y2):
            iter_area = (inter_y2 - inter_y1) * (inter_x2 - inter_x1)
            iou = iter_area / (anchor_area + box_area - iter_area)
        else:
            iou = 0.

        ious[num1, num2] = iou
print(ious.shape)  # (1758, 2)

# For each region proposal, find the ground truth with the higher IoU, and that maximum IoU
gt_assignment = ious.argmax(axis=1)
max_iou = ious.max(axis=1)
print(gt_assignment)  # [0 0 1 ... 0 0 0]
print(max_iou)  # [0.17802152 0.17926688 0.04676317 ... 0. 0. 0. ]


# Assign a label to each proposal:
gt_roi_label = labels[gt_assignment]
print(gt_roi_label)  # [6 6 8 ... 6 6 6]

# We want to keep only n_sample*pos_ratio (128*0.25=32) foreground samples; if fewer
# than 32 positives turn up, leave them as they are.
# If more than 32 foreground samples turn up, sample 32 of them.
pos_roi_per_image = 32
pos_index = np.where(max_iou >= pos_iou_thresh)[0]
pos_roi_per_this_image = int(min(pos_roi_per_image, pos_index.size))
if pos_index.size > 0:
    pos_index = np.random.choice(pos_index, size=pos_roi_per_this_image, replace=False)
# print(pos_roi_per_this_image)
print(pos_index)  # 19 positives in this run

# Do the same for the negative (background) region proposals
neg_index = np.where((max_iou < neg_iou_thresh_hi) & (max_iou >= neg_iou_thresh_lo))[0]
neg_roi_per_this_image = n_sample - pos_roi_per_this_image
neg_roi_per_this_image = int(min(neg_roi_per_this_image, neg_index.size))
if neg_index.size > 0:
    neg_index = np.random.choice(neg_index, size=neg_roi_per_this_image, replace=False)
# print(neg_roi_per_this_image)
print(neg_index)  # 109 negatives in this run

keep_index = np.append(pos_index, neg_index)
gt_roi_labels = gt_roi_label[keep_index]
gt_roi_labels[pos_roi_per_this_image:] = 0  # negative labels --> 0
sample_roi = roi[keep_index]  # sampled predicted boxes
print(sample_roi.shape)  # (128, 4)


bbox_for_sampled_roi = bbox[gt_assignment[keep_index]]  # matched ground-truth boxes
print(bbox_for_sampled_roi.shape)  # (128, 4)


# From the sampled predicted boxes and their matched ground-truth boxes, compute
# the 4-d coefficients (translation: dy, dx; scaling: dh, dw)
height = sample_roi[:, 2] - sample_roi[:, 0]
width = sample_roi[:, 3] - sample_roi[:, 1]
ctr_y = sample_roi[:, 0] + 0.5 * height
ctr_x = sample_roi[:, 1] + 0.5 * width
base_height = bbox_for_sampled_roi[:, 2] - bbox_for_sampled_roi[:, 0]
base_width = bbox_for_sampled_roi[:, 3] - bbox_for_sampled_roi[:, 1]
base_ctr_y = bbox_for_sampled_roi[:, 0] + 0.5 * base_height
base_ctr_x = bbox_for_sampled_roi[:, 1] + 0.5 * base_width

eps = np.finfo(height.dtype).eps
height = np.maximum(height, eps)
width = np.maximum(width, eps)

dy = (base_ctr_y - ctr_y) / height
dx = (base_ctr_x - ctr_x) / width
dh = np.log(base_height / height)
dw = np.log(base_width / width)

gt_roi_locs = np.vstack((dy, dx, dh, dw)).transpose()
print(gt_roi_locs.shape)


rois = torch.from_numpy(sample_roi).float()
roi_indices = 0 * np.ones((len(rois),), dtype=np.int32)
roi_indices = torch.from_numpy(roi_indices).float()
print(rois.shape, roi_indices.shape)  # torch.Size([128, 4]) torch.Size([128])

indices_and_rois = torch.cat([roi_indices[:, None], rois], dim=1)
xy_indices_and_rois = indices_and_rois[:, [0, 2, 1, 4, 3]]  # reorder to (index, x1, y1, x2, y2)
indices_and_rois = xy_indices_and_rois.contiguous()
print(xy_indices_and_rois.shape)  # torch.Size([128, 5])

# Build a 7x7 roi_pooling layer
size = (7, 7)
# Note: AdaptiveMaxPool2d's signature is (output_size, return_indices), so
# passing size[1] positionally makes return_indices truthy; that is why the
# pooled values are taken with [0] below.
adaptive_max_pool = torch.nn.AdaptiveMaxPool2d(size[0], size[1])
output = []
rois = indices_and_rois.float()
rois[:, 1:].mul_(1 / 16.0)  # Subsampling ratio
rois = rois.long()
num_rois = rois.size(0)
for i in range(num_rois):
    roi = rois[i]
    im_idx = roi[0]
    im = out_map.narrow(0, im_idx, 1)[..., roi[2]:(roi[4] + 1), roi[1]:(roi[3] + 1)]
    # print adaptive_max_pool(im)[0].data.shape
    output.append(adaptive_max_pool(im)[0].data)
output = torch.stack(output)
print output.shape
output = torch.cat(output, 0)
print(output.size())  # torch.Size([128, 512, 7, 7])

k = output.view(output.size(0), -1)
print(k.shape)  # [128, 25088]
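# Editorial note: 25088 = 512 * 7 * 7, i.e. every sampled ROI is now a fixed-
# length vector regardless of its original size. The same pooling can be done
# without the module object via the functional API (a sketch):
# pooled = torch.nn.functional.adaptive_max_pool2d(im, (7, 7))  # (1, 512, 7, 7)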

# classification head
roi_head_classifier = nn.Sequential(*[nn.Linear(25088, 4096),
                                      nn.ReLU(),  # ReLU activations, as in model.py
                                      nn.Linear(4096, 4096),
                                      nn.ReLU()])
cls_loc = nn.Linear(4096, 21 * 4)  # (VOC 20 classes + 1 background. Each will have 4 co-ordinates)
cls_loc.weight.data.normal_(0, 0.01)
cls_loc.bias.data.zero_()
score = nn.Linear(4096, 21)  # (VOC 20 classes + 1 background)

k = torch.autograd.Variable(k)
k = roi_head_classifier(k)
roi_cls_loc = cls_loc(k)
roi_cls_score = score(k)
print(roi_cls_loc.data.shape, roi_cls_score.data.shape)  # torch.Size([128, 84]), torch.Size([128, 21])


# RPN loss
print(pred_anchor_locs.shape)  # torch.Size([1, 22500, 4]): box coefficients predicted by the RPN
print(pred_cls_scores.shape)   # torch.Size([1, 22500, 2]): class scores predicted by the RPN
print(anchor_locations.shape)  # (22500, 4): target box coefficients of the anchors
print(anchor_labels.shape)     # (22500,): target labels of the anchors

# Re-arrange the inputs and outputs so they line up row by row
rpn_loc = pred_anchor_locs[0]
rpn_score = pred_cls_scores[0]
gt_rpn_loc = torch.from_numpy(anchor_locations)
gt_rpn_score = torch.from_numpy(anchor_labels)
print(rpn_loc.shape, rpn_score.shape, gt_rpn_loc.shape, gt_rpn_score.shape)
# torch.Size([22500, 4]) torch.Size([22500, 2]) torch.Size([22500, 4]) torch.Size([22500])

# For classification we use Cross Entropy loss
gt_rpn_score = torch.autograd.Variable(gt_rpn_score.long())
rpn_cls_loss = torch.nn.functional.cross_entropy(rpn_score, gt_rpn_score, ignore_index=-1)
print(rpn_cls_loss)  # Variable containing: 0.6931

# For regression we use smooth L1 loss
pos = gt_rpn_score.data > 0  # the regression loss only applies to positively labeled boxes
mask = pos.unsqueeze(1).expand_as(rpn_loc)
print(mask.shape)  # (22500L, 4L)

# Take the boxes with positive labels
mask_loc_preds = rpn_loc[mask].view(-1, 4)
mask_loc_targets = gt_rpn_loc[mask].view(-1, 4)
print(mask_loc_preds.shape, mask_loc_targets.shape)  # ((18L, 4L), (18L, 4L))

# The regression loss is applied as follows
x = np.abs(mask_loc_targets.numpy() - mask_loc_preds.data.numpy())
print x.shape  # (18, 4)
# print (x < 1)
rpn_loc_loss = ((x < 1) * 0.5 * x**2) + ((x >= 1) * (x - 0.5))
# print rpn_loc_loss.shape  # (18, 4)
rpn_loc_loss = rpn_loc_loss.sum()  # 1.1628926242031001
print rpn_loc_loss
# print rpn_loc_loss.shape
# rpn_loc_loss = np.squeeze(rpn_loc_loss)
# print rpn_loc_loss
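# Editorial note: the expression above is the smooth L1 (Huber-like) loss:
# 0.5 * x^2 when |x| < 1, and |x| - 0.5 otherwise. Packaged as a reusable
# sketch (not used by the rest of the script):
def smooth_l1_np(diff):
    a = np.abs(diff)
    return np.where(a < 1, 0.5 * a ** 2, a - 0.5)

# print np.allclose(smooth_l1_np(mask_loc_targets.numpy() - mask_loc_preds.data.numpy()).sum(), rpn_loc_loss)  # True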

N_reg = (gt_rpn_score > 0).float().sum()
N_reg = np.squeeze(N_reg.data.numpy())

print "N_reg: {}, {}".format(N_reg, N_reg.shape)
rpn_loc_loss = rpn_loc_loss / N_reg
rpn_loc_loss = np.float32(rpn_loc_loss)
# rpn_loc_loss = torch.autograd.Variable(torch.from_numpy(rpn_loc_loss))
rpn_lambda = 10.
rpn_cls_loss = np.squeeze(rpn_cls_loss.data.numpy())
print "rpn_cls_loss: {}".format(rpn_cls_loss)  # 0.693146109581
print 'rpn_loc_loss: {}'.format(rpn_loc_loss)  # 0.0646051466465
rpn_loss = rpn_cls_loss + (rpn_lambda * rpn_loc_loss)
print("rpn_loss: {}".format(rpn_loss))  # 1.33919757605


# Fast R-CNN loss
# Predictions
print(roi_cls_loc.shape)    # torch.Size([128, 84])
print(roi_cls_score.shape)  # torch.Size([128, 21])

# Targets
print(gt_roi_locs.shape)    # (128, 4)
print(gt_roi_labels.shape)  # (128, )

gt_roi_loc = torch.from_numpy(gt_roi_locs)
gt_roi_label = torch.from_numpy(np.float32(gt_roi_labels)).long()
print(gt_roi_loc.shape, gt_roi_label.shape)  # torch.Size([128, 4]) torch.Size([128])

# Classification loss
gt_roi_label = torch.autograd.Variable(gt_roi_label)
roi_cls_loss = torch.nn.functional.cross_entropy(roi_cls_score, gt_roi_label, ignore_index=-1)
print(roi_cls_loss)  # Variable containing: 3.0515


# Regression loss
n_sample = roi_cls_loc.shape[0]
roi_loc = roi_cls_loc.view(n_sample, -1, 4)
print(roi_loc.shape)  # (128L, 21L, 4L)

# Pick, for each sample, the 4 coefficients belonging to its ground-truth class
roi_loc = roi_loc[torch.arange(0, n_sample).long(), gt_roi_label]
print(roi_loc.shape)  # torch.Size([128, 4])
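# Editorial illustration of the class-aware indexing above, on a toy tensor
# (names are local to this example): row i of the result is toy[i, toy_cls[i]].
toy = torch.arange(0, 2 * 3 * 4).view(2, 3, 4)    # 2 samples, 3 classes, 4 coords each
toy_cls = torch.LongTensor([2, 0])                # ground-truth class per sample
picked = toy[torch.arange(0, 2).long(), toy_cls]  # rows (0, class 2) and (1, class 0)
print picked  # values 8..11 and 12..15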


# Compute the regression loss the same way as the RPN regression loss
# roi_loc_loss = REGLoss(roi_loc, gt_roi_loc)

pos = gt_roi_label.data > 0  # the regression loss only applies to positively labeled boxes
mask = pos.unsqueeze(1).expand_as(roi_loc)
print(mask.shape)  # (128, 4L)

# Take the boxes with positive labels
mask_loc_preds = roi_loc[mask].view(-1, 4)
mask_loc_targets = gt_roi_loc[mask].view(-1, 4)
print(mask_loc_preds.shape, mask_loc_targets.shape)  # ((19L, 4L), (19L, 4L))


x = np.abs(mask_loc_targets.numpy() - mask_loc_preds.data.numpy())
print x.shape  # (19, 4)

roi_loc_loss = ((x < 1) * 0.5 * x**2) + ((x >= 1) * (x - 0.5))
print(roi_loc_loss.sum())  # 1.4645805211187053


N_reg = (gt_roi_label > 0).float().sum()
N_reg = np.squeeze(N_reg.data.numpy())
roi_loc_loss = roi_loc_loss.sum() / N_reg
roi_loc_loss = np.float32(roi_loc_loss)
print roi_loc_loss  # 0.077294916
# roi_loc_loss = torch.autograd.Variable(torch.from_numpy(roi_loc_loss))


# Total ROI loss
roi_lambda = 10.
roi_cls_loss = np.squeeze(roi_cls_loss.data.numpy())
roi_loss = roi_cls_loss + (roi_lambda * roi_loc_loss)
print(roi_loss)  # 3.810348778963089


total_loss = rpn_loss + roi_loss

print total_loss  # 5.149546355009079

--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
#coding:utf8

import torch
import torch.nn as nn
import torchvision
from PIL import Image, ImageDraw
import numpy as np


# cfg = {
#     'VGG11': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
#     'VGG13': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
#     'VGG16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'],
#     'VGG19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'],
# }

featur_cfg = ''


class VGG(nn.Module):

    def __init__(self):
        super(VGG, self).__init__()

        cfg = [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512]
        self.features = self._make_layers(cfg)
        self._rpn_model()

        size = (7, 7)
        # (output_size, return_indices): return_indices becomes truthy, hence
        # the [0] wherever this pool is applied
        self.adaptive_max_pool = torch.nn.AdaptiveMaxPool2d(size[0], size[1])
        self.roi_classifier()

    def _make_layers(self, cfg):
        layers = []
        in_channels = 3
        for x in cfg:
            if x == 'M':
                layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
            else:
                layers += [nn.Conv2d(in_channels, x, kernel_size=3, padding=1),
                           nn.BatchNorm2d(x),
                           nn.ReLU(inplace=True)]
                in_channels = x

        # layers += [nn.Conv2d(in_channels, 512, kernel_size=3, padding=1)]
        return nn.Sequential(*layers)
        # return layers

    def _rpn_model(self, mid_channels=512, in_channels=512, n_anchor=9):
        self.rpn_conv = nn.Conv2d(in_channels, mid_channels, 3, 1, 1)
        self.reg_layer = nn.Conv2d(mid_channels, n_anchor * 4, 1, 1, 0)
        # softmax is used here; sigmoid works equally well if you replace 2 with 1.
        self.cls_layer = nn.Conv2d(mid_channels, n_anchor * 2, 1, 1, 0)

        # conv sliding layer
        self.rpn_conv.weight.data.normal_(0, 0.01)
        self.rpn_conv.bias.data.zero_()

        # Regression layer
        self.reg_layer.weight.data.normal_(0, 0.01)
        self.reg_layer.bias.data.zero_()

        # classification layer
        self.cls_layer.weight.data.normal_(0, 0.01)
        self.cls_layer.bias.data.zero_()

    def forward(self, data):
        out_map = self.features(data)
        # for layer in self.features:
        #     # print layer
        #     data = layer(data)
        #     # print data.data.shape
        #
        # # out = data.view(data.size(0), -1)
        x = self.rpn_conv(out_map)
        pred_anchor_locs = self.reg_layer(x)  # regression head: the four anchor-to-box coefficients
        pred_cls_scores = self.cls_layer(x)   # classification head: whether the anchor captures an object

        return out_map, pred_anchor_locs, pred_cls_scores
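    # Editorial note: unlike faster_rcnn.py, which truncates torchvision's
    # vgg16, this hand-rolled VGG inserts BatchNorm after every conv (closer
    # to vgg16_bn) and omits the fifth max-pool, so the four 'M' entries give
    # an overall stride of 16 and an 800x800 input yields a 50x50 feature map.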
    def roi_classifier(self, class_num=20):  # assume the VOC dataset, 20 classes
        # classification head
        self.roi_head_classifier = nn.Sequential(*[nn.Linear(25088, 4096),
                                                   nn.ReLU(),
                                                   nn.Linear(4096, 4096),
                                                   nn.ReLU()])
        self.cls_loc = nn.Linear(4096, (class_num + 1) * 4)  # (VOC 20 classes + 1 background. Each will have 4 co-ordinates)
        self.cls_loc.weight.data.normal_(0, 0.01)
        self.cls_loc.bias.data.zero_()


        self.score = nn.Linear(4096, class_num + 1)  # (VOC 20 classes + 1 background)

    def rpn_loss(self, rpn_loc, rpn_score, gt_rpn_loc, gt_rpn_label, weight=10.0):
        # For classification we use Cross Entropy loss
        gt_rpn_label = torch.autograd.Variable(gt_rpn_label.long())
        rpn_cls_loss = torch.nn.functional.cross_entropy(rpn_score, gt_rpn_label, ignore_index=-1)
        # print(rpn_cls_loss)  # Variable containing: 0.6931

        # For regression we use smooth L1 loss
        pos = gt_rpn_label.data > 0  # the regression loss only applies to positively labeled boxes
        mask = pos.unsqueeze(1).expand_as(rpn_loc)
        # print(mask.shape)  # (22500L, 4L)

        # Take the boxes with positive labels
        mask_pred_loc = rpn_loc[mask].view(-1, 4)
        mask_target_loc = gt_rpn_loc[mask].view(-1, 4)
        # print(mask_pred_loc.shape, mask_target_loc.shape)  # ((18L, 4L), (18L, 4L))

        # The regression loss is applied as follows
        x = np.abs(mask_target_loc.numpy() - mask_pred_loc.data.numpy())
        # print x.shape  # (18, 4)
        # print (x < 1)
        rpn_loc_loss = ((x < 1) * 0.5 * x ** 2) + ((x >= 1) * (x - 0.5))
        # print rpn_loc_loss.shape  # (18, 4)
        rpn_loc_loss = rpn_loc_loss.sum()  # 1.1628926242031001
        # print rpn_loc_loss
        # print rpn_loc_loss.shape
        # rpn_loc_loss = np.squeeze(rpn_loc_loss)
        # print rpn_loc_loss

        N_reg = (gt_rpn_label > 0).float().sum()
        N_reg = np.squeeze(N_reg.data.numpy())

        # print "N_reg: {}, {}".format(N_reg, N_reg.shape)
        rpn_loc_loss = rpn_loc_loss / N_reg
        rpn_loc_loss = np.float32(rpn_loc_loss)
        # rpn_loc_loss = torch.autograd.Variable(torch.from_numpy(rpn_loc_loss))

        rpn_cls_loss = np.squeeze(rpn_cls_loss.data.numpy())
        # print "rpn_cls_loss: {}".format(rpn_cls_loss)  # 0.693146109581
        # print 'rpn_loc_loss: {}'.format(rpn_loc_loss)  # 0.0646051466465
        rpn_loss = rpn_cls_loss + (weight * rpn_loc_loss)
        # print("rpn_loss: {}".format(rpn_loss))  # 1.33919757605
        return rpn_loss

    def roi_loss(self, pre_loc, pre_conf, target_loc, target_conf, weight=10.0):
        # Classification loss
        target_conf = torch.autograd.Variable(target_conf.long())
        pred_conf_loss = torch.nn.functional.cross_entropy(pre_conf, target_conf, ignore_index=-1)
        # print(pred_conf_loss)  # Variable containing: 3.0515

        # For regression we use smooth L1 loss,
        # computed the same way as the RPN regression loss
        # pre_loc_loss = REGLoss(pre_loc, target_loc)
        pos = target_conf.data > 0  # the regression loss only applies to positively labeled boxes
        mask = pos.unsqueeze(1).expand_as(pre_loc)  # (128, 4L)

        # Take the boxes with positive labels
        mask_pred_loc = pre_loc[mask].view(-1, 4)
        mask_target_loc = target_loc[mask].view(-1, 4)
        # print(mask_pred_loc.shape, mask_target_loc.shape)  # ((19L, 4L), (19L, 4L))

        x = np.abs(mask_target_loc.numpy() - mask_pred_loc.data.numpy())
        # print x.shape  # (19, 4)

        pre_loc_loss = ((x < 1) * 0.5 * x ** 2) + ((x >= 1) * (x - 0.5))
        # print(pre_loc_loss.sum())  # 1.4645805211187053

        N_reg = (target_conf > 0).float().sum()
        N_reg = np.squeeze(N_reg.data.numpy())
        pre_loc_loss = pre_loc_loss.sum() / N_reg
        pre_loc_loss = np.float32(pre_loc_loss)
        # print pre_loc_loss  # 0.077294916
        # pre_loc_loss = torch.autograd.Variable(torch.from_numpy(pre_loc_loss))
        # Total loss
        pred_conf_loss = np.squeeze(pred_conf_loss.data.numpy())
        total_loss = pred_conf_loss + (weight * pre_loc_loss)

        return total_loss
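    # Editorial note: both rpn_loss and roi_loss fold their intermediate
    # results into numpy, so they return plain numpy scalars rather than torch
    # Variables; they are fine for inspecting loss values, but as written they
    # cannot drive backpropagation.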


if __name__ == '__main__':
    vgg = VGG()
    print vgg
    data = torch.randn((1, 3, 800, 800))
    print data.shape
    data = torch.autograd.Variable(data)
    out_map, pred_anchor_locs, pred_cls_scores = vgg.forward(data)  # forward returns a 3-tuple
    print out_map.data.shape

--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
#coding:utf8

import torch
import torchvision
from PIL import Image, ImageDraw
import numpy as np
from model import VGG
import utils


# Overall flow:
# 1. The image goes through the VGG to get the feature map.
# 2. The feature map goes through the RPN to get, per valid anchor, an objectness
#    (foreground) confidence and the coefficients that turn it into a predicted box.
# 3. The feature map and the predicted boxes go through ROI Pooling to get a fixed-size
#    feature patch per predicted object: the predicted box is used to crop the object out
#    of the feature map, and since objects vary in size, ROI Pooling converts every crop
#    to a fixed 7*7 size, which makes classifying objects and refining boxes convenient.
# 4. The fixed-size object features go through the class head (self.score) to get the
#    class of each predicted box.
# 5. The fixed-size object features go through the location head (self.cls_loc) to get
#    the confidence and the refinement coefficients of each predicted box.

# Assume two ground-truth boxes in the image
bbox = np.asarray([[20, 30, 400, 500], [300, 400, 500, 600]], dtype=np.float32)  # [y1, x1, y2, x2] format
# Assume the labels of the two ground-truth boxes
labels = np.asarray([6, 8], dtype=np.int8)  # 0 represents background

img_tensor = torch.zeros((1, 3, 800, 800)).float()
img_var = torch.autograd.Variable(img_tensor)


# ---------------------step_1: compute the anchor targets: confidences (anchor_conf)
# and translation/scaling coefficients (anchor_locations)
# Initialize all anchors and find the valid ones with their indices
# anchors: (22500, 4)  valid_anchor_boxes: (8940, 4)  valid_anchor_index: 8940
anchors, valid_anchor_boxes, valid_anchor_index = utils.init_anchor()
# Compute the IOU of the valid anchors with all ground-truth boxes
# ious: (8940, 2), the IOU of each valid anchor box with each ground-truth box
ious = utils.compute_iou(valid_anchor_boxes, bbox)
valid_anchor_len = len(valid_anchor_boxes)
# Pick a fixed ratio of positive and negative samples among the valid anchors
label, argmax_ious = utils.get_pos_neg_sample(ious, valid_anchor_len, pos_iou_threshold=0.7,
                                              neg_iou_threshold=0.3, pos_ratio=0.5, n_sample=256)
# print np.sum(label == 1)  # 18 positives
# print np.sum(label == 0)  # 256-18=238 negatives

# Now assign a location target to every anchor box, using the ground-truth object it
# overlaps most. Note that locs are assigned to all valid anchor boxes regardless of
# their label; a simple mask filters them out later when the loss is computed.
# The ground-truth box (bbox) matched to each valid anchor
max_iou_bbox = bbox[argmax_ious]  # ground-truth coordinates matched to each valid anchor (8940, 4)
# print max_iou_bbox.shape  # (8940, 4): one (y1, x1, y2, x2) target per valid anchor
# Assign anchor_locs to all valid anchor boxes: the translation/scaling coefficients
# that map each valid anchor onto its matched ground-truth box (bbox)
anchor_locs = utils.get_coefficient(valid_anchor_boxes, max_iou_bbox)
# print(anchor_locs.shape)  # (8940, 4): translation dy, dx; scaling dh, dw

# anchor_conf: the label of every anchor box (-1: invalid, 0: valid negative, 1: valid positive)
anchor_conf = np.empty((len(anchors),), dtype=label.dtype)
anchor_conf.fill(-1)
anchor_conf[valid_anchor_index] = label
print anchor_conf.shape  # labels of all anchors (feature_size*feature_size*9) => (22500,)

# anchor_locations: the anchor-to-ground-truth coefficients of every anchor box;
# invalid anchors get all-zero coefficients, valid anchors get real ones
anchor_locations = np.empty((len(anchors),) + anchors.shape[1:], dtype=anchor_locs.dtype)
anchor_locations.fill(0)
anchor_locations[valid_anchor_index, :] = anchor_locs
print anchor_locations.shape  # coefficients of all anchors (feature_size*feature_size*9, 4) => (22500, 4)

# Candidate anchors and ground-truth boxes have now produced the anchor confidences
# (anchor_conf) and translation/scaling coefficients (anchor_locations)
# ----------------------


# --------------------step_2: VGG and RPN models: the RPN predicts, per anchor, the
# translation/scaling coefficients that turn it into a predicted box
vgg = VGG()
# out_map: the feature map; pred_anchor_locs: predicted anchor-to-box coefficients;
# pred_anchor_conf: predicted anchor scores
out_map, pred_anchor_locs, pred_anchor_conf = vgg.forward(img_var)
print out_map.data.shape  # (batch_size, num, feature_size, feature_size) => (1, 512, 50, 50)

# 1. pred_anchor_locs: the predicted anchor-to-box coefficients (translation/scaling),
#    lined up with anchor_locations
pred_anchor_locs = pred_anchor_locs.permute(0, 2, 3, 1).contiguous().view(1, -1, 4)
print(pred_anchor_locs.shape)  # Out: torch.Size([1, 22500, 4])

# 2. The predicted confidence of each anchor box, lined up with anchor_conf
pred_anchor_conf = pred_anchor_conf.permute(0, 2, 3, 1).contiguous()
print(pred_anchor_conf.shape)  # Out: torch.Size([1, 50, 50, 18])
objectness_score = pred_anchor_conf.view(1, 50, 50, 9, 2)[:, :, :, :, 1].contiguous().view(1, -1)
print(objectness_score.shape)  # Out: torch.Size([1, 22500])

pred_anchor_conf = pred_anchor_conf.view(1, -1, 2)
print(pred_anchor_conf.shape)  # Out: torch.Size([1, 22500, 2])
# ---------------------


# ---------------------step_3: RPN loss (between anchor targets and anchor predictions:
# a coefficient loss plus a confidence loss)
# From step_1 we have the anchor targets:
#   target anchor coefficients: anchor_locations (22500, 4)
#   target anchor confidences:  anchor_conf (22500,)

# From step_2 we have the anchor predictions:
#   RPN-predicted anchor coefficients: pred_anchor_locs (1, 22500, 4)
#   RPN-predicted anchor confidences:  pred_anchor_conf (1, 22500, 2)

# Re-arrange the inputs and outputs so they line up row by row
rpn_anchor_loc = pred_anchor_locs[0]
rpn_anchor_conf = pred_anchor_conf[0]
anchor_locations = torch.from_numpy(anchor_locations)
anchor_conf = torch.from_numpy(anchor_conf)
print(rpn_anchor_loc.shape, rpn_anchor_conf.shape, anchor_locations.shape, anchor_conf.shape)
# torch.Size([22500, 4]) torch.Size([22500, 2]) torch.Size([22500, 4]) torch.Size([22500])

# vgg.rpn_loss is the dedicated RPN loss (roi_loss computes the same expression)
rpn_loss = vgg.rpn_loss(rpn_anchor_loc, rpn_anchor_conf, anchor_locations, anchor_conf, weight=10.0)
print("rpn_loss: {}".format(rpn_loss))  # 1.33919
# ---------------------


# ---------------------step_4: from the anchors and the predicted anchor coefficients,
# compute the predicted boxes (roi) and their box-to-ground-truth coefficients (roi_locs),
# plus the class label of each predicted box (roi_labels)
# Apply the predicted translation/scaling coefficients to the anchors to get the predicted
# boxes (ROIs); then thin them out using the predicted scores and a size threshold
roi, score, order = utils.get_predict_bbox(anchors, pred_anchor_locs, objectness_score,
                                           n_train_pre_nms=12000, min_size=16)

# The predicted boxes (ROIs) still overlap heavily; thin them out further with NMS
roi = utils.nms(roi, score, order, nms_thresh=0.7, n_train_post_nms=2000)


# Using the IOU between the ROIs and the ground-truth boxes (bbox), decide which
# ground-truth box each ROI should predict (the one it overlaps most);
# filter the ROIs further by IOU and split them into positive and negative samples.
sample_roi, keep_index, gt_assignment, roi_labels = utils.get_propose_target(roi, bbox, labels,
                                                                             n_sample=128,
                                                                             pos_ratio=0.25,
                                                                             pos_iou_thresh=0.5,
                                                                             neg_iou_thresh_hi=0.5,
                                                                             neg_iou_thresh_lo=0.0)
# print(sample_roi.shape)  # (128, 4)
# The ground-truth box matched to each sampled ROI
bbox_for_sampled_roi = bbox[gt_assignment[keep_index]]
print(bbox_for_sampled_roi.shape)  # (128, 4)
# The true ROI-to-ground-truth coefficients
roi_locs = utils.get_coefficient(sample_roi, bbox_for_sampled_roi)
# ---------------------


# ---------------------step_5: ROI Pooling:
# This step does two things:
# 1. crops each predicted object out of the feature map using its ROI (im);
# 2. passes the crop through adaptive_max_pool to get a fixed size (512, 7, 7),
#    which makes later batch processing convenient.
# Two properties of this design:
# 1. prediction happens on the VGG feature map rather than the input image, which
#    saves computation;
# 2. since objects come in all sizes, ROI Pooling standardizes the output to a
#    fixed (512, 7, 7), which makes batch processing convenient.
# sample_roi: the sampled predicted boxes (128, 4)
rois = torch.from_numpy(sample_roi).float()
# roi_indices: the image index of each ROI [there is only one image here, index 0]
roi_indices = 0 * np.ones((len(rois),), dtype=np.int32)
roi_indices = torch.from_numpy(roi_indices).float()
print(rois.shape, roi_indices.shape)  # torch.Size([128, 4]) torch.Size([128])

# Concatenate the image indices with the sampled boxes, giving a tensor of
# shape [N, 5], 5 => (index, y1, x1, y2, x2)
indices_and_rois = torch.cat([roi_indices[:, None], rois], dim=1)  # torch.Size([128, 5])

output = []
rois = indices_and_rois.float()
rois[:, 1:].mul_(1 / 16.0)  # down-sample the boxes to match the feature map out_map
rois = rois.long()
num_rois = rois.size(0)
# out_map: (batch_size, num, feature_size, feature_size) => (1, 512, 50, 50)
for i in range(num_rois):
    roi = rois[i]
    im_idx = roi[0]  # image index
    # Select the feature map of image im_idx => (1, 512, 50, 50); with a single image
    # the shape does not change
    img_feats = out_map.narrow(0, im_idx, 1)
    # Crop the object out of img_feats using the box coordinates (rows y1:y2, cols x1:x2)
    im = img_feats[..., roi[1]:(roi[3] + 1), roi[2]:(roi[4] + 1)]
    # print im.shape
    # Pool the cropped object im with adaptive_max_pool down to a fixed (7, 7)
    # ==> (512, 7, 7), which makes the following batch processing convenient
    output.append(vgg.adaptive_max_pool(im)[0].data)
# ---------------------ROI Pooling


# ---------------------step_6: Classification: linear heads predicting each ROI's class,
# confidence, and refinement (translation/scaling) coefficients (distinct from the RPN)
# note: if your pytorch version is 0.3.1, you must run this:
# output = torch.stack(output)
output = torch.cat(output, 0)  # torch.Size([128, 512, 7, 7])
k = output.view(output.size(0), -1)  # [128, 25088]

k = torch.autograd.Variable(k)
k = vgg.roi_head_classifier(k)  # (128, 4096)
# torch.Size([128, 84]); 84 ==> (20+1)*4: 21 candidate classes per box (VOC's 20
# classes + 1 background), with 4 coordinate coefficients each
pred_roi_locs = vgg.cls_loc(k)
# pred_roi_labels: [128, 21], the class scores of each box
pred_roi_labels = vgg.score(k)
print(pred_roi_locs.data.shape, pred_roi_labels.data.shape)  # torch.Size([128, 84]), torch.Size([128, 21])
# ---------------------Classification


# ---------------------step_7: detector loss (between the true coefficients of the
# sampled boxes and the predicted ones, where the coefficients map each box onto
# its ground-truth box)
# From step_4 we have the targets:
#   ROI coefficients: roi_locs (128, 4)
#   ROI class labels: roi_labels (128,)

# From step_6 we have the predictions:
#   predicted ROI coefficients: pred_roi_locs (128, 84)
#   predicted ROI class scores: pred_roi_labels (128, 21)


gt_roi_loc = torch.from_numpy(roi_locs)
gt_roi_label = torch.from_numpy(np.float32(roi_labels)).long()
print(gt_roi_loc.shape, gt_roi_label.shape)  # torch.Size([128, 4]) torch.Size([128])

n_sample = pred_roi_locs.shape[0]
roi_loc = pred_roi_locs.view(n_sample, -1, 4)  # (128L, 21L, 4L)

roi_loc = roi_loc[torch.arange(0, n_sample).long(), gt_roi_label]  # pick the coefficients of each box's true class
# print(roi_loc.shape)  # torch.Size([128, 4])

roi_loss = vgg.roi_loss(roi_loc, pred_roi_labels, gt_roi_loc, gt_roi_label, weight=10.0)
print(roi_loss)  # 3.810348778963089


# Total loss
total_loss = rpn_loss + roi_loss
print total_loss  # 5.149546355009079
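
# Editorial note: the script stops at computing total_loss; both loss helpers fold
# their results into numpy scalars, so this value cannot drive backpropagation as-is.
# A real training step, assuming torch-only (differentiable) loss computation, would
# look like this sketch:
#
# optimizer = torch.optim.SGD(vgg.parameters(), lr=1e-3, momentum=0.9)
# optimizer.zero_grad()
# total_loss.backward()
# optimizer.step()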
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
# coding:utf8
import sys
import numpy as np
import torch

def init_anchor(img_size=800, sub_sample=16):
    ratios = [0.5, 1, 2]
    anchor_scales = [8, 16, 32]  # these scales are relative to the feature map

    # Each feature point corresponds to a 16*16-pixel region of the input image;
    # 'img_size // sub_sample' gives the feature-map size
    feature_size = (img_size // sub_sample)
    # This effectively splits the image into a feature_size*feature_size grid,
    # one grid cell per feature point.
    # ctr_x, ctr_y: bottom-right corner of each grid cell
    ctr_x = np.arange(sub_sample, (feature_size + 1) * sub_sample, sub_sample)  # feature_size values
    ctr_y = np.arange(sub_sample, (feature_size + 1) * sub_sample, sub_sample)  # feature_size values
    # print len(ctr_x)  # 50

    index = 0
    # ctr: the center of each grid cell, feature_size*feature_size cells in total
    ctr = dict()
    for x in range(len(ctr_x)):
        for y in range(len(ctr_y)):
            ctr[index] = [-1, -1]
            ctr[index][1] = ctr_x[x] - 8  # bottom-right corner - 8 = center
            ctr[index][0] = ctr_y[y] - 8
            index += 1
    # print len(ctr)  # centers of the 50*50=2500 (feature_size*feature_size) regions

    # Initialization: each region gets 9 candidate anchors; each anchor is (y1, x1, y2, x2)
    anchors = np.zeros(((feature_size * feature_size * 9), 4))  # (22500, 4)
    index = 0
    # Fill the anchor coordinates
    for c in ctr:
        ctr_y, ctr_x = ctr[c]
        for i in range(len(ratios)):
            for j in range(len(anchor_scales)):
                # anchor_scales are defined on the feature map, so multiply by the
                # down-sampling factor "sub_sample"
                h = sub_sample * anchor_scales[j] * np.sqrt(ratios[i])
                w = sub_sample * anchor_scales[j] * np.sqrt(1. / ratios[i])
                anchors[index, 0] = ctr_y - h / 2.
                anchors[index, 1] = ctr_x - w / 2.
                anchors[index, 2] = ctr_y + h / 2.
                anchors[index, 3] = ctr_x + w / 2.
                index += 1

    # Drop anchors whose coordinates cross the image boundary, keeping only
    # the boxes that lie fully inside the image
    valid_anchor_index = np.where(
        (anchors[:, 0] >= 0) &
        (anchors[:, 1] >= 0) &
        (anchors[:, 2] <= img_size) &
        (anchors[:, 3] <= img_size)
    )[0]  # np.where returns the indices that satisfy the condition
    # print valid_anchor_index.shape  # (8940,), i.e. 8940 anchors qualify

    # Coordinates of the valid anchors (anchors fully inside the image)
    valid_anchor_boxes = anchors[valid_anchor_index]
    # print(valid_anchor_boxes.shape)  # (8940, 4)

    return anchors, valid_anchor_boxes, valid_anchor_index


# Compute the IOU between each valid anchor box "valid_anchor_boxes" and each ground-truth box "bbox"
def compute_iou(valid_anchor_boxes, bbox):
    valid_anchor_num = len(valid_anchor_boxes)
    ious = np.empty((valid_anchor_num, 2), dtype=np.float32)
    ious.fill(0)
    for num1, i in enumerate(valid_anchor_boxes):
        ya1, xa1, ya2, xa2 = i
        anchor_area = (ya2 - ya1) * (xa2 - xa1)  # anchor box area
        for num2, j in enumerate(bbox):
            yb1, xb1, yb2, xb2 = j
            box_area = (yb2 - yb1) * (xb2 - xb1)  # ground-truth box area
            inter_x1 = max([xb1, xa1])
            inter_y1 = max([yb1, ya1])
            inter_x2 = min([xb2, xa2])
            inter_y2 = min([yb2, ya2])
            if (inter_x1 < inter_x2) and (inter_y1 < inter_y2):
                iter_area = (inter_y2 - inter_y1) * (inter_x2 - inter_x1)  # intersection of anchor box and ground-truth box
                iou = iter_area / (anchor_area + box_area - iter_area)  # IOU computation
            else:
                iou = 0.
            ious[num1, num2] = iou

    return ious


def get_pos_neg_sample(ious, valid_anchor_len, pos_iou_threshold=0.7, neg_iou_threshold=0.3, pos_ratio=0.5, n_sample=256):
    gt_argmax_ious = ious.argmax(axis=0)  # index of the anchor with the highest IOU for each ground-truth box; one per ground-truth box in the image
    gt_max_ious = ious[gt_argmax_ious, np.arange(ious.shape[1])]  # the highest IOU value for each ground-truth box, paired with gt_argmax_ious
    argmax_ious = ious.argmax(axis=1)  # index of the ground-truth box with the highest IOU for each anchor; every anchor box is matched to one ground-truth box
    max_ious = ious[np.arange(valid_anchor_len), argmax_ious]  # the highest IOU value for each anchor, paired with argmax_ious

    gt_argmax_ious = np.where(ious == gt_max_ious)[0]  # indices whose IOU equals a per-ground-truth maximum
    # print gt_argmax_ious.shape  # (18,): 18 in total

    label = np.empty((valid_anchor_len,), dtype=np.int32)
    label.fill(-1)
    # print label.shape  # (8940,)
    label[max_ious < neg_iou_threshold] = 0   # anchors whose highest IOU is below neg_iou_threshold are negatives
    label[gt_argmax_ious] = 1                 # anchors holding a global-maximum IOU are positives
    label[max_ious >= pos_iou_threshold] = 1  # anchors whose highest IOU is at least pos_iou_threshold are positives

    n_pos = pos_ratio * n_sample  # number of positive samples

    # Randomly keep n_pos positives
    pos_index = np.where(label == 1)[0]
    if len(pos_index) > n_pos:
        disable_index = np.random.choice(pos_index, size=(len(pos_index) - n_pos), replace=False)
        label[disable_index] = -1

    n_neg = n_sample - np.sum(label == 1)
    neg_index = np.where(label == 0)[0]

    if len(neg_index) > n_neg:
        disable_index = np.random.choice(neg_index, size=(len(neg_index) - n_neg), replace=False)
        label[disable_index] = -1

    return label, argmax_ious


def get_predict_bbox(anchors, pred_anchor_locs, objectness_score, n_train_pre_nms=12000, min_size=16):
    # Convert the anchor format from (y1, x1, y2, x2) to (ctr_x, ctr_y, h, w):
    anc_height = anchors[:, 2] - anchors[:, 0]
    anc_width = anchors[:, 3] - anchors[:, 1]
    anc_ctr_y = anchors[:, 0] + 0.5 * anc_height
    anc_ctr_x = anchors[:, 1] + 0.5 * anc_width

    # Using the four predicted coefficients, translate and scale the anchors into predicted boxes
    pred_anchor_locs_numpy = pred_anchor_locs[0].data.numpy()
    objectness_score_numpy = objectness_score[0].data.numpy()
    dy = pred_anchor_locs_numpy[:, 0::4]
    dx = pred_anchor_locs_numpy[:, 1::4]
    dh = pred_anchor_locs_numpy[:, 2::4]
    dw = pred_anchor_locs_numpy[:, 3::4]
    ctr_y = dy * anc_height[:, np.newaxis] + anc_ctr_y[:, np.newaxis]
    ctr_x = dx * anc_width[:, np.newaxis] + anc_ctr_x[:, np.newaxis]
    h = np.exp(dh) * anc_height[:, np.newaxis]
    w = np.exp(dw) * anc_width[:, np.newaxis]

    # Convert the predicted boxes to [y1, x1, y2, x2] format
    roi = np.zeros(pred_anchor_locs_numpy.shape, dtype=pred_anchor_locs_numpy.dtype)
    roi[:, 0::4] = ctr_y - 0.5 * h
    roi[:, 1::4] = ctr_x - 0.5 * w
    roi[:, 2::4] = ctr_y + 0.5 * h
    roi[:, 3::4] = ctr_x + 0.5 * w

    # Keep every predicted box inside the image: y1, y2 within (0, img_size[0]),
    # x1, x2 within (0, img_size[1])
    img_size = (800, 800)  # Image size
    roi[:, slice(0, 4, 2)] = np.clip(roi[:, slice(0, 4, 2)], 0, img_size[0])
    roi[:, slice(1, 4, 2)] = np.clip(roi[:, slice(1, 4, 2)], 0, img_size[1])
    # print(roi.shape)  # (22500, 4)

    # Drop predicted boxes with height or width < min_size
    # (open question from the author: might this discard small objects?)
    hs = roi[:, 2] - roi[:, 0]
    ws = roi[:, 3] - roi[:, 1]
    keep = np.where((hs >= min_size) & (ws >= min_size))[0]
    roi = roi[keep, :]
    score = objectness_score_numpy[keep]

    # Sort all (proposal, score) pairs by score, highest first
    order = score.ravel().argsort()[::-1]  # (22500,)
    # Keep the top pre_nms_topN proposals (12000 at train time, 6000 at test time)
    order = order[:n_train_pre_nms]

    return roi, score, order

# torch.masked_select()

def nms(roi, score, order, nms_thresh=0.7, n_train_post_nms=2000):
    # NMS (non-maximum suppression): drop every box whose IOU with a higher-scoring
    # box exceeds 0.7, keeping high-scoring, mostly non-overlapping boxes
    roi = roi[order, :]  # (12000, 4)
    score = score[order]
    y1 = roi[:, 0]
    x1 = roi[:, 1]
    y2 = roi[:, 2]
    x2 = roi[:, 3]

    areas = (x2 - x1 + 1) * (y2 - y1 + 1)

    order = score.argsort()[::-1]
    # print score
    # print order
    keep = []
    while order.size > 0:
        # print order
        i = order[0]
        keep.append(i)
        xx1 = np.maximum(x1[i], x1[order[1:]])
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])

        w = np.maximum(0.0, xx2 - xx1 + 1)
        h = np.maximum(0.0, yy2 - yy1 + 1)
        inter = w * h
        ovr = inter / (areas[i] + areas[order[1:]] - inter)

        # print ovr
        inds = np.where(ovr <= nms_thresh)[0]
        # print inds
        order = order[inds + 1]  # +1 because the IOU comparison above skipped the first element (order[1:])

    keep = keep[:n_train_post_nms]  # while training/testing, use accordingly
    roi = roi[keep]  # the final region proposals (i.e. the predicted boxes)
    # print roi.shape  # (1758, 4)
    return roi


def get_propose_target(roi, bbox, labels, n_sample=128, pos_ratio=0.25,
                       pos_iou_thresh=0.5, neg_iou_thresh_hi=0.5, neg_iou_thresh_lo=0.0):
    # Proposal targets
    # Compute the IOU of each ground-truth box (bbox) with every region proposal (roi)
    ious = compute_iou(roi, bbox)
    # print(ious.shape)  # (1758, 2)

    # For each region proposal, find the ground truth with the higher IoU, and that maximum IoU
    gt_assignment = ious.argmax(axis=1)
    max_iou = ious.max(axis=1)
    # print(gt_assignment)  # [0 0 1 ... 0 0 0]
    # print(max_iou)  # [0.17802152 0.17926688 0.04676317 ... 0. 0. 0. ]

    # Assign a label to each proposal:
    gt_roi_label = labels[gt_assignment]
    # print(gt_roi_label)  # [6 6 8 ... 6 6 6]

    # We want to keep only n_sample*pos_ratio (128*0.25=32) foreground samples; if fewer
    # than 32 positives turn up, leave them as they are.
    # If more than 32 foreground samples turn up, sample 32 of them.
    pos_roi_per_image = n_sample * pos_ratio
    pos_index = np.where(max_iou >= pos_iou_thresh)[0]
    pos_roi_per_this_image = int(min(pos_roi_per_image, pos_index.size))
    if pos_index.size > 0:
        pos_index = np.random.choice(pos_index, size=pos_roi_per_this_image, replace=False)
    # print(pos_roi_per_this_image)
    # print(pos_index)  # 19

    # Do the same for the negative (background) region proposals
    neg_index = np.where((max_iou < neg_iou_thresh_hi) & (max_iou >= neg_iou_thresh_lo))[0]
    neg_roi_per_this_image = n_sample - pos_roi_per_this_image
    neg_roi_per_this_image = int(min(neg_roi_per_this_image, neg_index.size))
    if neg_index.size > 0:
        neg_index = np.random.choice(neg_index, size=neg_roi_per_this_image, replace=False)
    # print(neg_roi_per_this_image)
    # print(neg_index)  # 109

    keep_index = np.append(pos_index, neg_index)
    gt_roi_labels = gt_roi_label[keep_index]
    gt_roi_labels[pos_roi_per_this_image:] = 0  # negative labels --> 0
    sample_roi = roi[keep_index]  # sampled predicted boxes
    # print(sample_roi.shape)  # (128, 4)
    return sample_roi, keep_index, gt_assignment, gt_roi_labels


def get_coefficient(anchor, bbox):
    # From the given boxes and their matched target boxes, compute the 4-d coefficients
    # (translation: dy, dx; scaling: dh, dw)
    height = anchor[:, 2] - anchor[:, 0]
    width = anchor[:, 3] - anchor[:, 1]
    ctr_y = anchor[:, 0] + 0.5 * height
    ctr_x = anchor[:, 1] + 0.5 * width
    base_height = bbox[:, 2] - bbox[:, 0]
    base_width = bbox[:, 3] - bbox[:, 1]
    base_ctr_y = bbox[:, 0] + 0.5 * base_height
    base_ctr_x = bbox[:, 1] + 0.5 * base_width

    eps = np.finfo(height.dtype).eps
    height = np.maximum(height, eps)
    width = np.maximum(width, eps)

    dy = (base_ctr_y - ctr_y) / height
    dx = (base_ctr_x - ctr_x) / width
    dh = np.log(base_height / height)
    dw = np.log(base_width / width)

    gt_roi_locs = np.vstack((dy, dx, dh, dw)).transpose()
    # print(gt_roi_locs.shape)

    return gt_roi_locs
--------------------------------------------------------------------------------