├── README.md
├── faster_rcnn.py
├── model.py
├── train.py
└── utils.py

/README.md:
--------------------------------------------------------------------------------
# simple_faster_rcnn
Running this repository only depends on Python 2.7 and PyTorch (0.3.1 or 0.4.1).

English Blog: https://medium.com/@fractaldle/guide-to-build-faster-rcnn-in-pytorch-95b10c273439

Chinese Blog: https://zhuanlan.zhihu.com/p/69250914

# Note: the code currently targets PyTorch 0.4.1. If you run this repository on PyTorch 0.3.1, please check the note in step_6 of train.py:
'''

if your pytorch version is 0.3.1, you must run this:

output = torch.stack(output)

'''
--------------------------------------------------------------------------------
/faster_rcnn.py:
--------------------------------------------------------------------------------
#coding:utf8

import torch
import torchvision
from PIL import Image, ImageDraw
import numpy as np

img_tensor = torch.zeros((1, 3, 800, 800)).float()
print(img_tensor.shape)
#Out: torch.Size([1, 3, 800, 800])

img_var = torch.autograd.Variable(img_tensor)

model = torchvision.models.vgg16(pretrained=False)
fe = list(model.features)
print(fe)  # length is 31 (13 conv + 13 ReLU + 5 max-pool layers)

req_features = []
k = img_var.clone()
for i in fe:
    print i
    k = i(k)
    print k.data.shape
    if k.size()[2] < 800 // 16:
        break
    req_features.append(i)
    out_channels = k.size()[1]
print(len(req_features))  # 30
print(out_channels)  # 512

for f in req_features:
    print f


faster_rcnn_fe_extractor = torch.nn.Sequential(*req_features)
out_map = faster_rcnn_fe_extractor(img_var)
print(out_map.size())
#Out: torch.Size([1, 512, 50, 50])

ratios = [0.5, 1, 2]
anchor_scales = [8, 16, 32]
sub_sample = 16

# Each feature-map cell corresponds to a 16*16-pixel region of the input image.
fe_size = (800 // 16)
# ctr_x, ctr_y: bottom-right corner of the image region covered by each feature cell
ctr_x = np.arange(16, (fe_size + 1) * 16, 16)
ctr_y = np.arange(16, (fe_size + 1) * 16, 16)
print len(ctr_x)  # 50*50 feature cells in total, splitting the image into 50*50=2500 regions


index = 0
# ctr: center of the image region covered by each feature cell
ctr = dict()
for x in range(len(ctr_x)):
    for y in range(len(ctr_y)):
        ctr[index] = [-1, -1]
        ctr[index][1] = ctr_x[x] - 8
        ctr[index][0] = ctr_y[y] - 8
        index += 1
# print ctr
print len(ctr)  # center points of the 50*50=2500 regions


# Initialization: each region gets 9 candidate anchors; each anchor is (y1, x1, y2, x2)
anchors = np.zeros(((fe_size * fe_size * 9), 4))
# (22500, 4)
print anchors.shape
index = 0
# Fill the anchor coordinates
for c in ctr:
    ctr_y, ctr_x = ctr[c]
    for i in range(len(ratios)):
        for j in range(len(anchor_scales)):
            # anchor_scales are defined on the feature map, so multiply by the
            # down-sampling factor "sub_sample"
            h = sub_sample * anchor_scales[j] * np.sqrt(ratios[i])
            w = sub_sample * anchor_scales[j] * np.sqrt(1. / ratios[i])
            anchors[index, 0] = ctr_y - h / 2.
            anchors[index, 1] = ctr_x - w / 2.
            anchors[index, 2] = ctr_y + h / 2.
            anchors[index, 3] = ctr_x + w / 2.
            index += 1
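# Editorial sanity check (not in the original script; it relies on CPython 2's
# iteration order for small consecutive int dict keys): the first feature cell
# is centered at (8, 8), and the first (ratio, scale) pair is (0.5, 8), so
# anchors[0] should be a 16*8*sqrt(0.5)-by-16*8*sqrt(2) box around that center.
h0 = sub_sample * anchor_scales[0] * np.sqrt(ratios[0])
w0 = sub_sample * anchor_scales[0] * np.sqrt(1. / ratios[0])
print np.allclose(anchors[0], [8 - h0 / 2., 8 - w0 / 2., 8 + h0 / 2., 8 + w0 / 2.])  # True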
# (22500, 4)
print(anchors.shape)

img_npy = img_tensor.numpy()
img_npy = np.transpose(img_npy[0], (1, 2, 0)).astype(np.float32)
img = Image.fromarray(np.uint8(img_npy))
# img.show()
draw = ImageDraw.Draw(img)

# for index in range(15000, 15009):
# # for index in range(len(anchors)):
#     draw.rectangle([(anchors[index, 1], anchors[index, 0]), (anchors[index, 3], anchors[index, 2])], outline=(255, 0, 0))
# img.show()

# Assume two ground-truth boxes in the image
bbox = np.asarray([[20, 30, 400, 500], [300, 400, 500, 600]], dtype=np.float32)  # [y1, x1, y2, x2] format
draw.rectangle([(30, 20), (500, 400)], outline=(100, 255, 0))
draw.rectangle([(400, 300), (600, 500)], outline=(100, 255, 0))

# Assume the labels of the two ground-truth boxes
labels = np.asarray([6, 8], dtype=np.int8)  # 0 represents background

# Drop anchors whose coordinates cross the image boundary, keeping only
# the boxes that lie fully inside the image
valid_anchor_index = np.where(
    (anchors[:, 0] >= 0) &
    (anchors[:, 1] >= 0) &
    (anchors[:, 2] <= 800) &
    (anchors[:, 3] <= 800)
)[0]  # np.where returns the indices that satisfy the condition
print valid_anchor_index.shape  # (8940,), i.e. 8940 anchors qualify


# Coordinates of the valid anchors (anchors fully inside the image)
valid_anchor_boxes = anchors[valid_anchor_index]
print(valid_anchor_boxes.shape)  # (8940, 4)


# Compute the IOU between each valid anchor box "valid_anchor_boxes" and each ground-truth box "bbox"
ious = np.empty((len(valid_anchor_boxes), 2), dtype=np.float32)
ious.fill(0)
print(bbox)
for num1, i in enumerate(valid_anchor_boxes):
    ya1, xa1, ya2, xa2 = i
    anchor_area = (ya2 - ya1) * (xa2 - xa1)  # anchor box area
    for num2, j in enumerate(bbox):
        yb1, xb1, yb2, xb2 = j
        box_area = (yb2 - yb1) * (xb2 - xb1)  # ground-truth box area
        inter_x1 = max([xb1, xa1])
        inter_y1 = max([yb1, ya1])
        inter_x2 = min([xb2, xa2])
        inter_y2 = min([yb2, ya2])
        if (inter_x1 < inter_x2) and (inter_y1 < inter_y2):
            iter_area = (inter_y2 - inter_y1) * (inter_x2 - inter_x1)  # intersection of anchor box and ground-truth box
            iou = iter_area / (anchor_area + box_area - iter_area)  # IOU computation
        else:
            iou = 0.

        ious[num1, num2] = iou
print(ious.shape)  # (8940, 2): the IOU of every anchor box with every ground-truth box (2 of them here)
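# Editorial note: the double loop above is easy to read but slow; an equivalent
# vectorized version (a sketch, not used by the rest of the script) computes
# the same (8940, 2) matrix with numpy broadcasting:
def bbox_iou_vectorized(boxes_a, boxes_b):
    # boxes are (N, 4) / (M, 4) arrays in (y1, x1, y2, x2) format
    tl = np.maximum(boxes_a[:, None, :2], boxes_b[None, :, :2])  # intersection top-left
    br = np.minimum(boxes_a[:, None, 2:], boxes_b[None, :, 2:])  # intersection bottom-right
    inter = np.prod(np.clip(br - tl, 0, None), axis=2)           # zero when there is no overlap
    area_a = np.prod(boxes_a[:, 2:] - boxes_a[:, :2], axis=1)
    area_b = np.prod(boxes_b[:, 2:] - boxes_b[:, :2], axis=1)
    return inter / (area_a[:, None] + area_b[None, :] - inter)

# print np.allclose(bbox_iou_vectorized(valid_anchor_boxes, bbox), ious)  # True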
gt_argmax_ious = ious.argmax(axis=0)  # index of the anchor with the highest IOU for each ground-truth box, 2 in total
print(gt_argmax_ious)  # 2 values, matching the number of ground-truth boxes in the image
gt_max_ious = ious[gt_argmax_ious, np.arange(ious.shape[1])]  # the highest IOU value for each ground-truth box, paired with gt_argmax_ious
print(gt_max_ious)  # 2 values, matching the number of ground-truth boxes in the image
argmax_ious = ious.argmax(axis=1)  # index of the ground-truth box with the highest IOU for each anchor, 8940 in total
print(argmax_ious.shape)  # (8940,): every anchor box is matched to one ground-truth box
max_ious = ious[np.arange(len(valid_anchor_index)), argmax_ious]  # the highest IOU value for each anchor, paired with argmax_ious
print(max_ious.shape)  # (8940,): one maximum per anchor box

# Author's question: does ious == gt_max_ious distinguish between the targets?
# (Broadcasting compares each anchor's row against both per-target maxima.)
gt_argmax_ious = np.where(ious == gt_max_ious)[0]  # indices whose IOU equals a per-ground-truth maximum
print gt_argmax_ious.shape  # (18,): 18 in total
# for index in gt_argmax_ious:
#     draw.rectangle([(valid_anchor_boxes[index, 1], valid_anchor_boxes[index, 0]),
#                     (valid_anchor_boxes[index, 3], valid_anchor_boxes[index, 2])], outline=(255, 0, 0))
# img.show()


pos_iou_threshold = 0.7
neg_iou_threshold = 0.3
label = np.empty((len(valid_anchor_index), ), dtype=np.int32)
label.fill(-1)
print label.shape  # (8940,)
label[max_ious < neg_iou_threshold] = 0   # anchors whose highest IOU is below neg_iou_threshold are negatives
label[gt_argmax_ious] = 1                 # anchors holding a global-maximum IOU are positives
label[max_ious >= pos_iou_threshold] = 1  # anchors whose highest IOU is at least pos_iou_threshold are positives



pos_ratio = 0.5
n_sample = 256
n_pos = pos_ratio * n_sample  # number of positive samples

# Randomly keep n_pos positives
pos_index = np.where(label == 1)[0]
if len(pos_index) > n_pos:
    disable_index = np.random.choice(pos_index, size=(len(pos_index) - n_pos), replace=False)
    label[disable_index] = -1

n_neg = n_sample - np.sum(label == 1)
neg_index = np.where(label == 0)[0]

if len(neg_index) > n_neg:
    disable_index = np.random.choice(neg_index, size=(len(neg_index) - n_neg), replace=False)
    label[disable_index] = -1
print np.sum(label == 1)  # 18 positives
print np.sum(label == 0)  # 256-18=238 negatives


# Now assign a location target to every anchor box, using the ground-truth object
# it overlaps most. Note that locs are assigned to all valid anchor boxes regardless
# of their label; a simple mask filters them out later when the loss is computed.
max_iou_bbox = bbox[argmax_ious]  # ground-truth coordinates matched to each valid anchor (8940, 4)
print(max_iou_bbox)
print max_iou_bbox.shape  # (8940, 4): one (y1, x1, y2, x2) target per valid anchor

# Center point and size of the valid anchors: ctr_x, ctr_y, width, height
height = valid_anchor_boxes[:, 2] - valid_anchor_boxes[:, 0]
width = valid_anchor_boxes[:, 3] - valid_anchor_boxes[:, 1]
ctr_y = valid_anchor_boxes[:, 0] + 0.5 * height
ctr_x = valid_anchor_boxes[:, 1] + 0.5 * width
# Center point and size of the matched ground-truth boxes: base_ctr_x, base_ctr_y, base_width, base_height
base_height = max_iou_bbox[:, 2] - max_iou_bbox[:, 0]
base_width = max_iou_bbox[:, 3] - max_iou_bbox[:, 1]
base_ctr_y = max_iou_bbox[:, 0] + 0.5 * base_height
base_ctr_x = max_iou_bbox[:, 1] + 0.5 * base_width

# Coefficients mapping each valid anchor onto its ground-truth box
# (dy, dx translate; dh, dw scale)
eps = np.finfo(height.dtype).eps
height = np.maximum(height, eps)
width = np.maximum(width, eps)
dy = (base_ctr_y - ctr_y) / height
dx = (base_ctr_x - ctr_x) / width
dh = np.log(base_height / height)
dw = np.log(base_width / width)
anchor_locs = np.vstack((dy, dx, dh, dw)).transpose()
# print anchor_locs
print(anchor_locs.shape)
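# Editorial round-trip check (not in the original script): decoding the
# coefficients must recover the matched ground-truth boxes, which confirms
# the (dy, dx, dh, dw) parameterization is self-consistent.
dec_ctr_y = ctr_y + dy * height
dec_ctr_x = ctr_x + dx * width
dec_h = height * np.exp(dh)
dec_w = width * np.exp(dw)
decoded = np.vstack((dec_ctr_y - 0.5 * dec_h, dec_ctr_x - 0.5 * dec_w,
                     dec_ctr_y + 0.5 * dec_h, dec_ctr_x + 0.5 * dec_w)).transpose()
assert np.allclose(decoded, max_iou_bbox)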

# anchor_labels: the label of every anchor box
# (-1: invalid anchor, 0: valid negative anchor, 1: valid positive anchor)
anchor_labels = np.empty((len(anchors),), dtype=label.dtype)
anchor_labels.fill(-1)
anchor_labels[valid_anchor_index] = label

# anchor_locations: the anchor-to-ground-truth coefficients of every valid anchor box
anchor_locations = np.empty((len(anchors),) + anchors.shape[1:], dtype=anchor_locs.dtype)
anchor_locations.fill(0)
anchor_locations[valid_anchor_index, :] = anchor_locs


# Region Proposal Network (RPN)
import torch.nn as nn
mid_channels = 512
in_channels = 512  # depends on the output feature map; in vgg16 it is equal to 512
n_anchor = 9  # number of anchors at each location
conv1 = nn.Conv2d(in_channels, mid_channels, 3, 1, 1)
reg_layer = nn.Conv2d(mid_channels, n_anchor * 4, 1, 1, 0)
cls_layer = nn.Conv2d(mid_channels, n_anchor * 2, 1, 1, 0)  # softmax is used here; sigmoid works equally well if you replace 2 with 1

# conv sliding layer
conv1.weight.data.normal_(0, 0.01)
conv1.bias.data.zero_()

# Regression layer
reg_layer.weight.data.normal_(0, 0.01)
reg_layer.bias.data.zero_()

# classification layer
cls_layer.weight.data.normal_(0, 0.01)
cls_layer.bias.data.zero_()

x = conv1(out_map)  # out_map is obtained in section 1
pred_anchor_locs = reg_layer(x)  # regression head: the four anchor-to-box coefficients
pred_cls_scores = cls_layer(x)   # classification head: whether the anchor captures an object

print(pred_cls_scores.shape, pred_anchor_locs.shape)  # ((1L, 18L, 50L, 50L), (1L, 36L, 50L, 50L))

pred_anchor_locs = pred_anchor_locs.permute(0, 2, 3, 1).contiguous().view(1, -1, 4)
print(pred_anchor_locs.shape)
#Out: torch.Size([1, 22500, 4])

pred_cls_scores = pred_cls_scores.permute(0, 2, 3, 1).contiguous()
print(pred_cls_scores.shape)
#Out: torch.Size([1, 50, 50, 18])

objectness_score = pred_cls_scores.view(1, 50, 50, 9, 2)[:, :, :, :, 1].contiguous().view(1, -1)
print(objectness_score.shape)
#Out: torch.Size([1, 22500])

pred_cls_scores = pred_cls_scores.view(1, -1, 2)
print(pred_cls_scores.shape)
#Out: torch.Size([1, 22500, 2])
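# Editorial note: the two channels per anchor are treated as (background,
# foreground) logits, and the script ranks proposals by the raw foreground
# logit (channel 1). If calibrated probabilities are wanted instead, a softmax
# over the last dimension would give them (a sketch, not used below):
# fg_prob = torch.nn.functional.softmax(pred_cls_scores.view(-1, 2), dim=1)[:, 1]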

# Generating proposals to feed the Fast R-CNN network
n_train_pre_nms = 12000
n_train_post_nms = 2000
n_test_pre_nms = 6000
n_test_post_nms = 300
min_size = 16

# Convert the anchor format from (y1, x1, y2, x2) to (ctr_x, ctr_y, h, w):
anc_height = anchors[:, 2] - anchors[:, 0]
anc_width = anchors[:, 3] - anchors[:, 1]
anc_ctr_y = anchors[:, 0] + 0.5 * anc_height
anc_ctr_x = anchors[:, 1] + 0.5 * anc_width


# Using the four predicted coefficients, translate and scale the anchors into predicted boxes
pred_anchor_locs_numpy = pred_anchor_locs[0].data.numpy()
objectness_score_numpy = objectness_score[0].data.numpy()
dy = pred_anchor_locs_numpy[:, 0::4]
dx = pred_anchor_locs_numpy[:, 1::4]
dh = pred_anchor_locs_numpy[:, 2::4]
dw = pred_anchor_locs_numpy[:, 3::4]
ctr_y = dy * anc_height[:, np.newaxis] + anc_ctr_y[:, np.newaxis]
ctr_x = dx * anc_width[:, np.newaxis] + anc_ctr_x[:, np.newaxis]
h = np.exp(dh) * anc_height[:, np.newaxis]
w = np.exp(dw) * anc_width[:, np.newaxis]

# Convert the predicted boxes to [y1, x1, y2, x2] format
roi = np.zeros(pred_anchor_locs_numpy.shape, dtype=pred_anchor_locs_numpy.dtype)
roi[:, 0::4] = ctr_y - 0.5 * h
roi[:, 1::4] = ctr_x - 0.5 * w
roi[:, 2::4] = ctr_y + 0.5 * h
roi[:, 3::4] = ctr_x + 0.5 * w


# Clip the predicted boxes to the image
img_size = (800, 800)  # Image size
roi[:, slice(0, 4, 2)] = np.clip(roi[:, slice(0, 4, 2)], 0, img_size[0])
roi[:, slice(1, 4, 2)] = np.clip(roi[:, slice(1, 4, 2)], 0, img_size[1])
print(roi.shape)  # (22500, 4)

# Drop predicted boxes with height or width < min_size
# (open question from the author: might this discard small objects?)
hs = roi[:, 2] - roi[:, 0]
ws = roi[:, 3] - roi[:, 1]
keep = np.where((hs >= min_size) & (ws >= min_size))[0]
roi = roi[keep, :]
score = objectness_score_numpy[keep]

# Sort all (proposal, score) pairs by score, highest first
order = score.ravel().argsort()[::-1]
print(order.shape)  # (22500,)

# Keep the top pre_nms_topN proposals (12000 at train time, 6000 at test time)
order = order[:n_train_pre_nms]
roi = roi[order, :]
print(roi.shape)  # (12000, 4)

# NMS (non-maximum suppression): drop every box whose IOU with a higher-scoring
# box exceeds 0.7, keeping high-scoring, mostly non-overlapping boxes
nms_thresh = 0.7
y1 = roi[:, 0]
x1 = roi[:, 1]
y2 = roi[:, 2]
x2 = roi[:, 3]

areas = (x2 - x1 + 1) * (y2 - y1 + 1)

score = score[order]
order = score.argsort()[::-1]
print order  # [11999 3996 4005 ... 7995 7994 0]
keep = []
while order.size > 0:
    i = order[0]
    keep.append(i)
    xx1 = np.maximum(x1[i], x1[order[1:]])
    yy1 = np.maximum(y1[i], y1[order[1:]])
    xx2 = np.minimum(x2[i], x2[order[1:]])
    yy2 = np.minimum(y2[i], y2[order[1:]])

    w = np.maximum(0.0, xx2 - xx1 + 1)
    h = np.maximum(0.0, yy2 - yy1 + 1)
    inter = w * h
    ovr = inter / (areas[i] + areas[order[1:]] - inter)

    inds = np.where(ovr <= nms_thresh)[0]
    order = order[inds + 1]
    # print ovr
    # print order

keep = keep[:n_train_post_nms]  # while training/testing, use accordingly
roi = roi[keep]  # the final region proposals (i.e. the predicted boxes)
print roi.shape  # (1758, 4)
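
# Editorial mini-demo of the greedy rule above, on standalone toy data: box B
# overlaps box A (IOU ~ 0.82 > 0.7) and scores lower, so it is suppressed,
# while the distant box C survives. All demo_* names are local to this example.
demo_box = np.array([[0, 0, 100, 100],       # A
                     [5, 5, 105, 105],       # B, heavily overlaps A
                     [200, 200, 300, 300]],  # C, disjoint from both
                    dtype=np.float32)
demo_score = np.array([0.9, 0.8, 0.7], dtype=np.float32)
demo_order = demo_score.argsort()[::-1]
demo_areas = (demo_box[:, 2] - demo_box[:, 0] + 1) * (demo_box[:, 3] - demo_box[:, 1] + 1)
demo_keep = []
while demo_order.size > 0:
    di = demo_order[0]
    demo_keep.append(di)
    dyy1 = np.maximum(demo_box[di, 0], demo_box[demo_order[1:], 0])
    dxx1 = np.maximum(demo_box[di, 1], demo_box[demo_order[1:], 1])
    dyy2 = np.minimum(demo_box[di, 2], demo_box[demo_order[1:], 2])
    dxx2 = np.minimum(demo_box[di, 3], demo_box[demo_order[1:], 3])
    d_inter = np.maximum(0.0, dxx2 - dxx1 + 1) * np.maximum(0.0, dyy2 - dyy1 + 1)
    d_ovr = d_inter / (demo_areas[di] + demo_areas[demo_order[1:]] - d_inter)
    demo_order = demo_order[np.where(d_ovr <= nms_thresh)[0] + 1]
print demo_keep  # [0, 2]: A and C survive, B is suppressed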

# Proposal targets
n_sample = 128
pos_ratio = 0.25
pos_iou_thresh = 0.5
neg_iou_thresh_hi = 0.5
neg_iou_thresh_lo = 0.0

# Compute the IOU of each ground-truth box with every region proposal (predicted box)
ious = np.empty((len(roi), 2), dtype=np.float32)
ious.fill(0)
for num1, i in enumerate(roi):
    ya1, xa1, ya2, xa2 = i
    anchor_area = (ya2 - ya1) * (xa2 - xa1)
    for num2, j in enumerate(bbox):
        yb1, xb1, yb2, xb2 = j
        box_area = (yb2 - yb1) * (xb2 - xb1)

        inter_x1 = max([xb1, xa1])
        inter_y1 = max([yb1, ya1])
        inter_x2 = min([xb2, xa2])
        inter_y2 = min([yb2, ya2])

        if (inter_x1 < inter_x2) and (inter_y1 < inter_y2):
            iter_area = (inter_y2 - inter_y1) * (inter_x2 - inter_x1)
            iou = iter_area / (anchor_area + box_area - iter_area)
        else:
            iou = 0.

        ious[num1, num2] = iou
print(ious.shape)  # (1758, 2)

# For each region proposal, find the ground truth with the higher IoU, and that maximum IoU
gt_assignment = ious.argmax(axis=1)
max_iou = ious.max(axis=1)
print(gt_assignment)  # [0 0 1 ... 0 0 0]
print(max_iou)  # [0.17802152 0.17926688 0.04676317 ... 0. 0. 0. ]


# Assign a label to each proposal:
gt_roi_label = labels[gt_assignment]
print(gt_roi_label)  # [6 6 8 ... 6 6 6]

# We want to keep only n_sample*pos_ratio (128*0.25=32) foreground samples; if fewer
# than 32 positives turn up, leave them as they are.
# If more than 32 foreground samples turn up, sample 32 of them.
pos_roi_per_image = 32
pos_index = np.where(max_iou >= pos_iou_thresh)[0]
pos_roi_per_this_image = int(min(pos_roi_per_image, pos_index.size))
if pos_index.size > 0:
    pos_index = np.random.choice(pos_index, size=pos_roi_per_this_image, replace=False)
# print(pos_roi_per_this_image)
print(pos_index)  # 19 positives in this run

# Do the same for the negative (background) region proposals
neg_index = np.where((max_iou < neg_iou_thresh_hi) & (max_iou >= neg_iou_thresh_lo))[0]
neg_roi_per_this_image = n_sample - pos_roi_per_this_image
neg_roi_per_this_image = int(min(neg_roi_per_this_image, neg_index.size))
if neg_index.size > 0:
    neg_index = np.random.choice(neg_index, size=neg_roi_per_this_image, replace=False)
# print(neg_roi_per_this_image)
print(neg_index)  # 109 negatives in this run

keep_index = np.append(pos_index, neg_index)
gt_roi_labels = gt_roi_label[keep_index]
gt_roi_labels[pos_roi_per_this_image:] = 0  # negative labels --> 0
sample_roi = roi[keep_index]  # sampled predicted boxes
print(sample_roi.shape)  # (128, 4)


bbox_for_sampled_roi = bbox[gt_assignment[keep_index]]  # matched ground-truth boxes
print(bbox_for_sampled_roi.shape)  # (128, 4)


# From the sampled predicted boxes and their matched ground-truth boxes, compute
# the 4-d coefficients (translation: dy, dx; scaling: dh, dw)
height = sample_roi[:, 2] - sample_roi[:, 0]
width = sample_roi[:, 3] - sample_roi[:, 1]
ctr_y = sample_roi[:, 0] + 0.5 * height
ctr_x = sample_roi[:, 1] + 0.5 * width
base_height = bbox_for_sampled_roi[:, 2] - bbox_for_sampled_roi[:, 0]
base_width = bbox_for_sampled_roi[:, 3] - bbox_for_sampled_roi[:, 1]
base_ctr_y = bbox_for_sampled_roi[:, 0] + 0.5 * base_height
base_ctr_x = bbox_for_sampled_roi[:, 1] + 0.5 * base_width

eps = np.finfo(height.dtype).eps
height = np.maximum(height, eps)
width = np.maximum(width, eps)

dy = (base_ctr_y - ctr_y) / height
dx = (base_ctr_x - ctr_x) / width
dh = np.log(base_height / height)
dw = np.log(base_width / width)

gt_roi_locs = np.vstack((dy, dx, dh, dw)).transpose()
print(gt_roi_locs.shape)


rois = torch.from_numpy(sample_roi).float()
roi_indices = 0 * np.ones((len(rois),), dtype=np.int32)
roi_indices = torch.from_numpy(roi_indices).float()
print(rois.shape, roi_indices.shape)  # torch.Size([128, 4]) torch.Size([128])

indices_and_rois = torch.cat([roi_indices[:, None], rois], dim=1)
xy_indices_and_rois = indices_and_rois[:, [0, 2, 1, 4, 3]]  # reorder to (index, x1, y1, x2, y2)
indices_and_rois = xy_indices_and_rois.contiguous()
print(xy_indices_and_rois.shape)  # torch.Size([128, 5])

# Build a 7x7 roi_pooling layer
size = (7, 7)
# Note: AdaptiveMaxPool2d's signature is (output_size, return_indices), so
# passing size[1] positionally makes return_indices truthy; that is why the
# pooled values are taken with [0] below.
adaptive_max_pool = torch.nn.AdaptiveMaxPool2d(size[0], size[1])
output = []
rois = indices_and_rois.float()
rois[:, 1:].mul_(1 / 16.0)  # Subsampling ratio
rois = rois.long()
num_rois = rois.size(0)
for i in range(num_rois):
    roi = rois[i]
    im_idx = roi[0]
    im = out_map.narrow(0, im_idx, 1)[..., roi[2]:(roi[4] + 1), roi[1]:(roi[3] + 1)]
    # print adaptive_max_pool(im)[0].data.shape
    output.append(adaptive_max_pool(im)[0].data)
output = torch.stack(output)
print output.shape
output = torch.cat(output, 0)
print(output.size())  # torch.Size([128, 512, 7, 7])

k = output.view(output.size(0), -1)
print(k.shape)  # [128, 25088]
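# Editorial note: 25088 = 512 * 7 * 7, i.e. every sampled ROI is now a fixed-
# length vector regardless of its original size. The same pooling can be done
# without the module object via the functional API (a sketch):
# pooled = torch.nn.functional.adaptive_max_pool2d(im, (7, 7))  # (1, 512, 7, 7)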

# classification head
roi_head_classifier = nn.Sequential(*[nn.Linear(25088, 4096),
                                      nn.ReLU(),  # ReLU activations, as in model.py
                                      nn.Linear(4096, 4096),
                                      nn.ReLU()])
cls_loc = nn.Linear(4096, 21 * 4)  # (VOC 20 classes + 1 background. Each will have 4 co-ordinates)
cls_loc.weight.data.normal_(0, 0.01)
cls_loc.bias.data.zero_()
score = nn.Linear(4096, 21)  # (VOC 20 classes + 1 background)

k = torch.autograd.Variable(k)
k = roi_head_classifier(k)
roi_cls_loc = cls_loc(k)
roi_cls_score = score(k)
print(roi_cls_loc.data.shape, roi_cls_score.data.shape)  # torch.Size([128, 84]), torch.Size([128, 21])


# RPN loss
print(pred_anchor_locs.shape)  # torch.Size([1, 22500, 4]): box coefficients predicted by the RPN
print(pred_cls_scores.shape)   # torch.Size([1, 22500, 2]): class scores predicted by the RPN
print(anchor_locations.shape)  # (22500, 4): target box coefficients of the anchors
print(anchor_labels.shape)     # (22500,): target labels of the anchors

# Re-arrange the inputs and outputs so they line up row by row
rpn_loc = pred_anchor_locs[0]
rpn_score = pred_cls_scores[0]
gt_rpn_loc = torch.from_numpy(anchor_locations)
gt_rpn_score = torch.from_numpy(anchor_labels)
print(rpn_loc.shape, rpn_score.shape, gt_rpn_loc.shape, gt_rpn_score.shape)
# torch.Size([22500, 4]) torch.Size([22500, 2]) torch.Size([22500, 4]) torch.Size([22500])

# For classification we use Cross Entropy loss
gt_rpn_score = torch.autograd.Variable(gt_rpn_score.long())
rpn_cls_loss = torch.nn.functional.cross_entropy(rpn_score, gt_rpn_score, ignore_index=-1)
print(rpn_cls_loss)  # Variable containing: 0.6931

# For regression we use smooth L1 loss
pos = gt_rpn_score.data > 0  # the regression loss only applies to positively labeled boxes
mask = pos.unsqueeze(1).expand_as(rpn_loc)
print(mask.shape)  # (22500L, 4L)

# Take the boxes with positive labels
mask_loc_preds = rpn_loc[mask].view(-1, 4)
mask_loc_targets = gt_rpn_loc[mask].view(-1, 4)
print(mask_loc_preds.shape, mask_loc_targets.shape)  # ((18L, 4L), (18L, 4L))

# The regression loss is applied as follows
x = np.abs(mask_loc_targets.numpy() - mask_loc_preds.data.numpy())
print x.shape  # (18, 4)
# print (x < 1)
rpn_loc_loss = ((x < 1) * 0.5 * x**2) + ((x >= 1) * (x - 0.5))
# print rpn_loc_loss.shape  # (18, 4)
rpn_loc_loss = rpn_loc_loss.sum()  # 1.1628926242031001
print rpn_loc_loss
# print rpn_loc_loss.shape
# rpn_loc_loss = np.squeeze(rpn_loc_loss)
# print rpn_loc_loss
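# Editorial note: the expression above is the smooth L1 (Huber-like) loss:
# 0.5 * x^2 when |x| < 1, and |x| - 0.5 otherwise. Packaged as a reusable
# sketch (not used by the rest of the script):
def smooth_l1_np(diff):
    a = np.abs(diff)
    return np.where(a < 1, 0.5 * a ** 2, a - 0.5)

# print np.allclose(smooth_l1_np(mask_loc_targets.numpy() - mask_loc_preds.data.numpy()).sum(), rpn_loc_loss)  # True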

N_reg = (gt_rpn_score > 0).float().sum()
N_reg = np.squeeze(N_reg.data.numpy())

print "N_reg: {}, {}".format(N_reg, N_reg.shape)
rpn_loc_loss = rpn_loc_loss / N_reg
rpn_loc_loss = np.float32(rpn_loc_loss)
# rpn_loc_loss = torch.autograd.Variable(torch.from_numpy(rpn_loc_loss))
rpn_lambda = 10.
rpn_cls_loss = np.squeeze(rpn_cls_loss.data.numpy())
print "rpn_cls_loss: {}".format(rpn_cls_loss)  # 0.693146109581
print 'rpn_loc_loss: {}'.format(rpn_loc_loss)  # 0.0646051466465
rpn_loss = rpn_cls_loss + (rpn_lambda * rpn_loc_loss)
print("rpn_loss: {}".format(rpn_loss))  # 1.33919757605


# Fast R-CNN loss
# Predictions
print(roi_cls_loc.shape)    # torch.Size([128, 84])
print(roi_cls_score.shape)  # torch.Size([128, 21])

# Targets
print(gt_roi_locs.shape)    # (128, 4)
print(gt_roi_labels.shape)  # (128, )

gt_roi_loc = torch.from_numpy(gt_roi_locs)
gt_roi_label = torch.from_numpy(np.float32(gt_roi_labels)).long()
print(gt_roi_loc.shape, gt_roi_label.shape)  # torch.Size([128, 4]) torch.Size([128])

# Classification loss
gt_roi_label = torch.autograd.Variable(gt_roi_label)
roi_cls_loss = torch.nn.functional.cross_entropy(roi_cls_score, gt_roi_label, ignore_index=-1)
print(roi_cls_loss)  # Variable containing: 3.0515


# Regression loss
n_sample = roi_cls_loc.shape[0]
roi_loc = roi_cls_loc.view(n_sample, -1, 4)
print(roi_loc.shape)  # (128L, 21L, 4L)

# Pick, for each sample, the 4 coefficients belonging to its ground-truth class
roi_loc = roi_loc[torch.arange(0, n_sample).long(), gt_roi_label]
print(roi_loc.shape)  # torch.Size([128, 4])
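# Editorial illustration of the class-aware indexing above, on a toy tensor
# (names are local to this example): row i of the result is toy[i, toy_cls[i]].
toy = torch.arange(0, 2 * 3 * 4).view(2, 3, 4)    # 2 samples, 3 classes, 4 coords each
toy_cls = torch.LongTensor([2, 0])                # ground-truth class per sample
picked = toy[torch.arange(0, 2).long(), toy_cls]  # rows (0, class 2) and (1, class 0)
print picked  # values 8..11 and 12..15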


# Compute the regression loss the same way as the RPN regression loss
# roi_loc_loss = REGLoss(roi_loc, gt_roi_loc)

pos = gt_roi_label.data > 0  # the regression loss only applies to positively labeled boxes
mask = pos.unsqueeze(1).expand_as(roi_loc)
print(mask.shape)  # (128, 4L)

# Take the boxes with positive labels
mask_loc_preds = roi_loc[mask].view(-1, 4)
mask_loc_targets = gt_roi_loc[mask].view(-1, 4)
print(mask_loc_preds.shape, mask_loc_targets.shape)  # ((19L, 4L), (19L, 4L))


x = np.abs(mask_loc_targets.numpy() - mask_loc_preds.data.numpy())
print x.shape  # (19, 4)

roi_loc_loss = ((x < 1) * 0.5 * x**2) + ((x >= 1) * (x - 0.5))
print(roi_loc_loss.sum())  # 1.4645805211187053


N_reg = (gt_roi_label > 0).float().sum()
N_reg = np.squeeze(N_reg.data.numpy())
roi_loc_loss = roi_loc_loss.sum() / N_reg
roi_loc_loss = np.float32(roi_loc_loss)
print roi_loc_loss  # 0.077294916
# roi_loc_loss = torch.autograd.Variable(torch.from_numpy(roi_loc_loss))


# Total ROI loss
roi_lambda = 10.
roi_cls_loss = np.squeeze(roi_cls_loss.data.numpy())
roi_loss = roi_cls_loss + (roi_lambda * roi_loc_loss)
print(roi_loss)  # 3.810348778963089


total_loss = rpn_loss + roi_loss

print total_loss  # 5.149546355009079

--------------------------------------------------------------------------------
/model.py:
--------------------------------------------------------------------------------
#coding:utf8

import torch
import torch.nn as nn
import torchvision
from PIL import Image, ImageDraw
import numpy as np


# cfg = {
#     'VGG11': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
#     'VGG13': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
#     'VGG16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'],
#     'VGG19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'],
# }

featur_cfg = ''


class VGG(nn.Module):

    def __init__(self):
        super(VGG, self).__init__()

        cfg = [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512]
        self.features = self._make_layers(cfg)
        self._rpn_model()

        size = (7, 7)
        # (output_size, return_indices): return_indices becomes truthy, hence
        # the [0] wherever this pool is applied
        self.adaptive_max_pool = torch.nn.AdaptiveMaxPool2d(size[0], size[1])
        self.roi_classifier()

    def _make_layers(self, cfg):
        layers = []
        in_channels = 3
        for x in cfg:
            if x == 'M':
                layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
            else:
                layers += [nn.Conv2d(in_channels, x, kernel_size=3, padding=1),
                           nn.BatchNorm2d(x),
                           nn.ReLU(inplace=True)]
                in_channels = x

        # layers += [nn.Conv2d(in_channels, 512, kernel_size=3, padding=1)]
        return nn.Sequential(*layers)
        # return layers

    def _rpn_model(self, mid_channels=512, in_channels=512, n_anchor=9):
        self.rpn_conv = nn.Conv2d(in_channels, mid_channels, 3, 1, 1)
        self.reg_layer = nn.Conv2d(mid_channels, n_anchor * 4, 1, 1, 0)
        # softmax is used here; sigmoid works equally well if you replace 2 with 1.
        self.cls_layer = nn.Conv2d(mid_channels, n_anchor * 2, 1, 1, 0)

        # conv sliding layer
        self.rpn_conv.weight.data.normal_(0, 0.01)
        self.rpn_conv.bias.data.zero_()

        # Regression layer
        self.reg_layer.weight.data.normal_(0, 0.01)
        self.reg_layer.bias.data.zero_()

        # classification layer
        self.cls_layer.weight.data.normal_(0, 0.01)
        self.cls_layer.bias.data.zero_()

    def forward(self, data):
        out_map = self.features(data)
        # for layer in self.features:
        #     # print layer
        #     data = layer(data)
        #     # print data.data.shape
        #
        # # out = data.view(data.size(0), -1)
        x = self.rpn_conv(out_map)
        pred_anchor_locs = self.reg_layer(x)  # regression head: the four anchor-to-box coefficients
        pred_cls_scores = self.cls_layer(x)   # classification head: whether the anchor captures an object

        return out_map, pred_anchor_locs, pred_cls_scores
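    # Editorial note: unlike faster_rcnn.py, which truncates torchvision's
    # vgg16, this hand-rolled VGG inserts BatchNorm after every conv (closer
    # to vgg16_bn) and omits the fifth max-pool, so the four 'M' entries give
    # an overall stride of 16 and an 800x800 input yields a 50x50 feature map.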
    def roi_classifier(self, class_num=20):  # assume the VOC dataset, 20 classes
        # classification head
        self.roi_head_classifier = nn.Sequential(*[nn.Linear(25088, 4096),
                                                   nn.ReLU(),
                                                   nn.Linear(4096, 4096),
                                                   nn.ReLU()])
        self.cls_loc = nn.Linear(4096, (class_num + 1) * 4)  # (VOC 20 classes + 1 background. Each will have 4 co-ordinates)
        self.cls_loc.weight.data.normal_(0, 0.01)
        self.cls_loc.bias.data.zero_()


        self.score = nn.Linear(4096, class_num + 1)  # (VOC 20 classes + 1 background)

    def rpn_loss(self, rpn_loc, rpn_score, gt_rpn_loc, gt_rpn_label, weight=10.0):
        # For classification we use Cross Entropy loss
        gt_rpn_label = torch.autograd.Variable(gt_rpn_label.long())
        rpn_cls_loss = torch.nn.functional.cross_entropy(rpn_score, gt_rpn_label, ignore_index=-1)
        # print(rpn_cls_loss)  # Variable containing: 0.6931

        # For regression we use smooth L1 loss
        pos = gt_rpn_label.data > 0  # the regression loss only applies to positively labeled boxes
        mask = pos.unsqueeze(1).expand_as(rpn_loc)
        # print(mask.shape)  # (22500L, 4L)

        # Take the boxes with positive labels
        mask_pred_loc = rpn_loc[mask].view(-1, 4)
        mask_target_loc = gt_rpn_loc[mask].view(-1, 4)
        # print(mask_pred_loc.shape, mask_target_loc.shape)  # ((18L, 4L), (18L, 4L))

        # The regression loss is applied as follows
        x = np.abs(mask_target_loc.numpy() - mask_pred_loc.data.numpy())
        # print x.shape  # (18, 4)
        # print (x < 1)
        rpn_loc_loss = ((x < 1) * 0.5 * x ** 2) + ((x >= 1) * (x - 0.5))
        # print rpn_loc_loss.shape  # (18, 4)
        rpn_loc_loss = rpn_loc_loss.sum()  # 1.1628926242031001
        # print rpn_loc_loss
        # print rpn_loc_loss.shape
        # rpn_loc_loss = np.squeeze(rpn_loc_loss)
        # print rpn_loc_loss

        N_reg = (gt_rpn_label > 0).float().sum()
        N_reg = np.squeeze(N_reg.data.numpy())

        # print "N_reg: {}, {}".format(N_reg, N_reg.shape)
        rpn_loc_loss = rpn_loc_loss / N_reg
        rpn_loc_loss = np.float32(rpn_loc_loss)
        # rpn_loc_loss = torch.autograd.Variable(torch.from_numpy(rpn_loc_loss))

        rpn_cls_loss = np.squeeze(rpn_cls_loss.data.numpy())
        # print "rpn_cls_loss: {}".format(rpn_cls_loss)  # 0.693146109581
        # print 'rpn_loc_loss: {}'.format(rpn_loc_loss)  # 0.0646051466465
        rpn_loss = rpn_cls_loss + (weight * rpn_loc_loss)
        # print("rpn_loss: {}".format(rpn_loss))  # 1.33919757605
        return rpn_loss

    def roi_loss(self, pre_loc, pre_conf, target_loc, target_conf, weight=10.0):
        # Classification loss
        target_conf = torch.autograd.Variable(target_conf.long())
        pred_conf_loss = torch.nn.functional.cross_entropy(pre_conf, target_conf, ignore_index=-1)
        # print(pred_conf_loss)  # Variable containing: 3.0515

        # For regression we use smooth L1 loss,
        # computed the same way as the RPN regression loss
        # pre_loc_loss = REGLoss(pre_loc, target_loc)
        pos = target_conf.data > 0  # the regression loss only applies to positively labeled boxes
        mask = pos.unsqueeze(1).expand_as(pre_loc)  # (128, 4L)

        # Take the boxes with positive labels
        mask_pred_loc = pre_loc[mask].view(-1, 4)
        mask_target_loc = target_loc[mask].view(-1, 4)
        # print(mask_pred_loc.shape, mask_target_loc.shape)  # ((19L, 4L), (19L, 4L))

        x = np.abs(mask_target_loc.numpy() - mask_pred_loc.data.numpy())
        # print x.shape  # (19, 4)

        pre_loc_loss = ((x < 1) * 0.5 * x ** 2) + ((x >= 1) * (x - 0.5))
        # print(pre_loc_loss.sum())  # 1.4645805211187053

        N_reg = (target_conf > 0).float().sum()
        N_reg = np.squeeze(N_reg.data.numpy())
        pre_loc_loss = pre_loc_loss.sum() / N_reg
        pre_loc_loss = np.float32(pre_loc_loss)
        # print pre_loc_loss  # 0.077294916
        # pre_loc_loss = torch.autograd.Variable(torch.from_numpy(pre_loc_loss))
        # Total loss
        pred_conf_loss = np.squeeze(pred_conf_loss.data.numpy())
        total_loss = pred_conf_loss + (weight * pre_loc_loss)

        return total_loss
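    # Editorial note: both rpn_loss and roi_loss fold their intermediate
    # results into numpy, so they return plain numpy scalars rather than torch
    # Variables; they are fine for inspecting loss values, but as written they
    # cannot drive backpropagation.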


if __name__ == '__main__':
    vgg = VGG()
    print vgg
    data = torch.randn((1, 3, 800, 800))
    print data.shape
    data = torch.autograd.Variable(data)
    out_map, pred_anchor_locs, pred_cls_scores = vgg.forward(data)  # forward returns a 3-tuple
    print out_map.data.shape

--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
#coding:utf8

import torch
import torchvision
from PIL import Image, ImageDraw
import numpy as np
from model import VGG
import utils


# Overall flow:
# 1. The image goes through the VGG to get the feature map.
# 2. The feature map goes through the RPN to get, per valid anchor, an objectness
#    (foreground) confidence and the coefficients that turn it into a predicted box.
# 3. The feature map and the predicted boxes go through ROI Pooling to get a fixed-size
#    feature patch per predicted object: the predicted box is used to crop the object out
#    of the feature map, and since objects vary in size, ROI Pooling converts every crop
#    to a fixed 7*7 size, which makes classifying objects and refining boxes convenient.
# 4. The fixed-size object features go through the class head (self.score) to get the
#    class of each predicted box.
# 5. The fixed-size object features go through the location head (self.cls_loc) to get
#    the confidence and the refinement coefficients of each predicted box.

# Assume two ground-truth boxes in the image
bbox = np.asarray([[20, 30, 400, 500], [300, 400, 500, 600]], dtype=np.float32)  # [y1, x1, y2, x2] format
# Assume the labels of the two ground-truth boxes
labels = np.asarray([6, 8], dtype=np.int8)  # 0 represents background

img_tensor = torch.zeros((1, 3, 800, 800)).float()
img_var = torch.autograd.Variable(img_tensor)


# ---------------------step_1: compute the anchor targets: confidences (anchor_conf)
# and translation/scaling coefficients (anchor_locations)
# Initialize all anchors and find the valid ones with their indices
# anchors: (22500, 4)  valid_anchor_boxes: (8940, 4)  valid_anchor_index: 8940
anchors, valid_anchor_boxes, valid_anchor_index = utils.init_anchor()
# Compute the IOU of the valid anchors with all ground-truth boxes
# ious: (8940, 2), the IOU of each valid anchor box with each ground-truth box
ious = utils.compute_iou(valid_anchor_boxes, bbox)
valid_anchor_len = len(valid_anchor_boxes)
# Pick a fixed ratio of positive and negative samples among the valid anchors
label, argmax_ious = utils.get_pos_neg_sample(ious, valid_anchor_len, pos_iou_threshold=0.7,
                                              neg_iou_threshold=0.3, pos_ratio=0.5, n_sample=256)
# print np.sum(label == 1)  # 18 positives
# print np.sum(label == 0)  # 256-18=238 negatives

# Now assign a location target to every anchor box, using the ground-truth object it
# overlaps most. Note that locs are assigned to all valid anchor boxes regardless of
# their label; a simple mask filters them out later when the loss is computed.
# The ground-truth box (bbox) matched to each valid anchor
max_iou_bbox = bbox[argmax_ious]  # ground-truth coordinates matched to each valid anchor (8940, 4)
# print max_iou_bbox.shape  # (8940, 4): one (y1, x1, y2, x2) target per valid anchor
# Assign anchor_locs to all valid anchor boxes: the translation/scaling coefficients
# that map each valid anchor onto its matched ground-truth box (bbox)
anchor_locs = utils.get_coefficient(valid_anchor_boxes, max_iou_bbox)
# print(anchor_locs.shape)  # (8940, 4): translation dy, dx; scaling dh, dw

# anchor_conf: the label of every anchor box (-1: invalid, 0: valid negative, 1: valid positive)
anchor_conf = np.empty((len(anchors),), dtype=label.dtype)
anchor_conf.fill(-1)
anchor_conf[valid_anchor_index] = label
print anchor_conf.shape  # labels of all anchors (feature_size*feature_size*9) => (22500,)

# anchor_locations: the anchor-to-ground-truth coefficients of every anchor box;
# invalid anchors get all-zero coefficients, valid anchors get real ones
anchor_locations = np.empty((len(anchors),) + anchors.shape[1:], dtype=anchor_locs.dtype)
anchor_locations.fill(0)
anchor_locations[valid_anchor_index, :] = anchor_locs
print anchor_locations.shape  # coefficients of all anchors (feature_size*feature_size*9, 4) => (22500, 4)

# Candidate anchors and ground-truth boxes have now produced the anchor confidences
# (anchor_conf) and translation/scaling coefficients (anchor_locations)
# ----------------------


# --------------------step_2: VGG and RPN models: the RPN predicts, per anchor, the
# translation/scaling coefficients that turn it into a predicted box
vgg = VGG()
# out_map: the feature map; pred_anchor_locs: predicted anchor-to-box coefficients;
# pred_anchor_conf: predicted anchor scores
out_map, pred_anchor_locs, pred_anchor_conf = vgg.forward(img_var)
print out_map.data.shape  # (batch_size, num, feature_size, feature_size) => (1, 512, 50, 50)

# 1. pred_anchor_locs: the predicted anchor-to-box coefficients (translation/scaling),
#    lined up with anchor_locations
pred_anchor_locs = pred_anchor_locs.permute(0, 2, 3, 1).contiguous().view(1, -1, 4)
print(pred_anchor_locs.shape)  # Out: torch.Size([1, 22500, 4])

# 2. The predicted confidence of each anchor box, lined up with anchor_conf
pred_anchor_conf = pred_anchor_conf.permute(0, 2, 3, 1).contiguous()
print(pred_anchor_conf.shape)  # Out: torch.Size([1, 50, 50, 18])
objectness_score = pred_anchor_conf.view(1, 50, 50, 9, 2)[:, :, :, :, 1].contiguous().view(1, -1)
print(objectness_score.shape)  # Out: torch.Size([1, 22500])

pred_anchor_conf = pred_anchor_conf.view(1, -1, 2)
print(pred_anchor_conf.shape)  # Out: torch.Size([1, 22500, 2])
# ---------------------


# ---------------------step_3: RPN loss (between anchor targets and anchor predictions:
# a coefficient loss plus a confidence loss)
# From step_1 we have the anchor targets:
#   target anchor coefficients: anchor_locations (22500, 4)
#   target anchor confidences:  anchor_conf (22500,)

# From step_2 we have the anchor predictions:
#   RPN-predicted anchor coefficients: pred_anchor_locs (1, 22500, 4)
#   RPN-predicted anchor confidences:  pred_anchor_conf (1, 22500, 2)

# Re-arrange the inputs and outputs so they line up row by row
rpn_anchor_loc = pred_anchor_locs[0]
rpn_anchor_conf = pred_anchor_conf[0]
anchor_locations = torch.from_numpy(anchor_locations)
anchor_conf = torch.from_numpy(anchor_conf)
print(rpn_anchor_loc.shape, rpn_anchor_conf.shape, anchor_locations.shape, anchor_conf.shape)
# torch.Size([22500, 4]) torch.Size([22500, 2]) torch.Size([22500, 4]) torch.Size([22500])

# vgg.rpn_loss is the dedicated RPN loss (roi_loss computes the same expression)
rpn_loss = vgg.rpn_loss(rpn_anchor_loc, rpn_anchor_conf, anchor_locations, anchor_conf, weight=10.0)
print("rpn_loss: {}".format(rpn_loss))  # 1.33919
# ---------------------


# ---------------------step_4: from the anchors and the predicted anchor coefficients,
# compute the predicted boxes (roi) and their box-to-ground-truth coefficients (roi_locs),
# plus the class label of each predicted box (roi_labels)
# Apply the predicted translation/scaling coefficients to the anchors to get the predicted
# boxes (ROIs); then thin them out using the predicted scores and a size threshold
roi, score, order = utils.get_predict_bbox(anchors, pred_anchor_locs, objectness_score,
                                           n_train_pre_nms=12000, min_size=16)

# The predicted boxes (ROIs) still overlap heavily; thin them out further with NMS
roi = utils.nms(roi, score, order, nms_thresh=0.7, n_train_post_nms=2000)


# Using the IOU between the ROIs and the ground-truth boxes (bbox), decide which
# ground-truth box each ROI should predict (the one it overlaps most);
# filter the ROIs further by IOU and split them into positive and negative samples.
sample_roi, keep_index, gt_assignment, roi_labels = utils.get_propose_target(roi, bbox, labels,
                                                                             n_sample=128,
                                                                             pos_ratio=0.25,
                                                                             pos_iou_thresh=0.5,
                                                                             neg_iou_thresh_hi=0.5,
                                                                             neg_iou_thresh_lo=0.0)
# print(sample_roi.shape)  # (128, 4)
# The ground-truth box matched to each sampled ROI
bbox_for_sampled_roi = bbox[gt_assignment[keep_index]]
print(bbox_for_sampled_roi.shape)  # (128, 4)
# The true ROI-to-ground-truth coefficients
roi_locs = utils.get_coefficient(sample_roi, bbox_for_sampled_roi)
# ---------------------


# ---------------------step_5: ROI Pooling:
# This step does two things:
# 1. crops each predicted object out of the feature map using its ROI (im);
# 2. passes the crop through adaptive_max_pool to get a fixed size (512, 7, 7),
#    which makes later batch processing convenient.
# Two properties of this design:
# 1. prediction happens on the VGG feature map rather than the input image, which
#    saves computation;
# 2. since objects come in all sizes, ROI Pooling standardizes the output to a
#    fixed (512, 7, 7), which makes batch processing convenient.
# sample_roi: the sampled predicted boxes (128, 4)
rois = torch.from_numpy(sample_roi).float()
# roi_indices: the image index of each ROI [there is only one image here, index 0]
roi_indices = 0 * np.ones((len(rois),), dtype=np.int32)
roi_indices = torch.from_numpy(roi_indices).float()
print(rois.shape, roi_indices.shape)  # torch.Size([128, 4]) torch.Size([128])

# Concatenate the image indices with the sampled boxes, giving a tensor of
# shape [N, 5], 5 => (index, y1, x1, y2, x2)
indices_and_rois = torch.cat([roi_indices[:, None], rois], dim=1)  # torch.Size([128, 5])

output = []
rois = indices_and_rois.float()
rois[:, 1:].mul_(1 / 16.0)  # down-sample the boxes to match the feature map out_map
rois = rois.long()
num_rois = rois.size(0)
# out_map: (batch_size, num, feature_size, feature_size) => (1, 512, 50, 50)
for i in range(num_rois):
    roi = rois[i]
    im_idx = roi[0]  # image index
    # Select the feature map of image im_idx => (1, 512, 50, 50); with a single image
    # the shape does not change
    img_feats = out_map.narrow(0, im_idx, 1)
    # Crop the object out of img_feats using the box coordinates (rows y1:y2, cols x1:x2)
    im = img_feats[..., roi[1]:(roi[3] + 1), roi[2]:(roi[4] + 1)]
    # print im.shape
    # Pool the cropped object im with adaptive_max_pool down to a fixed (7, 7)
    # ==> (512, 7, 7), which makes the following batch processing convenient
    output.append(vgg.adaptive_max_pool(im)[0].data)
# ---------------------ROI Pooling


# ---------------------step_6: Classification: linear heads predicting each ROI's class,
# confidence, and refinement (translation/scaling) coefficients (distinct from the RPN)
# note: if your pytorch version is 0.3.1, you must run this:
# output = torch.stack(output)
output = torch.cat(output, 0)  # torch.Size([128, 512, 7, 7])
k = output.view(output.size(0), -1)  # [128, 25088]

k = torch.autograd.Variable(k)
k = vgg.roi_head_classifier(k)  # (128, 4096)
# torch.Size([128, 84]); 84 ==> (20+1)*4: 21 candidate classes per box (VOC's 20
# classes + 1 background), with 4 coordinate coefficients each
pred_roi_locs = vgg.cls_loc(k)
# pred_roi_labels: [128, 21], the class scores of each box
pred_roi_labels = vgg.score(k)
print(pred_roi_locs.data.shape, pred_roi_labels.data.shape)  # torch.Size([128, 84]), torch.Size([128, 21])
# ---------------------Classification


# ---------------------step_7: detector loss (between the true coefficients of the
# sampled boxes and the predicted ones, where the coefficients map each box onto
# its ground-truth box)
# From step_4 we have the targets:
#   ROI coefficients: roi_locs (128, 4)
#   ROI class labels: roi_labels (128,)

# From step_6 we have the predictions:
#   predicted ROI coefficients: pred_roi_locs (128, 84)
#   predicted ROI class scores: pred_roi_labels (128, 21)


gt_roi_loc = torch.from_numpy(roi_locs)
gt_roi_label = torch.from_numpy(np.float32(roi_labels)).long()
print(gt_roi_loc.shape, gt_roi_label.shape)  # torch.Size([128, 4]) torch.Size([128])

n_sample = pred_roi_locs.shape[0]
roi_loc = pred_roi_locs.view(n_sample, -1, 4)  # (128L, 21L, 4L)

roi_loc = roi_loc[torch.arange(0, n_sample).long(), gt_roi_label]  # pick the coefficients of each box's true class
# print(roi_loc.shape)  # torch.Size([128, 4])

roi_loss = vgg.roi_loss(roi_loc, pred_roi_labels, gt_roi_loc, gt_roi_label, weight=10.0)
print(roi_loss)  # 3.810348778963089


# Total loss
total_loss = rpn_loss + roi_loss
print total_loss  # 5.149546355009079
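
# Editorial note: the script stops at computing total_loss; both loss helpers fold
# their results into numpy scalars, so this value cannot drive backpropagation as-is.
# A real training step, assuming torch-only (differentiable) loss computation, would
# look like this sketch:
#
# optimizer = torch.optim.SGD(vgg.parameters(), lr=1e-3, momentum=0.9)
# optimizer.zero_grad()
# total_loss.backward()
# optimizer.step()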
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
# coding:utf8
import sys
import numpy as np
import torch

def init_anchor(img_size=800, sub_sample=16):
    ratios = [0.5, 1, 2]
    anchor_scales = [8, 16, 32]  # these scales are relative to the feature map

    # Each feature point corresponds to a 16*16-pixel region of the input image;
    # 'img_size // sub_sample' gives the feature-map size
    feature_size = (img_size // sub_sample)
    # This effectively splits the image into a feature_size*feature_size grid,
    # one grid cell per feature point.
    # ctr_x, ctr_y: bottom-right corner of each grid cell
    ctr_x = np.arange(sub_sample, (feature_size + 1) * sub_sample, sub_sample)  # feature_size values
    ctr_y = np.arange(sub_sample, (feature_size + 1) * sub_sample, sub_sample)  # feature_size values
    # print len(ctr_x)  # 50

    index = 0
    # ctr: the center of each grid cell, feature_size*feature_size cells in total
    ctr = dict()
    for x in range(len(ctr_x)):
        for y in range(len(ctr_y)):
            ctr[index] = [-1, -1]
            ctr[index][1] = ctr_x[x] - 8  # bottom-right corner - 8 = center
            ctr[index][0] = ctr_y[y] - 8
            index += 1
    # print len(ctr)  # centers of the 50*50=2500 (feature_size*feature_size) regions

    # Initialization: each region gets 9 candidate anchors; each anchor is (y1, x1, y2, x2)
    anchors = np.zeros(((feature_size * feature_size * 9), 4))  # (22500, 4)
    index = 0
    # Fill the anchor coordinates
    for c in ctr:
        ctr_y, ctr_x = ctr[c]
        for i in range(len(ratios)):
            for j in range(len(anchor_scales)):
                # anchor_scales are defined on the feature map, so multiply by the
                # down-sampling factor "sub_sample"
                h = sub_sample * anchor_scales[j] * np.sqrt(ratios[i])
                w = sub_sample * anchor_scales[j] * np.sqrt(1. / ratios[i])
                anchors[index, 0] = ctr_y - h / 2.
                anchors[index, 1] = ctr_x - w / 2.
                anchors[index, 2] = ctr_y + h / 2.
                anchors[index, 3] = ctr_x + w / 2.
                index += 1

    # Drop anchors whose coordinates cross the image boundary, keeping only
    # the boxes that lie fully inside the image
    valid_anchor_index = np.where(
        (anchors[:, 0] >= 0) &
        (anchors[:, 1] >= 0) &
        (anchors[:, 2] <= img_size) &
        (anchors[:, 3] <= img_size)
    )[0]  # np.where returns the indices that satisfy the condition
    # print valid_anchor_index.shape  # (8940,), i.e. 8940 anchors qualify

    # Coordinates of the valid anchors (anchors fully inside the image)
    valid_anchor_boxes = anchors[valid_anchor_index]
    # print(valid_anchor_boxes.shape)  # (8940, 4)

    return anchors, valid_anchor_boxes, valid_anchor_index


# Compute the IOU between each valid anchor box "valid_anchor_boxes" and each ground-truth box "bbox"
def compute_iou(valid_anchor_boxes, bbox):
    valid_anchor_num = len(valid_anchor_boxes)
    ious = np.empty((valid_anchor_num, 2), dtype=np.float32)
    ious.fill(0)
    for num1, i in enumerate(valid_anchor_boxes):
        ya1, xa1, ya2, xa2 = i
        anchor_area = (ya2 - ya1) * (xa2 - xa1)  # anchor box area
        for num2, j in enumerate(bbox):
            yb1, xb1, yb2, xb2 = j
            box_area = (yb2 - yb1) * (xb2 - xb1)  # ground-truth box area
            inter_x1 = max([xb1, xa1])
            inter_y1 = max([yb1, ya1])
            inter_x2 = min([xb2, xa2])
            inter_y2 = min([yb2, ya2])
            if (inter_x1 < inter_x2) and (inter_y1 < inter_y2):
                iter_area = (inter_y2 - inter_y1) * (inter_x2 - inter_x1)  # intersection of anchor box and ground-truth box
                iou = iter_area / (anchor_area + box_area - iter_area)  # IOU computation
            else:
                iou = 0.
            ious[num1, num2] = iou

    return ious


def get_pos_neg_sample(ious, valid_anchor_len, pos_iou_threshold=0.7, neg_iou_threshold=0.3, pos_ratio=0.5, n_sample=256):
    gt_argmax_ious = ious.argmax(axis=0)  # index of the anchor with the highest IOU for each ground-truth box; one per ground-truth box in the image
    gt_max_ious = ious[gt_argmax_ious, np.arange(ious.shape[1])]  # the highest IOU value for each ground-truth box, paired with gt_argmax_ious
    argmax_ious = ious.argmax(axis=1)  # index of the ground-truth box with the highest IOU for each anchor; every anchor box is matched to one ground-truth box
    max_ious = ious[np.arange(valid_anchor_len), argmax_ious]  # the highest IOU value for each anchor, paired with argmax_ious

    gt_argmax_ious = np.where(ious == gt_max_ious)[0]  # indices whose IOU equals a per-ground-truth maximum
    # print gt_argmax_ious.shape  # (18,): 18 in total

    label = np.empty((valid_anchor_len,), dtype=np.int32)
    label.fill(-1)
    # print label.shape  # (8940,)
    label[max_ious < neg_iou_threshold] = 0   # anchors whose highest IOU is below neg_iou_threshold are negatives
    label[gt_argmax_ious] = 1                 # anchors holding a global-maximum IOU are positives
    label[max_ious >= pos_iou_threshold] = 1  # anchors whose highest IOU is at least pos_iou_threshold are positives

    n_pos = pos_ratio * n_sample  # number of positive samples

    # Randomly keep n_pos positives
    pos_index = np.where(label == 1)[0]
    if len(pos_index) > n_pos:
        disable_index = np.random.choice(pos_index, size=(len(pos_index) - n_pos), replace=False)
        label[disable_index] = -1

    n_neg = n_sample - np.sum(label == 1)
    neg_index = np.where(label == 0)[0]

    if len(neg_index) > n_neg:
        disable_index = np.random.choice(neg_index, size=(len(neg_index) - n_neg), replace=False)
        label[disable_index] = -1

    return label, argmax_ious


def get_predict_bbox(anchors, pred_anchor_locs, objectness_score, n_train_pre_nms=12000, min_size=16):
    # Convert the anchor format from (y1, x1, y2, x2) to (ctr_x, ctr_y, h, w):
    anc_height = anchors[:, 2] - anchors[:, 0]
    anc_width = anchors[:, 3] - anchors[:, 1]
    anc_ctr_y = anchors[:, 0] + 0.5 * anc_height
    anc_ctr_x = anchors[:, 1] + 0.5 * anc_width

    # Using the four predicted coefficients, translate and scale the anchors into predicted boxes
    pred_anchor_locs_numpy = pred_anchor_locs[0].data.numpy()
    objectness_score_numpy = objectness_score[0].data.numpy()
    dy = pred_anchor_locs_numpy[:, 0::4]
    dx = pred_anchor_locs_numpy[:, 1::4]
    dh = pred_anchor_locs_numpy[:, 2::4]
    dw = pred_anchor_locs_numpy[:, 3::4]
    ctr_y = dy * anc_height[:, np.newaxis] + anc_ctr_y[:, np.newaxis]
    ctr_x = dx * anc_width[:, np.newaxis] + anc_ctr_x[:, np.newaxis]
    h = np.exp(dh) * anc_height[:, np.newaxis]
    w = np.exp(dw) * anc_width[:, np.newaxis]

    # Convert the predicted boxes to [y1, x1, y2, x2] format
    roi = np.zeros(pred_anchor_locs_numpy.shape, dtype=pred_anchor_locs_numpy.dtype)
    roi[:, 0::4] = ctr_y - 0.5 * h
    roi[:, 1::4] = ctr_x - 0.5 * w
    roi[:, 2::4] = ctr_y + 0.5 * h
    roi[:, 3::4] = ctr_x + 0.5 * w

    # Keep every predicted box inside the image: y1, y2 within (0, img_size[0]),
    # x1, x2 within (0, img_size[1])
    img_size = (800, 800)  # Image size
    roi[:, slice(0, 4, 2)] = np.clip(roi[:, slice(0, 4, 2)], 0, img_size[0])
    roi[:, slice(1, 4, 2)] = np.clip(roi[:, slice(1, 4, 2)], 0, img_size[1])
    # print(roi.shape)  # (22500, 4)

    # Drop predicted boxes with height or width < min_size
    # (open question from the author: might this discard small objects?)
    hs = roi[:, 2] - roi[:, 0]
    ws = roi[:, 3] - roi[:, 1]
    keep = np.where((hs >= min_size) & (ws >= min_size))[0]
    roi = roi[keep, :]
    score = objectness_score_numpy[keep]

    # Sort all (proposal, score) pairs by score, highest first
    order = score.ravel().argsort()[::-1]  # (22500,)
    # Keep the top pre_nms_topN proposals (12000 at train time, 6000 at test time)
    order = order[:n_train_pre_nms]

    return roi, score, order

# torch.masked_select()

def nms(roi, score, order, nms_thresh=0.7, n_train_post_nms=2000):
    # NMS (non-maximum suppression): drop every box whose IOU with a higher-scoring
    # box exceeds 0.7, keeping high-scoring, mostly non-overlapping boxes
    roi = roi[order, :]  # (12000, 4)
    score = score[order]
    y1 = roi[:, 0]
    x1 = roi[:, 1]
    y2 = roi[:, 2]
    x2 = roi[:, 3]

    areas = (x2 - x1 + 1) * (y2 - y1 + 1)

    order = score.argsort()[::-1]
    # print score
    # print order
    keep = []
    while order.size > 0:
        # print order
        i = order[0]
        keep.append(i)
        xx1 = np.maximum(x1[i], x1[order[1:]])
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])

        w = np.maximum(0.0, xx2 - xx1 + 1)
        h = np.maximum(0.0, yy2 - yy1 + 1)
        inter = w * h
        ovr = inter / (areas[i] + areas[order[1:]] - inter)

        # print ovr
        inds = np.where(ovr <= nms_thresh)[0]
        # print inds
        order = order[inds + 1]  # +1 because the IOU comparison above skipped the first element (order[1:])

    keep = keep[:n_train_post_nms]  # while training/testing, use accordingly
    roi = roi[keep]  # the final region proposals (i.e. the predicted boxes)
    # print roi.shape  # (1758, 4)
    return roi


def get_propose_target(roi, bbox, labels, n_sample=128, pos_ratio=0.25,
                       pos_iou_thresh=0.5, neg_iou_thresh_hi=0.5, neg_iou_thresh_lo=0.0):
    # Proposal targets
    # Compute the IOU of each ground-truth box (bbox) with every region proposal (roi)
    ious = compute_iou(roi, bbox)
    # print(ious.shape)  # (1758, 2)

    # For each region proposal, find the ground truth with the higher IoU, and that maximum IoU
    gt_assignment = ious.argmax(axis=1)
    max_iou = ious.max(axis=1)
    # print(gt_assignment)  # [0 0 1 ... 0 0 0]
    # print(max_iou)  # [0.17802152 0.17926688 0.04676317 ... 0. 0. 0. ]

    # Assign a label to each proposal:
    gt_roi_label = labels[gt_assignment]
    # print(gt_roi_label)  # [6 6 8 ... 6 6 6]

    # We want to keep only n_sample*pos_ratio (128*0.25=32) foreground samples; if fewer
    # than 32 positives turn up, leave them as they are.
    # If more than 32 foreground samples turn up, sample 32 of them.
    pos_roi_per_image = n_sample * pos_ratio
    pos_index = np.where(max_iou >= pos_iou_thresh)[0]
    pos_roi_per_this_image = int(min(pos_roi_per_image, pos_index.size))
    if pos_index.size > 0:
        pos_index = np.random.choice(pos_index, size=pos_roi_per_this_image, replace=False)
    # print(pos_roi_per_this_image)
    # print(pos_index)  # 19

    # Do the same for the negative (background) region proposals
    neg_index = np.where((max_iou < neg_iou_thresh_hi) & (max_iou >= neg_iou_thresh_lo))[0]
    neg_roi_per_this_image = n_sample - pos_roi_per_this_image
    neg_roi_per_this_image = int(min(neg_roi_per_this_image, neg_index.size))
    if neg_index.size > 0:
        neg_index = np.random.choice(neg_index, size=neg_roi_per_this_image, replace=False)
    # print(neg_roi_per_this_image)
    # print(neg_index)  # 109

    keep_index = np.append(pos_index, neg_index)
    gt_roi_labels = gt_roi_label[keep_index]
    gt_roi_labels[pos_roi_per_this_image:] = 0  # negative labels --> 0
    sample_roi = roi[keep_index]  # sampled predicted boxes
    # print(sample_roi.shape)  # (128, 4)
    return sample_roi, keep_index, gt_assignment, gt_roi_labels


def get_coefficient(anchor, bbox):
    # From the given boxes and their matched target boxes, compute the 4-d coefficients
    # (translation: dy, dx; scaling: dh, dw)
    height = anchor[:, 2] - anchor[:, 0]
    width = anchor[:, 3] - anchor[:, 1]
    ctr_y = anchor[:, 0] + 0.5 * height
    ctr_x = anchor[:, 1] + 0.5 * width
    base_height = bbox[:, 2] - bbox[:, 0]
    base_width = bbox[:, 3] - bbox[:, 1]
    base_ctr_y = bbox[:, 0] + 0.5 * base_height
    base_ctr_x = bbox[:, 1] + 0.5 * base_width

    eps = np.finfo(height.dtype).eps
    height = np.maximum(height, eps)
    width = np.maximum(width, eps)

    dy = (base_ctr_y - ctr_y) / height
    dx = (base_ctr_x - ctr_x) / width
    dh = np.log(base_height / height)
    dw = np.log(base_width / width)

    gt_roi_locs = np.vstack((dy, dx, dh, dw)).transpose()
    # print(gt_roi_locs.shape)

    return gt_roi_locs
--------------------------------------------------------------------------------