├── Infer.py
├── README.md
├── main.py
├── objectDetection.py
└── output
├── classes
├── 0.0.png
├── 1.0.png
└── 2.0.png
├── detection-results-info.png
├── ground-truth-info.png
├── lamr.png
├── mAP.png
└── output.txt
/Infer.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 | """
3 | 训练常基于dark-net的YOLOv3网络,目标检测
4 | """
5 | from __future__ import absolute_import
6 | from __future__ import division
7 | from __future__ import print_function
8 | import os
9 |
10 | os.environ["FLAGS_fraction_of_gpu_memory_to_use"] = '0.82'
11 |
12 | import uuid
13 | import numpy as np
14 | import time
15 | import six
16 | import math
17 | import random
18 | import paddle
19 | import paddle.fluid as fluid
20 | import logging
21 | import xml.etree.ElementTree
22 | import codecs
23 | import json
24 |
25 | from paddle.fluid.initializer import MSRA
26 | from paddle.fluid.param_attr import ParamAttr
27 | from paddle.fluid.regularizer import L2Decay
28 | from PIL import Image, ImageEnhance, ImageDraw, ImageFile
29 | ImageFile.LOAD_TRUNCATED_IMAGES = True
30 | Image.MAX_IMAGE_PIXELS = None
31 |
logger = None  # module-level logger object; assigned by init_log_config()

# Global training configuration. Several entries are placeholders that
# init_train_parameters() fills in from the files under "data_dir".
train_params = {
    "data_dir": "data/data6045",  # data directory
    "train_list": "train.txt",  # training-set list file
    "eval_list": "eval.txt",
    "class_dim": -1,  # number of classes; set by init_train_parameters()
    "label_dict": {},  # label dictionary: label name -> class index
    "num_dict": {},  # class index -> label name
    "image_count": -1,  # number of training lines; set by init_train_parameters()
    "continue_train": True,  # whether to load the previous run's parameters and continue training
    "pretrained": False,  # whether to start from a pretrained model
    "pretrained_model_dir": "./pretrained-model",
    "save_model_dir": "./yolo-model",  # model save directory
    "model_prefix": "yolo-v3",  # model prefix
    "freeze_dir": "freeze_model",
    "use_tiny": False,  # whether to use the reduced "tiny" model
    "max_box_num": 20,  # maximum number of objects in one image
    "num_epochs": 2,  # number of training epochs
    "train_batch_size": 10,  # for the full yolov3 a batch must stay small or memory blows up; tiny can use a larger value
    "use_gpu": True,  # whether to use the GPU
    "yolo_cfg": {  # YOLO model parameters
        "input_size": [3, 448, 448],  # the original side length is 608; reduced to 448 here to speed up training and inference
        "anchors": [7, 10, 12, 22, 24, 17, 22, 45, 46, 33, 43, 88, 85, 66, 115, 146, 275, 240],  # anchors, stored as a flat list of pairs
        "anchor_mask": [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
    },
    "yolo_tiny_cfg": {  # YOLO tiny model parameters
        "input_size": [3, 256, 256],
        "anchors": [6, 8, 13, 15, 22, 34, 48, 50, 81, 100, 205, 191],
        "anchor_mask": [[3, 4, 5], [0, 1, 2]]
    },
    "ignore_thresh": 0.7,
    "mean_rgb": [127.5, 127.5, 127.5],
    "mode": "train",
    "multi_data_reader_count": 4,
    "apply_distort": True,  # whether to apply colour-distortion augmentation
    "nms_top_k": 300,
    "nms_pos_k": 300,
    "valid_thresh": 0.01,
    "nms_thresh": 0.40,  # non-maximum-suppression threshold
    "image_distort_strategy": {  # image distortion strategy
        "expand_prob": 0.5,  # expansion probability
        "expand_max_ratio": 4,
        "hue_prob": 0.5,  # hue
        "hue_delta": 18,
        "contrast_prob": 0.5,  # contrast
        "contrast_delta": 0.5,
        "saturation_prob": 0.5,  # saturation
        "saturation_delta": 0.5,
        "brightness_prob": 0.5,  # brightness
        "brightness_delta": 0.125
    },
    "sgd_strategy": {  # gradient-descent configuration
        "learning_rate": 0.002,
        "lr_epochs": [30, 50, 65],  # epochs at which the learning rate changes (3 boundaries -> 4 segments)
        "lr_decay": [1, 0.5, 0.25, 0.1]  # per-segment factor multiplied with learning_rate, one per segment of lr_epochs
    },
    "early_stop": {
        "sample_frequency": 50,
        "successive_limit": 3,
        "min_loss": 2.5,
        "min_curr_map": 0.84
    }
}
96 |
97 |
def init_train_parameters():
    """
    Fill in the data-dependent entries of ``train_params``.

    Reads ``label_list`` (one label per line) to build the two label lookup
    tables and the class count, then counts the lines of the training list
    to get the number of training images.
    :return: None; ``train_params`` is updated in place
    """
    data_dir = train_params['data_dir']
    train_path = os.path.join(data_dir, train_params['train_list'])  # training set
    label_path = os.path.join(data_dir, "label_list")  # label file

    # codecs.open decodes the file as UTF-8 regardless of the platform default
    with codecs.open(label_path, encoding='utf-8') as label_file:
        names = [raw.strip() for raw in label_file]

    for class_id, name in enumerate(names):
        train_params['num_dict'][class_id] = name
        train_params['label_dict'][name] = class_id
    train_params['class_dim'] = len(names)

    with codecs.open(train_path, encoding='utf-8') as list_file:
        train_params['image_count'] = sum(1 for _ in list_file)  # number of images
120 |
# Logging configuration
def init_log_config():
    """
    Configure the root logger to write to ``<cwd>/logs/train.log``.

    Sets the module-level ``logger`` to the root logger at INFO level and
    attaches a file handler that truncates the log file when it is opened.
    The function is idempotent: if a file handler for the same log file is
    already attached, no second handler is added. A second handler would
    re-open the file with mode 'w' (truncating what was already written)
    and make every record appear twice.
    :return: None
    """
    global logger

    logger = logging.getLogger()  # root logger
    logger.setLevel(logging.INFO)
    log_path = os.path.join(os.getcwd(), 'logs')

    if not os.path.exists(log_path):  # create the log directory on first use
        os.makedirs(log_path)

    log_name = os.path.join(log_path, 'train.log')  # training log file

    # FileHandler stores the absolute path in baseFilename, so compare against that.
    target = os.path.abspath(log_name)
    for handler in logger.handlers:
        if isinstance(handler, logging.FileHandler) and handler.baseFilename == target:
            return

    fh = logging.FileHandler(log_name, mode='w')
    fh.setLevel(logging.DEBUG)

    formatter = logging.Formatter("%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s")
    fh.setFormatter(formatter)
    logger.addHandler(fh)
140 |
141 | init_log_config()
142 |
143 |
# YOLOv3 network definition: darknet-53
class YOLOv3(object):
    """Builds the full YOLOv3 graph with paddle.fluid layers."""

    def __init__(self, class_num, anchors, anchor_mask):
        """
        Store the detector configuration and derive the per-output anchors.
        :param class_num: number of classes
        :param anchors: flat list of anchor values, read in pairs at [2*k] and [2*k+1]
        :param anchor_mask: one list of anchor indices per detection output
        """
        self.outputs = []  # detection outputs, appended by net()
        self.downsample_ratio = 1  # down-sampling ratio; doubled by every down_sample() call
        self.anchor_mask = anchor_mask  # anchor indices grouped per detection output
        self.anchors = anchors  # anchors
        self.class_num = class_num  # number of classes

        self.yolo_anchors = []
        self.yolo_classes = []

        # For every detection output collect the anchor pairs selected by its mask.
        for mask_pair in self.anchor_mask:
            mask_anchors = []
            for mask in mask_pair:
                mask_anchors.append(self.anchors[2 * mask])
                mask_anchors.append(self.anchors[2 * mask + 1])
            self.yolo_anchors.append(mask_anchors)
            self.yolo_classes.append(class_num)

    def name(self):
        """Return the model name."""
        return 'YOLOv3'

    # return the anchors
    def get_anchors(self):
        return self.anchors

    # return the anchor_mask
    def get_anchor_mask(self):
        return self.anchor_mask

    def get_class_num(self):
        return self.class_num

    def get_downsample_ratio(self):
        return self.downsample_ratio

    def get_yolo_anchors(self):
        return self.yolo_anchors

    def get_yolo_classes(self):
        return self.yolo_classes

    # Convolution unit: convolution, batch normalisation, leaky ReLU
    def conv_bn(self,
                input,  # input
                num_filters,  # number of convolution kernels
                filter_size,  # kernel size
                stride,  # stride
                padding,  # padding
                use_cudnn=True):
        # 2-D convolution without bias and without activation
        conv = fluid.layers.conv2d(input=input,
                                   num_filters=num_filters,
                                   filter_size=filter_size,
                                   stride=stride,
                                   padding=padding,
                                   act=None,
                                   use_cudnn=use_cudnn,  # whether to use cuDNN (CUDA-accelerated kernels)
                                   param_attr=ParamAttr(initializer=fluid.initializer.Normal(0., 0.02)),
                                   bias_attr=False)

        # The batch_norm parameters should not be regularised, so a regulariser
        # with coefficient 0 is passed explicitly to switch regularisation off.
        # Original author's note: using leaky ReLU through batch_norm's `act`
        # only allows the default alpha (0.02); to set alpha the activation
        # has to be applied as a separate op.
        # Regularisation exists to prevent over-fitting; a small L2 value helps with that.
        param_attr = ParamAttr(initializer=fluid.initializer.Normal(0., 0.02),
                               regularizer=L2Decay(0.))
        bias_attr = ParamAttr(initializer=fluid.initializer.Constant(0.0),
                              regularizer=L2Decay(0.))
        out = fluid.layers.batch_norm(input=conv, act=None,
                                      param_attr=param_attr,
                                      bias_attr=bias_attr)
        # leaky_relu: Leaky ReLU gives all negative values a non-zero slope
        out = fluid.layers.leaky_relu(out, 0.1)
        return out

    # Down-sampling implemented with a strided convolution
    # e.g. for a 448*448 input the output size is ((448+2)-3)/2 + 1 = 224
    def down_sample(self, input, num_filters, filter_size=3, stride=2, padding=1):
        self.downsample_ratio *= 2  # track the accumulated down-sampling ratio
        return self.conv_bn(input,
                            num_filters=num_filters,
                            filter_size=filter_size,
                            stride=stride,
                            padding=padding)

    # Basic block: two conv/batch-norm layers plus a residual connection
    def basic_block(self, input, num_filters):
        conv1 = self.conv_bn(input, num_filters, filter_size=1, stride=1, padding=0)
        conv2 = self.conv_bn(conv1, num_filters * 2, filter_size=3, stride=1, padding=1)
        out = fluid.layers.elementwise_add(x=input, y=conv2, act=None)  # computes H(x)=F(x)+x
        return out

    # Chain `count` basic_blocks
    def layer_warp(self, input, num_filters, count):
        res_out = self.basic_block(input, num_filters)
        for j in range(1, count):
            res_out = self.basic_block(res_out, num_filters)
        return res_out

    # Up-sampling
    def up_sample(self, input, scale=2):
        # get dynamic upsample output shape
        shape_nchw = fluid.layers.shape(input)  # shape of input
        shape_hw = fluid.layers.slice(shape_nchw, axes=[0], starts=[2], ends=[4])  # entries 2 and 3 of the shape
        shape_hw.stop_gradient = True
        in_shape = fluid.layers.cast(shape_hw, dtype='int32')
        out_shape = in_shape * scale  # output shape
        out_shape.stop_gradient = True

        # resize by actual_shape
        # enlarge the feature map (nearest-neighbour interpolation)
        out = fluid.layers.resize_nearest(input=input,
                                          scale=scale,
                                          actual_shape=out_shape)
        return out

    def yolo_detection_block(self, input, num_filters):
        """
        Detection block: two rounds of (1x1 conv, 3x3 conv), then a 1x1 conv
        whose result is `route` and a 3x3 conv on `route` whose result is `tip`.
        :return: (route, tip)
        """
        assert num_filters % 2 == 0, "num_filters {} cannot be divided by 2".format(num_filters)

        conv = input
        for j in range(2):
            conv = self.conv_bn(conv, num_filters, filter_size=1, stride=1, padding=0)
            conv = self.conv_bn(conv, num_filters * 2, filter_size=3, stride=1, padding=1)
        route = self.conv_bn(conv, num_filters, filter_size=1, stride=1, padding=0)
        tip = self.conv_bn(route, num_filters * 2, filter_size=3, stride=1, padding=1)
        return route, tip

    # Build the network: darknet-53
    def net(self, img):
        """
        Build the backbone and the detection heads on top of `img`.
        :return: self.outputs, one conv2d output per detection head
        """
        stages = [1, 2, 8, 8, 4]  # number of basic blocks in each stage
        assert len(self.anchor_mask) <= len(stages), "anchor masks can't bigger than down_sample times"
        # first convolution layer: 256*256 (the sizes in these two comments are the original author's example)
        conv1 = self.conv_bn(img, num_filters=32, filter_size=3, stride=1, padding=1)
        # second convolution layer: 128*128
        downsample_ = self.down_sample(conv1, conv1.shape[1] * 2)  # second argument is the number of kernels
        blocks = []

        # create the groups of basic_blocks
        for i, stage_count in enumerate(stages):
            block = self.layer_warp(downsample_,  # input
                                    32 * (2 ** i),  # number of kernels
                                    stage_count)  # number of basic blocks
            blocks.append(block)
            if i < len(stages) - 1:  # down-sample after every group except the last
                downsample_ = self.down_sample(block, block.shape[1] * 2)
        blocks = blocks[-1:-4:-1]  # last three stage outputs in reverse order, needed for the concatenations below

        # yolo detector
        for i, block in enumerate(blocks):
            # cross-scale connection in yolo
            if i > 0:
                block = fluid.layers.concat(input=[route, block], axis=1)  # concatenate route and block along axis 1

            route, tip = self.yolo_detection_block(block,  # input
                                                   num_filters=512 // (2 ** i))  # number of kernels

            param_attr = ParamAttr(initializer=fluid.initializer.Normal(0., 0.02))
            bias_attr = ParamAttr(initializer=fluid.initializer.Constant(0.0), regularizer=L2Decay(0.))
            block_out = fluid.layers.conv2d(input=tip,
                                            # 5 elements represent x|y|h|w|score
                                            num_filters=len(self.anchor_mask[i]) * (self.class_num + 5),
                                            filter_size=1,
                                            stride=1,
                                            padding=0,
                                            act=None,
                                            param_attr=param_attr,
                                            bias_attr=bias_attr)
            self.outputs.append(block_out)

            # for the cross-scale connection, enlarge the feature map by interpolation
            if i < len(blocks) - 1:
                route = self.conv_bn(route, 256 // (2 ** i), filter_size=1, stride=1, padding=0)
                route = self.up_sample(route)  # up-sample

        return self.outputs
320 |
# Tiny (reduced) YOLO model
class YOLOv3Tiny(object):
    """Builds the reduced YOLOv3 graph with paddle.fluid layers."""

    def __init__(self, class_num, anchors, anchor_mask):
        """
        Store the detector configuration and derive the per-output anchors.
        :param class_num: number of classes
        :param anchors: flat list of anchor values, read in pairs at [2*k] and [2*k+1]
        :param anchor_mask: one list of anchor indices per detection output
        """
        self.outputs = []  # detection outputs, appended by net()
        self.downsample_ratio = 1  # doubled by every down_sample() call
        self.anchor_mask = anchor_mask
        self.anchors = anchors
        self.class_num = class_num

        self.yolo_anchors = []
        self.yolo_classes = []
        # For every detection output collect the anchor pairs selected by its mask.
        for mask_pair in self.anchor_mask:
            mask_anchors = []
            for mask in mask_pair:
                mask_anchors.append(self.anchors[2 * mask])
                mask_anchors.append(self.anchors[2 * mask + 1])
            self.yolo_anchors.append(mask_anchors)
            self.yolo_classes.append(class_num)

    def name(self):
        """Return the model name."""
        return 'YOLOv3-tiny'

    def get_anchors(self):
        return self.anchors

    def get_anchor_mask(self):
        return self.anchor_mask

    def get_class_num(self):
        return self.class_num

    def get_downsample_ratio(self):
        return self.downsample_ratio

    def get_yolo_anchors(self):
        return self.yolo_anchors

    def get_yolo_classes(self):
        return self.yolo_classes

    def conv_bn(self,
                input,
                num_filters,
                filter_size,
                stride,
                padding,
                num_groups=1,
                use_cudnn=True):
        """Convolution without bias followed by batch normalisation with ReLU."""
        conv = fluid.layers.conv2d(
            input=input,
            num_filters=num_filters,
            filter_size=filter_size,
            stride=stride,
            padding=padding,
            act=None,
            groups=num_groups,
            use_cudnn=use_cudnn,
            param_attr=ParamAttr(initializer=fluid.initializer.Normal(0., 0.02)),
            bias_attr=False)

        # The batch_norm parameters should not be regularised, so a regulariser
        # with coefficient 0 is passed explicitly to switch regularisation off.
        out = fluid.layers.batch_norm(
            input=conv, act='relu',
            param_attr=ParamAttr(initializer=fluid.initializer.Normal(0., 0.02), regularizer=L2Decay(0.)),
            bias_attr=ParamAttr(initializer=fluid.initializer.Constant(0.0), regularizer=L2Decay(0.)))

        return out

    def depthwise_conv_bn(self, input, filter_size=3, stride=1, padding=1):
        """conv_bn with as many filters and groups as input.shape[1]."""
        num_filters = input.shape[1]
        return self.conv_bn(input,
                            num_filters=num_filters,
                            filter_size=filter_size,
                            stride=stride,
                            padding=padding,
                            num_groups=num_filters)

    def down_sample(self, input, pool_size=2, pool_stride=2):
        """Max-pool the input and double the tracked down-sampling ratio."""
        self.downsample_ratio *= 2
        return fluid.layers.pool2d(input=input, pool_type='max', pool_size=pool_size,
                                   pool_stride=pool_stride)

    def basic_block(self, input, num_filters):
        """One 3x3 conv_bn followed by down_sample."""
        conv1 = self.conv_bn(input, num_filters, filter_size=3, stride=1, padding=1)
        out = self.down_sample(conv1)
        return out

    def up_sample(self, input, scale=2):
        # get dynamic upsample output shape
        shape_nchw = fluid.layers.shape(input)
        shape_hw = fluid.layers.slice(shape_nchw, axes=[0], starts=[2], ends=[4])  # entries 2 and 3 of the shape
        shape_hw.stop_gradient = True
        in_shape = fluid.layers.cast(shape_hw, dtype='int32')
        out_shape = in_shape * scale
        out_shape.stop_gradient = True

        # resize by actual_shape
        out = fluid.layers.resize_nearest(
            input=input,
            scale=scale,
            actual_shape=out_shape)
        return out

    def yolo_detection_block(self, input, num_filters):
        """1x1 conv_bn giving `route`, then 3x3 conv_bn on it giving `tip`; returns (route, tip)."""
        route = self.conv_bn(input, num_filters, filter_size=1, stride=1, padding=0)
        tip = self.conv_bn(route, num_filters * 2, filter_size=3, stride=1, padding=1)
        return route, tip

    def net(self, img):
        """
        Build the backbone and the two detection heads on top of `img`.
        :return: self.outputs, one conv2d output per detection head
        """
        # darknet-tiny
        stages = [16, 32, 64, 128, 256, 512]  # number of filters per stage
        assert len(self.anchor_mask) <= len(stages), "anchor masks can't bigger than down_sample times"
        # 256x256
        tmp = img
        blocks = []
        for i, stage_count in enumerate(stages):
            if i == len(stages) - 1:
                # last stage: no pooling
                block = self.conv_bn(tmp, stage_count, filter_size=3, stride=1, padding=1)
                blocks.append(block)
                # NOTE(review): the next two results are overwritten without being used,
                # because every call reads blocks[-1] rather than `block`. The layers are
                # still created in the graph. Confirm whether chaining was intended
                # before changing this; it would alter the trained architecture.
                block = self.depthwise_conv_bn(blocks[-1])
                block = self.depthwise_conv_bn(blocks[-1])
                block = self.conv_bn(blocks[-1], stage_count * 2, filter_size=1, stride=1, padding=0)
                blocks.append(block)
            else:
                tmp = self.basic_block(tmp, stage_count)
                blocks.append(tmp)

        # heads are built on the final block and on the output of the fourth stage
        blocks = [blocks[-1], blocks[3]]

        # yolo detector
        for i, block in enumerate(blocks):
            # cross-scale connection in yolo
            if i > 0:
                block = fluid.layers.concat(input=[route, block], axis=1)
            if i < 1:
                route, tip = self.yolo_detection_block(block, num_filters=256 // (2 ** i))
            else:
                tip = self.conv_bn(block, num_filters=256, filter_size=3, stride=1, padding=1)

            param_attr = ParamAttr(initializer=fluid.initializer.Normal(0., 0.02))
            bias_attr = ParamAttr(initializer=fluid.initializer.Constant(0.0), regularizer=L2Decay(0.))
            block_out = fluid.layers.conv2d(input=tip,
                                            # 5 elements represent x|y|h|w|score
                                            num_filters=len(self.anchor_mask[i]) * (self.class_num + 5),
                                            filter_size=1,
                                            stride=1,
                                            padding=0,
                                            act=None,
                                            param_attr=param_attr,
                                            bias_attr=bias_attr)
            self.outputs.append(block_out)
            # for the cross-scale connection, enlarge the feature map by interpolation
            if i < len(blocks) - 1:
                route = self.conv_bn(route, 128 // (2 ** i), filter_size=1, stride=1, padding=0)
                route = self.up_sample(route)

        return self.outputs
478 |
479 |
def get_yolo(is_tiny, class_num, anchors, anchor_mask):
    """
    Create the detector model.
    :param is_tiny: truthy selects YOLOv3Tiny, otherwise YOLOv3
    :return: a model instance built from class_num, anchors and anchor_mask
    """
    model_cls = YOLOv3Tiny if is_tiny else YOLOv3
    return model_cls(class_num, anchors, anchor_mask)
485 |
486 |
class Sampler(object):
    """
    Sampler settings used for crop sampling; a plain record of its arguments.
    """

    def __init__(self, max_sample, max_trial, min_scale, max_scale,
                 min_aspect_ratio, max_aspect_ratio, min_jaccard_overlap,
                 max_jaccard_overlap):
        # sampling budget
        self.max_sample, self.max_trial = max_sample, max_trial
        # scale range
        self.min_scale, self.max_scale = min_scale, max_scale
        # aspect-ratio range
        self.min_aspect_ratio, self.max_aspect_ratio = min_aspect_ratio, max_aspect_ratio
        # Jaccard-overlap range
        self.min_jaccard_overlap, self.max_jaccard_overlap = min_jaccard_overlap, max_jaccard_overlap
503 |
504 |
class bbox(object):
    """
    Bounding rectangle given by its two corners.
    """

    def __init__(self, xmin, ymin, xmax, ymax):
        self.xmin, self.ymin = xmin, ymin
        self.xmax, self.ymax = xmax, ymax
515 |
516 |
# Coordinate conversion: [x1, y1, w, h] -> [center_x, center_y, w, h],
# expressed relative to the image size
def box_to_center_relative(box, img_height, img_width):
    """
    Convert a box in [x1, y1, w, h] form to centre form
    [center_x, center_y, w, h], divided by the image width/height.

    The box is first clipped to the image: the left/top edge to 0 and the
    right/bottom edge (taken as x1 + w - 1 / y1 + h - 1) to the last pixel.
    :return: numpy array [center_x, center_y, w, h]
    """
    assert len(box) == 4, "box should be a len(4) list or tuple"
    left, top, box_w, box_h = box

    # clip the corners to the image
    clipped_left = max(left, 0)
    clipped_right = min(left + box_w - 1, img_width - 1)
    clipped_top = max(top, 0)
    clipped_bottom = min(top + box_h - 1, img_height - 1)

    center_x = (clipped_left + clipped_right) / 2 / img_width
    center_y = (clipped_top + clipped_bottom) / 2 / img_height
    rel_w = (clipped_right - clipped_left) / img_width  # box width / image width
    rel_h = (clipped_bottom - clipped_top) / img_height  # box height / image height

    return np.array([center_x, center_y, rel_w, rel_h])
539 |
540 |
# Resize the image to the network input size
def resize_img(img, sampled_labels, input_size):
    """
    Resize a PIL image to the spatial size given by ``input_size``.

    ``input_size`` is laid out as [channels, height, width] (it is also used
    as the shape of the network's image input). PIL's ``resize`` expects
    (width, height), so the two values are swapped when building the target
    size. For square inputs the result is the same as before.
    :param img: PIL image
    :param sampled_labels: unused; kept for interface compatibility
    :param input_size: [channels, height, width]
    :return: the resized PIL image
    """
    target_height, target_width = input_size[1], input_size[2]
    return img.resize((target_width, target_height), Image.BILINEAR)
546 |
547 |
# Intersection over union
def box_iou_xywh(box1, box2):
    """
    Compute the IoU of boxes given as [center_x, center_y, w, h].

    Widths and heights are used as continuous extents (x2 - x1), with no
    ``+ 1`` pixel correction. That correction belongs to inclusive integer
    pixel corners; applied to centre/size boxes (in particular to the
    normalised coordinates used by random_crop) it adds a whole unit to
    every extent and makes disjoint boxes look overlapping.
    :param box1: array of shape (N, 4) or (1, 4)
    :param box2: array of shape (M, 4), broadcastable against box1
    :return: array of IoU values
    """
    assert box1.shape[-1] == 4, "Box1 shape[-1] should be 4."
    assert box2.shape[-1] == 4, "Box2 shape[-1] should be 4."

    # corner coordinates of both sets of boxes
    b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
    b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
    b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
    b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2

    inter_x1 = np.maximum(b1_x1, b2_x1)
    inter_x2 = np.minimum(b1_x2, b2_x2)
    inter_y1 = np.maximum(b1_y1, b2_y1)
    inter_y2 = np.minimum(b1_y2, b2_y2)
    # a negative extent means the boxes do not overlap on that axis
    inter_w = np.maximum(inter_x2 - inter_x1, 0)  # width of the intersection
    inter_h = np.maximum(inter_y2 - inter_y1, 0)  # height of the intersection

    inter_area = inter_w * inter_h  # intersection area
    b1_area = (b1_x2 - b1_x1) * (b1_y2 - b1_y1)  # area of box 1
    b2_area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1)  # area of box 2

    return inter_area / (b1_area + b2_area - inter_area)  # intersection / union
573 |
574 |
# Crop boxes
def box_crop(boxes, labels, crop, img_shape):
    """
    Re-express boxes relative to a crop window.

    :param boxes: array (N, 4) of [center_x, center_y, w, h] relative to the image
    :param labels: array (N,) of labels
    :param crop: (x, y, w, h) of the crop window in pixels
    :param img_shape: (width, height) of the image in pixels
    :return: (boxes relative to the crop, labels, number of kept boxes); boxes
             whose centre is outside the crop or that become empty are zeroed,
             and their labels are multiplied by 0
    """
    crop_x, crop_y, crop_w, crop_h = [float(v) for v in crop]
    img_w, img_h = [float(v) for v in img_shape]

    boxes = boxes.copy()

    # centre/size (relative) -> corners (pixels)
    half_w = boxes[:, 2] / 2
    left, right = (boxes[:, 0] - half_w) * img_w, (boxes[:, 0] + half_w) * img_w
    boxes[:, 0], boxes[:, 2] = left, right
    half_h = boxes[:, 3] / 2
    top, bottom = (boxes[:, 1] - half_h) * img_h, (boxes[:, 1] + half_h) * img_h
    boxes[:, 1], boxes[:, 3] = top, bottom

    crop_min = np.array([crop_x, crop_y])
    crop_max = np.array([crop_x + crop_w, crop_y + crop_h])

    # keep only boxes whose centre lies inside the crop window
    centers = (boxes[:, :2] + boxes[:, 2:]) / 2.0
    inside = np.logical_and(crop_min <= centers, centers <= crop_max).all(axis=1)

    # clip to the window, then shift the origin to the window's top-left corner
    boxes[:, :2] = np.maximum(boxes[:, :2], crop_min)
    boxes[:, 2:] = np.minimum(boxes[:, 2:], crop_max)
    boxes[:, :2] -= crop_min
    boxes[:, 2:] -= crop_min

    non_empty = (boxes[:, :2] < boxes[:, 2:]).all(axis=1)
    mask = np.logical_and(inside, non_empty)
    keep = mask.astype('float32')
    boxes = boxes * keep[:, np.newaxis]
    labels = labels * keep

    # corners (pixels) -> centre/size relative to the crop
    center_x, width = (boxes[:, 0] + boxes[:, 2]) / 2 / crop_w, (boxes[:, 2] - boxes[:, 0]) / crop_w
    boxes[:, 0], boxes[:, 2] = center_x, width
    center_y, height = (boxes[:, 1] + boxes[:, 3]) / 2 / crop_h, (boxes[:, 3] - boxes[:, 1]) / crop_h
    boxes[:, 1], boxes[:, 3] = center_y, height

    return boxes, labels, mask.sum()
600 |
601 |
# Image augmentation: contrast, saturation, brightness, hue, expansion
def random_brightness(img):  # brightness
    """
    Randomly change the brightness of a PIL image.

    The change is applied when a uniform draw falls below ``brightness_prob``;
    the enhancement factor is 1 plus a uniform value in
    [-brightness_delta, brightness_delta].
    :return: the (possibly) adjusted image
    """
    draw = np.random.uniform(0, 1)
    strategy = train_params['image_distort_strategy']

    if not draw < strategy['brightness_prob']:
        return img

    max_delta = strategy['brightness_delta']  # 0.125 in the default configuration
    factor = np.random.uniform(-max_delta, max_delta) + 1  # uniformly distributed factor
    return ImageEnhance.Brightness(img).enhance(factor)  # adjust brightness
612 |
613 |
def random_contrast(img):  # contrast
    """
    Randomly change the contrast of a PIL image.

    The change is applied when a uniform draw falls below ``contrast_prob``;
    the enhancement factor is 1 plus a uniform value in
    [-contrast_delta, contrast_delta].
    :return: the (possibly) adjusted image
    """
    draw = np.random.uniform(0, 1)
    strategy = train_params['image_distort_strategy']

    if not draw < strategy['contrast_prob']:
        return img

    max_delta = strategy['contrast_delta']
    factor = np.random.uniform(-max_delta, max_delta) + 1
    return ImageEnhance.Contrast(img).enhance(factor)
623 |
624 |
def random_saturation(img):  # saturation
    """
    Randomly change the colour saturation of a PIL image.

    The change is applied when a uniform draw falls below ``saturation_prob``;
    the enhancement factor is 1 plus a uniform value in
    [-saturation_delta, saturation_delta].
    :return: the (possibly) adjusted image
    """
    draw = np.random.uniform(0, 1)
    strategy = train_params['image_distort_strategy']

    if not draw < strategy['saturation_prob']:
        return img

    max_delta = strategy['saturation_delta']
    factor = np.random.uniform(-max_delta, max_delta) + 1
    return ImageEnhance.Color(img).enhance(factor)
634 |
635 |
def random_hue(img):  # hue
    """
    Randomly shift the hue of a PIL image.

    The shift is applied when a uniform draw falls below ``hue_prob``; the
    offset is uniform in [-hue_delta, hue_delta] and is added to the H
    channel of the image converted to PIL's 'HSV' mode.

    The shifted hue is wrapped modulo 256 before it is stored back into the
    8-bit channel. Writing the raw float sum into the uint8 array relied on
    an out-of-range float-to-integer cast, whose result is platform
    dependent and triggers a RuntimeWarning on recent NumPy versions.
    :return: the (possibly) adjusted image, in RGB mode when shifted
    """
    prob = np.random.uniform(0, 1)
    strategy = train_params['image_distort_strategy']

    if not prob < strategy['hue_prob']:
        return img

    hue_delta = strategy['hue_delta']
    delta = np.random.uniform(-hue_delta, hue_delta)
    img_hsv = np.array(img.convert('HSV'))
    # floor first so the modulo result is an exact integer in [0, 255]
    shifted = np.floor(img_hsv[:, :, 0].astype(np.float64) + delta)
    img_hsv[:, :, 0] = np.mod(shifted, 256).astype(np.uint8)
    return Image.fromarray(img_hsv, mode='HSV').convert('RGB')
647 |
648 |
def distort_image(img):  # colour distortion
    """
    Apply the four random colour distortions in one of two orders.

    A uniform draw above 0.5 selects brightness, contrast, saturation, hue;
    otherwise the order is brightness, saturation, hue, contrast.
    :return: the distorted image
    """
    # Apply different distort order
    if np.random.uniform(0, 1) > 0.5:
        pipeline = (random_brightness, random_contrast, random_saturation, random_hue)
    else:
        pipeline = (random_brightness, random_saturation, random_hue, random_contrast)

    for transform in pipeline:
        img = transform(img)
    return img
663 |
664 |
# Random crop
def random_crop(img, boxes, labels, scales=(0.3, 1.0), max_ratio=2.0, constraints=None, max_trial=50):
    """
    Randomly crop a PIL image and adjust its boxes and labels.

    With a probability decided by ``random.random() > 0.6`` (or when there
    are no boxes) the inputs are returned unchanged. Otherwise, for each
    (min_iou, max_iou) constraint up to ``max_trial`` windows are sampled and
    the first one whose IoU with every box lies inside the constraint is kept
    as a candidate. Candidates are then tried in random order; the first one
    that still contains at least one box is applied and the crop is resized
    back to the original image size.

    A sampled window that is as large as the image in either dimension is
    skipped: ``random.randrange(0)`` would raise ValueError for it, and the
    full-frame window is already the first candidate.
    :param img: PIL image
    :param boxes: array (N, 4) of [center_x, center_y, w, h] relative to the image
    :param labels: array (N,) of labels
    :param scales: (min, max) of the sampled window scale
    :param max_ratio: bound on the sampled aspect ratio (and on its inverse)
    :param constraints: list of (min_iou, max_iou); a default list is used when empty
    :param max_trial: number of sampling attempts per constraint
    :return: (img, boxes, labels)
    """
    if random.random() > 0.6:
        return img, boxes, labels
    if len(boxes) == 0:
        return img, boxes, labels

    if not constraints:
        constraints = [(0.1, 1.0),
                       (0.3, 1.0),
                       (0.5, 1.0),
                       (0.7, 1.0),
                       (0.9, 1.0),
                       (0.0, 1.0)]  # minimum / maximum IoU

    w, h = img.size
    crops = [(0, 0, w, h)]

    for min_iou, max_iou in constraints:
        for _ in range(max_trial):
            scale = random.uniform(scales[0], scales[1])
            aspect_ratio = random.uniform(max(1 / max_ratio, scale * scale),
                                          min(max_ratio, 1 / scale / scale))
            crop_h = int(h * scale / np.sqrt(aspect_ratio))
            crop_w = int(w * scale * np.sqrt(aspect_ratio))
            if crop_w >= w or crop_h >= h:
                # no room to place the window; randrange(0) would raise
                continue
            crop_x = random.randrange(w - crop_w)
            crop_y = random.randrange(h - crop_h)
            crop_box = np.array([[
                (crop_x + crop_w / 2.0) / w,
                (crop_y + crop_h / 2.0) / h,
                crop_w / float(w),
                crop_h / float(h)
            ]])

            iou = box_iou_xywh(crop_box, boxes)
            if min_iou <= iou.min() and max_iou >= iou.max():
                crops.append((crop_x, crop_y, crop_w, crop_h))
                break

    while crops:
        crop = crops.pop(np.random.randint(0, len(crops)))
        crop_boxes, crop_labels, box_num = box_crop(boxes, labels, crop, (w, h))
        if box_num < 1:
            continue
        img = img.crop((crop[0], crop[1], crop[0] + crop[2],
                        crop[1] + crop[3])).resize(img.size, Image.LANCZOS)
        return img, crop_boxes, crop_labels
    return img, boxes, labels
713 |
714 |
# Expansion
def random_expand(img, gtboxes, keep_ratio=True):
    """
    Randomly place the image on a larger canvas filled with ``mean_rgb``.

    The expansion is applied when a uniform draw falls below ``expand_prob``,
    the same convention the other ``*_prob`` settings in this file follow.
    (The test used to be inverted, so ``expand_prob`` acted as the
    probability of skipping the expansion.)

    The canvas is ``ratio`` times the image size with ratio uniform in
    [1, expand_max_ratio]; the image is pasted at a random offset.
    :param img: PIL image
    :param gtboxes: array (N, 4) of [center_x, center_y, w, h] relative to the
                    image; modified in place when the expansion is applied
    :param keep_ratio: use the same ratio for both axes
    :return: (image, gtboxes)
    """
    strategy = train_params['image_distort_strategy']
    if not np.random.uniform(0, 1) < strategy['expand_prob']:
        return img, gtboxes

    max_ratio = strategy['expand_max_ratio']
    w, h = img.size
    c = 3
    ratio_x = random.uniform(1, max_ratio)
    if keep_ratio:
        ratio_y = ratio_x
    else:
        ratio_y = random.uniform(1, max_ratio)
    oh = int(h * ratio_y)
    ow = int(w * ratio_x)
    off_x = random.randint(0, ow - w)
    off_y = random.randint(0, oh - h)

    # canvas filled with the per-channel mean colour
    out_img = np.zeros((oh, ow, c), np.uint8)
    for i in range(c):
        out_img[:, :, i] = train_params['mean_rgb'][i]

    out_img[off_y: off_y + h, off_x: off_x + w, :] = img
    # re-express the boxes relative to the larger canvas
    gtboxes[:, 0] = ((gtboxes[:, 0] * w) + off_x) / float(ow)
    gtboxes[:, 1] = ((gtboxes[:, 1] * h) + off_y) / float(oh)
    gtboxes[:, 2] = gtboxes[:, 2] / ratio_x
    gtboxes[:, 3] = gtboxes[:, 3] / ratio_y

    return Image.fromarray(out_img), gtboxes
744 |
745 |
# Pre-processing: sample augmentation and layout conversion
def preprocess(img, bbox_labels, input_size, mode):
    """
    Augment (in 'train' mode), resize and normalise one sample.

    :param img: PIL image
    :param bbox_labels: list of rows [label, center_x, center_y, w, h, difficult]
    :param input_size: network input size passed on to resize_img
    :param mode: augmentation is applied only when this is 'train'
    :return: (float32 CHW image array, label array)
    """
    img_width, img_height = img.size
    labels = np.array(bbox_labels)

    if mode == 'train':
        if train_params['apply_distort']:  # colour distortion enabled?
            img = distort_image(img)

        img, gtboxes = random_expand(img, labels[:, 1:5])  # expansion
        img, gtboxes, gtlabels = random_crop(img, gtboxes, labels[:, 0])  # random crop
        labels[:, 0] = gtlabels
        labels[:, 1:5] = gtboxes

    pixels = np.array(resize_img(img, labels, input_size)).astype('float32')
    # in-place arithmetic keeps the array in float32
    pixels -= train_params['mean_rgb']
    pixels = pixels.transpose((2, 0, 1))  # HWC to CHW
    pixels *= 0.007843  # about 1 / 127.5
    return pixels, labels
766 |
767 |
def _parse_bbox_labels(object_strs, im_width, im_height):
    """Convert the JSON annotation fields of one list line into label rows."""
    # row layout: label | x-center | y-center | width | height | difficult
    rows = []
    for object_str in object_strs:
        if len(object_str) <= 1:
            continue

        annotation = json.loads(object_str)
        class_id = float(train_params['label_dict'][annotation['value']])
        corners = annotation['coordinate']  # [[x1, y1], [x2, y2]]
        # corner form -> [x, y, w, h]
        box = [corners[0][0], corners[0][1],
               corners[1][0] - corners[0][0], corners[1][1] - corners[0][1]]
        rel = box_to_center_relative(box, im_height, im_width)  # coordinate conversion
        rows.append([class_id,
                     float(rel[0]), float(rel[1]), float(rel[2]), float(rel[3]),
                     float(0)])
    return rows


# Data reader
# Reads the images named in the sample list, augments them, and yields the
# image data, boxes and labels
def custom_reader(file_list, data_dir, input_size, mode):
    """
    Create a sample generator over ``file_list``.

    In 'train' and 'eval' mode every line is tab separated: the image path
    (relative to ``data_dir``) followed by one JSON object per annotated
    target with the keys 'value' (label name) and 'coordinate' (two corner
    points). Each yielded item is (image array, boxes, labels), with boxes
    and labels padded with zeros or truncated to ``max_box_num`` rows.
    Lines without usable annotations are skipped.

    In 'test' mode every line is an image path and the opened PIL image is
    yielded.

    ``file_list`` is shuffled in place each time the reader is started.
    :return: the generator function
    """
    def reader():
        np.random.shuffle(file_list)  # shuffle the sample list

        for line in file_list:  # one image and its annotations per line
            if mode == 'train' or mode == 'eval':
                ###################### the part below may need project-specific changes ############################
                parts = line.split('\t')  # split on tabs
                image_path = parts[0]

                img = Image.open(os.path.join(data_dir, image_path))  # read the image
                if img.mode != 'RGB':
                    img = img.convert('RGB')
                im_width, im_height = img.size

                bbox_labels = _parse_bbox_labels(parts[1:], im_width, im_height)
                ###################### end of the project-specific part ############################

                if len(bbox_labels) == 0:
                    continue

                img, sample_labels = preprocess(img, bbox_labels, input_size, mode)  # pre-processing
                if len(sample_labels) == 0:
                    continue

                boxes = sample_labels[:, 1:5]  # coordinates
                lbls = sample_labels[:, 0].astype('int32')  # labels
                max_box_num = train_params['max_box_num']  # maximum number of targets per image
                cope_size = min(len(boxes), max_box_num)  # cap the number of targets
                ret_boxes = np.zeros((max_box_num, 4), dtype=np.float32)
                ret_lbls = np.zeros((max_box_num), dtype=np.int32)
                ret_boxes[0: cope_size] = boxes[0: cope_size]
                ret_lbls[0: cope_size] = lbls[0: cope_size]

                yield img, ret_boxes, ret_lbls

            elif mode == 'test':
                img_path = os.path.join(line)

                yield Image.open(img_path)

    return reader
836 |
837 |
# Batched, shuffled data reader
def single_custom_reader(file_path, data_dir, input_size, mode):
    """
    Build a shuffled, batched reader over the sample list ``file_path``.

    The list file is read as UTF-8 inside a ``with`` block, so the handle is
    closed and the text is decoded the same way init_train_parameters()
    decodes it. Both the shuffle buffer and the batch size are
    ``train_batch_size``.
    :param file_path: list file name, relative to ``data_dir``
    :param data_dir: data directory
    :param input_size: network input size passed on to custom_reader
    :param mode: reader mode passed on to custom_reader
    :return: the batched reader
    """
    file_path = os.path.join(data_dir, file_path)

    with codecs.open(file_path, encoding='utf-8') as list_file:
        images = [line.strip() for line in list_file]

    reader = custom_reader(images, data_dir, input_size, mode)
    reader = paddle.reader.shuffle(reader, train_params['train_batch_size'])
    reader = paddle.batch(reader, train_params['train_batch_size'])

    return reader
848 |
849 |
# Optimizer definition
def optimizer_sgd_setting():
    """
    Create the SGD optimizer with a piecewise-constant learning rate.

    The boundaries are ``lr_epochs`` multiplied by the number of batches per
    epoch (at least 1); the values are ``lr_decay`` multiplied by the base
    learning rate.
    :return: the optimizer
    """
    steps_per_epoch = max(1, train_params["image_count"] // train_params["train_batch_size"])
    strategy = train_params['sgd_strategy']
    base_lr = strategy['learning_rate']  # learning rate

    boundaries = [epoch * steps_per_epoch for epoch in strategy["lr_epochs"]]
    values = [factor * base_lr for factor in strategy["lr_decay"]]
    logger.info("origin learning rate: {0} boundaries: {1} values: {2}".format(base_lr, boundaries, values))

    lr_schedule = fluid.layers.piecewise_decay(boundaries, values)  # piecewise learning-rate decay
    weight_decay = fluid.regularizer.L2Decay(0.00005)
    return fluid.optimizer.SGDOptimizer(learning_rate=lr_schedule,
                                        regularization=weight_decay)
868 |
869 |
# Create the program, the feeder and the yolo model
def build_program_with_feeder(main_prog, startup_prog, place):
    """
    Build the training graph inside ``main_prog`` / ``startup_prog``.

    Declares the image, box and label inputs, creates the data feeder and the
    training reader, builds the model selected by ``use_tiny`` and attaches
    the loss.
    :return: (feeder, reader, loss)
    """
    max_box_num = train_params['max_box_num']
    use_tiny = train_params['use_tiny']  # whether the tiny model is selected
    yolo_config = train_params['yolo_tiny_cfg'] if use_tiny else train_params['yolo_cfg']
    input_size = yolo_config['input_size']

    with fluid.program_guard(main_prog, startup_prog):  # switch the global main and startup programs
        img = fluid.layers.data(name='img', shape=input_size, dtype='float32')  # image
        gt_box = fluid.layers.data(name='gt_box', shape=[max_box_num, 4], dtype='float32')  # boxes
        gt_label = fluid.layers.data(name='gt_label', shape=[max_box_num], dtype='int32')  # labels

        feeder = fluid.DataFeeder(feed_list=[img, gt_box, gt_label],
                                  place=place,
                                  program=main_prog)
        reader = single_custom_reader(train_params['train_list'],
                                      train_params['data_dir'],
                                      input_size, 'train')

        with fluid.unique_name.guard():
            # create the yolo model
            model = get_yolo(use_tiny, train_params['class_dim'], yolo_config['anchors'],
                             yolo_config['anchor_mask'])
            outputs = model.net(img)
        return feeder, reader, get_loss(model, outputs, gt_box, gt_label)
897 |
898 |
# Loss function
def get_loss(model, outputs, gt_box, gt_label):
    """
    Sum the mean yolov3 loss of every detection output and attach the optimizer.

    The first output uses the model's accumulated down-sampling ratio; the
    ratio is halved for each following output.
    :return: the total loss variable
    """
    ratio = model.get_downsample_ratio()
    per_scale_losses = []

    with fluid.unique_name.guard('train'):
        for scale_idx, head_out in enumerate(outputs):
            scale_loss = fluid.layers.yolov3_loss(
                x=head_out,
                gt_box=gt_box,  # ground-truth boxes
                gt_label=gt_label,  # labels
                anchors=model.get_anchors(),  # anchors
                anchor_mask=model.get_anchor_mask()[scale_idx],
                class_num=model.get_class_num(),
                ignore_thresh=train_params['ignore_thresh'],
                # with few classes False works better, otherwise the scores become very small
                use_label_smooth=False,
                downsample_ratio=ratio)
            per_scale_losses.append(fluid.layers.reduce_mean(scale_loss))
            ratio //= 2
        total_loss = sum(per_scale_losses)
        optimizer_sgd_setting().minimize(total_loss)
        return total_loss
922 |
923 |
def load_pretrained_params(exe, program):
    """Restore parameters from a previous run or from a pretrained model.

    A previous training run (``continue_train`` set and ``save_model_dir``
    present) takes precedence. Otherwise, when ``pretrained`` is set and
    ``pretrained_model_dir`` exists, only the variables that have a file of
    the same name in that directory are loaded. If neither applies, nothing
    is loaded.

    :param exe: executor used to run the load operators
    :param program: program whose variables are restored
    """
    if train_params['continue_train'] and os.path.exists(train_params['save_model_dir']):
        logger.info('load param from retrain model')
        fluid.io.load_persistables(executor=exe,
                                   dirname=train_params['save_model_dir'],
                                   main_program=program)
        return

    if not (train_params['pretrained'] and os.path.exists(train_params['pretrained_model_dir'])):
        return

    logger.info('load param from pretrained model')
    pretrained_dir = train_params['pretrained_model_dir']

    def has_saved_file(var):
        # Load a variable only if the pretrained directory holds a file for it.
        return os.path.exists(os.path.join(pretrained_dir, var.name))

    fluid.io.load_vars(exe, pretrained_dir, main_program=program,
                       predicate=has_saved_file)
939 |
940 |
def train():
    """Run the full training loop and persist the model parameters.

    Steps: initialise logging and ``train_params``, build the training
    program, optionally restore earlier parameters, then iterate
    ``num_epochs`` passes over the reader. After each pass the parameters are
    saved when the pass mean loss improves on the best value seen so far, and
    they are saved once more unconditionally after the last pass.
    """
    init_log_config()
    init_train_parameters()

    logger.info("start train YOLOv3, train params:%s", str(train_params))
    logger.info("create place, use gpu:" + str(train_params['use_gpu']))

    place = fluid.CUDAPlace(0) if train_params['use_gpu'] else fluid.CPUPlace()

    logger.info("build network and program")
    train_program = fluid.Program()
    start_program = fluid.Program()
    feeder, reader, loss = build_program_with_feeder(train_program, start_program, place)

    logger.info("build executor and init params")

    exe = fluid.Executor(place)
    exe.run(start_program)
    train_fetch_list = [loss.name]
    load_pretrained_params(exe, train_program)  # restore earlier parameters, if any

    current_best_loss = 10000000000.0

    # Start the training iterations.
    for pass_id in range(train_params["num_epochs"]):
        logger.info("current pass: {}, start read image".format(pass_id))
        batch_count = 0
        total_loss = 0.0

        # start=1 makes batch_count the number of batches processed so far.
        for batch_count, data in enumerate(reader(), start=1):
            t1 = time.time()

            fetched = exe.run(train_program,
                              feed=feeder.feed(data),
                              fetch_list=train_fetch_list)  # one training step

            period = time.time() - t1
            batch_loss = np.mean(np.array(fetched))
            total_loss += batch_loss

            if batch_count % 10 == 0:  # controls how often progress is logged
                logger.info(
                    "pass {}, trainbatch {}, loss {} time {}".format(pass_id, batch_count, batch_loss,
                                                                     "%2.2f sec" % period))

        if batch_count == 0:
            # An empty reader would otherwise cause a division by zero below.
            logger.warning("pass {} produced no batches, skip loss evaluation".format(pass_id))
            continue

        pass_mean_loss = total_loss / batch_count
        logger.info("pass {0} train result, current pass mean loss: {1}".format(pass_id, pass_mean_loss))

        # Checkpoint once per pass; a finer-grained strategy could be used instead.
        if pass_mean_loss < current_best_loss:
            logger.info("temp save {} epcho train result, current best pass loss {}".format(pass_id, pass_mean_loss))
            fluid.io.save_persistables(dirname=train_params['save_model_dir'], main_program=train_program,
                                       executor=exe)
            current_best_loss = pass_mean_loss

    logger.info("training till last epcho, end training")
    # NOTE(review): this final save overwrites the best-loss checkpoint with
    # the parameters of the last pass — confirm that is intended.
    fluid.io.save_persistables(dirname=train_params['save_model_dir'], main_program=train_program, executor=exe)
1010 |
1011 |
1012 |
1013 |
1014 |
1015 |
1016 | # 固化保存模型
1017 | import paddle
1018 | import paddle.fluid as fluid
1019 | import codecs
1020 |
1021 | init_train_parameters()
1022 |
1023 |
def freeze_model():
    """Export the trained network as an inference model.

    Rebuilds the YOLO network in the default main program, appends box
    decoding and multi-class NMS, loads the persistables found in
    ``save_model_dir`` and writes the inference model to ``freeze_dir``.
    Everything runs on a CPU executor.
    """
    exe = fluid.Executor(fluid.CPUPlace())

    tiny = train_params['use_tiny']
    cfg = train_params['yolo_tiny_cfg'] if tiny else train_params['yolo_cfg']
    param_dir = train_params['save_model_dir']

    model = get_yolo(tiny, train_params['class_dim'],
                     cfg['anchors'], cfg['anchor_mask'])
    image = fluid.layers.data(name='image', shape=cfg['input_size'], dtype='float32')
    image_shape = fluid.layers.data(name="image_shape", shape=[2], dtype='int32')

    all_boxes = []
    all_scores = []
    outputs = model.net(image)
    ratio = model.get_downsample_ratio()

    for idx, feature_map in enumerate(outputs):
        box, score = fluid.layers.yolo_box(x=feature_map,
                                           img_size=image_shape,
                                           anchors=model.get_yolo_anchors()[idx],
                                           class_num=model.get_class_num(),
                                           conf_thresh=train_params['valid_thresh'],
                                           downsample_ratio=ratio,
                                           name="yolo_box_" + str(idx))
        all_boxes.append(box)
        # Swap the last two axes so the per-scale scores can be concatenated on axis 2.
        all_scores.append(fluid.layers.transpose(score, perm=[0, 2, 1]))
        ratio //= 2

    merged_boxes = fluid.layers.concat(all_boxes, axis=1)
    merged_scores = fluid.layers.concat(all_scores, axis=2)
    pred = fluid.layers.multiclass_nms(bboxes=merged_boxes,
                                       scores=merged_scores,
                                       score_threshold=train_params['valid_thresh'],
                                       nms_top_k=train_params['nms_top_k'],
                                       keep_top_k=train_params['nms_pos_k'],
                                       nms_threshold=train_params['nms_thresh'],
                                       background_label=-1,
                                       name="multiclass_nms")

    freeze_program = fluid.default_main_program()

    fluid.io.load_persistables(exe, param_dir, freeze_program)
    freeze_program = freeze_program.clone(for_test=True)
    print("freeze out: {0}, pred layout: {1}".format(train_params['freeze_dir'], pred))
    # Save the inference model.
    fluid.io.save_inference_model(train_params['freeze_dir'],
                                  ['image', 'image_shape'],
                                  pred, exe, freeze_program)
    print("freeze end")
1072 |
1073 |
1074 |
1075 |
1076 |
1077 | # 预测
1078 | import codecs
1079 | import sys
1080 | import numpy as np
1081 | import time
1082 | import paddle
1083 | import paddle.fluid as fluid
1084 | import math
1085 | import functools
1086 |
1087 | from IPython.display import display
1088 | from PIL import Image
1089 | from PIL import ImageFont
1090 | from PIL import ImageDraw
1091 | from collections import namedtuple
1092 |
# Module-level inference setup: everything below runs at import time and
# defines the globals used by read_image(), draw_bbox_image() and infer().
init_train_parameters()
ues_tiny = train_params['use_tiny']
yolo_config = train_params['yolo_tiny_cfg'] if ues_tiny else train_params['yolo_cfg']

target_size = yolo_config['input_size']
anchors = yolo_config['anchors']
anchor_mask = yolo_config['anchor_mask']
# Despite its name, label_dict is bound to train_params['num_dict'];
# draw_bbox_image() indexes it with an integer class id.
label_dict = train_params['num_dict']
class_dim = train_params['class_dim']
print("label_dict:{} class dim:{}".format(label_dict, class_dim))

place = fluid.CUDAPlace(0) if train_params['use_gpu'] else fluid.CPUPlace()
exe = fluid.Executor(place)

# Load the inference model written to freeze_dir (see freeze_model()).
path = train_params['freeze_dir']
[inference_program, feed_target_names, fetch_targets] = fluid.io.load_inference_model(dirname=path, executor=exe)
1109 |
1110 |
def draw_bbox_image(img, boxes, labels, save_name):
    """Draw labelled bounding rectangles on an image, save it and display it.

    :param img: PIL image to annotate
    :param boxes: iterable of boxes, each indexable as ``xmin, ymin, xmax, ymax``
    :param labels: iterable of class ids, parallel to ``boxes``; each id is
        looked up in the module-level ``label_dict``
    :param save_name: path the annotated image is written to
    """
    # The drawing uses RGB colour tuples and infer() saves with a ".jpg"
    # suffix, so normalise the mode first, as read_image() does. Images that
    # are already RGB are annotated in place exactly as before.
    if img.mode != 'RGB':
        img = img.convert('RGB')

    draw = ImageDraw.Draw(img)  # drawing context
    for box, label in zip(boxes, labels):
        xmin, ymin, xmax, ymax = box[0], box[1], box[2], box[3]
        draw.rectangle((xmin, ymin, xmax, ymax), None, 'red')  # box outline
        draw.text((xmin, ymin), label_dict[int(label)], (255, 255, 0))  # class label
    img.save(save_name)
    display(img)
1122 |
1123 |
def resize_img(img, target_size):
    """Resize an image to the network input size with bilinear filtering.

    The image is stretched to the target size; the aspect ratio is NOT kept.

    :param img: PIL image
    :param target_size: ``[channels, height, width]``, as in the
        ``input_size`` entries of ``train_params``
    :return: the resized PIL image
    """
    # PIL expects (width, height) while target_size stores height before
    # width, so swap the two. Square sizes behave exactly as before.
    height, width = target_size[1], target_size[2]
    img = img.resize((width, height), Image.BILINEAR)
    return img
1133 |
1134 |
def read_image(img_path):
    """Load an image and prepare the network input tensor.

    :param img_path: path of the image file
    :return: tuple ``(origin, tensor, resized_img)`` — the image as opened,
        a float32 array of shape ``(1, C, H, W)`` computed as
        ``(pixel - 127.5) * 0.007843``, and a copy of the resized image taken
        before any RGB conversion
    """
    origin = Image.open(img_path)
    resized = resize_img(origin, target_size)
    resized_img = resized.copy()

    rgb = resized if resized.mode == 'RGB' else resized.convert('RGB')
    tensor = np.array(rgb).astype('float32').transpose((2, 0, 1))  # HWC to CHW
    tensor -= 127.5
    tensor *= 0.007843
    tensor = tensor[np.newaxis, :]  # add the batch axis
    return origin, tensor, resized_img
1151 |
1152 |
def infer(image_path):
    """Run detection on one image and write the results to disk.

    Side effects: prints the prediction time, saves an annotated copy of the
    image next to the source as ``<name>-result.jpg`` and writes one line per
    detection to ``./input/detection-results/<name>.txt`` (that directory
    must already exist).

    :param image_path: path of the image to process
    :return: list of ``[label, score, xmin, ymin, xmax, ymax]`` rows, or
        ``None`` when the model output has no 6-column detections
    """
    origin, tensor_img, resized_img = read_image(image_path)
    input_w, input_h = origin.size[0], origin.size[1]
    image_shape = np.array([input_h, input_w], dtype='int32')

    t1 = time.time()
    batch_outputs = exe.run(inference_program,
                            feed={feed_target_names[0]: tensor_img,
                                  feed_target_names[1]: image_shape[np.newaxis, :]},
                            fetch_list=fetch_targets,
                            return_numpy=False)
    period = time.time() - t1
    print("predict cost time:{0}".format("%2.2f sec" % period))
    bboxes = np.array(batch_outputs[0])

    # Anything other than 6 columns is treated as "nothing detected".
    if bboxes.shape[1] != 6:
        print("No object found in {}".format(image_path))
        return None

    labels = bboxes[:, 0].astype('int32').tolist()
    scores = bboxes[:, 1].astype('float32').tolist()
    boxes = bboxes[:, 2:].astype('float32').tolist()

    # splitext/basename keep paths without a dot or a slash intact, which the
    # previous rfind()-based slicing did not.
    path_stem = os.path.splitext(image_path)[0]
    draw_bbox_image(origin, boxes, labels, path_stem + '-result.jpg')

    predict = [[label, score] + list(box)
               for label, score, box in zip(labels, scores, boxes)]

    result_path = "./input/detection-results/" + os.path.basename(path_stem) + '.txt'
    # The context manager closes the file even if a write fails.
    with open(result_path, 'w') as result_file:
        for row in predict:
            # Every value is followed by one space, as before.
            result_file.write(''.join(str(float(value)) + ' ' for value in row))
            result_file.write('\n')
    return predict
1199 |
if __name__ == '__main__':
    # Create each output directory independently, so an existing ./input
    # without its sub-directories no longer breaks the run.
    for out_dir in ('./input/detection-results', './input/ground-truth'):
        if not os.path.isdir(out_dir):
            os.makedirs(out_dir)

    file_path = os.path.join(train_params['data_dir'], 'eval.txt')
    with open(file_path) as eval_file:
        images = [line.strip() for line in eval_file]

    for line in images:
        if not line:
            continue  # skip blank lines in the list file

        # Each line is "<image name>\t<json object>\t<json object>...".
        parts = line.split('\t')
        filename_path = os.path.join(train_params['data_dir'] + '/lslm_test/', parts[0])
        infer(filename_path)

        bbox_labels = []
        for object_str in parts[1:]:
            if len(object_str) <= 1:
                continue
            annotation = json.loads(object_str)
            bbox = annotation['coordinate']
            bbox_labels.append([
                float(train_params['label_dict'][annotation['value']]),
                float(bbox[0][0]),
                float(bbox[0][1]),
                float(bbox[1][0]),
                float(bbox[1][1]),
            ])

        # Derive the name the same way infer() does, instead of slicing with
        # hard-coded offsets that depend on data_dir and a 3-letter extension.
        stem = os.path.splitext(os.path.basename(filename_path))[0]
        with open("./input/ground-truth/" + stem + '.txt', 'w') as truth_file:
            for row in bbox_labels:
                truth_file.write(''.join(str(float(value)) + ' ' for value in row))
                truth_file.write('\n')
1234 |
1235 |
1236 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 基于YOLOv3的目标检测实验报告
2 |
3 | ## 目录
4 |
5 | - 小组成员及分工
6 | - YOLOv3目标检测网络
7 | - YOLO算法简介
8 | - 网络结构
9 | - PaddlePaddle代码实现
10 | - 主要参数
11 | - 模型建立
12 | - 训练与迭代
13 | - 数据集基本信息
14 | - 训练过程中的参数调整与模型优化
15 | - YOLO和YOLO-tiny对比
16 | - 参数调整
17 | - 模型优化
18 | - 网络性能分析
19 | - 挑战集测试分析
20 | - 实际结果
21 |
22 | ---
23 | ## 小组成员及分工
24 | 姓名|学号|贡献
25 | ---|:--:|:-:
26 | 马家昱|1950509|数据集搜索与整合、图片处理
27 | 陈冠忠|1950638|模型修改、调试、训练
28 | 陶思月|1951858|数据集拍摄、标记
29 | 黄继宣|1951857|数据集拍摄、标记
30 | 周婉莹|1950579|数据集拍摄、标记
31 | 罗格峰|1952222|数据集拍摄、标记
32 |
33 | ---
34 | ## YOLOv3目标检测网络
35 | ### YOLO算法简介
36 | - 相关算法
37 | 1. 滑动窗口
38 |
39 | 采用滑动窗口的目标检测算法将检测问题转化为了图像分类问题。其基本原理就是采用不同大小和比例(宽高比)的窗口在整张图片上以一定的步长进行滑动,然后对这些窗口对应的区域做图像分类,这样就可以实现对整张图片的检测了。
40 |

41 | 2. 非极大值抑制
42 |
43 | 首先从所有的检测框中找到置信度最大的那个框,然后挨个计算其与剩余框的交并比(IOU),如果其值大于一定阈值(重合度过高),那么就将该框剔除;然后对剩余的检测框重复上述过程,直到处理完所有的检测框。
44 | 
45 | - YOLO算法
46 |
47 | YOLO将对象检测重新定义为一个回归问题。它将单个卷积神经网络(CNN)应用于整个图像,将图像分成网格,并预测每个网格的类概率和边界框。对于每个网格,网络都会预测一个边界框和与每个类别(汽车,行人,交通信号灯等)相对应的概率。每个边界框可以使用四个描述符进行描述:
48 |
49 | 1. 边界框的中心
50 | 2. 高度
51 | 3. 宽度
52 | 4. 值映射到对象所属的类
53 |
54 | 此外,该算法还可以预测边界框中存在对象的概率。如果一个对象的中心落在一个网格单元中,则该网格单元负责检测该对象。每个网格中将有多个边界框。在训练时,我们希望每个对象只有一个边界框。因此,我们根据哪个Box与ground truth box的重叠度最高,从而分配一个Box来负责预测对象。
55 |
56 | 最后,对每个类的对象应用非最大值抑制的方法来过滤出“置信度”小于阈值的边界框。这为我们提供了图像预测。
57 |
58 | 
59 |
60 |
61 |
62 | ### 网络结构
63 | - YOLOv3采用了称之为Darknet-53的网络结构(含有53个卷积层),它借鉴了残差网络的做法,在一些层之间设置了快捷链路。下图展示了其基本结构。
64 | 
其中Darknet-53的具体结构如下,其采用448×448×3作为输入,左侧数字表示重复的残差组件个数,每个残差组件有两个卷积层和一个快捷链路。
66 | 
67 |
68 | ### PaddlePaddle代码实现
69 | #### 主要参数
70 | ```
71 | train_params = {
72 | "data_dir": "data/data6045", # 数据目录
73 | "train_list": "train.txt", # 训练集文件
74 | "eval_list": "eval.txt",
75 | "class_dim": -1,
76 | "label_dict": {}, # 标签字典
77 | "num_dict": {},
78 | "image_count": -1,
79 | "continue_train": True, # 是否加载前一次的训练参数,接着训练
80 | "pretrained": False, # 是否预训练
81 | "pretrained_model_dir": "./pretrained-model",
82 | "save_model_dir": "./yolo-model", # 模型保存目录
83 | "model_prefix": "yolo-v3", # 模型前缀
84 | "freeze_dir": "freeze_model",
85 | "use_tiny": False, # 是否使用 裁剪 tiny 模型
86 | "max_box_num": 8, # 一幅图上最多有多少个目标
87 | "num_epochs": 15, # 训练轮次
88 | "train_batch_size": 12, # 对于完整yolov3,每一批的训练样本不能太多,内存会炸掉;如果使用tiny,可以适当大一些
89 | "use_gpu": True, # 是否使用GPU
90 | "yolo_cfg": { # YOLO模型参数
91 | "input_size": [3, 448, 448], # 原版的边长大小为608,为了提高训练速度和预测速度,此处压缩为448
92 | "anchors": [7, 10, 12, 22, 24, 17, 22, 45, 46, 33, 43, 88, 85, 66, 115, 146, 275, 240], # 锚点??
93 | "anchor_mask": [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
94 | },
95 | "yolo_tiny_cfg": { # YOLO tiny 模型参数
96 | "input_size": [3, 256, 256],
97 | "anchors": [6, 8, 13, 15, 22, 34, 48, 50, 81, 100, 205, 191],
98 | "anchor_mask": [[3, 4, 5], [0, 1, 2]]
99 | },
100 | "ignore_thresh": 0.7,
101 | "mean_rgb": [127.5, 127.5, 127.5],
102 | "mode": "train",
103 | "multi_data_reader_count": 4,
104 | "apply_distort": True, # 是否做图像扭曲增强
105 | "nms_top_k": 300,
106 | "nms_pos_k": 300,
107 | "valid_thresh": 0.01,
108 | "nms_thresh": 0.40, # 非最大值抑制阈值
109 | "image_distort_strategy": { # 图像扭曲策略
110 | "expand_prob": 0.5, # 扩展比率
111 | "expand_max_ratio": 4,
112 | "hue_prob": 0.5, # 色调
113 | "hue_delta": 18,
114 | "contrast_prob": 0.5, # 对比度
115 | "contrast_delta": 0.5,
116 | "saturation_prob": 0.5, # 饱和度
117 | "saturation_delta": 0.5,
118 | "brightness_prob": 0.5, # 亮度
119 | "brightness_delta": 0.125
120 | },
121 | "sgd_strategy": { # 梯度下降配置
122 | "learning_rate": 0.002,
123 | "lr_epochs": [30, 50, 65], # 学习率衰减分段(3个数字分为4段)
124 | "lr_decay": [1, 0.5, 0.25, 0.1] # 每段采用的学习率,对应lr_epochs参数4段
125 | },
126 | "early_stop": {
127 | "sample_frequency": 50,
128 | "successive_limit": 3,
129 | "min_loss": 2.5,
130 | "min_curr_map": 0.84
131 | }
132 | }
133 | ```
134 | #### 模型建立
135 | ```
136 | class YOLOv3(object):
137 | def __init__(self, class_num, anchors, anchor_mask):
138 | self.outputs = [] # 网络最终模型
139 | self.downsample_ratio = 1 # 下采样率
140 | self.anchor_mask = anchor_mask # 计算卷积核???
141 | self.anchors = anchors # 锚点
142 | self.class_num = class_num # 类别数量
143 |
144 | self.yolo_anchors = []
145 | self.yolo_classes = []
146 |
147 | for mask_pair in self.anchor_mask:
148 | mask_anchors = []
149 | for mask in mask_pair:
150 | mask_anchors.append(self.anchors[2 * mask])
151 | mask_anchors.append(self.anchors[2 * mask + 1])
152 | self.yolo_anchors.append(mask_anchors)
153 | self.yolo_classes.append(class_num)
154 |
155 | def name(self):
156 | return 'YOLOv3'
157 |
158 | # 获取anchors
159 | def get_anchors(self):
160 | return self.anchors
161 |
162 | # 获取anchor_mask
163 | def get_anchor_mask(self):
164 | return self.anchor_mask
165 |
166 | def get_class_num(self):
167 | return self.class_num
168 |
169 | def get_downsample_ratio(self):
170 | return self.downsample_ratio
171 |
172 | def get_yolo_anchors(self):
173 | return self.yolo_anchors
174 |
175 | def get_yolo_classes(self):
176 | return self.yolo_classes
177 |
178 | # 卷积正则化函数: 卷积、批量正则化处理、leakrelu
179 | def conv_bn(self,
180 | input, # 输入
181 | num_filters, # 卷积核数量
182 | filter_size, # 卷积核大小
183 | stride, # 步幅
184 | padding, # 填充
185 | use_cudnn=True):
186 | # 2d卷积操作
187 | conv = fluid.layers.conv2d(input=input,
188 | num_filters=num_filters,
189 | filter_size=filter_size,
190 | stride=stride,
191 | padding=padding,
192 | act=None,
193 | use_cudnn=use_cudnn, # 是否使用cudnn,cudnn利用cuda进行了加速处理
194 | param_attr=ParamAttr(initializer=fluid.initializer.Normal(0., 0.02)),
195 | bias_attr=False)
196 |
197 | # batch_norm中的参数不需要参与正则化,所以主动使用正则系数为0的正则项屏蔽掉
198 | # 在batch_norm中使用leaky的话,只能使用默认的alpha=0.02;如果需要设值,必须提出去单独来
199 | # 正则化的目的,是为了防止过拟合,较小的L2值能防止过拟合
200 | param_attr = ParamAttr(initializer=fluid.initializer.Normal(0., 0.02),
201 | regularizer=L2Decay(0.))
202 | bias_attr = ParamAttr(initializer=fluid.initializer.Constant(0.0),
203 | regularizer=L2Decay(0.))
204 | out = fluid.layers.batch_norm(input=conv, act=None,
205 | param_attr=param_attr,
206 | bias_attr=bias_attr)
207 | # leaky_relu: Leaky ReLU是给所有负值赋予一个非零斜率
208 | out = fluid.layers.leaky_relu(out, 0.1)
209 | return out
210 | ```
211 | #### 训练与迭代
212 | ```
213 | # 执行训练
214 | def train():
215 | init_log_config()
216 | init_train_parameters()
217 |
218 | logger.info("start train YOLOv3, train params:%s", str(train_params))
219 | logger.info("create place, use gpu:" + str(train_params['use_gpu']))
220 |
221 | place = fluid.CUDAPlace(0) if train_params['use_gpu'] else fluid.CPUPlace()
222 |
223 | logger.info("build network and program")
224 | train_program = fluid.Program()
225 | start_program = fluid.Program()
226 | feeder, reader, loss = build_program_with_feeder(train_program, start_program, place)
227 |
228 | logger.info("build executor and init params")
229 |
230 | exe = fluid.Executor(place)
231 | exe.run(start_program)
232 | train_fetch_list = [loss.name]
233 | load_pretrained_params(exe, train_program) # 加载模型及参数
234 |
235 | stop_strategy = train_params['early_stop']
236 | successive_limit = stop_strategy['successive_limit']
237 | sample_freq = stop_strategy['sample_frequency']
238 | min_curr_map = stop_strategy['min_curr_map']
239 | min_loss = stop_strategy['min_loss']
240 | stop_train = False
241 | successive_count = 0
242 | total_batch_count = 0
243 | valid_thresh = train_params['valid_thresh']
244 | nms_thresh = train_params['nms_thresh']
245 | current_best_loss = 10000000000.0
246 |
247 | # 开始迭代训练
248 | for pass_id in range(train_params["num_epochs"]):
249 | logger.info("current pass: {}, start read image".format(pass_id))
250 | batch_id = 0
251 | total_loss = 0.0
252 |
253 | for batch_id, data in enumerate(reader()):
254 | t1 = time.time()
255 |
256 | loss = exe.run(train_program,
257 | feed=feeder.feed(data),
258 | fetch_list=train_fetch_list) # 执行训练
259 |
260 | period = time.time() - t1
261 | loss = np.mean(np.array(loss))
262 | total_loss += loss
263 | batch_id += 1
264 | total_batch_count += 1
265 |
266 | if batch_id % 10 == 0: # 调整日志输出的频率
267 | logger.info(
268 | "pass {}, trainbatch {}, loss {} time {}".format(pass_id, batch_id, loss, "%2.2f sec" % period))
269 |
270 | pass_mean_loss = total_loss / batch_id
271 | logger.info("pass {0} train result, current pass mean loss: {1}".format(pass_id, pass_mean_loss))
272 |
273 | # 采用每训练完一轮停止办法,可以调整为更精细的保存策略
274 | if pass_mean_loss < current_best_loss:
275 | logger.info("temp save {} epcho train result, current best pass loss {}".format(pass_id, pass_mean_loss))
276 | fluid.io.save_persistables(dirname=train_params['save_model_dir'], main_program=train_program,
277 | executor=exe)
278 | current_best_loss = pass_mean_loss
279 |
280 | logger.info("training till last epcho, end training")
281 | fluid.io.save_persistables(dirname=train_params['save_model_dir'], main_program=train_program, executor=exe)
282 | ```
283 | ---
284 |
285 | ## 数据集基本信息
286 | * 本组使用的数据集共有900张图片,其中500张来自校园拍摄实景,其余为下载的特定分类图片。
* 所有图片宽高比均为4:3,分辨率为800×600。数据集图片主要分四类:单独的行人、单独的自行车、单独的汽车,以及前三类混杂在一起的图片。
288 | 
289 |
290 | ## 训练过程中的参数调整与模型优化
291 | ### YOLO和YOLO-tiny对比
292 | 
293 |
294 | 模型|训练30轮所用时长|
295 | ---|:--:|
296 | YOLO|2h9m|
297 | YOLO-tiny|1h41m|
298 | ### 参数调整
- "max_box_num": 8
- "nms_thresh": 0.40
- "valid_thresh": 0.015
302 | - 优化显存
303 | - os.environ["FLAGS_fraction_of_gpu_memory_to_use"] = '0.92'
304 | - os.environ["FLAGS_eager_delete_tensor_gb"] = '0'
305 | - os.environ["FLAGS_memory_fraction_of_eager_deletion"] = '1'
306 | - os.environ["FLAGS_fast_eager_deletion_mode"]='True'
307 | ### 模型优化
308 | - 优化器更改:原优化器为SGD
309 | ```
310 | optimizer=fluid.optimizer.SGDOptimizer(
311 | learning_rate=fluid.layers.piecewise_decay(boundaries, values), regularization=fluid.regularizer.L2Decay(0.00005))
312 | ```
313 | - 变更为Adam算法
314 | ```
315 | optimizer=fluid.optimizer.AdamOptimizer(learning_rate=0.01,beta1=0.9,beta2=0.999,regularization=fluid.regularizer.L2Decay(0.00005))
316 | ```
317 | - Adam优化对比分析:
318 | - 
319 |
320 | ## 网络性能分析
321 | - 挑战集测试分析
322 | - 
323 | - 实际结果
324 | - 
325 |
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import json
3 | import os
4 | import shutil
5 | import operator
6 | import sys
7 | import argparse
8 | import math
9 |
10 | import numpy as np
11 |
MINOVERLAP = 0.5 # default value (defined in the PASCAL VOC2012 challenge)

# Command-line options. parse_args() is called at module level, so the
# arguments are read as soon as this file is executed.
parser = argparse.ArgumentParser()
parser.add_argument('-na', '--no-animation', help="no animation is shown.", action="store_true")
parser.add_argument('-np', '--no-plot', help="no plot is shown.", action="store_true")
parser.add_argument('-q', '--quiet', help="minimalistic console output.", action="store_true")
# argparse receiving list of classes to be ignored (e.g., python main.py --ignore person book)
parser.add_argument('-i', '--ignore', nargs='+', type=str, help="ignore a list of classes.")
# argparse receiving list of classes with specific IoU (e.g., python main.py --set-class-iou person 0.7)
parser.add_argument('--set-class-iou', nargs='+', type=str, help="set IoU for a specific class.")
args = parser.parse_args()

'''
0,0 ------> x (width)
|
| (Left,Top)
| *_________
| | |
| |
y |_________|
(height) *
(Right,Bottom)
'''

# if there are no classes to ignore then replace None by empty list
if args.ignore is None:
    args.ignore = []

# True when per-class IoU thresholds were given on the command line
specific_iou_flagged = False
if args.set_class_iou is not None:
    specific_iou_flagged = True

# make sure that the cwd() is the location of the python script (so that every path makes sense)
os.chdir(os.path.dirname(os.path.abspath(__file__)))

GT_PATH = os.path.join(os.getcwd(), 'input', 'ground-truth')
DR_PATH = os.path.join(os.getcwd(), 'input', 'detection-results')
# if there are no images then no animation can be shown
IMG_PATH = os.path.join(os.getcwd(), 'input', 'images-optional')
if os.path.exists(IMG_PATH):
    for dirpath, dirnames, files in os.walk(IMG_PATH):
        if not files:
            # no image files found
            args.no_animation = True
else:
    args.no_animation = True

# try to import OpenCV if the user didn't choose the option --no-animation
show_animation = False
if not args.no_animation:
    try:
        import cv2
        show_animation = True
    except ImportError:
        print("\"opencv-python\" not found, please install to visualize the results.")
        args.no_animation = True

# try to import Matplotlib if the user didn't choose the option --no-plot
draw_plot = False
if not args.no_plot:
    try:
        import matplotlib.pyplot as plt
        draw_plot = True
    except ImportError:
        print("\"matplotlib\" not found, please install it to get the resulting plots.")
        args.no_plot = True
78 |
79 |
def log_average_miss_rate(prec, rec, num_images):
    """
        log-average miss rate:
            Miss rates are sampled at 9 reference points evenly spaced in
            log-space between 1e-2 and 1e0, then averaged in the log domain
            (geometric mean).

        input:
            prec       | numpy array of precision values
            rec        | numpy array of recall values
            num_images | accepted for interface compatibility, not used

        output:
            lamr | log-average miss rate
            mr   | miss rate, computed as (1 - rec)
            fppi | computed as (1 - prec)

        With no detections (empty prec) the result is (0, 1, 0).

        references:
            [1] Dollar, Piotr, et al. "Pedestrian Detection: An Evaluation of the
               State of the Art." Pattern Analysis and Machine Intelligence, IEEE
               Transactions on 34.4 (2012): 743 - 761.
    """
    # no detections of that class
    if prec.size == 0:
        return 0, 1, 0

    fppi = 1 - prec
    mr = 1 - rec

    # Sentinel entry in front: fppi -1.0 is below every reference point, so
    # each lookup below finds at least one index (paired with miss rate 1.0).
    padded_fppi = np.insert(fppi, 0, -1.0)
    padded_mr = np.insert(mr, 0, 1.0)

    reference_points = np.logspace(-2.0, 0.0, num=9)
    sampled_mr = np.empty_like(reference_points)
    for slot, point in enumerate(reference_points):
        # last position whose fppi does not exceed the reference point
        last_index = np.flatnonzero(padded_fppi <= point)[-1]
        sampled_mr[slot] = padded_mr[last_index]

    # log(0) is undefined, so clamp the sampled values at 1e-10
    lamr = math.exp(np.mean(np.log(np.maximum(1e-10, sampled_mr))))

    return lamr, mr, fppi
121 |
def error(msg):
    """Print ``msg`` and terminate the program with a failure status.

    :param msg: message describing the fatal problem
    :raises SystemExit: always, with exit code 1
    """
    print(msg)
    # A non-zero status lets calling scripts see that the run failed; the
    # previous status 0 reported success.
    sys.exit(1)
128 |
def is_float_between_0_and_1(value):
    """Return True when ``value`` converts to a float strictly inside (0.0, 1.0).

    A value that raises ValueError on conversion yields False.
    """
    try:
        number = float(value)
    except ValueError:
        return False
    return 0.0 < number < 1.0
141 |
def voc_ap(rec, prec):
    """Compute the average precision (AP) from recall and precision values.

    1st) Build a version of the measured precision/recall curve whose
         precision is monotonically decreasing.
    2nd) Compute the AP as the area under that curve (numerical integration).

    --- Official matlab code VOC2012---
    mrec=[0 ; rec ; 1];
    mpre=[0 ; prec ; 0];
    for i=numel(mpre)-1:-1:1
            mpre(i)=max(mpre(i),mpre(i+1));
    end
    i=find(mrec(2:end)~=mrec(1:end-1))+1;
    ap=sum((mrec(i)-mrec(i-1)).*mpre(i));

    The inputs are not modified; the padded curves are built as new lists.

    :param rec: sequence of recall values
    :param prec: sequence of precision values, parallel to ``rec``
    :return: tuple ``(ap, mrec, mpre)`` — the AP and the padded recall and
        precision lists used to compute it
    """
    # Pad with sentinels: recall runs from 0.0 to 1.0, precision is 0.0 at
    # both ends.
    mrec = [0.0] + list(rec) + [1.0]
    mpre = [0.0] + list(prec) + [0.0]

    # Make the precision monotonically decreasing, walking from the end to
    # the beginning (matlab: for i=numel(mpre)-1:-1:1).
    for i in range(len(mpre) - 2, -1, -1):
        mpre[i] = max(mpre[i], mpre[i + 1])

    # Sum the rectangle areas at every index where the recall changes
    # (matlab: ap=sum((mrec(i)-mrec(i-1)).*mpre(i))).
    ap = 0.0
    for i in range(1, len(mrec)):
        if mrec[i] != mrec[i - 1]:
            ap += (mrec[i] - mrec[i - 1]) * mpre[i]
    return ap, mrec, mpre
194 |
195 |
def file_lines_to_list(path):
    """Read a text file and return its lines as a list.

    Leading and trailing whitespace (including the newline) is stripped from
    every line.
    """
    with open(path) as handle:
        return [raw_line.strip() for raw_line in handle]
206 |
def draw_text_in_image(img, text, pos, color, line_width):
    """Draw ``text`` on ``img`` with OpenCV and report the new line width.

    :param img: image passed to ``cv2.putText``
    :param text: string to draw
    :param pos: position passed to ``cv2.putText`` as the text origin
    :param color: text colour
    :param line_width: width accumulated so far on the current text line
    :return: tuple ``(img, line_width + width of the drawn text)``
    """
    font_face = cv2.FONT_HERSHEY_PLAIN
    font_scale = 1
    # Passed in the seventh positional slot of putText and the fourth of
    # getTextSize, as the original "lineType" value was.
    stroke = 1
    cv2.putText(img, text, pos, font_face, font_scale, color, stroke)
    (text_width, _), _ = cv2.getTextSize(text, font_face, font_scale, stroke)
    return img, (line_width + text_width)
223 |
def adjust_axes(r, t, fig, axes):
    """Widen the x axis so the text ``t`` fits inside the figure.

    The upper x limit is scaled by ``(figure width + text width) / figure
    width``, both measured in inches.

    :param r: renderer used to measure the text
    :param t: text object whose window extent is measured
    :param fig: figure supplying ``dpi`` and the current width
    :param axes: axes whose x limits are updated
    """
    # text width in inches
    extent = t.get_window_extent(renderer=r)
    text_inches = extent.width / fig.dpi
    # ratio between the widened figure and the current one
    fig_width = fig.get_figwidth()
    scale = (fig_width + text_inches) / fig_width
    # stretch only the upper limit
    lower, upper = axes.get_xlim()[0], axes.get_xlim()[1]
    axes.set_xlim([lower, upper * scale])
238 |
def draw_plot_func(dictionary, n_classes, window_title, plot_title, x_label, output_path, to_show, plot_color, true_p_bar):
    """Draw a horizontal bar chart with Matplotlib and save it.

    :param dictionary: mapping of class name to value; bars are drawn in
        ascending order of value
    :param n_classes: number of bars
    :param window_title: title of the figure window
    :param plot_title: title drawn above the plot
    :param x_label: label of the x axis
    :param output_path: file the figure is saved to
    :param to_show: when true, the plot is also shown interactively
    :param plot_color: bar and text colour for the plain (single colour) plot
    :param true_p_bar: ``""`` for a plain plot; otherwise a mapping of class
        name to true-positive count, which splits each bar into a
        false-positive and a true-positive part
    """
    # sort the dictionary by increasing value, into a list of tuples
    sorted_dic_by_value = sorted(dictionary.items(), key=operator.itemgetter(1))
    # unpacking the list of tuples into two lists
    sorted_keys, sorted_values = zip(*sorted_dic_by_value)

    if true_p_bar != "":
        # Special case to draw in:
        #   - green -> TP: True Positives (object detected and matches ground-truth)
        #   - red   -> FP: False Positives (object detected but does not match ground-truth)
        fp_sorted = []
        tp_sorted = []
        for key in sorted_keys:
            fp_sorted.append(dictionary[key] - true_p_bar[key])
            tp_sorted.append(true_p_bar[key])
        plt.barh(range(n_classes), fp_sorted, align='center', color='crimson', label='False Positive')
        plt.barh(range(n_classes), tp_sorted, align='center', color='forestgreen', label='True Positive', left=fp_sorted)
        # add legend
        plt.legend(loc='lower right')
        # Write number on side of bar
        fig = plt.gcf()  # gcf - get current figure
        axes = plt.gca()
        r = fig.canvas.get_renderer()
        for i, val in enumerate(sorted_values):
            fp_val = fp_sorted[i]
            tp_val = tp_sorted[i]
            fp_str_val = " " + str(fp_val)
            tp_str_val = fp_str_val + " " + str(tp_val)
            # trick to paint multicolor with offset:
            # first paint everything and then repaint the first number
            t = plt.text(val, i, tp_str_val, color='forestgreen', va='center', fontweight='bold')
            plt.text(val, i, fp_str_val, color='crimson', va='center', fontweight='bold')
            if i == (len(sorted_values)-1):  # largest bar
                adjust_axes(r, t, fig, axes)
    else:
        plt.barh(range(n_classes), sorted_values, color=plot_color)
        # Write number on side of bar
        fig = plt.gcf()  # gcf - get current figure
        axes = plt.gca()
        r = fig.canvas.get_renderer()
        for i, val in enumerate(sorted_values):
            str_val = " " + str(val)  # add a space before
            if val < 1.0:
                str_val = " {0:.2f}".format(val)
            t = plt.text(val, i, str_val, color=plot_color, va='center', fontweight='bold')
            # re-set axes to show number inside the figure
            if i == (len(sorted_values)-1):  # largest bar
                adjust_axes(r, t, fig, axes)

    # set window title
    # The canvas-level set_window_title was deprecated in Matplotlib 3.4 and
    # later removed; prefer the figure manager and fall back to the canvas
    # only where the manager cannot do it.
    manager = getattr(fig.canvas, 'manager', None)
    if manager is not None and hasattr(manager, 'set_window_title'):
        manager.set_window_title(window_title)
    elif hasattr(fig.canvas, 'set_window_title'):
        fig.canvas.set_window_title(window_title)
    # write classes in y axis
    tick_font_size = 12
    plt.yticks(range(n_classes), sorted_keys, fontsize=tick_font_size)

    # Re-scale height accordingly
    init_height = fig.get_figheight()
    # compute the matrix height in points and inches
    dpi = fig.dpi
    height_pt = n_classes * (tick_font_size * 1.4)  # 1.4 (some spacing)
    height_in = height_pt / dpi
    # compute the required figure height
    top_margin = 0.15  # in percentage of the figure height
    bottom_margin = 0.05  # in percentage of the figure height
    figure_height = height_in / (1 - top_margin - bottom_margin)
    # set new height
    if figure_height > init_height:
        fig.set_figheight(figure_height)

    # set plot title
    plt.title(plot_title, fontsize=14)
    # set axis titles
    plt.xlabel(x_label, fontsize='large')
    # adjust size of window
    fig.tight_layout()
    # save the plot
    fig.savefig(output_path)
    # show image
    if to_show:
        plt.show()
    # close the plot
    plt.close()
332 |
# Prepare the scratch directory (".temp_files/") and a fresh "output/" tree.
TEMP_FILES_PATH = ".temp_files"
output_files_path = "output"

# the scratch directory is simply reused when it is already there
if not os.path.exists(TEMP_FILES_PATH):
    os.makedirs(TEMP_FILES_PATH)

# results of a previous run are thrown away before new ones are written
if os.path.exists(output_files_path):
    shutil.rmtree(output_files_path)
os.makedirs(output_files_path)

# optional sub-directories: per-class plots and per-detection animation frames
for wanted, sub_dirs in (
    (draw_plot, ("classes",)),
    (show_animation, ("images", "detections_one_by_one")),
):
    if wanted:
        os.makedirs(os.path.join(output_files_path, *sub_dirs))
349 |
# ground-truth
#   Load each of the ground-truth files into a temporary ".json" file.
#   Create a list of all the class names present in the ground-truth (gt_classes).

# get a list with the ground-truth files
ground_truth_files_list = glob.glob(GT_PATH + '/*.txt')
if not ground_truth_files_list:
    error("Error: No ground-truth files found!")
ground_truth_files_list.sort()
# number of non-difficult objects per class
gt_counter_per_class = {}
# number of files that contain at least one non-difficult object of the class
counter_images_per_class = {}

gt_files = []
for txt_file in ground_truth_files_list:
    file_id = txt_file.split(".txt", 1)[0]
    file_id = os.path.basename(os.path.normpath(file_id))
    # check if there is a correspondent detection-results file
    temp_path = os.path.join(DR_PATH, (file_id + ".txt"))
    if not os.path.exists(temp_path):
        error_msg = "Error. File not found: {}\n".format(temp_path)
        error_msg += "(You can avoid this error message by running extra/intersect-gt-and-dr.py)"
        error(error_msg)
    lines_list = file_lines_to_list(txt_file)
    # create ground-truth dictionary
    bounding_boxes = []
    already_seen_classes = set()
    for line in lines_list:
        # The flag is recomputed for every line. Previously it was a sticky
        # variable that was only reset after an append, so a "difficult" line
        # whose class was in the ignore list leaked its flag onto the next object.
        is_difficult = "difficult" in line
        try:
            if is_difficult:
                class_name, left, top, right, bottom, _difficult = line.split()
            else:
                class_name, left, top, right, bottom = line.split()
        except ValueError:
            error_msg = "Error: File " + txt_file + " in the wrong format.\n"
            error_msg += " Expected: <class_name> <left> <top> <right> <bottom> ['difficult']\n"
            error_msg += " Received: " + line
            error_msg += "\n\nIf you have a <class_name> with spaces between words you should remove them\n"
            error_msg += "by running the script \"remove_space.py\" or \"rename_class.py\" in the \"extra/\" folder."
            error(error_msg)
        # check if class is in the ignore list, if yes skip
        if class_name in args.ignore:
            continue
        bbox = left + " " + top + " " + right + " " + bottom
        if is_difficult:
            # difficult objects are stored but never counted
            bounding_boxes.append({"class_name": class_name, "bbox": bbox, "used": False, "difficult": True})
            continue
        bounding_boxes.append({"class_name": class_name, "bbox": bbox, "used": False})
        # count that object
        gt_counter_per_class[class_name] = gt_counter_per_class.get(class_name, 0) + 1
        # count the file once per class, however many objects of the class it holds
        if class_name not in already_seen_classes:
            counter_images_per_class[class_name] = counter_images_per_class.get(class_name, 0) + 1
            already_seen_classes.add(class_name)

    # dump bounding_boxes into a ".json" file
    new_temp_file = TEMP_FILES_PATH + "/" + file_id + "_ground_truth.json"
    gt_files.append(new_temp_file)
    with open(new_temp_file, 'w') as outfile:
        json.dump(bounding_boxes, outfile)

# let's sort the classes alphabetically
gt_classes = sorted(gt_counter_per_class)
n_classes = len(gt_classes)
429 | #print(gt_classes)
430 | #print(gt_counter_per_class)
431 |
# Validate the --set-class-iou flag (only when it was given): the arguments
# must come in [class, IoU] pairs, every class must exist in the ground-truth
# and every IoU must pass is_float_between_0_and_1.
if specific_iou_flagged:
    error_msg = \
        '\n --set-class-iou [class_1] [IoU_1] [class_2] [IoU_2] [...]'
    n_args = len(args.set_class_iou)
    if n_args % 2:
        error('Error, missing arguments. Flag usage:' + error_msg)
    # flat list layout: [class_1] [IoU_1] [class_2] [IoU_2]
    # even positions -> class names, odd positions -> IoU thresholds
    specific_iou_classes = args.set_class_iou[0::2]
    iou_list = args.set_class_iou[1::2]
    if len(iou_list) != len(specific_iou_classes):
        error('Error, missing arguments. Flag usage:' + error_msg)
    for flagged_class in specific_iou_classes:
        if flagged_class not in gt_classes:
            error('Error, unknown class \"' + flagged_class + '\". Flag usage:' + error_msg)
    for iou_value in iou_list:
        if not is_float_between_0_and_1(iou_value):
            error('Error, IoU must be between 0.0 and 1.0. Flag usage:' + error_msg)
455 |
# detection-results
#   Load each of the detection-results files into a temporary ".json" file
#   (one file per ground-truth class, sorted by decreasing confidence).

# get a list with the detection-results files
dr_files_list = glob.glob(DR_PATH + '/*.txt')
dr_files_list.sort()

# Every detection file is parsed exactly once and its lines are distributed
# into per-class buckets (previously each file was re-read for every class).
# Detections of classes that are absent from the ground-truth are dropped.
detections_per_class = {class_name: [] for class_name in gt_classes}
if detections_per_class:
    for txt_file in dr_files_list:
        file_id = txt_file.split(".txt", 1)[0]
        file_id = os.path.basename(os.path.normpath(file_id))
        # every detection-results file needs a corresponding ground-truth file
        temp_path = os.path.join(GT_PATH, (file_id + ".txt"))
        if not os.path.exists(temp_path):
            error_msg = "Error. File not found: {}\n".format(temp_path)
            error_msg += "(You can avoid this error message by running extra/intersect-gt-and-dr.py)"
            error(error_msg)
        for line in file_lines_to_list(txt_file):
            try:
                tmp_class_name, confidence, left, top, right, bottom = line.split()
            except ValueError:
                error_msg = "Error: File " + txt_file + " in the wrong format.\n"
                error_msg += " Expected: <class_name> <confidence> <left> <top> <right> <bottom>\n"
                error_msg += " Received: " + line
                error(error_msg)
            if tmp_class_name in detections_per_class:
                bbox = left + " " + top + " " + right + " " + bottom
                detections_per_class[tmp_class_name].append(
                    {"confidence": confidence, "file_id": file_id, "bbox": bbox})

for class_index, class_name in enumerate(gt_classes):
    bounding_boxes = detections_per_class[class_name]
    # sort detection-results by decreasing confidence
    bounding_boxes.sort(key=lambda x: float(x['confidence']), reverse=True)
    with open(TEMP_FILES_PATH + "/" + class_name + "_dr.json", 'w') as outfile:
        json.dump(bounding_boxes, outfile)
495 |
# Calculate the AP for each class.
#
# For every ground-truth class the detections (already sorted by decreasing
# confidence) are matched against the ground-truth boxes of the same image;
# precision/recall, AP and the log-average miss rate are derived from the
# cumulative true/false positive counts and written to "output.txt".
sum_AP = 0.0
ap_dictionary = {}
lamr_dictionary = {}
# open file to store the output
with open(output_files_path + "/output.txt", 'w') as output_file:
    output_file.write("# AP and precision/recall per class\n")
    count_true_positives = {}
    for class_index, class_name in enumerate(gt_classes):
        count_true_positives[class_name] = 0
        # Load detection-results of that class
        dr_file = TEMP_FILES_PATH + "/" + class_name + "_dr.json"
        with open(dr_file) as dr_json:  # closed right after loading
            dr_data = json.load(dr_json)

        # Assign detection-results to ground-truth objects
        nd = len(dr_data)
        tp = [0] * nd  # creates an array of zeros of size nd
        fp = [0] * nd
        for idx, detection in enumerate(dr_data):
            file_id = detection["file_id"]
            if show_animation:
                # find ground truth image
                ground_truth_img = glob.glob1(IMG_PATH, file_id + ".*")
                if len(ground_truth_img) == 0:
                    error("Error. Image not found with id: " + file_id)
                elif len(ground_truth_img) > 1:
                    error("Error. Multiple image with id: " + file_id)
                else:  # found image
                    # Load image
                    img = cv2.imread(IMG_PATH + "/" + ground_truth_img[0])
                    # load image with draws of multiple detections
                    img_cumulative_path = output_files_path + "/images/" + ground_truth_img[0]
                    if os.path.isfile(img_cumulative_path):
                        img_cumulative = cv2.imread(img_cumulative_path)
                    else:
                        img_cumulative = img.copy()
                    # Add bottom border to image
                    bottom_border = 60
                    BLACK = [0, 0, 0]
                    img = cv2.copyMakeBorder(img, 0, bottom_border, 0, 0, cv2.BORDER_CONSTANT, value=BLACK)
            # assign detection-results to ground truth object if any
            # open ground-truth with that file_id; it is re-read for every
            # detection because the "used" flags are persisted in the file
            gt_file = TEMP_FILES_PATH + "/" + file_id + "_ground_truth.json"
            with open(gt_file) as gt_json:
                ground_truth_data = json.load(gt_json)
            ovmax = -1
            gt_match = -1
            # load detected object bounding-box
            bb = [float(x) for x in detection["bbox"].split()]
            for obj in ground_truth_data:
                # look for a class_name match
                if obj["class_name"] == class_name:
                    bbgt = [float(x) for x in obj["bbox"].split()]
                    bi = [max(bb[0], bbgt[0]), max(bb[1], bbgt[1]), min(bb[2], bbgt[2]), min(bb[3], bbgt[3])]
                    iw = bi[2] - bi[0] + 1
                    ih = bi[3] - bi[1] + 1
                    if iw > 0 and ih > 0:
                        # compute overlap (IoU) = area of intersection / area of union
                        ua = (bb[2] - bb[0] + 1) * (bb[3] - bb[1] + 1) + (bbgt[2] - bbgt[0]
                              + 1) * (bbgt[3] - bbgt[1] + 1) - iw * ih
                        ov = iw * ih / ua
                        if ov > ovmax:
                            ovmax = ov
                            gt_match = obj

            # assign detection as true positive/don't care/false positive
            if show_animation:
                status = "NO MATCH FOUND!"  # status is only used in the animation
            # set minimum overlap
            min_overlap = MINOVERLAP
            if specific_iou_flagged:
                if class_name in specific_iou_classes:
                    index = specific_iou_classes.index(class_name)
                    min_overlap = float(iou_list[index])
            if ovmax >= min_overlap:
                # a match with a "difficult" object is neither a TP nor a FP
                if "difficult" not in gt_match:
                    if not bool(gt_match["used"]):
                        # true positive
                        tp[idx] = 1
                        gt_match["used"] = True
                        count_true_positives[class_name] += 1
                        # update the ".json" file
                        with open(gt_file, 'w') as f:
                            f.write(json.dumps(ground_truth_data))
                        if show_animation:
                            status = "MATCH!"
                    else:
                        # false positive (multiple detection)
                        fp[idx] = 1
                        if show_animation:
                            status = "REPEATED MATCH!"
            else:
                # false positive
                fp[idx] = 1
                if ovmax > 0:
                    status = "INSUFFICIENT OVERLAP"

            # Draw image to show animation
            if show_animation:
                height, widht = img.shape[:2]
                # colors (OpenCV works with BGR)
                white = (255, 255, 255)
                light_blue = (255, 200, 100)
                green = (0, 255, 0)
                light_red = (30, 30, 255)
                # 1st line
                margin = 10
                v_pos = int(height - margin - (bottom_border / 2.0))
                text = "Image: " + ground_truth_img[0] + " "
                img, line_width = draw_text_in_image(img, text, (margin, v_pos), white, 0)
                text = "Class [" + str(class_index) + "/" + str(n_classes) + "]: " + class_name + " "
                img, line_width = draw_text_in_image(img, text, (margin + line_width, v_pos), light_blue, line_width)
                if ovmax != -1:
                    color = light_red
                    if status == "INSUFFICIENT OVERLAP":
                        text = "IoU: {0:.2f}% ".format(ovmax*100) + "< {0:.2f}% ".format(min_overlap*100)
                    else:
                        text = "IoU: {0:.2f}% ".format(ovmax*100) + ">= {0:.2f}% ".format(min_overlap*100)
                        color = green
                    img, _ = draw_text_in_image(img, text, (margin + line_width, v_pos), color, line_width)
                # 2nd line
                v_pos += int(bottom_border / 2.0)
                rank_pos = str(idx+1)  # rank position (idx starts at 0)
                text = "Detection #rank: " + rank_pos + " confidence: {0:.2f}% ".format(float(detection["confidence"])*100)
                img, line_width = draw_text_in_image(img, text, (margin, v_pos), white, 0)
                color = light_red
                if status == "MATCH!":
                    color = green
                text = "Result: " + status + " "
                img, line_width = draw_text_in_image(img, text, (margin + line_width, v_pos), color, line_width)

                font = cv2.FONT_HERSHEY_SIMPLEX
                if ovmax > 0:  # if there is intersections between the bounding-boxes
                    bbgt = [int(round(float(x))) for x in gt_match["bbox"].split()]
                    cv2.rectangle(img, (bbgt[0], bbgt[1]), (bbgt[2], bbgt[3]), light_blue, 2)
                    cv2.rectangle(img_cumulative, (bbgt[0], bbgt[1]), (bbgt[2], bbgt[3]), light_blue, 2)
                    cv2.putText(img_cumulative, class_name, (bbgt[0], bbgt[1] - 5), font, 0.6, light_blue, 1, cv2.LINE_AA)
                bb = [int(i) for i in bb]
                cv2.rectangle(img, (bb[0], bb[1]), (bb[2], bb[3]), color, 2)
                cv2.rectangle(img_cumulative, (bb[0], bb[1]), (bb[2], bb[3]), color, 2)
                cv2.putText(img_cumulative, class_name, (bb[0], bb[1] - 5), font, 0.6, color, 1, cv2.LINE_AA)
                # show image
                cv2.imshow("Animation", img)
                cv2.waitKey(20)  # show for 20 ms
                # save image to output
                output_img_path = output_files_path + "/images/detections_one_by_one/" + class_name + "_detection" + str(idx) + ".jpg"
                cv2.imwrite(output_img_path, img)
                # save the image with all the objects drawn to it
                cv2.imwrite(img_cumulative_path, img_cumulative)

        # compute precision/recall: turn the 0/1 flags into running totals
        cumsum = 0
        for idx, val in enumerate(fp):
            fp[idx] += cumsum
            cumsum += val
        cumsum = 0
        for idx, val in enumerate(tp):
            tp[idx] += cumsum
            cumsum += val
        rec = tp[:]
        for idx, val in enumerate(tp):
            rec[idx] = float(tp[idx]) / gt_counter_per_class[class_name]
        prec = tp[:]
        for idx, val in enumerate(tp):
            # fp + tp is still 0 while the highest-ranked detections matched
            # only "difficult" objects; report precision 0 instead of dividing by 0
            counted = fp[idx] + tp[idx]
            prec[idx] = float(tp[idx]) / counted if counted > 0 else 0.0

        ap, mrec, mprec = voc_ap(rec[:], prec[:])
        sum_AP += ap
        text = "{0:.2f}%".format(ap*100) + " = " + class_name + " AP "
        # Write to output.txt
        rounded_prec = ['%.2f' % elem for elem in prec]
        rounded_rec = ['%.2f' % elem for elem in rec]
        output_file.write(text + "\n Precision: " + str(rounded_prec) + "\n Recall :" + str(rounded_rec) + "\n\n")
        if not args.quiet:
            print(text)
        ap_dictionary[class_name] = ap

        n_images = counter_images_per_class[class_name]
        lamr, mr, fppi = log_average_miss_rate(np.array(prec), np.array(rec), n_images)
        lamr_dictionary[class_name] = lamr

        # Draw plot
        if draw_plot:
            plt.plot(rec, prec, '-o')
            # add a new penultimate point to the list (mrec[-2], 0.0)
            # since the last line segment (and respective area) do not affect the AP value
            area_under_curve_x = mrec[:-1] + [mrec[-2]] + [mrec[-1]]
            area_under_curve_y = mprec[:-1] + [0.0] + [mprec[-1]]
            plt.fill_between(area_under_curve_x, 0, area_under_curve_y, alpha=0.2, edgecolor='r')
            # set window title through the figure manager: the canvas-level
            # set_window_title() is gone from recent matplotlib releases
            fig = plt.gcf()  # gcf - get current figure
            fig.canvas.manager.set_window_title('AP ' + class_name)
            # set plot title
            plt.title('class: ' + text)
            # set axis titles
            plt.xlabel('Recall')
            plt.ylabel('Precision')
            # optional - set axes
            axes = plt.gca()  # gca - get current axes
            axes.set_xlim([0.0, 1.0])
            axes.set_ylim([0.0, 1.05])  # .05 to give some extra space
            # save the plot
            fig.savefig(output_files_path + "/classes/" + class_name + ".png")
            plt.cla()  # clear axes for next plot

    if show_animation:
        cv2.destroyAllWindows()

    output_file.write("\n# mAP of all classes\n")
    # no class is left when every ground-truth object was ignored or difficult
    mAP = sum_AP / n_classes if n_classes else 0.0
    text = "mAP = {0:.2f}%".format(mAP*100)
    output_file.write(text + "\n")
    print(text)
731 |
# Draw false negatives: every ground-truth object that was never matched
# ("used" is still False) is outlined in pink on the cumulative image.
if show_animation:
    pink = (203, 192, 255)
    for tmp_file in gt_files:
        with open(tmp_file) as gt_json:  # closed right after loading
            ground_truth_data = json.load(gt_json)
        # get name of corresponding image
        start = TEMP_FILES_PATH + '/'
        img_id = tmp_file[tmp_file.find(start)+len(start):tmp_file.rfind('_ground_truth.json')]
        # Cumulative images are saved under the source image's own file name,
        # so the extension is looked up instead of assuming ".jpg".
        candidates = sorted(glob.glob(os.path.join(IMG_PATH, glob.escape(img_id) + ".*")))
        if not candidates:
            print("Warning: no image found with id: " + img_id)
            continue
        img_name = os.path.basename(candidates[0])
        img_cumulative_path = output_files_path + "/images/" + img_name
        img = cv2.imread(img_cumulative_path)
        if img is None:
            # nothing was drawn on this image so far: start from the source image
            img_path = IMG_PATH + '/' + img_name
            img = cv2.imread(img_path)
        if img is None:
            # unreadable image: skip it instead of crashing inside OpenCV
            print("Warning: could not read image: " + img_name)
            continue
        # draw false negatives
        for obj in ground_truth_data:
            if not obj['used']:
                bbgt = [int(round(float(x))) for x in obj["bbox"].split()]
                cv2.rectangle(img, (bbgt[0], bbgt[1]), (bbgt[2], bbgt[3]), pink, 2)
        cv2.imwrite(img_cumulative_path, img)

# remove the temp_files directory
shutil.rmtree(TEMP_FILES_PATH)
757 |
# Count total of detection-results: number of detected objects per class
# over all detection-results files, leaving out the ignored classes.
det_counter_per_class = {}
for txt_file in dr_files_list:
    for line in file_lines_to_list(txt_file):
        # the class name is the first whitespace-separated field of the line
        class_name = line.split()[0]
        if class_name not in args.ignore:
            det_counter_per_class[class_name] = det_counter_per_class.get(class_name, 0) + 1
dr_classes = list(det_counter_per_class)
779 |
780 |
# Plot the total number of occurrences of each class in the ground-truth
if draw_plot:
    window_title = "ground-truth-info"
    plot_title = "ground-truth\n" + "({} files and {} classes)".format(
        len(ground_truth_files_list), n_classes)
    x_label = "Number of objects per class"
    output_path = output_files_path + "/ground-truth-info.png"
    to_show = False
    plot_color = 'forestgreen'
    # the last argument (true positives per class) is not used for this plot
    draw_plot_func(gt_counter_per_class, n_classes, window_title, plot_title,
                   x_label, output_path, to_show, plot_color, '')
803 |
# Append the number of ground-truth objects per class to output.txt
with open(output_files_path + "/output.txt", 'a') as output_file:
    output_file.write("\n# Number of ground-truth objects per class\n")
    for class_name in sorted(gt_counter_per_class):
        output_file.write("{}: {}\n".format(class_name, gt_counter_per_class[class_name]))
811 |
# Finish counting true positives: a class that shows up only in the
# detection-results (never in the ground-truth) cannot have true positives.
for class_name in dr_classes:
    if class_name in gt_classes:
        continue
    count_true_positives[class_name] = 0
819 | #print(count_true_positives)
820 |
# Plot the total number of occurrences of each class in the "detection-results" folder
if draw_plot:
    window_title = "detection-results-info"
    # number of classes with at least one detection, shown in the title
    count_non_zero_values_in_dictionary = sum(
        1 for x in det_counter_per_class.values() if int(x) > 0)
    plot_title = "detection-results\n"
    plot_title += "({} files and ".format(len(dr_files_list))
    plot_title += "{} detected classes)".format(count_non_zero_values_in_dictionary)
    x_label = "Number of objects per class"
    output_path = output_files_path + "/detection-results-info.png"
    to_show = False
    plot_color = 'forestgreen'
    # true positives per class, passed on as the last argument
    true_p_bar = count_true_positives
    draw_plot_func(det_counter_per_class, len(det_counter_per_class),
                   window_title, plot_title, x_label, output_path,
                   to_show, plot_color, true_p_bar)
848 |
# Append the number of detected objects per class (with the tp/fp split) to output.txt
with open(output_files_path + "/output.txt", 'a') as output_file:
    output_file.write("\n# Number of detected objects per class\n")
    for class_name in sorted(dr_classes):
        n_det = det_counter_per_class[class_name]
        n_true = count_true_positives[class_name]
        # every detection that is not a true positive is reported as false positive
        text = "{}: {} (tp:{}, fp:{})\n".format(class_name, n_det, n_true, n_det - n_true)
        output_file.write(text)
860 |
# Draw log-average miss rate plot (one bar per class)
if draw_plot:
    window_title = "lamr"
    plot_title = x_label = "log-average miss rate"
    output_path = output_files_path + "/lamr.png"
    to_show = False
    plot_color = 'royalblue'
    draw_plot_func(lamr_dictionary, n_classes, window_title, plot_title,
                   x_label, output_path, to_show, plot_color, "")
882 |
# Draw mAP plot (one bar per class with its AP); this is the only plot
# that is also displayed (to_show is True).
if draw_plot:
    window_title = "mAP"
    plot_title = "mAP = {0:.2f}%".format(mAP*100)
    x_label = "Average Precision"
    output_path = output_files_path + "/mAP.png"
    to_show = True
    plot_color = 'royalblue'
    draw_plot_func(ap_dictionary, n_classes, window_title, plot_title,
                   x_label, output_path, to_show, plot_color, "")
904 |
--------------------------------------------------------------------------------
/objectDetection.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 | """
3 | 训练常基于dark-net的YOLOv3网络,目标检测
4 | """
5 | from __future__ import absolute_import
6 | from __future__ import division
7 | from __future__ import print_function
8 | import os
9 |
10 | os.environ["FLAGS_fraction_of_gpu_memory_to_use"] = '0.92'
11 | os.environ["FLAGS_eager_delete_tensor_gb"] = '0'
12 | os.environ["FLAGS_memory_fraction_of_eager_deletion"] = '1'
13 | os.environ["FLAGS_fast_eager_deletion_mode"]='True'
14 |
15 | import uuid
16 | import numpy as np
17 | import time
18 | import six
19 | import math
20 | import random
21 | import paddle
22 | import paddle.fluid as fluid
23 | import logging
24 | import xml.etree.ElementTree
25 | import codecs
26 | import json
27 |
28 | from paddle.fluid.initializer import MSRA
29 | from paddle.fluid.param_attr import ParamAttr
30 | from paddle.fluid.regularizer import L2Decay
31 | from PIL import Image, ImageEnhance, ImageDraw, ImageFile
32 | ImageFile.LOAD_TRUNCATED_IMAGES = True
33 | Image.MAX_IMAGE_PIXELS = None
34 |
35 | logger = None # 日志对象
36 |
# Global configuration dictionary. The entries initialised to -1 / {} below
# (class_dim, label_dict, num_dict, image_count) are filled in at runtime by
# init_train_parameters().
train_params = {
    "data_dir": "data/data6045",  # dataset directory
    "train_list": "train.txt",  # training-set list file
    "eval_list": "eval.txt",
    "class_dim": -1,
    "label_dict": {},  # label dictionary
    "num_dict": {},
    "image_count": -1,
    "continue_train": True,  # whether to load the parameters of the previous run and continue training
    "pretrained": False,  # whether to use a pretrained model
    "pretrained_model_dir": "./pretrained-model",
    "save_model_dir": "./yolo-model",  # directory the model is saved to
    "model_prefix": "yolo-v3",  # model name prefix
    "freeze_dir": "freeze_model",
    "use_tiny": False,  # whether to use the reduced "tiny" model
    "max_box_num": 8,  # maximum number of objects in one image
    "num_epochs": 100,  # number of training epochs
    "train_batch_size": 7,  # keep the batch small for full YOLOv3 or memory blows up; tiny can use a larger one
    "use_gpu": True,  # whether to use the GPU
    "yolo_cfg": {  # parameters of the full YOLO model
        "input_size": [3, 448, 448],  # the original edge length is 608; reduced to 448 to speed up training and prediction
        "anchors": [7, 10, 12, 22, 24, 17, 22, 45, 46, 33, 43, 88, 85, 66, 115, 146, 275, 240],  # anchors (original author was unsure: "??")
        "anchor_mask": [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
    },
    "yolo_tiny_cfg": {  # parameters of the YOLO tiny model
        "input_size": [3, 256, 256],
        "anchors": [6, 8, 13, 15, 22, 34, 48, 50, 81, 100, 205, 191],
        "anchor_mask": [[3, 4, 5], [0, 1, 2]]
    },
    "ignore_thresh": 0.7,
    "mean_rgb": [127.5, 127.5, 127.5],
    "mode": "train",
    "multi_data_reader_count": 4,
    "apply_distort": True,  # whether to apply image-distortion augmentation
    "nms_top_k": 300,
    "nms_pos_k": 300,
    "valid_thresh": 0.01,
    "nms_thresh": 0.40,  # non-maximum-suppression threshold
    "image_distort_strategy": {  # image distortion strategy
        "expand_prob": 0.5,  # expansion probability
        "expand_max_ratio": 4,
        "hue_prob": 0.5,  # hue
        "hue_delta": 18,
        "contrast_prob": 0.5,  # contrast
        "contrast_delta": 0.5,
        "saturation_prob": 0.5,  # saturation
        "saturation_delta": 0.5,
        "brightness_prob": 0.5,  # brightness
        "brightness_delta": 0.125
    },
    "sgd_strategy": {  # gradient-descent configuration
        "learning_rate": 0.002,
        "lr_epochs": [30, 50, 65],  # learning-rate decay boundaries (3 numbers give 4 segments)
        "lr_decay": [1, 0.5, 0.25, 0.1]  # learning-rate factor of each segment, matching the 4 segments of lr_epochs
    },
    "early_stop": {
        "sample_frequency": 50,
        "successive_limit": 3,
        "min_loss": 2.5,
        "min_curr_map": 0.84
    }
}
99 |
100 |
def init_train_parameters():
    """
    Fill in the data-dependent entries of ``train_params``.

    Reads ``<data_dir>/label_list`` (one label per line) to build the
    index -> label map (``num_dict``), the label -> index map
    (``label_dict``) and the class count (``class_dim``), then counts the
    lines of the training list file to set ``image_count``.
    :return: None
    """
    data_dir = train_params['data_dir']
    train_list_path = os.path.join(data_dir, train_params['train_list'])  # training set
    label_path = os.path.join(data_dir, "label_list")  # label file

    # codecs is used so the files are decoded as UTF-8
    with codecs.open(label_path, encoding='utf-8') as label_file:
        labels = [raw.strip() for raw in label_file]
    for class_id, label in enumerate(labels):
        train_params['num_dict'][class_id] = label
        train_params['label_dict'][label] = class_id
    train_params['class_dim'] = len(labels)

    with codecs.open(train_list_path, encoding='utf-8') as list_file:
        # one line of the list file per training image
        train_params['image_count'] = sum(1 for _ in list_file)
122 |
123 |
# Logging configuration
def init_log_config():
    """
    Configure the root logger and publish it as the module-level ``logger``.

    The root logger is set to INFO and gets a file handler that writes to
    ``<cwd>/logs/train.log`` (the directory is created when missing; the
    file is opened in 'w' mode, i.e. truncated). Calling the function again
    does not attach a second handler for the same file, so records are not
    written twice.
    :return: None
    """
    global logger

    logger = logging.getLogger()  # root logger
    logger.setLevel(logging.INFO)
    log_path = os.path.join(os.getcwd(), 'logs')

    if not os.path.exists(log_path):  # create the log directory
        os.makedirs(log_path)

    log_name = os.path.join(log_path, 'train.log')  # training log file

    # FileHandler stores the absolute path in baseFilename; skip the setup
    # when a handler for this file is already attached
    target = os.path.abspath(log_name)
    for handler in logger.handlers:
        if isinstance(handler, logging.FileHandler) and handler.baseFilename == target:
            return

    fh = logging.FileHandler(log_name, mode='w')
    fh.setLevel(logging.DEBUG)

    formatter = logging.Formatter("%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s")
    fh.setFormatter(formatter)
    logger.addHandler(fh)


init_log_config()
145 |
146 |
147 | # 定义YOLO3网络结构:darknet-53
148 | class YOLOv3(object):
149 | def __init__(self, class_num, anchors, anchor_mask):
150 | self.outputs = [] # 网络最终模型
151 | self.downsample_ratio = 1 # 下采样率
152 | self.anchor_mask = anchor_mask # 计算卷积核???
153 | self.anchors = anchors # 锚点
154 | self.class_num = class_num # 类别数量
155 |
156 | self.yolo_anchors = []
157 | self.yolo_classes = []
158 |
159 | for mask_pair in self.anchor_mask:
160 | mask_anchors = []
161 | for mask in mask_pair:
162 | mask_anchors.append(self.anchors[2 * mask])
163 | mask_anchors.append(self.anchors[2 * mask + 1])
164 | self.yolo_anchors.append(mask_anchors)
165 | self.yolo_classes.append(class_num)
166 |
167 | def name(self):
168 | return 'YOLOv3'
169 |
170 | # 获取anchors
171 | def get_anchors(self):
172 | return self.anchors
173 |
174 | # 获取anchor_mask
175 | def get_anchor_mask(self):
176 | return self.anchor_mask
177 |
178 | def get_class_num(self):
179 | return self.class_num
180 |
181 | def get_downsample_ratio(self):
182 | return self.downsample_ratio
183 |
184 | def get_yolo_anchors(self):
185 | return self.yolo_anchors
186 |
187 | def get_yolo_classes(self):
188 | return self.yolo_classes
189 |
190 | # 卷积正则化函数: 卷积、批量正则化处理、leakrelu
    def conv_bn(self,
                input,  # input tensor
                num_filters,  # number of convolution filters
                filter_size,  # filter (kernel) size
                stride,  # stride
                padding,  # padding
                use_cudnn=True):
        """
        Convolution block: conv2d (no bias, no activation) -> batch_norm -> leaky_relu(0.1).

        :return: the output of the leaky_relu layer
        """
        # 2-D convolution
        conv = fluid.layers.conv2d(input=input,
                                   num_filters=num_filters,
                                   filter_size=filter_size,
                                   stride=stride,
                                   padding=padding,
                                   act=None,
                                   use_cudnn=use_cudnn,  # whether to use cuDNN (CUDA-accelerated kernels)
                                   param_attr=ParamAttr(initializer=fluid.initializer.Normal(0., 0.02)),
                                   bias_attr=False)

        # The batch_norm parameters should not be regularized, so they are
        # explicitly given a regularizer with coefficient 0.
        # Original author's note: using leaky relu inside batch_norm only allows
        # the default alpha=0.02; to set alpha it has to be applied separately.
        # Regularization is meant to prevent over-fitting.
        param_attr = ParamAttr(initializer=fluid.initializer.Normal(0., 0.02),
                               regularizer=L2Decay(0.))
        bias_attr = ParamAttr(initializer=fluid.initializer.Constant(0.0),
                              regularizer=L2Decay(0.))
        out = fluid.layers.batch_norm(input=conv, act=None,
                                      param_attr=param_attr,
                                      bias_attr=bias_attr)
        # leaky_relu: Leaky ReLU gives all negative values a non-zero slope (0.1 here)
        out = fluid.layers.leaky_relu(out, 0.1)
        return out
222 |
223 | # 通过卷积实现降采样
224 | # 如:原始图片大小448*448,降采样后大小为 ((448+2)-3)/2 + 1 = 224
225 | def down_sample(self, input, num_filters, filter_size=3, stride=2, padding=1):
226 | self.downsample_ratio *= 2 # 降采样率
227 | return self.conv_bn(input,
228 | num_filters=num_filters,
229 | filter_size=filter_size,
230 | stride=stride,
231 | padding=padding)
232 |
233 | # 基本块:包含两个卷积/正则化层,一个残差块
    def basic_block(self, input, num_filters):
        """
        Basic block: two conv_bn layers plus a residual (shortcut) connection.

        A 1x1 conv_bn with ``num_filters`` filters is followed by a 3x3 conv_bn
        with ``num_filters * 2`` filters; its output is added element-wise to
        ``input``.
        NOTE(review): the addition presumably needs ``input`` to have
        ``num_filters * 2`` channels - confirm against the callers.
        """
        conv1 = self.conv_bn(input, num_filters, filter_size=1, stride=1, padding=0)
        conv2 = self.conv_bn(conv1, num_filters * 2, filter_size=3, stride=1, padding=1)
        out = fluid.layers.elementwise_add(x=input, y=conv2, act=None)  # computes H(x) = F(x) + x
        return out
239 |
240 | # 创建多个basic_block
241 | def layer_warp(self, input, num_filters, count):
242 | res_out = self.basic_block(input, num_filters)
243 | for j in range(1, count):
244 | res_out = self.basic_block(res_out, num_filters)
245 | return res_out
246 |
247 | # 上采样
    def up_sample(self, input, scale=2):
        """
        Up-sample ``input`` by ``scale`` with nearest-neighbour resizing.

        The target size is computed at runtime from the input's shape so the
        layer works with dynamic input sizes.
        """
        # get dynamic upsample output shape
        shape_nchw = fluid.layers.shape(input)  # runtime shape of input
        # elements 2 and 3 of the shape (H and W, going by the variable names)
        shape_hw = fluid.layers.slice(shape_nchw, axes=[0], starts=[2], ends=[4])
        shape_hw.stop_gradient = True
        in_shape = fluid.layers.cast(shape_hw, dtype='int32')
        out_shape = in_shape * scale  # output spatial shape
        out_shape.stop_gradient = True

        # resize by actual_shape
        # enlarge the feature map (nearest-neighbour interpolation)
        out = fluid.layers.resize_nearest(input=input,
                                          scale=scale,
                                          actual_shape=out_shape)
        return out
263 |
def yolo_detection_block(self, input, num_filters):
    """Build the conv stack that precedes a YOLO output layer.

    Two (1x1, 3x3) conv pairs are followed by a 1x1 conv (``route``) and a
    3x3 conv on top of it (``tip``). The 3x3 convs use ``2 * num_filters``.

    :param input: feature map entering the block
    :param num_filters: filter count of the 1x1 convs; must be even
    :return: ``(route, tip)``
    :raises AssertionError: if ``num_filters`` is odd
    """
    assert num_filters % 2 == 0, "num_filters {} cannot be divided by 2".format(num_filters)

    wide = num_filters * 2
    feature = input
    for _ in (0, 1):
        feature = self.conv_bn(feature, num_filters, filter_size=1, stride=1, padding=0)
        feature = self.conv_bn(feature, wide, filter_size=3, stride=1, padding=1)
    route = self.conv_bn(feature, num_filters, filter_size=1, stride=1, padding=0)
    tip = self.conv_bn(route, wide, filter_size=3, stride=1, padding=1)
    return route, tip
274 |
275 | # 搭建网络模型 darknet-53
def net(self, img):
    """Build the backbone plus the YOLO detection heads and return the head outputs.

    Five stages of residual blocks are created, with a ``down_sample`` call
    before the first stage and between consecutive stages (five calls in
    total). The last three stage outputs feed the detection heads, deepest
    first; every head output is appended to ``self.outputs``.

    :param img: input image variable
    :return: ``self.outputs``, one raw prediction map per detection head
    """
    # Number of residual basic blocks in each stage.
    stages = [1, 2, 8, 8, 4]
    assert len(self.anchor_mask) <= len(stages), "anchor masks can't bigger than down_sample times"
    # First conv layer: stride 1, so the spatial size is unchanged.
    conv1 = self.conv_bn(img, num_filters=32, filter_size=3, stride=1, padding=1)
    # First down-sampling conv; the second argument (filter count) doubles the channels.
    downsample_ = self.down_sample(conv1, conv1.shape[1] * 2)
    blocks = []

    # Create the groups of basic blocks.
    for i, stage_count in enumerate(stages):
        block = self.layer_warp(downsample_,  # input
                                32 * (2 ** i),  # number of filters
                                stage_count)  # number of basic blocks
        blocks.append(block)
        if i < len(stages) - 1:  # down-sample after every stage but the last
            downsample_ = self.down_sample(block, block.shape[1] * 2)
    # Keep the last three stage outputs in reverse order (deepest first);
    # the cross-scale concatenation below relies on this order.
    blocks = blocks[-1:-4:-1]

    # yolo detector
    for i, block in enumerate(blocks):
        # Cross-scale link: join the up-sampled route of the previous head
        # with this stage's feature map along axis 1.
        if i > 0:
            block = fluid.layers.concat(input=[route, block], axis=1)

        route, tip = self.yolo_detection_block(block,  # input
                                               num_filters=512 // (2 ** i))  # number of filters

        param_attr = ParamAttr(initializer=fluid.initializer.Normal(0., 0.02))
        bias_attr = ParamAttr(initializer=fluid.initializer.Constant(0.0), regularizer=L2Decay(0.))
        block_out = fluid.layers.conv2d(input=tip,
                                        # 5 elements represent x|y|h|w|score
                                        num_filters=len(self.anchor_mask[i]) * (self.class_num + 5),
                                        filter_size=1,
                                        stride=1,
                                        padding=0,
                                        act=None,
                                        param_attr=param_attr,
                                        bias_attr=bias_attr)
        self.outputs.append(block_out)

        # Prepare the route for the next head: 1x1 conv, then up-sample
        # (nearest-neighbour) so it can be concatenated with the shallower map.
        if i < len(blocks) - 1:
            route = self.conv_bn(route, 256 // (2 ** i), filter_size=1, stride=1, padding=0)
            route = self.up_sample(route)  # up-sampling

    return self.outputs
323 |
324 | # Tiny(精简版)YOLO模型
class YOLOv3Tiny(object):
    """Reduced ("tiny") YOLOv3 network with two detection heads."""

    def __init__(self, class_num, anchors, anchor_mask):
        """Store the configuration and pre-compute the per-head anchor lists.

        :param class_num: number of object classes
        :param anchors: flat list of anchor sizes, two entries per anchor
        :param anchor_mask: one list of anchor indices per detection head
        """
        self.outputs = []
        self.downsample_ratio = 1
        self.anchor_mask = anchor_mask
        self.anchors = anchors
        self.class_num = class_num

        self.yolo_anchors = []
        self.yolo_classes = []
        for mask_pair in self.anchor_mask:
            mask_anchors = []
            for mask in mask_pair:
                # Anchor ``mask`` occupies entries 2*mask and 2*mask+1 of the flat list.
                mask_anchors.append(self.anchors[2 * mask])
                mask_anchors.append(self.anchors[2 * mask + 1])
            self.yolo_anchors.append(mask_anchors)
            self.yolo_classes.append(class_num)

    def name(self):
        """Return the model name."""
        return 'YOLOv3-tiny'

    def get_anchors(self):
        """Return the flat anchor list given to the constructor."""
        return self.anchors

    def get_anchor_mask(self):
        """Return the per-head anchor index lists."""
        return self.anchor_mask

    def get_class_num(self):
        """Return the number of classes."""
        return self.class_num

    def get_downsample_ratio(self):
        """Return the accumulated down-sampling ratio (doubled by each ``down_sample`` call)."""
        return self.downsample_ratio

    def get_yolo_anchors(self):
        """Return the anchors grouped per detection head."""
        return self.yolo_anchors

    def get_yolo_classes(self):
        """Return a list holding ``class_num`` once per detection head."""
        return self.yolo_classes

    def conv_bn(self,
                input,
                num_filters,
                filter_size,
                stride,
                padding,
                num_groups=1,
                use_cudnn=True):
        """Bias-free conv2d followed by batch-norm with ReLU activation.

        :return: the batch-norm output
        """
        conv = fluid.layers.conv2d(
            input=input,
            num_filters=num_filters,
            filter_size=filter_size,
            stride=stride,
            padding=padding,
            act=None,
            groups=num_groups,
            use_cudnn=use_cudnn,
            param_attr=ParamAttr(initializer=fluid.initializer.Normal(0., 0.02)),
            bias_attr=False)

        # batch_norm parameters should not be regularised, so a regulariser
        # with coefficient 0 is attached explicitly to mask them out.
        out = fluid.layers.batch_norm(
            input=conv, act='relu',
            param_attr=ParamAttr(initializer=fluid.initializer.Normal(0., 0.02), regularizer=L2Decay(0.)),
            bias_attr=ParamAttr(initializer=fluid.initializer.Constant(0.0), regularizer=L2Decay(0.)))

        return out

    def depthwise_conv_bn(self, input, filter_size=3, stride=1, padding=1):
        """``conv_bn`` with one group per input channel (filters == groups == channels)."""
        num_filters = input.shape[1]
        return self.conv_bn(input,
                            num_filters=num_filters,
                            filter_size=filter_size,
                            stride=stride,
                            padding=padding,
                            num_groups=num_filters)

    def down_sample(self, input, pool_size=2, pool_stride=2):
        """Max-pool ``input`` and double ``self.downsample_ratio``."""
        self.downsample_ratio *= 2
        return fluid.layers.pool2d(input=input, pool_type='max', pool_size=pool_size,
                                   pool_stride=pool_stride)

    def basic_block(self, input, num_filters):
        """3x3 ``conv_bn`` followed by ``down_sample``."""
        conv1 = self.conv_bn(input, num_filters, filter_size=3, stride=1, padding=1)
        out = self.down_sample(conv1)
        return out

    def up_sample(self, input, scale=2):
        """Nearest-neighbour resize of ``input`` by ``scale`` using the runtime shape."""
        # get dynamic upsample output shape
        shape_nchw = fluid.layers.shape(input)
        shape_hw = fluid.layers.slice(shape_nchw, axes=[0], starts=[2], ends=[4])
        shape_hw.stop_gradient = True
        in_shape = fluid.layers.cast(shape_hw, dtype='int32')
        out_shape = in_shape * scale
        out_shape.stop_gradient = True

        # resize by actual_shape
        out = fluid.layers.resize_nearest(
            input=input,
            scale=scale,
            actual_shape=out_shape)
        return out

    def yolo_detection_block(self, input, num_filters):
        """1x1 ``conv_bn`` (``route``) followed by a 3x3 ``conv_bn`` with 2x filters (``tip``)."""
        route = self.conv_bn(input, num_filters, filter_size=1, stride=1, padding=0)
        tip = self.conv_bn(route, num_filters * 2, filter_size=3, stride=1, padding=1)
        return route, tip

    def net(self, img):
        """Build the tiny backbone and its two detection heads.

        :param img: input image variable
        :return: ``self.outputs``, one raw prediction map per head
        """
        # darknet-tiny: filter count of each stage
        stages = [16, 32, 64, 128, 256, 512]
        assert len(self.anchor_mask) <= len(stages), "anchor masks can't bigger than down_sample times"
        tmp = img
        blocks = []
        for i, stage_count in enumerate(stages):
            if i == len(stages) - 1:
                # Last stage: no pooling.
                block = self.conv_bn(tmp, stage_count, filter_size=3, stride=1, padding=1)
                blocks.append(block)
                # NOTE(review): each of the next three calls reads blocks[-1], so
                # the two depthwise results are overwritten without being consumed
                # and only the 1x1 conv below feeds the detector. Confirm whether
                # chaining was intended before changing this.
                block = self.depthwise_conv_bn(blocks[-1])
                block = self.depthwise_conv_bn(blocks[-1])
                block = self.conv_bn(blocks[-1], stage_count * 2, filter_size=1, stride=1, padding=0)
                blocks.append(block)
            else:
                tmp = self.basic_block(tmp, stage_count)
                blocks.append(tmp)

        # Heads use the final block and the output of the fourth basic block.
        blocks = [blocks[-1], blocks[3]]

        # yolo detector
        for i, block in enumerate(blocks):
            # Cross-scale link: join the up-sampled route with the shallower map.
            if i > 0:
                block = fluid.layers.concat(input=[route, block], axis=1)
            if i < 1:
                route, tip = self.yolo_detection_block(block, num_filters=256 // (2 ** i))
            else:
                tip = self.conv_bn(block, num_filters=256, filter_size=3, stride=1, padding=1)

            param_attr = ParamAttr(initializer=fluid.initializer.Normal(0., 0.02))
            bias_attr = ParamAttr(initializer=fluid.initializer.Constant(0.0), regularizer=L2Decay(0.))
            block_out = fluid.layers.conv2d(input=tip,
                                            # 5 elements represent x|y|h|w|score
                                            num_filters=len(self.anchor_mask[i]) * (self.class_num + 5),
                                            filter_size=1,
                                            stride=1,
                                            padding=0,
                                            act=None,
                                            param_attr=param_attr,
                                            bias_attr=bias_attr)
            self.outputs.append(block_out)
            # Prepare the route for the next head: 1x1 conv, then up-sample.
            if i < len(blocks) - 1:
                route = self.conv_bn(route, 128 // (2 ** i), filter_size=1, stride=1, padding=0)
                route = self.up_sample(route)

        return self.outputs
481 |
482 |
def get_yolo(is_tiny, class_num, anchors, anchor_mask):
    """Instantiate the requested YOLOv3 variant.

    :param is_tiny: truthy selects ``YOLOv3Tiny``, otherwise ``YOLOv3``
    :param class_num: number of object classes
    :param anchors: flat anchor list
    :param anchor_mask: per-head anchor index lists
    :return: the constructed model object
    """
    model_cls = YOLOv3Tiny if is_tiny else YOLOv3
    return model_cls(class_num, anchors, anchor_mask)
488 |
489 |
class Sampler(object):
    """Plain container for the settings of a patch sampler."""

    def __init__(self, max_sample, max_trial, min_scale, max_scale,
                 min_aspect_ratio, max_aspect_ratio, min_jaccard_overlap,
                 max_jaccard_overlap):
        """Store every argument unchanged on the instance under the same name."""
        self.max_sample, self.max_trial = max_sample, max_trial
        self.min_scale, self.max_scale = min_scale, max_scale
        self.min_aspect_ratio, self.max_aspect_ratio = min_aspect_ratio, max_aspect_ratio
        self.min_jaccard_overlap, self.max_jaccard_overlap = min_jaccard_overlap, max_jaccard_overlap
506 |
507 |
class bbox(object):
    """Bounding rectangle described by its two corners."""

    def __init__(self, xmin, ymin, xmax, ymax):
        """Store the corner coordinates unchanged."""
        self.xmin, self.ymin = xmin, ymin
        self.xmax, self.ymax = xmax, ymax
518 |
519 |
520 | # 坐标转换,由[x1, y1, w, h]转换为[center_x, center_y, w, h]
521 | # 并转换为范围在[0, 1]之间的相对坐标
def box_to_center_relative(box, img_height, img_width):
    """Turn a pixel ``[x1, y1, w, h]`` box into relative ``[cx, cy, w, h]``.

    The box is first clipped to the image (right/bottom edges are treated as
    the inclusive pixels ``x + w - 1`` / ``y + h - 1``), then the centre and
    size are divided by the image width / height.

    :param box: sequence of four numbers ``[x1, y1, w, h]``
    :param img_height: image height in pixels
    :param img_width: image width in pixels
    :return: ``np.array([center_x, center_y, width, height])``
    :raises AssertionError: if ``box`` does not have exactly four entries
    """
    assert len(box) == 4, "box should be a len(4) list or tuple"
    left, top, box_w, box_h = box

    # Clip to the image bounds.
    x_lo = max(left, 0)
    x_hi = min(left + box_w - 1, img_width - 1)
    y_lo = max(top, 0)
    y_hi = min(top + box_h - 1, img_height - 1)

    return np.array([
        (x_lo + x_hi) / 2 / img_width,   # relative centre x
        (y_lo + y_hi) / 2 / img_height,  # relative centre y
        (x_hi - x_lo) / img_width,       # relative width
        (y_hi - y_lo) / img_height,      # relative height
    ])
542 |
543 |
544 | # 调整图像大小
def resize_img(img, sampled_labels, input_size):
    """Resize ``img`` to the network input size with bilinear interpolation.

    ``input_size`` is laid out as ``[channels, height, width]`` while
    ``Image.resize`` expects ``(width, height)``, so the two entries are
    swapped when building the target size. (The previous code passed
    ``(height, width)``, which is only correct for square inputs.)

    :param img: PIL image
    :param sampled_labels: unused, kept for interface compatibility
    :param input_size: ``[channels, height, width]`` of the network input
    :return: the resized PIL image
    """
    target_h, target_w = input_size[1], input_size[2]
    return img.resize((target_w, target_h), Image.BILINEAR)
549 |
550 |
551 | # 计算交并比
def box_iou_xywh(box1, box2):
    """Compute the IoU between boxes given as ``[center_x, center_y, w, h]``.

    Widths, heights and areas are plain coordinate differences, without the
    "+1" inclusive-pixel offset. The caller in this file passes normalised
    [0, 1] coordinates, where that offset made disjoint boxes report a
    non-zero overlap.

    :param box1: array of shape ``(N, 4)`` (or ``(1, 4)``, broadcast against ``box2``)
    :param box2: array of shape ``(M, 4)``
    :return: array of IoU values; 0 where the union area is 0
    :raises AssertionError: if the last dimension of either input is not 4
    """
    assert box1.shape[-1] == 4, "Box1 shape[-1] should be 4."
    assert box2.shape[-1] == 4, "Box2 shape[-1] should be 4."

    def _corners(boxes):
        # (cx, cy, w, h) -> (x1, y1, x2, y2)
        half_w, half_h = boxes[:, 2] / 2, boxes[:, 3] / 2
        return (boxes[:, 0] - half_w, boxes[:, 1] - half_h,
                boxes[:, 0] + half_w, boxes[:, 1] + half_h)

    b1_x1, b1_y1, b1_x2, b1_y2 = _corners(box1)
    b2_x1, b2_y1, b2_x2, b2_y2 = _corners(box2)

    # Intersection extent, clamped to zero for disjoint boxes.
    inter_w = np.maximum(np.minimum(b1_x2, b2_x2) - np.maximum(b1_x1, b2_x1), 0)
    inter_h = np.maximum(np.minimum(b1_y2, b2_y2) - np.maximum(b1_y1, b2_y1), 0)
    inter_area = inter_w * inter_h

    b1_area = (b1_x2 - b1_x1) * (b1_y2 - b1_y1)
    b2_area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1)
    union_area = b1_area + b2_area - inter_area

    # Degenerate (zero-area) pairs yield 0 instead of NaN.
    iou = np.zeros_like(union_area, dtype='float64')
    np.divide(inter_area, union_area, out=iou, where=union_area > 0)
    return iou
576 |
577 |
578 | # box裁剪
def box_crop(boxes, labels, crop, img_shape):
    """Re-express relative ``[cx, cy, w, h]`` boxes inside a crop window.

    A box is kept only if its centre lies inside the crop and it still has a
    positive extent after clipping. Dropped boxes and their labels are
    multiplied by zero rather than removed, so output shapes match the inputs.

    :param boxes: array ``(N, 4)`` of boxes relative to the full image
    :param labels: array ``(N,)`` of labels
    :param crop: ``(x, y, w, h)`` of the crop window in pixels
    :param img_shape: ``(width, height)`` of the full image in pixels
    :return: ``(boxes, labels, kept_count)`` with boxes relative to the crop
    """
    crop_x, crop_y, crop_w, crop_h = [float(v) for v in crop]
    im_w, im_h = [float(v) for v in img_shape]

    # Relative centre/size -> absolute corner coordinates.
    out = boxes.copy()
    half_w, half_h = out[:, 2] / 2, out[:, 3] / 2
    center_x, center_y = out[:, 0].copy(), out[:, 1].copy()
    out[:, 0] = (center_x - half_w) * im_w
    out[:, 2] = (center_x + half_w) * im_w
    out[:, 1] = (center_y - half_h) * im_h
    out[:, 3] = (center_y + half_h) * im_h

    window = np.array([crop_x, crop_y, crop_x + crop_w, crop_y + crop_h])
    mid = (out[:, :2] + out[:, 2:]) / 2.0
    keep = np.logical_and(window[:2] <= mid, mid <= window[2:]).all(axis=1)

    # Clip to the window, then shift the origin to the window's corner.
    out[:, :2] = np.maximum(out[:, :2], window[:2])
    out[:, 2:] = np.minimum(out[:, 2:], window[2:])
    out[:, :2] -= window[:2]
    out[:, 2:] -= window[:2]

    keep = np.logical_and(keep, (out[:, :2] < out[:, 2:]).all(axis=1))
    keep_f = keep.astype('float32')
    out = out * keep_f[:, np.newaxis]
    labels = labels * keep_f

    # Absolute corners -> centre/size relative to the crop.
    x_lo, y_lo = out[:, 0].copy(), out[:, 1].copy()
    x_hi, y_hi = out[:, 2].copy(), out[:, 3].copy()
    out[:, 0] = (x_lo + x_hi) / 2 / crop_w
    out[:, 2] = (x_hi - x_lo) / crop_w
    out[:, 1] = (y_lo + y_hi) / 2 / crop_h
    out[:, 3] = (y_hi - y_lo) / crop_h

    return out, labels, keep.sum()
603 |
604 |
605 | # 图像增加:对比度,饱和度,明暗,颜色,扩张
def random_brightness(img):
    """Randomly scale the brightness of a PIL image.

    Applied when a uniform draw is below ``brightness_prob``; the factor is
    ``1 + U(-brightness_delta, brightness_delta)``.

    :param img: PIL image
    :return: the adjusted image, or ``img`` itself when skipped
    """
    roll = np.random.uniform(0, 1)
    strategy = train_params['image_distort_strategy']
    if not roll < strategy['brightness_prob']:
        return img

    max_delta = strategy['brightness_delta']
    factor = 1 + np.random.uniform(-max_delta, max_delta)
    return ImageEnhance.Brightness(img).enhance(factor)
615 |
616 |
def random_contrast(img):
    """Randomly scale the contrast of a PIL image.

    Applied when a uniform draw is below ``contrast_prob``; the factor is
    ``1 + U(-contrast_delta, contrast_delta)``.

    :param img: PIL image
    :return: the adjusted image, or ``img`` itself when skipped
    """
    roll = np.random.uniform(0, 1)
    strategy = train_params['image_distort_strategy']
    if not roll < strategy['contrast_prob']:
        return img

    max_delta = strategy['contrast_delta']
    factor = 1 + np.random.uniform(-max_delta, max_delta)
    return ImageEnhance.Contrast(img).enhance(factor)
626 |
627 |
def random_saturation(img):
    """Randomly scale the colour saturation of a PIL image.

    Applied when a uniform draw is below ``saturation_prob``; the factor is
    ``1 + U(-saturation_delta, saturation_delta)``.

    :param img: PIL image
    :return: the adjusted image, or ``img`` itself when skipped
    """
    roll = np.random.uniform(0, 1)
    strategy = train_params['image_distort_strategy']
    if not roll < strategy['saturation_prob']:
        return img

    max_delta = strategy['saturation_delta']
    factor = 1 + np.random.uniform(-max_delta, max_delta)
    return ImageEnhance.Color(img).enhance(factor)
637 |
638 |
def random_hue(img):
    """Randomly shift the hue of a PIL image.

    Applied when a uniform draw is below ``hue_prob``; the shift is drawn
    from ``U(-hue_delta, hue_delta)`` and added to the 8-bit hue channel.
    The shifted hue wraps modulo 256. Previously the float result was
    written straight back into the uint8 channel, so values below 0 or
    above 255 were cast with undefined results.

    :param img: PIL image
    :return: the hue-shifted RGB image, or ``img`` itself when skipped
    """
    roll = np.random.uniform(0, 1)
    strategy = train_params['image_distort_strategy']
    if not roll < strategy['hue_prob']:
        return img

    hue_delta = strategy['hue_delta']
    delta = np.random.uniform(-hue_delta, hue_delta)
    img_hsv = np.array(img.convert('HSV'))
    # Hue is circular: floor like the old in-range cast did, then wrap.
    shifted = np.floor(img_hsv[:, :, 0].astype('float64') + delta).astype('int64') % 256
    img_hsv[:, :, 0] = shifted.astype('uint8')
    return Image.fromarray(img_hsv, mode='HSV').convert('RGB')
650 |
651 |
def distort_image(img):
    """Apply the four random colour distortions in one of two orders.

    Brightness always comes first; a uniform draw above 0.5 places contrast
    second, otherwise contrast runs last.

    :param img: PIL image
    :return: the distorted image
    """
    # Apply different distort order
    if np.random.uniform(0, 1) > 0.5:
        pipeline = (random_brightness, random_contrast, random_saturation, random_hue)
    else:
        pipeline = (random_brightness, random_saturation, random_hue, random_contrast)

    for step in pipeline:
        img = step(img)
    return img
666 |
667 |
668 | # 随机裁剪
def random_crop(img, boxes, labels, scales=(0.3, 1.0), max_ratio=2.0, constraints=None, max_trial=50):
    """Randomly crop ``img`` and remap the boxes into the crop.

    With probability 0.4 (or when there are no boxes) the inputs are
    returned untouched. Otherwise one candidate crop is searched per IoU
    constraint, candidates are tried in random order, and the first one
    that keeps at least one box is applied and resized back to the
    original image size.

    Changes from the previous version: the ``scales`` default is immutable;
    crop offsets use the inclusive ``random.randint`` so a crop as wide or
    tall as the image no longer raises ``ValueError``; zero-sized candidate
    crops are skipped.

    :param img: PIL image
    :param boxes: array ``(N, 4)`` of relative ``[cx, cy, w, h]`` boxes
    :param labels: array ``(N,)`` of labels
    :param scales: ``(min, max)`` range of the crop scale
    :param max_ratio: maximum aspect ratio (and inverse) of a crop
    :param constraints: list of ``(min_iou, max_iou)`` pairs; defaults below
    :param max_trial: sampling attempts per constraint
    :return: ``(img, boxes, labels)``
    """
    if random.random() > 0.6:
        return img, boxes, labels
    if len(boxes) == 0:
        return img, boxes, labels

    if not constraints:
        # (min IoU, max IoU) between the crop and every box.
        constraints = [(0.1, 1.0),
                       (0.3, 1.0),
                       (0.5, 1.0),
                       (0.7, 1.0),
                       (0.9, 1.0),
                       (0.0, 1.0)]

    w, h = img.size
    crops = [(0, 0, w, h)]  # the full image is always a candidate

    for min_iou, max_iou in constraints:
        for _ in range(max_trial):
            scale = random.uniform(scales[0], scales[1])
            aspect_ratio = random.uniform(max(1 / max_ratio, scale * scale),
                                          min(max_ratio, 1 / scale / scale))
            crop_h = min(int(h * scale / np.sqrt(aspect_ratio)), h)
            crop_w = min(int(w * scale * np.sqrt(aspect_ratio)), w)
            if crop_w < 1 or crop_h < 1:
                # Too small to crop (tiny image); try another sample.
                continue
            # randint is inclusive, so an offset range of 0 is valid.
            crop_x = random.randint(0, w - crop_w)
            crop_y = random.randint(0, h - crop_h)
            crop_box = np.array([[
                (crop_x + crop_w / 2.0) / w,
                (crop_y + crop_h / 2.0) / h,
                crop_w / float(w),
                crop_h / float(h)
            ]])

            iou = box_iou_xywh(crop_box, boxes)
            if min_iou <= iou.min() and max_iou >= iou.max():
                crops.append((crop_x, crop_y, crop_w, crop_h))
                break

    while crops:
        crop = crops.pop(np.random.randint(0, len(crops)))
        crop_boxes, crop_labels, box_num = box_crop(boxes, labels, crop, (w, h))
        if box_num < 1:
            continue
        img = img.crop((crop[0], crop[1], crop[0] + crop[2],
                        crop[1] + crop[3])).resize(img.size, Image.LANCZOS)
        return img, crop_boxes, crop_labels
    return img, boxes, labels
716 |
717 |
718 | # 扩张
def random_expand(img, gtboxes, keep_ratio=True):
    """Randomly place ``img`` on a larger canvas filled with the mean colour.

    The expansion is applied with probability ``expand_prob``, matching how
    the other ``*_prob`` settings are used in this file. The previous test
    was inverted and expanded with probability ``1 - expand_prob``.

    ``gtboxes`` is modified in place and also returned.

    :param img: PIL image (3 channels are assumed for the canvas)
    :param gtboxes: array ``(N, 4)`` of relative ``[cx, cy, w, h]`` boxes
    :param keep_ratio: use the same expansion ratio for both axes
    :return: ``(img, gtboxes)``
    """
    strategy = train_params['image_distort_strategy']
    if np.random.uniform(0, 1) >= strategy['expand_prob']:
        return img, gtboxes

    max_ratio = strategy['expand_max_ratio']
    w, h = img.size
    c = 3
    ratio_x = random.uniform(1, max_ratio)
    if keep_ratio:
        ratio_y = ratio_x
    else:
        ratio_y = random.uniform(1, max_ratio)
    oh = int(h * ratio_y)
    ow = int(w * ratio_x)
    off_x = random.randint(0, ow - w)
    off_y = random.randint(0, oh - h)

    # Canvas pre-filled with the per-channel mean.
    out_img = np.zeros((oh, ow, c), np.uint8)
    for i in range(c):
        out_img[:, :, i] = train_params['mean_rgb'][i]

    out_img[off_y: off_y + h, off_x: off_x + w, :] = img
    # Re-express the boxes relative to the enlarged canvas.
    gtboxes[:, 0] = ((gtboxes[:, 0] * w) + off_x) / float(ow)
    gtboxes[:, 1] = ((gtboxes[:, 1] * h) + off_y) / float(oh)
    gtboxes[:, 2] = gtboxes[:, 2] / ratio_x
    gtboxes[:, 3] = gtboxes[:, 3] / ratio_y

    return Image.fromarray(out_img), gtboxes
747 |
748 |
749 | # 预处理:图像样本增强,维度转换
def preprocess(img, bbox_labels, input_size, mode):
    """Augment (train mode only), resize and normalise one sample.

    In ``'train'`` mode the image is optionally colour-distorted, then
    randomly expanded and cropped, and the label array is updated with the
    transformed boxes/labels. In every mode the image is resized, the mean
    is subtracted, the layout is changed from HWC to CHW and the values are
    scaled by 0.007843.

    :param img: PIL image
    :param bbox_labels: list of rows; column 0 is the label, columns 1-4 the box
    :param input_size: network input size forwarded to ``resize_img``
    :param mode: ``'train'`` enables augmentation
    :return: ``(image array float32 CHW, label array)``
    """
    sample_labels = np.array(bbox_labels)

    if mode == 'train':
        if train_params['apply_distort']:  # colour distortion
            img = distort_image(img)

        img, gtboxes = random_expand(img, sample_labels[:, 1:5])
        img, gtboxes, gtlabels = random_crop(img, gtboxes, sample_labels[:, 0])
        sample_labels[:, 0] = gtlabels
        sample_labels[:, 1:5] = gtboxes

    img = resize_img(img, sample_labels, input_size)
    img = np.array(img).astype('float32')
    img -= train_params['mean_rgb']  # in place, keeps float32
    img = img.transpose((2, 0, 1))  # HWC to CHW
    img *= 0.007843
    return img, sample_labels
769 |
770 |
771 | # 数据读取器
772 | # 根据样本文件,读取图片、并做数据增强,返回图片数据、边框、标签
def custom_reader(file_list, data_dir, input_size, mode):
    """Create a sample generator over ``file_list``.

    ``'train'`` / ``'eval'``: every line is ``image_path`` followed by
    tab-separated JSON objects with ``value`` (class name) and
    ``coordinate`` (``[[x1, y1], [x2, y2]]``). Each sample is yielded as
    ``(image, boxes, labels)`` where boxes/labels are zero-padded or
    truncated to ``train_params['max_box_num']`` rows.

    ``'test'``: every line is an image path and the opened image is yielded.

    The entries are visited in a fresh random order on every call of the
    returned reader; the caller's ``file_list`` is left untouched.

    :param file_list: list of sample lines
    :param data_dir: directory prepended to image paths in train/eval mode
    :param input_size: network input size forwarded to ``preprocess``
    :param mode: ``'train'``, ``'eval'`` or ``'test'``
    :return: a zero-argument generator function
    """

    def reader():
        # Shuffle a copy so the caller's list is not reordered.
        order = list(file_list)
        np.random.shuffle(order)

        for line in order:
            if mode in ('train', 'eval'):
                parts = line.split('\t')
                image_path = parts[0]

                img = Image.open(os.path.join(data_dir, image_path))
                if img.mode != 'RGB':
                    img = img.convert('RGB')
                im_width, im_height = img.size

                # Row layout: label | x-center | y-center | width | height | difficult
                bbox_labels = []
                for object_str in parts[1:]:
                    if len(object_str) <= 1:
                        continue

                    annotation = json.loads(object_str)
                    label_id = float(train_params['label_dict'][annotation['value']])
                    corners = annotation['coordinate']
                    # Two corners -> [x, y, w, h] in pixels.
                    box = [corners[0][0], corners[0][1],
                           corners[1][0] - corners[0][0], corners[1][1] - corners[0][1]]
                    rel = box_to_center_relative(box, im_height, im_width)
                    bbox_labels.append([label_id,
                                        float(rel[0]), float(rel[1]),
                                        float(rel[2]), float(rel[3]),
                                        float(0)])

                if len(bbox_labels) == 0:
                    continue

                img, sample_labels = preprocess(img, bbox_labels, input_size, mode)
                if len(sample_labels) == 0:
                    continue

                boxes = sample_labels[:, 1:5]
                lbls = sample_labels[:, 0].astype('int32')
                max_box_num = train_params['max_box_num']
                # Copy at most max_box_num objects; the rest stays zero.
                cope_size = min(len(boxes), max_box_num)
                ret_boxes = np.zeros((max_box_num, 4), dtype=np.float32)
                ret_lbls = np.zeros((max_box_num), dtype=np.int32)
                ret_boxes[0: cope_size] = boxes[0: cope_size]
                ret_lbls[0: cope_size] = lbls[0: cope_size]

                yield img, ret_boxes, ret_lbls

            elif mode == 'test':
                yield Image.open(os.path.join(line))

    return reader
839 |
840 |
841 | # 批量、随机数据读取器
def single_custom_reader(file_path, data_dir, input_size, mode):
    """Build a shuffled, batched reader from a sample list file.

    The list file is now closed as soon as it has been read.

    :param file_path: list file name, relative to ``data_dir``
    :param data_dir: dataset directory
    :param input_size: network input size forwarded to ``custom_reader``
    :param mode: reader mode forwarded to ``custom_reader``
    :return: a paddle batch reader with batch size ``train_batch_size``
    """
    file_path = os.path.join(data_dir, file_path)

    with open(file_path) as list_file:
        images = [line.strip() for line in list_file]

    batch_size = train_params['train_batch_size']
    reader = custom_reader(images, data_dir, input_size, mode)
    reader = paddle.reader.shuffle(reader, batch_size)
    reader = paddle.batch(reader, batch_size)

    return reader
851 |
852 |
853 | # 定义优化器
def optimizer_sgd_setting():
    """Create the optimizer used for training.

    Despite the name this returns an Adam optimizer (lr 0.01, beta1 0.9,
    beta2 0.999) with L2 weight decay 0.00005. The former SGD setup with
    piecewise learning-rate decay lived here only as a string literal and
    was never executed; it and the values computed solely for it have been
    removed.

    :return: the configured ``fluid.optimizer.AdamOptimizer``
    """
    optimizer = fluid.optimizer.AdamOptimizer(
        learning_rate=0.01,
        beta1=0.9,
        beta2=0.999,
        regularization=fluid.regularizer.L2Decay(0.00005))
    return optimizer
874 |
875 |
876 | # 创建program, feeder及yolo模型
def build_program_with_feeder(main_prog, startup_prog, place):
    """Create the data layers, feeder, reader, model and loss inside ``main_prog``.

    :param main_prog: program that receives the network and the loss
    :param startup_prog: program that receives the parameter initialisers
    :param place: device used by the data feeder
    :return: ``(feeder, reader, loss)``
    """
    max_box_num = train_params['max_box_num']
    ues_tiny = train_params['use_tiny']  # tiny or full YOLOv3
    cfg_key = 'yolo_tiny_cfg' if ues_tiny else 'yolo_cfg'
    yolo_config = train_params[cfg_key]
    input_size = yolo_config['input_size']

    with fluid.program_guard(main_prog, startup_prog):
        # Input layers: image, ground-truth boxes, ground-truth labels.
        img = fluid.layers.data(name='img', shape=input_size, dtype='float32')
        gt_box = fluid.layers.data(name='gt_box', shape=[max_box_num, 4], dtype='float32')
        gt_label = fluid.layers.data(name='gt_label', shape=[max_box_num], dtype='int32')

        feeder = fluid.DataFeeder(feed_list=[img, gt_box, gt_label],
                                  place=place,
                                  program=main_prog)
        reader = single_custom_reader(train_params['train_list'],
                                      train_params['data_dir'],
                                      input_size, 'train')

        with fluid.unique_name.guard():
            # Build the YOLO model.
            model = get_yolo(ues_tiny, train_params['class_dim'], yolo_config['anchors'],
                             yolo_config['anchor_mask'])
            outputs = model.net(img)
        return feeder, reader, get_loss(model, outputs, gt_box, gt_label)
903 |
904 |
905 | # 损失函数
def get_loss(model, outputs, gt_box, gt_label):
    """Sum the YOLOv3 loss of every head and attach the optimizer.

    The down-sample ratio starts at the model's accumulated ratio and is
    halved for each subsequent head.

    :param model: model object providing anchors, masks and class count
    :param outputs: raw head outputs returned by ``model.net``
    :param gt_box: ground-truth box variable
    :param gt_label: ground-truth label variable
    :return: the total loss variable (already passed to ``minimize``)
    """
    ratio = model.get_downsample_ratio()
    head_losses = []

    with fluid.unique_name.guard('train'):
        for head_idx, head_out in enumerate(outputs):
            head_loss = fluid.layers.yolov3_loss(
                x=head_out,
                gt_box=gt_box,
                gt_label=gt_label,
                anchors=model.get_anchors(),
                anchor_mask=model.get_anchor_mask()[head_idx],
                class_num=model.get_class_num(),
                ignore_thresh=train_params['ignore_thresh'],
                # With few classes False works better; otherwise scores get very small.
                use_label_smooth=False,
                downsample_ratio=ratio)
            head_losses.append(fluid.layers.reduce_mean(head_loss))
            ratio //= 2
        loss = sum(head_losses)
        optimizer_sgd_setting().minimize(loss)
        return loss
928 |
929 |
930 | # 持久化参数加载
def load_pretrained_params(exe, program):
    """Restore parameters from a previous run or from a pretrained model.

    Resuming from ``save_model_dir`` takes precedence when ``continue_train``
    is set and the directory exists. Otherwise, if ``pretrained`` is set and
    ``pretrained_model_dir`` exists, every variable that has a file of the
    same name in that directory is loaded. If neither applies nothing is
    loaded.

    :param exe: executor used for loading
    :param program: program whose variables are restored
    """
    if train_params['continue_train'] and os.path.exists(train_params['save_model_dir']):
        logger.info('load param from retrain model')
        fluid.io.load_persistables(executor=exe,
                                   dirname=train_params['save_model_dir'],
                                   main_program=program)
        return

    if not (train_params['pretrained'] and os.path.exists(train_params['pretrained_model_dir'])):
        return

    logger.info('load param from pretrained model')

    def if_exist(var):
        # Load only variables that have a file in the pretrained directory.
        candidate = os.path.join(train_params['pretrained_model_dir'], var.name)
        return os.path.exists(candidate)

    fluid.io.load_vars(exe, train_params['pretrained_model_dir'], main_program=program,
                       predicate=if_exist)
945 |
946 |
947 | # 执行训练
def train():
    """Run the full training loop and save the model parameters.

    After every epoch the parameters are saved if the epoch's mean loss is
    the best so far; they are saved once more after the last epoch.

    Changes from the previous version: an epoch in which the reader yields
    no batch is skipped with a warning instead of raising
    ``ZeroDivisionError``; unused early-stop/threshold locals were removed,
    so those configuration keys are no longer required here; the fetched
    loss value no longer shadows the loss variable.
    """
    init_log_config()
    init_train_parameters()

    logger.info("start train YOLOv3, train params:%s", str(train_params))
    logger.info("create place, use gpu:" + str(train_params['use_gpu']))

    place = fluid.CUDAPlace(0) if train_params['use_gpu'] else fluid.CPUPlace()

    logger.info("build network and program")
    train_program = fluid.Program()
    start_program = fluid.Program()
    feeder, reader, loss = build_program_with_feeder(train_program, start_program, place)

    logger.info("build executor and init params")

    exe = fluid.Executor(place)
    exe.run(start_program)
    train_fetch_list = [loss.name]
    load_pretrained_params(exe, train_program)  # restore previous parameters, if any

    current_best_loss = 10000000000.0

    for pass_id in range(train_params["num_epochs"]):
        logger.info("current pass: {}, start read image".format(pass_id))
        batch_count = 0
        total_loss = 0.0

        # batch_count is 1-based, so after the loop it equals the number of batches.
        for batch_count, data in enumerate(reader(), 1):
            t1 = time.time()

            fetched = exe.run(train_program,
                              feed=feeder.feed(data),
                              fetch_list=train_fetch_list)

            period = time.time() - t1
            batch_loss = np.mean(np.array(fetched))
            total_loss += batch_loss

            if batch_count % 10 == 0:  # log every 10th batch
                logger.info(
                    "pass {}, trainbatch {}, loss {} time {}".format(pass_id, batch_count, batch_loss,
                                                                     "%2.2f sec" % period))

        if batch_count == 0:
            logger.warning("pass {} produced no batches, nothing to evaluate".format(pass_id))
            continue

        pass_mean_loss = total_loss / batch_count
        logger.info("pass {0} train result, current pass mean loss: {1}".format(pass_id, pass_mean_loss))

        # Save once per epoch when the mean loss improved.
        if pass_mean_loss < current_best_loss:
            logger.info("temp save {} epcho train result, current best pass loss {}".format(pass_id, pass_mean_loss))
            fluid.io.save_persistables(dirname=train_params['save_model_dir'], main_program=train_program,
                                       executor=exe)
            current_best_loss = pass_mean_loss

    logger.info("training till last epcho, end training")
    fluid.io.save_persistables(dirname=train_params['save_model_dir'], main_program=train_program, executor=exe)
1016 |
1017 |
# Script entry point for the training section: run training when executed directly.
if __name__ == '__main__':
    train()
1020 |
1021 |
1022 |
1023 | # 固化保存模型
1024 | import paddle
1025 | import paddle.fluid as fluid
1026 | import codecs
1027 |
1028 | init_train_parameters()
1029 |
1030 |
def freeze_model():
    """Rebuild the network for inference, load the trained parameters and export it.

    The exported model takes ``image`` and ``image_shape`` as inputs and
    returns the multiclass-NMS result. It is written to
    ``train_params['freeze_dir']``.
    """
    exe = fluid.Executor(fluid.CPUPlace())

    ues_tiny = train_params['use_tiny']
    yolo_config = train_params['yolo_tiny_cfg'] if ues_tiny else train_params['yolo_cfg']
    path = train_params['save_model_dir']

    model = get_yolo(ues_tiny, train_params['class_dim'],
                     yolo_config['anchors'], yolo_config['anchor_mask'])
    image = fluid.layers.data(name='image', shape=yolo_config['input_size'], dtype='float32')
    image_shape = fluid.layers.data(name="image_shape", shape=[2], dtype='int32')

    boxes = []
    scores = []
    outputs = model.net(image)
    downsample_ratio = model.get_downsample_ratio()

    # Decode every head; the ratio is halved for each subsequent head.
    for i, out in enumerate(outputs):
        box, score = fluid.layers.yolo_box(x=out,
                                           img_size=image_shape,
                                           anchors=model.get_yolo_anchors()[i],
                                           class_num=model.get_class_num(),
                                           conf_thresh=train_params['valid_thresh'],
                                           downsample_ratio=downsample_ratio,
                                           name="yolo_box_" + str(i))
        boxes.append(box)
        # Swap the last two score axes before concatenating along axis 2.
        scores.append(fluid.layers.transpose(score, perm=[0, 2, 1]))
        downsample_ratio //= 2

    pred = fluid.layers.multiclass_nms(bboxes=fluid.layers.concat(boxes, axis=1),
                                       scores=fluid.layers.concat(scores, axis=2),
                                       score_threshold=train_params['valid_thresh'],
                                       nms_top_k=train_params['nms_top_k'],
                                       keep_top_k=train_params['nms_pos_k'],
                                       nms_threshold=train_params['nms_thresh'],
                                       background_label=-1,
                                       name="multiclass_nms")

    # The layers above were added to the default main program.
    freeze_program = fluid.default_main_program()

    fluid.io.load_persistables(exe, path, freeze_program)
    freeze_program = freeze_program.clone(for_test=True)
    print("freeze out: {0}, pred layout: {1}".format(train_params['freeze_dir'], pred))
    # Save the inference model.
    fluid.io.save_inference_model(train_params['freeze_dir'],
                                  ['image', 'image_shape'],
                                  pred, exe, freeze_program)
    print("freeze end")
1079 |
1080 |
# Script entry point for the freeze section: export the inference model when executed directly.
if __name__ == '__main__':
    freeze_model()
1083 |
1084 |
1085 | # 预测
1086 | import codecs
1087 | import sys
1088 | import numpy as np
1089 | import time
1090 | import paddle
1091 | import paddle.fluid as fluid
1092 | import math
1093 | import functools
1094 |
1095 | from IPython.display import display
1096 | from PIL import Image
1097 | from PIL import ImageFont
1098 | from PIL import ImageDraw
1099 | from collections import namedtuple
1100 |
# Module-level setup for the prediction section. All of it runs at import time.
init_train_parameters()
ues_tiny = train_params['use_tiny']
yolo_config = train_params['yolo_tiny_cfg'] if ues_tiny else train_params['yolo_cfg']

# Globals read by the helper functions below.
target_size = yolo_config['input_size']
anchors = yolo_config['anchors']
anchor_mask = yolo_config['anchor_mask']
label_dict = train_params['num_dict']  # indexed with int(label) when drawing
class_dim = train_params['class_dim']
print("label_dict:{} class dim:{}".format(label_dict, class_dim))

place = fluid.CUDAPlace(0) if train_params['use_gpu'] else fluid.CPUPlace()
exe = fluid.Executor(place)

# Load the frozen inference model written by freeze_model().
path = train_params['freeze_dir']
[inference_program, feed_target_names, fetch_targets] = fluid.io.load_inference_model(dirname=path, executor=exe)
1117 |
1118 |
1119 | # 给图片画上外接矩形框
def draw_bbox_image(img, boxes, labels, save_name):
    """Draw the predicted boxes and class names, then save and display the image.

    Non-RGB images (palette, grayscale, alpha) are converted to RGB first.
    The RGB colours used for drawing and a JPEG target both require that;
    previously such images failed on drawing or saving. The unused image
    size lookup was removed.

    :param img: PIL image to draw on
    :param boxes: iterable of ``[xmin, ymin, xmax, ymax]``
    :param labels: iterable of class ids, looked up in ``label_dict``
    :param save_name: output file path
    """
    if img.mode != 'RGB':
        img = img.convert('RGB')

    draw = ImageDraw.Draw(img)
    for box, label in zip(boxes, labels):
        xmin, ymin, xmax, ymax = box[0], box[1], box[2], box[3]
        draw.rectangle((xmin, ymin, xmax, ymax), None, 'red')  # box outline
        draw.text((xmin, ymin), label_dict[int(label)], (255, 255, 0))  # class name
    img.save(save_name)
    display(img)
1130 |
1131 |
def resize_img(img, target_size):
    """Resize the image to the network input size (aspect ratio is NOT preserved).

    ``target_size`` is ``[channels, height, width]`` while ``Image.resize``
    expects a ``(width, height)`` tuple, so the entries are swapped. (The
    previous code passed ``[height, width]``, which is only correct for
    square inputs.)

    :param img: PIL image
    :param target_size: ``[channels, height, width]``
    :return: the resized PIL image
    """
    target_h, target_w = target_size[1], target_size[2]
    img = img.resize((target_w, target_h), Image.BILINEAR)
    return img
1141 |
1142 |
def read_image(img_path):
    """Load an image and prepare the network input tensor.

    :param img_path: path of the image file
    :return: ``(origin, tensor, resized_img)``: the image as opened, a
        float32 array of shape ``(1, C, H, W)`` normalised with
        ``(x - 127.5) * 0.007843``, and a copy of the resized image taken
        before any RGB conversion
    """
    origin = Image.open(img_path)
    scaled = resize_img(origin, target_size)
    resized_img = scaled.copy()

    rgb = scaled if scaled.mode == 'RGB' else scaled.convert('RGB')
    tensor = np.array(rgb).astype('float32').transpose((2, 0, 1))  # HWC to CHW
    tensor -= 127.5
    tensor *= 0.007843
    return origin, tensor[np.newaxis, :], resized_img
1159 |
1160 |
def infer(image_path):
    """Run detection on one image and save the annotated result.

    The result is written next to the input as ``<name>-result.jpg``. The
    name is derived with ``os.path.splitext``; the previous ``rfind('.')``
    produced a wrong path when the file had no extension or a directory
    name contained a dot.

    :param image_path: path of the image to process
    :return: ``None``
    """
    origin, tensor_img, resized_img = read_image(image_path)
    input_w, input_h = origin.size[0], origin.size[1]
    image_shape = np.array([input_h, input_w], dtype='int32')

    t1 = time.time()
    # Run the frozen model.
    batch_outputs = exe.run(inference_program,
                            feed={feed_target_names[0]: tensor_img,
                                  feed_target_names[1]: image_shape[np.newaxis, :]},
                            fetch_list=fetch_targets,
                            return_numpy=False)
    period = time.time() - t1
    print("predict cost time:{0}".format("%2.2f sec" % period))
    bboxes = np.array(batch_outputs[0])  # prediction result

    # Detections are rows of 6 values; anything else is treated as "nothing found".
    if bboxes.ndim != 2 or bboxes.shape[1] != 6:
        print("No object found in {}".format(image_path))
        return
    labels = bboxes[:, 0].astype('int32')   # class ids (column 0)
    boxes = bboxes[:, 2:].astype('float32')  # box coordinates (columns 2-5)

    out_path = os.path.splitext(image_path)[0] + '-result.jpg'
    draw_bbox_image(origin, boxes, labels, out_path)
1195 |
1196 |
if __name__ == '__main__':
    # Sample image to run detection on. The path is hard-coded; to take it
    # from the command line instead, assign sys.argv[1] to image_path here.
    image_path = "data/data6045/lslm_test/23.jpg"
    infer(image_path)
--------------------------------------------------------------------------------
/output/classes/0.0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/imajiayu/object_detection_based-on-yolov3/de6ceb99541ed9544d593675872fac3c35f18deb/output/classes/0.0.png
--------------------------------------------------------------------------------
/output/classes/1.0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/imajiayu/object_detection_based-on-yolov3/de6ceb99541ed9544d593675872fac3c35f18deb/output/classes/1.0.png
--------------------------------------------------------------------------------
/output/classes/2.0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/imajiayu/object_detection_based-on-yolov3/de6ceb99541ed9544d593675872fac3c35f18deb/output/classes/2.0.png
--------------------------------------------------------------------------------
/output/detection-results-info.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/imajiayu/object_detection_based-on-yolov3/de6ceb99541ed9544d593675872fac3c35f18deb/output/detection-results-info.png
--------------------------------------------------------------------------------
/output/ground-truth-info.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/imajiayu/object_detection_based-on-yolov3/de6ceb99541ed9544d593675872fac3c35f18deb/output/ground-truth-info.png
--------------------------------------------------------------------------------
/output/lamr.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/imajiayu/object_detection_based-on-yolov3/de6ceb99541ed9544d593675872fac3c35f18deb/output/lamr.png
--------------------------------------------------------------------------------
/output/mAP.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/imajiayu/object_detection_based-on-yolov3/de6ceb99541ed9544d593675872fac3c35f18deb/output/mAP.png
--------------------------------------------------------------------------------
/output/output.txt:
--------------------------------------------------------------------------------
1 | # AP and precision/recall per class
2 | 47.18% = 0.0 AP
3 | Precision: ['1.00', '1.00', '1.00', '0.75', '0.60', '0.67', '0.71', '0.75', '0.78', '0.80', '0.82', '0.83', '0.85', '0.86', '0.87', '0.81', '0.76', '0.78', '0.79', '0.80', '0.81', '0.77', '0.78', '0.79', '0.76', '0.73', '0.70', '0.68', '0.69', '0.70', '0.68', '0.69', '0.67', '0.65', '0.66', '0.64', '0.62', '0.61', '0.62', '0.60', '0.61', '0.62', '0.60', '0.59', '0.58', '0.57', '0.55', '0.54', '0.53', '0.52', '0.51', '0.50', '0.49', '0.48', '0.47', '0.46', '0.46', '0.45', '0.44', '0.43', '0.43', '0.42', '0.41', '0.41', '0.40', '0.39', '0.39', '0.38', '0.38', '0.37', '0.37', '0.36', '0.36', '0.35', '0.35', '0.34', '0.34', '0.33', '0.33', '0.33', '0.32', '0.32', '0.31', '0.31', '0.31', '0.31', '0.31', '0.31', '0.30', '0.30', '0.30', '0.29', '0.29', '0.30', '0.29', '0.29', '0.29', '0.29', '0.28', '0.28', '0.28', '0.27', '0.27', '0.27', '0.27', '0.26']
4 | Recall :['0.02', '0.04', '0.07', '0.07', '0.07', '0.09', '0.11', '0.13', '0.15', '0.17', '0.20', '0.22', '0.24', '0.26', '0.28', '0.28', '0.28', '0.30', '0.33', '0.35', '0.37', '0.37', '0.39', '0.41', '0.41', '0.41', '0.41', '0.41', '0.43', '0.46', '0.46', '0.48', '0.48', '0.48', '0.50', '0.50', '0.50', '0.50', '0.52', '0.52', '0.54', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.59', '0.59', '0.59', '0.59', '0.59', '0.59', '0.59', '0.59', '0.61', '0.61', '0.61', '0.61', '0.61', '0.61', '0.61', '0.61', '0.61', '0.61', '0.61', '0.61', '0.61']
5 |
6 | 80.87% = 1.0 AP
7 | Precision: ['1.00', '1.00', '1.00', '1.00', '1.00', '1.00', '1.00', '1.00', '1.00', '1.00', '1.00', '1.00', '1.00', '0.93', '0.87', '0.88', '0.82', '0.78', '0.74', '0.70', '0.67', '0.68', '0.65', '0.62', '0.60', '0.58', '0.56', '0.54', '0.52', '0.50', '0.48', '0.47', '0.45', '0.44', '0.43', '0.42', '0.41', '0.39', '0.38', '0.38']
8 | Recall :['0.06', '0.11', '0.17', '0.22', '0.28', '0.33', '0.39', '0.44', '0.50', '0.56', '0.61', '0.67', '0.72', '0.72', '0.72', '0.78', '0.78', '0.78', '0.78', '0.78', '0.78', '0.83', '0.83', '0.83', '0.83', '0.83', '0.83', '0.83', '0.83', '0.83', '0.83', '0.83', '0.83', '0.83', '0.83', '0.83', '0.83', '0.83', '0.83', '0.83']
9 |
10 | 64.01% = 2.0 AP
11 | Precision: ['1.00', '1.00', '1.00', '1.00', '0.80', '0.67', '0.71', '0.75', '0.67', '0.70', '0.73', '0.75', '0.69', '0.71', '0.73', '0.69', '0.71', '0.67', '0.68', '0.70', '0.67', '0.64', '0.65', '0.67', '0.68', '0.65', '0.63', '0.61', '0.59', '0.57', '0.55', '0.53', '0.52', '0.50', '0.51', '0.50', '0.51', '0.53', '0.51', '0.50', '0.49', '0.48', '0.47', '0.45', '0.44', '0.43', '0.43', '0.42', '0.41', '0.42', '0.41', '0.40', '0.40', '0.39', '0.38', '0.38', '0.37', '0.36', '0.36', '0.35', '0.34', '0.34', '0.33', '0.33', '0.32', '0.32', '0.31', '0.31', '0.30', '0.30', '0.30', '0.29', '0.29', '0.28', '0.28', '0.28', '0.27', '0.27', '0.27', '0.26', '0.26', '0.26', '0.25', '0.25', '0.25', '0.24', '0.24', '0.24', '0.24', '0.23', '0.23', '0.23', '0.23']
12 | Recall :['0.04', '0.08', '0.12', '0.17', '0.17', '0.17', '0.21', '0.25', '0.25', '0.29', '0.33', '0.38', '0.38', '0.42', '0.46', '0.46', '0.50', '0.50', '0.54', '0.58', '0.58', '0.58', '0.62', '0.67', '0.71', '0.71', '0.71', '0.71', '0.71', '0.71', '0.71', '0.71', '0.71', '0.71', '0.75', '0.75', '0.79', '0.83', '0.83', '0.83', '0.83', '0.83', '0.83', '0.83', '0.83', '0.83', '0.83', '0.83', '0.83', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88']
13 |
14 |
15 | # mAP of all classes
16 | mAP = 64.02%
17 |
18 | # Number of ground-truth objects per class
19 | 0.0: 46
20 | 1.0: 18
21 | 2.0: 24
22 |
23 | # Number of detected objects per class
24 | 0.0: 106 (tp:28, fp:78)
25 | 1.0: 40 (tp:15, fp:25)
26 | 2.0: 93 (tp:21, fp:72)
27 |
--------------------------------------------------------------------------------