├── Infer.py ├── README.md ├── main.py ├── objectDetection.py └── output ├── classes ├── 0.0.png ├── 1.0.png └── 2.0.png ├── detection-results-info.png ├── ground-truth-info.png ├── lamr.png ├── mAP.png └── output.txt /Infer.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | """ 3 | 训练常基于dark-net的YOLOv3网络,目标检测 4 | """ 5 | from __future__ import absolute_import 6 | from __future__ import division 7 | from __future__ import print_function 8 | import os 9 | 10 | os.environ["FLAGS_fraction_of_gpu_memory_to_use"] = '0.82' 11 | 12 | import uuid 13 | import numpy as np 14 | import time 15 | import six 16 | import math 17 | import random 18 | import paddle 19 | import paddle.fluid as fluid 20 | import logging 21 | import xml.etree.ElementTree 22 | import codecs 23 | import json 24 | 25 | from paddle.fluid.initializer import MSRA 26 | from paddle.fluid.param_attr import ParamAttr 27 | from paddle.fluid.regularizer import L2Decay 28 | from PIL import Image, ImageEnhance, ImageDraw, ImageFile 29 | ImageFile.LOAD_TRUNCATED_IMAGES = True 30 | Image.MAX_IMAGE_PIXELS = None 31 | 32 | logger = None # 日志对象 33 | 34 | train_params = { 35 | "data_dir": "data/data6045", # 数据目录 36 | "train_list": "train.txt", # 训练集文件 37 | "eval_list": "eval.txt", 38 | "class_dim": -1, 39 | "label_dict": {}, # 标签字典 40 | "num_dict": {}, 41 | "image_count": -1, 42 | "continue_train": True, # 是否加载前一次的训练参数,接着训练 43 | "pretrained": False, # 是否预训练 44 | "pretrained_model_dir": "./pretrained-model", 45 | "save_model_dir": "./yolo-model", # 模型保存目录 46 | "model_prefix": "yolo-v3", # 模型前缀 47 | "freeze_dir": "freeze_model", 48 | "use_tiny": False, # 是否使用 裁剪 tiny 模型 49 | "max_box_num": 20, # 一幅图上最多有多少个目标 50 | "num_epochs": 2, # 训练轮次 51 | "train_batch_size": 10, # 对于完整yolov3,每一批的训练样本不能太多,内存会炸掉;如果使用tiny,可以适当大一些 52 | "use_gpu": True, # 是否使用GPU 53 | "yolo_cfg": { # YOLO模型参数 54 | "input_size": [3, 448, 448], # 原版的边长大小为608,为了提高训练速度和预测速度,此处压缩为448 55 | 
"anchors": [7, 10, 12, 22, 24, 17, 22, 45, 46, 33, 43, 88, 85, 66, 115, 146, 275, 240], # 锚点?? 56 | "anchor_mask": [[6, 7, 8], [3, 4, 5], [0, 1, 2]] 57 | }, 58 | "yolo_tiny_cfg": { # YOLO tiny 模型参数 59 | "input_size": [3, 256, 256], 60 | "anchors": [6, 8, 13, 15, 22, 34, 48, 50, 81, 100, 205, 191], 61 | "anchor_mask": [[3, 4, 5], [0, 1, 2]] 62 | }, 63 | "ignore_thresh": 0.7, 64 | "mean_rgb": [127.5, 127.5, 127.5], 65 | "mode": "train", 66 | "multi_data_reader_count": 4, 67 | "apply_distort": True, # 是否做图像扭曲增强 68 | "nms_top_k": 300, 69 | "nms_pos_k": 300, 70 | "valid_thresh": 0.01, 71 | "nms_thresh": 0.40, # 非最大值抑制阈值 72 | "image_distort_strategy": { # 图像扭曲策略 73 | "expand_prob": 0.5, # 扩展比率 74 | "expand_max_ratio": 4, 75 | "hue_prob": 0.5, # 色调 76 | "hue_delta": 18, 77 | "contrast_prob": 0.5, # 对比度 78 | "contrast_delta": 0.5, 79 | "saturation_prob": 0.5, # 饱和度 80 | "saturation_delta": 0.5, 81 | "brightness_prob": 0.5, # 亮度 82 | "brightness_delta": 0.125 83 | }, 84 | "sgd_strategy": { # 梯度下降配置 85 | "learning_rate": 0.002, 86 | "lr_epochs": [30, 50, 65], # 学习率衰减分段(3个数字分为4段) 87 | "lr_decay": [1, 0.5, 0.25, 0.1] # 每段采用的学习率,对应lr_epochs参数4段 88 | }, 89 | "early_stop": { 90 | "sample_frequency": 50, 91 | "successive_limit": 3, 92 | "min_loss": 2.5, 93 | "min_curr_map": 0.84 94 | } 95 | } 96 | 97 | 98 | def init_train_parameters(): 99 | """ 100 | 初始化训练参数,主要是初始化图片数量,类别数 101 | :return: 102 | """ 103 | file_list = os.path.join(train_params['data_dir'], train_params['train_list']) # 训练集 104 | label_list = os.path.join(train_params['data_dir'], "label_list") # 标签文件 105 | index = 0 106 | 107 | # codecs是专门用作编码转换通用模块 108 | with codecs.open(label_list, encoding='utf-8') as flist: 109 | lines = [line.strip() for line in flist] 110 | for line in lines: 111 | train_params['num_dict'][index] = line.strip() 112 | train_params['label_dict'][line.strip()] = index 113 | index += 1 114 | train_params['class_dim'] = index 115 | 116 | with codecs.open(file_list, encoding='utf-8') as flist: 117 | 
lines = [line.strip() for line in flist] 118 | train_params['image_count'] = len(lines) # 图片数量 119 | 120 | 121 | # 日志相关配置 122 | def init_log_config(): # 初始化日志相关配置 123 | global logger 124 | 125 | logger = logging.getLogger() # 创建日志对象 126 | logger.setLevel(logging.INFO) # 设置日志级别 127 | log_path = os.path.join(os.getcwd(), 'logs') 128 | 129 | if not os.path.exists(log_path): # 创建日志路径 130 | os.makedirs(log_path) 131 | 132 | log_name = os.path.join(log_path, 'train.log') # 训练日志文件 133 | fh = logging.FileHandler(log_name, mode='w') # 打开文件句柄 134 | fh.setLevel(logging.DEBUG) # 设置级别 135 | 136 | formatter = logging.Formatter("%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s") 137 | fh.setFormatter(formatter) 138 | logger.addHandler(fh) 139 | 140 | 141 | init_log_config() 142 | 143 | 144 | # 定义YOLO3网络结构:darknet-53 145 | class YOLOv3(object): 146 | def __init__(self, class_num, anchors, anchor_mask): 147 | self.outputs = [] # 网络最终模型 148 | self.downsample_ratio = 1 # 下采样率 149 | self.anchor_mask = anchor_mask # 计算卷积核??? 
150 | self.anchors = anchors # 锚点 151 | self.class_num = class_num # 类别数量 152 | 153 | self.yolo_anchors = [] 154 | self.yolo_classes = [] 155 | 156 | for mask_pair in self.anchor_mask: 157 | mask_anchors = [] 158 | for mask in mask_pair: 159 | mask_anchors.append(self.anchors[2 * mask]) 160 | mask_anchors.append(self.anchors[2 * mask + 1]) 161 | self.yolo_anchors.append(mask_anchors) 162 | self.yolo_classes.append(class_num) 163 | 164 | def name(self): 165 | return 'YOLOv3' 166 | 167 | # 获取anchors 168 | def get_anchors(self): 169 | return self.anchors 170 | 171 | # 获取anchor_mask 172 | def get_anchor_mask(self): 173 | return self.anchor_mask 174 | 175 | def get_class_num(self): 176 | return self.class_num 177 | 178 | def get_downsample_ratio(self): 179 | return self.downsample_ratio 180 | 181 | def get_yolo_anchors(self): 182 | return self.yolo_anchors 183 | 184 | def get_yolo_classes(self): 185 | return self.yolo_classes 186 | 187 | # 卷积正则化函数: 卷积、批量正则化处理、leakrelu 188 | def conv_bn(self, 189 | input, # 输入 190 | num_filters, # 卷积核数量 191 | filter_size, # 卷积核大小 192 | stride, # 步幅 193 | padding, # 填充 194 | use_cudnn=True): 195 | # 2d卷积操作 196 | conv = fluid.layers.conv2d(input=input, 197 | num_filters=num_filters, 198 | filter_size=filter_size, 199 | stride=stride, 200 | padding=padding, 201 | act=None, 202 | use_cudnn=use_cudnn, # 是否使用cudnn,cudnn利用cuda进行了加速处理 203 | param_attr=ParamAttr(initializer=fluid.initializer.Normal(0., 0.02)), 204 | bias_attr=False) 205 | 206 | # batch_norm中的参数不需要参与正则化,所以主动使用正则系数为0的正则项屏蔽掉 207 | # 在batch_norm中使用leaky的话,只能使用默认的alpha=0.02;如果需要设值,必须提出去单独来 208 | # 正则化的目的,是为了防止过拟合,较小的L2值能防止过拟合 209 | param_attr = ParamAttr(initializer=fluid.initializer.Normal(0., 0.02), 210 | regularizer=L2Decay(0.)) 211 | bias_attr = ParamAttr(initializer=fluid.initializer.Constant(0.0), 212 | regularizer=L2Decay(0.)) 213 | out = fluid.layers.batch_norm(input=conv, act=None, 214 | param_attr=param_attr, 215 | bias_attr=bias_attr) 216 | # leaky_relu: Leaky 
ReLU是给所有负值赋予一个非零斜率 217 | out = fluid.layers.leaky_relu(out, 0.1) 218 | return out 219 | 220 | # 通过卷积实现降采样 221 | # 如:原始图片大小448*448,降采样后大小为 ((448+2)-3)/2 + 1 = 224 222 | def down_sample(self, input, num_filters, filter_size=3, stride=2, padding=1): 223 | self.downsample_ratio *= 2 # 降采样率 224 | return self.conv_bn(input, 225 | num_filters=num_filters, 226 | filter_size=filter_size, 227 | stride=stride, 228 | padding=padding) 229 | 230 | # 基本块:包含两个卷积/正则化层,一个残差块 231 | def basic_block(self, input, num_filters): 232 | conv1 = self.conv_bn(input, num_filters, filter_size=1, stride=1, padding=0) 233 | conv2 = self.conv_bn(conv1, num_filters * 2, filter_size=3, stride=1, padding=1) 234 | out = fluid.layers.elementwise_add(x=input, y=conv2, act=None) # 计算H(x)=F(x)+x 235 | return out 236 | 237 | # 创建多个basic_block 238 | def layer_warp(self, input, num_filters, count): 239 | res_out = self.basic_block(input, num_filters) 240 | for j in range(1, count): 241 | res_out = self.basic_block(res_out, num_filters) 242 | return res_out 243 | 244 | # 上采样 245 | def up_sample(self, input, scale=2): 246 | # get dynamic upsample output shape 247 | shape_nchw = fluid.layers.shape(input) # 获取input的形状 248 | shape_hw = fluid.layers.slice(shape_nchw, axes=[0], starts=[2], ends=[4]) 249 | shape_hw.stop_gradient = True 250 | in_shape = fluid.layers.cast(shape_hw, dtype='int32') 251 | out_shape = in_shape * scale # 计算输出数据形状 252 | out_shape.stop_gradient = True 253 | 254 | # reisze by actual_shape 255 | # 矩阵放大(最邻插值法) 256 | out = fluid.layers.resize_nearest(input=input, 257 | scale=scale, 258 | actual_shape=out_shape) 259 | return out 260 | 261 | def yolo_detection_block(self, input, num_filters): 262 | assert num_filters % 2 == 0, "num_filters {} cannot be divided by 2".format(num_filters) 263 | 264 | conv = input 265 | for j in range(2): 266 | conv = self.conv_bn(conv, num_filters, filter_size=1, stride=1, padding=0) 267 | conv = self.conv_bn(conv, num_filters * 2, filter_size=3, stride=1, padding=1) 
268 | route = self.conv_bn(conv, num_filters, filter_size=1, stride=1, padding=0) 269 | tip = self.conv_bn(route, num_filters * 2, filter_size=3, stride=1, padding=1) 270 | return route, tip 271 | 272 | # 搭建网络模型 darknet-53 273 | def net(self, img): 274 | stages = [1, 2, 8, 8, 4] 275 | assert len(self.anchor_mask) <= len(stages), "anchor masks can't bigger than down_sample times" 276 | # 第一个卷积层: 256*256 277 | conv1 = self.conv_bn(img, num_filters=32, filter_size=3, stride=1, padding=1) 278 | # 第二个卷积层:128*128 279 | downsample_ = self.down_sample(conv1, conv1.shape[1] * 2) # 第二个参数为卷积核数量 280 | blocks = [] 281 | 282 | # 循环创建basic_block组 283 | for i, stage_count in enumerate(stages): 284 | block = self.layer_warp(downsample_, # 输入数据 285 | 32 * (2 ** i), # 卷积核数量 286 | stage_count) # 基本块数量 287 | blocks.append(block) 288 | if i < len(stages) - 1: # 如果不是最后一组,做降采样 289 | downsample_ = self.down_sample(block, block.shape[1] * 2) 290 | blocks = blocks[-1:-4:-1] # 取倒数三层,并且逆序,后面跨层级联需要 291 | 292 | # yolo detector 293 | for i, block in enumerate(blocks): 294 | # yolo中跨视域链接 295 | if i > 0: 296 | block = fluid.layers.concat(input=[route, block], axis=1) # 连接route和block,按行 297 | 298 | route, tip = self.yolo_detection_block(block, # 输入 299 | num_filters=512 // (2 ** i)) # 卷积核数量 300 | 301 | param_attr = ParamAttr(initializer=fluid.initializer.Normal(0., 0.02)) 302 | bias_attr = ParamAttr(initializer=fluid.initializer.Constant(0.0), regularizer=L2Decay(0.)) 303 | block_out = fluid.layers.conv2d(input=tip, 304 | # 5 elements represent x|y|h|w|score 305 | num_filters=len(self.anchor_mask[i]) * (self.class_num + 5), 306 | filter_size=1, 307 | stride=1, 308 | padding=0, 309 | act=None, 310 | param_attr=param_attr, 311 | bias_attr=bias_attr) 312 | self.outputs.append(block_out) 313 | 314 | # 为了跨视域链接,差值方式提升特征图尺寸 315 | if i < len(blocks) - 1: 316 | route = self.conv_bn(route, 256 // (2 ** i), filter_size=1, stride=1, padding=0) 317 | route = self.up_sample(route) # 上采样 318 | 319 | return 
self.outputs 320 | 321 | # Tiny(精简版)YOLO模型 322 | class YOLOv3Tiny(object): 323 | def __init__(self, class_num, anchors, anchor_mask): 324 | self.outputs = [] 325 | self.downsample_ratio = 1 326 | self.anchor_mask = anchor_mask 327 | self.anchors = anchors 328 | self.class_num = class_num 329 | 330 | self.yolo_anchors = [] 331 | self.yolo_classes = [] 332 | for mask_pair in self.anchor_mask: 333 | mask_anchors = [] 334 | for mask in mask_pair: 335 | mask_anchors.append(self.anchors[2 * mask]) 336 | mask_anchors.append(self.anchors[2 * mask + 1]) 337 | self.yolo_anchors.append(mask_anchors) 338 | self.yolo_classes.append(class_num) 339 | 340 | def name(self): 341 | return 'YOLOv3-tiny' 342 | 343 | def get_anchors(self): 344 | return self.anchors 345 | 346 | def get_anchor_mask(self): 347 | return self.anchor_mask 348 | 349 | def get_class_num(self): 350 | return self.class_num 351 | 352 | def get_downsample_ratio(self): 353 | return self.downsample_ratio 354 | 355 | def get_yolo_anchors(self): 356 | return self.yolo_anchors 357 | 358 | def get_yolo_classes(self): 359 | return self.yolo_classes 360 | 361 | def conv_bn(self, 362 | input, 363 | num_filters, 364 | filter_size, 365 | stride, 366 | padding, 367 | num_groups=1, 368 | use_cudnn=True): 369 | conv = fluid.layers.conv2d( 370 | input=input, 371 | num_filters=num_filters, 372 | filter_size=filter_size, 373 | stride=stride, 374 | padding=padding, 375 | act=None, 376 | groups=num_groups, 377 | use_cudnn=use_cudnn, 378 | param_attr=ParamAttr(initializer=fluid.initializer.Normal(0., 0.02)), 379 | bias_attr=False) 380 | 381 | # batch_norm中的参数不需要参与正则化,所以主动使用正则系数为0的正则项屏蔽掉 382 | out = fluid.layers.batch_norm( 383 | input=conv, act='relu', 384 | param_attr=ParamAttr(initializer=fluid.initializer.Normal(0., 0.02), regularizer=L2Decay(0.)), 385 | bias_attr=ParamAttr(initializer=fluid.initializer.Constant(0.0), regularizer=L2Decay(0.))) 386 | 387 | return out 388 | 389 | def depthwise_conv_bn(self, input, filter_size=3, 
stride=1, padding=1): 390 | num_filters = input.shape[1] 391 | return self.conv_bn(input, 392 | num_filters=num_filters, 393 | filter_size=filter_size, 394 | stride=stride, 395 | padding=padding, 396 | num_groups=num_filters) 397 | 398 | def down_sample(self, input, pool_size=2, pool_stride=2): 399 | self.downsample_ratio *= 2 400 | return fluid.layers.pool2d(input=input, pool_type='max', pool_size=pool_size, 401 | pool_stride=pool_stride) 402 | 403 | def basic_block(self, input, num_filters): 404 | conv1 = self.conv_bn(input, num_filters, filter_size=3, stride=1, padding=1) 405 | out = self.down_sample(conv1) 406 | return out 407 | 408 | def up_sample(self, input, scale=2): 409 | # get dynamic upsample output shape 410 | shape_nchw = fluid.layers.shape(input) 411 | shape_hw = fluid.layers.slice(shape_nchw, axes=[0], starts=[2], ends=[4]) 412 | shape_hw.stop_gradient = True 413 | in_shape = fluid.layers.cast(shape_hw, dtype='int32') 414 | out_shape = in_shape * scale 415 | out_shape.stop_gradient = True 416 | 417 | # reisze by actual_shape 418 | out = fluid.layers.resize_nearest( 419 | input=input, 420 | scale=scale, 421 | actual_shape=out_shape) 422 | return out 423 | 424 | def yolo_detection_block(self, input, num_filters): 425 | route = self.conv_bn(input, num_filters, filter_size=1, stride=1, padding=0) 426 | tip = self.conv_bn(route, num_filters * 2, filter_size=3, stride=1, padding=1) 427 | return route, tip 428 | 429 | def net(self, img): 430 | # darknet-tiny 431 | stages = [16, 32, 64, 128, 256, 512] 432 | assert len(self.anchor_mask) <= len(stages), "anchor masks can't bigger than down_sample times" 433 | # 256x256 434 | tmp = img 435 | blocks = [] 436 | for i, stage_count in enumerate(stages): 437 | if i == len(stages) - 1: 438 | block = self.conv_bn(tmp, stage_count, filter_size=3, stride=1, padding=1) 439 | blocks.append(block) 440 | block = self.depthwise_conv_bn(blocks[-1]) 441 | block = self.depthwise_conv_bn(blocks[-1]) 442 | block = 
self.conv_bn(blocks[-1], stage_count * 2, filter_size=1, stride=1, padding=0) 443 | blocks.append(block) 444 | else: 445 | tmp = self.basic_block(tmp, stage_count) 446 | blocks.append(tmp) 447 | 448 | blocks = [blocks[-1], blocks[3]] 449 | 450 | # yolo detector 451 | for i, block in enumerate(blocks): 452 | # yolo 中跨视域链接 453 | if i > 0: 454 | block = fluid.layers.concat(input=[route, block], axis=1) 455 | if i < 1: 456 | route, tip = self.yolo_detection_block(block, num_filters=256 // (2 ** i)) 457 | else: 458 | tip = self.conv_bn(block, num_filters=256, filter_size=3, stride=1, padding=1) 459 | 460 | param_attr = ParamAttr(initializer=fluid.initializer.Normal(0., 0.02)) 461 | bias_attr = ParamAttr(initializer=fluid.initializer.Constant(0.0), regularizer=L2Decay(0.)) 462 | block_out = fluid.layers.conv2d(input=tip, 463 | # 5 elements represent x|y|h|w|score 464 | num_filters=len(self.anchor_mask[i]) * (self.class_num + 5), 465 | filter_size=1, 466 | stride=1, 467 | padding=0, 468 | act=None, 469 | param_attr=param_attr, 470 | bias_attr=bias_attr) 471 | self.outputs.append(block_out) 472 | # 为了跨视域链接,差值方式提升特征图尺寸 473 | if i < len(blocks) - 1: 474 | route = self.conv_bn(route, 128 // (2 ** i), filter_size=1, stride=1, padding=0) 475 | route = self.up_sample(route) 476 | 477 | return self.outputs 478 | 479 | 480 | def get_yolo(is_tiny, class_num, anchors, anchor_mask): 481 | if is_tiny: 482 | return YOLOv3Tiny(class_num, anchors, anchor_mask) 483 | else: 484 | return YOLOv3(class_num, anchors, anchor_mask) 485 | 486 | 487 | class Sampler(object): 488 | """ 489 | 采样器,用于扣取采样 490 | """ 491 | 492 | def __init__(self, max_sample, max_trial, min_scale, max_scale, 493 | min_aspect_ratio, max_aspect_ratio, min_jaccard_overlap, 494 | max_jaccard_overlap): 495 | self.max_sample = max_sample 496 | self.max_trial = max_trial 497 | self.min_scale = min_scale 498 | self.max_scale = max_scale 499 | self.min_aspect_ratio = min_aspect_ratio 500 | self.max_aspect_ratio = max_aspect_ratio 
501 | self.min_jaccard_overlap = min_jaccard_overlap 502 | self.max_jaccard_overlap = max_jaccard_overlap 503 | 504 | 505 | class bbox(object): 506 | """ 507 | 外界矩形框 508 | """ 509 | 510 | def __init__(self, xmin, ymin, xmax, ymax): 511 | self.xmin = xmin 512 | self.ymin = ymin 513 | self.xmax = xmax 514 | self.ymax = ymax 515 | 516 | 517 | # 坐标转换,由[x1, y1, w, h]转换为[center_x, center_y, w, h] 518 | # 并转换为范围在[0, 1]之间的相对坐标 519 | def box_to_center_relative(box, img_height, img_width): 520 | """ 521 | Convert COCO annotations box with format [x1, y1, w, h] to 522 | center mode [center_x, center_y, w, h] and divide image width 523 | and height to get relative value in range[0, 1] 524 | """ 525 | assert len(box) == 4, "box should be a len(4) list or tuple" 526 | x, y, w, h = box 527 | 528 | x1 = max(x, 0) 529 | x2 = min(x + w - 1, img_width - 1) 530 | y1 = max(y, 0) 531 | y2 = min(y + h - 1, img_height - 1) 532 | 533 | x = (x1 + x2) / 2 / img_width # x中心坐标 534 | y = (y1 + y2) / 2 / img_height # y中心坐标 535 | w = (x2 - x1) / img_width # 框宽度/图片总宽度 536 | h = (y2 - y1) / img_height # 框高度/图片总高度 537 | 538 | return np.array([x, y, w, h]) 539 | 540 | 541 | # 调整图像大小 542 | def resize_img(img, sampled_labels, input_size): 543 | target_size = input_size 544 | img = img.resize((target_size[1], target_size[2]), Image.BILINEAR) 545 | return img 546 | 547 | 548 | # 计算交并比 549 | def box_iou_xywh(box1, box2): 550 | assert box1.shape[-1] == 4, "Box1 shape[-1] should be 4." 551 | assert box2.shape[-1] == 4, "Box2 shape[-1] should be 4." 
552 | 553 | # 取两个框的坐标 554 | b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2 555 | b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2 556 | b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2 557 | b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2 558 | 559 | inter_x1 = np.maximum(b1_x1, b2_x1) 560 | inter_x2 = np.minimum(b1_x2, b2_x2) 561 | inter_y1 = np.maximum(b1_y1, b2_y1) 562 | inter_y2 = np.minimum(b1_y2, b2_y2) 563 | inter_w = inter_x2 - inter_x1 + 1 # 相交部分宽度 564 | inter_h = inter_y2 - inter_y1 + 1 # 相交部分高度 565 | inter_w[inter_w < 0] = 0 566 | inter_h[inter_h < 0] = 0 567 | 568 | inter_area = inter_w * inter_h # 相交面积 569 | b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1) # 框1的面积 570 | b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1) # 框2的面积 571 | 572 | return inter_area / (b1_area + b2_area - inter_area) # 相集面积/并集面积 573 | 574 | 575 | # box裁剪 576 | def box_crop(boxes, labels, crop, img_shape): 577 | x, y, w, h = map(float, crop) 578 | im_w, im_h = map(float, img_shape) 579 | 580 | boxes = boxes.copy() 581 | boxes[:, 0], boxes[:, 2] = (boxes[:, 0] - boxes[:, 2] / 2) * im_w, (boxes[:, 0] + boxes[:, 2] / 2) * im_w 582 | boxes[:, 1], boxes[:, 3] = (boxes[:, 1] - boxes[:, 3] / 2) * im_h, (boxes[:, 1] + boxes[:, 3] / 2) * im_h 583 | 584 | crop_box = np.array([x, y, x + w, y + h]) 585 | centers = (boxes[:, :2] + boxes[:, 2:]) / 2.0 586 | mask = np.logical_and(crop_box[:2] <= centers, centers <= crop_box[2:]).all(axis=1) 587 | 588 | boxes[:, :2] = np.maximum(boxes[:, :2], crop_box[:2]) 589 | boxes[:, 2:] = np.minimum(boxes[:, 2:], crop_box[2:]) 590 | boxes[:, :2] -= crop_box[:2] 591 | boxes[:, 2:] -= crop_box[:2] 592 | 593 | mask = np.logical_and(mask, (boxes[:, :2] < boxes[:, 2:]).all(axis=1)) 594 | boxes = boxes * np.expand_dims(mask.astype('float32'), axis=1) 595 | labels = labels * mask.astype('float32') 596 | boxes[:, 0], boxes[:, 2] = (boxes[:, 0] + boxes[:, 2]) / 2 / w, 
(boxes[:, 2] - boxes[:, 0]) / w 597 | boxes[:, 1], boxes[:, 3] = (boxes[:, 1] + boxes[:, 3]) / 2 / h, (boxes[:, 3] - boxes[:, 1]) / h 598 | 599 | return boxes, labels, mask.sum() 600 | 601 | 602 | # 图像增加:对比度,饱和度,明暗,颜色,扩张 603 | def random_brightness(img): # 亮度 604 | prob = np.random.uniform(0, 1) 605 | 606 | if prob < train_params['image_distort_strategy']['brightness_prob']: 607 | brightness_delta = train_params['image_distort_strategy']['brightness_delta'] # 默认值0.125 608 | delta = np.random.uniform(-brightness_delta, brightness_delta) + 1 # 产生均匀分布随机值 609 | img = ImageEnhance.Brightness(img).enhance(delta) # 调整图像亮度 610 | 611 | return img 612 | 613 | 614 | def random_contrast(img): # 对比度 615 | prob = np.random.uniform(0, 1) 616 | 617 | if prob < train_params['image_distort_strategy']['contrast_prob']: 618 | contrast_delta = train_params['image_distort_strategy']['contrast_delta'] 619 | delta = np.random.uniform(-contrast_delta, contrast_delta) + 1 620 | img = ImageEnhance.Contrast(img).enhance(delta) 621 | 622 | return img 623 | 624 | 625 | def random_saturation(img): # 饱和度 626 | prob = np.random.uniform(0, 1) 627 | 628 | if prob < train_params['image_distort_strategy']['saturation_prob']: 629 | saturation_delta = train_params['image_distort_strategy']['saturation_delta'] 630 | delta = np.random.uniform(-saturation_delta, saturation_delta) + 1 631 | img = ImageEnhance.Color(img).enhance(delta) 632 | 633 | return img 634 | 635 | 636 | def random_hue(img): # 色调 637 | prob = np.random.uniform(0, 1) 638 | 639 | if prob < train_params['image_distort_strategy']['hue_prob']: 640 | hue_delta = train_params['image_distort_strategy']['hue_delta'] 641 | delta = np.random.uniform(-hue_delta, hue_delta) 642 | img_hsv = np.array(img.convert('HSV')) 643 | img_hsv[:, :, 0] = img_hsv[:, :, 0] + delta 644 | img = Image.fromarray(img_hsv, mode='HSV').convert('RGB') 645 | 646 | return img 647 | 648 | 649 | def distort_image(img): # 图像扭曲 650 | prob = np.random.uniform(0, 1) 651 | # 
Apply different distort order 652 | if prob > 0.5: 653 | img = random_brightness(img) 654 | img = random_contrast(img) 655 | img = random_saturation(img) 656 | img = random_hue(img) 657 | else: 658 | img = random_brightness(img) 659 | img = random_saturation(img) 660 | img = random_hue(img) 661 | img = random_contrast(img) 662 | return img 663 | 664 | 665 | # 随机裁剪 666 | def random_crop(img, boxes, labels, scales=[0.3, 1.0], max_ratio=2.0, constraints=None, max_trial=50): 667 | if random.random() > 0.6: 668 | return img, boxes, labels 669 | if len(boxes) == 0: 670 | return img, boxes, labels 671 | 672 | if not constraints: 673 | constraints = [(0.1, 1.0), 674 | (0.3, 1.0), 675 | (0.5, 1.0), 676 | (0.7, 1.0), 677 | (0.9, 1.0), 678 | (0.0, 1.0)] # 最小/最大交并比值 679 | 680 | w, h = img.size 681 | crops = [(0, 0, w, h)] 682 | 683 | for min_iou, max_iou in constraints: 684 | for _ in range(max_trial): 685 | scale = random.uniform(scales[0], scales[1]) 686 | aspect_ratio = random.uniform(max(1 / max_ratio, scale * scale), \ 687 | min(max_ratio, 1 / scale / scale)) 688 | crop_h = int(h * scale / np.sqrt(aspect_ratio)) 689 | crop_w = int(w * scale * np.sqrt(aspect_ratio)) 690 | crop_x = random.randrange(w - crop_w) 691 | crop_y = random.randrange(h - crop_h) 692 | crop_box = np.array([[ 693 | (crop_x + crop_w / 2.0) / w, 694 | (crop_y + crop_h / 2.0) / h, 695 | crop_w / float(w), 696 | crop_h / float(h) 697 | ]]) 698 | 699 | iou = box_iou_xywh(crop_box, boxes) 700 | if min_iou <= iou.min() and max_iou >= iou.max(): 701 | crops.append((crop_x, crop_y, crop_w, crop_h)) 702 | break 703 | 704 | while crops: 705 | crop = crops.pop(np.random.randint(0, len(crops))) 706 | crop_boxes, crop_labels, box_num = box_crop(boxes, labels, crop, (w, h)) 707 | if box_num < 1: 708 | continue 709 | img = img.crop((crop[0], crop[1], crop[0] + crop[2], 710 | crop[1] + crop[3])).resize(img.size, Image.LANCZOS) 711 | return img, crop_boxes, crop_labels 712 | return img, boxes, labels 713 | 714 | 715 | 
# Expansion
def random_expand(img, gtboxes, keep_ratio=True):
    """
    Randomly place the image on a larger mean-coloured canvas.

    The expansion is applied with probability ``expand_prob``, matching how
    the other ``*_prob`` settings are interpreted. (The original returned
    early when the draw was *below* ``expand_prob``, i.e. it treated the
    value as the probability of skipping.)

    :param img: RGB PIL image
    :param gtboxes: array of relative [center_x, center_y, w, h] rows;
        modified in place when the expansion is applied
    :param keep_ratio: use the same expansion ratio for both axes
    :return: (image, gtboxes)
    """
    if np.random.uniform(0, 1) >= train_params['image_distort_strategy']['expand_prob']:
        return img, gtboxes

    max_ratio = train_params['image_distort_strategy']['expand_max_ratio']
    w, h = img.size
    c = 3
    ratio_x = random.uniform(1, max_ratio)
    if keep_ratio:
        ratio_y = ratio_x
    else:
        ratio_y = random.uniform(1, max_ratio)
    oh = int(h * ratio_y)
    ow = int(w * ratio_x)
    off_x = random.randint(0, ow - w)
    off_y = random.randint(0, oh - h)

    # Canvas filled with the per-channel mean colour.
    out_img = np.zeros((oh, ow, c), np.uint8)
    for i in range(c):
        out_img[:, :, i] = train_params['mean_rgb'][i]

    out_img[off_y: off_y + h, off_x: off_x + w, :] = img
    # Re-express centres and sizes relative to the enlarged canvas.
    gtboxes[:, 0] = ((gtboxes[:, 0] * w) + off_x) / float(ow)
    gtboxes[:, 1] = ((gtboxes[:, 1] * h) + off_y) / float(oh)
    gtboxes[:, 2] = gtboxes[:, 2] / ratio_x
    gtboxes[:, 3] = gtboxes[:, 3] / ratio_y

    return Image.fromarray(out_img), gtboxes


# Pre-processing: augmentation and layout conversion
def preprocess(img, bbox_labels, input_size, mode):
    """
    Augment (training only), resize and normalise one sample.

    :param img: RGB PIL image
    :param bbox_labels: list of [label, cx, cy, w, h, difficult] rows
    :param input_size: network input size
    :param mode: augmentation is applied only when this is ``'train'``
    :return: (float32 CHW image array, label array)
    """
    sample_labels = np.array(bbox_labels)

    if mode == 'train':
        if train_params['apply_distort']:  # colour distortion
            img = distort_image(img)

        img, gtboxes = random_expand(img, sample_labels[:, 1:5])  # expansion
        img, gtboxes, gtlabels = random_crop(img, gtboxes, sample_labels[:, 0])  # random crop
        sample_labels[:, 0] = gtlabels
        sample_labels[:, 1:5] = gtboxes

    img = resize_img(img, sample_labels, input_size)
    img = np.array(img).astype('float32')
    img -= train_params['mean_rgb']
    img = img.transpose((2, 0, 1))  # HWC to CHW
    img *= 0.007843
    return img, sample_labels


def _parse_annotations(fields, im_width, im_height):
    """Turn the JSON annotation fields of one list line into label rows."""
    # Row layout: label | x-center | y-center | width | height | difficult
    bbox_labels = []
    for object_str in fields:
        if len(object_str) <= 1:
            continue

        annotation = json.loads(object_str)
        corners = annotation['coordinate']  # two corner points
        # x, y, w, h in pixels
        box = [corners[0][0], corners[0][1],
               corners[1][0] - corners[0][0], corners[1][1] - corners[0][1]]
        rel = box_to_center_relative(box, im_height, im_width)
        bbox_labels.append([
            float(train_params['label_dict'][annotation['value']]),
            float(rel[0]),
            float(rel[1]),
            float(rel[2]),
            float(rel[3]),
            float(0),  # difficult flag
        ])
    return bbox_labels


def _pad_targets(sample_labels, max_box_num):
    """Copy at most ``max_box_num`` boxes/labels into zero-padded arrays."""
    boxes = sample_labels[:, 1:5]  # coordinates
    lbls = sample_labels[:, 0].astype('int32')  # labels
    keep = min(len(boxes), max_box_num)
    ret_boxes = np.zeros((max_box_num, 4), dtype=np.float32)
    ret_lbls = np.zeros((max_box_num), dtype=np.int32)
    ret_boxes[0: keep] = boxes[0: keep]
    ret_lbls[0: keep] = lbls[0: keep]
    return ret_boxes, ret_lbls


# Data reader
# Reads images listed in the sample file, applies augmentation and yields
# image data, boxes and labels.
def custom_reader(file_list, data_dir, input_size, mode):
    """
    Build a sample generator over ``file_list``.

    In ``'train'``/``'eval'`` mode every line is
    ``<image path>\\t<json annotation>\\t...`` and the reader yields
    ``(image, boxes, labels)`` with boxes/labels zero-padded to
    ``max_box_num``. In ``'test'`` mode every line is an image path and the
    reader yields the opened PIL image.

    :param file_list: list of lines; shuffled in place on every pass
    :param data_dir: directory the image paths are relative to
    :param input_size: network input size
    :param mode: ``'train'``, ``'eval'`` or ``'test'``
    :return: the reader function
    """
    def reader():
        np.random.shuffle(file_list)  # shuffle the file list

        for line in file_list:  # one image and its annotations per line
            if mode == 'train' or mode == 'eval':
                # ---- dataset-specific parsing, may need customisation ----
                parts = line.split('\t')
                image_path = parts[0]

                img = Image.open(os.path.join(data_dir, image_path))
                if img.mode != 'RGB':
                    img = img.convert('RGB')
                im_width, im_height = img.size

                bbox_labels = _parse_annotations(parts[1:], im_width, im_height)
                # ---- end of dataset-specific parsing ----

                if len(bbox_labels) == 0:
                    continue

                img, sample_labels = preprocess(img, bbox_labels, input_size, mode)
                if len(sample_labels) == 0:
                    continue

                ret_boxes, ret_lbls = _pad_targets(sample_labels, train_params['max_box_num'])

                yield img, ret_boxes, ret_lbls

            elif mode == 'test':
                img_path = os.path.join(line)

                yield Image.open(img_path)

    return reader


# Batched, shuffled data reader
def single_custom_reader(file_path, data_dir, input_size, mode):
    """
    Build a shuffled, batched reader from the list file ``file_path``
    (relative to ``data_dir``).
    """
    file_path = os.path.join(data_dir, file_path)

    # Close the list file deterministically instead of leaking the handle.
    with open(file_path) as list_file:
        images = [line.strip() for line in list_file]
    reader = custom_reader(images, data_dir, input_size, mode)
    reader = paddle.reader.shuffle(reader, train_params['train_batch_size'])
    reader = paddle.batch(reader, train_params['train_batch_size'])

    return reader


# Optimizer definition
def optimizer_sgd_setting():
    """
    Create an SGD optimizer with piecewise learning-rate decay and L2
    regularisation, configured from ``train_params['sgd_strategy']``.
    """
    batch_size = train_params["train_batch_size"]
    iters = train_params["image_count"] // batch_size  # iterations per epoch
    iters = 1 if iters < 1 else iters
    learning_strategy = train_params['sgd_strategy']
    lr = learning_strategy['learning_rate']

    # Epoch boundaries -> iteration boundaries; decay factors -> rates.
    boundaries = [i * iters for i in learning_strategy["lr_epochs"]]
    values = [i * lr for i in learning_strategy["lr_decay"]]
    logger.info("origin learning rate: {0} boundaries: {1} values: {2}".format(lr, boundaries, values))

    optimizer = fluid.optimizer.SGDOptimizer(
        learning_rate=fluid.layers.piecewise_decay(boundaries, values),  # piecewise decay
        # learning_rate=lr,
        regularization=fluid.regularizer.L2Decay(0.00005))

    return optimizer


# Create the program, feeder and YOLO model
def build_program_with_feeder(main_prog, startup_prog, place):
    """
    Build the training graph inside ``main_prog``/``startup_prog``.

    :return: (feeder, reader, loss)
    """
    max_box_num = train_params['max_box_num']
    use_tiny = train_params['use_tiny']  # whether the tiny model is used
    yolo_config = train_params['yolo_tiny_cfg'] if use_tiny else train_params['yolo_cfg']

    with fluid.program_guard(main_prog, startup_prog):  # switch the default programs
        img = fluid.layers.data(name='img', shape=yolo_config['input_size'], dtype='float32')  # image
        gt_box = fluid.layers.data(name='gt_box', shape=[max_box_num, 4], dtype='float32')  # boxes
        gt_label = fluid.layers.data(name='gt_label', shape=[max_box_num], dtype='int32')  # labels

        feeder = fluid.DataFeeder(feed_list=[img, gt_box, gt_label],
                                  place=place,
                                  program=main_prog)
        reader = single_custom_reader(train_params['train_list'],
                                      train_params['data_dir'],
                                      yolo_config['input_size'], 'train')

        with fluid.unique_name.guard():
            # Create the YOLO model
            model = get_yolo(use_tiny, train_params['class_dim'], yolo_config['anchors'],
                             yolo_config['anchor_mask'])
            outputs = model.net(img)
        return feeder, reader, get_loss(model, outputs, gt_box, gt_label)


# Loss function
def get_loss(model, outputs, gt_box, gt_label):
    """
    Sum the mean YOLOv3 loss of every output and attach the optimizer.

    The first output is evaluated at the model's accumulated down-sample
    ratio; the ratio is halved for each following output.

    :return: the total loss variable
    """
    losses = []
    downsample_ratio = model.get_downsample_ratio()

    with fluid.unique_name.guard('train'):
        for i, out in enumerate(outputs):
            loss = fluid.layers.yolov3_loss(x=out,
                                            gt_box=gt_box,  # ground-truth boxes
                                            gt_label=gt_label,  # labels
                                            anchors=model.get_anchors(),  # anchors
                                            anchor_mask=model.get_anchor_mask()[i],
                                            class_num=model.get_class_num(),
                                            ignore_thresh=train_params['ignore_thresh'],
                                            # With few classes False works better,
                                            # otherwise the scores get very small.
                                            use_label_smooth=False,
                                            downsample_ratio=downsample_ratio)
            losses.append(fluid.layers.reduce_mean(loss))
            downsample_ratio //= 2
        loss = sum(losses)
        optimizer = optimizer_sgd_setting()
        optimizer.minimize(loss)
        return loss


def load_pretrained_params(exe, program):
    """
    Restore parameters from a previous run or from a pretrained model.

    A checkpoint in save_model_dir takes priority when continue_train is set;
    otherwise the pretrained weights are loaded variable by variable when
    pretrained is set. If neither source is usable, the program keeps the
    parameters produced by the startup program.

    :param exe: executor used to run the load operations
    :param program: program whose variables are restored
    """
    save_dir = train_params['save_model_dir']
    pretrained_dir = train_params['pretrained_model_dir']

    if train_params['continue_train'] and os.path.exists(save_dir):
        logger.info('load param from retrain model')
        fluid.io.load_persistables(executor=exe,
                                   dirname=save_dir,
                                   main_program=program)
    elif train_params['pretrained'] and os.path.exists(pretrained_dir):
        logger.info('load param from pretrained model')

        def if_exist(var):
            # Only load variables that have a matching file on disk.
            return os.path.exists(os.path.join(pretrained_dir, var.name))

        fluid.io.load_vars(exe, pretrained_dir, main_program=program,
                           predicate=if_exist)
    else:
        # Make the fall-through visible instead of silently training from scratch.
        logger.info('no saved or pretrained params loaded, train from scratch')


# Run training
def train():
    """
    Train the YOLOv3 network and persist its parameters.

    Builds the training program, restores earlier parameters when available,
    then runs num_epochs passes over the training reader. The parameters are
    saved whenever a pass reaches a new lowest mean loss, and once more after
    the last pass.
    """
    init_log_config()
    init_train_parameters()

    logger.info("start train YOLOv3, train params:%s", str(train_params))
    logger.info("create place, use gpu:" + str(train_params['use_gpu']))

    place = fluid.CUDAPlace(0) if train_params['use_gpu'] else fluid.CPUPlace()

    logger.info("build network and program")
    train_program = fluid.Program()
    start_program = fluid.Program()
    feeder, reader, loss = build_program_with_feeder(train_program, start_program, place)

    logger.info("build executor and init params")

    exe = fluid.Executor(place)
    exe.run(start_program)
    train_fetch_list = [loss.name]
    load_pretrained_params(exe, train_program)  # restore model parameters

    save_dir = train_params['save_model_dir']
    current_best_loss = float('inf')

    # Training passes
    for pass_id in range(train_params["num_epochs"]):
        logger.info("current pass: {}, start read image".format(pass_id))
        batch_count = 0
        total_loss = 0.0

        # Count batches from 1 so the counter doubles as "batches seen so far".
        for batch_count, data in enumerate(reader(), start=1):
            t1 = time.time()

            fetched = exe.run(train_program,
                              feed=feeder.feed(data),
                              fetch_list=train_fetch_list)  # run one training step

            period = time.time() - t1
            batch_loss = np.mean(np.array(fetched))
            total_loss += batch_loss

            if batch_count % 10 == 0:  # throttle the log output
                logger.info(
                    "pass {}, trainbatch {}, loss {} time {}".format(pass_id, batch_count, batch_loss, "%2.2f sec" % period))

        if batch_count == 0:
            # An empty reader would otherwise cause a division by zero below.
            logger.warning("pass {} produced no batches, skip loss statistics".format(pass_id))
            continue

        pass_mean_loss = total_loss / batch_count
        logger.info("pass {0} train result, current pass mean loss: {1}".format(pass_id, pass_mean_loss))

        # Save after every pass that improves the mean loss; this could be
        # refined into a finer-grained saving strategy.
        if pass_mean_loss < current_best_loss:
            logger.info("temp save {} epcho train result, current best pass loss {}".format(pass_id, pass_mean_loss))
            fluid.io.save_persistables(dirname=save_dir, main_program=train_program,
                                       executor=exe)
            current_best_loss = pass_mean_loss

    logger.info("training till last epcho, end training")
    # NOTE(review): this final save writes to the same directory as the
    # best-pass save above, so the last pass replaces the best one.
    fluid.io.save_persistables(dirname=save_dir, main_program=train_program, executor=exe)
fluid.layers.data(name='image', shape=yolo_config['input_size'], dtype='float32') 1034 | image_shape = fluid.layers.data(name="image_shape", shape=[2], dtype='int32') 1035 | 1036 | boxes = [] 1037 | scores = [] 1038 | outputs = model.net(image) 1039 | downsample_ratio = model.get_downsample_ratio() 1040 | 1041 | for i, out in enumerate(outputs): 1042 | box, score = fluid.layers.yolo_box(x=out, 1043 | img_size=image_shape, 1044 | anchors=model.get_yolo_anchors()[i], 1045 | class_num=model.get_class_num(), 1046 | conf_thresh=train_params['valid_thresh'], 1047 | downsample_ratio=downsample_ratio, 1048 | name="yolo_box_" + str(i)) 1049 | boxes.append(box) 1050 | scores.append(fluid.layers.transpose(score, perm=[0, 2, 1])) 1051 | downsample_ratio //= 2 1052 | 1053 | pred = fluid.layers.multiclass_nms(bboxes=fluid.layers.concat(boxes, axis=1), 1054 | scores=fluid.layers.concat(scores, axis=2), 1055 | score_threshold=train_params['valid_thresh'], 1056 | nms_top_k=train_params['nms_top_k'], 1057 | keep_top_k=train_params['nms_pos_k'], 1058 | nms_threshold=train_params['nms_thresh'], 1059 | background_label=-1, 1060 | name="multiclass_nms") 1061 | 1062 | freeze_program = fluid.default_main_program() 1063 | 1064 | fluid.io.load_persistables(exe, path, freeze_program) 1065 | freeze_program = freeze_program.clone(for_test=True) 1066 | print("freeze out: {0}, pred layout: {1}".format(train_params['freeze_dir'], pred)) 1067 | # 保存模型 1068 | fluid.io.save_inference_model(train_params['freeze_dir'], 1069 | ['image', 'image_shape'], 1070 | pred, exe, freeze_program) 1071 | print("freeze end") 1072 | 1073 | 1074 | 1075 | 1076 | 1077 | # 预测 1078 | import codecs 1079 | import sys 1080 | import numpy as np 1081 | import time 1082 | import paddle 1083 | import paddle.fluid as fluid 1084 | import math 1085 | import functools 1086 | 1087 | from IPython.display import display 1088 | from PIL import Image 1089 | from PIL import ImageFont 1090 | from PIL import ImageDraw 1091 | from 
# Draw the bounding rectangles onto the image
def draw_bbox_image(img, boxes, labels, save_name):
    """
    Draw every box and its class name on the image, then save and display it.

    :param img: PIL image to draw on (modified in place)
    :param boxes: iterable of [xmin, ymin, xmax, ymax] boxes
    :param labels: class indices, one per box, looked up in label_dict
    :param save_name: path the annotated image is written to
    """
    draw = ImageDraw.Draw(img)  # drawing context bound to img
    for box, label in zip(boxes, labels):
        xmin, ymin, xmax, ymax = box[0], box[1], box[2], box[3]
        draw.rectangle((xmin, ymin, xmax, ymax), fill=None, outline='red')
        draw.text((xmin, ymin), label_dict[int(label)], fill=(255, 255, 0))
    img.save(save_name)
    display(img)


def resize_img(img, target_size):
    """
    Resize an image to the network input size.

    The image is stretched to the target size; the aspect ratio is NOT kept.

    :param img: PIL image to resize
    :param target_size: network input size laid out as [channels, height, width]
    :return: the resized PIL image
    """
    # PIL expects (width, height) while target_size is [C, H, W], so the two
    # spatial entries must be swapped (this only matters for non-square sizes).
    height, width = target_size[1], target_size[2]
    return img.resize((width, height), Image.BILINEAR)
def _write_detection_results(image_path, rows):
    """Write one detection per line to ./input/detection-results/<image stem>.txt."""
    stem = os.path.splitext(os.path.basename(image_path))[0]
    result_path = os.path.join("./input/detection-results", stem + '.txt')
    with open(result_path, 'w') as f:
        for row in rows:
            # Same layout as before: every value followed by a single space.
            f.write(''.join(str(float(value)) + ' ' for value in row) + '\n')


def infer(image_path):
    """
    Run detection on one image, save the annotated image and the result file.

    The annotated image is written next to the input as "<name>-result.jpg".
    The detections are written to ./input/detection-results/<name>.txt, one
    "label score xmin ymin xmax ymax" row per detection.

    :param image_path: path of the image to run detection on
    :return: list of [label, score, xmin, ymin, xmax, ymax] rows, or None
             when no object was found
    """
    origin, tensor_img, _ = read_image(image_path)
    input_w, input_h = origin.size[0], origin.size[1]
    image_shape = np.array([input_h, input_w], dtype='int32')

    t1 = time.time()
    batch_outputs = exe.run(inference_program,
                            feed={feed_target_names[0]: tensor_img,
                                  feed_target_names[1]: image_shape[np.newaxis, :]},
                            fetch_list=fetch_targets,
                            return_numpy=False)
    period = time.time() - t1
    print("predict cost time:{0}".format("%2.2f sec" % period))
    bboxes = np.array(batch_outputs[0])

    # A valid result has one row of 6 values per detection.
    if bboxes.ndim != 2 or bboxes.shape[1] != 6:
        print("No object found in {}".format(image_path))
        # Still emit an (empty) result file: the mAP script aborts when a
        # ground-truth file has no matching detection-results file.
        _write_detection_results(image_path, [])
        return None

    labels = bboxes[:, 0].astype('int32').tolist()
    scores = bboxes[:, 1].astype('float32').tolist()
    boxes = bboxes[:, 2:].astype('float32').tolist()

    # splitext copes with paths that contain no extension, unlike rfind('.').
    out_path = os.path.splitext(image_path)[0] + '-result.jpg'
    draw_bbox_image(origin, boxes, labels, out_path)

    predict = [[label, score] + list(box)
               for label, score, box in zip(labels, scores, boxes)]
    _write_detection_results(image_path, predict)
    return predict
os.path.join(train_params['data_dir'], 'eval.txt') 1206 | images = [line.strip() for line in open(file_path)] 1207 | for line in images: 1208 | image_path = line 1209 | parts = line.split('\t') 1210 | filename = parts[0] 1211 | filename_path = os.path.join(train_params['data_dir']+'/lslm_test/', parts[0]) 1212 | infer(filename_path) 1213 | 1214 | bbox_labels = [] 1215 | for object_str in parts[1:]: 1216 | if len(object_str) <= 1: 1217 | continue 1218 | bbox_sample = [] 1219 | object = json.loads(object_str) 1220 | bbox_sample.append(float(train_params['label_dict'][object['value']])) 1221 | bbox = object['coordinate'] 1222 | bbox_sample.append(float(bbox[0][0])) 1223 | bbox_sample.append(float(bbox[0][1])) 1224 | bbox_sample.append(float(bbox[1][0])) 1225 | bbox_sample.append(float(bbox[1][1])) 1226 | bbox_labels.append(bbox_sample) 1227 | 1228 | f = open("./input/ground-truth/" + filename_path[24:-4]+'.txt', 'w') 1229 | for i in bbox_labels: 1230 | for j in i: 1231 | f.write(str(float(j)) + ' ') 1232 | f.write('\n') 1233 | f.close() 1234 | 1235 | 1236 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 基于YOLOv3的目标检测实验报告 2 | 3 | ## 目录 4 | 5 | - 小组成员及分工 6 | - YOLOv3目标检测网络 7 | - YOLO算法简介 8 | - 网络结构 9 | - PaddlePaddle代码实现 10 | - 主要参数 11 | - 模型建立 12 | - 训练与迭代 13 | - 数据集基本信息 14 | - 训练过程中的参数调整与模型优化 15 | - YOLO和YOLO-tiny对比 16 | - 参数调整 17 | - 模型优化 18 | - 网络性能分析 19 | - 挑战集测试分析 20 | - 实际结果 21 | 22 | --- 23 | ## 小组成员及分工 24 | 姓名|学号|贡献 25 | ---|:--:|:-: 26 | 马家昱|1950509|数据集搜索与整合、图片处理 27 | 陈冠忠|1950638|模型修改、调试、训练 28 | 陶思月|1951858|数据集拍摄、标记 29 | 黄继宣|1951857|数据集拍摄、标记 30 | 周婉莹|1950579|数据集拍摄、标记 31 | 罗格峰|1952222|数据集拍摄、标记 32 | 33 | --- 34 | ## YOLOv3目标检测网络 35 | ### YOLO算法简介 36 | - 相关算法 37 | 1. 滑动窗口 38 | 39 |   采用滑动窗口的目标检测算法将检测问题转化为了图像分类问题。其基本原理就是采用不同大小和比例(宽高比)的窗口在整张图片上以一定的步长进行滑动,然后对这些窗口对应的区域做图像分类,这样就可以实现对整张图片的检测了。 40 |
41 | 2. 非极大值抑制 42 | 43 |   首先从所有的检测框中找到置信度最大的那个框,然后挨个计算其与剩余框的交并比(IOU),如果其值大于一定阈值(重合度过高),那么就将该框剔除;然后对剩余的检测框重复上述过程,直到处理完所有的检测框。 44 |
45 | - YOLO算法 46 | 47 |   YOLO将对象检测重新定义为一个回归问题。它将单个卷积神经网络(CNN)应用于整个图像,将图像分成网格,并预测每个网格的类概率和边界框。对于每个网格,网络都会预测一个边界框和与每个类别(汽车,行人,交通信号灯等)相对应的概率。每个边界框可以使用四个描述符进行描述: 48 | 49 | 1. 边界框的中心 50 | 2. 高度 51 | 3. 宽度 52 | 4. 对象所属类别对应的值 53 | 54 |   此外,该算法还可以预测边界框中存在对象的概率。如果一个对象的中心落在一个网格单元中,则该网格单元负责检测该对象。每个网格中将有多个边界框。在训练时,我们希望每个对象只有一个边界框。因此,我们根据哪个Box与ground truth box的重叠度最高,从而分配一个Box来负责预测对象。 55 | 56 |   最后,先剔除“置信度”小于阈值的边界框,再对每个类的对象应用非极大值抑制,去除与高置信度框重叠度过高的边界框,从而得到图像的最终预测结果。 57 | 58 | 
59 | 60 | 61 | 62 | ### 网络结构 63 | - YOLOv3采用了称之为Darknet-53的网络结构(含有53个卷积层),它借鉴了残差网络的做法,在一些层之间设置了快捷链路。下图展示了其基本结构。 64 |
65 | 其中Darknet-53的具体结构如下,其采用448×448×3作为输入,左侧数字表示重复的残差组件个数,每个残差组件有两个卷积层和一个快捷链路。 66 | 
67 | 68 | ### PaddlePaddle代码实现 69 | #### 主要参数 70 | ``` 71 | train_params = { 72 | "data_dir": "data/data6045", # 数据目录 73 | "train_list": "train.txt", # 训练集文件 74 | "eval_list": "eval.txt", 75 | "class_dim": -1, 76 | "label_dict": {}, # 标签字典 77 | "num_dict": {}, 78 | "image_count": -1, 79 | "continue_train": True, # 是否加载前一次的训练参数,接着训练 80 | "pretrained": False, # 是否预训练 81 | "pretrained_model_dir": "./pretrained-model", 82 | "save_model_dir": "./yolo-model", # 模型保存目录 83 | "model_prefix": "yolo-v3", # 模型前缀 84 | "freeze_dir": "freeze_model", 85 | "use_tiny": False, # 是否使用 裁剪 tiny 模型 86 | "max_box_num": 8, # 一幅图上最多有多少个目标 87 | "num_epochs": 15, # 训练轮次 88 | "train_batch_size": 12, # 对于完整yolov3,每一批的训练样本不能太多,内存会炸掉;如果使用tiny,可以适当大一些 89 | "use_gpu": True, # 是否使用GPU 90 | "yolo_cfg": { # YOLO模型参数 91 | "input_size": [3, 448, 448], # 原版的边长大小为608,为了提高训练速度和预测速度,此处压缩为448 92 | "anchors": [7, 10, 12, 22, 24, 17, 22, 45, 46, 33, 43, 88, 85, 66, 115, 146, 275, 240], # 锚点?? 93 | "anchor_mask": [[6, 7, 8], [3, 4, 5], [0, 1, 2]] 94 | }, 95 | "yolo_tiny_cfg": { # YOLO tiny 模型参数 96 | "input_size": [3, 256, 256], 97 | "anchors": [6, 8, 13, 15, 22, 34, 48, 50, 81, 100, 205, 191], 98 | "anchor_mask": [[3, 4, 5], [0, 1, 2]] 99 | }, 100 | "ignore_thresh": 0.7, 101 | "mean_rgb": [127.5, 127.5, 127.5], 102 | "mode": "train", 103 | "multi_data_reader_count": 4, 104 | "apply_distort": True, # 是否做图像扭曲增强 105 | "nms_top_k": 300, 106 | "nms_pos_k": 300, 107 | "valid_thresh": 0.01, 108 | "nms_thresh": 0.40, # 非最大值抑制阈值 109 | "image_distort_strategy": { # 图像扭曲策略 110 | "expand_prob": 0.5, # 扩展比率 111 | "expand_max_ratio": 4, 112 | "hue_prob": 0.5, # 色调 113 | "hue_delta": 18, 114 | "contrast_prob": 0.5, # 对比度 115 | "contrast_delta": 0.5, 116 | "saturation_prob": 0.5, # 饱和度 117 | "saturation_delta": 0.5, 118 | "brightness_prob": 0.5, # 亮度 119 | "brightness_delta": 0.125 120 | }, 121 | "sgd_strategy": { # 梯度下降配置 122 | "learning_rate": 0.002, 123 | "lr_epochs": [30, 50, 65], # 学习率衰减分段(3个数字分为4段) 124 | "lr_decay": [1, 
0.5, 0.25, 0.1] # 每段采用的学习率,对应lr_epochs参数4段 125 | }, 126 | "early_stop": { 127 | "sample_frequency": 50, 128 | "successive_limit": 3, 129 | "min_loss": 2.5, 130 | "min_curr_map": 0.84 131 | } 132 | } 133 | ``` 134 | #### 模型建立 135 | ``` 136 | class YOLOv3(object): 137 | def __init__(self, class_num, anchors, anchor_mask): 138 | self.outputs = [] # 网络最终模型 139 | self.downsample_ratio = 1 # 下采样率 140 | self.anchor_mask = anchor_mask # 计算卷积核??? 141 | self.anchors = anchors # 锚点 142 | self.class_num = class_num # 类别数量 143 | 144 | self.yolo_anchors = [] 145 | self.yolo_classes = [] 146 | 147 | for mask_pair in self.anchor_mask: 148 | mask_anchors = [] 149 | for mask in mask_pair: 150 | mask_anchors.append(self.anchors[2 * mask]) 151 | mask_anchors.append(self.anchors[2 * mask + 1]) 152 | self.yolo_anchors.append(mask_anchors) 153 | self.yolo_classes.append(class_num) 154 | 155 | def name(self): 156 | return 'YOLOv3' 157 | 158 | # 获取anchors 159 | def get_anchors(self): 160 | return self.anchors 161 | 162 | # 获取anchor_mask 163 | def get_anchor_mask(self): 164 | return self.anchor_mask 165 | 166 | def get_class_num(self): 167 | return self.class_num 168 | 169 | def get_downsample_ratio(self): 170 | return self.downsample_ratio 171 | 172 | def get_yolo_anchors(self): 173 | return self.yolo_anchors 174 | 175 | def get_yolo_classes(self): 176 | return self.yolo_classes 177 | 178 | # 卷积正则化函数: 卷积、批量正则化处理、leakrelu 179 | def conv_bn(self, 180 | input, # 输入 181 | num_filters, # 卷积核数量 182 | filter_size, # 卷积核大小 183 | stride, # 步幅 184 | padding, # 填充 185 | use_cudnn=True): 186 | # 2d卷积操作 187 | conv = fluid.layers.conv2d(input=input, 188 | num_filters=num_filters, 189 | filter_size=filter_size, 190 | stride=stride, 191 | padding=padding, 192 | act=None, 193 | use_cudnn=use_cudnn, # 是否使用cudnn,cudnn利用cuda进行了加速处理 194 | param_attr=ParamAttr(initializer=fluid.initializer.Normal(0., 0.02)), 195 | bias_attr=False) 196 | 197 | # batch_norm中的参数不需要参与正则化,所以主动使用正则系数为0的正则项屏蔽掉 198 | # 
在batch_norm中使用leaky的话,只能使用默认的alpha=0.02;如果需要设值,必须提出去单独来 199 | # 正则化的目的,是为了防止过拟合,较小的L2值能防止过拟合 200 | param_attr = ParamAttr(initializer=fluid.initializer.Normal(0., 0.02), 201 | regularizer=L2Decay(0.)) 202 | bias_attr = ParamAttr(initializer=fluid.initializer.Constant(0.0), 203 | regularizer=L2Decay(0.)) 204 | out = fluid.layers.batch_norm(input=conv, act=None, 205 | param_attr=param_attr, 206 | bias_attr=bias_attr) 207 | # leaky_relu: Leaky ReLU是给所有负值赋予一个非零斜率 208 | out = fluid.layers.leaky_relu(out, 0.1) 209 | return out 210 | ``` 211 | #### 训练与迭代 212 | ``` 213 | # 执行训练 214 | def train(): 215 | init_log_config() 216 | init_train_parameters() 217 | 218 | logger.info("start train YOLOv3, train params:%s", str(train_params)) 219 | logger.info("create place, use gpu:" + str(train_params['use_gpu'])) 220 | 221 | place = fluid.CUDAPlace(0) if train_params['use_gpu'] else fluid.CPUPlace() 222 | 223 | logger.info("build network and program") 224 | train_program = fluid.Program() 225 | start_program = fluid.Program() 226 | feeder, reader, loss = build_program_with_feeder(train_program, start_program, place) 227 | 228 | logger.info("build executor and init params") 229 | 230 | exe = fluid.Executor(place) 231 | exe.run(start_program) 232 | train_fetch_list = [loss.name] 233 | load_pretrained_params(exe, train_program) # 加载模型及参数 234 | 235 | stop_strategy = train_params['early_stop'] 236 | successive_limit = stop_strategy['successive_limit'] 237 | sample_freq = stop_strategy['sample_frequency'] 238 | min_curr_map = stop_strategy['min_curr_map'] 239 | min_loss = stop_strategy['min_loss'] 240 | stop_train = False 241 | successive_count = 0 242 | total_batch_count = 0 243 | valid_thresh = train_params['valid_thresh'] 244 | nms_thresh = train_params['nms_thresh'] 245 | current_best_loss = 10000000000.0 246 | 247 | # 开始迭代训练 248 | for pass_id in range(train_params["num_epochs"]): 249 | logger.info("current pass: {}, start read image".format(pass_id)) 250 | batch_id = 0 251 | 
total_loss = 0.0 252 | 253 | for batch_id, data in enumerate(reader()): 254 | t1 = time.time() 255 | 256 | loss = exe.run(train_program, 257 | feed=feeder.feed(data), 258 | fetch_list=train_fetch_list) # 执行训练 259 | 260 | period = time.time() - t1 261 | loss = np.mean(np.array(loss)) 262 | total_loss += loss 263 | batch_id += 1 264 | total_batch_count += 1 265 | 266 | if batch_id % 10 == 0: # 调整日志输出的频率 267 | logger.info( 268 | "pass {}, trainbatch {}, loss {} time {}".format(pass_id, batch_id, loss, "%2.2f sec" % period)) 269 | 270 | pass_mean_loss = total_loss / batch_id 271 | logger.info("pass {0} train result, current pass mean loss: {1}".format(pass_id, pass_mean_loss)) 272 | 273 | # 采用每训练完一轮停止办法,可以调整为更精细的保存策略 274 | if pass_mean_loss < current_best_loss: 275 | logger.info("temp save {} epcho train result, current best pass loss {}".format(pass_id, pass_mean_loss)) 276 | fluid.io.save_persistables(dirname=train_params['save_model_dir'], main_program=train_program, 277 | executor=exe) 278 | current_best_loss = pass_mean_loss 279 | 280 | logger.info("training till last epcho, end training") 281 | fluid.io.save_persistables(dirname=train_params['save_model_dir'], main_program=train_program, executor=exe) 282 | ``` 283 | --- 284 | 285 | ## 数据集基本信息 286 | * 本组使用的数据集共有900张图片,其中500张来自校园拍摄实景,其余为下载的特定分类图片。 287 | * 所有图片宽高比均为4:3,分辨率为800*600。数据集图片主要分四类,包括单独的行人、自行车与汽车与前三类混杂在一起的图片。 288 | ![avatar](https://wx1.sinaimg.cn/mw690/005CNyQ8ly1gm2hh1x9qaj31770nu7wh.jpg)![avatar](https://wx1.sinaimg.cn/mw690/005CNyQ8ly1gm2hh1xo3pj31720nz4qp.jpg) 289 | 290 | ## 训练过程中的参数调整与模型优化 291 | ### YOLO和YOLO-tiny对比 292 |
293 | 294 | 模型|训练30轮所用时长| 295 | ---|:--:| 296 | YOLO|2h9m| 297 | YOLO-tiny|1h41m| 298 | ### 参数调整 299 | - max_box_num": 8 300 | - nms_thresh": 0.40 301 | - valid_thresh": 0.015 302 | - 优化显存 303 | - os.environ["FLAGS_fraction_of_gpu_memory_to_use"] = '0.92' 304 | - os.environ["FLAGS_eager_delete_tensor_gb"] = '0' 305 | - os.environ["FLAGS_memory_fraction_of_eager_deletion"] = '1' 306 | - os.environ["FLAGS_fast_eager_deletion_mode"]='True' 307 | ### 模型优化 308 | - 优化器更改:原优化器为SGD 309 | ``` 310 | optimizer=fluid.optimizer.SGDOptimizer( 311 | learning_rate=fluid.layers.piecewise_decay(boundaries, values), regularization=fluid.regularizer.L2Decay(0.00005)) 312 | ``` 313 | - 变更为Adam算法 314 | ``` 315 | optimizer=fluid.optimizer.AdamOptimizer(learning_rate=0.01,beta1=0.9,beta2=0.999,regularization=fluid.regularizer.L2Decay(0.00005)) 316 | ``` 317 | - Adam优化对比分析: 318 | - ![avatar](https://wx2.sinaimg.cn/mw690/005CNyQ8gy1gm42jjv0noj30q80joglh.jpg)![avatar](https://wx3.sinaimg.cn/mw690/005CNyQ8gy1gm42gr58jsj30uk0ikaal.jpg) 319 | 320 | ## 网络性能分析 321 | - 挑战集测试分析 322 | - ![avatar](https://wx1.sinaimg.cn/mw690/005CNyQ8ly1gm2ovf5tr1j30hs0dcdg9.jpg)![avatar](https://wx1.sinaimg.cn/mw690/005CNyQ8ly1gm2ovf5w0aj30hs0dcaaa.jpg)![avatar](https://wx4.sinaimg.cn/mw690/005CNyQ8ly1gm2ovf627sj30hs0dcdg1.jpg)![avatar](https://wx2.sinaimg.cn/mw690/005CNyQ8ly1gm2ovf8ihlj30hs0dc0sv.jpg) 323 | - 实际结果 324 | - ![avatar](https://wx4.sinaimg.cn/mw690/005CNyQ8ly1gm2oofq8nzj31400u0hdt.jpg)![avatar](https://wx3.sinaimg.cn/mw690/005CNyQ8ly1gm2oof38goj31400u0kd6.jpg) 325 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import json 3 | import os 4 | import shutil 5 | import operator 6 | import sys 7 | import argparse 8 | import math 9 | 10 | import numpy as np 11 | 12 | MINOVERLAP = 0.5 # default value (defined in the PASCAL VOC2012 challenge) 13 | 14 | parser = 
argparse.ArgumentParser() 15 | parser.add_argument('-na', '--no-animation', help="no animation is shown.", action="store_true") 16 | parser.add_argument('-np', '--no-plot', help="no plot is shown.", action="store_true") 17 | parser.add_argument('-q', '--quiet', help="minimalistic console output.", action="store_true") 18 | # argparse receiving list of classes to be ignored (e.g., python main.py --ignore person book) 19 | parser.add_argument('-i', '--ignore', nargs='+', type=str, help="ignore a list of classes.") 20 | # argparse receiving list of classes with specific IoU (e.g., python main.py --set-class-iou person 0.7) 21 | parser.add_argument('--set-class-iou', nargs='+', type=str, help="set IoU for a specific class.") 22 | args = parser.parse_args() 23 | 24 | ''' 25 | 0,0 ------> x (width) 26 | | 27 | | (Left,Top) 28 | | *_________ 29 | | | | 30 | | | 31 | y |_________| 32 | (height) * 33 | (Right,Bottom) 34 | ''' 35 | 36 | # if there are no classes to ignore then replace None by empty list 37 | if args.ignore is None: 38 | args.ignore = [] 39 | 40 | specific_iou_flagged = False 41 | if args.set_class_iou is not None: 42 | specific_iou_flagged = True 43 | 44 | # make sure that the cwd() is the location of the python script (so that every path makes sense) 45 | os.chdir(os.path.dirname(os.path.abspath(__file__))) 46 | 47 | GT_PATH = os.path.join(os.getcwd(), 'input', 'ground-truth') 48 | DR_PATH = os.path.join(os.getcwd(), 'input', 'detection-results') 49 | # if there are no images then no animation can be shown 50 | IMG_PATH = os.path.join(os.getcwd(), 'input', 'images-optional') 51 | if os.path.exists(IMG_PATH): 52 | for dirpath, dirnames, files in os.walk(IMG_PATH): 53 | if not files: 54 | # no image files found 55 | args.no_animation = True 56 | else: 57 | args.no_animation = True 58 | 59 | # try to import OpenCV if the user didn't choose the option --no-animation 60 | show_animation = False 61 | if not args.no_animation: 62 | try: 63 | import cv2 64 | 
def log_average_miss_rate(prec, rec, num_images):
    """
    Compute the log-average miss rate.

    Calculated by averaging miss rates at 9 evenly spaced FPPI points
    between 1e-2 and 1e0, in log-space.

    :param prec: precision values (array or any sequence of numbers)
    :param rec: recall values, same length as prec
    :param num_images: not used by the computation; kept for compatibility
    :return: tuple (lamr, mr, fppi)
        lamr | log-average miss rate
        mr   | miss rate, computed as 1 - rec
        fppi | false positives per image, computed here as 1 - prec

    references:
        [1] Dollar, Piotr, et al. "Pedestrian Detection: An Evaluation of the
            State of the Art." Pattern Analysis and Machine Intelligence, IEEE
            Transactions on 34.4 (2012): 743 - 761.
    """
    # Accept plain lists as well as arrays.
    prec = np.asarray(prec, dtype=float)
    rec = np.asarray(rec, dtype=float)

    # if there were no detections of that class
    if prec.size == 0:
        return 0, 1, 0

    fppi = 1 - prec
    mr = 1 - rec

    # Sentinel entries guarantee that every reference point finds a match.
    fppi_tmp = np.insert(fppi, 0, -1.0)
    mr_tmp = np.insert(mr, 0, 1.0)

    # Use 9 evenly spaced reference points in log-space
    ref_points = np.logspace(-2.0, 0.0, num=9)
    sampled_mr = np.empty_like(ref_points)
    for i, ref_i in enumerate(ref_points):
        # np.where() always finds at least 1 index, since
        # min(ref_points) = 0.01 and min(fppi_tmp) = -1.0
        j = np.where(fppi_tmp <= ref_i)[-1][-1]
        sampled_mr[i] = mr_tmp[j]

    # log(0) is undefined, so the miss rates are clamped to 1e-10 first
    lamr = math.exp(np.mean(np.log(np.maximum(1e-10, sampled_mr))))

    return lamr, mr, fppi


def error(msg):
    """Print the message and terminate the script with a failure exit status."""
    print(msg)
    # A non-zero status lets shells and callers detect the failure.
    sys.exit(1)


def is_float_between_0_and_1(value):
    """Return True when value parses as a float strictly between 0.0 and 1.0."""
    try:
        val = float(value)
    except (TypeError, ValueError):
        # TypeError covers non-string, non-number input such as None.
        return False
    return 0.0 < val < 1.0
def voc_ap(rec, prec):
    """
    Calculate the AP given the recall and precision sequences.

    1st) A version of the measured precision/recall curve with precision
         monotonically decreasing is computed.
    2nd) The AP is the area under this curve, by numerical integration.

    --- Official matlab code VOC2012---
    mrec=[0 ; rec ; 1];
    mpre=[0 ; prec ; 0];
    for i=numel(mpre)-1:-1:1
            mpre(i)=max(mpre(i),mpre(i+1));
    end
    i=find(mrec(2:end)~=mrec(1:end-1))+1;
    ap=sum((mrec(i)-mrec(i-1)).*mpre(i));

    :param rec: recall values (left unmodified)
    :param prec: precision values, same length as rec (left unmodified)
    :return: tuple (ap, mrec, mpre) with the padded recall list and the
             padded, monotonically decreasing precision list
    """
    # Pad copies instead of inserting into the caller's lists.
    mrec = [0.0] + list(rec) + [1.0]
    mpre = [0.0] + list(prec) + [0.0]

    # Make the precision monotonically decreasing, going from the end to the
    # beginning (matlab: for i=numel(mpre)-1:-1:1).
    for i in range(len(mpre) - 2, -1, -1):
        mpre[i] = max(mpre[i], mpre[i + 1])

    # Integrate over the points where the recall changes
    # (matlab: ap=sum((mrec(i)-mrec(i-1)).*mpre(i))).
    ap = 0.0
    for i in range(1, len(mrec)):
        if mrec[i] != mrec[i - 1]:
            ap += (mrec[i] - mrec[i - 1]) * mpre[i]
    return ap, mrec, mpre
def draw_text_in_image(img, text, pos, color, line_width):
    """
    Draw text on an OpenCV image.

    :param img: image to draw on
    :param text: text to draw
    :param pos: bottom-left corner of the text
    :param color: text colour
    :param line_width: width already used on the current line
    :return: tuple (img, line_width + width of the drawn text)
    """
    font = cv2.FONT_HERSHEY_PLAIN
    font_scale = 1
    thickness = 1  # seventh positional argument of putText is the thickness
    cv2.putText(img, text, pos, font, font_scale, color, thickness)
    text_width, _ = cv2.getTextSize(text, font, font_scale, thickness)[0]
    return img, (line_width + text_width)


def adjust_axes(r, t, fig, axes):
    """
    Widen the x axis so the text drawn next to the largest bar stays visible.

    :param r: renderer of the figure canvas
    :param t: text artist whose width must fit
    :param fig: figure containing the axes
    :param axes: axes whose x limit is extended
    """
    # get text width for re-scaling
    bb = t.get_window_extent(renderer=r)
    text_width_inches = bb.width / fig.dpi
    # scale the upper x limit by the relative growth the text would need
    current_fig_width = fig.get_figwidth()
    new_fig_width = current_fig_width + text_width_inches
    proportion = new_fig_width / current_fig_width
    x_lim = axes.get_xlim()
    axes.set_xlim([x_lim[0], x_lim[1] * proportion])


def draw_plot_func(dictionary, n_classes, window_title, plot_title, x_label, output_path, to_show, plot_color, true_p_bar):
    """
    Draw a horizontal bar chart of per-class values and save it.

    :param dictionary: mapping of class name -> value to plot
    :param n_classes: number of bars to draw
    :param window_title: title of the figure window
    :param plot_title: title drawn above the plot
    :param x_label: label of the x axis
    :param output_path: file the figure is saved to
    :param to_show: when true, the figure is also shown
    :param plot_color: bar and text colour of the single-colour plot
    :param true_p_bar: "" for a single-colour plot, otherwise a mapping of
                       class name -> true-positive count used to split each
                       bar into false positives (red) and true positives (green)
    """
    if not dictionary:
        # Nothing to plot; unpacking zip(*[]) below would raise ValueError.
        return

    # sort by increasing value, so the largest bar comes last
    sorted_dic_by_value = sorted(dictionary.items(), key=operator.itemgetter(1))
    # unpack the list of tuples into two sequences
    sorted_keys, sorted_values = zip(*sorted_dic_by_value)
    last_index = len(sorted_values) - 1

    if true_p_bar != "":
        # Special case, draw each bar in two colours:
        #   - green -> TP: True Positives (object detected and matches ground-truth)
        #   - red   -> FP: False Positives (object detected but does not match ground-truth)
        fp_sorted = [dictionary[key] - true_p_bar[key] for key in sorted_keys]
        tp_sorted = [true_p_bar[key] for key in sorted_keys]
        plt.barh(range(n_classes), fp_sorted, align='center', color='crimson', label='False Positive')
        plt.barh(range(n_classes), tp_sorted, align='center', color='forestgreen', label='True Positive', left=fp_sorted)
        # add legend
        plt.legend(loc='lower right')
        # write the numbers on the side of each bar
        fig = plt.gcf()  # gcf - get current figure
        axes = plt.gca()
        r = fig.canvas.get_renderer()
        for i, val in enumerate(sorted_values):
            fp_str_val = " " + str(fp_sorted[i])
            tp_str_val = fp_str_val + " " + str(tp_sorted[i])
            # trick to paint multicolor with offset:
            # first paint everything and then repaint the first number
            t = plt.text(val, i, tp_str_val, color='forestgreen', va='center', fontweight='bold')
            plt.text(val, i, fp_str_val, color='crimson', va='center', fontweight='bold')
            if i == last_index:  # largest bar
                adjust_axes(r, t, fig, axes)
    else:
        plt.barh(range(n_classes), sorted_values, color=plot_color)
        # write the number on the side of each bar
        fig = plt.gcf()  # gcf - get current figure
        axes = plt.gca()
        r = fig.canvas.get_renderer()
        for i, val in enumerate(sorted_values):
            str_val = " " + str(val)  # add a space before
            if val < 1.0:
                str_val = " {0:.2f}".format(val)
            t = plt.text(val, i, str_val, color=plot_color, va='center', fontweight='bold')
            # re-set axes to show number inside the figure
            if i == last_index:  # largest bar
                adjust_axes(r, t, fig, axes)

    # Set the window title. Newer Matplotlib releases only offer this on the
    # figure manager; the canvas method is kept as a fallback for old ones.
    manager = getattr(fig.canvas, "manager", None)
    if manager is not None and hasattr(manager, "set_window_title"):
        manager.set_window_title(window_title)
    elif hasattr(fig.canvas, "set_window_title"):
        fig.canvas.set_window_title(window_title)

    # write classes in y axis
    tick_font_size = 12
    plt.yticks(range(n_classes), sorted_keys, fontsize=tick_font_size)

    # Re-scale the figure height according to the number of classes
    init_height = fig.get_figheight()
    # compute the matrix height in points and inches
    dpi = fig.dpi
    height_pt = n_classes * (tick_font_size * 1.4)  # 1.4 (some spacing)
    height_in = height_pt / dpi
    # compute the required figure height
    top_margin = 0.15  # in percentage of the figure height
    bottom_margin = 0.05  # in percentage of the figure height
    figure_height = height_in / (1 - top_margin - bottom_margin)
    # set new height
    if figure_height > init_height:
        fig.set_figheight(figure_height)

    # set plot title
    plt.title(plot_title, fontsize=14)
    # set axis titles
    plt.xlabel(x_label, fontsize='large')
    # adjust size of window
    fig.tight_layout()
    # save the plot
    fig.savefig(output_path)
    # show image
    if to_show:
        plt.show()
    # close the plot
    plt.close()
def _set_window_title(fig, title):
    """Set the GUI window title of *fig* on old and new Matplotlib versions.

    Newer Matplotlib releases no longer provide ``set_window_title`` on the
    canvas, so the figure manager is tried first and the canvas method is
    only used as a fallback.
    """
    manager = getattr(fig.canvas, "manager", None)
    if manager is not None and hasattr(manager, "set_window_title"):
        manager.set_window_title(title)
    elif hasattr(fig.canvas, "set_window_title"):
        fig.canvas.set_window_title(title)


def _load_json(path):
    """Return the parsed content of a JSON file, closing the handle right away."""
    with open(path) as infile:
        return json.load(infile)


# ---------------------------------------------------------------------------
# Create the ".temp_files/" and "output/" directories
# ---------------------------------------------------------------------------
TEMP_FILES_PATH = ".temp_files"
if not os.path.exists(TEMP_FILES_PATH):  # if it doesn't exist already
    os.makedirs(TEMP_FILES_PATH)
output_files_path = "output"
if os.path.exists(output_files_path):  # if it exists already
    # reset the output directory
    shutil.rmtree(output_files_path)

os.makedirs(output_files_path)
if draw_plot:
    os.makedirs(os.path.join(output_files_path, "classes"))
if show_animation:
    os.makedirs(os.path.join(output_files_path, "images", "detections_one_by_one"))

# ---------------------------------------------------------------------------
# ground-truth
#   Load each of the ground-truth files into a temporary ".json" file.
#   Create a list of all the class names present in the ground-truth
#   (gt_classes).
# ---------------------------------------------------------------------------
# get a list with the ground-truth files
ground_truth_files_list = glob.glob(GT_PATH + '/*.txt')
if len(ground_truth_files_list) == 0:
    error("Error: No ground-truth files found!")
ground_truth_files_list.sort()
# number of (non-difficult) objects per class
gt_counter_per_class = {}
# number of images containing at least one (non-difficult) object of the class
counter_images_per_class = {}

gt_files = []
for txt_file in ground_truth_files_list:
    file_id = txt_file.split(".txt", 1)[0]
    file_id = os.path.basename(os.path.normpath(file_id))
    # check if there is a corresponding detection-results file
    temp_path = os.path.join(DR_PATH, (file_id + ".txt"))
    if not os.path.exists(temp_path):
        error_msg = "Error. File not found: {}\n".format(temp_path)
        error_msg += "(You can avoid this error message by running extra/intersect-gt-and-dr.py)"
        error(error_msg)
    lines_list = file_lines_to_list(txt_file)
    # create ground-truth dictionary
    bounding_boxes = []
    is_difficult = False
    already_seen_classes = []
    for line in lines_list:
        try:
            if "difficult" in line:
                class_name, left, top, right, bottom, _difficult = line.split()
                is_difficult = True
            else:
                class_name, left, top, right, bottom = line.split()
        except ValueError:
            # The field list below mirrors the unpacking above; the previous
            # message had lost it and only printed "['difficult']".
            error_msg = "Error: File " + txt_file + " in the wrong format.\n"
            error_msg += " Expected: class_name left top right bottom ['difficult']\n"
            error_msg += " Received: " + line
            error_msg += "\n\nIf you have a class name with spaces between words you should remove them\n"
            error_msg += "by running the script \"remove_space.py\" or \"rename_class.py\" in the \"extra/\" folder."
            error(error_msg)
        # check if class is in the ignore list, if yes skip
        if class_name in args.ignore:
            continue
        bbox = left + " " + top + " " + right + " " + bottom
        if is_difficult:
            # difficult objects are stored but never counted
            bounding_boxes.append({"class_name": class_name, "bbox": bbox, "used": False, "difficult": True})
            is_difficult = False
        else:
            bounding_boxes.append({"class_name": class_name, "bbox": bbox, "used": False})
            # count that object
            gt_counter_per_class[class_name] = gt_counter_per_class.get(class_name, 0) + 1

            # count each class at most once per image
            if class_name not in already_seen_classes:
                counter_images_per_class[class_name] = counter_images_per_class.get(class_name, 0) + 1
                already_seen_classes.append(class_name)

    # dump bounding_boxes into a ".json" file
    new_temp_file = TEMP_FILES_PATH + "/" + file_id + "_ground_truth.json"
    gt_files.append(new_temp_file)
    with open(new_temp_file, 'w') as outfile:
        json.dump(bounding_boxes, outfile)

# classes sorted alphabetically
gt_classes = sorted(gt_counter_per_class.keys())
n_classes = len(gt_classes)

# ---------------------------------------------------------------------------
# Check format of the flag --set-class-iou (if used)
#   e.g. check if class exists
# ---------------------------------------------------------------------------
if specific_iou_flagged:
    n_args = len(args.set_class_iou)
    error_msg = \
        '\n --set-class-iou [class_1] [IoU_1] [class_2] [IoU_2] [...]'
    if n_args % 2 != 0:
        error('Error, missing arguments. Flag usage:' + error_msg)
    # [class_1] [IoU_1] [class_2] [IoU_2]
    # specific_iou_classes = ['class_1', 'class_2']
    specific_iou_classes = args.set_class_iou[::2]  # even
    # iou_list = ['IoU_1', 'IoU_2']
    iou_list = args.set_class_iou[1::2]  # odd
    if len(specific_iou_classes) != len(iou_list):
        error('Error, missing arguments. Flag usage:' + error_msg)
    for tmp_class in specific_iou_classes:
        if tmp_class not in gt_classes:
            error('Error, unknown class \"' + tmp_class + '\". Flag usage:' + error_msg)
    for num in iou_list:
        if not is_float_between_0_and_1(num):
            error('Error, IoU must be between 0.0 and 1.0. Flag usage:' + error_msg)

# ---------------------------------------------------------------------------
# detection-results
#   Load each of the detection-results files into a temporary ".json" file
#   (one file per ground-truth class, sorted by decreasing confidence).
# ---------------------------------------------------------------------------
# get a list with the detection-results files
dr_files_list = glob.glob(DR_PATH + '/*.txt')
dr_files_list.sort()

for class_index, class_name in enumerate(gt_classes):
    bounding_boxes = []
    for txt_file in dr_files_list:
        # the first time it checks if all the corresponding ground-truth files exist
        file_id = txt_file.split(".txt", 1)[0]
        file_id = os.path.basename(os.path.normpath(file_id))
        temp_path = os.path.join(GT_PATH, (file_id + ".txt"))
        if class_index == 0:
            if not os.path.exists(temp_path):
                error_msg = "Error. File not found: {}\n".format(temp_path)
                error_msg += "(You can avoid this error message by running extra/intersect-gt-and-dr.py)"
                error(error_msg)
        lines = file_lines_to_list(txt_file)
        for line in lines:
            try:
                tmp_class_name, confidence, left, top, right, bottom = line.split()
            except ValueError:
                # field list restored to match the unpacking above
                error_msg = "Error: File " + txt_file + " in the wrong format.\n"
                error_msg += " Expected: class_name confidence left top right bottom\n"
                error_msg += " Received: " + line
                error(error_msg)
            if tmp_class_name == class_name:
                bbox = left + " " + top + " " + right + " " + bottom
                bounding_boxes.append({"confidence": confidence, "file_id": file_id, "bbox": bbox})
    # sort detection-results by decreasing confidence
    bounding_boxes.sort(key=lambda x: float(x['confidence']), reverse=True)
    with open(TEMP_FILES_PATH + "/" + class_name + "_dr.json", 'w') as outfile:
        json.dump(bounding_boxes, outfile)

# ---------------------------------------------------------------------------
# Calculate the AP for each class
# ---------------------------------------------------------------------------
sum_AP = 0.0
ap_dictionary = {}
lamr_dictionary = {}
# open file to store the output
with open(output_files_path + "/output.txt", 'w') as output_file:
    output_file.write("# AP and precision/recall per class\n")
    count_true_positives = {}
    for class_index, class_name in enumerate(gt_classes):
        count_true_positives[class_name] = 0
        # Load detection-results of that class
        dr_file = TEMP_FILES_PATH + "/" + class_name + "_dr.json"
        dr_data = _load_json(dr_file)

        # Assign detection-results to ground-truth objects
        nd = len(dr_data)
        tp = [0] * nd  # creates an array of zeros of size nd
        fp = [0] * nd
        for idx, detection in enumerate(dr_data):
            file_id = detection["file_id"]
            if show_animation:
                # find ground truth image
                ground_truth_img = glob.glob1(IMG_PATH, file_id + ".*")
                if len(ground_truth_img) == 0:
                    error("Error. Image not found with id: " + file_id)
                elif len(ground_truth_img) > 1:
                    error("Error. Multiple image with id: " + file_id)
                else:  # found image
                    # Load image
                    img = cv2.imread(IMG_PATH + "/" + ground_truth_img[0])
                    # load image with draws of multiple detections
                    img_cumulative_path = output_files_path + "/images/" + ground_truth_img[0]
                    if os.path.isfile(img_cumulative_path):
                        img_cumulative = cv2.imread(img_cumulative_path)
                    else:
                        img_cumulative = img.copy()
                    # Add bottom border to image
                    bottom_border = 60
                    BLACK = [0, 0, 0]
                    img = cv2.copyMakeBorder(img, 0, bottom_border, 0, 0, cv2.BORDER_CONSTANT, value=BLACK)
            # assign detection-results to ground truth object if any
            # open ground-truth with that file_id (re-read every time because
            # the "used" flags are persisted to this file after each match)
            gt_file = TEMP_FILES_PATH + "/" + file_id + "_ground_truth.json"
            ground_truth_data = _load_json(gt_file)
            ovmax = -1
            gt_match = -1
            # load detected object bounding-box
            bb = [float(x) for x in detection["bbox"].split()]
            for obj in ground_truth_data:
                # look for a class_name match
                if obj["class_name"] == class_name:
                    bbgt = [float(x) for x in obj["bbox"].split()]
                    bi = [max(bb[0], bbgt[0]), max(bb[1], bbgt[1]), min(bb[2], bbgt[2]), min(bb[3], bbgt[3])]
                    iw = bi[2] - bi[0] + 1
                    ih = bi[3] - bi[1] + 1
                    if iw > 0 and ih > 0:
                        # compute overlap (IoU) = area of intersection / area of union
                        ua = (bb[2] - bb[0] + 1) * (bb[3] - bb[1] + 1) + (bbgt[2] - bbgt[0]
                                        + 1) * (bbgt[3] - bbgt[1] + 1) - iw * ih
                        ov = iw * ih / ua
                        if ov > ovmax:
                            ovmax = ov
                            gt_match = obj

            # assign detection as true positive/don't care/false positive
            if show_animation:
                status = "NO MATCH FOUND!"  # status is only used in the animation
            # set minimum overlap
            min_overlap = MINOVERLAP
            if specific_iou_flagged:
                if class_name in specific_iou_classes:
                    index = specific_iou_classes.index(class_name)
                    min_overlap = float(iou_list[index])
            if ovmax >= min_overlap:
                # a match with a "difficult" object is neither TP nor FP
                if "difficult" not in gt_match:
                    if not bool(gt_match["used"]):
                        # true positive
                        tp[idx] = 1
                        gt_match["used"] = True
                        count_true_positives[class_name] += 1
                        # update the ".json" file
                        with open(gt_file, 'w') as f:
                            f.write(json.dumps(ground_truth_data))
                        if show_animation:
                            status = "MATCH!"
                    else:
                        # false positive (multiple detection)
                        fp[idx] = 1
                        if show_animation:
                            status = "REPEATED MATCH!"
            else:
                # false positive
                fp[idx] = 1
                if show_animation and ovmax > 0:
                    status = "INSUFFICIENT OVERLAP"

            # Draw image to show animation
            if show_animation:
                height = img.shape[0]
                # colors (OpenCV works with BGR)
                white = (255, 255, 255)
                light_blue = (255, 200, 100)
                green = (0, 255, 0)
                light_red = (30, 30, 255)
                # 1st line
                margin = 10
                v_pos = int(height - margin - (bottom_border / 2.0))
                text = "Image: " + ground_truth_img[0] + " "
                img, line_width = draw_text_in_image(img, text, (margin, v_pos), white, 0)
                text = "Class [" + str(class_index) + "/" + str(n_classes) + "]: " + class_name + " "
                img, line_width = draw_text_in_image(img, text, (margin + line_width, v_pos), light_blue, line_width)
                if ovmax != -1:
                    color = light_red
                    if status == "INSUFFICIENT OVERLAP":
                        text = "IoU: {0:.2f}% ".format(ovmax*100) + "< {0:.2f}% ".format(min_overlap*100)
                    else:
                        text = "IoU: {0:.2f}% ".format(ovmax*100) + ">= {0:.2f}% ".format(min_overlap*100)
                        color = green
                    img, _ = draw_text_in_image(img, text, (margin + line_width, v_pos), color, line_width)
                # 2nd line
                v_pos += int(bottom_border / 2.0)
                rank_pos = str(idx+1)  # rank position (idx starts at 0)
                text = "Detection #rank: " + rank_pos + " confidence: {0:.2f}% ".format(float(detection["confidence"])*100)
                img, line_width = draw_text_in_image(img, text, (margin, v_pos), white, 0)
                color = light_red
                if status == "MATCH!":
                    color = green
                text = "Result: " + status + " "
                img, line_width = draw_text_in_image(img, text, (margin + line_width, v_pos), color, line_width)

                font = cv2.FONT_HERSHEY_SIMPLEX
                if ovmax > 0:  # if there is intersections between the bounding-boxes
                    bbgt = [int(round(float(x))) for x in gt_match["bbox"].split()]
                    cv2.rectangle(img, (bbgt[0], bbgt[1]), (bbgt[2], bbgt[3]), light_blue, 2)
                    cv2.rectangle(img_cumulative, (bbgt[0], bbgt[1]), (bbgt[2], bbgt[3]), light_blue, 2)
                    cv2.putText(img_cumulative, class_name, (bbgt[0], bbgt[1] - 5), font, 0.6, light_blue, 1, cv2.LINE_AA)
                bb = [int(i) for i in bb]
                cv2.rectangle(img, (bb[0], bb[1]), (bb[2], bb[3]), color, 2)
                cv2.rectangle(img_cumulative, (bb[0], bb[1]), (bb[2], bb[3]), color, 2)
                cv2.putText(img_cumulative, class_name, (bb[0], bb[1] - 5), font, 0.6, color, 1, cv2.LINE_AA)
                # show image
                cv2.imshow("Animation", img)
                cv2.waitKey(20)  # show for 20 ms
                # save image to output
                output_img_path = output_files_path + "/images/detections_one_by_one/" + class_name + "_detection" + str(idx) + ".jpg"
                cv2.imwrite(output_img_path, img)
                # save the image with all the objects drawn to it
                cv2.imwrite(img_cumulative_path, img_cumulative)

        # compute precision/recall: turn the per-detection flags into
        # running (cumulative) counts, in decreasing-confidence order
        cumsum = 0
        for idx, val in enumerate(fp):
            fp[idx] += cumsum
            cumsum += val
        cumsum = 0
        for idx, val in enumerate(tp):
            tp[idx] += cumsum
            cumsum += val
        rec = tp[:]
        for idx, val in enumerate(tp):
            rec[idx] = float(tp[idx]) / gt_counter_per_class[class_name]
        prec = tp[:]
        for idx, val in enumerate(tp):
            prec[idx] = float(tp[idx]) / (fp[idx] + tp[idx])

        ap, mrec, mprec = voc_ap(rec[:], prec[:])
        sum_AP += ap
        text = "{0:.2f}%".format(ap*100) + " = " + class_name + " AP "
        # Write to output.txt
        rounded_prec = ['%.2f' % elem for elem in prec]
        rounded_rec = ['%.2f' % elem for elem in rec]
        output_file.write(text + "\n Precision: " + str(rounded_prec) + "\n Recall :" + str(rounded_rec) + "\n\n")
        if not args.quiet:
            print(text)
        ap_dictionary[class_name] = ap

        n_images = counter_images_per_class[class_name]
        lamr, mr, fppi = log_average_miss_rate(np.array(prec), np.array(rec), n_images)
        lamr_dictionary[class_name] = lamr

        # Draw plot
        if draw_plot:
            plt.plot(rec, prec, '-o')
            # add a new penultimate point to the list (mrec[-2], 0.0)
            # since the last line segment (and respective area) do not affect the AP value
            area_under_curve_x = mrec[:-1] + [mrec[-2]] + [mrec[-1]]
            area_under_curve_y = mprec[:-1] + [0.0] + [mprec[-1]]
            plt.fill_between(area_under_curve_x, 0, area_under_curve_y, alpha=0.2, edgecolor='r')
            # set window title
            fig = plt.gcf()  # gcf - get current figure
            _set_window_title(fig, 'AP ' + class_name)
            # set plot title
            plt.title('class: ' + text)
            # set axis titles
            plt.xlabel('Recall')
            plt.ylabel('Precision')
            # optional - set axes
            axes = plt.gca()  # gca - get current axes
            axes.set_xlim([0.0, 1.0])
            axes.set_ylim([0.0, 1.05])  # .05 to give some extra space
            # save the plot
            fig.savefig(output_files_path + "/classes/" + class_name + ".png")
            plt.cla()  # clear axes for next plot

    if show_animation:
        cv2.destroyAllWindows()

    output_file.write("\n# mAP of all classes\n")
    mAP = sum_AP / n_classes
    text = "mAP = {0:.2f}%".format(mAP*100)
    output_file.write(text + "\n")
    print(text)

# ---------------------------------------------------------------------------
# Draw false negatives
# ---------------------------------------------------------------------------
if show_animation:
    pink = (203, 192, 255)
    for tmp_file in gt_files:
        ground_truth_data = _load_json(tmp_file)
        # get name of corresponding image
        start = TEMP_FILES_PATH + '/'
        img_id = tmp_file[tmp_file.find(start)+len(start):tmp_file.rfind('_ground_truth.json')]
        # Resolve the real file name: the AP loop stores cumulative images
        # under the source image's own extension, which need not be ".jpg".
        img_candidates = glob.glob1(IMG_PATH, img_id + ".*")
        img_name = img_candidates[0] if img_candidates else img_id + ".jpg"
        img_cumulative_path = output_files_path + "/images/" + img_name
        img = cv2.imread(img_cumulative_path)
        if img is None:
            img_path = IMG_PATH + '/' + img_name
            img = cv2.imread(img_path)
        if img is None:
            # same policy as the AP loop instead of failing inside OpenCV
            error("Error. Image not found with id: " + img_id)
        # draw false negatives
        for obj in ground_truth_data:
            if not obj['used']:
                bbgt = [int(round(float(x))) for x in obj["bbox"].split()]
                cv2.rectangle(img, (bbgt[0], bbgt[1]), (bbgt[2], bbgt[3]), pink, 2)
        cv2.imwrite(img_cumulative_path, img)

# remove the temp_files directory
shutil.rmtree(TEMP_FILES_PATH)

# ---------------------------------------------------------------------------
# Count total of detection-results
# ---------------------------------------------------------------------------
# iterate through all the files
det_counter_per_class = {}
for txt_file in dr_files_list:
    # get lines to list
    lines_list = file_lines_to_list(txt_file)
    for line in lines_list:
        class_name = line.split()[0]
        # check if class is in the ignore list, if yes skip
        if class_name in args.ignore:
            continue
        # count that object
        det_counter_per_class[class_name] = det_counter_per_class.get(class_name, 0) + 1
dr_classes = list(det_counter_per_class.keys())


# ---------------------------------------------------------------------------
# Plot the total number of occurrences of each class in the ground-truth
# ---------------------------------------------------------------------------
if draw_plot:
    window_title = "ground-truth-info"
    plot_title = "ground-truth\n"
    plot_title += "(" + str(len(ground_truth_files_list)) + " files and " + str(n_classes) + " classes)"
    x_label = "Number of objects per class"
    output_path = output_files_path + "/ground-truth-info.png"
    to_show = False
    plot_color = 'forestgreen'
    draw_plot_func(
        gt_counter_per_class,
        n_classes,
        window_title,
        plot_title,
        x_label,
        output_path,
        to_show,
        plot_color,
        '',
        )

# ---------------------------------------------------------------------------
# Write number of ground-truth objects per class to output.txt
# ---------------------------------------------------------------------------
with open(output_files_path + "/output.txt", 'a') as output_file:
    output_file.write("\n# Number of ground-truth objects per class\n")
    for class_name in sorted(gt_counter_per_class):
        output_file.write(class_name + ": " + str(gt_counter_per_class[class_name]) + "\n")

# ---------------------------------------------------------------------------
# Finish counting true positives
# ---------------------------------------------------------------------------
for class_name in dr_classes:
    # if class exists in detection-result but not in ground-truth then there are no true positives in that class
    if class_name not in gt_classes:
        count_true_positives[class_name] = 0

# ---------------------------------------------------------------------------
# Plot the total number of occurrences of each class in the
# "detection-results" folder
# ---------------------------------------------------------------------------
if draw_plot:
    window_title = "detection-results-info"
    # Plot title
    plot_title = "detection-results\n"
    plot_title += "(" + str(len(dr_files_list)) + " files and "
    count_non_zero_values_in_dictionary = sum(int(x) > 0 for x in det_counter_per_class.values())
    plot_title += str(count_non_zero_values_in_dictionary) + " detected classes)"
    # end Plot title
    x_label = "Number of objects per class"
    output_path = output_files_path + "/detection-results-info.png"
    to_show = False
    plot_color = 'forestgreen'
    true_p_bar = count_true_positives
    draw_plot_func(
        det_counter_per_class,
        len(det_counter_per_class),
        window_title,
        plot_title,
        x_label,
        output_path,
        to_show,
        plot_color,
        true_p_bar
        )

# ---------------------------------------------------------------------------
# Write number of detected objects per class to output.txt
# ---------------------------------------------------------------------------
with open(output_files_path + "/output.txt", 'a') as output_file:
    output_file.write("\n# Number of detected objects per class\n")
    for class_name in sorted(dr_classes):
        n_det = det_counter_per_class[class_name]
        text = class_name + ": " + str(n_det)
        text += " (tp:" + str(count_true_positives[class_name]) + ""
        text += ", fp:" + str(n_det - count_true_positives[class_name]) + ")\n"
        output_file.write(text)

# ---------------------------------------------------------------------------
# Draw log-average miss rate plot (Show lamr of all classes in decreasing order)
# ---------------------------------------------------------------------------
if draw_plot:
    window_title = "lamr"
    plot_title = "log-average miss rate"
    x_label = "log-average miss rate"
    output_path = output_files_path + "/lamr.png"
    to_show = False
    plot_color = 'royalblue'
    draw_plot_func(
        lamr_dictionary,
        n_classes,
        window_title,
        plot_title,
        x_label,
        output_path,
        to_show,
        plot_color,
        ""
        )

# ---------------------------------------------------------------------------
# Draw mAP plot (Show AP's of all classes in decreasing order)
# ---------------------------------------------------------------------------
if draw_plot:
    window_title = "mAP"
    plot_title = "mAP = {0:.2f}%".format(mAP*100)
    x_label = "Average Precision"
    output_path = output_files_path + "/mAP.png"
    to_show = True
    plot_color = 'royalblue'
    draw_plot_func(
        ap_dictionary,
        n_classes,
        window_title,
        plot_title,
        x_label,
        output_path,
        to_show,
        plot_color,
        ""
        )
897 | plot_title, 898 | x_label, 899 | output_path, 900 | to_show, 901 | plot_color, 902 | "" 903 | ) 904 | -------------------------------------------------------------------------------- /objectDetection.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | """ 3 | 训练常基于dark-net的YOLOv3网络,目标检测 4 | """ 5 | from __future__ import absolute_import 6 | from __future__ import division 7 | from __future__ import print_function 8 | import os 9 | 10 | os.environ["FLAGS_fraction_of_gpu_memory_to_use"] = '0.92' 11 | os.environ["FLAGS_eager_delete_tensor_gb"] = '0' 12 | os.environ["FLAGS_memory_fraction_of_eager_deletion"] = '1' 13 | os.environ["FLAGS_fast_eager_deletion_mode"]='True' 14 | 15 | import uuid 16 | import numpy as np 17 | import time 18 | import six 19 | import math 20 | import random 21 | import paddle 22 | import paddle.fluid as fluid 23 | import logging 24 | import xml.etree.ElementTree 25 | import codecs 26 | import json 27 | 28 | from paddle.fluid.initializer import MSRA 29 | from paddle.fluid.param_attr import ParamAttr 30 | from paddle.fluid.regularizer import L2Decay 31 | from PIL import Image, ImageEnhance, ImageDraw, ImageFile 32 | ImageFile.LOAD_TRUNCATED_IMAGES = True 33 | Image.MAX_IMAGE_PIXELS = None 34 | 35 | logger = None # 日志对象 36 | 37 | train_params = { 38 | "data_dir": "data/data6045", # 数据目录 39 | "train_list": "train.txt", # 训练集文件 40 | "eval_list": "eval.txt", 41 | "class_dim": -1, 42 | "label_dict": {}, # 标签字典 43 | "num_dict": {}, 44 | "image_count": -1, 45 | "continue_train": True, # 是否加载前一次的训练参数,接着训练 46 | "pretrained": False, # 是否预训练 47 | "pretrained_model_dir": "./pretrained-model", 48 | "save_model_dir": "./yolo-model", # 模型保存目录 49 | "model_prefix": "yolo-v3", # 模型前缀 50 | "freeze_dir": "freeze_model", 51 | "use_tiny": False, # 是否使用 裁剪 tiny 模型 52 | "max_box_num": 8, # 一幅图上最多有多少个目标 53 | "num_epochs": 100, # 训练轮次 54 | "train_batch_size": 7, # 
# Global training / inference configuration.  The entries "class_dim",
# "label_dict", "num_dict" and "image_count" are placeholders that
# init_train_parameters() fills in from the files under "data_dir".
train_params = {
    "data_dir": "data/data6045",  # data directory
    "train_list": "train.txt",  # training-set list file
    "eval_list": "eval.txt",
    "class_dim": -1,
    "label_dict": {},  # label name -> class index
    "num_dict": {},  # class index -> label name
    "image_count": -1,
    "continue_train": True,  # load the previous run's parameters and resume training
    "pretrained": False,  # whether to start from a pretrained model
    "pretrained_model_dir": "./pretrained-model",
    "save_model_dir": "./yolo-model",  # where the model is saved
    "model_prefix": "yolo-v3",  # model file prefix
    "freeze_dir": "freeze_model",
    "use_tiny": False,  # whether to use the reduced "tiny" model
    "max_box_num": 8,  # maximum number of objects per image
    "num_epochs": 100,  # number of training epochs
    # For the full YOLOv3 keep the batch small or memory blows up; the tiny
    # model tolerates a somewhat larger batch.
    "train_batch_size": 7,
    "use_gpu": True,  # whether to run on GPU
    "yolo_cfg": {  # full YOLO model parameters
        # The original network uses a side length of 608; reduced to 448 here
        # to speed up training and prediction.
        "input_size": [3, 448, 448],
        "anchors": [7, 10, 12, 22, 24, 17, 22, 45, 46, 33, 43, 88, 85, 66, 115, 146, 275, 240],  # anchor sizes, flat pairs
        "anchor_mask": [[6, 7, 8], [3, 4, 5], [0, 1, 2]]
    },
    "yolo_tiny_cfg": {  # YOLO tiny model parameters
        "input_size": [3, 256, 256],
        "anchors": [6, 8, 13, 15, 22, 34, 48, 50, 81, 100, 205, 191],
        "anchor_mask": [[3, 4, 5], [0, 1, 2]]
    },
    "ignore_thresh": 0.7,
    "mean_rgb": [127.5, 127.5, 127.5],
    "mode": "train",
    "multi_data_reader_count": 4,
    "apply_distort": True,  # whether to apply image-distortion augmentation
    "nms_top_k": 300,
    "nms_pos_k": 300,
    "valid_thresh": 0.01,
    "nms_thresh": 0.40,  # non-maximum-suppression threshold
    "image_distort_strategy": {  # image-distortion strategy
        "expand_prob": 0.5,  # expansion probability
        "expand_max_ratio": 4,
        "hue_prob": 0.5,  # hue
        "hue_delta": 18,
        "contrast_prob": 0.5,  # contrast
        "contrast_delta": 0.5,
        "saturation_prob": 0.5,  # saturation
        "saturation_delta": 0.5,
        "brightness_prob": 0.5,  # brightness
        "brightness_delta": 0.125
    },
    "sgd_strategy": {  # gradient-descent configuration
        "learning_rate": 0.002,
        "lr_epochs": [30, 50, 65],  # learning-rate decay boundaries (3 numbers -> 4 segments)
        "lr_decay": [1, 0.5, 0.25, 0.1]  # learning-rate factor per segment, matching lr_epochs
    },
    "early_stop": {
        "sample_frequency": 50,
        "successive_limit": 3,
        "min_loss": 2.5,
        "min_curr_map": 0.84
    }
}


def init_train_parameters():
    """Fill the data-dependent entries of ``train_params``.

    Reads ``<data_dir>/label_list`` (one label per line) and the training list
    ``<data_dir>/<train_list>`` (one sample per line) and sets:

    * ``num_dict``   - class index -> label
    * ``label_dict`` - label -> class index
    * ``class_dim``  - number of labels
    * ``image_count`` - number of training samples

    Blank lines are ignored in both files so that a trailing empty line does
    not register an empty label or inflate the counts.  The two dictionaries
    are rebuilt in place, so calling the function again never leaves stale
    entries behind.

    :return: None
    """
    data_dir = train_params['data_dir']
    file_list = os.path.join(data_dir, train_params['train_list'])  # training set
    label_list = os.path.join(data_dir, "label_list")  # label file

    # codecs.open decodes the files as UTF-8 on both Python 2 and 3
    with codecs.open(label_list, encoding='utf-8') as flist:
        labels = [line.strip() for line in flist]
    labels = [label for label in labels if label]

    # clear() keeps the dict objects themselves, so existing references stay valid
    train_params['num_dict'].clear()
    train_params['label_dict'].clear()
    for index, label in enumerate(labels):
        train_params['num_dict'][index] = label
        train_params['label_dict'][label] = index
    train_params['class_dim'] = len(labels)

    with codecs.open(file_list, encoding='utf-8') as flist:
        samples = [line.strip() for line in flist]
    train_params['image_count'] = len([sample for sample in samples if sample])  # number of images


def init_log_config():
    """Configure the root logger to write INFO+ records to ``./logs/train.log``.

    The module-level ``logger`` is bound to the root logger.  The log directory
    is created under the current working directory if needed.  The function is
    idempotent: if the root logger already has a ``FileHandler`` for the same
    file, no second handler is attached (which would duplicate every record
    and re-truncate the file that the first handler is still writing to).
    """
    global logger

    logger = logging.getLogger()  # root logger
    logger.setLevel(logging.INFO)
    log_path = os.path.join(os.getcwd(), 'logs')

    if not os.path.isdir(log_path):
        try:
            os.makedirs(log_path)
        except OSError:
            # another process may have created it in the meantime
            if not os.path.isdir(log_path):
                raise

    log_name = os.path.join(log_path, 'train.log')  # training log file
    target = os.path.abspath(log_name)
    for handler in logger.handlers:
        if isinstance(handler, logging.FileHandler) and handler.baseFilename == target:
            return

    fh = logging.FileHandler(log_name, mode='w')  # start a fresh log file
    fh.setLevel(logging.DEBUG)

    formatter = logging.Formatter("%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s")
    fh.setFormatter(formatter)
    logger.addHandler(fh)


init_log_config()
class YOLOv3(object):
    """YOLOv3 detector on a darknet-53 style backbone, built with paddle.fluid.

    :param class_num: number of object classes
    :param anchors: flat list of anchor sizes, two consecutive values per anchor
    :param anchor_mask: one list of anchor indices per detection head
    """

    def __init__(self, class_num, anchors, anchor_mask):
        self.outputs = []  # head outputs collected by net()
        self.downsample_ratio = 1  # accumulated down-sampling factor
        self.anchor_mask = anchor_mask
        self.anchors = anchors
        self.class_num = class_num

        self.yolo_anchors = []
        self.yolo_classes = []

        # For every head, gather the two values of each anchor it selects.
        for mask_pair in self.anchor_mask:
            mask_anchors = []
            for mask in mask_pair:
                mask_anchors.append(self.anchors[2 * mask])
                mask_anchors.append(self.anchors[2 * mask + 1])
            self.yolo_anchors.append(mask_anchors)
            self.yolo_classes.append(class_num)

    def name(self):
        return 'YOLOv3'

    def get_anchors(self):
        """Return the flat anchor list passed to the constructor."""
        return self.anchors

    def get_anchor_mask(self):
        """Return the per-head anchor index lists passed to the constructor."""
        return self.anchor_mask

    def get_class_num(self):
        return self.class_num

    def get_downsample_ratio(self):
        """Return the down-sampling factor accumulated by the last net() call."""
        return self.downsample_ratio

    def get_yolo_anchors(self):
        """Return, per head, the flat list of anchor values selected by its mask."""
        return self.yolo_anchors

    def get_yolo_classes(self):
        """Return a list with ``class_num`` repeated once per head."""
        return self.yolo_classes

    def _reset_build_state(self):
        """Forget everything a previous net() call accumulated."""
        self.outputs = []
        self.downsample_ratio = 1

    def conv_bn(self,
                input,  # input tensor
                num_filters,  # number of convolution kernels
                filter_size,  # kernel size
                stride,
                padding,
                use_cudnn=True):
        """Convolution (no bias, no activation) -> batch norm -> leaky ReLU(0.1)."""
        conv = fluid.layers.conv2d(input=input,
                                   num_filters=num_filters,
                                   filter_size=filter_size,
                                   stride=stride,
                                   padding=padding,
                                   act=None,
                                   use_cudnn=use_cudnn,
                                   param_attr=ParamAttr(initializer=fluid.initializer.Normal(0., 0.02)),
                                   bias_attr=False)

        # The batch-norm parameters should not take part in regularization, so
        # they get an explicit L2 term with coefficient 0.
        # The activation is applied as a separate layer (original author's
        # note: doing it inside batch_norm would not allow choosing the slope).
        param_attr = ParamAttr(initializer=fluid.initializer.Normal(0., 0.02),
                               regularizer=L2Decay(0.))
        bias_attr = ParamAttr(initializer=fluid.initializer.Constant(0.0),
                              regularizer=L2Decay(0.))
        out = fluid.layers.batch_norm(input=conv, act=None,
                                      param_attr=param_attr,
                                      bias_attr=bias_attr)
        # leaky ReLU gives negative inputs a small non-zero slope
        out = fluid.layers.leaky_relu(out, 0.1)
        return out

    def down_sample(self, input, num_filters, filter_size=3, stride=2, padding=1):
        """Down-sample with a strided conv_bn and double ``downsample_ratio``.

        Example with the defaults: a 448x448 input becomes
        ((448 + 2) - 3) / 2 + 1 = 224 per side.
        """
        self.downsample_ratio *= 2
        return self.conv_bn(input,
                            num_filters=num_filters,
                            filter_size=filter_size,
                            stride=stride,
                            padding=padding)

    def basic_block(self, input, num_filters):
        """Residual block: 1x1 conv_bn, 3x3 conv_bn (2 * num_filters), plus the input."""
        conv1 = self.conv_bn(input, num_filters, filter_size=1, stride=1, padding=0)
        conv2 = self.conv_bn(conv1, num_filters * 2, filter_size=3, stride=1, padding=1)
        out = fluid.layers.elementwise_add(x=input, y=conv2, act=None)  # H(x) = F(x) + x
        return out

    def layer_warp(self, input, num_filters, count):
        """Chain basic blocks: one is always built, then ``count - 1`` more."""
        res_out = self.basic_block(input, num_filters)
        for _ in range(1, count):
            res_out = self.basic_block(res_out, num_filters)
        return res_out

    def up_sample(self, input, scale=2):
        """Nearest-neighbour up-sampling of the last two axes by ``scale``."""
        # get dynamic upsample output shape
        shape_nchw = fluid.layers.shape(input)
        shape_hw = fluid.layers.slice(shape_nchw, axes=[0], starts=[2], ends=[4])
        shape_hw.stop_gradient = True
        in_shape = fluid.layers.cast(shape_hw, dtype='int32')
        out_shape = in_shape * scale  # target spatial shape
        out_shape.stop_gradient = True

        # resize by actual_shape
        out = fluid.layers.resize_nearest(input=input,
                                          scale=scale,
                                          actual_shape=out_shape)
        return out

    def yolo_detection_block(self, input, num_filters):
        """Build one detection block and return ``(route, tip)``.

        Two rounds of (1x1 conv_bn, 3x3 conv_bn) are followed by a 1x1 conv_bn
        producing ``route`` and a 3x3 conv_bn on top of it producing ``tip``.
        """
        assert num_filters % 2 == 0, "num_filters {} cannot be divided by 2".format(num_filters)

        conv = input
        for _ in range(2):
            conv = self.conv_bn(conv, num_filters, filter_size=1, stride=1, padding=0)
            conv = self.conv_bn(conv, num_filters * 2, filter_size=3, stride=1, padding=1)
        route = self.conv_bn(conv, num_filters, filter_size=1, stride=1, padding=0)
        tip = self.conv_bn(route, num_filters * 2, filter_size=3, stride=1, padding=1)
        return route, tip

    def net(self, img):
        """Build the whole network on ``img`` and return the list of head outputs.

        The build state is reset first, so calling ``net`` again on the same
        instance no longer appends to the previous outputs or keeps doubling
        ``downsample_ratio``.
        """
        self._reset_build_state()

        stages = [1, 2, 8, 8, 4]
        assert len(self.anchor_mask) <= len(stages), "anchor masks can't bigger than down_sample times"
        # first convolution layer (stride 1, keeps the spatial size)
        conv1 = self.conv_bn(img, num_filters=32, filter_size=3, stride=1, padding=1)
        # first down-sampling; the second argument is the number of kernels
        downsample_ = self.down_sample(conv1, conv1.shape[1] * 2)
        blocks = []

        # build the groups of basic blocks
        for i, stage_count in enumerate(stages):
            block = self.layer_warp(downsample_,  # input
                                    32 * (2 ** i),  # number of kernels
                                    stage_count)  # number of basic blocks
            blocks.append(block)
            if i < len(stages) - 1:  # down-sample after every group but the last
                downsample_ = self.down_sample(block, block.shape[1] * 2)
        blocks = blocks[-1:-4:-1]  # last three groups, deepest first, for the cross-scale links

        # yolo detector
        for i, block in enumerate(blocks):
            # cross-scale link: join the up-sampled route with this block on axis 1
            if i > 0:
                block = fluid.layers.concat(input=[route, block], axis=1)

            route, tip = self.yolo_detection_block(block,
                                                   num_filters=512 // (2 ** i))

            param_attr = ParamAttr(initializer=fluid.initializer.Normal(0., 0.02))
            bias_attr = ParamAttr(initializer=fluid.initializer.Constant(0.0), regularizer=L2Decay(0.))
            block_out = fluid.layers.conv2d(input=tip,
                                            # 5 elements represent x|y|h|w|score
                                            num_filters=len(self.anchor_mask[i]) * (self.class_num + 5),
                                            filter_size=1,
                                            stride=1,
                                            padding=0,
                                            act=None,
                                            param_attr=param_attr,
                                            bias_attr=bias_attr)
            self.outputs.append(block_out)

            # enlarge the feature map for the next cross-scale link
            if i < len(blocks) - 1:
                route = self.conv_bn(route, 256 // (2 ** i), filter_size=1, stride=1, padding=0)
                route = self.up_sample(route)

        return self.outputs
# Tiny (reduced) YOLO model
class YOLOv3Tiny(object):
    """Graph builder for the reduced YOLOv3 network.

    Holds the anchor configuration, tracks the accumulated downsample ratio and
    collects the detection outputs created by :meth:`net`.
    """

    def __init__(self, class_num, anchors, anchor_mask):
        """Store the configuration and derive the per-head anchor lists.

        :param class_num: number of classes
        :param anchors: flat anchor list, indexed in pairs (``2 * k`` and ``2 * k + 1``)
        :param anchor_mask: list of index groups, one group per detection head
        """
        self.outputs = []
        self.downsample_ratio = 1
        self.anchor_mask = anchor_mask
        self.anchors = anchors
        self.class_num = class_num

        self.yolo_anchors = []
        self.yolo_classes = []
        for mask_pair in self.anchor_mask:
            mask_anchors = []
            for mask in mask_pair:
                # each mask index selects two consecutive entries of the flat anchor list
                mask_anchors.append(self.anchors[2 * mask])
                mask_anchors.append(self.anchors[2 * mask + 1])
            self.yolo_anchors.append(mask_anchors)
            self.yolo_classes.append(class_num)

    def name(self):
        """Return the model name string."""
        return 'YOLOv3-tiny'

    def get_anchors(self):
        """Return the flat anchor list passed to the constructor."""
        return self.anchors

    def get_anchor_mask(self):
        """Return the anchor mask passed to the constructor."""
        return self.anchor_mask

    def get_class_num(self):
        """Return the number of classes."""
        return self.class_num

    def get_downsample_ratio(self):
        """Return the downsample ratio accumulated by ``down_sample`` calls so far."""
        return self.downsample_ratio

    def get_yolo_anchors(self):
        """Return the per-head anchor lists built in ``__init__``."""
        return self.yolo_anchors

    def get_yolo_classes(self):
        """Return the per-head class counts built in ``__init__``."""
        return self.yolo_classes

    def conv_bn(self,
                input,
                num_filters,
                filter_size,
                stride,
                padding,
                num_groups=1,
                use_cudnn=True):
        """Build a bias-free conv2d followed by batch norm with ReLU activation.

        :return: the batch-norm output
        """
        conv = fluid.layers.conv2d(
            input=input,
            num_filters=num_filters,
            filter_size=filter_size,
            stride=stride,
            padding=padding,
            act=None,
            groups=num_groups,
            use_cudnn=use_cudnn,
            param_attr=ParamAttr(initializer=fluid.initializer.Normal(0., 0.02)),
            bias_attr=False)

        # batch_norm parameters should not be regularized, so they get an explicit
        # L2 regularizer with coefficient 0
        out = fluid.layers.batch_norm(
            input=conv, act='relu',
            param_attr=ParamAttr(initializer=fluid.initializer.Normal(0., 0.02), regularizer=L2Decay(0.)),
            bias_attr=ParamAttr(initializer=fluid.initializer.Constant(0.0), regularizer=L2Decay(0.)))

        return out

    def depthwise_conv_bn(self, input, filter_size=3, stride=1, padding=1):
        """Build a conv+BN whose filter count and group count both equal ``input.shape[1]``."""
        num_filters = input.shape[1]
        return self.conv_bn(input,
                            num_filters=num_filters,
                            filter_size=filter_size,
                            stride=stride,
                            padding=padding,
                            num_groups=num_filters)

    def down_sample(self, input, pool_size=2, pool_stride=2):
        """Max-pool ``input`` and double the tracked downsample ratio."""
        self.downsample_ratio *= 2
        return fluid.layers.pool2d(input=input, pool_type='max', pool_size=pool_size,
                                   pool_stride=pool_stride)

    def basic_block(self, input, num_filters):
        """Build one 3x3 conv+BN followed by ``down_sample``."""
        conv1 = self.conv_bn(input, num_filters, filter_size=3, stride=1, padding=1)
        out = self.down_sample(conv1)
        return out

    def up_sample(self, input, scale=2):
        """Enlarge ``input`` by ``scale`` with nearest-neighbour resizing."""
        # get dynamic upsample output shape
        shape_nchw = fluid.layers.shape(input)
        # entries 2..3 of the shape vector (presumably H and W of NCHW data - confirm)
        shape_hw = fluid.layers.slice(shape_nchw, axes=[0], starts=[2], ends=[4])
        shape_hw.stop_gradient = True
        in_shape = fluid.layers.cast(shape_hw, dtype='int32')
        out_shape = in_shape * scale
        out_shape.stop_gradient = True

        # resize by actual_shape
        out = fluid.layers.resize_nearest(
            input=input,
            scale=scale,
            actual_shape=out_shape)
        return out

    def yolo_detection_block(self, input, num_filters):
        """Build a 1x1 conv (``route``) followed by a 3x3 conv (``tip``) and return both."""
        route = self.conv_bn(input, num_filters, filter_size=1, stride=1, padding=0)
        tip = self.conv_bn(route, num_filters * 2, filter_size=3, stride=1, padding=1)
        return route, tip

    def net(self, img):
        """Build the tiny backbone and two detection heads; return ``self.outputs``.

        NOTE(review): the nesting below was reconstructed from a dump that lost
        its indentation - confirm against the original file.
        """
        # darknet-tiny
        stages = [16, 32, 64, 128, 256, 512]  # filter count per stage
        assert len(self.anchor_mask) <= len(stages), "anchor masks can't bigger than down_sample times"
        # 256x256
        tmp = img
        blocks = []
        for i, stage_count in enumerate(stages):
            if i == len(stages) - 1:
                block = self.conv_bn(tmp, stage_count, filter_size=3, stride=1, padding=1)
                blocks.append(block)
                # NOTE(review): the two depthwise results are overwritten without being
                # used (the next call reads blocks[-1] again); the layers are still
                # created in the program. Looks unintended - confirm before changing,
                # since it alters the parameter set.
                block = self.depthwise_conv_bn(blocks[-1])
                block = self.depthwise_conv_bn(blocks[-1])
                block = self.conv_bn(blocks[-1], stage_count * 2, filter_size=1, stride=1, padding=0)
                blocks.append(block)
            else:
                tmp = self.basic_block(tmp, stage_count)
                blocks.append(tmp)

        # the final block plus the block at index 3 feed the two heads
        blocks = [blocks[-1], blocks[3]]

        # yolo detector
        for i, block in enumerate(blocks):
            # cross-scale link
            if i > 0:
                block = fluid.layers.concat(input=[route, block], axis=1)
            if i < 1:
                route, tip = self.yolo_detection_block(block, num_filters=256 // (2 ** i))
            else:
                tip = self.conv_bn(block, num_filters=256, filter_size=3, stride=1, padding=1)

            param_attr = ParamAttr(initializer=fluid.initializer.Normal(0., 0.02))
            bias_attr = ParamAttr(initializer=fluid.initializer.Constant(0.0), regularizer=L2Decay(0.))
            block_out = fluid.layers.conv2d(input=tip,
                                            # 5 elements represent x|y|h|w|score
                                            num_filters=len(self.anchor_mask[i]) * (self.class_num + 5),
                                            filter_size=1,
                                            stride=1,
                                            padding=0,
                                            act=None,
                                            param_attr=param_attr,
                                            bias_attr=bias_attr)
            self.outputs.append(block_out)
            # For the cross-scale link, shrink channels and enlarge the feature map
            if i < len(blocks) - 1:
                route = self.conv_bn(route, 128 // (2 ** i), filter_size=1, stride=1, padding=0)
                route = self.up_sample(route)

        return self.outputs


def get_yolo(is_tiny, class_num, anchors, anchor_mask):
    """Return a ``YOLOv3Tiny`` when ``is_tiny`` is truthy, otherwise a ``YOLOv3``."""
    if is_tiny:
        return YOLOv3Tiny(class_num, anchors, anchor_mask)
    else:
        return YOLOv3(class_num, anchors, anchor_mask)


class Sampler(object):
    """
    Plain record of crop-sampling settings; the constructor only stores its arguments.
    """

    def __init__(self, max_sample, max_trial, min_scale, max_scale,
                 min_aspect_ratio, max_aspect_ratio, min_jaccard_overlap,
                 max_jaccard_overlap):
        self.max_sample = max_sample
        self.max_trial = max_trial
        self.min_scale = min_scale
        self.max_scale = max_scale
        self.min_aspect_ratio = min_aspect_ratio
        self.max_aspect_ratio = max_aspect_ratio
        self.min_jaccard_overlap = min_jaccard_overlap
        self.max_jaccard_overlap = max_jaccard_overlap
class bbox(object):
    """
    Bounding rectangle stored as its two corners (xmin, ymin) and (xmax, ymax).
    """

    def __init__(self, xmin, ymin, xmax, ymax):
        self.xmin = xmin
        self.ymin = ymin
        self.xmax = xmax
        self.ymax = ymax


def box_to_center_relative(box, img_height, img_width):
    """
    Convert a box given as [x1, y1, w, h] (pixels) to center mode
    [center_x, center_y, w, h], clipped to the image and divided by the
    image width / height so that every value is relative to the image size.

    :param box: sequence of four numbers [x1, y1, w, h]
    :param img_height: image height in pixels
    :param img_width: image width in pixels
    :return: ``np.array([center_x, center_y, w, h])`` in relative units
    """
    assert len(box) == 4, "box should be a len(4) list or tuple"
    x, y, w, h = box

    # Inclusive pixel corners, clipped to the image.
    x1 = max(x, 0)
    x2 = min(x + w - 1, img_width - 1)
    y1 = max(y, 0)
    y2 = min(y + h - 1, img_height - 1)

    x = (x1 + x2) / 2 / img_width  # relative center x
    y = (y1 + y2) / 2 / img_height  # relative center y
    w = (x2 - x1) / img_width  # box width / image width
    h = (y2 - y1) / img_height  # box height / image height

    return np.array([x, y, w, h])


def resize_img(img, sampled_labels, input_size):
    """
    Resize a PIL image to the network input size.

    :param img: PIL image
    :param sampled_labels: unused; kept so existing callers keep working
    :param input_size: indexable as [channels, height, width]
    :return: the resized PIL image
    """
    # PIL expects (width, height). The array built from the image is later
    # transposed to CHW and must match input_size = [C, H, W], so the width is
    # input_size[2] and the height is input_size[1]. (The previous code passed
    # them swapped, which only works for square inputs.)
    return img.resize((input_size[2], input_size[1]), Image.BILINEAR)


def box_iou_xywh(box1, box2, offset=0.0):
    """
    Intersection-over-union of boxes given as [center_x, center_y, w, h].

    The callers in this file pass coordinates normalised to [0, 1], where the
    "+1 pixel" convention is wrong: it made disjoint boxes overlap and pushed
    every IoU towards 1. The offset therefore defaults to 0; pass ``offset=1``
    for inclusive integer pixel coordinates.

    :param box1: array of shape (N, 4) or (1, 4)
    :param box2: array of shape (N, 4) or (1, 4); broadcast against ``box1``
    :param offset: value added to every width/height (0 for continuous coordinates)
    :return: array of IoU values; 0 where the union area is not positive
    """
    assert box1.shape[-1] == 4, "Box1 shape[-1] should be 4."
    assert box2.shape[-1] == 4, "Box2 shape[-1] should be 4."

    # Corner coordinates of both box sets.
    b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
    b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
    b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
    b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2

    # Intersection extents, clamped at zero for disjoint boxes.
    inter_w = np.maximum(np.minimum(b1_x2, b2_x2) - np.maximum(b1_x1, b2_x1) + offset, 0)
    inter_h = np.maximum(np.minimum(b1_y2, b2_y2) - np.maximum(b1_y1, b2_y1) + offset, 0)

    inter_area = np.asarray(inter_w * inter_h, dtype=np.float64)
    b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset)
    b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset)
    union = np.asarray(b1_area + b2_area - inter_area, dtype=np.float64)

    # Degenerate (zero-area) pairs yield 0 instead of NaN.
    iou = np.zeros_like(union)
    np.divide(inter_area, union, out=iou, where=union > 0)
    return iou
def box_crop(boxes, labels, crop, img_shape):
    """
    Re-express relative center-format boxes inside a crop window.

    :param boxes: array (N, 4) of [center_x, center_y, w, h] relative to the image
    :param labels: array (N,) of labels; entries of dropped boxes are multiplied by 0
    :param crop: (x, y, w, h) of the crop window in pixels
    :param img_shape: (width, height) of the image in pixels
    :return: ``(boxes, labels, kept)`` where boxes are relative to the crop,
             dropped boxes are zeroed and ``kept`` is the number of surviving boxes
    """
    crop_x, crop_y, crop_w, crop_h = (float(v) for v in crop)
    im_w, im_h = (float(v) for v in img_shape)

    # Center format (relative) -> corner format (pixels), written into a copy.
    corners = boxes.copy()
    half_w, half_h = boxes[:, 2] / 2, boxes[:, 3] / 2
    corners[:, 0] = (boxes[:, 0] - half_w) * im_w
    corners[:, 1] = (boxes[:, 1] - half_h) * im_h
    corners[:, 2] = (boxes[:, 0] + half_w) * im_w
    corners[:, 3] = (boxes[:, 1] + half_h) * im_h

    top_left = np.array([crop_x, crop_y])
    bottom_right = np.array([crop_x + crop_w, crop_y + crop_h])

    # A box survives only if its center lies inside the crop window ...
    centers = (corners[:, :2] + corners[:, 2:]) / 2.0
    keep = np.logical_and(top_left <= centers, centers <= bottom_right).all(axis=1)

    # ... and it still has positive extent after clipping to the window.
    corners[:, :2] = np.maximum(corners[:, :2], top_left)
    corners[:, 2:] = np.minimum(corners[:, 2:], bottom_right)
    corners -= np.tile(top_left, 2)  # shift into crop-local pixel coordinates
    keep = np.logical_and(keep, (corners[:, :2] < corners[:, 2:]).all(axis=1))

    keep_f = keep.astype('float32')
    corners = corners * keep_f[:, np.newaxis]
    labels = labels * keep_f

    # Corner format (pixels) -> center format relative to the crop size.
    x1, y1 = corners[:, 0].copy(), corners[:, 1].copy()
    x2, y2 = corners[:, 2].copy(), corners[:, 3].copy()
    corners[:, 0] = (x1 + x2) / 2 / crop_w
    corners[:, 1] = (y1 + y2) / 2 / crop_h
    corners[:, 2] = (x2 - x1) / crop_w
    corners[:, 3] = (y2 - y1) / crop_h

    return corners, labels, keep.sum()
# Image augmentation: brightness, contrast, saturation, hue
def _random_enhance(img, enhancer_name, prob_key, delta_key):
    """Apply one ``PIL.ImageEnhance`` enhancer with a random factor in [1 - delta, 1 + delta].

    The enhancer is applied only when a uniform draw falls below the configured
    probability; the enhancer class is looked up lazily, after that check.
    """
    strategy = train_params['image_distort_strategy']
    if np.random.uniform(0, 1) < strategy[prob_key]:
        max_delta = strategy[delta_key]
        factor = np.random.uniform(-max_delta, max_delta) + 1
        img = getattr(ImageEnhance, enhancer_name)(img).enhance(factor)
    return img


def random_brightness(img):
    """Randomly change brightness according to ``brightness_prob`` / ``brightness_delta``."""
    return _random_enhance(img, 'Brightness', 'brightness_prob', 'brightness_delta')


def random_contrast(img):
    """Randomly change contrast according to ``contrast_prob`` / ``contrast_delta``."""
    return _random_enhance(img, 'Contrast', 'contrast_prob', 'contrast_delta')


def random_saturation(img):
    """Randomly change saturation according to ``saturation_prob`` / ``saturation_delta``."""
    return _random_enhance(img, 'Color', 'saturation_prob', 'saturation_delta')


def random_hue(img):
    """Randomly shift the hue channel by up to ``hue_delta`` according to ``hue_prob``.

    :param img: PIL image
    :return: the input image, or a new RGB image with shifted hue
    """
    strategy = train_params['image_distort_strategy']
    if np.random.uniform(0, 1) < strategy['hue_prob']:
        hue_delta = strategy['hue_delta']
        delta = np.random.uniform(-hue_delta, hue_delta)
        img_hsv = np.array(img.convert('HSV'))
        # Hue is cyclic, so wrap explicitly modulo 256. The previous code stored
        # the float sum straight into the uint8 array; casting an out-of-range
        # float to uint8 is platform dependent (wraps on some CPUs, saturates on
        # others) and triggers a RuntimeWarning on recent NumPy.
        shifted = np.trunc(img_hsv[:, :, 0].astype(np.float64) + delta).astype(np.int64)
        img_hsv[:, :, 0] = (shifted % 256).astype(np.uint8)
        img = Image.fromarray(img_hsv, mode='HSV').convert('RGB')

    return img


def distort_image(img):
    """Apply the four colour distortions in one of two randomly chosen orders.

    :param img: PIL image
    :return: the (possibly) distorted image
    """
    # Apply different distort order
    if np.random.uniform(0, 1) > 0.5:
        pipeline = (random_brightness, random_contrast, random_saturation, random_hue)
    else:
        pipeline = (random_brightness, random_saturation, random_hue, random_contrast)
    for step in pipeline:
        img = step(img)
    return img
# Random crop
def random_crop(img, boxes, labels, scales=(0.3, 1.0), max_ratio=2.0, constraints=None, max_trial=50):
    """
    Randomly crop the image and remap the boxes into the crop.

    With probability of roughly 0.4 (``random.random() > 0.6``), or when there
    are no boxes, the inputs are returned unchanged. Otherwise candidate crops
    are sampled per IoU constraint, one candidate that still contains at least
    one box is chosen at random, and the crop is resized back to the original
    image size.

    :param img: PIL image
    :param boxes: array (N, 4) of relative [center_x, center_y, w, h]
    :param labels: array (N,) of labels
    :param scales: (min, max) range of the crop scale
    :param max_ratio: maximum aspect ratio (its inverse is the minimum)
    :param constraints: list of (min_iou, max_iou) pairs; a default list is used when falsy
    :param max_trial: sampling attempts per constraint
    :return: ``(img, boxes, labels)``
    """
    if random.random() > 0.6:
        return img, boxes, labels
    if len(boxes) == 0:
        return img, boxes, labels

    if not constraints:
        constraints = [(0.1, 1.0),
                       (0.3, 1.0),
                       (0.5, 1.0),
                       (0.7, 1.0),
                       (0.9, 1.0),
                       (0.0, 1.0)]  # (min, max) IoU pairs

    w, h = img.size
    crops = [(0, 0, w, h)]  # the full image is always a candidate

    for min_iou, max_iou in constraints:
        for _ in range(max_trial):
            scale = random.uniform(scales[0], scales[1])
            aspect_ratio = random.uniform(max(1 / max_ratio, scale * scale),
                                          min(max_ratio, 1 / scale / scale))
            crop_h = min(int(h * scale / np.sqrt(aspect_ratio)), h)
            crop_w = min(int(w * scale * np.sqrt(aspect_ratio)), w)
            if crop_w < 1 or crop_h < 1:
                # an empty crop cannot be resized later; try again
                continue
            # randint is inclusive, so a crop as large as the image (offset 0)
            # is valid; randrange(w - crop_w) raised ValueError in that case.
            crop_x = random.randint(0, w - crop_w)
            crop_y = random.randint(0, h - crop_h)
            crop_box = np.array([[
                (crop_x + crop_w / 2.0) / w,
                (crop_y + crop_h / 2.0) / h,
                crop_w / float(w),
                crop_h / float(h)
            ]])

            iou = box_iou_xywh(crop_box, boxes)
            if min_iou <= iou.min() and max_iou >= iou.max():
                crops.append((crop_x, crop_y, crop_w, crop_h))
                break

    # Try the candidates in random order until one keeps at least one box.
    while crops:
        crop = crops.pop(np.random.randint(0, len(crops)))
        crop_boxes, crop_labels, box_num = box_crop(boxes, labels, crop, (w, h))
        if box_num < 1:
            continue
        img = img.crop((crop[0], crop[1], crop[0] + crop[2],
                        crop[1] + crop[3])).resize(img.size, Image.LANCZOS)
        return img, crop_boxes, crop_labels
    return img, boxes, labels
# Expansion
def random_expand(img, gtboxes, keep_ratio=True):
    """
    Randomly place the image on a larger canvas filled with the mean colour.

    The expansion is applied with probability ``expand_prob`` (the previous
    test was inverted and applied it with probability ``1 - expand_prob``).

    :param img: PIL image (3 channels are assumed when it is pasted onto the canvas)
    :param gtboxes: array (N, 4) of relative [center_x, center_y, w, h]; modified
                    in place when the image is expanded
    :param keep_ratio: use the same expansion ratio for both axes
    :return: ``(img, gtboxes)``
    """
    strategy = train_params['image_distort_strategy']
    if np.random.uniform(0, 1) >= strategy['expand_prob']:
        return img, gtboxes

    max_ratio = strategy['expand_max_ratio']
    w, h = img.size
    c = 3
    ratio_x = random.uniform(1, max_ratio)
    if keep_ratio:
        ratio_y = ratio_x
    else:
        ratio_y = random.uniform(1, max_ratio)
    oh = int(h * ratio_y)
    ow = int(w * ratio_x)
    off_x = random.randint(0, ow - w)
    off_y = random.randint(0, oh - h)

    # Canvas pre-filled with the per-channel mean value.
    out_img = np.zeros((oh, ow, c), np.uint8)
    for i in range(c):
        out_img[:, :, i] = train_params['mean_rgb'][i]

    out_img[off_y: off_y + h, off_x: off_x + w, :] = img
    # Re-express the relative boxes with respect to the enlarged canvas.
    gtboxes[:, 0] = ((gtboxes[:, 0] * w) + off_x) / float(ow)
    gtboxes[:, 1] = ((gtboxes[:, 1] * h) + off_y) / float(oh)
    gtboxes[:, 2] = gtboxes[:, 2] / ratio_x
    gtboxes[:, 3] = gtboxes[:, 3] / ratio_y

    return Image.fromarray(out_img), gtboxes


# Preprocessing: augmentation (train mode only), resize, normalisation, HWC -> CHW
def preprocess(img, bbox_labels, input_size, mode):
    """
    Turn a PIL image and its labels into network input.

    :param img: PIL image
    :param bbox_labels: list of rows [label, center_x, center_y, w, h, difficult]
    :param input_size: network input size, passed to ``resize_img``
    :param mode: augmentation is applied only when this equals ``'train'``
    :return: ``(img, sample_labels)`` - a float32 CHW array and the label array
    """
    sample_labels = np.array(bbox_labels)

    if mode == 'train':
        if train_params['apply_distort']:  # colour distortion enabled?
            img = distort_image(img)

        img, gtboxes = random_expand(img, sample_labels[:, 1:5])  # expansion
        img, gtboxes, gtlabels = random_crop(img, gtboxes, sample_labels[:, 0])  # random crop
        sample_labels[:, 0] = gtlabels
        sample_labels[:, 1:5] = gtboxes

    img = resize_img(img, sample_labels, input_size)
    img = np.array(img).astype('float32')
    img -= train_params['mean_rgb']
    img = img.transpose((2, 0, 1))  # HWC to CHW
    img *= 0.007843
    return img, sample_labels


def _parse_bbox_labels(object_strs, im_width, im_height):
    """Parse the JSON annotations of one image into [label, cx, cy, w, h, difficult] rows."""
    bbox_labels = []
    for object_str in object_strs:
        if len(object_str) <= 1:
            continue

        annotation = json.loads(object_str)
        label = float(train_params['label_dict'][annotation['value']])
        corners = annotation['coordinate']  # [[x1, y1], [x2, y2]]
        # x, y, w, h in pixels
        box = [corners[0][0], corners[0][1],
               corners[1][0] - corners[0][0], corners[1][1] - corners[0][1]]
        rel = box_to_center_relative(box, im_height, im_width)
        difficult = float(0)
        bbox_labels.append([label, float(rel[0]), float(rel[1]),
                            float(rel[2]), float(rel[3]), difficult])
    return bbox_labels


# Data reader
# Reads images listed in the sample file, augments them, and yields image data, boxes and labels
def custom_reader(file_list, data_dir, input_size, mode):
    """
    Create a sample generator.

    In ``'train'`` / ``'eval'`` mode every line is ``<image path>\\t<json>\\t<json>...``
    and the generator yields ``(img, boxes, labels)`` padded to ``max_box_num``
    rows. In ``'test'`` mode every line is an image path and the generator
    yields the opened PIL image.

    :param file_list: list of lines; shuffled in place on every pass
    :param data_dir: directory the image paths are relative to (train/eval)
    :param input_size: network input size
    :param mode: ``'train'``, ``'eval'`` or ``'test'``
    :return: a zero-argument generator function
    """

    def reader():
        np.random.shuffle(file_list)  # shuffle the sample list

        for line in file_list:  # one image with its annotations per line
            if mode == 'train' or mode == 'eval':
                ###################### dataset-specific parsing starts here ############################
                parts = line.split('\t')
                image_path = parts[0]

                img = Image.open(os.path.join(data_dir, image_path))
                if img.mode != 'RGB':
                    img = img.convert('RGB')
                im_width, im_height = img.size

                # layout: label | x-center | y-center | width | height | difficult
                bbox_labels = _parse_bbox_labels(parts[1:], im_width, im_height)
                ###################### dataset-specific parsing ends here ############################

                if len(bbox_labels) == 0:
                    continue

                img, sample_labels = preprocess(img, bbox_labels, input_size, mode)
                if len(sample_labels) == 0:
                    continue

                boxes = sample_labels[:, 1:5]  # coordinates
                lbls = sample_labels[:, 0].astype('int32')  # labels
                max_box_num = train_params['max_box_num']  # maximum number of objects per image
                cope_size = min(len(boxes), max_box_num)  # rows actually copied
                ret_boxes = np.zeros((max_box_num, 4), dtype=np.float32)
                ret_lbls = np.zeros((max_box_num), dtype=np.int32)
                ret_boxes[0: cope_size] = boxes[0: cope_size]
                ret_lbls[0: cope_size] = lbls[0: cope_size]

                yield img, ret_boxes, ret_lbls

            elif mode == 'test':
                img_path = os.path.join(line)

                yield Image.open(img_path)

    return reader
# Batched, shuffled data reader
def single_custom_reader(file_path, data_dir, input_size, mode):
    """
    Build a shuffled, batched reader for the sample list ``data_dir/file_path``.

    :param file_path: name of the sample list file inside ``data_dir``
    :param data_dir: dataset directory
    :param input_size: network input size
    :param mode: reader mode passed on to ``custom_reader``
    :return: a reader yielding batches of ``train_batch_size`` samples
    """
    file_path = os.path.join(data_dir, file_path)

    # Close the list file deterministically instead of leaking the handle.
    with open(file_path) as list_file:
        images = [line.strip() for line in list_file]
    reader = custom_reader(images, data_dir, input_size, mode)
    reader = paddle.reader.shuffle(reader, train_params['train_batch_size'])
    reader = paddle.batch(reader, train_params['train_batch_size'])

    return reader


# Optimizer definition
def optimizer_sgd_setting():
    """
    Create the optimizer used for training.

    Despite its name the function returns an Adam optimizer with a fixed
    learning rate of 0.01 and L2 weight decay; the ``sgd_strategy`` section of
    ``train_params`` is not read here.

    :return: ``fluid.optimizer.AdamOptimizer``
    """
    optimizer = fluid.optimizer.AdamOptimizer(learning_rate=0.01,
                                              beta1=0.9,
                                              beta2=0.999,
                                              regularization=fluid.regularizer.L2Decay(0.00005))
    return optimizer


# Create the program, the feeder and the YOLO model
def build_program_with_feeder(main_prog, startup_prog, place):
    """
    Build the training graph inside ``main_prog`` / ``startup_prog``.

    NOTE(review): the source lost its indentation; everything below is assumed
    to run inside ``program_guard`` and only the model construction inside
    ``unique_name.guard()`` - confirm against the original file.

    :param main_prog: program receiving the training ops
    :param startup_prog: program receiving the initialisation ops
    :param place: device the feeder converts data for
    :return: ``(feeder, reader, loss)``
    """
    max_box_num = train_params['max_box_num']
    ues_tiny = train_params['use_tiny']  # use the tiny model?
    yolo_config = train_params['yolo_tiny_cfg'] if ues_tiny else train_params['yolo_cfg']

    with fluid.program_guard(main_prog, startup_prog):  # switch the global main/startup programs
        img = fluid.layers.data(name='img', shape=yolo_config['input_size'], dtype='float32')  # image
        gt_box = fluid.layers.data(name='gt_box', shape=[max_box_num, 4], dtype='float32')  # boxes
        gt_label = fluid.layers.data(name='gt_label', shape=[max_box_num], dtype='int32')  # labels

        feeder = fluid.DataFeeder(feed_list=[img, gt_box, gt_label],
                                  place=place,
                                  program=main_prog)
        reader = single_custom_reader(train_params['train_list'],
                                      train_params['data_dir'],
                                      yolo_config['input_size'], 'train')

        with fluid.unique_name.guard():
            # create the YOLO model
            model = get_yolo(ues_tiny, train_params['class_dim'], yolo_config['anchors'],
                             yolo_config['anchor_mask'])
            outputs = model.net(img)
        return feeder, reader, get_loss(model, outputs, gt_box, gt_label)
# Loss function
def get_loss(model, outputs, gt_box, gt_label):
    """
    Sum the YOLOv3 loss over all detection outputs and attach the optimizer.

    NOTE(review): the source lost its indentation; the summation and the
    optimizer are assumed to sit inside the ``unique_name.guard('train')``
    block - confirm, since it affects generated variable names.

    :param model: model object providing anchors, masks, class count and downsample ratio
    :param outputs: detection outputs returned by ``model.net``
    :param gt_box: ground-truth box variable
    :param gt_label: ground-truth label variable
    :return: the summed loss variable (already passed to ``optimizer.minimize``)
    """
    losses = []
    downsample_ratio = model.get_downsample_ratio()

    with fluid.unique_name.guard('train'):
        for i, out in enumerate(outputs):
            loss = fluid.layers.yolov3_loss(x=out,
                                            gt_box=gt_box,  # ground-truth boxes
                                            gt_label=gt_label,  # labels
                                            anchors=model.get_anchors(),  # anchors
                                            anchor_mask=model.get_anchor_mask()[i],
                                            class_num=model.get_class_num(),
                                            ignore_thresh=train_params['ignore_thresh'],
                                            # Original author's note: with few classes, False works
                                            # better, otherwise the score becomes very small.
                                            use_label_smooth=False,
                                            downsample_ratio=downsample_ratio)
            losses.append(fluid.layers.reduce_mean(loss))
            downsample_ratio //= 2  # each following output uses half the ratio
        loss = sum(losses)
        optimizer = optimizer_sgd_setting()
        optimizer.minimize(loss)
        return loss
# Load persisted parameters
def load_pretrained_params(exe, program):
    """
    Load parameters into ``program`` from a previous run or a pretrained model.

    A previous training result in ``save_model_dir`` takes precedence when
    ``continue_train`` is set; otherwise variables found in
    ``pretrained_model_dir`` are loaded when ``pretrained`` is set. Nothing is
    loaded when neither applies.

    :param exe: executor used for loading
    :param program: program whose variables are filled
    """
    if train_params['continue_train'] and os.path.exists(train_params['save_model_dir']):
        logger.info('load param from retrain model')
        fluid.io.load_persistables(executor=exe,
                                   dirname=train_params['save_model_dir'],
                                   main_program=program)
    elif train_params['pretrained'] and os.path.exists(train_params['pretrained_model_dir']):
        logger.info('load param from pretrained model')

        def if_exist(var):
            # load only variables that have a file in the pretrained directory
            return os.path.exists(os.path.join(train_params['pretrained_model_dir'], var.name))

        fluid.io.load_vars(exe, train_params['pretrained_model_dir'], main_program=program,
                           predicate=if_exist)


# Run training
def train():
    """
    Train the model for ``num_epochs`` passes and persist the parameters.

    Parameters are saved whenever the mean loss of a pass improves and once
    more after the last pass.
    NOTE(review): the final save writes to the same directory and therefore
    replaces the best-pass checkpoint - confirm this is intended.
    """
    init_log_config()
    init_train_parameters()

    logger.info("start train YOLOv3, train params:%s", str(train_params))
    logger.info("create place, use gpu:" + str(train_params['use_gpu']))

    place = fluid.CUDAPlace(0) if train_params['use_gpu'] else fluid.CPUPlace()

    logger.info("build network and program")
    train_program = fluid.Program()
    start_program = fluid.Program()
    feeder, reader, loss = build_program_with_feeder(train_program, start_program, place)

    logger.info("build executor and init params")

    exe = fluid.Executor(place)
    exe.run(start_program)
    train_fetch_list = [loss.name]
    load_pretrained_params(exe, train_program)  # load model and parameters

    current_best_loss = 10000000000.0

    # training loop
    for pass_id in range(train_params["num_epochs"]):
        logger.info("current pass: {}, start read image".format(pass_id))
        batch_count = 0
        total_loss = 0.0

        for batch_count, data in enumerate(reader(), 1):
            t1 = time.time()

            fetched = exe.run(train_program,
                              feed=feeder.feed(data),
                              fetch_list=train_fetch_list)  # one training step

            period = time.time() - t1
            batch_loss = np.mean(np.array(fetched))
            total_loss += batch_loss

            if batch_count % 10 == 0:  # log frequency
                logger.info(
                    "pass {}, trainbatch {}, loss {} time {}".format(pass_id, batch_count, batch_loss,
                                                                     "%2.2f sec" % period))

        if batch_count == 0:
            # An empty reader previously ended in ZeroDivisionError below.
            logger.warning("pass {} produced no batches, skip loss evaluation".format(pass_id))
            continue

        pass_mean_loss = total_loss / batch_count
        logger.info("pass {0} train result, current pass mean loss: {1}".format(pass_id, pass_mean_loss))

        # Save after every improving pass; a finer-grained strategy is possible.
        if pass_mean_loss < current_best_loss:
            logger.info("temp save {} epcho train result, current best pass loss {}".format(pass_id, pass_mean_loss))
            fluid.io.save_persistables(dirname=train_params['save_model_dir'], main_program=train_program,
                                       executor=exe)
            current_best_loss = pass_mean_loss

    logger.info("training till last epcho, end training")
    fluid.io.save_persistables(dirname=train_params['save_model_dir'], main_program=train_program, executor=exe)
if __name__ == '__main__':
    train()



# Freeze and save the model for inference
import paddle
import paddle.fluid as fluid
import codecs

# Runs at import time: fills class count / label dictionaries in train_params.
init_train_parameters()


def freeze_model():
    """
    Rebuild the network with box decoding and NMS, load the trained parameters
    from ``save_model_dir`` and save an inference model to ``freeze_dir``.

    The graph is built in the default main program and executed on CPU. The
    saved model takes two feeds, ``image`` and ``image_shape``.
    """
    exe = fluid.Executor(fluid.CPUPlace())

    ues_tiny = train_params['use_tiny']
    yolo_config = train_params['yolo_tiny_cfg'] if ues_tiny else train_params['yolo_cfg']
    path = train_params['save_model_dir']

    model = get_yolo(ues_tiny, train_params['class_dim'],
                     yolo_config['anchors'], yolo_config['anchor_mask'])
    image = fluid.layers.data(name='image', shape=yolo_config['input_size'], dtype='float32')
    image_shape = fluid.layers.data(name="image_shape", shape=[2], dtype='int32')

    boxes = []
    scores = []
    outputs = model.net(image)
    # read after net() so that it reflects all down_sample calls
    downsample_ratio = model.get_downsample_ratio()

    for i, out in enumerate(outputs):
        box, score = fluid.layers.yolo_box(x=out,
                                           img_size=image_shape,
                                           anchors=model.get_yolo_anchors()[i],
                                           class_num=model.get_class_num(),
                                           conf_thresh=train_params['valid_thresh'],
                                           downsample_ratio=downsample_ratio,
                                           name="yolo_box_" + str(i))
        boxes.append(box)
        scores.append(fluid.layers.transpose(score, perm=[0, 2, 1]))
        downsample_ratio //= 2  # each following output uses half the ratio

    pred = fluid.layers.multiclass_nms(bboxes=fluid.layers.concat(boxes, axis=1),
                                       scores=fluid.layers.concat(scores, axis=2),
                                       score_threshold=train_params['valid_thresh'],
                                       nms_top_k=train_params['nms_top_k'],
                                       keep_top_k=train_params['nms_pos_k'],
                                       nms_threshold=train_params['nms_thresh'],
                                       background_label=-1,
                                       name="multiclass_nms")

    freeze_program = fluid.default_main_program()

    # parameters are loaded before the program is cloned for testing
    fluid.io.load_persistables(exe, path, freeze_program)
    freeze_program = freeze_program.clone(for_test=True)
    print("freeze out: {0}, pred layout: {1}".format(train_params['freeze_dir'], pred))
    # save the model
    fluid.io.save_inference_model(train_params['freeze_dir'],
                                  ['image', 'image_shape'],
                                  pred, exe, freeze_program)
    print("freeze end")
if __name__ == '__main__':
    freeze_model()


# Inference
import codecs
import sys
import numpy as np
import time
import paddle
import paddle.fluid as fluid
import math
import functools

from IPython.display import display
from PIL import Image
from PIL import ImageFont
from PIL import ImageDraw
from collections import namedtuple

# Runs at import time: fills class count / label dictionaries in train_params.
init_train_parameters()
ues_tiny = train_params['use_tiny']
yolo_config = train_params['yolo_tiny_cfg'] if ues_tiny else train_params['yolo_cfg']

target_size = yolo_config['input_size']
anchors = yolo_config['anchors']
anchor_mask = yolo_config['anchor_mask']
label_dict = train_params['num_dict']
class_dim = train_params['class_dim']
print("label_dict:{} class dim:{}".format(label_dict, class_dim))

place = fluid.CUDAPlace(0) if train_params['use_gpu'] else fluid.CPUPlace()
exe = fluid.Executor(place)

path = train_params['freeze_dir']
[inference_program, feed_target_names, fetch_targets] = fluid.io.load_inference_model(dirname=path, executor=exe)


# Draw the bounding rectangles onto the image
def draw_bbox_image(img, boxes, labels, save_name):
    """
    Draw every box with its class name on ``img``, save it and display it.

    :param img: PIL image; it is drawn on in place
    :param boxes: iterable of [xmin, ymin, xmax, ymax]
    :param labels: iterable of class indices, looked up in ``label_dict``
    :param save_name: path the annotated image is written to
    """
    draw = ImageDraw.Draw(img)
    for box, label in zip(boxes, labels):
        xmin, ymin, xmax, ymax = box[0], box[1], box[2], box[3]
        draw.rectangle((xmin, ymin, xmax, ymax), None, 'red')  # rectangle outline
        draw.text((xmin, ymin), label_dict[int(label)], (255, 255, 0))  # class name
    img.save(save_name)
    display(img)


def resize_img(img, target_size):
    """
    Resize the image to the network input size (the aspect ratio is NOT kept).

    :param img: PIL image
    :param target_size: indexable as [channels, height, width]
    :return: the resized PIL image
    """
    # PIL expects a (width, height) tuple; target_size is [C, H, W].
    img = img.resize((target_size[2], target_size[1]), Image.BILINEAR)
    return img


def read_image(img_path):
    """
    Read an image and prepare the network input.

    :param img_path: path of the image file
    :return: ``(origin, img, resized_img)`` - the original image in RGB, the
             normalised float32 array with a leading batch axis, and the
             resized PIL image
    """
    origin = Image.open(img_path)
    # Convert first so that the original (later drawn on and saved as JPEG)
    # and the network input are both RGB, whatever the file's mode is.
    if origin.mode != 'RGB':
        origin = origin.convert('RGB')
    resized_img = resize_img(origin, target_size)
    img = np.array(resized_img).astype('float32').transpose((2, 0, 1))  # HWC to CHW
    img -= 127.5
    img *= 0.007843
    img = img[np.newaxis, :]
    return origin, img, resized_img
def infer(image_path):
    """
    Run detection on one image and save the annotated result next to it.

    The result is written to ``<image path without extension>-result.jpg``.
    Nothing is saved when the prediction does not have six columns, which is
    treated as "no object found".

    :param image_path: path of the image file
    """
    import os  # local import: used only to build the output path

    origin, tensor_img, resized_img = read_image(image_path)
    input_w, input_h = origin.size[0], origin.size[1]
    image_shape = np.array([input_h, input_w], dtype='int32')

    t1 = time.time()
    # run the prediction
    batch_outputs = exe.run(inference_program,
                            feed={feed_target_names[0]: tensor_img,
                                  feed_target_names[1]: image_shape[np.newaxis, :]},
                            fetch_list=fetch_targets,
                            return_numpy=False)
    period = time.time() - t1
    print("predict cost time:{0}".format("%2.2f sec" % period))
    bboxes = np.array(batch_outputs[0])  # prediction result

    if bboxes.shape[1] != 6:
        print("No object found in {}".format(image_path))
        return
    # column layout used below: label | score | four box values
    labels = bboxes[:, 0].astype('int32')  # class
    boxes = bboxes[:, 2:].astype('float32')  # box

    # splitext copes with paths that have no extension or a dot in a directory
    # name; slicing at rfind('.') mangled such paths.
    out_path = os.path.splitext(image_path)[0] + '-result.jpg'
    draw_bbox_image(origin, boxes, labels, out_path)


if __name__ == '__main__':
    # image_name = sys.argv[1]
    # image_path = image_name
    image_path = "data/data6045/lslm_test/23.jpg"
    infer(image_path)
https://raw.githubusercontent.com/imajiayu/object_detection_based-on-yolov3/de6ceb99541ed9544d593675872fac3c35f18deb/output/classes/0.0.png -------------------------------------------------------------------------------- /output/classes/1.0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imajiayu/object_detection_based-on-yolov3/de6ceb99541ed9544d593675872fac3c35f18deb/output/classes/1.0.png -------------------------------------------------------------------------------- /output/classes/2.0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imajiayu/object_detection_based-on-yolov3/de6ceb99541ed9544d593675872fac3c35f18deb/output/classes/2.0.png -------------------------------------------------------------------------------- /output/detection-results-info.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imajiayu/object_detection_based-on-yolov3/de6ceb99541ed9544d593675872fac3c35f18deb/output/detection-results-info.png -------------------------------------------------------------------------------- /output/ground-truth-info.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imajiayu/object_detection_based-on-yolov3/de6ceb99541ed9544d593675872fac3c35f18deb/output/ground-truth-info.png -------------------------------------------------------------------------------- /output/lamr.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/imajiayu/object_detection_based-on-yolov3/de6ceb99541ed9544d593675872fac3c35f18deb/output/lamr.png -------------------------------------------------------------------------------- /output/mAP.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/imajiayu/object_detection_based-on-yolov3/de6ceb99541ed9544d593675872fac3c35f18deb/output/mAP.png -------------------------------------------------------------------------------- /output/output.txt: -------------------------------------------------------------------------------- 1 | # AP and precision/recall per class 2 | 47.18% = 0.0 AP 3 | Precision: ['1.00', '1.00', '1.00', '0.75', '0.60', '0.67', '0.71', '0.75', '0.78', '0.80', '0.82', '0.83', '0.85', '0.86', '0.87', '0.81', '0.76', '0.78', '0.79', '0.80', '0.81', '0.77', '0.78', '0.79', '0.76', '0.73', '0.70', '0.68', '0.69', '0.70', '0.68', '0.69', '0.67', '0.65', '0.66', '0.64', '0.62', '0.61', '0.62', '0.60', '0.61', '0.62', '0.60', '0.59', '0.58', '0.57', '0.55', '0.54', '0.53', '0.52', '0.51', '0.50', '0.49', '0.48', '0.47', '0.46', '0.46', '0.45', '0.44', '0.43', '0.43', '0.42', '0.41', '0.41', '0.40', '0.39', '0.39', '0.38', '0.38', '0.37', '0.37', '0.36', '0.36', '0.35', '0.35', '0.34', '0.34', '0.33', '0.33', '0.33', '0.32', '0.32', '0.31', '0.31', '0.31', '0.31', '0.31', '0.31', '0.30', '0.30', '0.30', '0.29', '0.29', '0.30', '0.29', '0.29', '0.29', '0.29', '0.28', '0.28', '0.28', '0.27', '0.27', '0.27', '0.27', '0.26'] 4 | Recall :['0.02', '0.04', '0.07', '0.07', '0.07', '0.09', '0.11', '0.13', '0.15', '0.17', '0.20', '0.22', '0.24', '0.26', '0.28', '0.28', '0.28', '0.30', '0.33', '0.35', '0.37', '0.37', '0.39', '0.41', '0.41', '0.41', '0.41', '0.41', '0.43', '0.46', '0.46', '0.48', '0.48', '0.48', '0.50', '0.50', '0.50', '0.50', '0.52', '0.52', '0.54', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.57', '0.59', '0.59', '0.59', '0.59', '0.59', '0.59', '0.59', '0.59', 
'0.61', '0.61', '0.61', '0.61', '0.61', '0.61', '0.61', '0.61', '0.61', '0.61', '0.61', '0.61', '0.61'] 5 | 6 | 80.87% = 1.0 AP 7 | Precision: ['1.00', '1.00', '1.00', '1.00', '1.00', '1.00', '1.00', '1.00', '1.00', '1.00', '1.00', '1.00', '1.00', '0.93', '0.87', '0.88', '0.82', '0.78', '0.74', '0.70', '0.67', '0.68', '0.65', '0.62', '0.60', '0.58', '0.56', '0.54', '0.52', '0.50', '0.48', '0.47', '0.45', '0.44', '0.43', '0.42', '0.41', '0.39', '0.38', '0.38'] 8 | Recall :['0.06', '0.11', '0.17', '0.22', '0.28', '0.33', '0.39', '0.44', '0.50', '0.56', '0.61', '0.67', '0.72', '0.72', '0.72', '0.78', '0.78', '0.78', '0.78', '0.78', '0.78', '0.83', '0.83', '0.83', '0.83', '0.83', '0.83', '0.83', '0.83', '0.83', '0.83', '0.83', '0.83', '0.83', '0.83', '0.83', '0.83', '0.83', '0.83', '0.83'] 9 | 10 | 64.01% = 2.0 AP 11 | Precision: ['1.00', '1.00', '1.00', '1.00', '0.80', '0.67', '0.71', '0.75', '0.67', '0.70', '0.73', '0.75', '0.69', '0.71', '0.73', '0.69', '0.71', '0.67', '0.68', '0.70', '0.67', '0.64', '0.65', '0.67', '0.68', '0.65', '0.63', '0.61', '0.59', '0.57', '0.55', '0.53', '0.52', '0.50', '0.51', '0.50', '0.51', '0.53', '0.51', '0.50', '0.49', '0.48', '0.47', '0.45', '0.44', '0.43', '0.43', '0.42', '0.41', '0.42', '0.41', '0.40', '0.40', '0.39', '0.38', '0.38', '0.37', '0.36', '0.36', '0.35', '0.34', '0.34', '0.33', '0.33', '0.32', '0.32', '0.31', '0.31', '0.30', '0.30', '0.30', '0.29', '0.29', '0.28', '0.28', '0.28', '0.27', '0.27', '0.27', '0.26', '0.26', '0.26', '0.25', '0.25', '0.25', '0.24', '0.24', '0.24', '0.24', '0.23', '0.23', '0.23', '0.23'] 12 | Recall :['0.04', '0.08', '0.12', '0.17', '0.17', '0.17', '0.21', '0.25', '0.25', '0.29', '0.33', '0.38', '0.38', '0.42', '0.46', '0.46', '0.50', '0.50', '0.54', '0.58', '0.58', '0.58', '0.62', '0.67', '0.71', '0.71', '0.71', '0.71', '0.71', '0.71', '0.71', '0.71', '0.71', '0.71', '0.75', '0.75', '0.79', '0.83', '0.83', '0.83', '0.83', '0.83', '0.83', '0.83', '0.83', '0.83', '0.83', '0.83', '0.83', '0.88', 
'0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88', '0.88'] 13 | 14 | 15 | # mAP of all classes 16 | mAP = 64.02% 17 | 18 | # Number of ground-truth objects per class 19 | 0.0: 46 20 | 1.0: 18 21 | 2.0: 24 22 | 23 | # Number of detected objects per class 24 | 0.0: 106 (tp:28, fp:78) 25 | 1.0: 40 (tp:15, fp:25) 26 | 2.0: 93 (tp:21, fp:72) 27 | --------------------------------------------------------------------------------