# Repository dump header (kept for reference): tree = DSOD.py, README.md; this file = /DSOD.py
import os
import gc
import xml.etree.ElementTree as etxml
import math
import random
import skimage.io
import skimage.transform
import numpy as np
import tensorflow as tf
import tensorlayer as tl
from tensorlayer.layers import *
from tensorflow.python.ops import variables
import time
from imutils.object_detection import non_max_suppression
import imutils
import cv2
import matplotlib.pyplot as plt

batch_size = 16
# Step counter used by the training script; it starts at 5000 rather than 0.
running_count = 5000
# Candidate training images; listed once at import time.
file_name_list = os.listdir('./train_datasets/voc2012/JPEGImages/')
# Class names; the position in this list is the numeric class id.
lable_arr = ['background', 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', 'chair',
             'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person', 'pottedplant', 'sheep', 'sofa',
             'train', 'tvmonitor']
img_size = [300, 300]
# Total number of classes
classes_size = 21
# Class id used for background
background_classes_val = 0
# Number of default boxes per feature-map cell, one entry per prediction layer
default_box_size = [6, 6, 6, 6, 6, 6]
# Aspect ratios of the default boxes, one list per prediction layer
box_aspect_ratio = [
    [0.5, 1.0, 2.0, 3.0, 1 / 3.0],
    [0.5, 1.0, 2.0, 3.0, 1 / 3.0],
    [0.5, 1.0, 2.0, 3.0, 1 / 3.0],
    [0.5, 1.0, 2.0, 3.0, 1 / 3.0],
    [0.5, 1.0, 2.0, 3.0, 1 / 3.0],
    [0.5, 1.0, 2.0, 3.0, 1 / 3.0],
]
# Smallest default-box scale (the original comment calls this an area ratio)
min_box_scale = 0.1
# Largest default-box scale
max_box_scale = 0.9
# One scale per prediction layer, evenly spaced; equivalent to the paper's
# s_k = s_min + (s_max - s_min) * (k - 1) / (m - 1).
# The table is indexed by layer, so it needs one entry per layer
# (len(default_box_size)), not max(default_box_size) as before.
default_box_scale = np.linspace(min_box_scale, max_box_scale, num=len(default_box_size))
print('## default_box_scale:' + str(default_box_scale))
# Convolution strides
conv_strides_1 = [1, 1, 1, 1]
conv_strides_2 = [1, 2, 2, 1]
conv_strides_3 = [1, 3, 3, 1]

tl_strides_1 = (1, 1)
tl_strides_2 = (2, 2)
tl_strides_3 = (3, 3)
# Pooling window
pool_size = [1, 2, 2, 1]
tl_pool_size = (2, 2)
# Pooling strides
pool_strides = [1, 2, 2, 1]
tl_pool_strides = (2, 2)
# `decay` argument passed to every BatchNormLayer
conv_bn_decay = 0.9999
# `variance_epsilon` value for Batch Normalization
conv_bn_epsilon = 0.001
# Jaccard (IoU) threshold at or above which a default box becomes a positive
jaccard_value = 0.55
# The three globals below are filled in by inference().
feature_maps_shape = []
all_default_boxs_len = 0
all_default_boxs = []

jitter = 0.2


def _split_annotation(actual_item):
    """Split [cx, cy, w, h, label] rows into a label list and a box list."""
    clas = [row[4] for row in actual_item]
    coords = [[row[0], row[1], row[2], row[3]] for row in actual_item]
    return clas, coords


def _pack_boxes(clas, coords):
    """Join parallel label/box lists back into [cx, cy, w, h, label] rows."""
    return [[box[0], box[1], box[2], box[3], label] for label, box in zip(clas, coords)]


def _augment_strong(img, actual_item):
    """Apply the aggressive augmentation chain; return (image, labels, boxes).

    Chain: random resize -> zoom -> horizontal flip -> 300x300 crop ->
    illumination -> hue shift -> scaling of pixel values by /127.5 - 1.
    The returned label list may be empty when every box was filtered out.
    """
    clas, coords = _split_annotation(actual_item)
    tmp0 = random.randint(-30, 50)
    tmp1 = random.randint(-30, 50)
    scale = np.max((400 / float(img.shape[1]),
                    400 / float(img.shape[0])))
    im, coords = tl.prepro.obj_box_imresize(
        img.copy(), coords,
        [int(img.shape[0] * scale) + tmp0, int(img.shape[1] * scale) + tmp1],
        is_rescale=True, interp='bicubic')

    # Up to 7 zoom-in attempts that must keep at least one box; if all fail,
    # apply one milder zoom whose result is accepted unconditionally.
    # NOTE(review): the source's indentation is lost; the fallback is assumed
    # to sit inside the retry loop on its last iteration (== for/else).
    for _ in range(7):
        imt, clast, coordst = tl.prepro.obj_box_zoom(
            im, clas, coords, zoom_range=(1.0, 2.2), fill_mode='nearest',
            order=1, is_rescale=True, is_center=True, is_random=True,
            thresh_wh=0.04, thresh_wh2=8.0)
        if len(clast) > 0:
            im, clas, coords = imt, clast, coordst
            break
    else:
        im, clas, coords = tl.prepro.obj_box_zoom(
            im, clas, coords, zoom_range=(0.7, 1.2), fill_mode='nearest',
            order=1, is_rescale=True, is_center=True, is_random=True,
            thresh_wh=0.05, thresh_wh2=8.0)

    im, coords = tl.prepro.obj_box_left_right_flip(
        im, coords, is_rescale=True, is_center=True, is_random=True)

    # Same retry pattern for the random 300x300 crop (8 attempts + fallback).
    for _ in range(8):
        imt, clast, coordst = tl.prepro.obj_box_crop(
            im, clas, coords, wrg=300, hrg=300,
            is_rescale=True, is_center=True, is_random=True,
            thresh_wh=0.07, thresh_wh2=7.0)
        if len(clast) > 0:
            im, clas, coords = imt, clast, coordst
            break
    else:
        im, clas, coords = tl.prepro.obj_box_crop(
            im, clas, coords, wrg=300, hrg=300,
            is_rescale=True, is_center=True, is_random=True,
            thresh_wh=0.07, thresh_wh2=8.0)

    im = tl.prepro.illumination(im, gamma=(0.2, 1.2), contrast=(0.2, 1.2),
                                saturation=(0.2, 1.2), is_random=True)
    im = tl.prepro.adjust_hue(im, hout=0.1, is_offset=True, is_clip=True, is_random=True)
    im = im / 127.5 - 1.
    return im, clas, coords


def _augment_mild(img, actual_item):
    """Apply the gentle fallback augmentation; return (image, labels, boxes).

    Chain: resize to roughly 300x300 -> horizontal flip -> 300x300 crop ->
    mild illumination -> pixel value scaling -> /127.5 - 1.
    """
    clas, coords = _split_annotation(actual_item)
    tmp0 = random.randint(1, 30)
    tmp1 = random.randint(1, 30)
    im, coords = tl.prepro.obj_box_imresize(
        img.copy(), coords, [300 + tmp0, 300 + tmp1],
        is_rescale=True, interp='bicubic')
    im, coords = tl.prepro.obj_box_left_right_flip(
        im, coords, is_rescale=True, is_center=True, is_random=True)
    im, clas, coords = tl.prepro.obj_box_crop(
        im, clas, coords, wrg=300, hrg=300,
        is_rescale=True, is_center=True, is_random=True,
        thresh_wh=0.02, thresh_wh2=10.0)
    im = tl.prepro.illumination(im, gamma=(0.8, 1.2), contrast=(0.8, 1.2),
                                saturation=(0.8, 1.2), is_random=True)
    im = tl.prepro.pixel_value_scale(im, 0.1, [0, 255], is_random=True)
    im = im / 127.5 - 1.
    return im, clas, coords


def get_traindata_voc(batch_size):
    """Sample and augment a random training batch from the VOC2012 folder.

    Args:
        batch_size: number of file names drawn from ``file_name_list``.

    Returns:
        (train_data, actual_data, file_list): the augmented images, the
        matching lists of [cx, cy, w, h, label] rows (coordinates as fractions
        of the image size) and the sampled file names. The first two can be
        shorter than ``file_list`` because unusable samples are skipped.
    """

    def get_actual_data_from_xml(xml_path):
        """Parse one VOC annotation; return its rows, or None if unusable."""
        actual_item = []
        try:
            annotation_node = etxml.parse(xml_path).getroot()
            size_node = annotation_node.find('size')
            img_width = float(size_node.find('width').text.strip())
            img_height = float(size_node.find('height').text.strip())
            for obj_node in annotation_node.findall('object'):
                lable = lable_arr.index(obj_node.find('name').text.strip())
                bndbox = obj_node.find('bndbox')
                x_min = float(bndbox.find('xmin').text.strip())
                y_min = float(bndbox.find('ymin').text.strip())
                x_max = float(bndbox.find('xmax').text.strip())
                y_max = float(bndbox.find('ymax').text.strip())
                # Positions are stored as fractions: [center_x, center_y, width, height, lable]
                actual_item.append([((x_min + x_max) / 2 / img_width), ((y_min + y_max) / 2 / img_height),
                                    ((x_max - x_min) / img_width), ((y_max - y_min) / img_height), lable])
            return actual_item
        except (etxml.ParseError, OSError, AttributeError, ValueError, ZeroDivisionError):
            # Missing/unreadable file, malformed XML, missing node, unknown
            # class name or zero image size: the caller skips this sample.
            return None

    train_data = []
    actual_data = []
    file_list = random.sample(file_name_list, batch_size)
    for f_name in file_list:
        stem, ext = os.path.splitext(f_name)
        if ext.lower() != '.jpg':
            continue
        img_path = './train_datasets/voc2012/JPEGImages/' + f_name
        # Build the annotation name from the stem so an upper-case ".JPG"
        # still maps to its ".xml" file.
        xml_path = './train_datasets/voc2012/Annotations/' + stem + '.xml'
        actual_item = get_actual_data_from_xml(xml_path)
        if actual_item is None:
            print('Error : ' + xml_path)
            continue
        # Decode the image only once the annotation is known to be usable.
        img = skimage.io.imread(img_path)

        # Four attempts with the strong chain, then one mild attempt.
        # NOTE(review): with the indentation lost it is unclear whether the
        # original kept looping when even the mild attempt left no boxes; that
        # reading can spin forever on an image whose boxes are always filtered
        # out, so such a sample is skipped here.
        for _ in range(4):
            im, clas, coords = _augment_strong(img, actual_item)
            if len(clas) > 0:
                break
        else:
            im, clas, coords = _augment_mild(img, actual_item)
        if len(clas) != 0:
            actual_data.append(_pack_boxes(clas, coords))
            train_data.append(im)
    return train_data, actual_data, file_list


def generate_groundtruth_data(input_actual_data):
    """Match ground-truth boxes to the default boxes and build training targets.

    Args:
        input_actual_data: per image, a list of [cx, cy, w, h, label] rows.

    Returns:
        (gt_class, gt_location, gt_positives, gt_negatives), arrays whose first
        two axes are (image, default box); gt_location has a trailing axis of
        4 encoded offsets, the two masks hold 0/1 flags.
    """
    input_actual_data_len = len(input_actual_data)
    gt_class = np.zeros((input_actual_data_len, all_default_boxs_len))
    gt_location = np.zeros((input_actual_data_len, all_default_boxs_len, 4))
    gt_positives_jacc = np.zeros((input_actual_data_len, all_default_boxs_len))
    gt_positives = np.zeros((input_actual_data_len, all_default_boxs_len))
    gt_negatives = np.zeros((input_actual_data_len, all_default_boxs_len))
    background_jacc = max(0, (jaccard_value - 0.2))

    for img_index in range(input_actual_data_len):
        # Positive examples.
        for pre_actual in input_actual_data[img_index]:
            gt_class_val = pre_actual[-1]
            # Out-of-range labels are mapped to class 0, as before.
            if gt_class_val >= classes_size or gt_class_val < 0:
                gt_class_val = 0
            gt_box_val = pre_actual[:-1]
            for boxe_index in range(all_default_boxs_len):
                jacc, gt_box_val_loc = jaccard(gt_box_val, all_default_boxs[boxe_index])
                # Keep the best-overlapping ground truth per default box; the
                # previous code let a later, worse match overwrite a better one.
                if jacc >= jaccard_value and jacc > gt_positives_jacc[img_index][boxe_index]:
                    gt_class[img_index][boxe_index] = gt_class_val
                    gt_location[img_index][boxe_index] = gt_box_val_loc
                    gt_positives_jacc[img_index][boxe_index] = jacc
                    gt_positives[img_index][boxe_index] = 1
                    gt_negatives[img_index][boxe_index] = 0

        # No positive at all: fabricate one random background positive so the
        # per-image loss never divides by zero (NaN guard).
        if np.sum(gt_positives[img_index]) == 0:
            random_pos_index = np.random.randint(low=0, high=all_default_boxs_len, size=1)[0]
            gt_class[img_index][random_pos_index] = background_classes_val
            gt_location[img_index][random_pos_index] = [0.00001, 0.00001, 0.00001, 0.00001]
            gt_positives_jacc[img_index][random_pos_index] = jaccard_value
            gt_positives[img_index][random_pos_index] = 1
            gt_negatives[img_index][random_pos_index] = 0

        # Draw about three negative candidates per positive.
        positive_count = int(np.sum(gt_positives[img_index]))
        gt_neg_end_count = positive_count * 3
        if (gt_neg_end_count + positive_count) > all_default_boxs_len:
            # Must stay an int: np.random.randint rejects a float `size`.
            gt_neg_end_count = all_default_boxs_len - positive_count
        gt_neg_index = np.random.randint(low=0, high=all_default_boxs_len, size=gt_neg_end_count)
        for r_index in gt_neg_index:
            if gt_positives_jacc[img_index][r_index] < background_jacc and gt_positives[img_index][r_index] != 1:
                gt_class[img_index][r_index] = background_classes_val
                gt_positives[img_index][r_index] = 0
                gt_negatives[img_index][r_index] = 1

    gt_class = check_numerics(gt_class, 'gt_class')
    gt_location = check_numerics(gt_location, 'gt_location')
    gt_positives = check_numerics(gt_positives, 'gt_positives')
    gt_negatives = check_numerics(gt_negatives, 'gt_negatives')
    return gt_class, gt_location, gt_positives, gt_negatives


def jaccard(rect1, rect2):
    """Return (IoU, encoded offsets) of rect1 relative to rect2.

    Both rectangles are [center_x, center_y, width, height] in image fractions.
    Parts lying outside the unit square are discarded before areas AND the
    intersection are measured (the old code clipped only the areas, so two
    identical boxes straddling the border scored 0).

    Returns:
        (iou, [dx, dy, dw, dh]) with dx, dy the centre offsets divided by
        rect2's width/height and dw, dh the log size ratios; when the boxes do
        not overlap the result is (0, [0.00001] * 4).
    """

    def clipped_edges(rect):
        # left, top, right, bottom limited to the [0, 1] image square
        return (max(0.0, rect[0] - rect[2] / 2), max(0.0, rect[1] - rect[3] / 2),
                min(1.0, rect[0] + rect[2] / 2), min(1.0, rect[1] + rect[3] / 2))

    left1, top1, right1, bottom1 = clipped_edges(rect1)
    left2, top2, right2, bottom2 = clipped_edges(rect2)
    x_overlap = max(0.0, min(right1, right2) - max(left1, left2))
    y_overlap = max(0.0, min(bottom1, bottom2) - max(top1, top2))
    intersection = x_overlap * y_overlap
    area_box_a = max(0.0, right1 - left1) * max(0.0, bottom1 - top1)
    area_box_b = max(0.0, right2 - left2) * max(0.0, bottom2 - top2)
    union = area_box_a + area_box_b - intersection
    if intersection > 0 and union > 0:
        # The offsets deliberately use the raw, unclipped boxes.
        return intersection / union, [(rect1[0] - rect2[0]) / rect2[2],
                                      (rect1[1] - rect2[1]) / rect2[3],
                                      math.log(rect1[2] / rect2[2]),
                                      math.log(rect1[3] / rect2[3])]
    return 0, [0.00001, 0.00001, 0.00001, 0.00001]


def denseblock(input, blocknum=1, step=48, firstchannel=192, is_train=True, name='denseblock', reuse=None):
    """Stack `blocknum` BN-ReLU-1x1conv / BN-ReLU-3x3conv units with dense concatenation.

    Each unit's `step`-channel output is concatenated onto the running tensor.
    Layer names are kept exactly as they were (including the scope name
    repeated inside the second BN name) so existing checkpoints still load.
    """
    with tf.variable_scope(name, reuse=reuse):
        tl.layers.set_name_reuse(reuse)
        nettemp = LambdaLayer(input, lambda x: tf.identity(x), name="INPUTS")
        for idx in range(blocknum):
            suffix = str(idx)
            netbn = BatchNormLayer(nettemp, is_train=is_train, decay=conv_bn_decay, act=tf.nn.relu,
                                   name='bn/' + suffix)
            net = Conv2d(netbn, firstchannel, (1, 1), (1, 1), padding='SAME', name='neta/' + suffix)
            netbn = BatchNormLayer(net, is_train=is_train, decay=conv_bn_decay, act=tf.nn.relu,
                                   name=name + 'bn2/' + suffix)
            net = Conv2d(netbn, step, (3, 3), (1, 1), padding='SAME', name='netb/' + suffix)
            nettemp = ConcatLayer([nettemp, net], -1, name='concattemp/' + suffix)
    return nettemp


def _dense_transition(input, step, firstchannel, is_train, name, reuse,
                      pool_window, pool_step, conv_step, pad):
    """Two-branch down-sampling unit shared by denseblockpl and denseblockfin.

    Branch A: max-pool -> BN-ReLU -> 1x1 conv (`firstchannel` channels).
    Branch B: BN-ReLU -> 1x1 conv -> BN-ReLU -> 3x3 conv (`step` channels).
    The result is concat([B, A]) on the last axis.
    """
    with tf.variable_scope(name, reuse=reuse):
        tl.layers.set_name_reuse(reuse)
        input = LambdaLayer(input, lambda x: tf.identity(x), name="INPUTS")
        netbn2 = MaxPool2d(input, pool_window, pool_step, padding=pad, name='bnpool2')
        netbn2 = BatchNormLayer(netbn2, is_train=is_train, decay=conv_bn_decay, act=tf.nn.relu,
                                name=name + 'bn2pl')
        netbn2 = Conv2d(netbn2, firstchannel, (1, 1), (1, 1), padding='SAME', name='bnconv2')
        netbn = BatchNormLayer(input, is_train=is_train, decay=conv_bn_decay, act=tf.nn.relu, name='bn')
        net = Conv2d(netbn, firstchannel, (1, 1), (1, 1), padding='SAME', name='neta')
        netbn = BatchNormLayer(net, is_train=is_train, decay=conv_bn_decay, act=tf.nn.relu, name='bn2')
        net = Conv2d(netbn, step, (3, 3), conv_step, padding=pad, name='netb')
        nettemp = ConcatLayer([net, netbn2], -1, name='concat')
    return nettemp


def denseblockpl(input, step=256, firstchannel=256, is_train=True, name='densepl', reuse=None):
    """Down-sampling unit: 2x2/2 max-pool branch plus 3x3 stride-2 conv branch, SAME padding."""
    return _dense_transition(input, step, firstchannel, is_train, name, reuse,
                             pool_window=(2, 2), pool_step=(2, 2), conv_step=(2, 2), pad='SAME')


def denseblockfin(input, step=256, firstchannel=256, is_train=True, name='densepl', reuse=None):
    """Final unit: 3x3/1 max-pool branch plus 3x3 stride-1 conv branch, VALID padding."""
    return _dense_transition(input, step, firstchannel, is_train, name, reuse,
                             pool_window=(3, 3), pool_step=(1, 1), conv_step=(1, 1), pad='VALID')


def inference(inputs, is_train, reuse):
    """Build the detection network and its six prediction heads.

    Args:
        inputs: image tensor fed to the network.
        is_train: forwarded to every BatchNormLayer.
        reuse: forwarded to the variable scopes (None for the first build).

    Returns:
        (feature_class, feature_location, all_default_boxs, all_default_boxs_len).

    Side effects: overwrites the module globals ``feature_maps_shape``,
    ``all_default_boxs`` and ``all_default_boxs_len``.
    """
    global feature_maps_shape, all_default_boxs, all_default_boxs_len
    W_init = tf.contrib.layers.xavier_initializer()

    def bn_relu(layer, name):
        return BatchNormLayer(layer, is_train=is_train, decay=conv_bn_decay, act=tf.nn.relu, name=name)

    def conv(layer, channels, kernel, name, strides=(1, 1)):
        return Conv2d(layer, channels, kernel, strides, padding='SAME', W_init=W_init, name=name)

    def pool(layer, name):
        return MaxPool2d(layer, filter_size=(2, 2), strides=(2, 2), name=name)

    def dense(layer, blocks, name):
        return denseblock(layer, blocknum=blocks, step=48, firstchannel=192, is_train=is_train,
                          name=name, reuse=reuse)

    def transition(block_fn, layer, channels, name):
        return block_fn(layer, step=channels, firstchannel=channels, is_train=is_train,
                        name=name, reuse=reuse)

    # NOTE(review): every layer name below is unchanged (even the ones that do
    # not match their position, e.g. 'denseblock2_cnna') because checkpoints
    # are keyed by these names.
    with tf.variable_scope("model", reuse=reuse):
        tl.layers.set_name_reuse(reuse)
        net = InputLayer(inputs, name='input')
        # Stem
        net = conv(net, 64, (3, 3), 'stem1', strides=(2, 2))
        net = bn_relu(net, 'stem1_bn')
        net = conv(net, 64, (3, 3), 'stem2')
        net = bn_relu(net, 'stem2_bn')
        net = conv(net, 128, (3, 3), 'stem3')
        net = bn_relu(net, 'stem3_bn')
        net = pool(net, 'stem3_pool')
        # Dense blocks with 1x1 transition convolutions
        net = dense(net, 6, 'denseblock0')
        net = bn_relu(net, 'denseblock0_bn')
        net = conv(net, 416, (1, 1), 'denseblock0_cnn')
        net = pool(net, 'denseblock0_pool')
        net = dense(net, 8, 'denseblock1')
        net = bn_relu(net, 'denseblock1_bn')
        net = conv(net, 800, (1, 1), 'denseblock1_cnn')
        netfirst = bn_relu(net, 'feature_first_bn')
        net = pool(net, 'denseblock2_pool1')
        net = dense(net, 8, 'denseblock2')
        net = bn_relu(net, 'denseblock2_bn')
        net = conv(net, 1184, (1, 1), 'denseblock2_cnn')
        net = dense(net, 8, 'denseblock3')
        net = bn_relu(net, 'denseblock3_bn')
        net = conv(net, 256, (1, 1), 'denseblock2_cnna')
        # Second prediction source: backbone output concatenated with a
        # pooled, 1x1-projected copy of the first prediction source.
        netpl = pool(netfirst, 'First_pool')
        netpl = bn_relu(netpl, 'First_bn')
        netpl = conv(netpl, 256, (1, 1), 'denseblock2_cnnb')
        net = ConcatLayer([net, netpl], -1, "Second_Cat")
        netsecond = bn_relu(net, 'feature_second_bn')
        net = transition(denseblockpl, net, 256, 'denseplz1')
        netthird = bn_relu(net, 'feature_third_bn')
        net = transition(denseblockpl, net, 128, 'denseplz2')
        netfourth = bn_relu(net, 'feature_fourth_bn')
        net = transition(denseblockpl, net, 128, 'denseplz3')
        netfifth = bn_relu(net, 'feature_fifth_bn')
        net = transition(denseblockfin, net, 128, 'denseplz4')
        netsixth = bn_relu(net, 'feature_sixth_bn')

        # One 3x3 head per source, emitting (classes + 4) values per default box.
        head_sources = [netfirst, netsecond, netthird, netfourth, netfifth, netsixth]
        head_names = ['firstout', 'secondout', 'thirdout', 'fourthout', 'fifthout', 'sixthout']
        feature_maps = [
            conv(source, default_box_size[i] * (classes_size + 4), (3, 3), head_name).outputs
            for i, (source, head_name) in enumerate(zip(head_sources, head_names))
        ]
        feature_maps_shape = [m.get_shape().as_list() for m in feature_maps]

        tmp_all_feature = []
        for i, fmap in enumerate(feature_maps):
            # Axes 1 and 2 are the two spatial axes of the head output; only
            # their product matters for the reshape.
            rows = feature_maps_shape[i][1]
            cols = feature_maps_shape[i][2]
            tmp_all_feature.append(
                tf.reshape(fmap, [-1, (rows * cols * default_box_size[i]), (classes_size + 4)]))
        tmp_all_feature = tf.concat(tmp_all_feature, axis=1)
        feature_class = tmp_all_feature[:, :, :classes_size]
        feature_location = tmp_all_feature[:, :, classes_size:]
        print('## feature_class shape : ' + str(feature_class.get_shape().as_list()))
        print('## feature_location shape : ' + str(feature_location.get_shape().as_list()))
        # Generate every default box for the shapes recorded above.
        all_default_boxs = generate_all_default_boxs()
        all_default_boxs_len = len(all_default_boxs)
        print('## all default boxs : ' + str(all_default_boxs_len))
    return feature_class, feature_location, all_default_boxs, all_default_boxs_len

def smooth_L1(x):
    """Element-wise smooth-L1: 0.5 * x^2 where |x| <= 1, otherwise |x| - 0.5."""
    return tf.where(tf.less_equal(tf.abs(x), 1.0), tf.multiply(0.5, tf.pow(x, 2.0)), tf.subtract(tf.abs(x), 0.5))


def elloss(feature_class, feature_location, groundtruth_class, groundtruth_location, groundtruth_positives,
           groundtruth_count):
    """Build the detection loss.

    Returns:
        (loss_all, loss_class, loss_location): loss_class and loss_location are
        per-image values; loss_all is the sum over the batch of
        loss_class + 5 * loss_location.

    The location term is averaged over the boxes flagged in
    ``groundtruth_positives`` and the class term over ``groundtruth_count``;
    both denominators must be non-zero for every image.
    """
    softmax_cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=feature_class,
                                                                           labels=groundtruth_class)
    location_error = tf.reduce_sum(smooth_L1(tf.subtract(groundtruth_location, feature_location)), axis=2)
    loss_location = tf.div(
        tf.reduce_sum(tf.multiply(location_error, groundtruth_positives), axis=1),
        tf.reduce_sum(groundtruth_positives, axis=1))
    loss_class = tf.div(
        tf.reduce_sum(tf.multiply(softmax_cross_entropy, groundtruth_count), axis=1),
        tf.reduce_sum(groundtruth_count, axis=1))
    loss_all = tf.reduce_sum(tf.add(loss_class, loss_location * 5))
    return loss_all, loss_class, loss_location


def generate_all_default_boxs():
    """Return every default box as an array of [center_x, center_y, width, height] rows.

    For each cell of each recorded feature map this emits one box per aspect
    ratio in ``box_aspect_ratio`` plus one extra, larger box.
    """
    all_default_boxes = []
    for index, map_shape in enumerate(feature_maps_shape):
        width = int(map_shape[1])
        height = int(map_shape[2])
        cell_scale = default_box_scale[index]
        # NOTE(review): map_shape[1] is the first spatial axis, which for the
        # NHWC layout used elsewhere is the row axis, yet it drives center_x
        # here. The box order therefore looks transposed relative to the
        # reshape in inference(). Left unchanged because fixing it would
        # invalidate existing checkpoints -- please confirm.
        for x in range(width):
            center_x = (x / float(width)) + (0.5 / float(width))
            for y in range(height):
                center_y = (y / float(height)) + (0.5 / float(height))
                for ratio in box_aspect_ratio[index]:
                    box_width = cell_scale * np.sqrt(ratio) / 1.2
                    box_height = cell_scale / np.sqrt(ratio) / 1.2
                    all_default_boxes.append([center_x, center_y, box_width, box_height])
                # Extra, larger box per cell
                all_default_boxes.append([center_x, center_y, cell_scale * 1.5, cell_scale * 1.4])
    all_default_boxes = np.array(all_default_boxes)
    all_default_boxes = check_numerics(all_default_boxes, 'all_default_boxes')
    return all_default_boxes


def check_numerics(input_dataset, message):
    """Reject NaN / +inf / -inf values.

    An object whose string form starts with 'Tensor' is wrapped in
    ``tf.check_numerics``; anything else is converted to a NumPy array and
    checked immediately.

    Returns:
        The wrapped tensor, or ``input_dataset`` itself when it is clean.

    Raises:
        Exception: when array-like data contains NaN or an infinity.
    """
    if str(input_dataset).startswith('Tensor'):
        return tf.check_numerics(input_dataset, message)
    dataset = np.array(input_dataset)
    nan_count = np.count_nonzero(dataset != dataset)  # NaN is the only value != itself
    inf_count = np.count_nonzero(dataset == float("inf"))
    n_inf_count = np.count_nonzero(dataset == float("-inf"))
    if nan_count > 0 or inf_count > 0 or n_inf_count > 0:
        data_error = '【' + message + '】出现数据错误!【nan:' + str(nan_count) + '|inf:' + str(
            inf_count) + '|-inf:' + str(n_inf_count) + '】'
        raise Exception(data_error)
    return input_dataset


if __name__ == '__main__':
    # Both placeholders request the same name, as in the original script.
    imageinput = tf.placeholder(tf.float32, [None, 300, 300, 3], "inputsimage")
    imageinputtest = tf.placeholder(tf.float32, [None, 300, 300, 3], "inputsimage")
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9)
    # Training graph first, then a second graph that reuses the same variables.
    fc, fl, _, _ = inference(imageinput, True, None)

    fc2, fl2, _, _ = inference(imageinputtest, False, True)

    groundtruth_class = tf.placeholder(shape=[None, all_default_boxs_len], dtype=tf.int32,
                                       name='groundtruth_class')
    groundtruth_location = tf.placeholder(shape=[None, all_default_boxs_len, 4], dtype=tf.float32,
                                          name='groundtruth_location')
    groundtruth_positives = tf.placeholder(shape=[None, all_default_boxs_len], dtype=tf.float32,
                                           name='groundtruth_positives')
    groundtruth_negatives = tf.placeholder(shape=[None, all_default_boxs_len], dtype=tf.float32,
                                           name='groundtruth_negatives')
    # Boxes that take part in the class loss: positives plus sampled negatives.
    groundtruth_count = tf.add(groundtruth_positives, groundtruth_negatives)
    learning_rt = 0.000001
    learning_rate = tf.placeholder(tf.float32, None, 'learning_rate')
    loss_allt, loss_classt, loss_locationt = elloss(fc, fl, groundtruth_class, groundtruth_location,
                                                    groundtruth_positives, groundtruth_count)
    train = tf.train.MomentumOptimizer(learning_rate, momentum=0.9).minimize(loss_allt)
    tf.summary.scalar('loss_all_train', loss_allt)
    tf.summary.scalar('loss_class_train', tf.reduce_sum(loss_classt))
    tf.summary.scalar('loss_location_train', tf.reduce_sum(loss_locationt))
    merged = tf.summary.merge_all()
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        trainwrite = tf.summary.FileWriter("logs/", sess.graph)
        sess.run(tf.global_variables_initializer())
        saver2 = tf.train.Saver(var_list=tf.trainable_variables())
        zzz = variables._all_saveable_objects().copy()
        print(zzz)
        saver = tf.train.Saver()
        # NOTE(review): weights are restored from session2.ckpt (trainable
        # variables only) but saved to session.ckpt, so a run never resumes
        # from its own checkpoints unless the file is renamed -- confirm that
        # this is intended.
        if os.path.exists('./session_paramsdddaleasy/session2.ckpt.index'):
            print('\nStart Restore')
            saver2.restore(sess, './session_paramsdddaleasy/session2.ckpt')
            print('\nEnd Restore')
        print('\nStart Training')
        min_loss_location = 100000.
        min_loss_class = 100000.
        # Running sums for the 100-step report, and the previous report's means.
        avg_loss = 0
        avg_loss_class = 0
        avg_loss_loc = 0
        ptlos = 0
        ptlosc = 0
        ptlosl = 0
        while (min_loss_location + min_loss_class) > 0.001 and running_count < 100000:
            running_count += 1
            train_data, actual_data, _ = get_traindata_voc(batch_size)
            if len(train_data) == 0:
                print('No Data Exists!')
                break
            starttime = time.time()
            gt_class, gt_location, gt_positives, gt_negatives = generate_groundtruth_data(actual_data)
            feed = {imageinput: train_data,
                    groundtruth_class: gt_class,
                    groundtruth_location: gt_location,
                    groundtruth_positives: gt_positives,
                    groundtruth_negatives: gt_negatives,
                    learning_rate: learning_rt}
            loss_all, loss_class, loss_location, _, pred_class, pred_location = sess.run(
                [loss_allt, loss_classt, loss_locationt, train, fc, fl], feed_dict=feed)
            l = np.sum(loss_location)
            c = np.sum(loss_class)
            # Accumulate scalar sums: the per-image loss vectors change length
            # when a batch is short, and adding them directly would raise.
            avg_loss += loss_all
            avg_loss_class += c
            avg_loss_loc += l
            if min_loss_location > l:
                min_loss_location = l
            if min_loss_class > c:
                min_loss_class = c
            print('Running:【' + str(running_count) + '】|Loss All:【' + str(
                min_loss_location + min_loss_class) + '|' + str(loss_all) + '】|Location:【' + str(
                l) + '】|Class:【' + str(c) + '】|pred_class:【' + str(
                np.sum(pred_class)) + '|' + str(np.amax(pred_class)) + '|' + str(
                np.min(pred_class)) + '】|pred_location:【' + str(np.sum(pred_location)) + '|' + str(
                np.amax(pred_location)) + '|' + str(np.min(pred_location)) + '】TIME:' + str(
                time.time() - starttime))
            if running_count % 100 == 0:
                # Report order (unchanged): total, class, location, then the
                # difference of each against the previous report.
                mean_all = avg_loss / 100.
                mean_class = avg_loss_class / 100.
                mean_loc = avg_loss_loc / 100.
                print('---------')
                print('avgloss')
                print(mean_all)
                print(mean_class)
                print(mean_loc)
                print(ptlos - mean_all)
                print(ptlosc - mean_class)
                print(ptlosl - mean_loc)
                ptlos = mean_all
                ptlosc = mean_class
                ptlosl = mean_loc
                print('---------')
                avg_loss = 0
                avg_loss_class = 0
                avg_loss_loc = 0
                results = sess.run(merged, feed_dict=feed)
                trainwrite.add_summary(results, running_count)
            if running_count % 500 == 0:
                saver.save(sess, './session_paramsdddaleasy/session.ckpt')
                print('session.ckpt has been saved.')
            # NOTE(review): the original nesting of this call is not
            # recoverable from the dump; it is run once per step here.
            gc.collect()

# --------------------------------------------------------------------------------
# The repository dump continues with the head of /README.md, kept verbatim:
#
# /README.md:
# --------------------------------------------------------------------------------
# DSOD的Tensorflow实现。
# -------------------
#
# 使用Tensorlayer和Skimage等,过程中使用了自己修改后的Tensorlayer增广,上传前修改应该能在原始TL跑,没测试代码。按论文可以在VOC07+12上训练吧,Loss那块Loc给了高权重。
#
源码框架参考了[jasonli8848d](https://github.com/lslcode/SSD_for_Tensorflow)的工作,并做了一些修改,主要集中在Groundtruth生成和数据增广两部分。另外,本代码的数据增广极度耗费CPU资源,训练效率很低,收敛也不快。有问题可以到[某乎](https://zhuanlan.zhihu.com/p/33957333)评论,或者给我发邮件(colinyoo#outlook#com),不过未必能提供帮助。 7 | 8 | 9 | --------------------------------------------------------------------------------