├── DSOD.py
└── README.md


/DSOD.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import gc
  3 | import xml.etree.ElementTree as etxml
  4 | import math
  5 | import random
  6 | import skimage.io
  7 | import skimage.transform
  8 | import numpy as np
  9 | import tensorflow as tf
 10 | import tensorlayer as tl
 11 | from tensorlayer.layers import *
 12 | from tensorflow.python.ops import variables
 13 | import time
 14 | from imutils.object_detection import non_max_suppression
 15 | import imutils
 16 | import cv2
 17 | import matplotlib.pyplot as plt
# Number of images sampled per training batch.
batch_size = 16
# Starting value of the iteration counter that the training loop increments.
running_count = 5000
# NOTE: the image directory is listed at import time, so importing this module
# fails when the dataset directory is missing.
file_name_list = os.listdir('./train_datasets/voc2012/JPEGImages/')
# Class names; the index of a name in this list is its integer label.
lable_arr = ['background','aeroplane','bicycle','bird','boat','bottle','bus','car','cat','chair','cow','diningtable','dog','horse','motorbike','person','pottedplant','sheep','sofa','train','tvmonitor']
img_size = [300, 300]
# Total number of classes
classes_size = 21
# Label value of the background class
background_classes_val = 0
# Number of default boxes per feature-map cell (one entry per detection layer)
default_box_size = [6, 6, 6, 6, 6, 6]
# Aspect ratios (width / height) of the default boxes, one list per detection layer
box_aspect_ratio = [
    [0.5, 1.0, 2.0, 3.0,1/3.0],
    [0.5, 1.0, 2.0, 3.0, 1 / 3.0],
    [0.5, 1.0, 2.0, 3.0, 1 / 3.0],
    [0.5, 1.0, 2.0, 3.0, 1 / 3.0],
    [0.5, 1.0, 2.0, 3.0, 1 / 3.0],
    [0.5, 1.0, 2.0, 3.0, 1 / 3.0]
]
# Minimum default box scale
min_box_scale = 0.1
# Maximum default box scale
max_box_scale = 0.9
# Scale of each feature layer.
# numpy generates an evenly spaced array, which has the same effect as the
# paper's formula s_k=s_min+(s_max-s_min)*(k-1)/(m-1)
# NOTE(review): num is the largest boxes-per-cell count (6), which only happens
# to equal the number of detection layers; len(default_box_size) looks like the
# intended value - confirm before changing either list.
default_box_scale = np.linspace(min_box_scale, max_box_scale, num=np.amax(default_box_size))
print('##   default_box_scale:' + str(default_box_scale))
# Convolution strides
conv_strides_1 = [1, 1, 1, 1]
conv_strides_2 = [1, 2, 2, 1]
conv_strides_3 = [1, 3, 3, 1]

tl_strides_1 = (1, 1)
tl_strides_2 = (2, 2)
tl_strides_3 = (3, 3)
# Pooling window
pool_size = [1, 2, 2, 1]
tl_pool_size = (2, 2)
# Pooling strides
pool_strides = [1, 2, 2, 1]
tl_pool_strides = (2, 2)
# decay parameter of the Batch Normalization algorithm
conv_bn_decay = 0.9999
# variance_epsilon parameter of the Batch Normalization algorithm
conv_bn_epsilon = 0.001
# Jaccard similarity threshold used to mark a default box as a positive match
jaccard_value = 0.55
# The three globals below are placeholders that inference() overwrites.
feature_maps_shape=[]
all_default_boxs_len=0
all_default_boxs=[]

jitter = 0.2
 71 | def get_traindata_voc(batch_size):
 72 |     def get_actual_data_from_xml(xml_path):
 73 |         actual_item = []
 74 |         try:
 75 |             annotation_node = etxml.parse(xml_path).getroot()
 76 |             img_width = float(annotation_node.find('size').find('width').text.strip())
 77 |             img_height = float(annotation_node.find('size').find('height').text.strip())
 78 |             object_node_list = annotation_node.findall('object')
 79 |             for obj_node in object_node_list:
 80 |                 lable = lable_arr.index(obj_node.find('name').text.strip())
 81 |                 bndbox = obj_node.find('bndbox')
 82 |                 x_min = float(bndbox.find('xmin').text.strip())
 83 |                 y_min = float(bndbox.find('ymin').text.strip())
 84 |                 x_max = float(bndbox.find('xmax').text.strip())
 85 |                 y_max = float(bndbox.find('ymax').text.strip())
 86 |                 # 位置数据用比例来表示，格式[center_x,center_y,width,height,lable]
 87 |                 actual_item.append([((x_min + x_max) / 2 / img_width), ((y_min + y_max) / 2 / img_height),
 88 |                                     ((x_max - x_min) / img_width), ((y_max - y_min) / img_height), lable])
 89 |             return actual_item
 90 |         except:
 91 |             return None
 92 | 
 93 |     train_data = []
 94 |     actual_data = []
 95 |     file_list = random.sample(file_name_list, batch_size)
 96 |     for f_name in file_list:
 97 |         img_path = './train_datasets/voc2012/JPEGImages/' + f_name
 98 |         xml_path = './train_datasets/voc2012/Annotations/' + f_name.replace('.jpg', '.xml')
 99 |         if os.path.splitext(img_path)[1].lower() == '.jpg':
100 |             actual_item = get_actual_data_from_xml(xml_path)
101 |             img = skimage.io.imread(img_path)
102 |             if actual_item != None:
103 |                 countwhile=0
104 |                 while True:
105 |                     clas=[]
106 |                     coords=[]
107 |                     for x in actual_item:
108 |                         clas.append(x[4])
109 |                         coords.append([x[0],x[1],x[2],x[3]])
110 |                     tmp0 = random.randint(-30, 50)
111 |                     tmp1 = random.randint(-30, 50)
112 |                     imgr=img.copy()
113 |                     scale = np.max((400 / float(img.shape[1]),
114 |                                     400 / float(img.shape[0])))
115 |                     im, coords = tl.prepro.obj_box_imresize(imgr, coords,
116 |                                                             [int(img.shape[0] * scale) + tmp0, int(img.shape[1] * scale) + tmp1],
117 |                                                             is_rescale=True, interp='bicubic')
118 |                     # print(im.shape)
119 |                     # print(coords)
120 | 
121 |                     for wi in range(7):
122 |                         imt, clast, coordst = tl.prepro.obj_box_zoom(im, clas, coords, zoom_range=(1.0, 2.2),
123 |                                                                   fill_mode='nearest',
124 |                                                                   order=1, is_rescale=True, is_center=True,
125 |                                                                   is_random=True,
126 |                                                                   thresh_wh=0.04, thresh_wh2=8.0)
127 |                         # print(im.shape)
128 |                         if clast!=[]:
129 |                             im=imt
130 |                             clas= clast
131 |                             coords =coordst
132 |                             break
133 |                         if wi>=6:
134 |                             im, clas, coords = tl.prepro.obj_box_zoom(im, clas, coords, zoom_range=(0.7, 1.2),
135 |                                                                          fill_mode='nearest',
136 |                                                                          order=1, is_rescale=True, is_center=True,
137 |                                                                          is_random=True,
138 |                                                                          thresh_wh=0.05, thresh_wh2=8.0)
139 | 
140 |                     im, coords = tl.prepro.obj_box_left_right_flip(im,
141 |                                                                    coords, is_rescale=True, is_center=True, is_random=True)
142 |                     # print(coords)
143 |                     for wi in range(8):
144 |                         imt, clast, coordst = tl.prepro.obj_box_crop(im, clas, coords,
145 |                                                                   wrg=300, hrg=300,
146 |                                                                   is_rescale=True, is_center=True, is_random=True,
147 |                         thresh_wh=0.07, thresh_wh2=7.0)
148 |                         if clast!=[]:
149 |                             im=imt
150 |                             clas= clast
151 |                             coords =coordst
152 |                             break
153 |                         if wi==7:
154 |                             im, clas, coords = tl.prepro.obj_box_crop(im, clas, coords,
155 |                                                                          wrg=300, hrg=300,
156 |                                                                          is_rescale=True, is_center=True,
157 |                                                                          is_random=True,
158 |                                                                          thresh_wh=0.07, thresh_wh2=8.0)
159 | 
160 | 
161 |                     im = tl.prepro.illumination(im, gamma=(0.2, 1.2),
162 |                                                 contrast=(0.2, 1.2), saturation=(0.2, 1.2), is_random=True)
163 |                     im = tl.prepro.adjust_hue(im, hout=0.1, is_offset=True,
164 |                                               is_clip=True, is_random=True)
165 |                     im = im / 127.5 - 1.
166 |                     aitems = []
167 |                     if clas!=[]:
168 |                         for x in range(len(clas)):
169 |                             aitem=[coords[x][0],coords[x][1],coords[x][2],coords[x][3],clas[x]]
170 |                             aitems.append(aitem)
171 |                         actual_data.append(aitems)
172 |                         train_data.append(im)
173 |                         break
174 |                     countwhile+=1
175 |                     if countwhile>=4:
176 |                         clas = []
177 |                         coords = []
178 |                         for x in actual_item:
179 |                             clas.append(x[4])
180 |                             coords.append([x[0], x[1], x[2], x[3]])
181 |                         tmp0 = random.randint(1, 30)
182 |                         tmp1 = random.randint(1, 30)
183 |                         imgr = img.copy()
184 |                         im, coords = tl.prepro.obj_box_imresize(imgr, coords,
185 |                                                                 [300 + tmp0,
186 |                                                                  300 + tmp1],
187 |                                                                 is_rescale=True, interp='bicubic')
188 |                         im, coords = tl.prepro.obj_box_left_right_flip(im,
189 |                                                                        coords, is_rescale=True, is_center=True,
190 |                                                                        is_random=True)
191 |                         im, clas, coords = tl.prepro.obj_box_crop(im, clas, coords,
192 |                                                                      wrg=300, hrg=300,
193 |                                                                      is_rescale=True, is_center=True,
194 |                                                                      is_random=True,
195 |                                                                      thresh_wh=0.02, thresh_wh2=10.0)
196 | 
197 | 
198 | 
199 |                         im = tl.prepro.illumination(im, gamma=(0.8, 1.2),
200 |                                                     contrast=(0.8, 1.2), saturation=(0.8, 1.2), is_random=True)
201 |                         im = tl.prepro.pixel_value_scale(im, 0.1, [0, 255], is_random=True)
202 |                         im = im / 127.5 - 1.
203 | 
204 |                         aitems = []
205 |                         if len(clas) != 0:
206 |                             for x in range(len(clas)):
207 |                                 aitem = [coords[x][0], coords[x][1], coords[x][2], coords[x][3], clas[x]]
208 |                                 aitems.append(aitem)
209 |                             actual_data.append(aitems)
210 |                             train_data.append(im)
211 |                             break
212 |             else:
213 |                 print('Error : ' + xml_path)
214 |                 continue
215 |     return train_data, actual_data, file_list
216 | 
def generate_groundtruth_data(input_actual_data):
    """Match ground-truth boxes to the default boxes and build training targets.

    Reads the module-level ``all_default_boxs`` / ``all_default_boxs_len``
    (set by ``inference``), ``jaccard_value``, ``classes_size`` and
    ``background_classes_val``.

    Args:
        input_actual_data: one list per image of
            ``[center_x, center_y, width, height, label]`` entries.

    Returns:
        ``(gt_class, gt_location, gt_positives, gt_negatives)`` numpy arrays.
        ``gt_location`` has shape ``(n_images, all_default_boxs_len, 4)``; the
        others have shape ``(n_images, all_default_boxs_len)``.

    Raises:
        Exception: from ``check_numerics`` when a result holds NaN or inf.
    """
    # Allocate empty arrays that will hold the ground truth.
    input_actual_data_len = len(input_actual_data)
    gt_class = np.zeros((input_actual_data_len, all_default_boxs_len))
    gt_location = np.zeros((input_actual_data_len, all_default_boxs_len, 4))
    gt_positives_jacc = np.zeros((input_actual_data_len, all_default_boxs_len))
    gt_positives = np.zeros((input_actual_data_len, all_default_boxs_len))
    gt_negatives = np.zeros((input_actual_data_len, all_default_boxs_len))
    background_jacc = max(0, (jaccard_value - 0.2))
    # Initialise the positive training examples.
    for img_index in range(input_actual_data_len):
        for pre_actual in input_actual_data[img_index]:
            gt_class_val = pre_actual[-1]
            # Out-of-range labels are treated as background.
            if not 0 <= gt_class_val < classes_size:
                gt_class_val = background_classes_val
            gt_box_val = pre_actual[:-1]
            for boxe_index in range(all_default_boxs_len):
                jacc, gt_box_val_loc = jaccard(gt_box_val, all_default_boxs[boxe_index])
                if jacc < jaccard_value:
                    continue
                # Keep the best-overlapping ground truth for this default box
                # instead of letting a later, worse match overwrite it.
                if jacc < gt_positives_jacc[img_index][boxe_index]:
                    continue
                gt_class[img_index][boxe_index] = gt_class_val
                gt_location[img_index][boxe_index] = gt_box_val_loc
                gt_positives_jacc[img_index][boxe_index] = jacc
                gt_positives[img_index][boxe_index] = 1
                gt_negatives[img_index][boxe_index] = 0
        positive_count = int(np.sum(gt_positives[img_index]))
        # If there is no positive example, create a random one to prevent nan
        # (the losses divide by the number of positives).
        if positive_count == 0:
            random_pos_index = np.random.randint(low=0, high=all_default_boxs_len, size=1)[0]
            gt_class[img_index][random_pos_index] = background_classes_val
            gt_location[img_index][random_pos_index] = [0.00001, 0.00001, 0.00001, 0.00001]
            gt_positives_jacc[img_index][random_pos_index] = jaccard_value
            gt_positives[img_index][random_pos_index] = 1
            gt_negatives[img_index][random_pos_index] = 0
            positive_count = 1
        # Draw up to 3 negatives per positive, never more than the boxes that
        # are left. Kept as a Python int because np.random.randint rejects a
        # float ``size``.
        gt_neg_end_count = min(positive_count * 3, all_default_boxs_len - positive_count)
        gt_neg_index = np.random.randint(low=0, high=all_default_boxs_len, size=gt_neg_end_count)
        for r_index in gt_neg_index:
            if gt_positives_jacc[img_index][r_index] < background_jacc and gt_positives[img_index][r_index] != 1:
                gt_class[img_index][r_index] = background_classes_val
                gt_positives[img_index][r_index] = 0
                gt_negatives[img_index][r_index] = 1
    gt_class = check_numerics(gt_class, 'gt_class')
    gt_location = check_numerics(gt_location, 'gt_location')
    gt_positives = check_numerics(gt_positives, 'gt_positives')
    gt_negatives = check_numerics(gt_negatives, 'gt_negatives')
    return gt_class, gt_location, gt_positives, gt_negatives
265 | 
def jaccard(rect1, rect2):
    """Return the Jaccard overlap of two boxes and the encoded offset of rect1.

    Both boxes are ``[center_x, center_y, width, height]`` in image-relative
    units. Each box is clipped to the unit image ``[0, 1] x [0, 1]`` before
    the intersection and the areas are measured, so the ratio cannot exceed 1
    even when both boxes overhang the same border.

    Args:
        rect1: the box being encoded (the caller passes a ground-truth box).
        rect2: the reference box (the caller passes a default box).

    Returns:
        ``(overlap, location)``. When the clipped boxes overlap, ``location``
        is ``[dx / w2, dy / h2, log(w1 / w2), log(h1 / h2)]`` computed from
        the unclipped boxes. Otherwise the result is
        ``(0, [0.00001, 0.00001, 0.00001, 0.00001])``.
    """
    def clipped_edges(rect):
        # (left, top, right, bottom) with the part outside the image removed.
        half_width = rect[2] / 2
        half_height = rect[3] / 2
        return (max(0, rect[0] - half_width), max(0, rect[1] - half_height),
                min(1, rect[0] + half_width), min(1, rect[1] + half_height))

    left1, top1, right1, bottom1 = clipped_edges(rect1)
    left2, top2, right2, bottom2 = clipped_edges(rect2)
    x_overlap = max(0, min(right1, right2) - max(left1, left2))
    y_overlap = max(0, min(bottom1, bottom2) - max(top1, top2))
    intersection = x_overlap * y_overlap
    # A box lying completely outside the image has zero visible area.
    area_box_a = max(0, right1 - left1) * max(0, bottom1 - top1)
    area_box_b = max(0, right2 - left2) * max(0, bottom2 - top2)
    union = area_box_a + area_box_b - intersection
    if intersection > 0 and union > 0:
        location = [(rect1[0] - rect2[0]) / rect2[2],
                    (rect1[1] - rect2[1]) / rect2[3],
                    math.log(rect1[2] / rect2[2]),
                    math.log(rect1[3] / rect2[3])]
        return intersection / union, location
    return 0, [0.00001, 0.00001, 0.00001, 0.00001]
293 | 
def denseblock(input,blocknum=1,step=48,firstchannel=192,is_train=True,name='denseblock',reuse=None):
    """Build a dense block of ``blocknum`` bottleneck units.

    Each unit applies BatchNorm (ReLU activation) -> 1x1 conv with
    ``firstchannel`` filters -> BatchNorm (ReLU activation) -> 3x3 conv with
    ``step`` filters, then concatenates that output onto the running feature
    layer along the last axis.

    Args:
        input: layer to build on.
        blocknum: number of units; with 0 the identity-wrapped input is
            returned (this used to raise ``UnboundLocalError``).
        step: filter count of each 3x3 conv.
        firstchannel: filter count of each 1x1 bottleneck conv.
        is_train: forwarded to every ``BatchNormLayer``.
        name: variable scope of the block.
        reuse: forwarded to ``tf.variable_scope`` and
            ``tl.layers.set_name_reuse``.

    Returns:
        The layer holding the concatenation after the last unit.
    """
    with tf.variable_scope(name, reuse=reuse):
        tl.layers.set_name_reuse(reuse)
        nettemp = LambdaLayer(input, lambda x: tf.identity(x), name="INPUTS")
        for x in range(blocknum):
            unit = str(x)
            netbn = BatchNormLayer(nettemp, is_train=is_train, decay=conv_bn_decay, act=tf.nn.relu,
                                   name='bn/' + unit)
            net = Conv2d(netbn, firstchannel, (1, 1), (1, 1), padding='SAME', name='neta/' + unit)
            # The second BN name carries the block name as a prefix; it is
            # left untouched so existing checkpoints keep their variable names.
            netbn = BatchNormLayer(net, is_train=is_train, decay=conv_bn_decay, act=tf.nn.relu,
                                   name=name + 'bn2/' + unit)
            net = Conv2d(netbn, step, (3, 3), (1, 1), padding='SAME', name='netb/' + unit)
            nettemp = ConcatLayer([nettemp, net], -1, name='concattemp/' + unit)
    return nettemp
306 | 
def denseblockpl(input,step=256,firstchannel=256,is_train=True,name='densepl',reuse=None):
    """Build a two-branch down-sampling block and concatenate both branches.

    Pooling branch: 2x2 max-pool with stride 2 -> BatchNorm (ReLU activation)
    -> 1x1 conv with ``firstchannel`` filters.
    Conv branch: BatchNorm (ReLU activation) -> 1x1 conv with ``firstchannel``
    filters -> BatchNorm (ReLU activation) -> 3x3 conv with stride 2 and
    ``step`` filters.

    Returns:
        The ``ConcatLayer`` joining [conv branch, pooling branch] on the last
        axis.
    """
    with tf.variable_scope(name, reuse=reuse):
        tl.layers.set_name_reuse(reuse)
        source = LambdaLayer(input, lambda x: tf.identity(x), name="INPUTS")

        # Pooling branch.
        pooled = MaxPool2d(source, (2, 2), (2, 2), padding='SAME', name='bnpool2')
        pooled = BatchNormLayer(pooled, is_train=is_train, decay=conv_bn_decay,
                                act=tf.nn.relu, name=name + 'bn2pl')
        pooled = Conv2d(pooled, firstchannel, (1, 1), (1, 1), padding='SAME', name='bnconv2')

        # Strided convolution branch.
        conv = BatchNormLayer(source, is_train=is_train, decay=conv_bn_decay,
                              act=tf.nn.relu, name='bn')
        conv = Conv2d(conv, firstchannel, (1, 1), (1, 1), padding='SAME', name='neta')
        conv = BatchNormLayer(conv, is_train=is_train, decay=conv_bn_decay,
                              act=tf.nn.relu, name='bn2')
        conv = Conv2d(conv, step, (3, 3), (2, 2), padding='SAME', name='netb')

        merged = ConcatLayer([conv, pooled], -1, name='concat')
    return merged
320 | 
def denseblockfin(input,step=256,firstchannel=256,is_train=True,name='densepl',reuse=None):
    """Build the final two-branch block, which shrinks via VALID 3x3 windows.

    Pooling branch: 3x3 max-pool with stride 1 and VALID padding -> BatchNorm
    (ReLU activation) -> 1x1 conv with ``firstchannel`` filters.
    Conv branch: BatchNorm (ReLU activation) -> 1x1 conv with ``firstchannel``
    filters -> BatchNorm (ReLU activation) -> 3x3 conv with stride 1, VALID
    padding and ``step`` filters.

    Returns:
        The ``ConcatLayer`` joining [conv branch, pooling branch] on the last
        axis.
    """
    with tf.variable_scope(name, reuse=reuse):
        tl.layers.set_name_reuse(reuse)
        source = LambdaLayer(input, lambda x: tf.identity(x), name="INPUTS")

        # Pooling branch.
        pooled = MaxPool2d(source, (3, 3), (1, 1), padding='VALID', name='bnpool2')
        pooled = BatchNormLayer(pooled, is_train=is_train, decay=conv_bn_decay,
                                act=tf.nn.relu, name=name + 'bn2pl')
        pooled = Conv2d(pooled, firstchannel, (1, 1), (1, 1), padding='SAME', name='bnconv2')

        # Convolution branch.
        conv = BatchNormLayer(source, is_train=is_train, decay=conv_bn_decay,
                              act=tf.nn.relu, name='bn')
        conv = Conv2d(conv, firstchannel, (1, 1), (1, 1), padding='SAME', name='neta')
        conv = BatchNormLayer(conv, is_train=is_train, decay=conv_bn_decay,
                              act=tf.nn.relu, name='bn2')
        conv = Conv2d(conv, step, (3, 3), (1, 1), padding='VALID', name='netb')

        merged = ConcatLayer([conv, pooled], -1, name='concat')
    return merged
334 | 
def inference(inputs, is_train, reuse):
    """Build the detection network and return its raw class/location outputs.

    Side effects: overwrites the module globals ``feature_maps_shape``,
    ``all_default_boxs`` and ``all_default_boxs_len``, and prints the output
    shapes and the default-box count.

    Args:
        inputs: image tensor fed to the ``InputLayer``.
        is_train: forwarded to every ``BatchNormLayer``.
        reuse: forwarded to the variable scopes and ``set_name_reuse``.

    Returns:
        ``(feature_class, feature_location, all_default_boxs,
        all_default_boxs_len)``: the first ``classes_size`` channels of every
        prediction, the remaining 4 channels, the generated default boxes and
        their count.
    """
    W_init = tf.contrib.layers.xavier_initializer()
    with tf.variable_scope("model", reuse=reuse):
        tl.layers.set_name_reuse(reuse)
        # Stem: three 3x3 convolutions (the first with stride 2) and a 2x2 pool.
        net = InputLayer(inputs, name='input')
        net = Conv2d(net, 64, (3, 3), (2, 2), padding='SAME',
                     W_init=W_init, name='stem1')
        net = BatchNormLayer(net, is_train=is_train, decay=conv_bn_decay, act=tf.nn.relu, name='stem1_bn')
        net = Conv2d(net, 64, (3, 3), (1, 1), padding='SAME',
                     W_init=W_init, name='stem2')
        net = BatchNormLayer(net, is_train=is_train, decay=conv_bn_decay, act=tf.nn.relu, name='stem2_bn')
        net = Conv2d(net, 128, (3, 3), (1, 1), padding='SAME',
                     W_init=W_init, name='stem3')
        net = BatchNormLayer(net, is_train=is_train, decay=conv_bn_decay, act=tf.nn.relu, name='stem3_bn')
        net = MaxPool2d(net, filter_size=(2, 2), strides=(2, 2), name='stem3_pool')
        # Dense block 0 followed by a 1x1 transition conv and a 2x2 pool.
        net = denseblock(net, blocknum=6, step=48, firstchannel=192, is_train=is_train, name='denseblock0', reuse=reuse)
        net = BatchNormLayer(net, is_train=is_train, decay=conv_bn_decay, act=tf.nn.relu, name='denseblock0_bn')
        net = Conv2d(net, 416, (1, 1), (1, 1), padding='SAME',
                     W_init=W_init, name='denseblock0_cnn')
        net = MaxPool2d(net, filter_size=(2, 2), strides=(2, 2), name='denseblock0_pool')
        # Dense block 1; its transition output feeds the first detection map.
        net = denseblock(net, blocknum=8, step=48, firstchannel=192, is_train=is_train, name='denseblock1', reuse=reuse)
        net = BatchNormLayer(net, is_train=is_train, decay=conv_bn_decay, act=tf.nn.relu, name='denseblock1_bn')
        net = Conv2d(net, 800, (1, 1), (1, 1), padding='SAME',
                     W_init=W_init, name='denseblock1_cnn')
        netfirst=BatchNormLayer(net, is_train=is_train, decay=conv_bn_decay, act=tf.nn.relu, name='feature_first_bn')
        net = MaxPool2d(net, filter_size=(2, 2), strides=(2, 2), name='denseblock2_pool1')
        # Dense blocks 2 and 3 (no pooling between them).
        net = denseblock(net, blocknum=8, step=48, firstchannel=192, is_train=is_train, name='denseblock2', reuse=reuse)
        net = BatchNormLayer(net, is_train=is_train, decay=conv_bn_decay, act=tf.nn.relu, name='denseblock2_bn')
        net = Conv2d(net, 1184, (1, 1), (1, 1), padding='SAME',
                     W_init=W_init, name='denseblock2_cnn')
        net = denseblock(net, blocknum=8, step=48, firstchannel=192, is_train=is_train, name='denseblock3', reuse=reuse)
        net = BatchNormLayer(net, is_train=is_train, decay=conv_bn_decay, act=tf.nn.relu, name='denseblock3_bn')
        net = Conv2d(net, 256, (1, 1), (1, 1), padding='SAME',
                     W_init=W_init, name='denseblock2_cnna')
        # Second detection map: the backbone output concatenated with a
        # pooled, 1x1-convolved copy of the first detection map.
        netpl=MaxPool2d(netfirst, filter_size=(2, 2), strides=(2, 2), name='First_pool')
        netpl=BatchNormLayer(netpl, is_train=is_train, decay=conv_bn_decay, act=tf.nn.relu, name='First_bn')
        netpl = Conv2d(netpl, 256, (1, 1), (1, 1), padding='SAME',
                     W_init=W_init, name='denseblock2_cnnb')
        net=ConcatLayer([net,netpl],-1,"Second_Cat")
        netsecond = BatchNormLayer(net, is_train=is_train, decay=conv_bn_decay, act=tf.nn.relu, name='feature_second_bn')
        # Detection maps three to six come from successive two-branch blocks.
        net = denseblockpl(net, step=256, firstchannel=256, is_train=is_train, name='denseplz1', reuse=reuse)
        netthird = BatchNormLayer(net, is_train=is_train, decay=conv_bn_decay, act=tf.nn.relu,
                                   name='feature_third_bn')
        net = denseblockpl(net, step=128, firstchannel=128, is_train=is_train, name='denseplz2', reuse=reuse)
        netfourth = BatchNormLayer(net, is_train=is_train, decay=conv_bn_decay, act=tf.nn.relu,
                                   name='feature_fourth_bn')
        net = denseblockpl(net, step=128, firstchannel=128, is_train=is_train, name='denseplz3', reuse=reuse)
        netfifth = BatchNormLayer(net, is_train=is_train, decay=conv_bn_decay, act=tf.nn.relu,
                                   name='feature_fifth_bn')
        net = denseblockfin(net, step=128, firstchannel=128, is_train=is_train, name='denseplz4', reuse=reuse)
        netsixth = BatchNormLayer(net, is_train=is_train, decay=conv_bn_decay, act=tf.nn.relu,
                                   name='feature_sixth_bn')
        # Prediction heads: one 3x3 conv per detection map, emitting
        # (classes_size + 4) values for each default box of each cell.
        outfirst=Conv2d(netfirst, default_box_size[0] * (classes_size + 4), (3, 3), (1, 1), padding='SAME',
                     W_init=W_init, name='firstout')
        outsecond=Conv2d(netsecond, default_box_size[1] * (classes_size + 4), (3, 3), (1, 1), padding='SAME',
                     W_init=W_init, name='secondout')
        outthird=Conv2d(netthird, default_box_size[2] * (classes_size + 4), (3, 3), (1, 1), padding='SAME',
                     W_init=W_init, name='thirdout')
        outfourth=Conv2d(netfourth, default_box_size[3] * (classes_size + 4), (3, 3), (1, 1), padding='SAME',
                     W_init=W_init, name='fourthout')
        outfifth=Conv2d(netfifth, default_box_size[4] * (classes_size + 4), (3, 3), (1, 1), padding='SAME',
                     W_init=W_init, name='fifthout')
        outsixth=Conv2d(netsixth, default_box_size[5] * (classes_size + 4), (3, 3), (1, 1), padding='SAME',
                     W_init=W_init, name='sixthout')
        features1=outfirst.outputs
        features2=outsecond.outputs
        features3=outthird.outputs
        features4=outfourth.outputs
        features5=outfifth.outputs
        features6=outsixth.outputs
        feature_maps = [features1, features2, features3, features4, features5,features6]
        # Publish the head shapes; generate_all_default_boxs() reads this global.
        global feature_maps_shape
        feature_maps_shape = [m.get_shape().as_list() for m in feature_maps]
        # Flatten every head to [batch, cells * boxes, classes_size + 4] and
        # join them along the box axis.
        tmp_all_feature = []
        for i, fmap in zip(range(len(feature_maps)), feature_maps):
            # NOTE(review): the names follow shape axes 1 and 2; presumably the
            # tensors are NHWC, which would make axis 1 the row dimension -
            # TODO confirm. Only the product is used here.
            width = feature_maps_shape[i][1]
            height = feature_maps_shape[i][2]
            tmp_all_feature.append(
                tf.reshape(fmap, [-1, (width * height * default_box_size[i]), (classes_size + 4)]))
        tmp_all_feature = tf.concat(tmp_all_feature, axis=1)
        # The first classes_size channels are class logits, the last 4 are box offsets.
        feature_class = tmp_all_feature[:, :, :classes_size]
        feature_location = tmp_all_feature[:, :, classes_size:]
        print('##   feature_class shape : ' + str(feature_class.get_shape().as_list()))
        print('##   feature_location shape : ' + str(feature_location.get_shape().as_list()))
        # Generate all default boxes
        global all_default_boxs
        all_default_boxs = generate_all_default_boxs()
        # print(all_default_boxs)
        global all_default_boxs_len
        all_default_boxs_len = len(all_default_boxs)
        print('##   all default boxs : ' + str(all_default_boxs_len))
    return feature_class,feature_location,all_default_boxs,all_default_boxs_len
427 | 
def smooth_L1(x):
    """Element-wise smooth L1: ``0.5 * x**2`` where ``|x| <= 1``, else ``|x| - 0.5``."""
    within_unit = tf.less_equal(tf.abs(x), 1.0)
    quadratic = tf.multiply(0.5, tf.pow(x, 2.0))
    linear = tf.subtract(tf.abs(x), 0.5)
    return tf.where(within_unit, quadratic, linear)
430 | 
def elloss(feature_class,feature_location,groundtruth_class,groundtruth_location,groundtruth_positives,groundtruth_count):
    """Build the combined classification and localisation loss.

    Per image, the location loss is the smooth-L1 error summed over the last
    axis, masked by ``groundtruth_positives`` and divided by the sum of that
    mask. The class loss is the sparse softmax cross-entropy weighted by
    ``groundtruth_count`` and divided by the sum of those weights.

    Returns:
        ``(loss_all, loss_class, loss_location)`` where ``loss_all`` is the
        sum over images of ``loss_class + 5 * loss_location``.
    """
    softmax_cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=feature_class, labels=groundtruth_class)

    # Localisation term: only boxes flagged in groundtruth_positives count.
    location_error = smooth_L1(tf.subtract(groundtruth_location, feature_location))
    per_box_location = tf.reduce_sum(location_error, reduction_indices=2)
    masked_location = tf.multiply(per_box_location, groundtruth_positives)
    location_total = tf.reduce_sum(masked_location, reduction_indices=1)
    positive_total = tf.reduce_sum(groundtruth_positives, reduction_indices=1)
    loss_location = tf.div(location_total, positive_total)

    # Classification term: weighted by groundtruth_count.
    weighted_entropy = tf.multiply(softmax_cross_entropy, groundtruth_count)
    class_total = tf.reduce_sum(weighted_entropy, reduction_indices=1)
    count_total = tf.reduce_sum(groundtruth_count, reduction_indices=1)
    loss_class = tf.div(class_total, count_total)

    loss_all = tf.reduce_sum(tf.add(loss_class, loss_location * 5))
    return loss_all, loss_class, loss_location
443 | 
def generate_all_default_boxs():
    """Generate every default box for the shapes in ``feature_maps_shape``.

    For each cell of each feature map this emits one box per entry of
    ``box_aspect_ratio[index]`` (size ``cell_scale * sqrt(ratio) / 1.2`` by
    ``cell_scale / sqrt(ratio) / 1.2``) followed by one extra box of size
    ``cell_scale * 1.5`` by ``cell_scale * 1.4``, all centred on the cell.

    Returns:
        A numpy array of ``[center_x, center_y, box_width, box_height]`` rows,
        validated with ``check_numerics``.
    """
    boxes = []
    for index, map_shape in enumerate(feature_maps_shape):
        # NOTE(review): shape axis 1 drives the outer loop and center_x; if
        # the maps are NHWC that axis is the row dimension, so centres may be
        # transposed relative to the flattened predictions - TODO confirm.
        width = int(map_shape[1])
        height = int(map_shape[2])
        cell_scale = default_box_scale[index]
        for x in range(width):
            center_x = (x / float(width)) + (0.5 / float(width))
            for y in range(height):
                center_y = (y / float(height)) + (0.5 / float(height))
                for ratio in box_aspect_ratio[index]:
                    box_width = cell_scale*np.sqrt(ratio)/1.2
                    box_height = cell_scale/np.sqrt(ratio)/1.2
                    boxes.append([center_x, center_y, box_width, box_height])
                # One additional, larger box per cell.
                boxes.append([center_x, center_y, cell_scale*1.5, cell_scale*1.4])
    return check_numerics(np.array(boxes), 'all_default_boxes')
462 | 
def check_numerics(input_dataset, message):
    """Validate that ``input_dataset`` holds no NaN or infinite values.

    Objects whose string form starts with ``'Tensor'`` are wrapped with
    ``tf.check_numerics``. Anything else is converted to a numpy array and
    scanned directly.

    Args:
        input_dataset: tensor, numpy array or nested list to validate.
        message: label inserted into the error text.

    Returns:
        The wrapped tensor, or ``input_dataset`` itself for non-tensor input.

    Raises:
        Exception: when a non-tensor input holds NaN, +inf or -inf.
    """
    if str(input_dataset).startswith('Tensor'):
        return tf.check_numerics(input_dataset, message)

    values = np.array(input_dataset)
    # NaN is the only value that differs from itself.
    nan_count = np.count_nonzero(values != values)
    inf_count = np.count_nonzero(values == float("inf"))
    n_inf_count = np.count_nonzero(values == float("-inf"))
    if nan_count > 0 or inf_count > 0 or n_inf_count > 0:
        data_error = '【' + message + '】出现数据错误！【nan：' + str(nan_count) + '|inf：' + str(
            inf_count) + '|-inf：' + str(n_inf_count) + '】'
        raise Exception(data_error)
    return input_dataset
476 | 
if __name__ == '__main__':
    # Training entry point.
    #
    # Builds two copies of the network (one fed by `imageinput`, one by
    # `imageinputtest`), the loss and a momentum optimiser, optionally restores
    # weights from disk, and then runs the optimisation loop until either the
    # best observed (location + class) loss drops to 0.001 or `running_count`
    # reaches 100000.
    #
    # Names bound here are module globals; the established ones (`sess`,
    # `saver`, `imageinput`, ...) are kept unchanged because other parts of the
    # file may refer to them.

    # Reporting / checkpoint cadence, in iterations.
    report_every = 100
    checkpoint_every = 500
    checkpoint_dir = './session_paramsdddaleasy'

    # NOTE(review): both placeholders request the name "inputsimage";
    # TensorFlow is expected to uniquify the second one — confirm nothing
    # looks these tensors up by name.
    imageinput = tf.placeholder(tf.float32, [None, 300, 300, 3], "inputsimage")
    imageinputtest = tf.placeholder(tf.float32, [None, 300, 300, 3], "inputsimage")
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9)

    # The two calls differ only in their 2nd/3rd arguments
    # (presumably is_training / reuse — TODO confirm against `inference`).
    fc, fl, _, _ = inference(imageinput, True, None)
    fc2, fl2, _, _ = inference(imageinputtest, False, True)

    # Ground-truth inputs, one entry per default box.
    groundtruth_class = tf.placeholder(shape=[None, all_default_boxs_len], dtype=tf.int32,
                                       name='groundtruth_class')
    groundtruth_location = tf.placeholder(shape=[None, all_default_boxs_len, 4], dtype=tf.float32,
                                          name='groundtruth_location')
    groundtruth_positives = tf.placeholder(shape=[None, all_default_boxs_len], dtype=tf.float32,
                                           name='groundtruth_positives')
    groundtruth_negatives = tf.placeholder(shape=[None, all_default_boxs_len], dtype=tf.float32,
                                           name='groundtruth_negatives')
    groundtruth_count = tf.add(groundtruth_positives, groundtruth_negatives)

    # The learning rate is fed through a placeholder on every step.
    learning_rt = 0.000001
    learning_rate = tf.placeholder(tf.float32, None, 'learning_rate')

    loss_allt, loss_classt, loss_locationt = elloss(
        fc, fl, groundtruth_class, groundtruth_location, groundtruth_positives, groundtruth_count)
    train = tf.train.MomentumOptimizer(learning_rate, momentum=0.9).minimize(loss_allt)

    tf.summary.scalar('loss_all_train', loss_allt)
    tf.summary.scalar('loss_class_train', tf.reduce_sum(loss_classt))
    tf.summary.scalar('loss_location_train', tf.reduce_sum(loss_locationt))
    merged = tf.summary.merge_all()

    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        trainwrite = tf.summary.FileWriter("logs/", sess.graph)
        try:
            sess.run(tf.global_variables_initializer())

            # saver2 covers only the trainable variables and is used for
            # restoring; saver covers every saveable object and is used for
            # writing checkpoints.
            saver2 = tf.train.Saver(var_list=tf.trainable_variables())
            zzz = variables._all_saveable_objects().copy()
            print(zzz)
            saver = tf.train.Saver()

            # NOTE(review): weights are restored from session2.ckpt but saved
            # to session.ckpt, so a restarted run does not pick up its own
            # checkpoints unless the files are renamed by hand — confirm this
            # is intended.
            if os.path.exists('./session_paramsdddaleasy/session2.ckpt.index'):
                print('\nStart Restore')
                saver2.restore(sess, './session_paramsdddaleasy/session2.ckpt')
                print('\nEnd Restore')

            # Make sure the checkpoint directory exists before the first save;
            # some TF 1.x releases refuse to save into a missing directory.
            os.makedirs(checkpoint_dir, exist_ok=True)

            print('\nStart Training')
            min_loss_location = 100000.
            min_loss_class = 100000.

            # Running sums over the current reporting window, and the means of
            # the previous window (used to print the window-to-window change).
            window_steps = 0
            window_loss_all = 0
            window_loss_class = 0
            window_loss_location = 0
            prev_mean_all = 0
            prev_mean_class = 0
            prev_mean_location = 0

            while (min_loss_location + min_loss_class) > 0.001 and running_count < 100000:
                running_count += 1
                train_data, actual_data, _ = get_traindata_voc(batch_size)

                # Bail out before touching the ground-truth generator when the
                # loader produced nothing.
                if len(train_data) == 0:
                    print('No Data Exists!')
                    break

                starttime = time.time()
                gt_class, gt_location, gt_positives, gt_negatives = generate_groundtruth_data(actual_data)

                # One feed dict per step, shared by the train run and the
                # summary run below.
                step_feed = {
                    imageinput: train_data,
                    groundtruth_class: gt_class,
                    groundtruth_location: gt_location,
                    groundtruth_positives: gt_positives,
                    groundtruth_negatives: gt_negatives,
                    learning_rate: learning_rt,
                }
                loss_all, loss_class, loss_location, _, pred_class, pred_location = sess.run(
                    [loss_allt, loss_classt, loss_locationt, train, fc, fl],
                    feed_dict=step_feed)

                l = np.sum(loss_location)
                c = np.sum(loss_class)

                # Accumulate the summed losses so the window statistics do not
                # depend on the per-element loss tensors keeping one shape.
                window_steps += 1
                window_loss_all += loss_all
                window_loss_class += c
                window_loss_location += l

                if min_loss_location > l:
                    min_loss_location = l
                if min_loss_class > c:
                    min_loss_class = c

                print(''.join([
                    'Running:【', str(running_count),
                    '】|Loss All:【', str(min_loss_location + min_loss_class),
                    '|', str(loss_all),
                    '】|Location:【', str(l),
                    '】|Class:【', str(c),
                    '】|pred_class:【', str(np.sum(pred_class)),
                    '|', str(np.amax(pred_class)),
                    '|', str(np.min(pred_class)),
                    '】|pred_location:【', str(np.sum(pred_location)),
                    '|', str(np.amax(pred_location)),
                    '|', str(np.min(pred_location)),
                    '】TIME:', str(time.time() - starttime),
                ]))

                if running_count % report_every == 0:
                    # Window means, printed in the order: total, class, location,
                    # followed by (previous window mean - current window mean).
                    mean_all = window_loss_all / window_steps
                    mean_class = window_loss_class / window_steps
                    mean_location = window_loss_location / window_steps
                    print('---------')
                    print('avgloss')
                    print(mean_all)
                    print(mean_class)
                    print(mean_location)
                    print(prev_mean_all - mean_all)
                    print(prev_mean_class - mean_class)
                    print(prev_mean_location - mean_location)
                    prev_mean_all = mean_all
                    prev_mean_class = mean_class
                    prev_mean_location = mean_location
                    print('---------')
                    window_steps = 0
                    window_loss_all = 0
                    window_loss_class = 0
                    window_loss_location = 0

                    # Evaluated in a separate run, i.e. after this step's
                    # weight update has been applied.
                    results = sess.run(merged, feed_dict=step_feed)
                    trainwrite.add_summary(results, running_count)

                if running_count % checkpoint_every == 0:
                    saver.save(sess, './session_paramsdddaleasy/session.ckpt')
                    print('session.ckpt has been saved.')
                    gc.collect()
        finally:
            # Flush and release the event file even when training stops on an
            # exception.
            trainwrite.close()


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | DSOD的Tensorflow实现。
2 | -------------------
3 | 
使用 TensorLayer 和 scikit-image 等库实现。开发过程中使用了自己修改过的 TensorLayer 数据增广；上传前做了修改，应该可以在原版 TensorLayer 上运行，但代码未经测试。按论文的设置，应该可以在 VOC07+12 上训练；损失函数中定位（Loc）部分被赋予了较高的权重。
5 | 
源码框架参考了 [jasonli8848d](https://github.com/lslcode/SSD_for_Tensorflow) 的工作，并做了一些修改，主要集中在 Groundtruth 生成和数据增广两部分。另外，本代码的数据增广极其耗费 CPU 资源，训练效率很低，收敛也不快。有问题可以到[某乎](https://zhuanlan.zhihu.com/p/33957333)评论，或者给我发邮件：colinyoo#outlook#com，不过未必能提供帮助。
7 | 
8 | 
9 | 


--------------------------------------------------------------------------------