├── .gitignore ├── README.md ├── eval_test.py ├── keypoint_subnet ├── README.md ├── __init__.py ├── keypoint_test.py ├── keypoint_train.py ├── src │ ├── __init__.py │ ├── backbone.py │ ├── convert_tfrecord.py │ ├── get_heatmap.py │ ├── img_pre_processing.py │ ├── json_read.py │ ├── model.py │ └── reader.py └── train_log.md ├── multi_pose_net_eval.py ├── person_detect ├── README.md ├── __init__.py ├── anchor │ ├── __init__.py │ ├── anchor_generator.py │ ├── box_coder.py │ ├── box_list.py │ ├── box_list_ops.py │ └── shape_utils.py ├── person_detect_test.py ├── person_detect_train.py └── src │ ├── __init__.py │ ├── backbone.py │ ├── convert_tfrecord.py │ ├── draw_box_with_image.py │ ├── get_loss.py │ ├── loss.py │ ├── reader.py │ └── retinanet.py ├── pose_residual_network ├── README.md ├── __init__.py ├── prn_train.py └── src │ ├── PRN.py │ ├── __init__.py │ ├── convert_tfrecord.py │ └── reader.py └── utils ├── __init__.py ├── backbone.py ├── coco_convert_ai_json.json ├── coco_json_convert.py └── gaussian.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.jpg 2 | *.png 3 | *.py[cod] 4 | pre_trained/ 5 | *.ckpt 6 | yolo_v3/ 7 | 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ### NOTE: 2 | 3 | Something in eval.py of the official [PRN-pytorch implementation repo](https://github.com/salihkaragoz/pose-residual-network-pytorch) looks odd to me. When building the predicted bbox_keypoints, the code uses the ground-truth keypoints to assign them (around line 200 and line 205 of eval.py): the peaks are ground-truth keypoint coordinates, so it appears that the true coordinates are copied into the predicted bbox_keypoints. In my opinion, lines 209~220 of eval.py are the right way to obtain the real predicted bbox_keypoints. 4 | 5 | As far as I can tell, that repo has some problems and cannot produce the correct result through the 'correct way'. The author has not responded to me, and there may still be tricks in that repo that I have not found yet. 6 | 7 | ## This repository contains a TensorFlow implementation of this ECCV 2018 paper: 8 | 9 | [Muhammed Kocabas, Salih Karagoz, Emre Akbas. MultiPoseNet: Fast Multi-Person Pose Estimation using Pose Residual Network. In ECCV, 2018.](https://arxiv.org/abs/1807.04067) 10 | 11 | # It contains the three parts of this network: 12 | 13 | - **keypoint_subnet**, which uses resnet_v2_50 + FPN as the backbone network and aims to detect human pose keypoints in a single image. 14 | 15 | - **person_detect**, which uses the same backbone as keypoint_subnet with only minor differences. This part is essentially RetinaNet, described in the paper [Focal Loss](https://arxiv.org/abs/1708.02002) 16 | 17 | - **pose-residual-network**, the main contribution of this paper 18 | 19 | For detailed information please see the original [paper.](https://arxiv.org/abs/1807.04067) 20 | 21 | **Note:** we train the three sub-networks separately, just as described in the paper: we first train keypoint_subnet and then freeze the backbone parameters to train the person_detect sub-network. All training data is 22 | read through tf_record files. 
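The sketch below illustrates what "freeze the backbone" means in TensorFlow 1.x: pass the optimizer a `var_list` that excludes the backbone scopes. The scope names `resnet_v2_50` and `build_fpn_feature` are the ones used by this repo's backbone code; the tiny `person_detect` graph and the loss are placeholders for illustration only, not the real subnet.

```python
import tensorflow as tf

# Placeholder variables standing in for the real graph: one scope we want to
# freeze (the backbone) and one scope we want to train (the detection head).
with tf.variable_scope('resnet_v2_50'):
    backbone_w = tf.get_variable('w', shape=[3, 3, 3, 8])
with tf.variable_scope('person_detect'):
    head_w = tf.get_variable('w', shape=[8, 4])
total_loss = tf.reduce_sum(tf.square(head_w)) + 0.0 * tf.reduce_sum(backbone_w)

# Freezing = excluding the backbone/FPN variables from the optimizer's var_list,
# so gradient updates only touch the detection head.
frozen_scopes = ('resnet_v2_50', 'build_fpn_feature')
train_vars = [v for v in tf.trainable_variables()
              if not v.name.startswith(frozen_scopes)]
train_op = tf.train.AdamOptimizer(1e-4).minimize(total_loss, var_list=train_vars)
```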
23 | 24 | 25 | ### dataset: 26 | 27 | - pose-residual: ai_train2017.tfrecord ; coco_train2017.tfrecord 28 | - person-detect: ai-instance-bbox.tfrecord ; coco-instance-bbox.tfrecord 29 | - keypoint: ai_train2017.tfrecord & ai_train2017.json ; coco_train2017.tfrecord & coco_train2017.json 30 | 31 | coco-keypoints-annotations: 32 | 33 | [0-16]::::::[nose, left_eye, right_eye, left_ear, right_ear, left_shoulder, right_shoulder, left_elbow, 34 | right_elbow, left_wrist, right_wrist, left_hip, right_hip, left_knee, right_knee, left_ankle, right_ankle] 35 | 36 | # Thanks 37 | 38 | [mkocabas](https://github.com/mkocabas/pose-residual-network) 39 | [salihkaragoz](https://github.com/salihkaragoz/pose-residual-network-pytorch) 40 | 41 | 42 | -------------------------------------------------------------------------------- /eval_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import math 3 | import json 4 | import cv2 5 | import argparse 6 | import numpy as np 7 | from tqdm import tqdm 8 | from random import shuffle 9 | 10 | from pycocotools.coco import COCO 11 | from pycocotools.cocoeval import COCOeval 12 | from utils.gaussian import gaussian, crop, gaussian_multi_input_mp 13 | 14 | import tensorflow as tf 15 | from pose_residual_network.src.PRN import PRN 16 | from keypoint_subnet.src.get_heatmap import get_single_heatmap 17 | 18 | 19 | def eval(checkpoint = '/media/ulsee/D/PRN/20181015-0750/model.ckpt-245572', json_file = '/media/ulsee/E/datasets/coco/annotations2017/person_keypoints_val2017.json'): 20 | 21 | ckpt = checkpoint 22 | os.environ['CUDA_VISIBLE_DEVICES'] = '0' 23 | graph = tf.Graph() 24 | with graph.as_default(): 25 | inputs = tf.placeholder(tf.float32, shape=(1, 56, 36 , 17), name='inputs') 26 | prn = PRN(inputs=inputs, output_node=1*56*36*17, is_training=False) 27 | prn_out = prn.forward() 28 | init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()) 29 | saver = tf.train.Saver() 30 | 31 | with tf.Session(graph=graph) as sess: 32 | sess.run(init_op) 33 | saver.restore(sess, ckpt) 34 | print ('prn model restore successfully.') 35 | print('------------Evaulation Started------------') 36 | 37 | peak_results, bbox_results, coco = prepare(json_file) 38 | 39 | image_ids = [] 40 | my_results = [] 41 | n_kernel = 15 42 | 43 | w = int(18 * 2) 44 | h = int(28 * 2) 45 | in_thres = 0.21 46 | # tqdm, Python里用来控制显示的进度条,相当于循环 47 | for p in tqdm(peak_results): 48 | idx = p['image_id'] 49 | image_ids.append(idx) 50 | 51 | peaks = p['peaks'] 52 | # 找到当前图片所标注的所有的boxes,是一个列表的列表,[ [], [], ... ,[]],每个列表值是原始coco标注信息里的box值[x, y, w, h] 53 | bboxes = [k['bbox'] for k in bbox_results if k['image_id'] == idx] 54 | 55 | if len(bboxes) == 0 or len(peaks) == 0: 56 | continue 57 | 58 | # 构建网络的输入 59 | weights_bbox = np.zeros((len(bboxes), h, w, 4, 17)) 60 | # 对这个图片上所有的关键点信息进行处理,注意peaks是有17个元素的列表,对应coco数据集标注的17个关键点,每个元素可以有多个关键点,表示多个人的同一个部位 61 | for joint_id, peak in enumerate(peaks): 62 | # peak就是第几个channel上的所有关键点,也即是这个图片上所有的同一个类型的关键点信息,例如所有的鼻子、左肩、右肩等 63 | for instance_id, instance in enumerate(peak): 64 | # instance_id是当前channel上第几个点,instance是点,有四个值[x, y, 1, idx] 65 | p_x = instance[0] 66 | p_y = instance[1] 67 | 68 | for bbox_id, b in enumerate(bboxes): 69 | # bbox_id 表示第几个box,b是box,[xmin, ymin, w, h] 70 | # 下面的过程就和在训练pose-residual-net时生成训练数据是一样的。 71 | # 判断关键点是否在当前的box内,如果是,就根据缩放比例把weights_bbox对应的位置处表示为instance的值 72 | # ?没有很看懂为什么weights_box维度是[ len(bboxes), h, w, 4, 17],感觉完全就可以是[ len(bboxes), h, w, 17]? 
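# Note on the layout of weights_bbox (shape [len(bboxes), h, w, 4, 17]):
# channel 0 of the 4th axis is a presence flag at the peak's position (it is later
# blurred with a Gaussian and becomes the PRN input), while channels 1-3 keep the
# peak's score (set to 1 in prepare()), its global peak index and a small constant;
# those extra channels are read back further down as kp_score / kp_id / p_score,
# which is why the 4th axis has size 4.
# The test below accepts a peak for a box [x, y, w, h] if it lies inside the box
# expanded by in_thres (21% of the box width/height) on every side.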
73 | is_inside = p_x > b[0] - b[2] * in_thres and \ 74 | p_y > b[1] - b[3] * in_thres and \ 75 | p_x < b[0] + b[2] * (1.0 + in_thres) and \ 76 | p_y < b[1] + b[3] * (1.0 + in_thres) 77 | 78 | if is_inside: 79 | x_scale = float(w) / math.ceil(b[2]) 80 | y_scale = float(h) / math.ceil(b[3]) 81 | 82 | x0 = int((p_x - b[0]) * x_scale) 83 | y0 = int((p_y - b[1]) * y_scale) 84 | 85 | if x0 >= w and y0 >= h: 86 | x0 = w - 1 87 | y0 = h - 1 88 | elif x0 >= w: 89 | x0 = w - 1 90 | elif y0 >= h: 91 | y0 = h - 1 92 | elif x0 < 0 and y0 < 0: 93 | x0 = 0 94 | y0 = 0 95 | elif x0 < 0: 96 | x0 = 0 97 | elif y0 < 0: 98 | y0 = 0 99 | 100 | p = 1e-9 101 | 102 | weights_bbox[bbox_id, y0, x0, :, joint_id] = [1, instance[2], instance[3], p] 103 | 104 | old_weights_bbox = np.copy(weights_bbox) 105 | 106 | for j in range(weights_bbox.shape[0]): 107 | for t in range(17): 108 | weights_bbox[j, :, :, 0, t] = gaussian(weights_bbox[j, :, :, 0, t]) 109 | # weights_bbox[j, :, :, 0, :] = gaussian_multi_input_mp(weights_bbox[j, :, :, 0, :]) 110 | 111 | # -------------------get output of prn net--------------------# 112 | 113 | output_bbox = [] 114 | for j in range(weights_bbox.shape[0]): 115 | inp = weights_bbox[j, :, :, 0, :] # [h, w, 17] 116 | output = sess.run(prn_out, feed_dict={inputs:[inp]}) 117 | 118 | temp = np.reshape(output, (56, 36, 17)) 119 | kps = get_box_keypoints(temp) 120 | # print ('output_kps == {} '.format(kps)) 121 | output_bbox.append(temp) 122 | 123 | # output_box: [len(bboxes), 56, 36, 17] 124 | output_bbox = np.array(output_bbox) 125 | ############################################################################################################## 126 | # _img = cv2.imread('/media/ulsee/E/datasets/coco/cocoval2017/000000281929.jpg', cv2.COLOR_BGR2RGB) 127 | # kp = [339, 93, 2, 346, 88, 2, 328, 88, 2, 360, 89, 2, 318, 90, 1, 385, 135, 2, 301, 147, 2, 416, 184, 2, 128 | # 286, 204, 2, 407, 226, 2, 276, 244, 2, 358, 254, 2, 309, 259, 2, 352, 346, 2, 307, 349, 2, 348, 129 | # 448, 2, 312, 449, 2] 130 | # print (_img.shape) 131 | # heatmap = get_single_heatmap(kp, _img.shape[0], _img.shape[1], channels=17, sigma=4) 132 | # _prn_input = [] 133 | # for i in range(17): 134 | # _prn_input.append(cv2.resize(heatmap[:,:,i], (36, 56))) 135 | # # print (cv2.resize(heatmap[:,:,i], (36, 56)).shape) 136 | # _prn_input = np.reshape(np.asarray(_prn_input), (56, 36, 17)) 137 | # _prn_output = sess.run(prn_out, feed_dict={inputs:[_prn_input]}) 138 | # _prn_output_ = [] 139 | # for i in range(17): 140 | # _prn_output_.append(cv2.resize(_prn_output[0, :, :, i], (_img.shape[1], _img.shape[0]))) 141 | # _prn_output = np.reshape(np.asarray(_prn_output_), (17, _img.shape[0], _img.shape[1])) 142 | # _prn_output = np.transpose(_prn_output, (1,2,0)) 143 | # print (_prn_output.shape) 144 | # cv2.imwrite('true_channel0.jpg', np.expand_dims(heatmap[:,:,0]*255, axis=2)) 145 | # cv2.imwrite('true_heatmap.jpg', np.sum(heatmap, axis=2, keepdims=True) * 255) 146 | # cv2.imwrite('prn_channel0.jpg', np.expand_dims(_prn_output[:,:,0]*255, axis=2)) 147 | # cv2.imwrite('prn_heatmap.jpg', np.sum(_prn_output, axis=2, keepdims=True)*255) 148 | # return 149 | ############################################################################################################## 150 | 151 | keypoints_score = [] 152 | 153 | for t in range(17): 154 | indexes = np.argwhere(old_weights_bbox[:, :, :, 0, t] == 1) 155 | keypoint = [] 156 | for i in indexes: 157 | 158 | cr = crop(output_bbox[i[0], :, :, t], (i[1], i[2]), N=n_kernel) 159 | score = np.sum(cr) 160 | 
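# cr above is an n_kernel-sized window of the PRN output centred on this
# ground-truth peak location, so `score` measures how much PRN response falls
# around that peak; below it is weighted by the stored peak score, and each
# [kp_id, bbox_id, kp_score, score] row feeds the box-to-keypoint assignment
# table built in the next block.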
161 | kp_id = old_weights_bbox[i[0], i[1], i[2], 2, t] 162 | kp_score = old_weights_bbox[i[0], i[1], i[2], 1, t] 163 | p_score = old_weights_bbox[i[0], i[1], i[2], 3, t] ## ?? 164 | bbox_id = i[0] 165 | # print ('score == {}, kp_score == {}'.format(score, kp_score)) 166 | score = kp_score * score 167 | 168 | s = [kp_id, bbox_id, kp_score, score] 169 | 170 | keypoint.append(s) 171 | keypoints_score.append(keypoint) 172 | 173 | bbox_keypoints = np.zeros((weights_bbox.shape[0], 17, 3)) 174 | bbox_ids = np.arange(len(bboxes)).tolist() 175 | 176 | # kp_id, bbox_id, kp_score, my_score 177 | for i in range(17): 178 | joint_keypoints = keypoints_score[i] 179 | if len(joint_keypoints) > 0: 180 | 181 | kp_ids = list(set([x[0] for x in joint_keypoints])) 182 | 183 | table = np.zeros((len(bbox_ids), len(kp_ids), 4)) 184 | 185 | for b_id, bbox in enumerate(bbox_ids): 186 | for k_id, kp in enumerate(kp_ids): 187 | own = [x for x in joint_keypoints if x[0] == kp and x[1] == bbox] 188 | 189 | if len(own) > 0: 190 | table[bbox, k_id] = own[0] 191 | else: 192 | table[bbox, k_id] = [0] * 4 193 | 194 | for b_id, bbox in enumerate(bbox_ids): 195 | 196 | row = np.argsort(-table[bbox, :, 3]) 197 | 198 | if table[bbox, row[0], 3] > 0: 199 | for r in row: 200 | if table[bbox, r, 3] > 0: 201 | column = np.argsort(-table[:, r, 3]) 202 | 203 | if bbox == column[0]: 204 | bbox_keypoints[bbox, i, :] = [x[:3] for x in peaks[i] if x[3] == table[bbox, r, 0]][0] 205 | break 206 | else: 207 | row2 = np.argsort(table[column[0], :, 3]) 208 | if row2[0] == r: 209 | bbox_keypoints[bbox, i, :] = [x[:3] for x in peaks[i] if x[3] == table[bbox, r, 0]][0] 210 | break 211 | else: 212 | for j in range(weights_bbox.shape[0]): 213 | b = bboxes[j] 214 | x_scale = float(w) / math.ceil(b[2]) 215 | y_scale = float(h) / math.ceil(b[3]) 216 | 217 | for t in range(17): 218 | indexes = np.argwhere(old_weights_bbox[j, :, :, 0, t] == 1) 219 | if len(indexes) == 0: 220 | max_index = np.argwhere(output_bbox[j, :, :, t] == np.max(output_bbox[j, :, :, t])) 221 | bbox_keypoints[j, t, :] = [max_index[0][1] / x_scale + b[0], 222 | max_index[0][0] / y_scale + b[1], 0] 223 | 224 | my_keypoints = [] 225 | # print ('bbox_keypoints === {}'.format(bbox_keypoints)) 226 | for i in range(bbox_keypoints.shape[0]): 227 | k = np.zeros(51) 228 | k[0::3] = bbox_keypoints[i, :, 0] 229 | k[1::3] = bbox_keypoints[i, :, 1] 230 | k[2::3] = [2] * 17 231 | 232 | pose_score = 0 233 | count = 0 234 | for f in range(17): 235 | if bbox_keypoints[i, f, 0] != 0 and bbox_keypoints[i, f, 1] != 0: 236 | count += 1 237 | pose_score += bbox_keypoints[i, f, 2] 238 | # print (pose_score) 239 | pose_score /= 17.0 240 | 241 | my_keypoints.append(k) 242 | 243 | image_data = { 244 | 'image_id': idx, 245 | 'bbox': bboxes[i], 246 | 'score': pose_score, 247 | 'category_id': 1, 248 | 'keypoints': k.tolist() 249 | } 250 | my_results.append(image_data) 251 | # print ('###############################################################') 252 | # if len(my_results) > 10: 253 | # break 254 | ann_filename = 'val2017_PRN_keypoint_results_prn_.json' 255 | # write output 256 | json.dump(my_results, open(ann_filename, 'w'), indent=4) 257 | 258 | # load results in COCO evaluation tool 259 | coco_pred = coco.loadRes(ann_filename) 260 | 261 | # run COCO evaluation 262 | coco_eval = COCOeval(coco, coco_pred, 'keypoints') 263 | coco_eval.params.imgIds = image_ids 264 | coco_eval.evaluate() 265 | coco_eval.accumulate() 266 | coco_eval.summarize() 267 | 268 | # os.remove(ann_filename) 269 | def 
get_box_keypoints(prn_out): 270 | ''' 271 | 272 | :param prn_out: a heatmap, typically the PRN network output 273 | :return: 274 | keypoints: a list of lists, containing one coordinate pair per channel, e.g., [ [x1, y1], [x2, y2], ..., [x17, y17] ] 275 | ''' 276 | keypoints = [] 277 | for c in range(17): 278 | current_channel = prn_out[:, :, c] 279 | cur_max = np.max(current_channel) 280 | if cur_max == 0: 281 | coorx = 0 282 | coory = 0 283 | else: 284 | index_all = np.where(current_channel == cur_max) 285 | coorx = index_all[0][0] 286 | coory = index_all[1][0] 287 | 288 | keypoints.append([coory, coorx]) 289 | 290 | return keypoints 291 | 292 | def prepare(json_file): 293 | 294 | cocodir = json_file 295 | ann = json.load(open(cocodir)) 296 | bbox_results = ann['annotations'] 297 | 298 | coco = COCO(cocodir) 299 | img_ids = coco.getImgIds(catIds=[1]) 300 | 301 | peak_results = [] 302 | # peak_results is a list; every element is a dict with three keys: image_id, peaks and file_name. image_id and file_name are simply the image's id and file name in the COCO dataset. 303 | # peaks is a list with 17 elements; each element is itself a list of N sub-lists, and there are two cases: 304 | # 1. based on the original keypoint annotation, whenever v > 0 (i.e. the point is annotated, whether visible or not), a four-value list [x,y,v,idx] is appended as an element of this list; if several people are annotated 305 | # on the image, their keypoints are found as well and appended in the same four-value form [x,y,1,idx] 306 | # where x and y are the keypoint coordinates annotated in the COCO dataset, v is always set to 1, and idx is the running index (starting from 0) of this annotatable keypoint 307 | # 2. if the original keypoint has v == 0, the corresponding list stays empty 308 | # so the final content of peaks may look like [ [], [], [[x, y, 1, 0]], [], [[x, y, 1, 1], [x,y,1,2], [x,y,1,3]], [[x, y, 1, 4]], [], ..., [[x, y, 1, idx]] ] 309 | for i in img_ids: 310 | anns = coco.loadAnns(coco.getAnnIds(imgIds=i)) 311 | # kps holds the keypoints of every person on the image (there may be several people), i.e. a list of lists: [ [keypoints1], [keypoints2] ] 312 | kps = [a['keypoints'] for a in anns] 313 | 314 | idx = 0 315 | 316 | ks = [] 317 | for i in range(17): 318 | t = [] 319 | for k in kps: 320 | x = k[0::3][i] 321 | y = k[1::3][i] 322 | v = k[2::3][i] 323 | 324 | if v > 0: 325 | t.append([x, y, 1, idx]) 326 | idx += 1 327 | ks.append(t) 328 | image_id = anns[0]['image_id'] 329 | peaks = ks 330 | 331 | element = { 332 | 'image_id': image_id, 333 | 'peaks': peaks, 334 | 'file_name': coco.loadImgs(image_id)[0]['file_name'] 335 | } 336 | 337 | peak_results.append(element) 338 | 339 | shuffle(peak_results) 340 | 341 | # temporary_peak_res: drop the entries whose peaks are all empty, i.e. images without a single annotated keypoint, keeping only images with at least test_keypoint_count non-empty channels 342 | temporary_peak_res = [] 343 | for p in peak_results: 344 | if (sum(1 for i in p['peaks'] if i != []) >= 0): 345 | temporary_peak_res.append(p) 346 | peak_results = temporary_peak_res 347 | 348 | return peak_results, bbox_results, coco 349 | 350 | 351 | eval() -------------------------------------------------------------------------------- /keypoint_subnet/README.md: -------------------------------------------------------------------------------- 1 | Human keypoint detection network. The tfrecord it uses is generated directly from the image folder and only keeps each image's name, height and width. Only these attributes are kept because 2 | writing the gt_heatmap directly into the tfrecord would make the resulting tfrecord file far too large. For that reason the gt_heatmap is not stored; 3 | instead, during training a pre-built json file is read in, whose elements are dicts mapping an image name (key) to its keypoint values (value), and 4 | the keypoints are then taken from this json file to generate the heatmaps on the fly. -------------------------------------------------------------------------------- /keypoint_subnet/__init__.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: __init__.py 7 | @time: 18-9-29 下午6:56 8 | ''' -------------------------------------------------------------------------------- /keypoint_subnet/keypoint_test.py: -------------------------------------------------------------------------------- 1 | # 
encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: keypoint_test.py 7 | @time: 18-10-8 上午9:50 8 | ''' 9 | import tensorflow as tf 10 | from datetime import datetime 11 | import os, cv2, json 12 | import logging 13 | import numpy as np 14 | import math 15 | 16 | import sys 17 | 18 | from src.backbone import BackBone 19 | from src.model import Keypoint_Subnet 20 | from src.get_heatmap import get_heatmap 21 | from src.reader import Keypoint_Reader 22 | from src.json_read import load_json, load_coco_json 23 | from src.img_pre_processing import image_vertical_flipping 24 | 25 | FLAGS = tf.flags.FLAGS 26 | 27 | tf.flags.DEFINE_string('model', '/media/ulsee/D/keypoint_subnet/20181023-2043/model_alter.ckpt-239999', 28 | 'model path you want to test, e.g., (/media/ulsee/D/multi-pose-net/20180829-1927/model.ckpt-xxxxx)') 29 | tf.flags.DEFINE_string('img_path', '/media/ulsee/E/datasets/coco/cocotrain2017', 30 | 'image path to test model.') 31 | tf.flags.DEFINE_string('save_path', '/media/ulsee/E/keypoint/coco/train2017', 'path to save image test result') 32 | tf.flags.DEFINE_boolean('is_training', True, '') 33 | tf.flags.DEFINE_integer(name='batch_size', default=1, help='train batch size number') 34 | tf.flags.DEFINE_integer(name='img_size', default=480, help='net input size') 35 | tf.flags.DEFINE_integer(name='num_keypoints', default=17, help='number of keypoints to detect') 36 | 37 | 38 | 39 | def is_image(img_name): 40 | img_name = img_name.lower() 41 | if img_name.endswith('.jpg') or img_name.endswith('.png') or img_name.endswith('jpeg'): 42 | return True 43 | return False 44 | 45 | 46 | 47 | def deal_with_heatmaps(img, heatmap, factorx, factory, num_keypoints, score_threshold, nms_threshold=5, type=1): 48 | ''' 49 | 50 | :param img: 51 | :param heatmap: 52 | :param num_keypoints: 53 | :param type: 1 for single person and other for multi-person 54 | :return: 55 | ''' 56 | if type == 1: 57 | for c in range(num_keypoints): 58 | current_heatmap = heatmap[0, :, :, c] 59 | 60 | cur_max = np.max(current_heatmap) 61 | # print (cur_max) 62 | if cur_max < score_threshold: 63 | continue 64 | index_all = np.where(current_heatmap == cur_max) 65 | coorx = index_all[0][0] 66 | coory = index_all[1][0] 67 | 68 | coorx = int(coorx * factorx) 69 | coory = int(coory * factory) 70 | 71 | cv2.circle(img, (coory, coorx), 5, (0, 0, 255), -1) 72 | cv2.putText(img, str(c), (coory, coorx), cv2.FONT_HERSHEY_COMPLEX, 1, (255, 0, 0), 1) 73 | else: 74 | threshold = score_threshold 75 | nms_threshold = nms_threshold 76 | cur_max = 0 77 | count = 0 78 | for c in range(num_keypoints): 79 | current_heatmap = heatmap[0, :, :, c] 80 | x, y = np.where(current_heatmap > threshold) 81 | coordinate = list(zip(x, y)) 82 | # print(coordinate) 83 | s = [] 84 | for coor in coordinate: 85 | # print(coor) 86 | # print(current_heatmap[coor]) 87 | s.append(current_heatmap[coor]) 88 | s = np.asarray(s) 89 | # print(s) 90 | s_index = s.argsort()[::-1] # 降序,第一个位置的索引值最大 91 | # print(s_index) 92 | # nms 93 | keep = [] 94 | 95 | while s_index.size > 0: 96 | keep.append(s_index[0]) 97 | s_index = s_index[1:] 98 | last = [] 99 | for index in s_index: 100 | # print(keep[-1], index) 101 | distance = np.sqrt(np.sum(np.square( 102 | np.asarray(coordinate[keep[-1]]) - np.asarray(coordinate[index]) 103 | ))) 104 | if distance > nms_threshold: 105 | last.append(index) 106 | 107 | s_index = np.asarray(last) 108 | 109 | for index in keep: 110 | coor = coordinate[index] 111 | coorx = coor[0] 112 
| coory = coor[1] 113 | 114 | coorx = int(coorx * factorx) 115 | coory = int(coory * factory) 116 | 117 | cv2.circle(img, (coory, coorx), 5, (0, 0, 255), -1) 118 | cv2.putText(img, str(c), (coory, coorx), cv2.FONT_HERSHEY_COMPLEX, 1, (255, 0, 0), 1) 119 | count += 1 120 | cur_max += s[index] 121 | 122 | cur_max = cur_max / (count if count > 0 else 1) 123 | 124 | return img, cur_max 125 | 126 | def _test(score_threshold, nms_threshold): 127 | global save_json 128 | if not os.path.exists(FLAGS.save_path): 129 | os.makedirs(FLAGS.save_path) 130 | 131 | graph = tf.Graph() 132 | with graph.as_default(): 133 | # ------------------------get backbone net--------------------------------# 134 | backbone = BackBone(img_size=FLAGS.img_size, batch_size=FLAGS.batch_size, is_training=FLAGS.is_training) 135 | fpn, _ = backbone.build_fpn_feature() 136 | # ---------------------------keypoint net---------------------------------# 137 | keypoint_net = Keypoint_Subnet(inputs=backbone.input_imgs, img_size=backbone.img_size, fpn=fpn, 138 | batch_size=backbone.batch_size, num_classes=FLAGS.num_keypoints) 139 | pre_heat, _ = keypoint_net.forward() 140 | 141 | g_list = tf.global_variables() 142 | 143 | bn_moving_mean = [g for g in g_list if 'moving_mean' in g.name] 144 | bn_moving_vars = [g for g in g_list if 'moving_variance' in g.name] 145 | 146 | var_list = tf.trainable_variables() 147 | var_list += bn_moving_vars + bn_moving_mean 148 | # for var in var_list: 149 | # print (var) 150 | 151 | init_op = tf.group(tf.global_variables_initializer()) 152 | 153 | saver = tf.train.Saver() 154 | 155 | with tf.Session(graph=graph) as sess: 156 | sess.run(init_op) 157 | saver.restore(sess, FLAGS.model) 158 | print('model restore successfully.') 159 | 160 | img_num = 0 161 | test_img_id = ['000000135361','000000265513','000000496607','000000270836'] 162 | 163 | avg = 0 164 | 165 | for img in os.listdir(FLAGS.img_path): 166 | # if not is_image(img): 167 | # continue 168 | # if img.split('.')[0] not in test_img_id: 169 | # continue 170 | img_num += 1 171 | img_ori = cv2.imread(os.path.join(FLAGS.img_path, img), cv2.IMREAD_COLOR) 172 | 173 | # img_ori = cv2.flip(img_ori, 1) 174 | 175 | img_copy = img_ori.copy() 176 | 177 | # img_input = img_copy 178 | img_input = cv2.resize(img_copy, (FLAGS.img_size, FLAGS.img_size), interpolation=cv2.INTER_NEAREST) 179 | heatmaps = sess.run(pre_heat, 180 | feed_dict={backbone.input_imgs:[img_input]}) 181 | 182 | factorx = img_ori.shape[0] / heatmaps.shape[1] 183 | facotry = img_ori.shape[1] / heatmaps.shape[2] 184 | img_save, cur_max = deal_with_heatmaps(img_ori, heatmaps, factorx, facotry, FLAGS.num_keypoints, 185 | score_threshold=score_threshold, nms_threshold=nms_threshold, type=2) 186 | avg += cur_max 187 | cv2.imwrite(os.path.join(FLAGS.save_path, img), img_save) 188 | # for mean in bn_moving_vars: 189 | # print(sess.run(mean)) 190 | # break 191 | 192 | if img_num == 400: 193 | break 194 | print('tested {}'.format(img_num)) 195 | 196 | print('avg max === {}'.format(avg/img_num)) 197 | 198 | if __name__ == '__main__': 199 | _test(score_threshold=0.05, nms_threshold=5) -------------------------------------------------------------------------------- /keypoint_subnet/keypoint_train.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: keypoint_train.py 7 | @time: 18-9-28 下午12:13 8 | ''' 9 | 10 | import tensorflow as tf 11 | from 
tensorflow.python.framework import graph_util 12 | from tensorflow.python.platform import gfile 13 | from datetime import datetime 14 | import os, time, cv2 15 | import numpy as np 16 | 17 | from src.backbone import BackBone 18 | from src.model import Keypoint_Subnet 19 | from src.get_heatmap import get_heatmap 20 | from src.reader import Keypoint_Reader 21 | from src.json_read import load_json, load_coco_json 22 | from src.img_pre_processing import img_pre_processing 23 | 24 | 25 | 26 | FLAGS = tf.flags.FLAGS 27 | tf.flags.DEFINE_integer('train_nums', 118280, 'train data nums, default: cocotrain2017--118280') 28 | tf.flags.DEFINE_integer('epochs', 8, 'train epochs') 29 | tf.flags.DEFINE_integer('batch_size', 4, 'train batch size number') 30 | tf.flags.DEFINE_integer('img_size', 480, 'net input size') 31 | tf.flags.DEFINE_float('learning_rate', 1e-4, 'trian lr') 32 | tf.flags.DEFINE_float('decay_rate', 0.9, 'lr decay rate') 33 | tf.flags.DEFINE_integer('decay_steps', 10000, 'lr decay steps') 34 | tf.flags.DEFINE_integer('max_to_keep', 10, 'num of models to saved') 35 | tf.flags.DEFINE_integer('num_keypoints', 17, 'number of keypoints to detect') 36 | tf.flags.DEFINE_string('pretrained_resnet', 'pre_trained/resnet_v2_50.ckpt', 37 | 'resnet_v2_50 pretrained model') 38 | tf.flags.DEFINE_boolean('is_training', True, '') 39 | tf.flags.DEFINE_string('checkpoint_path', '/media/ulsee/D/keypoint_subnet', 'path to save training model') 40 | tf.flags.DEFINE_string('tfrecord_file', '/media/ulsee/E/keypoint_subnet_tfrecord/coco_train2017.tfrecord', '') 41 | tf.flags.DEFINE_string('json_file', '/media/ulsee/E/keypoint_subnet_tfrecord/coco_train2017.json', 42 | '') 43 | tf.flags.DEFINE_string('finetuning', '20181023-2043/model_alter.ckpt-239999', 44 | 'folder of saved model that you wish to continue training or testing(e.g. 20180828-1803/model.ckpt-xxx), default:None') 45 | tf.flags.DEFINE_boolean('change_dataset', False, 46 | 'if change dataset from ai_challenger to coco, the num_keypoints will be changed. 
If so, when we finetunnig, need to ' 47 | 'specify do not restore the last output layer var.') 48 | 49 | def keypoint_train(): 50 | os.environ['CUDA_VISIBLE_DEVICES'] = '0' 51 | 52 | # -------------------define where checkpoint path is-------------------------# 53 | current_time = datetime.now().strftime('%Y%m%d-%H%M') 54 | if FLAGS.finetuning is None: 55 | checkpoints_dir = os.path.join(FLAGS.checkpoint_path, current_time) 56 | if not os.path.exists(checkpoints_dir): 57 | try: 58 | os.makedirs(checkpoints_dir) 59 | except: 60 | pass 61 | else: 62 | checkpoints_dir = os.path.join(FLAGS.checkpoint_path, FLAGS.finetuning) 63 | print('checkpoints_dir == {}'.format(checkpoints_dir)) 64 | #-----------------------------load json--------------------------------------# 65 | imgid_keypoints_dict = load_json(FLAGS.json_file) 66 | # ------------------------------define Graph --------------------------------# 67 | # tf.reset_default_graph() 68 | graph = tf.Graph() 69 | with graph.as_default(): 70 | #------------------------get backbone net--------------------------------# 71 | backbone = BackBone(img_size=FLAGS.img_size, batch_size=FLAGS.batch_size, is_training=FLAGS.is_training) 72 | fpn, _ = backbone.build_fpn_feature() 73 | #---------------------------keypoint net---------------------------------# 74 | keypoint_net = Keypoint_Subnet(inputs=backbone.input_imgs, img_size=backbone.img_size, fpn=fpn, 75 | batch_size=backbone.batch_size, num_classes=FLAGS.num_keypoints) 76 | total_loss, net_loss, pre_heat = keypoint_net.net_loss() 77 | #-------------------------------reader-----------------------------------# 78 | reader = Keypoint_Reader(tfrecord_file=FLAGS.tfrecord_file, batch_size=FLAGS.batch_size, img_size=FLAGS.img_size, epochs=FLAGS.epochs) 79 | img_batch, img_id_batch, img_height_batch, img_width_batch = reader.feed() 80 | #-----------------------------learning rate------------------------------# 81 | global_step = tf.Variable(0) 82 | learning_rate = tf.train.exponential_decay(FLAGS.learning_rate, global_step=global_step, 83 | decay_steps=int(FLAGS.train_nums / FLAGS.batch_size), 84 | decay_rate=FLAGS.decay_rate, 85 | staircase=True) 86 | opt = tf.train.AdamOptimizer(learning_rate, epsilon=1e-5) 87 | # grads = opt.compute_gradients(total_loss) 88 | # apply_gradient_op = opt.apply_gradients(grads, global_step=global_step) 89 | 90 | # MOVING_AVERAGE_DECAY = 0.99 91 | # variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step) 92 | # variable_to_average = (tf.trainable_variables() + tf.moving_average_variables()) 93 | # variables_averages_op = variable_averages.apply(variable_to_average) 94 | 95 | update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) 96 | with tf.control_dependencies(update_ops): 97 | train_op = opt.minimize(total_loss, global_step=global_step) 98 | 99 | #--------------------------------saver-----------------------------------# 100 | res50_var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='resnet_v2_50') 101 | restore_res50 = tf.train.Saver(var_list=res50_var_list) 102 | 103 | fpn_var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='build_fpn_feature') 104 | keypoint_subnet_var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='keypoint_subnet') 105 | output_name = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='keypoint_subnet.output') 106 | 107 | var_list = tf.trainable_variables() 108 | global_list = tf.global_variables() 109 | bn_moving_vars = [g for g in global_list if 'moving_mean' 
in g.name] 110 | bn_moving_vars += [g for g in global_list if 'moving_variance' in g.name] 111 | var_list += bn_moving_vars 112 | 113 | if FLAGS.change_dataset: 114 | for node in output_name: 115 | var_list.remove(node) 116 | 117 | if FLAGS.finetuning is not None: 118 | restore_finetuning = tf.train.Saver(var_list=var_list) 119 | 120 | saver = tf.train.Saver(var_list=var_list, max_to_keep=20) 121 | saver_alter = tf.train.Saver(max_to_keep=5) 122 | 123 | #---------------------------------control sigma for heatmap-------------------------------# 124 | start_gussian_sigma = 10.0 125 | end_gussian_sigma = 2.5 126 | start_decay_sigma_step = 10000 127 | decay_steps = 50000 128 | # gussian sigma will decay when global_step > start_decay_sigma_step 129 | gussian_sigma = tf.where( 130 | tf.greater(global_step, start_decay_sigma_step), 131 | tf.train.polynomial_decay(start_gussian_sigma, 132 | tf.cast(global_step, tf.int32) - start_decay_sigma_step, 133 | decay_steps, 134 | end_gussian_sigma, 135 | power=1.0), 136 | start_gussian_sigma 137 | ) 138 | # --------------------------------init------------------------------------# 139 | init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()) 140 | config = tf.ConfigProto() 141 | config.gpu_options.allow_growth = True 142 | 143 | #--------------------------------tf summary--------------------------------# 144 | img_id_batch_placeholder = tf.placeholder(tf.string, shape=[FLAGS.batch_size,]) 145 | tf.summary.text('img_ids', img_id_batch_placeholder) 146 | tf.summary.scalar('total_loss', total_loss) 147 | tf.summary.scalar('net_loss', net_loss) 148 | tf.summary.image('gt_right_ankle', tf.reshape(tf.transpose( 149 | keypoint_net.input_heats, [3, 0, 1, 2])[16], shape=[-1, FLAGS.img_size // 4, FLAGS.img_size // 4, 1]), max_outputs=2) 150 | tf.summary.image('ori_image', backbone.input_imgs, max_outputs=2) 151 | # tf.summary.image('gt_left_shoulder', tf.reshape(tf.transpose( 152 | # keypoint_net.input_heats, [3, 0, 1, 2])[5], shape=[-1, FLAGS.img_size // 4, FLAGS.img_size // 4, 1]),max_outputs=2) 153 | tf.summary.image('pred_right_ankle', tf.reshape(tf.transpose( 154 | pre_heat, [3, 0, 1, 2])[16], shape=[-1, FLAGS.img_size // 4, FLAGS.img_size // 4, 1]), max_outputs=2) 155 | tf.summary.image('gt_heatmap', tf.reduce_sum(keypoint_net.input_heats, axis=3, keepdims=True), max_outputs=2) 156 | tf.summary.image('pred_heatmap', tf.reduce_sum(pre_heat, axis=3, keepdims=True), max_outputs=2) 157 | tf.summary.scalar('lr', learning_rate) 158 | summary_op = tf.summary.merge_all() 159 | summary_writer = tf.summary.FileWriter(checkpoints_dir, graph) 160 | # --------------------------------train------------------------------------# 161 | with tf.Session(graph=graph, config=config) as sess: 162 | sess.run(init_op) 163 | coord = tf.train.Coordinator() 164 | threads = tf.train.start_queue_runners(sess=sess, coord=coord) 165 | step = 0 166 | 167 | if FLAGS.finetuning is not None: 168 | restore_finetuning.restore(sess, checkpoints_dir) 169 | print ('Successfully load pre_trained keypoint_subnet model.') 170 | # step = int(checkpoints_dir.split('/')[-1].split('.')[-1].split('-')[-1]) 171 | print ('Global_step == {}, Step == {}'.format(sess.run(global_step), step)) 172 | step = sess.run(global_step) 173 | # -- bn layer: resnet_v2_50/block1/unit_1/bottleneck_v2/conv1/BatchNorm/ ---# 174 | # gamma = graph.get_tensor_by_name(name='resnet_v2_50/block4/unit_3/bottleneck_v2/conv2/BatchNorm/gamma:0') 175 | # beta = 
graph.get_tensor_by_name(name='resnet_v2_50/block4/unit_3/bottleneck_v2/conv2/BatchNorm/beta:0') 176 | # print('finetuning gamma = ', sess.run(gamma)[:50]) 177 | # print('beta = ', sess.run(beta)[:50]) 178 | 179 | else: 180 | restore_res50.restore(sess, FLAGS.pretrained_resnet) 181 | print ('Successfully load pre_trained resnet_v2_50 model') 182 | # -- bn layer: resnet_v2_50/block1/unit_1/bottleneck_v2/conv1/BatchNorm/ ---# 183 | # gamma = graph.get_tensor_by_name( 184 | # name='resnet_v2_50/block1/unit_1/bottleneck_v2/conv1/BatchNorm/gamma:0') 185 | # beta = graph.get_tensor_by_name(name='resnet_v2_50/block1/unit_1/bottleneck_v2/conv1/BatchNorm/beta:0') 186 | # print('no finetuning gamma = ', sess.run(gamma)[:50]) 187 | # print('beta = ', sess.run(beta)[:50]) 188 | 189 | start_time = time.time() 190 | try: 191 | while not coord.should_stop(): 192 | imgs, imgs_id, imgs_height, imgs_width, g_sigma = sess.run([img_batch, img_id_batch, img_height_batch, img_width_batch, gussian_sigma]) 193 | 194 | gt_heatmaps = get_heatmap(label_dict=imgid_keypoints_dict, img_ids=imgs_id, img_heights=imgs_height, 195 | img_widths=imgs_width, img_resize=FLAGS.img_size, num_keypoints=FLAGS.num_keypoints, 196 | sigma=g_sigma) 197 | 198 | # imgs, gt_heatmaps = img_pre_processing(imgs, gt_heatmaps) 199 | 200 | _, loss_all, net_out_loss, pre_heats, lr, merge_op = sess.run( 201 | [train_op, total_loss, net_loss, pre_heat, learning_rate, summary_op], 202 | feed_dict={backbone.input_imgs:imgs, 203 | keypoint_net.input_heats:gt_heatmaps, 204 | img_id_batch_placeholder:imgs_id} 205 | ) 206 | if step % 100 == 0: 207 | summary_writer.add_summary(merge_op, step) 208 | summary_writer.flush() 209 | 210 | if (step + 1) % 10 == 0: 211 | cur_time = time.time() 212 | print ('-------------------Step %d:-------------------' % step) 213 | print ('total_loss = {}, out_put_loss = {}, lr = {}, sigma = {}, time spend = {}' 214 | .format(loss_all, net_out_loss, lr, g_sigma, cur_time-start_time)) 215 | start_time = cur_time 216 | 217 | # # -- bn layer: resnet_v2_50/block1/unit_1/bottleneck_v2/conv1/BatchNorm/ ---# 218 | # gamma = graph.get_tensor_by_name( 219 | # name='resnet_v2_50/block1/unit_1/bottleneck_v2/conv1/BatchNorm/gamma:0') 220 | # beta = graph.get_tensor_by_name( 221 | # name='resnet_v2_50/block1/unit_1/bottleneck_v2/conv1/BatchNorm/beta:0') 222 | # print('no finetuning gamma = ', sess.run(gamma)[:50]) 223 | # print('beta = ', sess.run(beta)[:50]) 224 | # print (sess.run(bn_moving_vars[0])) 225 | # input_graph_def = tf.get_default_graph().as_graph_def() 226 | # output_graph_def = graph_util.convert_variables_to_constants(sess, input_graph_def, 227 | # 'keypoint_subnet/output/biases'.split(',')) 228 | # model_f = tf.gfile.FastGFile('model.pb', 'wb') 229 | # model_f.write(output_graph_def.SerializeToString()) 230 | # break 231 | if (step + 1) % 5000 == 0: 232 | save_path = saver.save(sess, checkpoints_dir + '/model.ckpt', global_step=step) 233 | print ('Model saved in file: {}'.format(save_path)) 234 | save_path_alter = saver_alter.save(sess, checkpoints_dir+'/model_alter.ckpt', global_step=step) 235 | 236 | step += 1 237 | 238 | 239 | except KeyboardInterrupt: 240 | print ('Interrupted, current step == {}'.format(step)) 241 | coord.request_stop() 242 | 243 | except Exception as e: 244 | coord.request_stop(e) 245 | 246 | finally: 247 | save_path = saver.save(sess, checkpoints_dir + "/model.ckpt", global_step=step) 248 | print ("Model saved in file: {}" .format(save_path)) 249 | save_path_alter = saver_alter.save(sess, 
checkpoints_dir + '/model_alter.ckpt', global_step=step) 250 | print ('Current step = {}'.format(step)) 251 | # When done, ask the threads to stop. 252 | coord.request_stop() 253 | coord.join(threads) 254 | 255 | 256 | if __name__ == '__main__': 257 | keypoint_train() 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | -------------------------------------------------------------------------------- /keypoint_subnet/src/__init__.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: __init__.py 7 | @time: 18-9-28 上午11:23 8 | ''' -------------------------------------------------------------------------------- /keypoint_subnet/src/backbone.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: backbone.py 7 | @time: 18-9-28 上午11:03 8 | ''' 9 | 10 | from __future__ import absolute_import, division, print_function 11 | 12 | import tensorflow as tf 13 | 14 | from tensorflow.contrib.slim import nets 15 | from tensorflow.contrib.layers.python.layers import utils 16 | import tensorflow.contrib.slim as slim 17 | 18 | class BackBone(object): 19 | def __init__(self, img_size, batch_size, is_training=True): 20 | self.img_size = img_size 21 | self.batch_size = batch_size 22 | self.input_imgs = tf.placeholder(tf.float32, [self.batch_size, self.img_size, self.img_size, 3]) 23 | self.is_training = is_training 24 | self.stddev = 0.01 25 | 26 | def get_feature_map(self): 27 | #-------------------resent---------------------# 28 | arg_scope = nets.resnet_v2.resnet_arg_scope() 29 | with slim. 
arg_scope(arg_scope): 30 | out, end_points = nets.resnet_v2.resnet_v2_50(inputs=self.input_imgs, num_classes=None, is_training=self.is_training) 31 | #---------------feature map dict---------------# 32 | feature_map_dict = { 33 | 'C2': end_points['resnet_v2_50/block1/unit_2/bottleneck_v2'], # input_size / 4 34 | 'C3': end_points['resnet_v2_50/block2/unit_3/bottleneck_v2'], # input_size / 8 35 | 'C4': end_points['resnet_v2_50/block3/unit_4/bottleneck_v2'], # input_size / 16 36 | 'C5': end_points['resnet_v2_50/block4'] # input_size / 32 37 | } 38 | return feature_map_dict 39 | 40 | def build_fpn_feature(self): 41 | feature_pyramid = {} 42 | feature_map_dict = self.get_feature_map() 43 | #------------------------------------------build fpn-------------------------------------------# 44 | with tf.variable_scope('build_fpn_feature'): 45 | with slim.arg_scope([slim.conv2d], weights_initializer=tf.random_normal_initializer(stddev=self.stddev)): 46 | feature_pyramid['P5'] = slim.conv2d(feature_map_dict['C5'], num_outputs=256, kernel_size=[1, 1], stride=1, 47 | scope='build_fpn_P5') 48 | 49 | #------------------ top-down pathway and lateral connections--------------------------# 50 | for layer in range(4, 1, -1): 51 | p = feature_pyramid['P' + str(layer + 1)] 52 | c = feature_map_dict['C' + str(layer)] 53 | 54 | #---------------------------------- upsample p -----------------------------------# 55 | up_shape = c.get_shape() 56 | up_sample = tf.image.resize_nearest_neighbor(p, [up_shape[2], up_shape[2]], 57 | name='upsampling_fpn_P%d' % layer) 58 | 59 | #----------------------------------- 1x1 conv ------------------------------------# 60 | c = slim.conv2d(c, num_outputs=256, kernel_size=[1, 1], stride=1, scope='fpn_1x1conv_C%d' % layer) 61 | p = up_sample + c 62 | 63 | #----------------------reduce aliasing effect of upsampling ----------------------# 64 | #---------------(in the third last paragraph, Section 3, Paper FPN)---------------# 65 | p = slim.conv2d(p, num_outputs=256, kernel_size=[3, 3], stride=1, padding='SAME', 66 | scope='build_fpn_P%d' % layer) 67 | 68 | feature_pyramid['P' + str(layer)] = p 69 | 70 | return feature_pyramid, feature_map_dict 71 | 72 | -------------------------------------------------------------------------------- /keypoint_subnet/src/convert_tfrecord.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: convert_tfrecord.py.py 7 | @time: 18-9-28 下午6:50 8 | ''' 9 | 10 | import os, cv2 11 | import numpy as np 12 | import tensorflow as tf 13 | 14 | FLAGS = tf.flags.FLAGS 15 | 16 | tf.flags.DEFINE_string(name='image_dir', default='/media/ulsee/E/datasets/test2', 17 | help='image directory for building tfrecord') 18 | 19 | tf.flags.DEFINE_string(name='tfrecord_file', default='/media/ulsee/E/keypoint_subnet_tfrecord/coco_train2017-test.tfrecord', 20 | help='output path you want to save tfrecord data file') 21 | 22 | tf.flags.DEFINE_integer(name='img_num', default=21, 23 | help='define how many images to build tfrecord data, zero menas all') 24 | 25 | 26 | def img_reader(image_dir): 27 | ''' 28 | read imgs in image_dir and return some lists 29 | :param image_dir: string, path of input image dir, e.g., /path/to/imgdir/ 30 | :return: 31 | img_paths: img path for every single img 32 | img_ids: img name without suffix for every single img 33 | img_heights: img height for every single img 34 | img_widths: img width for every 
single img 35 | ''' 36 | 37 | img_paths = [] 38 | img_ids = [] 39 | img_heights = [] 40 | img_widths = [] 41 | 42 | img_count = 0 43 | file_suffix = ['jpg', 'png'] 44 | 45 | for img_file in os.scandir(image_dir): 46 | if FLAGS.img_num != 0 and img_count == FLAGS.img_num: 47 | break 48 | 49 | suffix = img_file.name[-3:].lower() 50 | 51 | if suffix in file_suffix and img_file.is_file() : 52 | 53 | img = cv2.imread(img_file.path, cv2.IMREAD_COLOR) 54 | height, width, channels = img.shape 55 | 56 | img_ids.append(img_file.name[:-4]) 57 | img_paths.append(img_file.path) 58 | img_heights.append(height) 59 | img_widths.append(width) 60 | 61 | img_count += 1 62 | print ('------------------{}-----------------'.format(img_count)) 63 | 64 | 65 | return img_paths, img_ids, img_heights, img_widths 66 | 67 | def _int64_feature(value): 68 | return tf.train.Feature(int64_list=tf.train.Int64List(value=[value])) 69 | 70 | def _bytes_feature(value): 71 | return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) 72 | 73 | def _strs_feature(value): 74 | return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) 75 | 76 | def tfrecord_writer(img_dir, output_file): 77 | ''' 78 | conver img in img_dir into tfrecord, saved as output_file 79 | :param img_dir: img directory 80 | :param output_file: tfrecord name with path to save 81 | :return: 82 | ''' 83 | # print (1) 84 | img_paths, img_ids, img_heights, img_widths = img_reader(image_dir=img_dir) 85 | # print (2) 86 | output_dir = os.path.dirname(output_file) 87 | try: 88 | os.makedirs(output_dir) 89 | except os.error: 90 | pass 91 | 92 | img_nums = len(img_paths) 93 | 94 | writer = tf.python_io.TFRecordWriter(output_file) 95 | print('start writing tfrecord....') 96 | 97 | for i in range(img_nums): 98 | img_path = img_paths[i] 99 | img_id = bytes(img_ids[i], encoding='utf-8') 100 | img_height = img_heights[i] 101 | img_width = img_widths[i] 102 | 103 | with tf.gfile.FastGFile(img_path, 'rb') as f: 104 | img = f.read() 105 | 106 | example = tf.train.Example(features=tf.train.Features( 107 | feature={ 108 | 'image': _bytes_feature(img), 109 | 'id': _strs_feature(img_id), 110 | 'height': _int64_feature(img_height), 111 | 'width': _int64_feature(img_width) 112 | })) 113 | writer.write(example.SerializeToString()) 114 | 115 | if (i + 1) % 1000 == 0: 116 | print('processing....{}/{}'.format(i+1, img_nums)) 117 | print ('tfrecord write done.') 118 | writer.close() 119 | 120 | def main(argv): 121 | tfrecord_writer(FLAGS.image_dir, FLAGS.tfrecord_file) 122 | 123 | if __name__ == '__main__': 124 | tf.app.run() -------------------------------------------------------------------------------- /keypoint_subnet/src/get_heatmap.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: get_heatmap.py 7 | @time: 18-8-28 下午4:50 8 | ''' 9 | 10 | import numpy as np 11 | import math, cv2 12 | from skimage.filters import gaussian 13 | 14 | def get_heatmap(label_dict, img_ids, img_heights, img_widths, img_resize, num_keypoints, sigma = 6.0): 15 | batch = img_ids.shape[0] 16 | heatmaps = np.zeros([batch, img_resize//4, img_resize//4, num_keypoints], np.float32) 17 | 18 | for b in range(batch): 19 | height = img_heights[b] 20 | width = img_widths[b] 21 | keypoints = label_dict[img_ids[b].decode('utf-8')] 22 | 23 | single_heatmap = get_single_heatmap(keypoints, height, width, num_keypoints, sigma) 24 | single_heatmap = 
cv2.resize(single_heatmap, (img_resize//4, img_resize//4)) 25 | 26 | heatmaps[b,:,:,:] = single_heatmap 27 | 28 | return heatmaps 29 | 30 | def get_single_heatmap(keypoints, height, width, channels, sigma = 6.0): 31 | heatmap = np.zeros([channels, height, width], np.float32) 32 | keypoints = list(keypoints) 33 | keypoints = np.asarray(keypoints) 34 | keypoints = np.reshape(keypoints, (len(keypoints)//channels//3, channels*3)) 35 | 36 | for people in keypoints: 37 | for i in range (channels): 38 | keypoint_x = people[i*3] 39 | keypoint_y = people[i*3+1] 40 | keypoint_v = people[i*3+2] 41 | 42 | if keypoint_x == 0 and keypoint_y == 0: 43 | continue 44 | if keypoint_v == 3: 45 | continue 46 | 47 | heatmap = put_keypoint_on_heatmap(keypoint_x, keypoint_y, i, heatmap, sigma) 48 | # heatmap[i, keypoint_y, keypoint_x] = 1 49 | 50 | # heatmap = gaussian(heatmap.transpose((1, 2, 0)), sigma=sigma, mode='constant', multichannel=True) 51 | return heatmap.transpose((1, 2, 0)) 52 | 53 | def put_keypoint_on_heatmap(center_x, center_y, channel, heatmap, sigma = 6.0): 54 | th = 1.6052 55 | delta = math.sqrt(th * 2) 56 | 57 | height = heatmap.shape[1] 58 | width = heatmap.shape[2] 59 | 60 | x0 = int(max(0, center_x - delta * sigma)) 61 | y0 = int(max(0, center_y - delta * sigma)) 62 | 63 | x1 = int(min(width, center_x + delta * sigma)) 64 | y1 = int(min(height, center_y + delta * sigma)) 65 | 66 | for y in range(y0, y1): 67 | for x in range(x0, x1): 68 | d = (x - center_x) ** 2 + (y - center_y) ** 2 69 | exp = d / 2.0 / sigma / sigma 70 | 71 | if exp > th: 72 | continue 73 | heatmap[channel][y][x] = max(heatmap[channel][y][x], math.exp(-exp)) 74 | heatmap[channel][y][x] = min(heatmap[channel][y][x], 1.0) 75 | 76 | return heatmap -------------------------------------------------------------------------------- /keypoint_subnet/src/img_pre_processing.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: image_preprocessing.py 7 | @time: 18-8-30 上午11:29 8 | ''' 9 | 10 | import numpy as np 11 | import os, cv2, random 12 | import matplotlib.pyplot as plt 13 | 14 | import sys 15 | sys.path.append('../') 16 | from src.get_heatmap import get_single_heatmap 17 | 18 | 19 | def img_pre_processing(imgs, heatmaps): 20 | ''' 21 | 22 | :param imgs: image batch, shape = [b, h, w, c] 23 | :param heatmaps: heatmap batch, shape = [b, h, w, c] 24 | :return: depend on rd, rotate 45 degree or not, vertically flip or not 25 | return processing imgs and heatmaps with ori shape 26 | ''' 27 | 28 | batch = imgs.shape[0] 29 | for i in range(batch): 30 | current_img = imgs[i, :, :, :] 31 | current_heatmap = heatmaps[i, :, :, :] 32 | 33 | rd = random.randint(1, 10) 34 | if rd < 4: 35 | current_img, current_heatmap = image_rotation(current_img, current_heatmap, 40) 36 | 37 | elif rd > 7: 38 | current_img, current_heatmap = image_rotation(current_img, current_heatmap, -40) 39 | 40 | rd = random.randint(1, 10) 41 | if rd < 4: 42 | current_img, current_heatmap = image_vertical_flipping(current_img, current_heatmap) 43 | 44 | imgs[i,:,:,:] = current_img 45 | heatmaps[i,:, :, :] = current_heatmap 46 | 47 | return imgs, heatmaps 48 | 49 | def image_rotation(img, heatmap, degree=40): 50 | img_ori_shape = img.shape # [h, w, c] 51 | heat_ori_shape = heatmap.shape # [ h, w, c] 52 | 53 | img = rotated_bound(img, degree) 54 | img = cv2.resize(img, (img_ori_shape[1], img_ori_shape[0])) 55 | 56 | for 
c in range(heat_ori_shape[2]): 57 | cur_heatmap = heatmap[:, :, c] 58 | cur_heatmap = np.expand_dims(cur_heatmap, axis=2) 59 | cur_heatmap = rotated_bound(cur_heatmap, degree) 60 | if len(cur_heatmap.shape) == 3: 61 | cur_heatmap = np.squeeze(cur_heatmap, axis=2) 62 | heatmap[:, :, c] = cv2.resize(cur_heatmap, (heat_ori_shape[1], heat_ori_shape[0])) 63 | return img, heatmap 64 | 65 | def rotated_bound(image, angle): 66 | # grab the dimensions of the image and then determine the 67 | # center 68 | (h, w) = image.shape[:2] 69 | (cX, cY) = (w // 2, h // 2) 70 | 71 | # grab the rotation matrix (applying the negative of the 72 | # angle to rotate clockwise), then grab the sine and cosine 73 | # (i.e., the rotation components of the matrix) 74 | M = cv2.getRotationMatrix2D((cX, cY), -angle, 1.0) 75 | cos = np.abs(M[0, 0]) 76 | sin = np.abs(M[0, 1]) 77 | 78 | # compute the new bounding dimensions of the image 79 | nW = int((h * sin) + (w * cos)) 80 | nH = int((h * cos) + (w * sin)) 81 | 82 | # adjust the rotation matrix to take into account translation 83 | M[0, 2] += (nW / 2) - cX 84 | M[1, 2] += (nH / 2) - cY 85 | 86 | # perform the actual rotation and return the image 87 | return cv2.warpAffine(image, M, (nW, nH)) 88 | 89 | def image_vertical_flipping(img, heatmap): 90 | ''' 91 | 要注意的是,进行flip之后,heatmap各个channel的值要改变,因为flip之后,图片上原本左关节点会变成右关节点, 92 | 右关节点同样会变成左关节点,因此需要在对heatmap进行flip之后,交换左右两个通道。 93 | coco数据集标注的顺序是: 94 | [0------16]: 95 | 0: nose 96 | 1-2: left eye, right eye 97 | 3-4: left ear, right ear 98 | 5-6: left shoulder, right shoulder 99 | 7-8: left elbow, right elbow 100 | 9-10: left wrist , right wrist 101 | 11-12:left hip, right hip 102 | 13-14:left knee, right knee 103 | 15-16:left ankle, right ankle 104 | :param img: 105 | :param heatmap: 106 | :return: 107 | ''' 108 | 109 | img = cv2.flip(img, 1) 110 | for i in range(heatmap.shape[2]): 111 | cur_heat = heatmap[:, :, i] 112 | 113 | cur_heat = np.expand_dims(cur_heat, axis=2) 114 | cur_heat = cv2.flip(cur_heat, 1) 115 | if len(cur_heat.shape) == 3: 116 | cur_heat = np.squeeze(cur_heat, axis=2) 117 | 118 | heatmap[:, :, i] = cur_heat 119 | 120 | # exchane left & right joints 121 | new_heatmap = np.zeros(heatmap.shape, dtype=heatmap.dtype) 122 | for i in range(1, 16, 2): 123 | new_heatmap[:, :, i+1] = heatmap[:, :, i] 124 | new_heatmap[:, :, i] = heatmap[:, :, i+1] 125 | new_heatmap[:, :, 0] = heatmap[:, :, 0] 126 | return img, new_heatmap 127 | 128 | def _test(): 129 | img = cv2.imread('/media/ulsee/E/datasets/coco/cocoval2017/000000281929.jpg', cv2.COLOR_BGR2RGB) 130 | img_copy = img.copy() 131 | cv2.imwrite('gt_img.jpg', img) 132 | # img = cv2.flip(img, 0) 133 | kp = [339,93,2,346,88,2,328,88,2,360,89,2,318,90,1,385,135,2,301,147,2,416,184,2, 134 | 286,204,2,407,226,2,276,244,2,358,254,2,309,259,2,352,346,2,307,349,2,348,448,2,312,449,2] 135 | heatmap = get_single_heatmap(kp, img.shape[0], img.shape[1], channels=17, sigma=4) 136 | 137 | # img, heatmap = image_rotation(img, heatmap, 40) 138 | img, heatmap = image_vertical_flipping(img, heatmap) 139 | cv2.imwrite('img_flip.jpg', img) 140 | #---------# 141 | for c in range(17): 142 | ch = heatmap[:, :, c] 143 | # print (ch) 144 | curmax = np.max(ch) 145 | index = np.where(ch == curmax) 146 | coorx = index[0][0] 147 | coory = index[1][0] 148 | cv2.circle(img, (coory, coorx), 5, (0, 0, 255), -1) 149 | cv2.putText(img, str(c), (coory, coorx), cv2.FONT_HERSHEY_COMPLEX, 1, (255, 0, 0), 1) 150 | cv2.imwrite('img_flip_with_heat.jpg', img) 151 | heatmap = np.sum(heatmap, axis=2, keepdims=True) 
* 255 152 | cv2.imwrite('heat__flip.jpg', heatmap) 153 | 154 | # heatmap_ori = heatmap 155 | # heatmap_ori = np.sum(heatmap_ori, axis=2, keepdims=True)*255 156 | # cv2.imwrite('gt_heat.jpg', heatmap_ori) 157 | # # ---------# 158 | # for c in range(17): 159 | # ch = heatmap[:, :, c] 160 | # # print (ch) 161 | # curmax = np.max(ch) 162 | # index = np.where(ch == curmax) 163 | # coorx = index[0][0] 164 | # coory = index[1][0] 165 | # cv2.circle(img_copy, (coory, coorx), 5, (0, 0, 255), -1) 166 | # cv2.putText(img_copy, str(c), (coory, coorx), cv2.FONT_HERSHEY_COMPLEX, 1, (255, 0, 0), 1) 167 | # cv2.imwrite('img_with_heat.jpg', img_copy) 168 | # #-----------# 169 | # img, heatmap = image_vertical_flipping(img, heatmap) 170 | # # img, heatmap = image_rotation(img, heatmap) 171 | # 172 | # cv2.imwrite('img_flip.jpg', img) 173 | # 174 | # #---------# 175 | # for c in range(17): 176 | # ch = heatmap[:, :, c] 177 | # # print (ch) 178 | # curmax = np.max(ch) 179 | # index = np.where(ch == curmax) 180 | # coorx = index[0][0] 181 | # coory = index[1][0] 182 | # cv2.circle(img, (coory, coorx), 5, (0, 0, 255), -1) 183 | # cv2.putText(img, str(c), (coory, coorx), cv2.FONT_HERSHEY_COMPLEX, 1, (255, 0, 0), 1) 184 | # cv2.imwrite('img_flip_with_heat.jpg', img) 185 | # #---------# 186 | # heatmap = np.sum(heatmap, axis=2, keepdims=True) * 255 187 | # 188 | # heatmap = cv2.cvtColor(heatmap, cv2.COLOR_GRAY2RGB) 189 | # cv2.imwrite('heat_flip.jpg', heatmap) 190 | 191 | 192 | 193 | 194 | if __name__ == '__main__': 195 | _test() 196 | -------------------------------------------------------------------------------- /keypoint_subnet/src/json_read.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: json_read.py 7 | @time: 18-8-28 下午3:11 8 | ''' 9 | import json 10 | 11 | def load_json(json_file): 12 | ''' 13 | load json file and return a dict, like ['id'] = [keypoints], typically used for ai_challenger format dataset 14 | :param json_file: 15 | :return: 16 | ''' 17 | f = open(json_file, encoding='utf-8') 18 | labels = json.load(f) 19 | label_dict = {} 20 | for label in labels: 21 | current_keypoints = [] 22 | for human, keypoints in label['keypoint_annotations'].items(): 23 | current_keypoints = current_keypoints + keypoints 24 | label_dict[label['image_id']] = current_keypoints 25 | return label_dict 26 | 27 | def load_coco_json(json_file): 28 | ''' 29 | 30 | :param json_file: 31 | :return: 32 | ''' 33 | f = open(json_file, encoding='utf-8') 34 | labels = json.load(f) 35 | return labels 36 | 37 | def dump_coco_data(json_file): 38 | ''' 39 | convert coco annotatinos json file, as like:[{'image_id":keypoints}] 40 | :param json_file: 41 | :return: 42 | ''' 43 | 44 | f = open(json_file, encoding='utf-8') 45 | labels = json.load(f) 46 | image_info = labels['images'] 47 | anno_info = labels['annotations'] 48 | label_dict = {} 49 | 50 | for image in image_info: 51 | image_name = image['file_name'].split('.')[0] 52 | image_id = image['id'] 53 | current_keypoints = [] 54 | for anno in anno_info: 55 | keypoints = anno['keypoints'] 56 | anno_image_id = anno['image_id'] 57 | anno_id = anno['id'] 58 | if anno_image_id == image_id: 59 | current_keypoints = current_keypoints + keypoints 60 | 61 | label_dict[image_name] = current_keypoints 62 | with open('coco_image_name_to_keypoints.json', 'w') as fw: 63 | json.dump(label_dict, fw) 64 | 65 | def 
convert_coco_instance_json(json_file): 66 | ''' 67 | convert coco annotatinos json file, as like:[{'image_id":[x1, y1, w, h, category_id] * n}] 68 | :param json_file: 69 | :return: 70 | ''' 71 | 72 | f = open(json_file, encoding='utf-8') 73 | labels = json.load(f) 74 | units = {} 75 | 76 | image_info = labels['images'] 77 | anno_info = labels['annotations'] 78 | print ('start reading json......') 79 | ll = len(image_info) 80 | count = 1 81 | for image in image_info: 82 | image_name = image['file_name'].split('.')[0] 83 | image_id = image['id'] 84 | height = image['height'] 85 | width = image['width'] 86 | current_bbox = [height, width] 87 | 88 | for anno in anno_info: 89 | bbox = anno['bbox'] 90 | anno_image_id = anno['image_id'] 91 | 92 | if anno_image_id == image_id: 93 | bbox.append(anno['category_id']) 94 | current_bbox = current_bbox + bbox 95 | units[image_name] = current_bbox 96 | 97 | if count % 1000 == 0: 98 | print ('Processing {}'.format(count/ll)) 99 | count += 1 100 | if count == 10: 101 | break 102 | 103 | is_save = True 104 | if is_save: 105 | save_json_file = 'coco-instance-imgid-bbox.json' 106 | 107 | with open(save_json_file, 'w') as fw: 108 | json.dump(units, fw) 109 | if __name__ == '__main__': 110 | convert_coco_instance_json('/media/ulsee/E/datasets/coco-annotations/instances_train2017.json') -------------------------------------------------------------------------------- /keypoint_subnet/src/model.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: keypoint_subnet.py 7 | @time: 18-9-28 上午11:23 8 | ''' 9 | 10 | from __future__ import absolute_import, division, print_function 11 | 12 | import tensorflow as tf 13 | 14 | import numpy as np 15 | 16 | import os, json 17 | from tensorflow.contrib.slim import nets 18 | from tensorflow.contrib.layers.python.layers import utils 19 | import tensorflow.contrib.slim as slim 20 | 21 | # from src.backbone import BackBone 22 | 23 | 24 | class Keypoint_Subnet(object): 25 | def __init__(self, inputs, img_size, fpn, num_classes, batch_size): 26 | self.inputs = inputs 27 | self.img_size = img_size 28 | self.feature_pyramid = fpn 29 | self.num_classes = num_classes 30 | self.batch_size = batch_size 31 | self.stddev = 0.01 32 | 33 | self.input_heats = tf.placeholder(tf.float32, [self.batch_size, self.img_size // 4, self.img_size // 4, self.num_classes]) 34 | 35 | # self.output, self.end_points = self.network() 36 | 37 | def forward(self): 38 | with tf.variable_scope('keypoint_subnet') as sc: 39 | end_points_collection = sc.original_name_scope + '_end_points' 40 | #---------------------------------build layer D--------------------------------# 41 | feature_d = {} 42 | for layer in range(2, 6, 1): 43 | cur_p = self.feature_pyramid['P' + str(layer)] 44 | d = slim.conv2d(cur_p, 45 | num_outputs=128, 46 | kernel_size=[3, 3], 47 | stride=1, 48 | weights_initializer=tf.random_normal_initializer(stddev=self.stddev), 49 | scope='build_feature_D%d_1' % layer) 50 | d = slim.conv2d(d, 51 | num_outputs=128, 52 | kernel_size=[3, 3], 53 | stride=1, 54 | weights_initializer=tf.random_normal_initializer(stddev=self.stddev), 55 | scope='build_feature_D%d_2' % layer) 56 | feature_d['D' + str(layer)] = d 57 | #--------------------------------concat part layer D---------------------------# 58 | concat_d = feature_d['D2'] 59 | up_shape = concat_d.get_shape() 60 | up_sample = 
tf.image.resize_nearest_neighbor(feature_d['D3'], [up_shape[2], up_shape[2]], 61 | name='upsamping_D3') 62 | concat_d = tf.concat([concat_d, up_sample], 3) 63 | 64 | up_sample = tf.image.resize_nearest_neighbor(feature_d['D4'], [up_shape[2], up_shape[2]], 65 | name='upsamping_D4') 66 | concat_d = tf.concat([concat_d, up_sample], 3) 67 | 68 | up_sample = tf.image.resize_nearest_neighbor(feature_d['D5'], [up_shape[2], up_shape[2]], 69 | name='upsamping_D5') 70 | concat_d = tf.concat([concat_d, up_sample], 3) 71 | #------------------------------via 3x3 conv and relu---------------------------# 72 | concat_d = slim.conv2d(concat_d, 73 | num_outputs=concat_d.get_shape()[3], 74 | kernel_size=[3, 3], 75 | activation_fn=tf.nn.relu, 76 | weights_initializer=tf.random_normal_initializer(stddev=self.stddev), 77 | scope='smoothed_concat_d_layer') 78 | 79 | #----------------------------------final output--------------------------------# 80 | output = slim.conv2d(concat_d, 81 | num_outputs=self.num_classes, 82 | kernel_size=[1, 1], 83 | activation_fn=None, 84 | weights_initializer=tf.random_normal_initializer(stddev=self.stddev), 85 | scope='output') 86 | 87 | end_points = utils.convert_collection_to_dict(end_points_collection) 88 | 89 | return output, end_points 90 | 91 | def net_loss(self): 92 | output, end_points = self.forward() 93 | out_all = [] 94 | #-------------------------------add intermediate output loss------------------------------# 95 | for index, layer in self.feature_pyramid.items(): 96 | layer = tf.image.resize_bicubic(layer, [self.feature_pyramid['P2'].get_shape()[1], self.feature_pyramid['P2'].get_shape()[1]], 97 | name='upsamling_layer_%s' % index) 98 | 99 | output_mid = slim.conv2d(layer, num_outputs=self.num_classes, 100 | kernel_size=[1, 1], 101 | activation_fn=None, 102 | weights_initializer=tf.random_normal_initializer(stddev=self.stddev), 103 | scope='mid_out_%s' % index 104 | ) 105 | 106 | out_all.append(output_mid) 107 | 108 | out_all.append(output) 109 | #---------------------------------------calculate losses----------------------------------# 110 | losses = [] 111 | for idx, pre_heat in enumerate(out_all): 112 | loss_l2 = tf.nn.l2_loss(tf.concat(pre_heat, axis=0) - self.input_heats, name='loss_%d' % idx) 113 | losses.append(loss_l2) 114 | 115 | total_loss = tf.reduce_sum(losses) / self.batch_size 116 | net_out_loss = tf.reduce_sum(loss_l2) / self.batch_size 117 | #-----------------------------------------add tf summary----------------------------------# 118 | # tf.summary.scalar('total_loss', total_loss) 119 | # tf.summary.scalar('net_loss', net_out_loss) 120 | # tf.summary.image('ori_image', self.inputs, max_outputs=2) 121 | 122 | 123 | return total_loss, net_out_loss, pre_heat 124 | 125 | # if __name__ == '__main__': 126 | # graph = tf.Graph() 127 | # with graph.as_default(): 128 | # batch_size = 1 129 | # height, width = 224, 224 130 | # inputs = tf.random_uniform((batch_size, height, width, 3), seed=1) 131 | # 132 | # backbone = BackBone(img_size = 224, batch_size=1) 133 | # fpn, _ = backbone.build_fpn_feature() 134 | # kp = Keypoint_Subnet(backbone.input_imgs, img_size=backbone.img_size, fpn=fpn, batch_size=backbone.batch_size, num_classes=14) 135 | # total_loss, net_loss, pre_heat = kp.net_loss() 136 | # init = tf.group( 137 | # tf.global_variables_initializer(), 138 | # tf.local_variables_initializer()) 139 | # 140 | # saver = tf.train.Saver() 141 | # 142 | # with tf.Session() as sess: 143 | # sess.run(init) 144 | # 145 | # writer = tf.summary.FileWriter('graph', 
tf.get_default_graph()) 146 | # writer.close() 147 | -------------------------------------------------------------------------------- /keypoint_subnet/src/reader.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: reader.py 7 | @time: 18-9-28 上午11:53 8 | ''' 9 | 10 | import tensorflow as tf 11 | import matplotlib.pyplot as plt 12 | import os, json 13 | import sys 14 | sys.path.append('../') 15 | 16 | 17 | 18 | class Keypoint_Reader: 19 | def __init__(self, tfrecord_file, img_size=56, batch_size=4, epochs = 1, capacity = 1000, num_threads=12, name=''): 20 | self.tfrecord_file = tfrecord_file 21 | self.img_size = img_size 22 | self.batch_size = batch_size 23 | self.capacity = capacity 24 | self.num_threads = num_threads 25 | self.name = name 26 | self.reader = tf.TFRecordReader() 27 | self.epochs = epochs 28 | 29 | def feed(self): 30 | with tf.name_scope(self.name): 31 | filename_queue = tf.train.string_input_producer([self.tfrecord_file], num_epochs=self.epochs) 32 | _, serialized_example = self.reader.read(filename_queue) 33 | features = tf.parse_single_example( 34 | serialized_example, 35 | features={ 36 | 'image':tf.FixedLenFeature([], tf.string), 37 | 'id': tf.FixedLenFeature([], tf.string), 38 | 'height': tf.FixedLenFeature([], tf.int64), 39 | 'width': tf.FixedLenFeature([], tf.int64) 40 | }) 41 | 42 | img = tf.image.decode_image(features['image'], channels=3) # tensor, [height, width, channels] 43 | img_id = features['id'] 44 | img_height = tf.cast(features['height'], tf.int32) 45 | img_width = tf.cast(features['width'], tf.int32) 46 | 47 | img = tf.reshape(img, shape=[img_height, img_width, 3]) 48 | img = self.image_preprocessing(img) 49 | 50 | img_batch, img_id_batch, img_height_batch, img_width_batch = tf.train.shuffle_batch( 51 | [img, img_id, img_height, img_width], 52 | batch_size=self.batch_size, 53 | num_threads=self.num_threads, 54 | capacity=self.capacity, 55 | min_after_dequeue=self.capacity // 10 56 | ) 57 | 58 | return img_batch, img_id_batch, img_height_batch, img_width_batch 59 | 60 | def image_preprocessing(self, image): 61 | 62 | img = tf.expand_dims(image, axis=0) 63 | img = tf.image.resize_nearest_neighbor(img, (self.img_size, self.img_size)) 64 | img = tf.squeeze(img, axis=0) 65 | return img 66 | 67 | def reader_test(): 68 | batch = 1 69 | epoch = 1 70 | reader = Keypoint_Reader(tfrecord_file='/media/ulsee/E/keypoint_subnet_tfrecord/coco_train2017-test.tfrecord', batch_size=batch, epochs=1) 71 | _1, _2, _3, _4 = reader.feed() 72 | # print (_2) 73 | # return 74 | # net_x = tf.reduce_sum(net_x, axis=3, keepdims=True) 75 | # label = tf.reduce_sum(label, axis=3, keepdims=True) 76 | 77 | with tf.Session() as sess: 78 | sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()]) 79 | 80 | coord = tf.train.Coordinator() 81 | threads = tf.train.start_queue_runners(coord=coord) 82 | 83 | try: 84 | step = 0 85 | while not coord.should_stop(): 86 | a,b,c,d = sess.run([_1,_2,_3,_4]) 87 | print (b.shape) 88 | step += 1 89 | except tf.errors.OutOfRangeError: 90 | print ('batch = {}, epoch = {}, total steps = {} '.format(batch, epoch, step)) 91 | finally: 92 | coord.request_stop() 93 | coord.join(threads) 94 | 95 | 96 | if __name__ == '__main__': 97 | reader_test() -------------------------------------------------------------------------------- /keypoint_subnet/train_log.md: 
-------------------------------------------------------------------------------- 1 | ## 2018-09-30: 2 | 3 | Started training the human keypoint detection network from scratch: input size 480x480, lr = 0.0001, batch size 4, Adam optimizer, dataset COCO 2017. -------------------------------------------------------------------------------- /person_detect/README.md: -------------------------------------------------------------------------------- 1 | Data format for the person detection network: 2 | 3 | The tfrecord used by the person detection network contains all the information needed for training, such as the image, the boxes and the labels. When a tfrecord is read 4 | in batches, every example in the batch is resized to the same shape, but different images contain different numbers of boxes, so the tfrecord pipeline 5 | automatically zero-pads the boxes of images that have fewer of them. In addition, because of how TensorFlow works, the concrete shape of such a zero-padded batch cannot be obtained before sess.run, 6 | so it cannot be manipulated beforehand. 7 | 8 | To handle this, we fix the number of boxes at 30 when generating the tfrecord and drop the all-zero boxes during training, so they do not take part in the training process. -------------------------------------------------------------------------------- /person_detect/__init__.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: __init__.py 7 | @time: 18-9-29 6:56 PM 8 | ''' -------------------------------------------------------------------------------- /person_detect/anchor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/murdockhou/MultiPoseNet-tensorflow/9ab52e5867d7f40233a63db8f344ca380c640164/person_detect/anchor/__init__.py -------------------------------------------------------------------------------- /person_detect/anchor/anchor_generator.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | 6 | from anchor import box_list_ops 7 | from anchor import box_list 8 | 9 | import tensorflow as tf 10 | import numpy as np 11 | 12 | class MultipleGridAnchorGenerator(): 13 | """Generate a grid of anchors for multiple CNN layers.""" 14 | 15 | def __init__(self, 16 | box_specs_list, 17 | base_anchor_sizes, 18 | clip_window=None): 19 | """Constructs a MultipleGridAnchorGenerator. 20 | 21 | To construct anchors, at multiple grid resolutions, one must provide a 22 | list of feature_map_shape_list (e.g., [(8, 8), (4, 4)]), and for each grid 23 | size, a corresponding list of (scale, aspect ratio) box specifications. 24 | 25 | For example: 26 | box_specs_list = [[(.1, 1.0), (.1, 2.0)], # for 8x8 grid 27 | [(.2, 1.0), (.3, 1.0), (.2, 2.0)]] # for 4x4 grid 28 | 29 | To support the fully convolutional setting, we pass grid sizes in at 30 | generation time, while scale and aspect ratios are fixed at construction 31 | time. 32 | 33 | Args: 34 | box_specs_list: list of list of (scale, aspect ratio) pairs with the 35 | outside list having the same number of entries as feature_map_shape_list 36 | (which is passed in at generation time). 37 | base_anchor_sizes: list of base anchor size in each layer 38 | clip_window: a tensor of shape [4] specifying a window to which all 39 | anchors should be clipped. If clip_window is None, then no clipping 40 | is performed.
41 | 42 | Raises: 43 | ValueError: if box_specs_list is not a list of list of pairs 44 | ValueError: if clip_window is not either None or a tensor of shape [4] 45 | """ 46 | if isinstance(box_specs_list, list) and all( 47 | [isinstance(list_item, list) for list_item in box_specs_list]): 48 | self._box_specs = box_specs_list 49 | else: 50 | raise ValueError('box_specs_list is expected to be a ' 51 | 'list of lists of pairs') 52 | if isinstance(base_anchor_sizes, list): 53 | self._base_anchor_sizes = base_anchor_sizes 54 | else: 55 | raise ValueError('base_anchor_list is expected to be a list of float') 56 | if clip_window is not None and clip_window.get_shape().as_list() != [4]: 57 | raise ValueError('clip_window must either be None or a shape [4] tensor') 58 | self._clip_window = clip_window 59 | self._scales = [] 60 | self._aspect_ratios = [] 61 | for box_spec in self._box_specs: 62 | if not all([isinstance(entry, tuple) and len(entry) == 2 63 | for entry in box_spec]): 64 | raise ValueError('box_specs_list is expected to be a ' 65 | 'list of lists of pairs') 66 | scales, aspect_ratios = zip(*box_spec) 67 | self._scales.append(scales) 68 | self._aspect_ratios.append(aspect_ratios) 69 | 70 | def name_scope(self): 71 | return 'MultipleGridAnchorGenerator' 72 | 73 | def num_anchors_per_location(self): 74 | """Returns the number of anchors per spatial location. 75 | 76 | Returns: 77 | a list of integers, one for each expected feature map to be passed to 78 | the Generate function. 79 | """ 80 | return [len(box_specs) for box_specs in self._box_specs] 81 | 82 | def generate(self, 83 | input_size, 84 | feature_map_shape_list, 85 | anchor_strides=None, 86 | anchor_offsets=None): 87 | """Generates a collection of bounding boxes to be used as anchors. 88 | 89 | The number of anchors generated for a single grid with shape MxM where we 90 | place k boxes over each grid center is k*M^2 and thus the total number of 91 | anchors is the sum over all grids. In our box_specs_list example 92 | (see the constructor docstring), we would place two boxes over each grid 93 | point on an 8x8 grid and three boxes over each grid point on a 4x4 grid and 94 | thus end up with 2*8^2 + 3*4^2 = 176 anchors in total. The layout of the 95 | output anchors follows the order of how the grid sizes and box_specs are 96 | specified (with box_spec index varying the fastest, followed by width 97 | index, then height index, then grid index). 98 | 99 | Args: 100 | input_size: input image size list with (width, height) 101 | feature_map_shape_list: list of pairs of conv net layer resolutions in the 102 | format [(height_0, width_0), (height_1, width_1), ...]. For example, 103 | setting feature_map_shape_list=[(8, 8), (7, 7)] asks for anchors that 104 | correspond to an 8x8 layer followed by a 7x7 layer. 105 | anchor_strides: list of pairs of strides (in y and x directions 106 | respectively). For example, setting 107 | anchor_strides=[(.25, .25), (.5, .5)] means that we want the anchors 108 | corresponding to the first layer to be strided by .25 and those in the 109 | second layer to be strided by .5 in both y and x directions. By 110 | default, if anchor_strides=None, then they are set to be the reciprocal 111 | of the corresponding grid sizes. The pairs can also be specified as 112 | dynamic tf.int or tf.float numbers, e.g. for variable shape input 113 | images. 114 | anchor_offsets: list of pairs of offsets (in y and x directions 115 | respectively). 
The offset specifies where we want the center of the 116 | (0, 0)-th anchor to lie for each layer. For example, setting 117 | anchor_offsets=[(.125, .125), (.25, .25)]) means that we want the 118 | (0, 0)-th anchor of the first layer to lie at (.125, .125) in image 119 | space and likewise that we want the (0, 0)-th anchor of the second 120 | layer to lie at (.25, .25) in image space. By default, if 121 | anchor_offsets=None, then they are set to be half of the corresponding 122 | anchor stride. The pairs can also be specified as dynamic tf.int or 123 | tf.float numbers, e.g. for variable shape input images. 124 | 125 | Returns: 126 | boxes: a BoxList holding a collection of N anchor boxes 127 | Raises: 128 | ValueError: if feature_map_shape_list, box_specs_list do not have the same 129 | length. 130 | ValueError: if feature_map_shape_list does not consist of pairs of 131 | integers 132 | """ 133 | if not (isinstance(feature_map_shape_list, list) 134 | and len(feature_map_shape_list) == len(self._box_specs)): 135 | raise ValueError('feature_map_shape_list must be a list with the same ' 136 | 'length as self._box_specs') 137 | if not all([isinstance(list_item, tuple) and len(list_item) == 2 138 | for list_item in feature_map_shape_list]): 139 | raise ValueError('feature_map_shape_list must be a list of pairs.') 140 | im_height, im_width = input_size[0], input_size[1] 141 | # anchor_strides = [(8.0, 8.0), (16.0, 16.0), (32.0, 32.0), (56.0, 56.0), (112.0, 112.0)] 142 | if not anchor_strides: 143 | anchor_strides = [(tf.to_float(im_height) / tf.to_float(pair[0]), 144 | tf.to_float(im_width) / tf.to_float(pair[1])) 145 | for pair in feature_map_shape_list] 146 | # anchor_offsets = [(4.0, 4.0), (8.0, 8.0), (16.0, 16.0), (28.0, 28.0), (56.0, 56.0)] 147 | if not anchor_offsets: 148 | anchor_offsets = [(0.5 * stride[0], 0.5 * stride[1]) 149 | for stride in anchor_strides] 150 | 151 | for arg, arg_name in zip([anchor_strides, anchor_offsets], 152 | ['anchor_strides', 'anchor_offsets']): 153 | if not (isinstance(arg, list) and len(arg) == len(self._box_specs)): 154 | raise ValueError('%s must be a list with the same length ' 155 | 'as self._box_specs' % arg_name) 156 | if not all([isinstance(list_item, tuple) and len(list_item) == 2 157 | for list_item in arg]): 158 | raise ValueError('%s must be a list of pairs.' 
% arg_name) 159 | 160 | anchor_grid_list = [] 161 | for grid_size, scales, aspect_ratios, stride, offset, base_anchor_size in zip( 162 | feature_map_shape_list, self._scales, self._aspect_ratios, 163 | anchor_strides, anchor_offsets, self._base_anchor_sizes): 164 | 165 | # print(grid_size, scales, aspect_ratios, stride, offset, base_anchor_size) 166 | 167 | anchor_grid_list.append( 168 | tile_anchors( 169 | grid_height=grid_size[0], 170 | grid_width=grid_size[1], 171 | scales=scales, 172 | aspect_ratios=aspect_ratios, 173 | base_anchor_size=base_anchor_size, 174 | anchor_stride=stride, 175 | anchor_offset=offset)) 176 | # break 177 | concatenated_anchors = box_list_ops.concatenate(anchor_grid_list) 178 | num_anchors = concatenated_anchors.num_boxes_static() 179 | # print (num_anchors) 180 | if num_anchors is None: 181 | num_anchors = concatenated_anchors.num_boxes() 182 | if self._clip_window is not None: 183 | clip_window = tf.multiply( 184 | tf.to_float([im_height, im_width, im_height, im_width]), 185 | self._clip_window) 186 | concatenated_anchors = box_list_ops.clip_to_window( 187 | concatenated_anchors, clip_window, filter_nonoverlapping=False) 188 | # TODO: make reshape an option for the clip_to_window op 189 | concatenated_anchors.set( 190 | tf.reshape(concatenated_anchors.get(), [num_anchors, 4])) 191 | 192 | stddevs_tensor = 0.01 * tf.ones( 193 | [num_anchors, 4], dtype=tf.float32, name='stddevs') 194 | concatenated_anchors.add_field('stddev', stddevs_tensor) 195 | return concatenated_anchors 196 | 197 | 198 | def tile_anchors(grid_height, 199 | grid_width, 200 | scales, 201 | aspect_ratios, 202 | base_anchor_size, 203 | anchor_stride, 204 | anchor_offset): 205 | """Create a tiled set of anchors strided along a grid in image space. 206 | 207 | This op creates a set of anchor boxes by placing a "basis" collection of 208 | boxes with user-specified scales and aspect ratios centered at evenly 209 | distributed points along a grid. The basis collection is specified via the 210 | scale and aspect_ratios arguments. For example, setting scales=[.1, .2, .2] 211 | and aspect ratios = [2,2,1/2] means that we create three boxes: one with scale 212 | .1, aspect ratio 2, one with scale .2, aspect ratio 2, and one with scale .2 213 | and aspect ratio 1/2. Each box is multiplied by "base_anchor_size" before 214 | placing it over its respective center. 215 | 216 | Grid points are specified via grid_height, grid_width parameters as well as 217 | the anchor_stride and anchor_offset parameters. 218 | 219 | Args: 220 | grid_height: size of the grid in the y direction (int or int scalar tensor) 221 | grid_width: size of the grid in the x direction (int or int scalar tensor) 222 | scales: a 1-d (float) tensor representing the scale of each box in the 223 | basis set. 224 | aspect_ratios: a 1-d (float) tensor representing the aspect ratio of each 225 | box in the basis set. The length of the scales and aspect_ratios tensors 226 | must be equal. 
227 | base_anchor_size: base anchor size in this layer as [height, width] 228 | (float tensor of shape [2]) 229 | anchor_stride: difference in centers between base anchors for adjacent grid 230 | positions (float tensor of shape [2]) 231 | anchor_offset: center of the anchor with scale and aspect ratio 1 for the 232 | upper left element of the grid, this should be zero for 233 | feature networks with only VALID padding and even receptive 234 | field size, but may need some additional calculation if other 235 | padding is used (float tensor of shape [2]) 236 | Returns: 237 | a BoxList holding a collection of N anchor boxes 238 | """ 239 | ratio_sqrts = tf.sqrt(aspect_ratios) 240 | # 根据base_anchor_size计算anchor在原图上本来的宽高 241 | heights = scales / ratio_sqrts * base_anchor_size 242 | # print ('heights == ', heights.get_shape()) 243 | widths = scales * ratio_sqrts * base_anchor_size 244 | # print ('widths == ', widths.get_shape()) 245 | # Get a grid of box centers 246 | y_centers = tf.to_float(tf.range(grid_height)) 247 | y_centers = y_centers * anchor_stride[0] + anchor_offset[0] 248 | # print ('y_centers before meshgrid === ', y_centers.get_shape()) 249 | x_centers = tf.to_float(tf.range(grid_width)) 250 | x_centers = x_centers * anchor_stride[1] + anchor_offset[1] 251 | # print('x_centers before meshgrid === ', x_centers.get_shape()) 252 | x_centers, y_centers = tf.meshgrid(x_centers, y_centers) 253 | 254 | # xcenters在和widths进行meshgrid之前,xcenters的shape是(grid_height * grid_width),只不过每一行都是0-(grid_width-1),widths长度为9,是总共要生成的 255 | # 9个anchors宽度列表,由前面计算得到。widths在和xcenters进行meshgrid之后,由于meshgrid是对维度为1的tensor进行操作,首先会把xcenters展开, 256 | # 变成一行,有(grid_height * grid_width)列,然后再进行meshgrid操作。meshgrid之后,widths_grid为 (grid_height * gird_width) × 9维矩阵,每一行都是9个anchor的宽度 257 | # xcenters_grid为(grid_height * grid_width) * 9矩阵,每一列都是grid_height个(0-grid_widht-1)数值。 258 | # 下面的heights和y_centers进行meshgrid最终得到的结果略有不同,heights_grid和widths_grid结果很一致,都是(grid_height * gird_width) × 9维矩阵,每一行都是9个anchor的高度, 259 | # 但y_centers_grid就略有变化,因为y_centers是每一列值都是 (0~grid_heigt-1),但每一行的值都是相同的,即每一行的值都是同一个值,meshgrid会将不是1维的矩阵变成一维,是按照行展开的, 260 | # 所以y_centers展开后就变成[1,1,1,1,1,..., 1,2,2,2,2,...,2,....,h,h,h,...h]这种形式,因此在和heights进行meshgrid之后,y_centers_grid每一列都变成了前面说的那个列表内的值 261 | 262 | widths_grid, x_centers_grid = tf.meshgrid(widths, x_centers) 263 | heights_grid, y_centers_grid = tf.meshgrid(heights, y_centers) 264 | 265 | # 在对y_centers_grid 和 x_centers_grid 进行axis=2的stack,x_centers_gird 和 y_centers_grid 维度均为 (grid_height*grid_width) * 9 维度,只不过数值不一样,按照 266 | # axis=2 进行stack,其实就是把x_centers_grid 和 y_centers_grid 里的值一一对应起来,最后变成 (grid_height * grid_width) * 9 * 2的三维矩阵,其实就是所有anchor对应的 267 | # 中心点在图像上的坐标,类似于[[[1,1]*9, [1,2]*9, ..., [7,7]*9]]这种形式,其实就是把图片上每个点的坐标拿出来,并重复9次,当做这个点生成的9个anchor的centers 268 | bbox_centers = tf.stack([y_centers_grid, x_centers_grid], axis=2) 269 | 270 | # 同理,对heights_grid 和 widths_grid 进行 axis=2 的stack, 也是得到一个(grid_height * grid_width) * 9 * 2的三维矩阵,只不过这个矩阵保存的是anchor的size,和前面的bbox_centers 271 | # 的值是一一对应的,即一个存了center的(x,y)坐标,一个存了bbox的宽高 272 | bbox_sizes = tf.stack([heights_grid, widths_grid], axis=2) 273 | 274 | # 接着对这两个矩阵进行reshape成 n*2 的二维矩阵,n是所有anchor的数量,为 (grid_heigt * grid_width * 9),bbox_centers每一行保存的是anchor的中心点坐标 275 | # bbox_sizes 保存的是anchor的对应的宽高 276 | bbox_centers = tf.reshape(bbox_centers, [-1, 2]) 277 | bbox_sizes = tf.reshape(bbox_sizes, [-1, 2]) 278 | # convert [ycenter, xcenter, height, width] to [ymin, xmin, ymax, xmax] 279 | bbox_corners = _center_size_bbox_to_corners_bbox(bbox_centers, bbox_sizes) 280 | 
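  # Worked example (added comment; the numbers are assumed for illustration, not
  # taken from the original code): with grid_height = grid_width = 2,
  # anchor_stride = (8., 8.), anchor_offset = (4., 4.) and k = 9 (scale, aspect_ratio)
  # pairs, y_centers and x_centers are both [4., 12.], the meshgrid/stack/reshape
  # steps give bbox_centers and bbox_sizes of shape [2*2*9, 2] = [36, 2], and
  # bbox_corners above ends up with shape [36, 4] in [ymin, xmin, ymax, xmax] order.
  # For the anchor centered at (4., 4.) with scale = 1.0, aspect_ratio = 1.0 and
  # base_anchor_size = 32.0, height = width = 32, so its corners are
  # [-12., -12., 20., 20.]; such out-of-image anchors are only clipped later, in
  # generate(), and only when clip_window is set.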
281 | # 需要注意的是,这个生成的anchor就是相对于原图上的位置在哪,并且通过上一行的函数,把box的表示方式变成了[ymin, xmin, ymax, xmax],最终的shape为(n, 4) 282 | # base_anchor_size 变得越来越大的原因是,随着featuremap维度不断增高,其上面的每一个点所能表示的原图的范围,也即是感受野也在不断增大 283 | return box_list.BoxList(bbox_corners) 284 | 285 | 286 | def _center_size_bbox_to_corners_bbox(centers, sizes): 287 | """Converts bbox center-size representation to corners representation. 288 | 289 | Args: 290 | centers: a tensor with shape [N, 2] representing bounding box centers 291 | sizes: a tensor with shape [N, 2] representing bounding boxes 292 | 293 | Returns: 294 | corners: tensor with shape [N, 4] representing bounding boxes in corners 295 | representation 296 | """ 297 | return tf.concat([centers - .5 * sizes, centers + .5 * sizes], 1) 298 | 299 | 300 | def create_retinanet_anchors( 301 | num_layers=5, 302 | scales=(1.0, pow(2, 1./3), pow(2, 2./3)), 303 | aspect_ratios=(0.5, 1.0, 2.0), 304 | base_anchor_sizes=(32.0, 64.0, 128.0, 256.0, 512.0) 305 | ): 306 | """Create a set of anchors walking along a grid in a collection of feature maps in RetinaNet. 307 | 308 | This op creates a set of anchor boxes by placing a basis collection of 309 | boxes with user-specified scales and aspect ratios centered at evenly 310 | distributed points along a grid. The basis Each box is multiplied by 311 | base_anchor_size before placing it over its respective center. 312 | 313 | Args: 314 | num_layers: The number of grid layers to create anchors 315 | scales: A list of scales 316 | aspect_ratios: A list of aspect ratios 317 | base_anchor_sizes: List of base anchor sizes in each layer 318 | Returns: 319 | A MultipleGridAnchorGenerator 320 | """ 321 | base_anchor_sizes = list(base_anchor_sizes) 322 | box_spec_list = [] 323 | for idx in range(num_layers): 324 | layer_spec_list = [] 325 | for scale in scales: 326 | for aspect_ratio in aspect_ratios: 327 | layer_spec_list.append((scale, aspect_ratio)) 328 | box_spec_list.append(layer_spec_list) 329 | 330 | # for val in box_spec_list: 331 | # print (val) 332 | # print (base_anchor_sizes) 333 | 334 | # box_spec_list = [[(1.0, 0.5), (1.0, 1.0), (1.0, 2.0), 335 | # (1.2599210498948732, 0.5), (1.2599210498948732, 1.0), (1.2599210498948732, 2.0), 336 | # (1.5874010519681994, 0.5), (1.5874010519681994, 1.0), (1.5874010519681994, 2.0)]] 337 | # base_anchor_sizes = [256.0] 338 | return MultipleGridAnchorGenerator(box_spec_list, base_anchor_sizes) 339 | 340 | 341 | def anchor_assign(anchors, gt_boxes, gt_labels, is_training=True): 342 | """ 343 | Assign generated anchors to boxes and labels 344 | Args: 345 | anchors: BoxList holding a collection of N anchors 346 | gt_boxes: BoxList holding a collection of groundtruth 2D box coordinates tensor/list [#object, 4] 347 | ([ymin, xmin, ymax, xmax], float type) of objects in given input image. 348 | gt_labels: Groundtruth 1D tensor/list [#object] (scalar int) of objects in given image. 
349 | is_training: is training or not 350 | 351 | returns: 352 | BoxList with anchor location and class fields 353 | """ 354 | pos_iou_thred = 0.5 355 | neg_iou_thred = 0.5 356 | if is_training: 357 | neg_iou_thred = 0.4 358 | if gt_boxes.get().get_shape()[0] != gt_labels.get_shape()[0]: 359 | raise ValueError('Boxs and labels number must be equal.') 360 | # box_iou: 总共有#anchors行,#gt_boxes列 (#anchors, #gtboxes),每一行表示当前anchor对于gt_boxes的iou值 361 | box_iou = box_list_ops.iou(anchors, gt_boxes) 362 | 363 | # anchor_max_iou: 返回每一个anchor相对于gt_boxes中最大的iou值, 364 | # 是一个tensor,维度为[#anchors,], 每一个值为这个anchor和所有gtbox最大的iou值, 365 | # 和下面的anchor_max_iou_indices相对应 366 | anchor_max_iou = tf.reduce_max(box_iou, axis=1) 367 | 368 | # box_iou是一个二维矩阵,每一行代表一个anchor相对于gtbox的iou值, 369 | # 对其进行axis=1的tf.argmax,就是找到这个anchor和哪个gtbox iou值最大,并返回其下标 370 | anchor_max_iou_indices = tf.argmax(box_iou, axis=1) 371 | 372 | # 根据前面的anchor_max_iou_indices,将gt_boxes里对于每一个anchor是最大iou的那个gt_box取出来, 373 | # 组成一个新的矩阵,维度为[#anchors, 4] 374 | anchor_gt_box = tf.gather(gt_boxes.get(), anchor_max_iou_indices) 375 | 376 | # 类似于anchor_gt_box, 将前面anchor对应的最大iou的gt_box的label取出来,组成新的矩阵,维度为[#anchors,] 377 | anchor_gt_cls = tf.gather(gt_labels, anchor_max_iou_indices) #[#saved_anchor_num], 1D 378 | # print ('anchor_gt_cls === ', anchor_gt_cls) 379 | 380 | # get remaining index with iou between 0.4 to 0.5 381 | # 对于每一个anchor,因为其都有一个相对于gtbox的最大iou值,判断其是否是正样本,如果当前anchor的max_iou值大于pos_iou_thred, 382 | # 将其class设为其原本对应的label,否则设置为-1,为下一步操作做准备 383 | anchor_gt_cls = tf.where(tf.greater(anchor_max_iou, pos_iou_thred), anchor_gt_cls, 0-tf.ones_like(anchor_gt_cls)) 384 | 385 | # 和上面的函数同理,如果anchor的max_iou小于neg_iou_thred,就将其class设置为0,否则就是原本的class 386 | # 因为已经考虑过其是否大于pos_iou_thred,所以执行完这个函数之后,最后得到的结果就是: 387 | # iou > 0.5的anchor认为是正样本,iou<0.4认为是负样本, iou在0.4和0.5之间设为-1,忽略掉 388 | anchor_gt_cls = tf.where(tf.less(anchor_max_iou, neg_iou_thred), tf.zeros_like(anchor_gt_cls), anchor_gt_cls) 389 | 390 | anchors.add_field('gt_boxes', anchor_gt_box) 391 | anchors.add_field('gt_labels', anchor_gt_cls) #[#saved_anchor_num], 1D 392 | return anchors 393 | 394 | def anchor_test(): 395 | input_size = [224,224] 396 | # feature_map = [(28, 28), (14, 14), (7, 7), (4, 4), (2, 2)] 397 | feature_maps = [(tf.ceil(input_size[0]/pow(2., i+3)), tf.ceil(input_size[1]/pow(2., i+3))) for i in range(5)] 398 | 399 | feature_map_list = [(tf.ceil(tf.multiply(tf.to_float(input_size[0]), 1 / pow(2., i + 3))), 400 | tf.ceil(tf.multiply(tf.to_float(input_size[1]), 1 / pow(2., i + 3)))) 401 | for i in range(5)] 402 | 403 | # feature_map_list = [(3,3)] 404 | anchor_generator = create_retinanet_anchors() 405 | # print ('scales = ', anchor_generator._scales) 406 | # print ('aspect ratio = ', anchor_generator._aspect_ratios) 407 | 408 | anchors = anchor_generator.generate(input_size, feature_map_list) 409 | anchors_before_assign = anchors.get() 410 | # return 411 | gt_boxes = box_list.BoxList(tf.convert_to_tensor([[0, 0, 210, 210], [200,203,205,206], [1,1,220,220]], dtype=tf.float32)) 412 | gt_labels = tf.convert_to_tensor([1, 1, 1]) 413 | anchors, box_iou = anchor_assign(anchors, gt_boxes, gt_labels) 414 | # x = tf.convert_to_tensor([[[1,2,3],[3,4,5],[5,6,7]],[[1,2,3],[3,4,5],[5,6,7]]]) 415 | result = anchors.get_field("gt_boxes") 416 | labels = anchors.get_field('gt_labels') 417 | print (labels.get_shape()) 418 | with tf.Session() as sess: 419 | print (sess.run(result).shape) 420 | print (sess.run(labels).shape) 421 | print (sess.run(box_iou)) 422 | # 
print(sess.run(tf.squeeze(tf.where(tf.greater(gt_labels, 1))))) 423 | # print(sess.run(tf.gather(x, tf.convert_to_tensor([0,1]), axis=1))) 424 | sess.close() 425 | 426 | if __name__ == "__main__": 427 | anchor_test() -------------------------------------------------------------------------------- /person_detect/anchor/box_coder.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """box coder following Faster RCNN procedure. 17 | 18 | Faster RCNN box coder follows the coding schema described below: 19 | ty = (y - ya) / ha 20 | tx = (x - xa) / wa 21 | th = log(h / ha) 22 | tw = log(w / wa) 23 | where x, y, w, h denote the box's center coordinates, width and height 24 | respectively. Similarly, xa, ya, wa, ha denote the anchor's center 25 | coordinates, width and height. tx, ty, tw and th denote the anchor-encoded 26 | center, width and height respectively. 27 | 28 | See http://arxiv.org/abs/1506.01497 for details. 29 | """ 30 | 31 | import tensorflow as tf 32 | 33 | from anchor import box_list 34 | 35 | EPSILON = 1e-8 36 | 37 | def get_center_coordinates_and_sizes(box_corners, scope=None): 38 | """Computes the center coordinates, height and width of the boxes. 39 | 40 | Args: 41 | box_corners: Tensor of N boxes 42 | scope: name scope of the function. 43 | 44 | Returns: 45 | a list of 4 1-D tensors [ycenter, xcenter, height, width]. 46 | """ 47 | with tf.name_scope(scope, 'get_center_coordinates_and_sizes'): 48 | ymin, xmin, ymax, xmax = tf.unstack(tf.transpose(box_corners)) 49 | width = xmax - xmin 50 | height = ymax - ymin 51 | ycenter = ymin + height / 2. 52 | xcenter = xmin + width / 2. 53 | return [ycenter, xcenter, height, width] 54 | 55 | 56 | class FasterRCNNBoxCoder(): 57 | """Faster RCNN box coder.""" 58 | 59 | def __init__(self, scale_factors=None): 60 | """Constructor for FasterRcnnBoxCoder. 61 | 62 | Args: 63 | scale_factors: List of 4 positive scalars to scale ty, tx, th and tw. 64 | If set to None, does not perform scaling. For Faster RCNN, 65 | the open-source implementation recommends using [10.0, 10.0, 5.0, 5.0]. 66 | """ 67 | if scale_factors: 68 | assert len(scale_factors) == 4 69 | for scalar in scale_factors: 70 | assert scalar > 0 71 | self._scale_factors = scale_factors 72 | 73 | @property 74 | def code_size(self): 75 | return 4 76 | 77 | def encode(self, boxes, anchors): 78 | """Encode a box collection with respect to anchor collection. 79 | 80 | Args: 81 | boxes: Tensor holding N boxes to be encoded. 82 | anchors: Tensor of corresponding N anchors. 83 | 84 | Returns: 85 | a tensor representing N anchor-encoded boxes of the format 86 | [ty, tx, th, tw]. 87 | """ 88 | # Convert anchors to the center coordinate representation. 
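    # Numeric sketch (added comment; the values are assumed for illustration): for
    # an anchor [0., 0., 100., 100.] and a matched box [10., 20., 90., 80.] (both in
    # [ymin, xmin, ymax, xmax]), the two centers coincide at (50., 50.), h/ha = 0.8
    # and w/wa = 0.6, so [ty, tx, th, tw] = [0., 0., log(0.8), log(0.6)]
    # ~= [0., 0., -0.223, -0.511] before scaling, or [0., 0., -1.116, -2.554] with
    # the commonly used scale_factors [10.0, 10.0, 5.0, 5.0]; decode() inverts this.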
89 | ycenter_a, xcenter_a, ha, wa = get_center_coordinates_and_sizes(anchors) 90 | ycenter, xcenter, h, w = get_center_coordinates_and_sizes(boxes) 91 | # Avoid NaN in division and log below. 92 | ha += EPSILON 93 | wa += EPSILON 94 | h += EPSILON 95 | w += EPSILON 96 | 97 | tx = (xcenter - xcenter_a) / wa 98 | ty = (ycenter - ycenter_a) / ha 99 | tw = tf.log(w / wa) 100 | th = tf.log(h / ha) 101 | # Scales location targets as used in paper for joint training. 102 | if self._scale_factors: 103 | ty *= self._scale_factors[0] 104 | tx *= self._scale_factors[1] 105 | th *= self._scale_factors[2] 106 | tw *= self._scale_factors[3] 107 | return tf.transpose(tf.stack([ty, tx, th, tw])) 108 | 109 | def decode(self, rel_codes, anchors): 110 | """Decode relative codes to boxes. 111 | 112 | Args: 113 | rel_codes: a tensor representing N anchor-encoded boxes. 114 | anchors: BoxList of anchors. 115 | 116 | Returns: 117 | boxes: BoxList holding N bounding boxes. 118 | """ 119 | ycenter_a, xcenter_a, ha, wa = get_center_coordinates_and_sizes(anchors) 120 | 121 | ty, tx, th, tw = tf.unstack(tf.transpose(rel_codes)) 122 | if self._scale_factors: 123 | ty /= self._scale_factors[0] 124 | tx /= self._scale_factors[1] 125 | th /= self._scale_factors[2] 126 | tw /= self._scale_factors[3] 127 | w = tf.exp(tw) * wa 128 | h = tf.exp(th) * ha 129 | ycenter = ty * ha + ycenter_a 130 | xcenter = tx * wa + xcenter_a 131 | ymin = ycenter - h / 2. 132 | xmin = xcenter - w / 2. 133 | ymax = ycenter + h / 2. 134 | xmax = xcenter + w / 2. 135 | return box_list.BoxList(tf.transpose(tf.stack([ymin, xmin, ymax, xmax]))) 136 | -------------------------------------------------------------------------------- /person_detect/anchor/box_list.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Bounding Box List definition. 17 | 18 | BoxList represents a list of bounding boxes as tensorflow 19 | tensors, where each bounding box is represented as a row of 4 numbers, 20 | [y_min, x_min, y_max, x_max]. It is assumed that all bounding boxes 21 | within a given list correspond to a single image. See also 22 | box_list_ops.py for common box related operations (such as area, iou, etc). 23 | 24 | Optionally, users can add additional related fields (such as weights). 25 | We assume the following things to be true about fields: 26 | * they correspond to boxes in the box_list along the 0th dimension 27 | * they have inferrable rank at graph construction time 28 | * all dimensions except for possibly the 0th can be inferred 29 | (i.e., not None) at graph construction time. 
30 | 31 | Some other notes: 32 | * Following tensorflow conventions, we use height, width ordering, 33 | and correspondingly, y,x (or ymin, xmin, ymax, xmax) ordering 34 | * Tensors are always provided as (flat) [N, 4] tensors. 35 | """ 36 | 37 | import tensorflow as tf 38 | 39 | 40 | class BoxList(object): 41 | """Box collection.""" 42 | 43 | def __init__(self, boxes): 44 | """Constructs box collection. 45 | 46 | Args: 47 | boxes: a tensor of shape [N, 4] representing box corners 48 | 49 | Raises: 50 | ValueError: if invalid dimensions for bbox data or if bbox data is not in 51 | float32 format. 52 | """ 53 | if len(boxes.get_shape()) != 2 or boxes.get_shape()[-1] != 4: 54 | raise ValueError('Invalid dimensions for box data.') 55 | if boxes.dtype != tf.float32: 56 | raise ValueError('Invalid tensor type: should be tf.float32') 57 | self.data = {'boxes': boxes} 58 | 59 | def num_boxes(self): 60 | """Returns number of boxes held in collection. 61 | 62 | Returns: 63 | a tensor representing the number of boxes held in the collection. 64 | """ 65 | return tf.shape(self.data['boxes'])[0] 66 | 67 | def num_boxes_static(self): 68 | """Returns number of boxes held in collection. 69 | 70 | This number is inferred at graph construction time rather than run-time. 71 | 72 | Returns: 73 | Number of boxes held in collection (integer) or None if this is not 74 | inferrable at graph construction time. 75 | """ 76 | return self.data['boxes'].get_shape()[0].value 77 | 78 | def get_all_fields(self): 79 | """Returns all fields.""" 80 | return self.data.keys() 81 | 82 | def get_extra_fields(self): 83 | """Returns all non-box fields (i.e., everything not named 'boxes').""" 84 | return [k for k in self.data.keys() if k != 'boxes'] 85 | 86 | def add_field(self, field, field_data): 87 | """Add field to box list. 88 | 89 | This method can be used to add related box data such as 90 | weights/labels, etc. 91 | 92 | Args: 93 | field: a string key to access the data via `get` 94 | field_data: a tensor containing the data to store in the BoxList 95 | """ 96 | self.data[field] = field_data 97 | 98 | def has_field(self, field): 99 | return field in self.data 100 | 101 | def get(self): 102 | """Convenience function for accessing box coordinates. 103 | 104 | Returns: 105 | a tensor with shape [N, 4] representing box coordinates following order [ymin, xmin, ymax, xmax] 106 | """ 107 | return self.get_field('boxes') 108 | 109 | def set(self, boxes): 110 | """Convenience function for setting box coordinates. 111 | 112 | Args: 113 | boxes: a tensor of shape [N, 4] representing box corners 114 | 115 | Raises: 116 | ValueError: if invalid dimensions for bbox data 117 | """ 118 | if len(boxes.get_shape()) != 2 or boxes.get_shape()[-1] != 4: 119 | raise ValueError('Invalid dimensions for box data.') 120 | self.data['boxes'] = boxes 121 | 122 | def get_field(self, field): 123 | """Accesses a box collection and associated fields. 124 | 125 | This function returns specified field with object; if no field is specified, 126 | it returns the box coordinates. 127 | 128 | Args: 129 | field: this optional string parameter can be used to specify 130 | a related field to be accessed. 131 | 132 | Returns: 133 | a tensor representing the box collection or an associated field. 
134 | 135 | Raises: 136 | ValueError: if invalid field 137 | """ 138 | if not self.has_field(field): 139 | raise ValueError('field ' + str(field) + ' does not exist') 140 | return self.data[field] 141 | 142 | def set_field(self, field, value): 143 | """Sets the value of a field. 144 | 145 | Updates the field of a box_list with a given value. 146 | 147 | Args: 148 | field: (string) name of the field to set value. 149 | value: the value to assign to the field. 150 | 151 | Raises: 152 | ValueError: if the box_list does not have specified field. 153 | """ 154 | if not self.has_field(field): 155 | raise ValueError('field %s does not exist' % field) 156 | self.data[field] = value 157 | 158 | def get_center_coordinates_and_sizes(self, scope=None): 159 | """Computes the center coordinates, height and width of the boxes. 160 | 161 | Args: 162 | scope: name scope of the function. 163 | 164 | Returns: 165 | a list of 4 1-D tensors [ycenter, xcenter, height, width]. 166 | """ 167 | with tf.name_scope(scope, 'get_center_coordinates_and_sizes'): 168 | box_corners = self.get() 169 | ymin, xmin, ymax, xmax = tf.unstack(tf.transpose(box_corners)) 170 | width = xmax - xmin 171 | height = ymax - ymin 172 | ycenter = ymin + height / 2. 173 | xcenter = xmin + width / 2. 174 | return [ycenter, xcenter, height, width] 175 | 176 | def transpose_coordinates(self, scope=None): 177 | """Transpose the coordinate representation in a boxlist. 178 | 179 | Args: 180 | scope: name scope of the function. 181 | """ 182 | with tf.name_scope(scope, 'transpose_coordinates'): 183 | y_min, x_min, y_max, x_max = tf.split( 184 | value=self.get(), num_or_size_splits=4, axis=1) 185 | self.set(tf.concat([x_min, y_min, x_max, y_max], 1)) 186 | 187 | def as_tensor_dict(self, fields=None): 188 | """Retrieves specified fields as a dictionary of tensors. 189 | 190 | Args: 191 | fields: (optional) list of fields to return in the dictionary. 192 | If None (default), all fields are returned. 193 | 194 | Returns: 195 | tensor_dict: A dictionary of tensors specified by fields. 196 | 197 | Raises: 198 | ValueError: if specified field is not contained in boxlist. 199 | """ 200 | tensor_dict = {} 201 | if fields is None: 202 | fields = self.get_all_fields() 203 | for field in fields: 204 | if not self.has_field(field): 205 | raise ValueError('boxlist must contain all specified fields') 206 | tensor_dict[field] = self.get_field(field) 207 | return tensor_dict 208 | -------------------------------------------------------------------------------- /person_detect/anchor/shape_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | 16 | """Utils used to manipulate tensor shapes.""" 17 | 18 | import tensorflow as tf 19 | 20 | 21 | def _is_tensor(t): 22 | """Returns a boolean indicating whether the input is a tensor. 23 | 24 | Args: 25 | t: the input to be tested. 26 | 27 | Returns: 28 | a boolean that indicates whether t is a tensor. 29 | """ 30 | return isinstance(t, (tf.Tensor, tf.SparseTensor, tf.Variable)) 31 | 32 | 33 | def _set_dim_0(t, d0): 34 | """Sets the 0-th dimension of the input tensor. 35 | 36 | Args: 37 | t: the input tensor, assuming the rank is at least 1. 38 | d0: an integer indicating the 0-th dimension of the input tensor. 39 | 40 | Returns: 41 | the tensor t with the 0-th dimension set. 42 | """ 43 | t_shape = t.get_shape().as_list() 44 | t_shape[0] = d0 45 | t.set_shape(t_shape) 46 | return t 47 | 48 | 49 | def pad_tensor(t, length): 50 | """Pads the input tensor with 0s along the first dimension up to the length. 51 | 52 | Args: 53 | t: the input tensor, assuming the rank is at least 1. 54 | length: a tensor of shape [1] or an integer, indicating the first dimension 55 | of the input tensor t after padding, assuming length <= t.shape[0]. 56 | 57 | Returns: 58 | padded_t: the padded tensor, whose first dimension is length. If the length 59 | is an integer, the first dimension of padded_t is set to length 60 | statically. 61 | """ 62 | t_rank = tf.rank(t) 63 | t_shape = tf.shape(t) 64 | t_d0 = t_shape[0] 65 | pad_d0 = tf.expand_dims(length - t_d0, 0) 66 | pad_shape = tf.cond( 67 | tf.greater(t_rank, 1), lambda: tf.concat([pad_d0, t_shape[1:]], 0), 68 | lambda: tf.expand_dims(length - t_d0, 0)) 69 | padded_t = tf.concat([t, tf.zeros(pad_shape, dtype=t.dtype)], 0) 70 | if not _is_tensor(length): 71 | padded_t = _set_dim_0(padded_t, length) 72 | return padded_t 73 | 74 | 75 | def clip_tensor(t, length): 76 | """Clips the input tensor along the first dimension up to the length. 77 | 78 | Args: 79 | t: the input tensor, assuming the rank is at least 1. 80 | length: a tensor of shape [1] or an integer, indicating the first dimension 81 | of the input tensor t after clipping, assuming length <= t.shape[0]. 82 | 83 | Returns: 84 | clipped_t: the clipped tensor, whose first dimension is length. If the 85 | length is an integer, the first dimension of clipped_t is set to length 86 | statically. 87 | """ 88 | clipped_t = tf.gather(t, tf.range(length)) 89 | if not _is_tensor(length): 90 | clipped_t = _set_dim_0(clipped_t, length) 91 | return clipped_t 92 | 93 | 94 | def pad_or_clip_tensor(t, length): 95 | """Pad or clip the input tensor along the first dimension. 96 | 97 | Args: 98 | t: the input tensor, assuming the rank is at least 1. 99 | length: a tensor of shape [1] or an integer, indicating the first dimension 100 | of the input tensor t after processing. 101 | 102 | Returns: 103 | processed_t: the processed tensor, whose first dimension is length. If the 104 | length is an integer, the first dimension of the processed tensor is set 105 | to length statically. 106 | """ 107 | processed_t = tf.cond( 108 | tf.greater(tf.shape(t)[0], length), 109 | lambda: clip_tensor(t, length), 110 | lambda: pad_tensor(t, length)) 111 | if not _is_tensor(length): 112 | processed_t = _set_dim_0(processed_t, length) 113 | return processed_t 114 | 115 | 116 | def combined_static_and_dynamic_shape(tensor): 117 | """Returns a list containing static and dynamic values for the dimensions. 
118 | 119 | Returns a list of static and dynamic values for shape dimensions. This is 120 | useful to preserve static shapes when available in reshape operation. 121 | 122 | Args: 123 | tensor: A tensor of any type. 124 | 125 | Returns: 126 | A list of size tensor.shape.ndims containing integers or a scalar tensor. 127 | """ 128 | static_shape = tensor.shape.as_list() 129 | dynamic_shape = tf.shape(tensor) 130 | combined_shape = [] 131 | for index, dim in enumerate(static_shape): 132 | if dim is not None: 133 | combined_shape.append(dim) 134 | else: 135 | combined_shape.append(dynamic_shape[index]) 136 | return combined_shape 137 | -------------------------------------------------------------------------------- /person_detect/person_detect_test.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: person_detect_test.py 7 | @time: 18-10-9 下午5:14 8 | ''' 9 | 10 | # encoding: utf-8 11 | ''' 12 | @author: shiwei hou 13 | @contact: murdockhou@gmail.com 14 | @software: PyCharm 15 | @file: test_model.py 16 | @time: 18-9-21 下午1:31 17 | ''' 18 | 19 | import tensorflow as tf 20 | import numpy as np 21 | import json, cv2, os 22 | import logging 23 | import sys 24 | sys.path.append('../') 25 | 26 | from src.retinanet import RetinaNet 27 | from anchor.anchor_generator import create_retinanet_anchors 28 | from anchor.box_coder import FasterRCNNBoxCoder 29 | from keypoint_subnet.src.backbone import BackBone 30 | from src.draw_box_with_image import get_pred_boxs_with_img 31 | 32 | 33 | FLAGS = tf.flags.FLAGS 34 | tf.flags.DEFINE_string('model', '/media/ulsee/D/retinanet/20181019-2122/model.ckpt-209999', 35 | 'model path you want to test, e.g,. 
(/media/ulsee/D/retinanet/20180920-1337/model.ckpt-xxxx') 36 | tf.flags.DEFINE_string('img_path', '/media/ulsee/E/datasets/test', 37 | 'img path to test model') 38 | tf.flags.DEFINE_string('save_path', '/media/ulsee/E/retinanet/test', 39 | 'model test result to save') 40 | tf.flags.DEFINE_integer(name='batch_size', default=1, help='train batch size number') 41 | tf.flags.DEFINE_integer(name='img_size', default=480, help='net input size') 42 | tf.flags.DEFINE_boolean('is_single_channel', False, 'define the net cls_pred is single channel or not.') 43 | 44 | def draw_boxs(img, boxs, scores): 45 | 46 | for i in range(boxs.shape[0]): 47 | box = boxs[i] 48 | cv2.rectangle(img, (box[1], box[0]), (box[3], box[2]), (255 - 10*i,0,0), 1) 49 | cv2.putText(img, 'person: ' + str(scores[i]), (box[1], box[0]), cv2.FONT_HERSHEY_COMPLEX, 1, (0,0,255), 1) 50 | return img 51 | 52 | def net_(): 53 | if not os.path.exists(FLAGS.save_path): 54 | os.makedirs(FLAGS.save_path) 55 | 56 | graph = tf.Graph() 57 | with graph.as_default(): 58 | 59 | backbone = BackBone(img_size=FLAGS.img_size, batch_size=FLAGS.batch_size, is_training=False) 60 | fpn, _ = backbone.build_fpn_feature() 61 | 62 | net = RetinaNet(fpn=fpn, feature_map_dict=_, batch_size=backbone.batch_size, 63 | num_classes=2, is_training=False) 64 | loc_preds, cls_preds = net.forward() 65 | 66 | #-------------------------------generate anchor----------------------------------------# 67 | input_size = [tf.to_float(FLAGS.img_size), tf.to_float(FLAGS.img_size)] 68 | feature_map_list = [(tf.ceil(tf.multiply(input_size[0], 1 / pow(2., i + 3))), 69 | tf.ceil(tf.multiply(input_size[1], 1 / pow(2., i + 3)))) 70 | for i in range(5)] 71 | anchor_generator = create_retinanet_anchors() 72 | anchor = anchor_generator.generate(input_size, feature_map_list) 73 | 74 | # -------------------------------decode loc_pred---------------------------------------# 75 | current_loc_pred = loc_preds[0] 76 | # 根据anchor将网络的loc输出解码,表示为[ymin, xmin, ymax, xmax] 77 | current_box_list = FasterRCNNBoxCoder().decode(current_loc_pred, anchor.get()) 78 | current_decoded_loc_pred = current_box_list.get() 79 | # -------------------------------------NMS--------------------------------------------# 80 | box_score = tf.nn.softmax(cls_preds[0]) 81 | box_score = box_score[:, 1] 82 | top_k_score, top_k_indices = tf.nn.top_k(box_score, k=60) 83 | decode_boxes = tf.gather(current_decoded_loc_pred, top_k_indices) 84 | valid_indices = tf.image.non_max_suppression(boxes=decode_boxes, scores=top_k_score, max_output_size=6, 85 | iou_threshold=0.5) 86 | final_boxs = tf.gather(decode_boxes, valid_indices) 87 | final_scores = tf.gather(top_k_score, valid_indices) 88 | #----------------------------------------------------------------------------------------# 89 | _box = final_boxs / tf.to_float(FLAGS.img_size) 90 | _box = tf.expand_dims(_box, axis=0) 91 | _img_with_box = tf.image.draw_bounding_boxes(backbone.input_imgs, _box) 92 | #----------------------------------------------------------------------------------------# 93 | init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()) 94 | saver = tf.train.Saver() 95 | 96 | with tf.Session() as sess: 97 | sess.run(init_op) 98 | saver.restore(sess, FLAGS.model) 99 | logging.info('model restore successfully.') 100 | 101 | #----------------load img-----------------# 102 | img_num = 0 103 | for img in os.listdir(FLAGS.img_path): 104 | img_ori = cv2.imread(os.path.join(FLAGS.img_path, img), cv2.IMREAD_COLOR) 105 | img_copy = img_ori.copy() 
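            # Note (added comment): cv2.imread returns None for unreadable or
            # non-image files, so a guard such as "if img_ori is None: continue"
            # right after the imread call above would keep this loop from crashing
            # on stray files in FLAGS.img_path.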
106 | 107 | img_input = cv2.resize(img_copy, (FLAGS.img_size, FLAGS.img_size), 108 | interpolation=cv2.INTER_NEAREST) 109 | # boxs, [n, 4], n = [ymin, xmin, ymax, xmax] 110 | classes, locations, boxs, scores, img_boxs = sess.run([cls_preds, loc_preds, final_boxs, final_scores, _img_with_box], 111 | feed_dict={backbone.input_imgs:[img_input]}) 112 | 113 | #--------------------scale------------------# 114 | factorx = img_ori.shape[1] / img_input.shape[1] 115 | factory = img_ori.shape[0] / img_input.shape[0] 116 | boxs[:,0] = boxs[:,0] * factory 117 | boxs[:,2] = boxs[:,2] * factory 118 | boxs[:,1] = boxs[:,1] * factorx 119 | boxs[:,3] = boxs[:,3] * factorx 120 | #-------------------------------------------# 121 | img_save = draw_boxs(img_ori, boxs, scores) 122 | cv2.imwrite(os.path.join(FLAGS.save_path, img), img_save) 123 | cv2.imwrite(os.path.join(FLAGS.save_path, 'tf' + img), img_boxs[0]) 124 | img_num += 1 125 | logging.info('Testing imgs ... {}'.format(img_num)) 126 | 127 | if img_num > 100: 128 | break 129 | 130 | 131 | if __name__ == '__main__': 132 | logging.basicConfig(level=logging.INFO) 133 | net_() 134 | 135 | -------------------------------------------------------------------------------- /person_detect/person_detect_train.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: person_detect_train.py 7 | @time: 18-9-28 下午2:47 8 | ''' 9 | 10 | import tensorflow as tf 11 | import tensorflow.contrib.slim as slim 12 | import numpy as np 13 | import matplotlib.pyplot as plt 14 | import os, json, cv2, time 15 | from datetime import datetime 16 | 17 | import sys 18 | sys.path.append('../') 19 | 20 | from src.reader import Box_Reader 21 | from src.get_loss import get_loss 22 | from src.draw_box_with_image import get_gt_boxs_with_img, get_pred_boxs_with_img 23 | 24 | from src.retinanet import RetinaNet 25 | from keypoint_subnet.src.backbone import BackBone 26 | 27 | FLAGS = tf.flags.FLAGS 28 | tf.flags.DEFINE_integer('train_nums', 118280, 'train data nums, default: cocotrain2017--118280') 29 | tf.flags.DEFINE_integer('epochs', 10, 'train epochs') 30 | tf.flags.DEFINE_integer('num_classes', 1, '') 31 | tf.flags.DEFINE_integer('batch_size', 3, 'train batch size number') 32 | tf.flags.DEFINE_integer('img_size', 480, 'net input size') 33 | tf.flags.DEFINE_float('learning_rate', 5e-5, 'trian lr') 34 | tf.flags.DEFINE_float('decay_rate', 0.9, 'lr decay rate') 35 | tf.flags.DEFINE_integer('decay_steps', 10000, 'lr decay steps') 36 | tf.flags.DEFINE_string('pretrained_resnet', '/media/ulsee/D/keypoint_subnet/20181015-1711/model.ckpt-64999/model.ckpt-339999', 37 | 'keypoint subnet pretrained model') 38 | tf.flags.DEFINE_boolean('is_training', True, '') 39 | tf.flags.DEFINE_string('checkpoint_path', '/media/ulsee/D/retinanet', 'path to save training model') 40 | tf.flags.DEFINE_string('tfrecord_file', '/media/ulsee/E/person_subnet_tfrecord/coco-instance-5.tfrecord', '') 41 | tf.flags.DEFINE_string('finetuning',None, 42 | 'folder of saved model that you wish to continue training or testing(e.g. 
20180828-1803/model.ckpt-xxx), default: None') 43 | 44 | def person_detect_train(): 45 | os.environ['CUDA_VISIBLE_DEVICES'] = '0' 46 | 47 | # -------------------define where checkpoint path is-------------------------# 48 | current_time = datetime.now().strftime('%Y%m%d-%H%M') 49 | if FLAGS.finetuning is None: 50 | checkpoints_dir = os.path.join(FLAGS.checkpoint_path, current_time) 51 | if not os.path.exists(checkpoints_dir): 52 | try: 53 | os.makedirs(checkpoints_dir) 54 | except: 55 | pass 56 | else: 57 | checkpoints_dir = os.path.join(FLAGS.checkpoint_path, FLAGS.finetuning) 58 | print('checkpoints_dir == {}'.format(checkpoints_dir)) 59 | 60 | # ------------------------------define Graph --------------------------------# 61 | tf.reset_default_graph() 62 | graph = tf.Graph() 63 | with graph.as_default(): 64 | #-----------------------------tf.placeholder-----------------------------# 65 | gt_boxs_placeholder = tf.placeholder(tf.float32, shape=[FLAGS.batch_size, 30, 4]) 66 | gt_labels_placeholder = tf.placeholder(tf.int64, shape=[FLAGS.batch_size, 30,]) 67 | #-------------------------------reader-----------------------------------# 68 | reader = Box_Reader(tfrecord_file=FLAGS.tfrecord_file, img_size=FLAGS.img_size, batch_size=FLAGS.batch_size, epochs=FLAGS.epochs) 69 | img_batch, img_ids, img_height_batch, img_width_batch, gt_boxs, gt_labels = reader.feed() 70 | #--------------------------------net-------------------------------------# 71 | backbone = BackBone(img_size=FLAGS.img_size, batch_size=FLAGS.batch_size, is_training=FLAGS.is_training) 72 | fpn, _ = backbone.build_fpn_feature() 73 | net = RetinaNet(fpn=fpn, feature_map_dict=_, batch_size=backbone.batch_size, 74 | num_classes=FLAGS.num_classes+1, is_training=FLAGS.is_training) 75 | loc_pred, cls_pred = net.forward() 76 | #---------------------------------loss-----------------------------------# 77 | loss, decoded_loc_pred = get_loss(img_size=FLAGS.img_size, batch_size=FLAGS.batch_size, 78 | gt_boxes=gt_boxs_placeholder, loc_pred=loc_pred, 79 | gt_labels=gt_labels_placeholder, cls_pred=cls_pred, 80 | num_classes=FLAGS.num_classes, is_training=FLAGS.is_training) 81 | # -----------------------------learning rate-------------------------------# 82 | global_step = tf.Variable(0, trainable=False) 83 | learning_rate = tf.train.exponential_decay(FLAGS.learning_rate, global_step=global_step, 84 | decay_steps=FLAGS.decay_steps, decay_rate=FLAGS.decay_rate, 85 | staircase=True) 86 | opt = tf.train.AdamOptimizer(learning_rate, epsilon=1e-5) 87 | update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) 88 | with tf.control_dependencies(update_ops): 89 | train_op = opt.minimize(loss, global_step=global_step) 90 | #--------------------------------saver-----------------------------------# 91 | res50_var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='resnet_v2_50') 92 | restore_res50 = tf.train.Saver(var_list=res50_var_list) 93 | fpn_var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='build_fpn_feature') 94 | 95 | global_list = tf.global_variables() 96 | bn_moving_vars = [g for g in global_list if 'moving_mean' in g.name] 97 | bn_moving_vars += [g for g in global_list if 'moving_variance' in g.name] 98 | restore_share = tf.train.Saver(var_list=(res50_var_list+fpn_var_list+bn_moving_vars)) 99 | 100 | var_list = tf.trainable_variables() 101 | retina_var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='retina_net') 102 | saver = 
tf.train.Saver(var_list=(res50_var_list+fpn_var_list+bn_moving_vars+retina_var_list), max_to_keep=10)
103 | saver_alter = tf.train.Saver(max_to_keep=5)
104 | 
105 | #-------------------------------tf summary--------------------------------#
106 | gt_img_batch_with_box = get_gt_boxs_with_img(imgs=backbone.input_imgs, gt_boxs=gt_boxs_placeholder, gt_labels=gt_labels_placeholder,
107 | batch_size=FLAGS.batch_size, img_size=FLAGS.img_size)
108 | pred_img_batch_with_box = get_pred_boxs_with_img(imgs=backbone.input_imgs, decoded_boxs=decoded_loc_pred, cls_pred=cls_pred,
109 | batch_size=FLAGS.batch_size, img_size=FLAGS.img_size)
110 | gt_img_box_placeholder = tf.placeholder(tf.float32,
111 | shape=(FLAGS.batch_size, FLAGS.img_size, FLAGS.img_size, 3))
112 | pred_img_box_placeholder = tf.placeholder(tf.float32,
113 | shape=(FLAGS.batch_size, FLAGS.img_size, FLAGS.img_size, 3))
114 | img_ids_batch_placeholder = tf.placeholder(tf.string, shape=[FLAGS.batch_size,])
115 | tf.summary.text('img_ids', img_ids_batch_placeholder)
116 | tf.summary.image('gt_bbox', gt_img_box_placeholder, max_outputs=2)
117 | tf.summary.image('Pre_bbox', pred_img_box_placeholder, max_outputs=2)
118 | tf.summary.scalar('lr', learning_rate)
119 | tf.summary.scalar('loss', loss)
120 | 
121 | summary_op = tf.summary.merge_all()
122 | summary_writer = tf.summary.FileWriter(checkpoints_dir, graph)
123 | 
124 | #----------------------------------init-----------------------------------#
125 | init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
126 | config = tf.ConfigProto()
127 | # config.gpu_options.per_process_gpu_memory_fraction = 0.7
128 | # sudo rm -f ~/.nv
129 | config.gpu_options.allow_growth = True
130 | step = 0
131 | #---------------------------------train------------------------------------#
132 | with tf.Session(graph=graph, config=config) as sess:
133 | sess.run(init_op)
134 | 
135 | if FLAGS.finetuning is not None:
136 | saver.restore(sess, checkpoints_dir)
137 | print('Successfully load finetuning model.')
138 | print('Global_step == {}, Step == {}'.format(sess.run(global_step), step))
139 | step = sess.run(global_step)
140 | 
141 | else:
142 | restore_share.restore(sess, FLAGS.pretrained_resnet)
143 | print ('Successfully load pre_trained model.')
144 | 
145 | coord = tf.train.Coordinator()
146 | threads = tf.train.start_queue_runners(sess=sess, coord=coord)
147 | 
148 | start_time = time.time()
149 | try:
150 | while not coord.should_stop():
151 | imgs, ids, heights, widths, boxes, labels = sess.run([img_batch, img_ids, img_height_batch, img_width_batch, gt_boxs, gt_labels])
152 | 
153 | gt_img_box, pre_img_box, \
154 | total_loss, box_pred_list, classes_pred, \
155 | _, lr= sess.run(
156 | [gt_img_batch_with_box, pred_img_batch_with_box,
157 | loss, decoded_loc_pred, cls_pred,
158 | train_op, learning_rate
159 | ], feed_dict={
160 | backbone.input_imgs: imgs,
161 | gt_boxs_placeholder: boxes,
162 | gt_labels_placeholder:labels,
163 | img_ids_batch_placeholder:ids
164 | }
165 | )
166 | # cur_time = time.time()
167 | # print ('sess run spend {}'.format(cur_time-pre_time))
168 | # pre_time = cur_time
169 | 
170 | #-------------------summary------------------------#
171 | # gt_img_box_placeholder: gt_img_box,
172 | merge_op = sess.run(summary_op,feed_dict={backbone.input_imgs: imgs,
173 | gt_boxs_placeholder: boxes,
174 | gt_labels_placeholder:labels,
175 | pred_img_box_placeholder:pre_img_box,
176 | gt_img_box_placeholder: gt_img_box,
177 | img_ids_batch_placeholder: ids}) 178 |
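# Note: because the image summaries read from gt_img_box_placeholder / pred_img_box_placeholder,
# summary_op is evaluated in a second sess.run here, which re-executes the backbone forward pass
# and the loss with the same feed. One possible simplification (a sketch only, not part of the
# original script) would be to point tf.summary.image directly at the drawn-box tensors, e.g.
#     tf.summary.image('gt_bbox', gt_img_batch_with_box, max_outputs=2)
#     tf.summary.image('Pre_bbox', pred_img_batch_with_box, max_outputs=2)
# and then fetch summary_op together with train_op in the first sess.run, avoiding the extra pass.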
summary_writer.add_summary(merge_op, step) 179 | summary_writer.flush() 180 | 181 | # cur_time = time.time() 182 | # print('merge op spend {}'.format(cur_time - pre_time)) 183 | # pre_time = cur_time 184 | 185 | if (step+1) % 10 == 0: 186 | cur_time = time.time() 187 | print('Step = {}, Total loss = {}, time spend = {}'.format(step, total_loss, cur_time-start_time)) 188 | start_time = cur_time 189 | 190 | if (step+1) % 2000 == 0: 191 | save_path = saver.save(sess, checkpoints_dir + '/model.ckpt', global_step=step) 192 | print('Model saved in file: %s' % save_path) 193 | save_path_alter = saver_alter.save(sess, checkpoints_dir + '/model_alter.ckpt', 194 | global_step=step) 195 | 196 | step += 1 197 | # print (step) 198 | # if step == 10: 199 | # break 200 | 201 | except KeyboardInterrupt: 202 | print ('Interrupted') 203 | coord.request_stop() 204 | 205 | except Exception as e: 206 | coord.request_stop(e) 207 | 208 | finally: 209 | save_path = saver.save(sess, checkpoints_dir + '/model.ckpt', global_step=step) 210 | print ('Model saved in file: %s' % save_path) 211 | save_path_alter = saver_alter.save(sess, checkpoints_dir + '/model_alter.ckpt', global_step=step) 212 | # When done, ask the threads to stop. 213 | coord.request_stop() 214 | coord.join(threads) 215 | 216 | 217 | 218 | 219 | if __name__ == '__main__': 220 | person_detect_train() -------------------------------------------------------------------------------- /person_detect/src/__init__.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: __init__.py 7 | @time: 18-9-28 下午2:45 8 | ''' -------------------------------------------------------------------------------- /person_detect/src/backbone.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: backbone.py 7 | @time: 18-9-28 上午11:03 8 | ''' 9 | 10 | from __future__ import absolute_import, division, print_function 11 | 12 | import tensorflow as tf 13 | 14 | from tensorflow.contrib.slim import nets 15 | from tensorflow.contrib.layers.python.layers import utils 16 | import tensorflow.contrib.slim as slim 17 | 18 | class BackBone(object): 19 | def __init__(self, inputs, img_size, batch_size, is_training=True): 20 | self.img_size = img_size 21 | self.batch_size = batch_size 22 | self.input_imgs = inputs 23 | self.is_training = is_training 24 | self.stddev = 0.01 25 | 26 | def get_feature_map(self): 27 | #-------------------resent---------------------# 28 | arg_scope = nets.resnet_v2.resnet_arg_scope() 29 | with slim. 
arg_scope(arg_scope): 30 | out, end_points = nets.resnet_v2.resnet_v2_50(inputs=self.input_imgs, num_classes=None, is_training=self.is_training) 31 | #---------------feature map dict---------------# 32 | feature_map_dict = { 33 | 'C2': end_points['resnet_v2_50/block1/unit_2/bottleneck_v2'], # input_size / 4 34 | 'C3': end_points['resnet_v2_50/block2/unit_3/bottleneck_v2'], # input_size / 8 35 | 'C4': end_points['resnet_v2_50/block3/unit_4/bottleneck_v2'], # input_size / 16 36 | 'C5': end_points['resnet_v2_50/block4'] # input_size / 32 37 | } 38 | return feature_map_dict 39 | 40 | def build_fpn_feature(self): 41 | feature_pyramid = {} 42 | feature_map_dict = self.get_feature_map() 43 | #------------------------------------------build fpn-------------------------------------------# 44 | with tf.variable_scope('build_fpn_feature'): 45 | with slim.arg_scope([slim.conv2d], weights_initializer=tf.random_normal_initializer(stddev=self.stddev)): 46 | feature_pyramid['P5'] = slim.conv2d(feature_map_dict['C5'], num_outputs=256, kernel_size=[1, 1], stride=1, 47 | scope='build_fpn_P5') 48 | 49 | #------------------ top-down pathway and lateral connections--------------------------# 50 | for layer in range(4, 1, -1): 51 | p = feature_pyramid['P' + str(layer + 1)] 52 | c = feature_map_dict['C' + str(layer)] 53 | 54 | #---------------------------------- upsample p -----------------------------------# 55 | up_shape = c.get_shape() 56 | up_sample = tf.image.resize_nearest_neighbor(p, [up_shape[2], up_shape[2]], 57 | name='upsampling_fpn_P%d' % layer) 58 | 59 | #----------------------------------- 1x1 conv ------------------------------------# 60 | c = slim.conv2d(c, num_outputs=256, kernel_size=[1, 1], stride=1, scope='fpn_1x1conv_C%d' % layer) 61 | p = up_sample + c 62 | 63 | #----------------------reduce aliasing effect of upsampling ----------------------# 64 | #---------------(in the third last paragraph, Section 3, Paper FPN)---------------# 65 | p = slim.conv2d(p, num_outputs=256, kernel_size=[3, 3], stride=1, padding='SAME', 66 | scope='build_fpn_P%d' % layer) 67 | 68 | feature_pyramid['P' + str(layer)] = p 69 | 70 | return feature_pyramid, feature_map_dict 71 | 72 | -------------------------------------------------------------------------------- /person_detect/src/convert_tfrecord.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: convert_tfrecord.py.py 7 | @time: 18-9-28 下午6:55 8 | ''' 9 | import tensorflow as tf 10 | import cv2, os, json 11 | import numpy as np 12 | import matplotlib.pyplot as plt 13 | 14 | FLAGS = tf.flags.FLAGS 15 | tf.flags.DEFINE_string('json_file', '/media/ulsee/E/datasets/coco/annotations2017/coco-instance-imgid-bbox.json', '') 16 | tf.flags.DEFINE_string('img_path', '/media/ulsee/E/datasets/coco/cocotrain2017', 'image dataset path need to convert to tfrecord') 17 | tf.flags.DEFINE_string('tfrecord_file', '/media/ulsee/E/person_subnet_tfrecord/coco-instance-with-ids.tfrecord', 'tfrecord file') 18 | 19 | def _int64_feature(value): 20 | ''' Wrapper for inserting int64 feature into Example proto''' 21 | if not isinstance(value, list): 22 | value = [value] 23 | return tf.train.Feature(int64_list = tf.train.Int64List(value=value)) 24 | 25 | def _float_feature(value): 26 | ''' Wrapper for inserting float feature into Example proto''' 27 | if not isinstance(value, list): 28 | value = [value] 29 | return 
tf.train.Feature(float_list=tf.train.FloatList(value=value)) 30 | 31 | def _bytes_feature(value): 32 | ''' Wrapper for inserting bytes feature into Example proto''' 33 | if not isinstance(value, list): 34 | value = [value] 35 | return tf.train.Feature(bytes_list = tf.train.BytesList(value=value)) 36 | 37 | def _string_feature(value): 38 | ''' Wrapper for inserting string (actually bytes) feature into Example proto''' 39 | if not isinstance(value, list): 40 | value = [value] 41 | return tf.train.Feature(bytes_list = tf.train.BytesList(value=value)) 42 | 43 | def _process_one_image(img_file, bboxes, person_id=1): 44 | ''' 45 | 46 | :param img_file: the img file that will be read and processing 47 | :param bboxes: a list, contains box and crossponding label, format is [xmin, ymin, w, h, category_id]*n, n is the number of annotationed person 48 | :param person_id: the category_id that person is 49 | :return: 50 | img_data: binary image file that reading from tf.gfile.FastGFile 51 | img_shape: [height, widht, channels] of img 52 | bboxs: a list, [ymin, xmin, ymax, xmax] * n 53 | labels: a list, [person_id,], size = n 54 | ''' 55 | 56 | # read img data 57 | img_data = tf.gfile.FastGFile(img_file, 'rb').read() 58 | img_shape = cv2.imread(img_file).shape 59 | 60 | # deal with bboxes 61 | bboxs = [] 62 | labels = [] 63 | box_num = len(bboxes) // 5 64 | 65 | for i in range(box_num): 66 | if bboxes[i*5+4] != person_id: 67 | continue 68 | box = bboxes[i*5:i*5+4] 69 | label = bboxes[i*5+4] 70 | box[2] += box[0] 71 | box[3] += box[1] 72 | #----convert box format [xmin, ymin, xmax, ymax] to [ymin, xmin, ymax, xmax]-------# 73 | tmp = box[0] 74 | box[0] = box[1] 75 | box[1] = tmp 76 | tmp = box[2] 77 | box[2] = box[3] 78 | box[3] = tmp 79 | #-----------------------------------------------------------------------------------# 80 | bboxs += box 81 | labels.append(label) 82 | 83 | return img_data, img_shape, bboxs, labels 84 | 85 | def convert_to_tfrecord(json_file, tfrecord_file): 86 | ''' 87 | especially reading coco-json file 88 | 89 | :param json_file: prepared json_file that contains coco dataset person annotations, the format is a map, which key is img_name without suffix, and value is 90 | a list contains person_num * 5 elements, the each five elements is like [xmin, ymin, w, h, category_id]. 
91 | :param tfrecord_file: the tfrecord file that we save 92 | :return: 93 | ''' 94 | 95 | tfrecord_dir = os.path.dirname(tfrecord_file) 96 | if not os.path.exists(tfrecord_dir): 97 | os.makedirs(tfrecord_dir) 98 | 99 | writer = tf.python_io.TFRecordWriter(tfrecord_file) 100 | f = open(json_file, encoding='utf-8') 101 | labels = json.load(f) 102 | 103 | total_img_nums = len(labels) 104 | count = 0 105 | count_zero = 0 106 | for key, value in labels.items(): 107 | img_name = key + '.jpg' 108 | img_data, shape, bboxs, labels = _process_one_image(os.path.join(FLAGS.img_path, img_name), value) 109 | if not bboxs: 110 | count_zero += 1 111 | continue 112 | 113 | # ----if len(bboxs)//4 < n (set n = 20), add zeros to make len(bboxs)//4 == n------------# 114 | n = 30 115 | if len(bboxs) < n * 4: 116 | last = n * 4 - len(bboxs) 117 | bboxs += list(np.zeros(last, dtype=np.float32)) 118 | labels += list(np.zeros(last // 4, dtype=np.int32)) 119 | else: 120 | bboxs = bboxs[:n * 4] 121 | labels = labels[:n] 122 | # ----------------------------------------------------------------------------------------# 123 | 124 | img_format = b'JPEG' 125 | example = tf.train.Example(features=tf.train.Features( 126 | feature = { 127 | 'image':_bytes_feature(img_data), 128 | 'id':_string_feature(bytes(key, encoding='utf-8')), 129 | 'height':_int64_feature(shape[0]), 130 | 'width':_int64_feature(shape[1]), 131 | 'format':_bytes_feature(img_format), 132 | 'channel':_int64_feature(shape[2]), 133 | 'boxes':_float_feature(bboxs), # [xmin, ymin, xmax, ymax] * 30 134 | 'labels':_int64_feature(labels) 135 | } 136 | )) 137 | writer.write(example.SerializeToString()) 138 | count += 1 139 | 140 | # if count == 5: 141 | # break 142 | 143 | if count % 1000 == 0: 144 | print ('Processing {}/{}'.format(count, total_img_nums)) 145 | print ('No human box imgs nums {}/{}'.format(count_zero, total_img_nums)) 146 | print('Converting tfrecord done.') 147 | writer.close() 148 | 149 | def convert_ai_challenger_tfrecord(tfrecord_file, json_file = '/media/ulsee/E/datasets/ai_challenger_keypoint_train_20170909/keypoint_train_annotations_20170909.json'): 150 | f = open(json_file, encoding='utf-8') 151 | labels = json.load(f) 152 | img_path = '/media/ulsee/E/datasets/ai_challenger_keypoint_train_20170909/keypoint_train_images_20170902' 153 | 154 | tfrecord_dir = os.path.dirname(tfrecord_file) 155 | if not os.path.exists(tfrecord_dir): 156 | os.makedirs(tfrecord_dir) 157 | 158 | writer = tf.python_io.TFRecordWriter(tfrecord_file) 159 | total_img_nums = len(labels) 160 | count = 0 161 | count_zero = 0 162 | 163 | for label in labels: 164 | img_file = os.path.join(img_path, label['image_id'] + '.jpg') 165 | bbox = [] 166 | category_id = [] 167 | annotations = label['human_annotations'] 168 | for key, value in annotations.items(): 169 | #------convert box format [xmin, ymin, xmax, ymax] into [ymin, xmin, ymax, xmax]-------# 170 | if len(value) != 4: 171 | raise ValueError('the box size must be equal to 4!!!!') 172 | tmp = value[0] 173 | value[0] = value[1] 174 | value[1] = tmp 175 | tmp = value[2] 176 | value[2] = value[3] 177 | value[3] = tmp 178 | #--------------------------------------------------------------------------------------# 179 | bbox += value 180 | category_id.append(1) 181 | if not bbox: 182 | count_zero += 1 183 | continue 184 | 185 | # ----if len(bboxs)//4 < n (set n = 20), add zeros to make len(bboxs)//4 == n------------# 186 | n = 30 187 | if len(bbox) < n * 4: 188 | last = n * 4 - len(bbox) 189 | bbox += list(np.zeros(last, 
dtype=np.float32)) 190 | category_id += list(np.zeros(last // 4, dtype=np.int32)) 191 | else: 192 | bbox = bbox[:n * 4] 193 | category_id = category_id[:n] 194 | # ----------------------------------------------------------------------------------------# 195 | 196 | img_data = tf.gfile.FastGFile(img_file, 'rb').read() 197 | img_dat = cv2.imread(img_file, cv2.IMREAD_COLOR) 198 | shape = img_dat.shape 199 | img_format = b'JPEG' 200 | 201 | # add to tfrecord 202 | example = tf.train.Example(features=tf.train.Features( 203 | feature={ 204 | 'image': _bytes_feature(img_data), 205 | 'height': _int64_feature(shape[0]), 206 | 'width': _int64_feature(shape[1]), 207 | 'format': _bytes_feature(img_format), 208 | 'channel': _int64_feature(shape[2]), 209 | 'boxes': _float_feature(bbox), # [xmin, ymin, xmax, ymax] * n 210 | 'labels': _int64_feature(category_id) 211 | } 212 | )) 213 | writer.write(example.SerializeToString()) 214 | count += 1 215 | 216 | # if count == 10: 217 | # break 218 | if count % 1000 == 0: 219 | print ('Processing {}/{}'.format(count, total_img_nums)) 220 | writer.close() 221 | print('Zeros box img nums {}'.format(count_zero)) 222 | print('Convert tfrecord done.') 223 | 224 | if __name__ == '__main__': 225 | 226 | convert_to_tfrecord(FLAGS.json_file, FLAGS.tfrecord_file) 227 | # convert_ai_challenger_tfrecord(FLAGS.tfrecord_file) -------------------------------------------------------------------------------- /person_detect/src/draw_box_with_image.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: draw_box_with_image.py 7 | @time: 18-9-28 下午3:07 8 | ''' 9 | 10 | import tensorflow as tf 11 | import numpy as np 12 | from src.get_loss import deal_zeros_box 13 | 14 | def get_gt_boxs_with_img(imgs, gt_boxs, gt_labels, batch_size, img_size): 15 | gt_img_batch_with_box = [] 16 | for i in range(batch_size): 17 | 18 | # remove zeros box [0,0,0,0] 19 | current_loc = gt_boxs[i] 20 | current_cls = gt_labels[i] 21 | current_loc, current_cls = deal_zeros_box(current_loc, current_cls) 22 | current_gt_box = current_loc / tf.to_float(img_size) 23 | 24 | # draw box on single image 25 | img_batch_i = imgs[i] 26 | output_box_batch_i = tf.expand_dims(current_gt_box, axis=0) 27 | img_batch_i = tf.expand_dims(img_batch_i, axis=0) 28 | 29 | img_batch_i_with_box = tf.image.draw_bounding_boxes(images=img_batch_i, boxes=output_box_batch_i) 30 | gt_img_batch_with_box.append(img_batch_i_with_box) 31 | 32 | gt_img_batch_with_box = tf.reshape(tf.concat(gt_img_batch_with_box, axis=0), 33 | shape=(batch_size, img_size, img_size, 3)) 34 | return gt_img_batch_with_box 35 | 36 | def get_pred_boxs_with_img(imgs, decoded_boxs, cls_pred, batch_size, img_size): 37 | 38 | batch_output_box = [] 39 | batch_output_box_score = [] 40 | for i in range(batch_size): 41 | box_score = tf.nn.softmax(cls_pred[i]) 42 | box_score = box_score[:, 1] 43 | top_k_score, top_k_indices = tf.nn.top_k(box_score, k=60) 44 | decode_boxes = tf.gather(decoded_boxs[i], top_k_indices) 45 | valid_indices = tf.image.non_max_suppression(boxes=decode_boxes, scores=top_k_score, max_output_size=6, 46 | iou_threshold=0.5) 47 | output_loc = tf.gather(decode_boxes, valid_indices) 48 | output_score = tf.gather(top_k_score, valid_indices) 49 | batch_output_box.append(output_loc) 50 | batch_output_box_score.append(output_score) 51 | 52 | pred_img_batch_with_box = [] 53 | for i in range(batch_size): 54 | 
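# tf.image.draw_bounding_boxes expects boxes as [ymin, xmin, ymax, xmax] normalized to [0, 1],
# while the boxes kept by the top-k (k=60) + NMS (max 6 boxes, IoU 0.5) selection above are
# still in pixel units (0..img_size), so each batch element is divided by img_size before drawing.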
output_box_batch_i = batch_output_box[i] / tf.to_float(img_size) 55 | img_batch_i = imgs[i] 56 | output_box_batch_i = tf.expand_dims(output_box_batch_i, axis=0) 57 | img_batch_i = tf.expand_dims(img_batch_i, axis=0) 58 | img_batch_i_with_box = tf.image.draw_bounding_boxes(images=img_batch_i, boxes=output_box_batch_i) 59 | pred_img_batch_with_box.append(img_batch_i_with_box) 60 | 61 | pred_img_batch_with_box = tf.reshape(tf.concat(pred_img_batch_with_box, axis=0), 62 | shape=(batch_size, img_size, img_size, 3)) 63 | return pred_img_batch_with_box 64 | -------------------------------------------------------------------------------- /person_detect/src/get_loss.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: get_loss.py 7 | @time: 18-9-28 下午2:53 8 | ''' 9 | 10 | import tensorflow as tf 11 | import numpy as np 12 | 13 | from anchor.anchor_generator import create_retinanet_anchors, anchor_assign 14 | from anchor.box_coder import FasterRCNNBoxCoder 15 | from anchor.box_list import BoxList 16 | 17 | from src.loss import focal_loss, regression_loss 18 | 19 | def get_loss(img_size, batch_size, gt_boxes, loc_pred, gt_labels, cls_pred, num_classes=1, is_training=True): 20 | 21 | #--------------------based gt get anchors_list------------------------# 22 | anchors_list = get_inputs(img_size=img_size, batch_size=batch_size, gt_boxes=gt_boxes, 23 | gt_labels=gt_labels, is_training=is_training) 24 | #-----------------------------net-------------------------------------# 25 | # backbone = BackBone(img_size, batch_size, is_training=is_training) 26 | # fpn = backbone.build_fpn_feature() 27 | # net = RetinaNet(fpn=fpn, batch_size=batch_size, num_classes=num_classes, is_training=is_training) 28 | # loc_pred, cls_pred = net.forward() 29 | 30 | # ----------------------decode pred_boxs-----------------------# 31 | # ----convert [ty, tx, th, tw] to [ymin, xmin, ymax, xmax]-----# 32 | decoded_loc_pred = [] 33 | for i in range(batch_size): 34 | anchor = anchors_list[i] 35 | current_loc_pred = loc_pred[i] 36 | # 根据anchor将网络的loc输出解码,表示为[ymin, xmin, ymax, xmax] 37 | current_box_list = FasterRCNNBoxCoder().decode(current_loc_pred, anchor.get()) 38 | current_decoded_loc_pred = current_box_list.get() 39 | decoded_loc_pred.append(current_decoded_loc_pred) 40 | 41 | #---------get num of anchor overlapped with ground truth box------------# 42 | cls_gt = [anchor.get_field('gt_labels') for anchor in 43 | anchors_list] # a list, contains batchs number tensor, each tensor is 1D contains #anchors label 44 | loc_gt = [anchor.get_field('gt_encoded_boxes') for anchor in 45 | anchors_list] # a list, contains batchs number tensor, each tensor (gt_encoded_boxes) shape is [-1, 4], 46 | # the format of gt_encoded_boxes is [ymin, xmin, ymax, xmax] 47 | #--------------------------calculate loss-------------------------------# 48 | total_loss = 0 49 | for i in range(batch_size): 50 | single_cls_gt = cls_gt[i] # [#anchors,] 51 | single_loc_gt = loc_gt[i] # [#anchors,4] 52 | single_cls_pred = cls_pred[i] # [#anchors,2] 53 | single_loc_pred = loc_pred[i] # [#anchors,4] 54 | 55 | # print(single_cls_pred.get_shape(), single_cls_gt.get_shape()) 56 | 57 | # focal loss, remove anchor which label equal to -1 58 | # 因为前面生成的gt_labels,会有的anchor在iou [0.4,0.5)之间,标签为-1,要忽略掉,所以要先把这些去掉 59 | valid_anchor_indices = tf.where(tf.greater_equal(single_cls_gt, 0)) 60 | valid_cls_gt = 
tf.gather_nd(single_cls_gt, valid_anchor_indices) 61 | valid_cls_pred = tf.gather_nd(single_cls_pred, valid_anchor_indices) 62 | cls_gt_onehot = tf.one_hot(valid_cls_gt, depth=num_classes + 1) # [#anchors, depth] 63 | floss = focal_loss(cls_gt_onehot, valid_cls_pred) 64 | 65 | # location regression loss, remove background which label == 0 66 | valid_cls_indices = tf.where(tf.greater(single_cls_gt, 0)) 67 | valid_loc_gt = tf.reshape(tf.gather_nd(single_loc_gt, valid_cls_indices), shape=(-1, 4)) 68 | valid_loc_preds = tf.reshape(tf.gather_nd(single_loc_pred, valid_cls_indices), shape=(-1, 4)) 69 | loc_loss = regression_loss(valid_loc_preds, valid_loc_gt) 70 | 71 | 72 | total_loss = total_loss + tf.reduce_sum(floss) + tf.reduce_sum(loc_loss) 73 | 74 | loss = tf.to_float(total_loss) / tf.to_float(batch_size) 75 | return loss, decoded_loc_pred 76 | 77 | 78 | def get_inputs(img_size, batch_size, gt_boxes, gt_labels, is_training=True): 79 | loc_gt = gt_boxes 80 | cls_gt = gt_labels #[batch_size, #gt_anchors_number] 81 | 82 | # print (loc_gt.get_shape(), cls_gt.get_shape()) 83 | # get anchors 84 | anchors_list = [] 85 | for i in range(batch_size): 86 | input_size = [tf.to_float(img_size), tf.to_float(img_size)] 87 | feature_map_list = [(tf.ceil(tf.multiply(input_size[0], 1/pow(2., i+3))), 88 | tf.ceil(tf.multiply(input_size[1], 1/pow(2., i+3)))) 89 | for i in range(5)] 90 | anchor_generator = create_retinanet_anchors() 91 | anchor = anchor_generator.generate(input_size, feature_map_list) 92 | 93 | current_loc_gt = loc_gt[i] #[#gt_anchors_number, 4] 94 | current_cls_gt = cls_gt[i] #[#gt_anchors_number] 95 | # print('Before remove zeros boxs, loc_gt shape = {}, cls_gt shape = {}'.format(current_loc_gt.get_shape(), current_cls_gt.get_shape())) 96 | current_loc_gt, current_cls_gt = deal_zeros_box(current_loc_gt, current_cls_gt) 97 | # print('After remove zeros boxs, loc_gt shape = {}, cls_gt shape = {}'.format(current_loc_gt.get_shape(), current_cls_gt.get_shape())) 98 | 99 | 100 | anchor = anchor_assign(anchor, gt_boxes=BoxList(current_loc_gt), gt_labels=current_cls_gt, is_training=is_training) 101 | 102 | # encode anchor boxes 103 | gt_boxes = anchor.get_field('gt_boxes') 104 | 105 | encoded_gt_boxes = FasterRCNNBoxCoder().encode(gt_boxes, anchor.get()) 106 | anchor.add_field('gt_encoded_boxes', encoded_gt_boxes) 107 | anchors_list.append(anchor) 108 | 109 | return anchors_list 110 | 111 | def deal_zeros_box(gt_boxes, gt_labels): 112 | ''' 113 | can not do anything, because one dim in gt_boxes and gt_labels is ? 114 | update: now, we set ? 
= 30 in tfrecord file, so we can deal with zeros boxs 115 | :param gt_boxes: [#boxs, 4] 116 | :param gt_labels: [#boxs] 117 | :return: 118 | ''' 119 | #------------------------deal boxs--------------------------------------------------------# 120 | gt_boxs = tf.unstack(gt_boxes, axis=0) # gt_boxs, a list contains nums boxs which has shape(4,) 121 | gt_box = tf.expand_dims(gt_boxes[0], axis=0) 122 | is_first = True # the first box is always non-zero box 123 | 124 | for box in gt_boxs: 125 | if is_first: 126 | is_first = False 127 | continue 128 | gt_box = tf.cond(tf.equal(tf.reduce_sum(box), tf.reduce_sum(tf.zeros_like(box))), lambda: gt_box, 129 | lambda: tf.concat([gt_box, tf.expand_dims(box, axis=0)], axis=0)) 130 | 131 | 132 | #---------------------------deal labels--------------------------------------------------# 133 | gt_labels = tf.unstack(gt_labels, axis=0) 134 | gt_label = tf.expand_dims(gt_labels[0], axis=0) 135 | is_first = True # the first label is always non-background 136 | for label in gt_labels: 137 | if is_first: 138 | is_first = False 139 | continue 140 | gt_label = tf.cond(tf.equal(tf.reduce_sum(label), tf.reduce_sum(tf.zeros_like(label))), lambda: gt_label, 141 | lambda : tf.concat([gt_label, tf.expand_dims(label, axis=0)], axis=0)) 142 | 143 | gt_label = tf.reshape(gt_label, shape=(-1,)) 144 | 145 | return gt_box, gt_label -------------------------------------------------------------------------------- /person_detect/src/loss.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: loss.py 7 | @time: 18-9-5 上午10:47 8 | ''' 9 | import tensorflow as tf 10 | 11 | slim = tf.contrib.slim 12 | 13 | def focal_loss(onehot_labels, cls_preds, 14 | alpha=0.25, gamma=2.0, name=None, scope=None): 15 | """Compute sigmoid focal loss between logits and onehot labels 16 | 17 | logits and onehot_labels must have same shape [batchsize, num_classes] and 18 | the same data type (float16, 32, 64) 19 | 20 | Args: 21 | onehot_labels: Each row labels[i] must be a valid probability distribution 22 | cls_preds: Unscaled log probabilities 23 | alpha: The hyperparameter for adjusting biased samples, default is 0.25 24 | gamma: The hyperparameter for penalizing the easy labeled samples 25 | name: A name for the operation (optional) 26 | 27 | Returns: 28 | A 1-D tensor of length batch_size of same type as logits with softmax focal loss 29 | """ 30 | 31 | with tf.name_scope(scope, 'focal_loss', [cls_preds, onehot_labels]) as sc: 32 | # logits = tf.convert_to_tensor(cls_preds) 33 | # onehot_labels = tf.convert_to_tensor(onehot_labels) 34 | 35 | # precise_logits = tf.cast(logits, tf.float32) if (logits.dtype == tf.float16) else logits 36 | 37 | onehot_labels = tf.cast(onehot_labels, cls_preds.dtype) 38 | 39 | 40 | predictions = tf.nn.softmax(cls_preds) 41 | 42 | predictions_pt = tf.where(tf.equal(onehot_labels, 1), predictions, 1.-predictions) 43 | # add small value to avoid 0 44 | epsilon = 1e-8 45 | alpha_t = tf.scalar_mul(alpha, tf.ones_like(onehot_labels, dtype=tf.float32)) 46 | alpha_t = tf.where(tf.equal(onehot_labels, 1.0), alpha_t, 1-alpha_t) 47 | losses = tf.reduce_sum(-alpha_t * tf.pow(1. - predictions_pt, gamma) * tf.log(predictions_pt+epsilon)) 48 | return losses 49 | 50 | 51 | def focal_loss_alt(x, y): 52 | """Focal loss alternative. 
53 | 54 | Args: 55 | x: (tensor) sized [N, D] 56 | y: (tensor) sized [N, D] 57 | num_classes: numbers of classes 58 | 59 | Return: 60 | (tensor) focal loss. 61 | """ 62 | alpha = 0.25 63 | t = y 64 | # t = t[:, 1:] 65 | 66 | xt = x * (2 * t - 1) # xt = x if t > 0 else -x 67 | pt = tf.log_sigmoid(2 * xt + 1) 68 | 69 | w = alpha * t + (1 - alpha) * (1 - t) 70 | loss = -w * pt / 2 71 | return tf.reduce_sum(loss) 72 | 73 | def regression_loss(pred_boxes, gt_boxes, weights=1.0): 74 | """ 75 | Regression loss (Smooth L1 loss: also known as huber loss) 76 | 77 | Args: 78 | pred_boxes: [# anchors, 4] 79 | gt_boxes: [# anchors, 4] 80 | weights: Tensor of weights multiplied by loss with shape [# anchors] 81 | """ 82 | loss = tf.losses.huber_loss(predictions=pred_boxes, labels=gt_boxes, 83 | weights=weights, scope='box_loss') 84 | 85 | return loss 86 | 87 | 88 | def loss_test(): 89 | logits = tf.convert_to_tensor([[0.2, 0.3, 0.4, 0.5], [0.5, 0.4, 0.3, 0.2]]) 90 | labels = slim.one_hot_encoding([1, 2], 4) 91 | bbox = tf.ones_like(logits) 92 | with tf.Session() as sess: 93 | print (sess.run(labels)) 94 | print (sess.run(logits)) 95 | print (sess.run(focal_loss(onehot_labels=labels, cls_preds=logits))) 96 | print (sess.run(regression_loss(logits, bbox, tf.expand_dims(1./tf.convert_to_tensor([2, 3], dtype=tf.float32), 1)))) 97 | sess.close() 98 | 99 | # test() 100 | if __name__ == '__main__': 101 | loss_test() -------------------------------------------------------------------------------- /person_detect/src/reader.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: reader.py 7 | @time: 18-9-28 下午2:39 8 | ''' 9 | 10 | import tensorflow as tf 11 | # from src.retinanet import RetinaNet 12 | 13 | class Box_Reader(object): 14 | def __init__(self, tfrecord_file, img_size=224, batch_size=1, epochs=1): 15 | self.img_size = img_size 16 | self.batch_size = batch_size 17 | self.epochs = epochs 18 | self.tfrecord_file = tfrecord_file 19 | self.reader = tf.TFRecordReader() 20 | 21 | def feed(self): 22 | filename_queue = tf.train.string_input_producer([self.tfrecord_file], num_epochs=self.epochs) 23 | reader = tf.TFRecordReader() 24 | _, serialized_example = reader.read(filename_queue) 25 | 26 | features = tf.parse_single_example( 27 | serialized_example, 28 | features={ 29 | 'image': tf.FixedLenFeature((), tf.string), 30 | 'id': tf.FixedLenFeature([], tf.string), 31 | 'format': tf.FixedLenFeature((), tf.string, 'jpeg'), 32 | 'height': tf.FixedLenFeature([], tf.int64), 33 | 'width': tf.FixedLenFeature([], tf.int64), 34 | 'channel': tf.FixedLenFeature([], tf.int64), 35 | 'boxes': tf.VarLenFeature(dtype=tf.float32), 36 | 'labels': tf.VarLenFeature(dtype=tf.int64) 37 | } 38 | ) 39 | channel = tf.cast(features['channel'], tf.int64) 40 | img = tf.image.decode_jpeg(features['image'], channels=3) # tensor, [height, width, channels] 41 | img_id = features['id'] 42 | # img = tf.decode_raw(features['image'], tf.uint8) 43 | img = tf.image.convert_image_dtype(img, dtype=tf.float32) 44 | img_height = tf.cast(features['height'], tf.int32) 45 | img_width = tf.cast(features['width'], tf.int32) 46 | 47 | # img = tf.reshape(img, shape=[img_height, img_width, 3]) 48 | # img = (img - 0) / 255 # network image input need to be float type 49 | # img = tf.to_float(img) 50 | 51 | # features['boxes'] && features['lables'] both SparseTensor type, to get real value stored, need get 
attribution 'values' 52 | boxs = features['boxes'].values 53 | label = features['labels'].values 54 | # must identify boxs shape and labels shape, otherwise program can not get the shape correctlly 55 | boxs = tf.reshape(boxs, shape=(30, 4)) 56 | label = tf.reshape(label, shape=(30,)) 57 | 58 | if True: 59 | img, boxs = self._pre_processing(img, img_height, img_width, boxs) 60 | 61 | imgs, img_ids, heights, widths, boxes, labels = tf.train.shuffle_batch( 62 | [img, img_id, img_height, img_width, boxs, label], 63 | batch_size=self.batch_size, 64 | num_threads=12, 65 | capacity=1000, 66 | min_after_dequeue=400 67 | ) 68 | 69 | return imgs, img_ids, heights, widths, boxes, labels 70 | 71 | def _pre_processing(self, img, height, width, bbox): 72 | img = tf.expand_dims(img, axis=0) 73 | img = tf.image.resize_nearest_neighbor(img, (self.img_size, self.img_size)) 74 | img = tf.squeeze(img, axis=0) 75 | 76 | factorx = tf.to_float(self.img_size) / tf.to_float(width) 77 | factory = tf.to_float(self.img_size) / tf.to_float(height) 78 | 79 | bbox = tf.concat([tf.reshape(bbox[:, 0] * factory, (-1, 1)), 80 | tf.reshape(bbox[:, 1] * factorx, (-1, 1)), 81 | tf.reshape(bbox[:, 2] * factory, (-1, 1)), 82 | tf.reshape(bbox[:, 3] * factorx, (-1, 1))], 83 | axis=1) 84 | return img, bbox 85 | 86 | def reader_test(): 87 | batch = 1 88 | epochs = 1 89 | reader = Box_Reader(tfrecord_file='/media/ulsee/E/person_subnet_tfrecord/coco-instance-5.tfrecord', batch_size=batch, epochs=epochs) 90 | imgs, ids, hs, ws, boxs, labels = reader.feed() 91 | init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()) 92 | with tf.Session() as sess: 93 | sess.run(init_op) 94 | step = 0 95 | coord = tf.train.Coordinator() 96 | threads = tf.train.start_queue_runners(coord=coord) 97 | 98 | try : 99 | while not coord.should_stop(): 100 | a, b, c, d, e, f = sess.run([imgs, ids, hs, ws, labels, boxs]) 101 | print (b) 102 | step +=1 103 | except tf.errors.OutOfRangeError: 104 | print ('batch = {}, epochs = {}, steps = {}'.format(batch, epochs, step)) 105 | finally: 106 | coord.request_stop() 107 | coord.join(threads) 108 | 109 | if __name__ == '__main__': 110 | reader_test() -------------------------------------------------------------------------------- /person_detect/src/retinanet.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: retinanet.py 7 | @time: 18-9-28 下午2:17 8 | ''' 9 | 10 | from __future__ import absolute_import, division, print_function 11 | 12 | import tensorflow as tf 13 | from tensorflow.contrib.slim import nets 14 | import tensorflow.contrib.slim as slim 15 | import math 16 | 17 | class RetinaNet(object): 18 | def __init__(self, fpn, feature_map_dict, batch_size, num_classes, num_anchors=9, is_training=True): 19 | self.feature_pyramid = fpn 20 | self.feature_map_dict = feature_map_dict 21 | self.batch_size = batch_size 22 | self.num_classes = num_classes 23 | self.num_anchors = num_anchors 24 | self.is_training = is_training 25 | self.stddev = 0.01 26 | self.pai = 0.01 27 | 28 | def add_fcn_head(self, inputs, outputs, head_offset): 29 | with slim.arg_scope([slim.conv2d], scope=str(head_offset), activation_fn=tf.nn.relu, 30 | weights_initializer=tf.random_normal_initializer(stddev=self.stddev)): 31 | net = slim.repeat(inputs, 4, slim.conv2d, 256, kernel_size=[3,3]) 32 | if str(head_offset)[-1] == 's': 33 | net = slim.conv2d(net, outputs, 
kernel_size=[3,3], scope=str(head_offset) +'_final', activation_fn=None, 34 | weights_initializer=tf.constant_initializer(0), 35 | biases_initializer=tf.constant_initializer(-(math.log((1-self.pai)/self.pai)))) 36 | else: 37 | net = slim.conv2d(net, outputs, kernel_size=[3,3], activation_fn=None, scope=str(head_offset) + '_final') 38 | 39 | return net 40 | 41 | def forward(self): 42 | loc_predictions = [] 43 | class_predictions = [] 44 | with tf.variable_scope('retina_net'): 45 | # add P6 and P7 as noticed in papar focal loss, page 4, annotation 2 46 | self.feature_pyramid['P6'] = slim.conv2d(self.feature_map_dict['C5'], num_outputs=256, kernel_size=[3, 3], 47 | stride=2, 48 | weights_initializer=tf.random_normal_initializer( 49 | stddev=self.stddev), 50 | activation_fn=None, 51 | scope='build_fpn_P6') 52 | self.feature_pyramid['P7'] = slim.conv2d(inputs=(tf.nn.relu(self.feature_pyramid['P6'])), 53 | num_outputs=256, kernel_size=[3, 3], stride=2, 54 | weights_initializer=tf.random_normal_initializer( 55 | stddev=self.stddev), 56 | activation_fn=None, 57 | scope='build_fpn_P7') 58 | # remove P2 59 | del self.feature_pyramid['P2'] 60 | 61 | for idx, feature_map in self.feature_pyramid.items(): 62 | # print ('idx {} crossponding feature map {}'.format(idx, feature_map.get_shape())) 63 | loc_prediction = self.add_fcn_head(feature_map, self.num_anchors * 4, str(idx) + '_bbox') 64 | class_prediction = self.add_fcn_head(feature_map, self.num_classes * self.num_anchors, 65 | str(idx) + '_class') 66 | 67 | loc_prediction = tf.reshape(loc_prediction, [self.batch_size, -1, 4]) 68 | class_prediction = tf.reshape(class_prediction, [self.batch_size, -1, self.num_classes]) 69 | 70 | loc_predictions.append(loc_prediction) 71 | class_predictions.append(class_prediction) 72 | 73 | return tf.concat(loc_predictions, axis=1), tf.concat(class_predictions, axis=1) -------------------------------------------------------------------------------- /pose_residual_network/README.md: -------------------------------------------------------------------------------- 1 | #PRN 网络理解: 2 | 3 | PRN网络的思想,就是对一个bounding box里,如果有多个相同部位的关键点出现在一个single box里,也就是有多个人,那么就很难判定这么多个关键点到底属于哪一个人。PRN的思想就是,一个single box就是一个人,一个人应该只有一种类型的关键点,前面keypoint Subnet网络得到的关键点位置,如果在这个single box范围内,那么就将这个single box范围内的关键点作为输入放进去PRN网络,经过计算之后,PRN网络对每个channel只输出一个关键点,并且认为这个关键点就是这个single box框起来的人的关键点。 4 | 5 | #PRN网络训练的数据构造 6 | 7 | PRN网络是对单独的一个一个box进行训练的,而不是一张图片。 8 | 9 | 10 | - label: PRN网络的label就是一个对gt_box处理过后的ROI。论文里将box缩放为高宽为56*36,height/width = 1.56的ROI。将此作为网络的输入、输出大小。如果有17个关键点要训练,那么label大小就是[56, 36, 17],每个channel的意义和使用keypoint subnet得到的输出意义一致。首先就是对gt_box里所有关键点进行处理,和keypoint subnet进行训练时一样,对每个关键点出现的位置,在label对应的位置上打上标签1,否则就是0,其实就是一个heatmap,只不过是以box生成的heatmap。 11 | - input: PRN网络的input就是预先设定好的box大小,首先将input全部设为0,然后对于这个box所在的图片上,所有出现过的关键点进行处理。和生成label过程一样,只不过处理的关键点不仅仅是原本属于gt_box的关键点了,而是这个图片上所有的出现在gt_box范围内的关键点,同样使用生成label的方法,生成网络的input。 12 | 13 | 14 | label和input生成之后,均进行高斯处理(sigma小于1显示比较明显),最后得到的结果才是PRN网络的输入和label。 15 | 16 | ### 训练结果: 17 | 18 | - 和官方提供的pytorch版本一致,训练参数一致,训练次数一致,在coco val2017的结果如下: 19 | 20 | ``` 21 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets= 20 ] = 0.886 22 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets= 20 ] = 0.977 23 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets= 20 ] = 0.920 24 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets= 20 ] = 0.874 25 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets= 20 ] = 0.912 26 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | 
maxDets= 20 ] = 0.911 27 | Average Recall (AR) @[ IoU=0.50 | area= all | maxDets= 20 ] = 0.980 28 | Average Recall (AR) @[ IoU=0.75 | area= all | maxDets= 20 ] = 0.933 29 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets= 20 ] = 0.893 30 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets= 20 ] = 0.943 31 | 32 | ``` 33 | 使用官方提供的pytorch版本训练,使用提供的参数和数据,在coco val2017的结果如下: 34 | 35 | ``` 36 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets= 20 ] = 0.888 37 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets= 20 ] = 0.977 38 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets= 20 ] = 0.920 39 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets= 20 ] = 0.876 40 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets= 20 ] = 0.910 41 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 20 ] = 0.913 42 | Average Recall (AR) @[ IoU=0.50 | area= all | maxDets= 20 ] = 0.981 43 | Average Recall (AR) @[ IoU=0.75 | area= all | maxDets= 20 ] = 0.933 44 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets= 20 ] = 0.894 45 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets= 20 ] = 0.943 46 | 47 | ``` 48 | 官方宣称能达到的精度如下: 49 | ``` 50 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets= 20 ] = 0.892 51 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets= 20 ] = 0.978 52 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets= 20 ] = 0.921 53 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets= 20 ] = 0.883 54 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets= 20 ] = 0.912 55 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 20 ] = 0.917 56 | Average Recall (AR) @[ IoU=0.50 | area= all | maxDets= 20 ] = 0.982 57 | Average Recall (AR) @[ IoU=0.75 | area= all | maxDets= 20 ] = 0.937 58 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets= 20 ] = 0.902 59 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets= 20 ] = 0.944 60 | 61 | ``` -------------------------------------------------------------------------------- /pose_residual_network/__init__.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: __init__.py 7 | @time: 18-9-29 下午6:56 8 | ''' -------------------------------------------------------------------------------- /pose_residual_network/prn_train.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: prn_train.py.py 7 | @time: 18-9-28 上午9:30 8 | ''' 9 | 10 | import tensorflow as tf 11 | import tensorflow.contrib.slim as slim 12 | import numpy as np 13 | import os, json, cv2, time 14 | import math 15 | 16 | from datetime import datetime 17 | 18 | from src.PRN import PRN 19 | from src.reader import PRN_READER 20 | 21 | import sys 22 | 23 | sys.path.append('../') 24 | 25 | from eval_test import eval 26 | 27 | 28 | FLAGS = tf.flags.FLAGS 29 | 30 | tf.flags.DEFINE_integer('train_nums', 262464, 'total train_data numbers in tfrecord file.') 31 | tf.flags.DEFINE_integer('batch_size', 4, '') 32 | tf.flags.DEFINE_float('learning_rate', 1e-3, '') 33 | tf.flags.DEFINE_integer('height', 56, '') 34 | tf.flags.DEFINE_integer('width', 36, '') 35 | tf.flags.DEFINE_integer('channels', 17, '') 36 | 
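# height=56, width=36, channels=17 match the PRN ROI described in the README above: every
# ground-truth box is rescaled to a 56x36 grid (height/width ~= 1.56) with one channel per
# COCO keypoint.
# A side note on the BCEloss defined below: it takes raw probabilities and applies tf.log
# without clamping, so an output that reaches exactly 0 or 1 yields -inf/NaN. A numerically
# safer variant (an illustrative sketch only, not used by this script, relying on the module's
# existing `import tensorflow as tf`) could clip first:
def stable_bce_loss(labels, probs, eps=1e-8):
    # keep probabilities strictly inside (0, 1) before taking logs
    probs = tf.clip_by_value(probs, eps, 1.0 - eps)
    return tf.reduce_mean(-(labels * tf.log(probs) + (1.0 - labels) * tf.log(1.0 - probs)))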
tf.flags.DEFINE_boolean('is_training', True, '') 37 | tf.flags.DEFINE_string('tfrecord_file', '/media/ulsee/E/pose_residual_net_tfrecord/coco_train2017.tfrecord', '') 38 | tf.flags.DEFINE_string('checkpoint_path', '/media/ulsee/D/PRN', 'path to save training model') 39 | tf.flags.DEFINE_string('finetuning', None, 40 | 'folder of saved model that you wish to continue training or testing(e.g. 20180828-1803/model.ckpt-xxx), default:None') 41 | 42 | def BCEloss(labels, inputs): 43 | return tf.reduce_mean( 44 | -(tf.multiply(labels, tf.log(inputs)) + 45 | tf.multiply((1-labels), tf.log(1-inputs))) 46 | ) 47 | 48 | def prn_train(): 49 | os.environ['CUDA_VISIBLE_DEVICES'] = '0' 50 | 51 | # -------------------define where checkpoint path is-------------------------# 52 | current_time = datetime.now().strftime('%Y%m%d-%H%M') 53 | if FLAGS.finetuning is None: 54 | checkpoints_dir = os.path.join(FLAGS.checkpoint_path, current_time) 55 | if not os.path.exists(checkpoints_dir): 56 | try: 57 | os.makedirs(checkpoints_dir) 58 | except: 59 | pass 60 | else: 61 | checkpoints_dir = os.path.join(FLAGS.checkpoint_path, FLAGS.finetuning) 62 | print('checkpoints_dir == {}'.format(checkpoints_dir)) 63 | 64 | # ------------------------------define Graph --------------------------------# 65 | tf.reset_default_graph() 66 | graph = tf.Graph() 67 | with graph.as_default(): 68 | # -----------------------------reader------------------------------------# 69 | reader = PRN_READER(batch_size=FLAGS.batch_size, height=FLAGS.height, width=FLAGS.width, 70 | channels=FLAGS.channels, 71 | tfrecord_file=FLAGS.tfrecord_file) 72 | inputs, label = reader.feed() 73 | # print (inputs.get_shape()) 74 | # print (label.get_shape()) 75 | # ----------------------------PRN Model----------------------------------# 76 | prn_inputs = tf.placeholder(tf.float32, shape=(FLAGS.batch_size, FLAGS.height, FLAGS.width, FLAGS.channels)) 77 | prn_label = tf.placeholder(tf.float32, shape=(FLAGS.batch_size, FLAGS.height, FLAGS.width, FLAGS.channels)) 78 | model = PRN(inputs=prn_inputs, output_node=FLAGS.height * FLAGS.width * FLAGS.channels, 79 | is_training=FLAGS.is_training) 80 | out = model.forward() 81 | # ------------------------------Saver------------------------------------# 82 | saver = tf.train.Saver(max_to_keep=10) 83 | # ------------------------------Loss-------------------------------------# 84 | # loss = tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(labels=label, logits=out)) / tf.to_float(FLAGS.batch_size) 85 | loss = BCEloss(labels=prn_label, inputs=out) 86 | # print (loss.get_shape()) 87 | # ---------------------------lr and gradient-----------------------------# 88 | global_step = tf.Variable(0) 89 | # learning_rate = tf.to_float(FLAGS.learning_rate) 90 | values = [FLAGS.learning_rate * math.pow(0.9, (epoch - 1) // 2) for epoch in range(1, 33, 2)] 91 | boundaries = [FLAGS.train_nums // FLAGS.batch_size * epoch for epoch in range(3, 33, 2)] 92 | 93 | # values = [0.01, 0.02, 0.03] 94 | # boundaries = [200, 500] 95 | learning_rate = tf.train.piecewise_constant(global_step, boundaries, values) 96 | opt = tf.train.AdamOptimizer(learning_rate) 97 | 98 | update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) 99 | with tf.control_dependencies(update_ops): 100 | train_op = opt.minimize(loss, global_step=global_step) 101 | # -----------------------------tf summary--------------------------------# 102 | # gt_label = tf.placeholder(tf.float32, shape=(FLAGS.batch_size, FLAGS.height, FLAGS.width, FLAGS.channels)) 103 | # pred_label = 
tf.placeholder(tf.float32, shape=(FLAGS.batch_size, FLAGS.height, FLAGS.width, FLAGS.channels)) 104 | tf.summary.scalar('lr', learning_rate) 105 | tf.summary.scalar('loss', loss) 106 | # tf.summary.image('label', tf.reshape(tf.transpose( 107 | # prn_label, [3, 0, 1, 2])[6], shape=(-1, FLAGS.height, FLAGS.width, 1)), max_outputs=4) 108 | # tf.summary.image('pred', tf.reshape(tf.transpose( 109 | # out, [3, 0, 1, 2])[6], shape=(-1, FLAGS.height, FLAGS.width, 1)), max_outputs=4) 110 | tf.summary.image('label', tf.reduce_sum(prn_label, axis=3, keep_dims=True), max_outputs=4) 111 | tf.summary.image('preds', tf.reduce_sum(out, axis=3, keep_dims=True), max_outputs=4) 112 | summary_op = tf.summary.merge_all() 113 | summary_writer = tf.summary.FileWriter(checkpoints_dir, graph) 114 | # --------------------------------init------------------------------------# 115 | init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()) 116 | config = tf.ConfigProto() 117 | config.gpu_options.allow_growth = True 118 | # --------------------------------train------------------------------------# 119 | with tf.Session(graph=graph, config=config) as sess: 120 | sess.run(init_op) 121 | coord = tf.train.Coordinator() 122 | threads = tf.train.start_queue_runners(sess=sess, coord=coord) 123 | step = 0 124 | s_time = time.time() 125 | try: 126 | while not coord.should_stop(): 127 | net_x, y = sess.run([inputs, label]) 128 | 129 | _, net_loss, lr, merge_op = sess.run( 130 | [train_op, loss, learning_rate, summary_op], 131 | feed_dict={prn_label: y, prn_inputs: net_x} 132 | ) 133 | 134 | summary_writer.add_summary(merge_op, step) 135 | summary_writer.flush() 136 | 137 | if (step + 1) % (FLAGS.train_nums // FLAGS.batch_size) == 0: 138 | save_path = saver.save(sess, checkpoints_dir + '/model.ckpt', global_step=step) 139 | print('Model saved in {}'.format(save_path)) 140 | # eval(checkpoint=save_path) 141 | if (step + 1) % 200 == 0: 142 | cur_time = time.time() 143 | print('step {}: loss = {:.6f}, lr = {:.6f},time spend = {:.6f}'.format(step, net_loss, lr, 144 | cur_time - s_time)) 145 | s_time = cur_time 146 | 147 | step += 1 148 | # break 149 | 150 | except KeyboardInterrupt: 151 | print('Interrupted') 152 | coord.request_stop() 153 | except Exception as e: 154 | coord.request_stop(e) 155 | except tf.errors.OutOfRangeError: 156 | coord.request_stop() 157 | finally: 158 | save_path = saver.save(sess, checkpoints_dir + '/model.ckpt', global_step=step) 159 | print('Model saved in {}'.format(save_path)) 160 | coord.request_stop() 161 | coord.join(threads) 162 | 163 | if __name__ == '__main__': 164 | prn_train() 165 | 166 | 167 | 168 | 169 | 170 | -------------------------------------------------------------------------------- /pose_residual_network/src/PRN.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: PRN.py 7 | @time: 18-9-27 下午2:36 8 | ''' 9 | from __future__ import absolute_import, division, print_function 10 | 11 | import tensorflow as tf 12 | from tensorflow.contrib.slim import nets 13 | from tensorflow.contrib.layers.python.layers import utils 14 | import tensorflow.contrib.slim as slim 15 | 16 | import numpy as np 17 | import os 18 | 19 | class PRN(object): 20 | def __init__(self, inputs, output_node, is_training=True, hidden_node = 1024): 21 | self.x = inputs 22 | self.output_node = output_node 23 | self.hidden_node = hidden_node 24 | 
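# inputs is the (batch, 56, 36, 17) candidate-keypoint heatmap for one person box, output_node
# is 56*36*17, and hidden_node is the width of the two hidden fully-connected layers (1024 by
# default). forward() below flattens the input, applies two ReLU FC + dropout blocks, a third
# ReLU FC back to output_node, adds the flattened input as a residual connection, normalizes
# the result with a softmax, and reshapes it back to the input shape.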
self.is_training = is_training 25 | 26 | 27 | 28 | def forward(self): 29 | with tf.variable_scope('pose-residual-network'): 30 | flatten = slim.flatten(inputs=self.x) 31 | fc1 = slim.fully_connected(inputs=flatten, num_outputs=self.hidden_node, activation_fn=tf.nn.relu) 32 | dropout1 = slim.dropout(inputs=fc1, is_training=self.is_training) 33 | fc2 = slim.fully_connected(inputs=dropout1, num_outputs=self.hidden_node, activation_fn=tf.nn.relu) 34 | dropout2 = slim.dropout(inputs=fc2, is_training=self.is_training) 35 | fc3 = slim.fully_connected(inputs=dropout2, num_outputs=self.output_node, activation_fn=tf.nn.relu) 36 | # out = tf.nn.relu(dropout2) 37 | out = tf.add(flatten, fc3) 38 | out = tf.nn.softmax(out) 39 | out = tf.reshape(out, shape=self.x.get_shape()) 40 | 41 | return out 42 | -------------------------------------------------------------------------------- /pose_residual_network/src/__init__.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: __init__.py 7 | @time: 18-9-28 上午9:31 8 | ''' -------------------------------------------------------------------------------- /pose_residual_network/src/convert_tfrecord.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: convert_tfrecord.py 7 | @time: 18-9-27 下午3:15 8 | ''' 9 | import tensorflow as tf 10 | import cv2, os, json 11 | import numpy as np 12 | import matplotlib.pyplot as plt 13 | from skimage.filters import gaussian 14 | 15 | 16 | FLAGS = tf.flags.FLAGS 17 | tf.flags.DEFINE_string('json_file', '/media/ulsee/E/pose_residual_net_tfrecord/cocotrain2017_convert_ai.json', 18 | '') 19 | tf.flags.DEFINE_string('tfrecord_file', '/media/ulsee/E/pose_residual_net_tfrecord/coco_train2017.tfrecord', 20 | 'tfrecord file') 21 | tf.flags.DEFINE_integer('height', 56, 'prn net input height') 22 | tf.flags.DEFINE_integer('width', 36, 'prn net input width') 23 | tf.flags.DEFINE_integer('channels', 17, 'number of keypoints') 24 | 25 | def _int64_feature(value): 26 | ''' Wrapper for inserting int64 feature into Example proto''' 27 | if not isinstance(value, list): 28 | value = [value] 29 | return tf.train.Feature(int64_list = tf.train.Int64List(value=value)) 30 | 31 | def _float_feature(value): 32 | ''' Wrapper for inserting float feature into Example proto''' 33 | if not isinstance(value, list): 34 | value = [value] 35 | return tf.train.Feature(float_list=tf.train.FloatList(value=value)) 36 | 37 | def _bytes_feature(value): 38 | ''' Wrapper for inserting bytes feature into Example proto''' 39 | if not isinstance(value, list): 40 | value = [value] 41 | return tf.train.Feature(bytes_list = tf.train.BytesList(value=value)) 42 | 43 | def _string_feature(value): 44 | ''' Wrapper for inserting string (actually bytes) feature into Example proto''' 45 | if not isinstance(value, list): 46 | value = [value] 47 | return tf.train.Feature(bytes_list = tf.train.BytesList(value=value)) 48 | 49 | def convert_to_tfrecord(json_file, tfrecord_file): 50 | 51 | f = open(json_file, encoding='utf-8') 52 | labels = json.load(f) 53 | 54 | if isinstance(labels, dict): 55 | pass 56 | elif isinstance(labels, list): 57 | convert_ai_challenger(labels, tfrecord_file) 58 | else: 59 | raise ValueError('Json file format is wrong!!!') 60 | 61 | 62 | def 
convert_ai_challenger(labels, tfrecord_file): 63 | 64 | tfrecord_dir = os.path.dirname(tfrecord_file) 65 | if not os.path.exists(tfrecord_dir): 66 | os.makedirs(tfrecord_dir) 67 | 68 | writer = tf.python_io.TFRecordWriter(tfrecord_file) 69 | total_imgs = len(labels) 70 | deal_imgs = 0 71 | useless = 0 72 | for label in labels: 73 | # print (label['image_id']) 74 | 75 | kp_anno = label['keypoint_annotations'] 76 | human_anno = label['human_annotations'] 77 | humans = kp_anno.keys() 78 | all_keypoints = [kp for kp in kp_anno.values()] 79 | 80 | for human in humans: 81 | kp = kp_anno[human] 82 | kpv = kp[2::3] 83 | if np.sum(kpv>0) < 4: 84 | useless += 1 85 | continue 86 | box = human_anno[human] 87 | box[2] = box[2] - box[0] 88 | box[3] = box[3] - box[1] 89 | 90 | if box[2] == 0 or box[3] == 0: 91 | continue 92 | 93 | tf_label = get_label_for_single_box(kp, box) 94 | tf_inputs = get_input_for_single_box(all_keypoints, box) 95 | 96 | # 97 | # img1 = np.sum(tf_label, axis=2, keepdims=True) 98 | # cv2.imwrite('label.jpg', img1*255) 99 | # img2 = np.sum(tf_inputs, axis=2, keepdims=True) 100 | # cv2.imwrite('input.jpg', img2*255) 101 | # 102 | 103 | example = tf.train.Example(features=tf.train.Features( 104 | feature = { 105 | 'input':_float_feature(list(np.reshape(np.asarray(tf_inputs, dtype=np.float32), (-1, )))), 106 | 'label':_float_feature(list(np.reshape(np.asarray(tf_label, dtype=np.float32), (-1, )))) 107 | } 108 | )) 109 | 110 | writer.write(example.SerializeToString()) 111 | deal_imgs += 1 112 | 113 | # if deal_imgs == 2: 114 | # break 115 | 116 | if deal_imgs % 1000 == 0: 117 | print ('Processing {}/{}'.format(deal_imgs, total_imgs)) 118 | print ('Useless boxs {}'.format(useless)) 119 | 120 | writer.close() 121 | print ('Converting tf record done.') 122 | 123 | 124 | def get_label_for_single_box(keypoints, bbox): 125 | label = np.zeros((FLAGS.height, FLAGS.width, FLAGS.channels)) 126 | 127 | x = int(bbox[0]) 128 | y = int(bbox[1]) 129 | w = float(bbox[2]) 130 | h = float(bbox[3]) 131 | 132 | x_scale = float(FLAGS.width) / w 133 | y_scale = float(FLAGS.height) / h 134 | 135 | kpx = keypoints[0::3] 136 | kpy = keypoints[1::3] 137 | kpv = keypoints[2::3] 138 | 139 | for j in range(FLAGS.channels): 140 | if kpv[j] != 3 and kpv[j] != 0: 141 | x0 = int((kpx[j] - x) * x_scale) 142 | y0 = int((kpy[j] - y) * y_scale) 143 | 144 | if x0 >= FLAGS.width and y0 >= FLAGS.height: 145 | label[FLAGS.height-1, FLAGS.width-1, j] = 1 146 | elif x0 >= FLAGS.width: 147 | try: 148 | label[y0, FLAGS.width-1, j] = 1 149 | except: 150 | label[0, FLAGS.width-1, j] = 1 151 | elif y0 >= FLAGS.height: 152 | try: 153 | label[FLAGS.height-1, x0, j] = 1 154 | except: 155 | label[FLAGS.height-1, 0, j] = 1 156 | elif x0 < 0 and y0 < 0: 157 | label[0, 0, j] = 1 158 | elif x0 < 0: 159 | label[y0, 0, j] = 1 160 | elif y0 < 0: 161 | label[0, x0, j] = 1 162 | else: 163 | label[y0, x0, j] = 1 164 | 165 | # for c in range(FLAGS.channels): 166 | # label[:, :, c] = gaussian(label[:, :, c],sigma=0.5) 167 | label = gaussian(label, sigma=2, mode='constant', multichannel=True) 168 | return label 169 | 170 | def get_input_for_single_box(keypoints, bbox): 171 | inputs = np.zeros((FLAGS.height, FLAGS.width, FLAGS.channels)) 172 | threshold = 0.21 173 | 174 | x = int(bbox[0]) 175 | y = int(bbox[1]) 176 | w = float(bbox[2]) 177 | h = float(bbox[3]) 178 | 179 | 180 | x_scale = float(FLAGS.width) / w 181 | y_scale = float(FLAGS.height) / h 182 | 183 | for ann in keypoints: 184 | kpx = ann[0::3] 185 | kpy = ann[1::3] 186 | kpv = ann[2::3] 
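# For every annotated person on the image (not just the owner of this box), the loop below keeps
# each keypoint whose visibility flag is neither 0 nor 3 and whose (x, y) falls inside the box
# expanded by `threshold` (21%) of its width/height on each side. The point is then mapped onto
# the 56x36 grid with the same scaling used for the label, i.e. column = int((kpx - x) * 36 / w)
# and row = int((kpy - y) * 56 / h), and out-of-range points are clamped to the nearest border
# cell. Each channel is finally blurred with a Gaussian, similar to the label construction above.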
187 | 188 | 189 | for j in range(FLAGS.channels): 190 | if kpv[j] != 3 and kpv[j] != 0: 191 | if kpx[j] > bbox[0] - bbox[2] * threshold and kpx[j] < bbox[0] + bbox[2] * (1 + threshold): 192 | if kpy[j] > bbox[1] - bbox[3] * threshold and kpy[j] < bbox[1] + bbox[3] * (1 + threshold): 193 | 194 | x0 = int((kpx[j] - x) * x_scale) 195 | y0 = int((kpy[j] - y) * y_scale) 196 | 197 | if x0 >= FLAGS.width and y0 >= FLAGS.height: 198 | inputs[FLAGS.height - 1, FLAGS.width - 1, j] = 1 199 | elif x0 >= FLAGS.width: 200 | try: 201 | inputs[y0, FLAGS.width - 1, j] = 1 202 | except: 203 | inputs[0, FLAGS.width - 1, j] = 1 204 | elif y0 >= FLAGS.height: 205 | try: 206 | inputs[FLAGS.height - 1, x0, j] = 1 207 | except: 208 | inputs[FLAGS.height - 1, 0, j] = 1 209 | elif x0 < 0 and y0 < 0: 210 | inputs[0, 0, j] = 1 211 | elif x0 < 0: 212 | inputs[y0, 0, j] = 1 213 | elif y0 < 0: 214 | inputs[0, x0, j] = 1 215 | else: 216 | inputs[y0, x0, j] = 1 217 | 218 | 219 | for c in range(FLAGS.channels): 220 | inputs[:, :, c] = gaussian(inputs[:, :, c]) 221 | return inputs 222 | 223 | 224 | if __name__ == '__main__': 225 | convert_to_tfrecord(FLAGS.json_file, FLAGS.tfrecord_file) 226 | 227 | 228 | 229 | -------------------------------------------------------------------------------- /pose_residual_network/src/reader.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: reader.py 7 | @time: 18-9-27 下午5:05 8 | ''' 9 | 10 | import tensorflow as tf 11 | import numpy as np 12 | import cv2 13 | import os 14 | 15 | class PRN_READER(object): 16 | def __init__(self, batch_size, height, width, channels, tfrecord_file): 17 | self.batch_size = batch_size 18 | self.height = height 19 | self.width = width 20 | self.channles = channels 21 | self.reader = tf.TFRecordReader() 22 | self.tfrecord_file = tfrecord_file 23 | 24 | def feed(self): 25 | 26 | filename_queue = tf.train.string_input_producer([self.tfrecord_file], num_epochs=16) 27 | reader = self.reader 28 | _, serialized_example = reader.read(filename_queue) 29 | 30 | features = tf.parse_single_example( 31 | serialized_example, 32 | features={ 33 | 'input': tf.VarLenFeature(dtype=tf.float32), 34 | 'label': tf.VarLenFeature(dtype=tf.float32) 35 | } 36 | ) 37 | 38 | inputs = features['input'].values 39 | label = features['label'].values 40 | 41 | inputs = tf.reshape(inputs, shape=(self.height, self.width, self.channles)) 42 | label = tf.reshape(label, shape=(self.height, self.width, self.channles)) 43 | 44 | batch_input, batch_label = tf.train.shuffle_batch( 45 | [inputs, label], 46 | batch_size=self.batch_size, 47 | num_threads=4, 48 | capacity=1000, 49 | min_after_dequeue=100 50 | ) 51 | 52 | return batch_input, batch_label 53 | 54 | 55 | def reader_test(): 56 | os.environ['CUDA_VISIBLE_DEVICES'] = '0' 57 | reader = PRN_READER(batch_size=1, height=56, width=36, channels=17, 58 | tfrecord_file='/raid5/hswData/pose_residual_net_tfrecord/coco_train2017_6.tfrecord') 59 | net_x, label = reader.feed() 60 | # net_x = tf.reduce_sum(net_x, axis=3, keepdims=True) 61 | # label = tf.reduce_sum(label, axis=3, keepdims=True) 62 | 63 | with tf.Session() as sess: 64 | sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()]) 65 | coord = tf.train.Coordinator() 66 | threads = tf.train.start_queue_runners(coord=coord) 67 | 68 | step = 0 69 | try: 70 | while not coord.should_stop(): 71 | _1, _2 = sess.run([net_x, label]) 
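                # each sess.run() above pulls one shuffled batch from the tfrecord queue:
                # _1 is a (1, 56, 36, 17) input tensor and _2 the matching (1, 56, 36, 17) label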
72 | step += 1 73 | except tf.errors.OutOfRangeError: 74 | print('done. total step == ', step) 75 | finally: 76 | 77 | print ('batch = 1, epochs = 1, total step == ', step) 78 | coord.request_stop() 79 | coord.join(threads) 80 | 81 | 82 | if __name__ == '__main__': 83 | reader_test() -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: __init__.py 7 | @time: 18-9-28 下午2:19 8 | ''' -------------------------------------------------------------------------------- /utils/backbone.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: backbone.py 7 | @time: 18-9-28 上午11:03 8 | ''' 9 | 10 | from __future__ import absolute_import, division, print_function 11 | 12 | import tensorflow as tf 13 | 14 | from tensorflow.contrib.slim import nets 15 | from tensorflow.contrib.layers.python.layers import utils 16 | import tensorflow.contrib.slim as slim 17 | 18 | class BackBone(object): 19 | def __init__(self, img_size, batch_size, is_training=True): 20 | self.img_size = img_size 21 | self.batch_size = batch_size 22 | self.input_imgs = tf.placeholder(tf.float32, [self.batch_size, self.img_size, self.img_size, 3]) 23 | self.is_training = is_training 24 | self.stddev = 0.01 25 | 26 | def get_feature_map(self): 27 | #-------------------resent---------------------# 28 | arg_scope = nets.resnet_v2.resnet_arg_scope() 29 | with slim. arg_scope(arg_scope): 30 | out, end_points = nets.resnet_v2.resnet_v2_50(inputs=self.input_imgs, num_classes=None, is_training=self.is_training) 31 | #---------------feature map dict---------------# 32 | feature_map_dict = { 33 | 'C2': end_points['resnet_v2_50/block1/unit_2/bottleneck_v2'], # input_size / 4 34 | 'C3': end_points['resnet_v2_50/block2/unit_3/bottleneck_v2'], # input_size / 8 35 | 'C4': end_points['resnet_v2_50/block3/unit_4/bottleneck_v2'], # input_size / 16 36 | 'C5': end_points['resnet_v2_50/block4'] # input_size / 32 37 | } 38 | return feature_map_dict 39 | 40 | def build_fpn_feature(self): 41 | feature_pyramid = {} 42 | feature_map_dict = self.get_feature_map() 43 | #------------------------------------------build fpn-------------------------------------------# 44 | with tf.variable_scope('build_fpn_feature'): 45 | with slim.arg_scope([slim.conv2d], weights_initializer=tf.random_normal_initializer(stddev=self.stddev)): 46 | feature_pyramid['P5'] = slim.conv2d(feature_map_dict['C5'], num_outputs=256, kernel_size=[1, 1], stride=1, 47 | scope='build_fpn_P5') 48 | 49 | #------------------ top-down pathway and lateral connections--------------------------# 50 | for layer in range(4, 1, -1): 51 | p = feature_pyramid['P' + str(layer + 1)] 52 | c = feature_map_dict['C' + str(layer)] 53 | 54 | #---------------------------------- upsample p -----------------------------------# 55 | up_shape = c.get_shape() 56 | up_sample = tf.image.resize_nearest_neighbor(p, [up_shape[2], up_shape[2]], 57 | name='upsampling_fpn_P%d' % layer) 58 | 59 | #----------------------------------- 1x1 conv ------------------------------------# 60 | c = slim.conv2d(c, num_outputs=256, kernel_size=[1, 1], stride=1, scope='fpn_1x1conv_C%d' % layer) 61 | p = up_sample + c 62 | 63 | 
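                    # note: the nearest-neighbour resize above uses up_shape[2] for both target dimensions,
                    # so the upsampled P(layer+1) matches C(layer) only because the input image (and hence
                    # every feature map) is square; the element-wise addition is the FPN lateral connection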
#----------------------reduce aliasing effect of upsampling ----------------------# 64 | #---------------(in the third last paragraph, Section 3, Paper FPN)---------------# 65 | p = slim.conv2d(p, num_outputs=256, kernel_size=[3, 3], stride=1, padding='SAME', 66 | scope='build_fpn_P%d' % layer) 67 | 68 | feature_pyramid['P' + str(layer)] = p 69 | 70 | return feature_pyramid 71 | 72 | -------------------------------------------------------------------------------- /utils/coco_convert_ai_json.json: -------------------------------------------------------------------------------- 1 | [{"image_id": "000000397133", "keypoint_annotations": {"human0": [433, 94, 2, 434, 90, 2, 0, 0, 0, 443, 98, 2, 0, 0, 0, 420, 128, 2, 474, 133, 2, 396, 162, 2, 489, 173, 2, 0, 0, 0, 0, 0, 0, 419, 214, 2, 458, 215, 2, 411, 274, 2, 458, 273, 2, 402, 333, 2, 465, 334, 2], "human1": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 277, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}, "human_annotations": {"human0": [388.66, 69.92, 498.07000000000005, 347.54], "human1": [0, 262.81, 62.16, 299.58]}}, {"image_id": "000000252219", "keypoint_annotations": {"human0": [356, 198, 2, 358, 193, 2, 351, 194, 2, 364, 192, 2, 346, 194, 2, 375, 207, 2, 341, 211, 2, 388, 236, 2, 336, 238, 2, 392, 263, 2, 343, 242, 2, 373, 271, 2, 347, 272, 2, 372, 316, 2, 348, 318, 2, 372, 353, 2, 355, 354, 2], "human1": [100, 190, 2, 0, 0, 0, 96, 185, 2, 0, 0, 0, 86, 188, 2, 84, 208, 2, 71, 208, 2, 84, 245, 2, 59, 240, 2, 115, 263, 2, 66, 271, 2, 64, 268, 2, 71, 264, 2, 59, 324, 2, 99, 322, 2, 18, 363, 2, 101, 377, 2], "human2": [536, 192, 1, 538, 188, 2, 0, 0, 0, 552, 190, 2, 0, 0, 0, 568, 207, 2, 555, 208, 2, 559, 243, 2, 554, 246, 2, 542, 270, 2, 550, 277, 2, 573, 274, 2, 559, 274, 2, 589, 323, 2, 541, 322, 2, 617, 365, 2, 530, 361, 2]}, "human_annotations": {"human0": [326.28, 174.56, 397.52, 371.81], "human1": [9.79, 167.06, 131.73, 393.51], "human2": [510.44, 171.27, 634.1, 387.03]}}, {"image_id": "000000087038", "keypoint_annotations": {"human0": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human1": [0, 0, 0, 0, 0, 0, 0, 0, 0, 271, 233, 2, 0, 0, 0, 271, 239, 2, 287, 239, 2, 266, 257, 2, 0, 0, 0, 261, 268, 2, 0, 0, 0, 285, 261, 2, 298, 260, 2, 282, 285, 2, 284, 278, 2, 286, 311, 2, 291, 298, 2], "human2": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human3": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human4": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human5": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human6": [363, 226, 1, 363, 222, 2, 0, 0, 0, 369, 218, 2, 0, 0, 0, 380, 227, 2, 383, 213, 2, 400, 235, 2, 397, 199, 2, 390, 252, 2, 409, 183, 2, 399, 266, 2, 390, 259, 2, 384, 305, 2, 364, 270, 2, 376, 338, 2, 364, 290, 2], "human7": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human8": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human9": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human10": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human11": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human12": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human13": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}, "human_annotations": {"human0": [226.04, 229.31, 237.63, 259.72], "human1": [257.85, 224.48, 301.98, 321.48], "human2": [68.18, 238.19, 84.36000000000001, 281.07], "human3": [79.16, 232.26, 107.38, 283.38], "human4": [98.4, 234.28, 117.92, 280.74], "human5": [326.86, 223.46, 339.97, 262.13], "human6": [345.41, 173.41, 418.35, 358.82], "human7": [239.72, 225.38, 250.36, 258.44], "human8": [167.02, 234, 182.8, 271.46], "human9": [209.68, 231.08, 218.83, 265.61], "human10": [408.29, 231.25, 425.41, 266.22], "human11": [204.14, 229.02, 211.47, 263.98], "human12": [195.32, 228.06, 205.97, 265.24], "human13": [1, 190, 639, 291]}}, {"image_id": "000000480985", "keypoint_annotations": {"human0": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human1": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human2": [0, 0, 0, 0, 0, 0, 0, 0, 0, 326, 281, 2, 0, 0, 0, 338, 292, 2, 330, 292, 2, 340, 306, 2, 0, 0, 0, 334, 317, 2, 0, 0, 0, 337, 326, 2, 332, 325, 2, 338, 350, 2, 0, 0, 0, 340, 368, 2, 0, 0, 0], "human3": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human4": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 281, 299, 2, 0, 0, 0, 282, 320, 2, 0, 0, 0, 274, 329, 2, 0, 0, 0, 278, 327, 2, 270, 327, 1, 280, 355, 2, 273, 355, 2, 282, 373, 2, 273, 374, 2], "human5": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human6": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human7": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}, "human_annotations": {"human0": [47.19, 296.12, 75.49, 329.29], "human1": [32.75, 298.94, 49.269999999999996, 328.15999999999997], "human2": [320.16, 275.05, 347.22, 379.58000000000004], "human3": [10.05, 302.96, 23.75, 328.65], "human4": [266.37, 293.13, 290.34000000000003, 382.09], "human5": [369.5, 278.52, 375.0, 324.16999999999996], "human6": [290.03, 299.79, 305.27, 319.66], "human7": [302.2, 298.22, 314.93, 316.95000000000005]}}, {"image_id": "000000296649", "keypoint_annotations": {"human0": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 344, 324, 2, 331, 323, 2, 364, 337, 2, 346, 347, 2, 386, 350, 2, 370, 362, 1, 339, 367, 2, 329, 370, 2, 0, 0, 0, 361, 382, 2, 0, 0, 0, 347, 410, 1], "human1": [307, 308, 2, 0, 0, 0, 305, 305, 2, 0, 0, 0, 296, 306, 2, 299, 320, 2, 285, 326, 2, 0, 0, 0, 294, 355, 2, 0, 0, 0, 311, 340, 2, 0, 0, 0, 284, 368, 2, 0, 0, 0, 317, 372, 2, 0, 0, 0, 312, 408, 2], "human2": [46, 291, 2, 48, 287, 2, 43, 288, 2, 0, 0, 0, 0, 0, 0, 44, 307, 2, 15, 306, 2, 62, 322, 2, 28, 336, 2, 89, 339, 2, 63, 347, 2, 44, 354, 2, 15, 355, 2, 73, 362, 1, 60, 376, 2, 0, 0, 0, 52, 421, 2], "human3": [492, 299, 2, 0, 0, 0, 489, 295, 2, 0, 0, 0, 478, 297, 2, 458, 315, 2, 477, 325, 2, 0, 0, 0, 491, 362, 2, 486, 328, 2, 0, 0, 0, 448, 391, 2, 464, 391, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human4": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human5": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human6": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human7": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human8": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human9": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human10": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human11": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 541, 301, 2, 507, 317, 2, 534, 325, 2, 0, 0, 0, 562, 348, 2, 0, 0, 0, 593, 362, 1, 494, 383, 1, 521, 388, 1, 0, 0, 0, 562, 400, 1, 0, 0, 0, 0, 0, 0], "human12": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}, "human_annotations": {"human0": [322.57, 290.81, 387.65999999999997, 418.43], "human1": [273.64, 292.11, 324.59, 421.76], "human2": [1.92, 266.88, 116.37, 422.66999999999996], "human3": [424.12, 270.59, 531.59, 400.13], "human4": [259.27, 281, 285.28999999999996, 316.1], "human5": [281.06, 276.47, 296.57, 317.79], "human6": [104.73, 267.55, 123.54, 296.61], "human7": [120.86, 271.12, 137.14, 296.52], "human8": [257.14, 281.32, 269.94, 323.34], "human9": [269, 274.45, 277.89, 292.23], "human10": [556.28, 309.36, 588.37, 355.48], "human11": [494.93, 276.54, 587.01, 402.61], "human12": [300, 280, 325, 334]}}, {"image_id": "000000386912", "keypoint_annotations": {"human0": [305, 195, 2, 317, 181, 2, 296, 184, 2, 335, 187, 2, 0, 0, 0, 367, 252, 2, 271, 248, 2, 377, 354, 2, 250, 324, 2, 307, 381, 2, 232, 354, 2, 343, 402, 1, 271, 395, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}, "human_annotations": {"human0": [210.27, 143.29, 430.09000000000003, 419.43999999999994]}}, {"image_id": "000000348881", "keypoint_annotations": {"human0": [592, 279, 2, 0, 0, 0, 592, 278, 2, 0, 0, 0, 589, 278, 2, 588, 285, 2, 580, 285, 2, 0, 0, 0, 580, 301, 2, 0, 0, 0, 588, 304, 2, 582, 311, 2, 577, 310, 2, 579, 325, 2, 578, 324, 2, 576, 338, 2, 572, 341, 2], "human1": 
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}, "human_annotations": {"human0": [567.82, 273.1, 599.2, 347.21000000000004], "human1": [251.19, 106.42, 274.51, 168.14]}}, {"image_id": "000000522713", "keypoint_annotations": {"human0": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human1": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}, "human_annotations": {"human0": [306.78, 277.85, 317.52, 288.24], "human1": [490.68, 269.32, 494.16, 272.73]}}, {"image_id": "000000181666", "keypoint_annotations": {"human0": [306, 181, 2, 0, 0, 0, 303, 178, 2, 0, 0, 0, 297, 180, 2, 312, 193, 2, 288, 192, 2, 320, 215, 2, 277, 210, 2, 312, 219, 2, 278, 223, 2, 308, 231, 2, 291, 231, 2, 309, 254, 2, 290, 254, 2, 309, 280, 1, 290, 279, 1], "human1": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human2": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}, "human_annotations": {"human0": [272.5, 165.89, 324.88, 269.82], "human1": [51.39, 189.47, 75.47, 215.74], "human2": [0, 182.71, 15.86, 261.06]}}, {"image_id": "000000017627", "keypoint_annotations": {"human0": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human1": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human2": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human3": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}, "human_annotations": {"human0": [150.04, 224, 169.04999999999998, 295.72], "human1": [259.18, 228.19, 273.3, 249.25], "human2": [172.88, 235.27, 182.88, 265.27], "human3": [187.03, 226.28, 203.04, 248.77]}}] -------------------------------------------------------------------------------- /utils/coco_json_convert.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: coco_json_convert.py 7 | @time: 18-9-27 下午6:02 8 | 9 | try to convert coco annotation json file into as like ai_challenger format 10 | [ 11 | { 12 | "image_id": "a0f6bdc065a602b7b84a67fb8d14ce403d902e0d", 13 | "human_annotations": 14 | { 15 | "human1": [178,250,290,522], 16 | "human2": [293,274,352,473], 17 | "human3": [315,236,389,495], 18 | ...}, 19 | "keypoint_annotations": 20 | { 21 | "human1": [261, 294, 1, 281, 328, 1, 259, 314, 2, 22 | 213, 295, 1, 208, 346, 1, 192, 335, 1, 23 | 245, 375, 1, 255, 432, 1, 244, 494, 1, 24 | 221, 379, 1, 219, 442, 1, 226, 491, 1, 25 | 226, 256, 1, 231, 284, 1], 26 | "human2": [313, 301, 1, 305, 337, 1, 321, 345, 1, 27 | 331, 316, 2, 331, 335, 2, 344, 343, 2, 28 | 313, 359, 1, 320, 409, 1, 311, 454, 1, 29 | 327, 356, 2, 330, 409, 1, 324, 
446, 1, 30 | 337, 284, 1, 327, 302, 1], 31 | "human3": [373, 304, 1, 346, 286, 1, 332, 263, 1, 32 | 363, 308, 2, 342, 327, 2, 345, 313, 1, 33 | 370, 385, 2, 368, 423, 2, 370, 466, 2, 34 | 363, 386, 1, 361, 424, 1, 361, 475, 1, 35 | 365, 273, 1, 369, 297, 1], 36 | ...} 37 | }, 38 | ... 39 | ] 40 | ''' 41 | 42 | import json 43 | import numpy as np 44 | 45 | coco_json_file = '/media/ulsee/E/datasets/coco/annotations2017/person_keypoints_val2017.json' 46 | 47 | f = open(coco_json_file, encoding='utf-8') 48 | labels = json.load(f) 49 | units = [] 50 | 51 | img_info = labels['images'] 52 | anno_info = labels['annotations'] 53 | 54 | print ('Start converting json file.....') 55 | ll = len(img_info) 56 | count = 0 57 | 58 | for img in img_info: 59 | unit = {} 60 | img_name = img['file_name'].split('.')[0] 61 | img_id = img['id'] 62 | height = img['height'] 63 | width = img['width'] 64 | 65 | keypoint_anno = {} 66 | human_anno = {} 67 | human_count = 0 68 | 69 | for anno in anno_info: 70 | bbox = anno['bbox'] 71 | anno_img_id = anno['image_id'] 72 | keypoints = anno['keypoints'] 73 | category_id = anno['category_id'] 74 | 75 | if anno_img_id == img_id: 76 | bbox[2] = bbox[0] + bbox[2] 77 | bbox[3] = bbox[1] + bbox[3] 78 | keypoint_anno['human'+str(human_count)] = keypoints 79 | human_anno['human'+str(human_count)] = bbox 80 | human_count += 1 81 | if human_count == 0: 82 | keypoint_anno['human0'] = [0 for i in range(17*3)] 83 | human_anno['human0'] = [0 for i in range(4)] 84 | unit['image_id'] = img_name 85 | unit['keypoint_annotations'] = keypoint_anno 86 | unit['human_annotations'] = human_anno 87 | unit['id'] = img_id 88 | 89 | units.append(unit) 90 | 91 | count += 1 92 | 93 | # if count == 10: 94 | # break 95 | 96 | if count % 100 == 0: 97 | print ('Processing {}/{}'.format(count, ll)) 98 | 99 | with open('/media/ulsee/E//coco_val2017_aiformat.json', 'w') as fw: 100 | json.dump(units, fw) 101 | print ('Convert done.') 102 | 103 | 104 | 105 | 106 | 107 | -------------------------------------------------------------------------------- /utils/gaussian.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from skimage.filters import gaussian 3 | 4 | sigmas = np.array([.26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89, .89] * 100) 5 | 6 | 7 | def multivariate_gaussian(N, sigma=2): 8 | t = 4 9 | X = np.linspace(-t, t, N) 10 | Y = np.linspace(-t, t, N) 11 | X, Y = np.meshgrid(X, Y) 12 | pos = np.empty(X.shape + (2,)) 13 | pos[:, :, 0] = X 14 | pos[:, :, 1] = Y 15 | mu = np.array([0., 0.]) 16 | sigma = np.array([[sigma, 0], [0, sigma]]) 17 | n = mu.shape[0] 18 | Sigma_det = np.linalg.det(sigma) 19 | Sigma_inv = np.linalg.inv(sigma) 20 | N = np.sqrt((2 * np.pi) ** n * Sigma_det) 21 | fac = np.einsum('...k,kl,...l->...', pos - mu, Sigma_inv, pos - mu) 22 | return np.exp(-fac / 2) / N 23 | 24 | 25 | def crop_paste(img, c, N=13, sigma=2): 26 | Z = multivariate_gaussian(N, sigma) 27 | 28 | H = img.shape[1] 29 | W = img.shape[0] 30 | 31 | h = (Z.shape[0] - 1) / 2 32 | 33 | N = Z.shape[0] 34 | x1 = (c[0] - h) 35 | y1 = (c[1] - h) 36 | 37 | x2 = (c[0] + h) + 1 38 | y2 = (c[1] + h) + 1 39 | 40 | zx1 = 0 41 | zy1 = 0 42 | zx2 = N + 1 43 | zy2 = N + 1 44 | 45 | if x1 < 0: 46 | x1 = 0 47 | zx1 = 0 - (c[0] - h) 48 | 49 | if y1 < 0: 50 | y1 = 0 51 | zy1 = 0 - (c[1] - h) 52 | 53 | if x2 > W - 1: 54 | x2 = W - 1 55 | zx2 = x2 - x1 + 1 56 | x2 = W 57 | 58 | if y2 > H - 1: 59 | y2 = H - 1 60 | zy2 = y2 - y1 + 1 61 | y2 = H 62 | 63 | 
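    # paste the (possibly clipped) N x N Gaussian patch into the target window of `img`,
    # taking the element-wise maximum so peaks already drawn for nearby joints are not overwritten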
img[x1:x2, y1:y2] = np.maximum(Z[zx1:zx2, zy1:zy2], img[x1:x2, y1:y2]) 64 | 65 | 66 | ''' 67 | def gaussian(img, N = 13, sigma=2): 68 | cs = np.where(img==1) 69 | img = np.zeros_like(img) 70 | for c in zip(cs[0], cs[1]): 71 | crop_paste(img, c, N, sigma) 72 | return img 73 | ''' 74 | 75 | 76 | def gaussian_multi_input_mp(inp): 77 | ''' 78 | :param inp: Multi person ground truth heatmap input (17 ch) Each channel contains multiple joints. 79 | :return: out: Gaussian augmented output. Values are between 0. and 1. 80 | ''' 81 | 82 | h, w, ch = inp.shape 83 | out = np.zeros_like(inp) 84 | for i in range(ch): 85 | layer = inp[:, :, i] 86 | ind = np.argwhere(layer == 1) 87 | b = [] 88 | if len(ind) > 0: 89 | for j in ind: 90 | t = np.zeros((h, w)) 91 | t[j[0], j[1]] = 1 92 | t = gaussian(t, sigma=2, mode='constant') 93 | t = t * (1 / t.max()) 94 | b.append(t) 95 | 96 | out[:, :, i] = np.maximum.reduce(b) 97 | else: 98 | out[:, :, i] = np.zeros((h, w)) 99 | return out 100 | 101 | 102 | def gaussian_multi_output(inp): 103 | ''' 104 | :param inp: Single person ground truth heatmap input (17 ch) Each channel contains one joint. 105 | :return: out: Gaussian augmented output. Values are between 0. and 1. 106 | ''' 107 | h, w, ch = inp.shape 108 | out = np.zeros_like(inp) 109 | for i in range(ch): 110 | j = np.argwhere(inp[:, :, i] == 1) 111 | if len(j) == 0: 112 | out[:, :, i] = np.zeros((h, w)) 113 | continue 114 | j = j[0] 115 | t = np.zeros((h, w)) 116 | t[j[0], j[1]] = 1 117 | t = gaussian(t, sigma=5, mode='constant') 118 | out[:, :, i] = t * (1 / t.max()) 119 | return out 120 | 121 | 122 | def crop(img, c, N=13): 123 | H = img.shape[1] 124 | W = img.shape[0] 125 | 126 | h = (N - 1) / 2 127 | 128 | x1 = int(c[0] - h) 129 | y1 = int(c[1] - h) 130 | 131 | x2 = int(c[0] + h) + 1 132 | y2 = int(c[1] + h) + 1 133 | 134 | if x1 < 0: 135 | x1 = 0 136 | 137 | if y1 < 0: 138 | y1 = 0 139 | 140 | if x2 > W - 1: 141 | x2 = W 142 | 143 | if y2 > H - 1: 144 | y2 = H 145 | 146 | return img[x1:x2, y1:y2] 147 | 148 | --------------------------------------------------------------------------------
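The pieces above are enough to sketch how PRN training could be wired end to end. The snippet below is only an illustrative sketch, not the repo's prn_train.py: the PRN constructor signature PRN(inputs=..., output_node=..., is_training=...), the plain L2 loss, the Adam learning rate and the tfrecord path are all assumptions made for the example.

import tensorflow as tf

from pose_residual_network.src.PRN import PRN
from pose_residual_network.src.reader import PRN_READER

# hypothetical tfrecord path; replace with a file produced by convert_tfrecord.py
TFRECORD = '/path/to/coco_train2017.tfrecord'

reader = PRN_READER(batch_size=4, height=56, width=36, channels=17, tfrecord_file=TFRECORD)
batch_input, batch_label = reader.feed()          # both (4, 56, 36, 17)

prn = PRN(inputs=batch_input, output_node=56 * 36 * 17, is_training=True)
prn_out = prn.forward()                           # reshaped back to (4, 56, 36, 17)

# placeholder objective: plain L2 between the refined heatmaps and the single-person labels
loss = tf.reduce_mean(tf.square(prn_out - batch_label))
train_op = tf.train.AdamOptimizer(learning_rate=1e-4).minimize(loss)

with tf.Session() as sess:
    sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()])
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)
    step = 0
    try:
        while not coord.should_stop():
            _, loss_val = sess.run([train_op, loss])
            step += 1
            if step % 100 == 0:
                print('step {}, loss {:.6f}'.format(step, loss_val))
    except tf.errors.OutOfRangeError:
        print('input queue exhausted after {} steps'.format(step))
    finally:
        coord.request_stop()
        coord.join(threads)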