├── .gitignore ├── README.md ├── eval_test.py ├── keypoint_subnet ├── README.md ├── __init__.py ├── keypoint_test.py ├── keypoint_train.py ├── src │ ├── __init__.py │ ├── backbone.py │ ├── convert_tfrecord.py │ ├── get_heatmap.py │ ├── img_pre_processing.py │ ├── json_read.py │ ├── model.py │ └── reader.py └── train_log.md ├── multi_pose_net_eval.py ├── person_detect ├── README.md ├── __init__.py ├── anchor │ ├── __init__.py │ ├── anchor_generator.py │ ├── box_coder.py │ ├── box_list.py │ ├── box_list_ops.py │ └── shape_utils.py ├── person_detect_test.py ├── person_detect_train.py └── src │ ├── __init__.py │ ├── backbone.py │ ├── convert_tfrecord.py │ ├── draw_box_with_image.py │ ├── get_loss.py │ ├── loss.py │ ├── reader.py │ └── retinanet.py ├── pose_residual_network ├── README.md ├── __init__.py ├── prn_train.py └── src │ ├── PRN.py │ ├── __init__.py │ ├── convert_tfrecord.py │ └── reader.py └── utils ├── __init__.py ├── backbone.py ├── coco_convert_ai_json.json ├── coco_json_convert.py └── gaussian.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.jpg 2 | *.png 3 | *.py[cod] 4 | pre_trained/ 5 | *.ckpt 6 | yolo_v3/ 7 | 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ### NOTE: 2 | 3 | Something in eval.py of the official [PRN-pytorch implementation repo](https://github.com/salihkaragoz/pose-residual-network-pytorch) looks odd to me. When building the predicted bbox_keypoints, the code uses the ground-truth keypoints to assign them (around line 200 and line 205 of eval.py): the peaks are ground-truth keypoint coordinates, so it appears that the true coordinates are copied into the predicted bbox_keypoints. In my opinion, lines 209~220 of eval.py are the right way to obtain the real predicted bbox_keypoints. 4 | 5 | As far as I can tell, that repo has some problems and cannot produce the correct result through the 'correct way'. The author has not responded to me, and there may still be tricks in that repo that I have not found yet. 6 | 7 | ## This repository contains a TensorFlow implementation of this ECCV 2018 paper: 8 | 9 | [Muhammed Kocabas, Salih Karagoz, Emre Akbas. MultiPoseNet: Fast Multi-Person Pose Estimation using Pose Residual Network. In ECCV, 2018.](https://arxiv.org/abs/1807.04067) 10 | 11 | # It contains the three parts of this network: 12 | 13 | - **keypoint_subnet**, which uses resnet_v2_50 + FPN as the backbone network and aims to detect human pose keypoints in a single image. 14 | 15 | - **person_detect**, which uses the same backbone as keypoint_subnet with only minor differences. This part is essentially RetinaNet, described in the paper [Focal Loss](https://arxiv.org/abs/1708.02002) 16 | 17 | - **pose-residual-network**, the main contribution of this paper 18 | 19 | For detailed information please see the original [paper.](https://arxiv.org/abs/1807.04067) 20 | 21 | **Note:** we train the three sub-networks separately, just as described in the paper: we first train keypoint_subnet and then freeze the backbone parameters to train the person_detect sub-network. All training data is 22 | read through tf_record files. 
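The sketch below illustrates what "freeze the backbone" means in TensorFlow 1.x: pass the optimizer a `var_list` that excludes the backbone scopes. The scope names `resnet_v2_50` and `build_fpn_feature` are the ones used by this repo's backbone code; the tiny `person_detect` graph and the loss are placeholders for illustration only, not the real subnet.

```python
import tensorflow as tf

# Placeholder variables standing in for the real graph: one scope we want to
# freeze (the backbone) and one scope we want to train (the detection head).
with tf.variable_scope('resnet_v2_50'):
    backbone_w = tf.get_variable('w', shape=[3, 3, 3, 8])
with tf.variable_scope('person_detect'):
    head_w = tf.get_variable('w', shape=[8, 4])
total_loss = tf.reduce_sum(tf.square(head_w)) + 0.0 * tf.reduce_sum(backbone_w)

# Freezing = excluding the backbone/FPN variables from the optimizer's var_list,
# so gradient updates only touch the detection head.
frozen_scopes = ('resnet_v2_50', 'build_fpn_feature')
train_vars = [v for v in tf.trainable_variables()
              if not v.name.startswith(frozen_scopes)]
train_op = tf.train.AdamOptimizer(1e-4).minimize(total_loss, var_list=train_vars)
```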
23 | 24 | 25 | ### dataset: 26 | 27 | - pose-residual: ai_train2017.tfrecord ; coco_train2017.tfrecord 28 | - person-detect: ai-instance-bbox.tfrecord ; coco-instance-bbox.tfrecord 29 | - keypoint: ai_train2017.tfrecord & ai_train2017.json ; coco_train2017.tfrecord & coco_train2017.json 30 | 31 | coco-keypoints-annotations: 32 | 33 | [0-16]::::::[nose, left_eye, right_eye, left_ear, right_ear, left_shoulder, right_shoulder, left_elbow, 34 | right_elbow, left_wrist, right_wrist, left_hip, right_hip, left_knee, right_knee, left_ankle, right_ankle] 35 | 36 | # Thanks 37 | 38 | [mkocabas](https://github.com/mkocabas/pose-residual-network) 39 | [salihkaragoz](https://github.com/salihkaragoz/pose-residual-network-pytorch) 40 | 41 | 42 | -------------------------------------------------------------------------------- /eval_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import math 3 | import json 4 | import cv2 5 | import argparse 6 | import numpy as np 7 | from tqdm import tqdm 8 | from random import shuffle 9 | 10 | from pycocotools.coco import COCO 11 | from pycocotools.cocoeval import COCOeval 12 | from utils.gaussian import gaussian, crop, gaussian_multi_input_mp 13 | 14 | import tensorflow as tf 15 | from pose_residual_network.src.PRN import PRN 16 | from keypoint_subnet.src.get_heatmap import get_single_heatmap 17 | 18 | 19 | def eval(checkpoint = '/media/ulsee/D/PRN/20181015-0750/model.ckpt-245572', json_file = '/media/ulsee/E/datasets/coco/annotations2017/person_keypoints_val2017.json'): 20 | 21 | ckpt = checkpoint 22 | os.environ['CUDA_VISIBLE_DEVICES'] = '0' 23 | graph = tf.Graph() 24 | with graph.as_default(): 25 | inputs = tf.placeholder(tf.float32, shape=(1, 56, 36 , 17), name='inputs') 26 | prn = PRN(inputs=inputs, output_node=1*56*36*17, is_training=False) 27 | prn_out = prn.forward() 28 | init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()) 29 | saver = tf.train.Saver() 30 | 31 | with tf.Session(graph=graph) as sess: 32 | sess.run(init_op) 33 | saver.restore(sess, ckpt) 34 | print ('prn model restore successfully.') 35 | print('------------Evaulation Started------------') 36 | 37 | peak_results, bbox_results, coco = prepare(json_file) 38 | 39 | image_ids = [] 40 | my_results = [] 41 | n_kernel = 15 42 | 43 | w = int(18 * 2) 44 | h = int(28 * 2) 45 | in_thres = 0.21 46 | # tqdm, Python里用来控制显示的进度条,相当于循环 47 | for p in tqdm(peak_results): 48 | idx = p['image_id'] 49 | image_ids.append(idx) 50 | 51 | peaks = p['peaks'] 52 | # 找到当前图片所标注的所有的boxes,是一个列表的列表,[ [], [], ... ,[]],每个列表值是原始coco标注信息里的box值[x, y, w, h] 53 | bboxes = [k['bbox'] for k in bbox_results if k['image_id'] == idx] 54 | 55 | if len(bboxes) == 0 or len(peaks) == 0: 56 | continue 57 | 58 | # 构建网络的输入 59 | weights_bbox = np.zeros((len(bboxes), h, w, 4, 17)) 60 | # 对这个图片上所有的关键点信息进行处理,注意peaks是有17个元素的列表,对应coco数据集标注的17个关键点,每个元素可以有多个关键点,表示多个人的同一个部位 61 | for joint_id, peak in enumerate(peaks): 62 | # peak就是第几个channel上的所有关键点,也即是这个图片上所有的同一个类型的关键点信息,例如所有的鼻子、左肩、右肩等 63 | for instance_id, instance in enumerate(peak): 64 | # instance_id是当前channel上第几个点,instance是点,有四个值[x, y, 1, idx] 65 | p_x = instance[0] 66 | p_y = instance[1] 67 | 68 | for bbox_id, b in enumerate(bboxes): 69 | # bbox_id 表示第几个box,b是box,[xmin, ymin, w, h] 70 | # 下面的过程就和在训练pose-residual-net时生成训练数据是一样的。 71 | # 判断关键点是否在当前的box内,如果是,就根据缩放比例把weights_bbox对应的位置处表示为instance的值 72 | # ?没有很看懂为什么weights_box维度是[ len(bboxes), h, w, 4, 17],感觉完全就可以是[ len(bboxes), h, w, 17]? 
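# Note on the layout of weights_bbox (shape [len(bboxes), h, w, 4, 17]):
# channel 0 of the 4th axis is a presence flag at the peak's position (it is later
# blurred with a Gaussian and becomes the PRN input), while channels 1-3 keep the
# peak's score (set to 1 in prepare()), its global peak index and a small constant;
# those extra channels are read back further down as kp_score / kp_id / p_score,
# which is why the 4th axis has size 4.
# The test below accepts a peak for a box [x, y, w, h] if it lies inside the box
# expanded by in_thres (21% of the box width/height) on every side.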
73 | is_inside = p_x > b[0] - b[2] * in_thres and \ 74 | p_y > b[1] - b[3] * in_thres and \ 75 | p_x < b[0] + b[2] * (1.0 + in_thres) and \ 76 | p_y < b[1] + b[3] * (1.0 + in_thres) 77 | 78 | if is_inside: 79 | x_scale = float(w) / math.ceil(b[2]) 80 | y_scale = float(h) / math.ceil(b[3]) 81 | 82 | x0 = int((p_x - b[0]) * x_scale) 83 | y0 = int((p_y - b[1]) * y_scale) 84 | 85 | if x0 >= w and y0 >= h: 86 | x0 = w - 1 87 | y0 = h - 1 88 | elif x0 >= w: 89 | x0 = w - 1 90 | elif y0 >= h: 91 | y0 = h - 1 92 | elif x0 < 0 and y0 < 0: 93 | x0 = 0 94 | y0 = 0 95 | elif x0 < 0: 96 | x0 = 0 97 | elif y0 < 0: 98 | y0 = 0 99 | 100 | p = 1e-9 101 | 102 | weights_bbox[bbox_id, y0, x0, :, joint_id] = [1, instance[2], instance[3], p] 103 | 104 | old_weights_bbox = np.copy(weights_bbox) 105 | 106 | for j in range(weights_bbox.shape[0]): 107 | for t in range(17): 108 | weights_bbox[j, :, :, 0, t] = gaussian(weights_bbox[j, :, :, 0, t]) 109 | # weights_bbox[j, :, :, 0, :] = gaussian_multi_input_mp(weights_bbox[j, :, :, 0, :]) 110 | 111 | # -------------------get output of prn net--------------------# 112 | 113 | output_bbox = [] 114 | for j in range(weights_bbox.shape[0]): 115 | inp = weights_bbox[j, :, :, 0, :] # [h, w, 17] 116 | output = sess.run(prn_out, feed_dict={inputs:[inp]}) 117 | 118 | temp = np.reshape(output, (56, 36, 17)) 119 | kps = get_box_keypoints(temp) 120 | # print ('output_kps == {} '.format(kps)) 121 | output_bbox.append(temp) 122 | 123 | # output_box: [len(bboxes), 56, 36, 17] 124 | output_bbox = np.array(output_bbox) 125 | ############################################################################################################## 126 | # _img = cv2.imread('/media/ulsee/E/datasets/coco/cocoval2017/000000281929.jpg', cv2.COLOR_BGR2RGB) 127 | # kp = [339, 93, 2, 346, 88, 2, 328, 88, 2, 360, 89, 2, 318, 90, 1, 385, 135, 2, 301, 147, 2, 416, 184, 2, 128 | # 286, 204, 2, 407, 226, 2, 276, 244, 2, 358, 254, 2, 309, 259, 2, 352, 346, 2, 307, 349, 2, 348, 129 | # 448, 2, 312, 449, 2] 130 | # print (_img.shape) 131 | # heatmap = get_single_heatmap(kp, _img.shape[0], _img.shape[1], channels=17, sigma=4) 132 | # _prn_input = [] 133 | # for i in range(17): 134 | # _prn_input.append(cv2.resize(heatmap[:,:,i], (36, 56))) 135 | # # print (cv2.resize(heatmap[:,:,i], (36, 56)).shape) 136 | # _prn_input = np.reshape(np.asarray(_prn_input), (56, 36, 17)) 137 | # _prn_output = sess.run(prn_out, feed_dict={inputs:[_prn_input]}) 138 | # _prn_output_ = [] 139 | # for i in range(17): 140 | # _prn_output_.append(cv2.resize(_prn_output[0, :, :, i], (_img.shape[1], _img.shape[0]))) 141 | # _prn_output = np.reshape(np.asarray(_prn_output_), (17, _img.shape[0], _img.shape[1])) 142 | # _prn_output = np.transpose(_prn_output, (1,2,0)) 143 | # print (_prn_output.shape) 144 | # cv2.imwrite('true_channel0.jpg', np.expand_dims(heatmap[:,:,0]*255, axis=2)) 145 | # cv2.imwrite('true_heatmap.jpg', np.sum(heatmap, axis=2, keepdims=True) * 255) 146 | # cv2.imwrite('prn_channel0.jpg', np.expand_dims(_prn_output[:,:,0]*255, axis=2)) 147 | # cv2.imwrite('prn_heatmap.jpg', np.sum(_prn_output, axis=2, keepdims=True)*255) 148 | # return 149 | ############################################################################################################## 150 | 151 | keypoints_score = [] 152 | 153 | for t in range(17): 154 | indexes = np.argwhere(old_weights_bbox[:, :, :, 0, t] == 1) 155 | keypoint = [] 156 | for i in indexes: 157 | 158 | cr = crop(output_bbox[i[0], :, :, t], (i[1], i[2]), N=n_kernel) 159 | score = np.sum(cr) 160 | 
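# cr above is an n_kernel-sized window of the PRN output centred on this
# ground-truth peak location, so `score` measures how much PRN response falls
# around that peak; below it is weighted by the stored peak score, and each
# [kp_id, bbox_id, kp_score, score] row feeds the box-to-keypoint assignment
# table built in the next block.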
161 | kp_id = old_weights_bbox[i[0], i[1], i[2], 2, t] 162 | kp_score = old_weights_bbox[i[0], i[1], i[2], 1, t] 163 | p_score = old_weights_bbox[i[0], i[1], i[2], 3, t] ## ?? 164 | bbox_id = i[0] 165 | # print ('score == {}, kp_score == {}'.format(score, kp_score)) 166 | score = kp_score * score 167 | 168 | s = [kp_id, bbox_id, kp_score, score] 169 | 170 | keypoint.append(s) 171 | keypoints_score.append(keypoint) 172 | 173 | bbox_keypoints = np.zeros((weights_bbox.shape[0], 17, 3)) 174 | bbox_ids = np.arange(len(bboxes)).tolist() 175 | 176 | # kp_id, bbox_id, kp_score, my_score 177 | for i in range(17): 178 | joint_keypoints = keypoints_score[i] 179 | if len(joint_keypoints) > 0: 180 | 181 | kp_ids = list(set([x[0] for x in joint_keypoints])) 182 | 183 | table = np.zeros((len(bbox_ids), len(kp_ids), 4)) 184 | 185 | for b_id, bbox in enumerate(bbox_ids): 186 | for k_id, kp in enumerate(kp_ids): 187 | own = [x for x in joint_keypoints if x[0] == kp and x[1] == bbox] 188 | 189 | if len(own) > 0: 190 | table[bbox, k_id] = own[0] 191 | else: 192 | table[bbox, k_id] = [0] * 4 193 | 194 | for b_id, bbox in enumerate(bbox_ids): 195 | 196 | row = np.argsort(-table[bbox, :, 3]) 197 | 198 | if table[bbox, row[0], 3] > 0: 199 | for r in row: 200 | if table[bbox, r, 3] > 0: 201 | column = np.argsort(-table[:, r, 3]) 202 | 203 | if bbox == column[0]: 204 | bbox_keypoints[bbox, i, :] = [x[:3] for x in peaks[i] if x[3] == table[bbox, r, 0]][0] 205 | break 206 | else: 207 | row2 = np.argsort(table[column[0], :, 3]) 208 | if row2[0] == r: 209 | bbox_keypoints[bbox, i, :] = [x[:3] for x in peaks[i] if x[3] == table[bbox, r, 0]][0] 210 | break 211 | else: 212 | for j in range(weights_bbox.shape[0]): 213 | b = bboxes[j] 214 | x_scale = float(w) / math.ceil(b[2]) 215 | y_scale = float(h) / math.ceil(b[3]) 216 | 217 | for t in range(17): 218 | indexes = np.argwhere(old_weights_bbox[j, :, :, 0, t] == 1) 219 | if len(indexes) == 0: 220 | max_index = np.argwhere(output_bbox[j, :, :, t] == np.max(output_bbox[j, :, :, t])) 221 | bbox_keypoints[j, t, :] = [max_index[0][1] / x_scale + b[0], 222 | max_index[0][0] / y_scale + b[1], 0] 223 | 224 | my_keypoints = [] 225 | # print ('bbox_keypoints === {}'.format(bbox_keypoints)) 226 | for i in range(bbox_keypoints.shape[0]): 227 | k = np.zeros(51) 228 | k[0::3] = bbox_keypoints[i, :, 0] 229 | k[1::3] = bbox_keypoints[i, :, 1] 230 | k[2::3] = [2] * 17 231 | 232 | pose_score = 0 233 | count = 0 234 | for f in range(17): 235 | if bbox_keypoints[i, f, 0] != 0 and bbox_keypoints[i, f, 1] != 0: 236 | count += 1 237 | pose_score += bbox_keypoints[i, f, 2] 238 | # print (pose_score) 239 | pose_score /= 17.0 240 | 241 | my_keypoints.append(k) 242 | 243 | image_data = { 244 | 'image_id': idx, 245 | 'bbox': bboxes[i], 246 | 'score': pose_score, 247 | 'category_id': 1, 248 | 'keypoints': k.tolist() 249 | } 250 | my_results.append(image_data) 251 | # print ('###############################################################') 252 | # if len(my_results) > 10: 253 | # break 254 | ann_filename = 'val2017_PRN_keypoint_results_prn_.json' 255 | # write output 256 | json.dump(my_results, open(ann_filename, 'w'), indent=4) 257 | 258 | # load results in COCO evaluation tool 259 | coco_pred = coco.loadRes(ann_filename) 260 | 261 | # run COCO evaluation 262 | coco_eval = COCOeval(coco, coco_pred, 'keypoints') 263 | coco_eval.params.imgIds = image_ids 264 | coco_eval.evaluate() 265 | coco_eval.accumulate() 266 | coco_eval.summarize() 267 | 268 | # os.remove(ann_filename) 269 | def 
get_box_keypoints(prn_out): 270 | ''' 271 | 272 | :param prn_out: a heatmap, typically the PRN network output 273 | :return: 274 | keypoints: a list of lists, containing one coordinate pair per channel, e.g., [ [x1, y1], [x2, y2], ..., [x17, y17] ] 275 | ''' 276 | keypoints = [] 277 | for c in range(17): 278 | current_channel = prn_out[:, :, c] 279 | cur_max = np.max(current_channel) 280 | if cur_max == 0: 281 | coorx = 0 282 | coory = 0 283 | else: 284 | index_all = np.where(current_channel == cur_max) 285 | coorx = index_all[0][0] 286 | coory = index_all[1][0] 287 | 288 | keypoints.append([coory, coorx]) 289 | 290 | return keypoints 291 | 292 | def prepare(json_file): 293 | 294 | cocodir = json_file 295 | ann = json.load(open(cocodir)) 296 | bbox_results = ann['annotations'] 297 | 298 | coco = COCO(cocodir) 299 | img_ids = coco.getImgIds(catIds=[1]) 300 | 301 | peak_results = [] 302 | # peak_results is a list; every element is a dict with three keys: image_id, peaks and file_name. image_id and file_name are simply the image's id and file name in the COCO dataset. 303 | # peaks is a list with 17 elements; each element is itself a list of N sub-lists, and there are two cases: 304 | # 1. based on the original keypoint annotation, whenever v > 0 (i.e. the point is annotated, whether visible or not), a four-value list [x,y,v,idx] is appended as an element of this list; if several people are annotated 305 | # on the image, their keypoints are found as well and appended in the same four-value form [x,y,1,idx] 306 | # where x and y are the keypoint coordinates annotated in the COCO dataset, v is always set to 1, and idx is the running index (starting from 0) of this annotatable keypoint 307 | # 2. if the original keypoint has v == 0, the corresponding list stays empty 308 | # so the final content of peaks may look like [ [], [], [[x, y, 1, 0]], [], [[x, y, 1, 1], [x,y,1,2], [x,y,1,3]], [[x, y, 1, 4]], [], ..., [[x, y, 1, idx]] ] 309 | for i in img_ids: 310 | anns = coco.loadAnns(coco.getAnnIds(imgIds=i)) 311 | # kps holds the keypoints of every person on the image (there may be several people), i.e. a list of lists: [ [keypoints1], [keypoints2] ] 312 | kps = [a['keypoints'] for a in anns] 313 | 314 | idx = 0 315 | 316 | ks = [] 317 | for i in range(17): 318 | t = [] 319 | for k in kps: 320 | x = k[0::3][i] 321 | y = k[1::3][i] 322 | v = k[2::3][i] 323 | 324 | if v > 0: 325 | t.append([x, y, 1, idx]) 326 | idx += 1 327 | ks.append(t) 328 | image_id = anns[0]['image_id'] 329 | peaks = ks 330 | 331 | element = { 332 | 'image_id': image_id, 333 | 'peaks': peaks, 334 | 'file_name': coco.loadImgs(image_id)[0]['file_name'] 335 | } 336 | 337 | peak_results.append(element) 338 | 339 | shuffle(peak_results) 340 | 341 | # temporary_peak_res: drop the entries whose peaks are all empty, i.e. images without a single annotated keypoint, keeping only images with at least test_keypoint_count non-empty channels 342 | temporary_peak_res = [] 343 | for p in peak_results: 344 | if (sum(1 for i in p['peaks'] if i != []) >= 0): 345 | temporary_peak_res.append(p) 346 | peak_results = temporary_peak_res 347 | 348 | return peak_results, bbox_results, coco 349 | 350 | 351 | eval() -------------------------------------------------------------------------------- /keypoint_subnet/README.md: -------------------------------------------------------------------------------- 1 | Human keypoint detection network. The tfrecord it uses is generated directly from the image folder and only keeps each image's name, height and width. Only these attributes are kept because 2 | writing the gt_heatmap directly into the tfrecord would make the resulting tfrecord file far too large. For that reason the gt_heatmap is not stored; 3 | instead, during training a pre-built json file is read in, whose elements are dicts mapping an image name (key) to its keypoint values (value), and 4 | the keypoints are then taken from this json file to generate the heatmaps on the fly. -------------------------------------------------------------------------------- /keypoint_subnet/__init__.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: __init__.py 7 | @time: 18-9-29 下午6:56 8 | ''' -------------------------------------------------------------------------------- /keypoint_subnet/keypoint_test.py: -------------------------------------------------------------------------------- 1 | # 
encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: keypoint_test.py 7 | @time: 18-10-8 上午9:50 8 | ''' 9 | import tensorflow as tf 10 | from datetime import datetime 11 | import os, cv2, json 12 | import logging 13 | import numpy as np 14 | import math 15 | 16 | import sys 17 | 18 | from src.backbone import BackBone 19 | from src.model import Keypoint_Subnet 20 | from src.get_heatmap import get_heatmap 21 | from src.reader import Keypoint_Reader 22 | from src.json_read import load_json, load_coco_json 23 | from src.img_pre_processing import image_vertical_flipping 24 | 25 | FLAGS = tf.flags.FLAGS 26 | 27 | tf.flags.DEFINE_string('model', '/media/ulsee/D/keypoint_subnet/20181023-2043/model_alter.ckpt-239999', 28 | 'model path you want to test, e.g., (/media/ulsee/D/multi-pose-net/20180829-1927/model.ckpt-xxxxx)') 29 | tf.flags.DEFINE_string('img_path', '/media/ulsee/E/datasets/coco/cocotrain2017', 30 | 'image path to test model.') 31 | tf.flags.DEFINE_string('save_path', '/media/ulsee/E/keypoint/coco/train2017', 'path to save image test result') 32 | tf.flags.DEFINE_boolean('is_training', True, '') 33 | tf.flags.DEFINE_integer(name='batch_size', default=1, help='train batch size number') 34 | tf.flags.DEFINE_integer(name='img_size', default=480, help='net input size') 35 | tf.flags.DEFINE_integer(name='num_keypoints', default=17, help='number of keypoints to detect') 36 | 37 | 38 | 39 | def is_image(img_name): 40 | img_name = img_name.lower() 41 | if img_name.endswith('.jpg') or img_name.endswith('.png') or img_name.endswith('jpeg'): 42 | return True 43 | return False 44 | 45 | 46 | 47 | def deal_with_heatmaps(img, heatmap, factorx, factory, num_keypoints, score_threshold, nms_threshold=5, type=1): 48 | ''' 49 | 50 | :param img: 51 | :param heatmap: 52 | :param num_keypoints: 53 | :param type: 1 for single person and other for multi-person 54 | :return: 55 | ''' 56 | if type == 1: 57 | for c in range(num_keypoints): 58 | current_heatmap = heatmap[0, :, :, c] 59 | 60 | cur_max = np.max(current_heatmap) 61 | # print (cur_max) 62 | if cur_max < score_threshold: 63 | continue 64 | index_all = np.where(current_heatmap == cur_max) 65 | coorx = index_all[0][0] 66 | coory = index_all[1][0] 67 | 68 | coorx = int(coorx * factorx) 69 | coory = int(coory * factory) 70 | 71 | cv2.circle(img, (coory, coorx), 5, (0, 0, 255), -1) 72 | cv2.putText(img, str(c), (coory, coorx), cv2.FONT_HERSHEY_COMPLEX, 1, (255, 0, 0), 1) 73 | else: 74 | threshold = score_threshold 75 | nms_threshold = nms_threshold 76 | cur_max = 0 77 | count = 0 78 | for c in range(num_keypoints): 79 | current_heatmap = heatmap[0, :, :, c] 80 | x, y = np.where(current_heatmap > threshold) 81 | coordinate = list(zip(x, y)) 82 | # print(coordinate) 83 | s = [] 84 | for coor in coordinate: 85 | # print(coor) 86 | # print(current_heatmap[coor]) 87 | s.append(current_heatmap[coor]) 88 | s = np.asarray(s) 89 | # print(s) 90 | s_index = s.argsort()[::-1] # 降序,第一个位置的索引值最大 91 | # print(s_index) 92 | # nms 93 | keep = [] 94 | 95 | while s_index.size > 0: 96 | keep.append(s_index[0]) 97 | s_index = s_index[1:] 98 | last = [] 99 | for index in s_index: 100 | # print(keep[-1], index) 101 | distance = np.sqrt(np.sum(np.square( 102 | np.asarray(coordinate[keep[-1]]) - np.asarray(coordinate[index]) 103 | ))) 104 | if distance > nms_threshold: 105 | last.append(index) 106 | 107 | s_index = np.asarray(last) 108 | 109 | for index in keep: 110 | coor = coordinate[index] 111 | coorx = coor[0] 112 
| coory = coor[1] 113 | 114 | coorx = int(coorx * factorx) 115 | coory = int(coory * factory) 116 | 117 | cv2.circle(img, (coory, coorx), 5, (0, 0, 255), -1) 118 | cv2.putText(img, str(c), (coory, coorx), cv2.FONT_HERSHEY_COMPLEX, 1, (255, 0, 0), 1) 119 | count += 1 120 | cur_max += s[index] 121 | 122 | cur_max = cur_max / (count if count > 0 else 1) 123 | 124 | return img, cur_max 125 | 126 | def _test(score_threshold, nms_threshold): 127 | global save_json 128 | if not os.path.exists(FLAGS.save_path): 129 | os.makedirs(FLAGS.save_path) 130 | 131 | graph = tf.Graph() 132 | with graph.as_default(): 133 | # ------------------------get backbone net--------------------------------# 134 | backbone = BackBone(img_size=FLAGS.img_size, batch_size=FLAGS.batch_size, is_training=FLAGS.is_training) 135 | fpn, _ = backbone.build_fpn_feature() 136 | # ---------------------------keypoint net---------------------------------# 137 | keypoint_net = Keypoint_Subnet(inputs=backbone.input_imgs, img_size=backbone.img_size, fpn=fpn, 138 | batch_size=backbone.batch_size, num_classes=FLAGS.num_keypoints) 139 | pre_heat, _ = keypoint_net.forward() 140 | 141 | g_list = tf.global_variables() 142 | 143 | bn_moving_mean = [g for g in g_list if 'moving_mean' in g.name] 144 | bn_moving_vars = [g for g in g_list if 'moving_variance' in g.name] 145 | 146 | var_list = tf.trainable_variables() 147 | var_list += bn_moving_vars + bn_moving_mean 148 | # for var in var_list: 149 | # print (var) 150 | 151 | init_op = tf.group(tf.global_variables_initializer()) 152 | 153 | saver = tf.train.Saver() 154 | 155 | with tf.Session(graph=graph) as sess: 156 | sess.run(init_op) 157 | saver.restore(sess, FLAGS.model) 158 | print('model restore successfully.') 159 | 160 | img_num = 0 161 | test_img_id = ['000000135361','000000265513','000000496607','000000270836'] 162 | 163 | avg = 0 164 | 165 | for img in os.listdir(FLAGS.img_path): 166 | # if not is_image(img): 167 | # continue 168 | # if img.split('.')[0] not in test_img_id: 169 | # continue 170 | img_num += 1 171 | img_ori = cv2.imread(os.path.join(FLAGS.img_path, img), cv2.IMREAD_COLOR) 172 | 173 | # img_ori = cv2.flip(img_ori, 1) 174 | 175 | img_copy = img_ori.copy() 176 | 177 | # img_input = img_copy 178 | img_input = cv2.resize(img_copy, (FLAGS.img_size, FLAGS.img_size), interpolation=cv2.INTER_NEAREST) 179 | heatmaps = sess.run(pre_heat, 180 | feed_dict={backbone.input_imgs:[img_input]}) 181 | 182 | factorx = img_ori.shape[0] / heatmaps.shape[1] 183 | facotry = img_ori.shape[1] / heatmaps.shape[2] 184 | img_save, cur_max = deal_with_heatmaps(img_ori, heatmaps, factorx, facotry, FLAGS.num_keypoints, 185 | score_threshold=score_threshold, nms_threshold=nms_threshold, type=2) 186 | avg += cur_max 187 | cv2.imwrite(os.path.join(FLAGS.save_path, img), img_save) 188 | # for mean in bn_moving_vars: 189 | # print(sess.run(mean)) 190 | # break 191 | 192 | if img_num == 400: 193 | break 194 | print('tested {}'.format(img_num)) 195 | 196 | print('avg max === {}'.format(avg/img_num)) 197 | 198 | if __name__ == '__main__': 199 | _test(score_threshold=0.05, nms_threshold=5) -------------------------------------------------------------------------------- /keypoint_subnet/keypoint_train.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: keypoint_train.py 7 | @time: 18-9-28 下午12:13 8 | ''' 9 | 10 | import tensorflow as tf 11 | from 
tensorflow.python.framework import graph_util 12 | from tensorflow.python.platform import gfile 13 | from datetime import datetime 14 | import os, time, cv2 15 | import numpy as np 16 | 17 | from src.backbone import BackBone 18 | from src.model import Keypoint_Subnet 19 | from src.get_heatmap import get_heatmap 20 | from src.reader import Keypoint_Reader 21 | from src.json_read import load_json, load_coco_json 22 | from src.img_pre_processing import img_pre_processing 23 | 24 | 25 | 26 | FLAGS = tf.flags.FLAGS 27 | tf.flags.DEFINE_integer('train_nums', 118280, 'train data nums, default: cocotrain2017--118280') 28 | tf.flags.DEFINE_integer('epochs', 8, 'train epochs') 29 | tf.flags.DEFINE_integer('batch_size', 4, 'train batch size number') 30 | tf.flags.DEFINE_integer('img_size', 480, 'net input size') 31 | tf.flags.DEFINE_float('learning_rate', 1e-4, 'trian lr') 32 | tf.flags.DEFINE_float('decay_rate', 0.9, 'lr decay rate') 33 | tf.flags.DEFINE_integer('decay_steps', 10000, 'lr decay steps') 34 | tf.flags.DEFINE_integer('max_to_keep', 10, 'num of models to saved') 35 | tf.flags.DEFINE_integer('num_keypoints', 17, 'number of keypoints to detect') 36 | tf.flags.DEFINE_string('pretrained_resnet', 'pre_trained/resnet_v2_50.ckpt', 37 | 'resnet_v2_50 pretrained model') 38 | tf.flags.DEFINE_boolean('is_training', True, '') 39 | tf.flags.DEFINE_string('checkpoint_path', '/media/ulsee/D/keypoint_subnet', 'path to save training model') 40 | tf.flags.DEFINE_string('tfrecord_file', '/media/ulsee/E/keypoint_subnet_tfrecord/coco_train2017.tfrecord', '') 41 | tf.flags.DEFINE_string('json_file', '/media/ulsee/E/keypoint_subnet_tfrecord/coco_train2017.json', 42 | '') 43 | tf.flags.DEFINE_string('finetuning', '20181023-2043/model_alter.ckpt-239999', 44 | 'folder of saved model that you wish to continue training or testing(e.g. 20180828-1803/model.ckpt-xxx), default:None') 45 | tf.flags.DEFINE_boolean('change_dataset', False, 46 | 'if change dataset from ai_challenger to coco, the num_keypoints will be changed. 
If so, when we finetunnig, need to ' 47 | 'specify do not restore the last output layer var.') 48 | 49 | def keypoint_train(): 50 | os.environ['CUDA_VISIBLE_DEVICES'] = '0' 51 | 52 | # -------------------define where checkpoint path is-------------------------# 53 | current_time = datetime.now().strftime('%Y%m%d-%H%M') 54 | if FLAGS.finetuning is None: 55 | checkpoints_dir = os.path.join(FLAGS.checkpoint_path, current_time) 56 | if not os.path.exists(checkpoints_dir): 57 | try: 58 | os.makedirs(checkpoints_dir) 59 | except: 60 | pass 61 | else: 62 | checkpoints_dir = os.path.join(FLAGS.checkpoint_path, FLAGS.finetuning) 63 | print('checkpoints_dir == {}'.format(checkpoints_dir)) 64 | #-----------------------------load json--------------------------------------# 65 | imgid_keypoints_dict = load_json(FLAGS.json_file) 66 | # ------------------------------define Graph --------------------------------# 67 | # tf.reset_default_graph() 68 | graph = tf.Graph() 69 | with graph.as_default(): 70 | #------------------------get backbone net--------------------------------# 71 | backbone = BackBone(img_size=FLAGS.img_size, batch_size=FLAGS.batch_size, is_training=FLAGS.is_training) 72 | fpn, _ = backbone.build_fpn_feature() 73 | #---------------------------keypoint net---------------------------------# 74 | keypoint_net = Keypoint_Subnet(inputs=backbone.input_imgs, img_size=backbone.img_size, fpn=fpn, 75 | batch_size=backbone.batch_size, num_classes=FLAGS.num_keypoints) 76 | total_loss, net_loss, pre_heat = keypoint_net.net_loss() 77 | #-------------------------------reader-----------------------------------# 78 | reader = Keypoint_Reader(tfrecord_file=FLAGS.tfrecord_file, batch_size=FLAGS.batch_size, img_size=FLAGS.img_size, epochs=FLAGS.epochs) 79 | img_batch, img_id_batch, img_height_batch, img_width_batch = reader.feed() 80 | #-----------------------------learning rate------------------------------# 81 | global_step = tf.Variable(0) 82 | learning_rate = tf.train.exponential_decay(FLAGS.learning_rate, global_step=global_step, 83 | decay_steps=int(FLAGS.train_nums / FLAGS.batch_size), 84 | decay_rate=FLAGS.decay_rate, 85 | staircase=True) 86 | opt = tf.train.AdamOptimizer(learning_rate, epsilon=1e-5) 87 | # grads = opt.compute_gradients(total_loss) 88 | # apply_gradient_op = opt.apply_gradients(grads, global_step=global_step) 89 | 90 | # MOVING_AVERAGE_DECAY = 0.99 91 | # variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step) 92 | # variable_to_average = (tf.trainable_variables() + tf.moving_average_variables()) 93 | # variables_averages_op = variable_averages.apply(variable_to_average) 94 | 95 | update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) 96 | with tf.control_dependencies(update_ops): 97 | train_op = opt.minimize(total_loss, global_step=global_step) 98 | 99 | #--------------------------------saver-----------------------------------# 100 | res50_var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='resnet_v2_50') 101 | restore_res50 = tf.train.Saver(var_list=res50_var_list) 102 | 103 | fpn_var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='build_fpn_feature') 104 | keypoint_subnet_var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='keypoint_subnet') 105 | output_name = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='keypoint_subnet.output') 106 | 107 | var_list = tf.trainable_variables() 108 | global_list = tf.global_variables() 109 | bn_moving_vars = [g for g in global_list if 'moving_mean' 
in g.name] 110 | bn_moving_vars += [g for g in global_list if 'moving_variance' in g.name] 111 | var_list += bn_moving_vars 112 | 113 | if FLAGS.change_dataset: 114 | for node in output_name: 115 | var_list.remove(node) 116 | 117 | if FLAGS.finetuning is not None: 118 | restore_finetuning = tf.train.Saver(var_list=var_list) 119 | 120 | saver = tf.train.Saver(var_list=var_list, max_to_keep=20) 121 | saver_alter = tf.train.Saver(max_to_keep=5) 122 | 123 | #---------------------------------control sigma for heatmap-------------------------------# 124 | start_gussian_sigma = 10.0 125 | end_gussian_sigma = 2.5 126 | start_decay_sigma_step = 10000 127 | decay_steps = 50000 128 | # gussian sigma will decay when global_step > start_decay_sigma_step 129 | gussian_sigma = tf.where( 130 | tf.greater(global_step, start_decay_sigma_step), 131 | tf.train.polynomial_decay(start_gussian_sigma, 132 | tf.cast(global_step, tf.int32) - start_decay_sigma_step, 133 | decay_steps, 134 | end_gussian_sigma, 135 | power=1.0), 136 | start_gussian_sigma 137 | ) 138 | # --------------------------------init------------------------------------# 139 | init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()) 140 | config = tf.ConfigProto() 141 | config.gpu_options.allow_growth = True 142 | 143 | #--------------------------------tf summary--------------------------------# 144 | img_id_batch_placeholder = tf.placeholder(tf.string, shape=[FLAGS.batch_size,]) 145 | tf.summary.text('img_ids', img_id_batch_placeholder) 146 | tf.summary.scalar('total_loss', total_loss) 147 | tf.summary.scalar('net_loss', net_loss) 148 | tf.summary.image('gt_right_ankle', tf.reshape(tf.transpose( 149 | keypoint_net.input_heats, [3, 0, 1, 2])[16], shape=[-1, FLAGS.img_size // 4, FLAGS.img_size // 4, 1]), max_outputs=2) 150 | tf.summary.image('ori_image', backbone.input_imgs, max_outputs=2) 151 | # tf.summary.image('gt_left_shoulder', tf.reshape(tf.transpose( 152 | # keypoint_net.input_heats, [3, 0, 1, 2])[5], shape=[-1, FLAGS.img_size // 4, FLAGS.img_size // 4, 1]),max_outputs=2) 153 | tf.summary.image('pred_right_ankle', tf.reshape(tf.transpose( 154 | pre_heat, [3, 0, 1, 2])[16], shape=[-1, FLAGS.img_size // 4, FLAGS.img_size // 4, 1]), max_outputs=2) 155 | tf.summary.image('gt_heatmap', tf.reduce_sum(keypoint_net.input_heats, axis=3, keepdims=True), max_outputs=2) 156 | tf.summary.image('pred_heatmap', tf.reduce_sum(pre_heat, axis=3, keepdims=True), max_outputs=2) 157 | tf.summary.scalar('lr', learning_rate) 158 | summary_op = tf.summary.merge_all() 159 | summary_writer = tf.summary.FileWriter(checkpoints_dir, graph) 160 | # --------------------------------train------------------------------------# 161 | with tf.Session(graph=graph, config=config) as sess: 162 | sess.run(init_op) 163 | coord = tf.train.Coordinator() 164 | threads = tf.train.start_queue_runners(sess=sess, coord=coord) 165 | step = 0 166 | 167 | if FLAGS.finetuning is not None: 168 | restore_finetuning.restore(sess, checkpoints_dir) 169 | print ('Successfully load pre_trained keypoint_subnet model.') 170 | # step = int(checkpoints_dir.split('/')[-1].split('.')[-1].split('-')[-1]) 171 | print ('Global_step == {}, Step == {}'.format(sess.run(global_step), step)) 172 | step = sess.run(global_step) 173 | # -- bn layer: resnet_v2_50/block1/unit_1/bottleneck_v2/conv1/BatchNorm/ ---# 174 | # gamma = graph.get_tensor_by_name(name='resnet_v2_50/block4/unit_3/bottleneck_v2/conv2/BatchNorm/gamma:0') 175 | # beta = 
graph.get_tensor_by_name(name='resnet_v2_50/block4/unit_3/bottleneck_v2/conv2/BatchNorm/beta:0') 176 | # print('finetuning gamma = ', sess.run(gamma)[:50]) 177 | # print('beta = ', sess.run(beta)[:50]) 178 | 179 | else: 180 | restore_res50.restore(sess, FLAGS.pretrained_resnet) 181 | print ('Successfully load pre_trained resnet_v2_50 model') 182 | # -- bn layer: resnet_v2_50/block1/unit_1/bottleneck_v2/conv1/BatchNorm/ ---# 183 | # gamma = graph.get_tensor_by_name( 184 | # name='resnet_v2_50/block1/unit_1/bottleneck_v2/conv1/BatchNorm/gamma:0') 185 | # beta = graph.get_tensor_by_name(name='resnet_v2_50/block1/unit_1/bottleneck_v2/conv1/BatchNorm/beta:0') 186 | # print('no finetuning gamma = ', sess.run(gamma)[:50]) 187 | # print('beta = ', sess.run(beta)[:50]) 188 | 189 | start_time = time.time() 190 | try: 191 | while not coord.should_stop(): 192 | imgs, imgs_id, imgs_height, imgs_width, g_sigma = sess.run([img_batch, img_id_batch, img_height_batch, img_width_batch, gussian_sigma]) 193 | 194 | gt_heatmaps = get_heatmap(label_dict=imgid_keypoints_dict, img_ids=imgs_id, img_heights=imgs_height, 195 | img_widths=imgs_width, img_resize=FLAGS.img_size, num_keypoints=FLAGS.num_keypoints, 196 | sigma=g_sigma) 197 | 198 | # imgs, gt_heatmaps = img_pre_processing(imgs, gt_heatmaps) 199 | 200 | _, loss_all, net_out_loss, pre_heats, lr, merge_op = sess.run( 201 | [train_op, total_loss, net_loss, pre_heat, learning_rate, summary_op], 202 | feed_dict={backbone.input_imgs:imgs, 203 | keypoint_net.input_heats:gt_heatmaps, 204 | img_id_batch_placeholder:imgs_id} 205 | ) 206 | if step % 100 == 0: 207 | summary_writer.add_summary(merge_op, step) 208 | summary_writer.flush() 209 | 210 | if (step + 1) % 10 == 0: 211 | cur_time = time.time() 212 | print ('-------------------Step %d:-------------------' % step) 213 | print ('total_loss = {}, out_put_loss = {}, lr = {}, sigma = {}, time spend = {}' 214 | .format(loss_all, net_out_loss, lr, g_sigma, cur_time-start_time)) 215 | start_time = cur_time 216 | 217 | # # -- bn layer: resnet_v2_50/block1/unit_1/bottleneck_v2/conv1/BatchNorm/ ---# 218 | # gamma = graph.get_tensor_by_name( 219 | # name='resnet_v2_50/block1/unit_1/bottleneck_v2/conv1/BatchNorm/gamma:0') 220 | # beta = graph.get_tensor_by_name( 221 | # name='resnet_v2_50/block1/unit_1/bottleneck_v2/conv1/BatchNorm/beta:0') 222 | # print('no finetuning gamma = ', sess.run(gamma)[:50]) 223 | # print('beta = ', sess.run(beta)[:50]) 224 | # print (sess.run(bn_moving_vars[0])) 225 | # input_graph_def = tf.get_default_graph().as_graph_def() 226 | # output_graph_def = graph_util.convert_variables_to_constants(sess, input_graph_def, 227 | # 'keypoint_subnet/output/biases'.split(',')) 228 | # model_f = tf.gfile.FastGFile('model.pb', 'wb') 229 | # model_f.write(output_graph_def.SerializeToString()) 230 | # break 231 | if (step + 1) % 5000 == 0: 232 | save_path = saver.save(sess, checkpoints_dir + '/model.ckpt', global_step=step) 233 | print ('Model saved in file: {}'.format(save_path)) 234 | save_path_alter = saver_alter.save(sess, checkpoints_dir+'/model_alter.ckpt', global_step=step) 235 | 236 | step += 1 237 | 238 | 239 | except KeyboardInterrupt: 240 | print ('Interrupted, current step == {}'.format(step)) 241 | coord.request_stop() 242 | 243 | except Exception as e: 244 | coord.request_stop(e) 245 | 246 | finally: 247 | save_path = saver.save(sess, checkpoints_dir + "/model.ckpt", global_step=step) 248 | print ("Model saved in file: {}" .format(save_path)) 249 | save_path_alter = saver_alter.save(sess, 
checkpoints_dir + '/model_alter.ckpt', global_step=step) 250 | print ('Current step = {}'.format(step)) 251 | # When done, ask the threads to stop. 252 | coord.request_stop() 253 | coord.join(threads) 254 | 255 | 256 | if __name__ == '__main__': 257 | keypoint_train() 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | -------------------------------------------------------------------------------- /keypoint_subnet/src/__init__.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: __init__.py 7 | @time: 18-9-28 上午11:23 8 | ''' -------------------------------------------------------------------------------- /keypoint_subnet/src/backbone.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: backbone.py 7 | @time: 18-9-28 上午11:03 8 | ''' 9 | 10 | from __future__ import absolute_import, division, print_function 11 | 12 | import tensorflow as tf 13 | 14 | from tensorflow.contrib.slim import nets 15 | from tensorflow.contrib.layers.python.layers import utils 16 | import tensorflow.contrib.slim as slim 17 | 18 | class BackBone(object): 19 | def __init__(self, img_size, batch_size, is_training=True): 20 | self.img_size = img_size 21 | self.batch_size = batch_size 22 | self.input_imgs = tf.placeholder(tf.float32, [self.batch_size, self.img_size, self.img_size, 3]) 23 | self.is_training = is_training 24 | self.stddev = 0.01 25 | 26 | def get_feature_map(self): 27 | #-------------------resent---------------------# 28 | arg_scope = nets.resnet_v2.resnet_arg_scope() 29 | with slim. 
arg_scope(arg_scope): 30 | out, end_points = nets.resnet_v2.resnet_v2_50(inputs=self.input_imgs, num_classes=None, is_training=self.is_training) 31 | #---------------feature map dict---------------# 32 | feature_map_dict = { 33 | 'C2': end_points['resnet_v2_50/block1/unit_2/bottleneck_v2'], # input_size / 4 34 | 'C3': end_points['resnet_v2_50/block2/unit_3/bottleneck_v2'], # input_size / 8 35 | 'C4': end_points['resnet_v2_50/block3/unit_4/bottleneck_v2'], # input_size / 16 36 | 'C5': end_points['resnet_v2_50/block4'] # input_size / 32 37 | } 38 | return feature_map_dict 39 | 40 | def build_fpn_feature(self): 41 | feature_pyramid = {} 42 | feature_map_dict = self.get_feature_map() 43 | #------------------------------------------build fpn-------------------------------------------# 44 | with tf.variable_scope('build_fpn_feature'): 45 | with slim.arg_scope([slim.conv2d], weights_initializer=tf.random_normal_initializer(stddev=self.stddev)): 46 | feature_pyramid['P5'] = slim.conv2d(feature_map_dict['C5'], num_outputs=256, kernel_size=[1, 1], stride=1, 47 | scope='build_fpn_P5') 48 | 49 | #------------------ top-down pathway and lateral connections--------------------------# 50 | for layer in range(4, 1, -1): 51 | p = feature_pyramid['P' + str(layer + 1)] 52 | c = feature_map_dict['C' + str(layer)] 53 | 54 | #---------------------------------- upsample p -----------------------------------# 55 | up_shape = c.get_shape() 56 | up_sample = tf.image.resize_nearest_neighbor(p, [up_shape[2], up_shape[2]], 57 | name='upsampling_fpn_P%d' % layer) 58 | 59 | #----------------------------------- 1x1 conv ------------------------------------# 60 | c = slim.conv2d(c, num_outputs=256, kernel_size=[1, 1], stride=1, scope='fpn_1x1conv_C%d' % layer) 61 | p = up_sample + c 62 | 63 | #----------------------reduce aliasing effect of upsampling ----------------------# 64 | #---------------(in the third last paragraph, Section 3, Paper FPN)---------------# 65 | p = slim.conv2d(p, num_outputs=256, kernel_size=[3, 3], stride=1, padding='SAME', 66 | scope='build_fpn_P%d' % layer) 67 | 68 | feature_pyramid['P' + str(layer)] = p 69 | 70 | return feature_pyramid, feature_map_dict 71 | 72 | -------------------------------------------------------------------------------- /keypoint_subnet/src/convert_tfrecord.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: convert_tfrecord.py.py 7 | @time: 18-9-28 下午6:50 8 | ''' 9 | 10 | import os, cv2 11 | import numpy as np 12 | import tensorflow as tf 13 | 14 | FLAGS = tf.flags.FLAGS 15 | 16 | tf.flags.DEFINE_string(name='image_dir', default='/media/ulsee/E/datasets/test2', 17 | help='image directory for building tfrecord') 18 | 19 | tf.flags.DEFINE_string(name='tfrecord_file', default='/media/ulsee/E/keypoint_subnet_tfrecord/coco_train2017-test.tfrecord', 20 | help='output path you want to save tfrecord data file') 21 | 22 | tf.flags.DEFINE_integer(name='img_num', default=21, 23 | help='define how many images to build tfrecord data, zero menas all') 24 | 25 | 26 | def img_reader(image_dir): 27 | ''' 28 | read imgs in image_dir and return some lists 29 | :param image_dir: string, path of input image dir, e.g., /path/to/imgdir/ 30 | :return: 31 | img_paths: img path for every single img 32 | img_ids: img name without suffix for every single img 33 | img_heights: img height for every single img 34 | img_widths: img width for every 
single img 35 | ''' 36 | 37 | img_paths = [] 38 | img_ids = [] 39 | img_heights = [] 40 | img_widths = [] 41 | 42 | img_count = 0 43 | file_suffix = ['jpg', 'png'] 44 | 45 | for img_file in os.scandir(image_dir): 46 | if FLAGS.img_num != 0 and img_count == FLAGS.img_num: 47 | break 48 | 49 | suffix = img_file.name[-3:].lower() 50 | 51 | if suffix in file_suffix and img_file.is_file() : 52 | 53 | img = cv2.imread(img_file.path, cv2.IMREAD_COLOR) 54 | height, width, channels = img.shape 55 | 56 | img_ids.append(img_file.name[:-4]) 57 | img_paths.append(img_file.path) 58 | img_heights.append(height) 59 | img_widths.append(width) 60 | 61 | img_count += 1 62 | print ('------------------{}-----------------'.format(img_count)) 63 | 64 | 65 | return img_paths, img_ids, img_heights, img_widths 66 | 67 | def _int64_feature(value): 68 | return tf.train.Feature(int64_list=tf.train.Int64List(value=[value])) 69 | 70 | def _bytes_feature(value): 71 | return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) 72 | 73 | def _strs_feature(value): 74 | return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) 75 | 76 | def tfrecord_writer(img_dir, output_file): 77 | ''' 78 | conver img in img_dir into tfrecord, saved as output_file 79 | :param img_dir: img directory 80 | :param output_file: tfrecord name with path to save 81 | :return: 82 | ''' 83 | # print (1) 84 | img_paths, img_ids, img_heights, img_widths = img_reader(image_dir=img_dir) 85 | # print (2) 86 | output_dir = os.path.dirname(output_file) 87 | try: 88 | os.makedirs(output_dir) 89 | except os.error: 90 | pass 91 | 92 | img_nums = len(img_paths) 93 | 94 | writer = tf.python_io.TFRecordWriter(output_file) 95 | print('start writing tfrecord....') 96 | 97 | for i in range(img_nums): 98 | img_path = img_paths[i] 99 | img_id = bytes(img_ids[i], encoding='utf-8') 100 | img_height = img_heights[i] 101 | img_width = img_widths[i] 102 | 103 | with tf.gfile.FastGFile(img_path, 'rb') as f: 104 | img = f.read() 105 | 106 | example = tf.train.Example(features=tf.train.Features( 107 | feature={ 108 | 'image': _bytes_feature(img), 109 | 'id': _strs_feature(img_id), 110 | 'height': _int64_feature(img_height), 111 | 'width': _int64_feature(img_width) 112 | })) 113 | writer.write(example.SerializeToString()) 114 | 115 | if (i + 1) % 1000 == 0: 116 | print('processing....{}/{}'.format(i+1, img_nums)) 117 | print ('tfrecord write done.') 118 | writer.close() 119 | 120 | def main(argv): 121 | tfrecord_writer(FLAGS.image_dir, FLAGS.tfrecord_file) 122 | 123 | if __name__ == '__main__': 124 | tf.app.run() -------------------------------------------------------------------------------- /keypoint_subnet/src/get_heatmap.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: get_heatmap.py 7 | @time: 18-8-28 下午4:50 8 | ''' 9 | 10 | import numpy as np 11 | import math, cv2 12 | from skimage.filters import gaussian 13 | 14 | def get_heatmap(label_dict, img_ids, img_heights, img_widths, img_resize, num_keypoints, sigma = 6.0): 15 | batch = img_ids.shape[0] 16 | heatmaps = np.zeros([batch, img_resize//4, img_resize//4, num_keypoints], np.float32) 17 | 18 | for b in range(batch): 19 | height = img_heights[b] 20 | width = img_widths[b] 21 | keypoints = label_dict[img_ids[b].decode('utf-8')] 22 | 23 | single_heatmap = get_single_heatmap(keypoints, height, width, num_keypoints, sigma) 24 | single_heatmap = 
cv2.resize(single_heatmap, (img_resize//4, img_resize//4)) 25 | 26 | heatmaps[b,:,:,:] = single_heatmap 27 | 28 | return heatmaps 29 | 30 | def get_single_heatmap(keypoints, height, width, channels, sigma = 6.0): 31 | heatmap = np.zeros([channels, height, width], np.float32) 32 | keypoints = list(keypoints) 33 | keypoints = np.asarray(keypoints) 34 | keypoints = np.reshape(keypoints, (len(keypoints)//channels//3, channels*3)) 35 | 36 | for people in keypoints: 37 | for i in range (channels): 38 | keypoint_x = people[i*3] 39 | keypoint_y = people[i*3+1] 40 | keypoint_v = people[i*3+2] 41 | 42 | if keypoint_x == 0 and keypoint_y == 0: 43 | continue 44 | if keypoint_v == 3: 45 | continue 46 | 47 | heatmap = put_keypoint_on_heatmap(keypoint_x, keypoint_y, i, heatmap, sigma) 48 | # heatmap[i, keypoint_y, keypoint_x] = 1 49 | 50 | # heatmap = gaussian(heatmap.transpose((1, 2, 0)), sigma=sigma, mode='constant', multichannel=True) 51 | return heatmap.transpose((1, 2, 0)) 52 | 53 | def put_keypoint_on_heatmap(center_x, center_y, channel, heatmap, sigma = 6.0): 54 | th = 1.6052 55 | delta = math.sqrt(th * 2) 56 | 57 | height = heatmap.shape[1] 58 | width = heatmap.shape[2] 59 | 60 | x0 = int(max(0, center_x - delta * sigma)) 61 | y0 = int(max(0, center_y - delta * sigma)) 62 | 63 | x1 = int(min(width, center_x + delta * sigma)) 64 | y1 = int(min(height, center_y + delta * sigma)) 65 | 66 | for y in range(y0, y1): 67 | for x in range(x0, x1): 68 | d = (x - center_x) ** 2 + (y - center_y) ** 2 69 | exp = d / 2.0 / sigma / sigma 70 | 71 | if exp > th: 72 | continue 73 | heatmap[channel][y][x] = max(heatmap[channel][y][x], math.exp(-exp)) 74 | heatmap[channel][y][x] = min(heatmap[channel][y][x], 1.0) 75 | 76 | return heatmap -------------------------------------------------------------------------------- /keypoint_subnet/src/img_pre_processing.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: image_preprocessing.py 7 | @time: 18-8-30 上午11:29 8 | ''' 9 | 10 | import numpy as np 11 | import os, cv2, random 12 | import matplotlib.pyplot as plt 13 | 14 | import sys 15 | sys.path.append('../') 16 | from src.get_heatmap import get_single_heatmap 17 | 18 | 19 | def img_pre_processing(imgs, heatmaps): 20 | ''' 21 | 22 | :param imgs: image batch, shape = [b, h, w, c] 23 | :param heatmaps: heatmap batch, shape = [b, h, w, c] 24 | :return: depend on rd, rotate 45 degree or not, vertically flip or not 25 | return processing imgs and heatmaps with ori shape 26 | ''' 27 | 28 | batch = imgs.shape[0] 29 | for i in range(batch): 30 | current_img = imgs[i, :, :, :] 31 | current_heatmap = heatmaps[i, :, :, :] 32 | 33 | rd = random.randint(1, 10) 34 | if rd < 4: 35 | current_img, current_heatmap = image_rotation(current_img, current_heatmap, 40) 36 | 37 | elif rd > 7: 38 | current_img, current_heatmap = image_rotation(current_img, current_heatmap, -40) 39 | 40 | rd = random.randint(1, 10) 41 | if rd < 4: 42 | current_img, current_heatmap = image_vertical_flipping(current_img, current_heatmap) 43 | 44 | imgs[i,:,:,:] = current_img 45 | heatmaps[i,:, :, :] = current_heatmap 46 | 47 | return imgs, heatmaps 48 | 49 | def image_rotation(img, heatmap, degree=40): 50 | img_ori_shape = img.shape # [h, w, c] 51 | heat_ori_shape = heatmap.shape # [ h, w, c] 52 | 53 | img = rotated_bound(img, degree) 54 | img = cv2.resize(img, (img_ori_shape[1], img_ori_shape[0])) 55 | 56 | for 
c in range(heat_ori_shape[2]): 57 | cur_heatmap = heatmap[:, :, c] 58 | cur_heatmap = np.expand_dims(cur_heatmap, axis=2) 59 | cur_heatmap = rotated_bound(cur_heatmap, degree) 60 | if len(cur_heatmap.shape) == 3: 61 | cur_heatmap = np.squeeze(cur_heatmap, axis=2) 62 | heatmap[:, :, c] = cv2.resize(cur_heatmap, (heat_ori_shape[1], heat_ori_shape[0])) 63 | return img, heatmap 64 | 65 | def rotated_bound(image, angle): 66 | # grab the dimensions of the image and then determine the 67 | # center 68 | (h, w) = image.shape[:2] 69 | (cX, cY) = (w // 2, h // 2) 70 | 71 | # grab the rotation matrix (applying the negative of the 72 | # angle to rotate clockwise), then grab the sine and cosine 73 | # (i.e., the rotation components of the matrix) 74 | M = cv2.getRotationMatrix2D((cX, cY), -angle, 1.0) 75 | cos = np.abs(M[0, 0]) 76 | sin = np.abs(M[0, 1]) 77 | 78 | # compute the new bounding dimensions of the image 79 | nW = int((h * sin) + (w * cos)) 80 | nH = int((h * cos) + (w * sin)) 81 | 82 | # adjust the rotation matrix to take into account translation 83 | M[0, 2] += (nW / 2) - cX 84 | M[1, 2] += (nH / 2) - cY 85 | 86 | # perform the actual rotation and return the image 87 | return cv2.warpAffine(image, M, (nW, nH)) 88 | 89 | def image_vertical_flipping(img, heatmap): 90 | ''' 91 | 要注意的是,进行flip之后,heatmap各个channel的值要改变,因为flip之后,图片上原本左关节点会变成右关节点, 92 | 右关节点同样会变成左关节点,因此需要在对heatmap进行flip之后,交换左右两个通道。 93 | coco数据集标注的顺序是: 94 | [0------16]: 95 | 0: nose 96 | 1-2: left eye, right eye 97 | 3-4: left ear, right ear 98 | 5-6: left shoulder, right shoulder 99 | 7-8: left elbow, right elbow 100 | 9-10: left wrist , right wrist 101 | 11-12:left hip, right hip 102 | 13-14:left knee, right knee 103 | 15-16:left ankle, right ankle 104 | :param img: 105 | :param heatmap: 106 | :return: 107 | ''' 108 | 109 | img = cv2.flip(img, 1) 110 | for i in range(heatmap.shape[2]): 111 | cur_heat = heatmap[:, :, i] 112 | 113 | cur_heat = np.expand_dims(cur_heat, axis=2) 114 | cur_heat = cv2.flip(cur_heat, 1) 115 | if len(cur_heat.shape) == 3: 116 | cur_heat = np.squeeze(cur_heat, axis=2) 117 | 118 | heatmap[:, :, i] = cur_heat 119 | 120 | # exchane left & right joints 121 | new_heatmap = np.zeros(heatmap.shape, dtype=heatmap.dtype) 122 | for i in range(1, 16, 2): 123 | new_heatmap[:, :, i+1] = heatmap[:, :, i] 124 | new_heatmap[:, :, i] = heatmap[:, :, i+1] 125 | new_heatmap[:, :, 0] = heatmap[:, :, 0] 126 | return img, new_heatmap 127 | 128 | def _test(): 129 | img = cv2.imread('/media/ulsee/E/datasets/coco/cocoval2017/000000281929.jpg', cv2.COLOR_BGR2RGB) 130 | img_copy = img.copy() 131 | cv2.imwrite('gt_img.jpg', img) 132 | # img = cv2.flip(img, 0) 133 | kp = [339,93,2,346,88,2,328,88,2,360,89,2,318,90,1,385,135,2,301,147,2,416,184,2, 134 | 286,204,2,407,226,2,276,244,2,358,254,2,309,259,2,352,346,2,307,349,2,348,448,2,312,449,2] 135 | heatmap = get_single_heatmap(kp, img.shape[0], img.shape[1], channels=17, sigma=4) 136 | 137 | # img, heatmap = image_rotation(img, heatmap, 40) 138 | img, heatmap = image_vertical_flipping(img, heatmap) 139 | cv2.imwrite('img_flip.jpg', img) 140 | #---------# 141 | for c in range(17): 142 | ch = heatmap[:, :, c] 143 | # print (ch) 144 | curmax = np.max(ch) 145 | index = np.where(ch == curmax) 146 | coorx = index[0][0] 147 | coory = index[1][0] 148 | cv2.circle(img, (coory, coorx), 5, (0, 0, 255), -1) 149 | cv2.putText(img, str(c), (coory, coorx), cv2.FONT_HERSHEY_COMPLEX, 1, (255, 0, 0), 1) 150 | cv2.imwrite('img_flip_with_heat.jpg', img) 151 | heatmap = np.sum(heatmap, axis=2, keepdims=True) 
* 255 152 | cv2.imwrite('heat__flip.jpg', heatmap) 153 | 154 | # heatmap_ori = heatmap 155 | # heatmap_ori = np.sum(heatmap_ori, axis=2, keepdims=True)*255 156 | # cv2.imwrite('gt_heat.jpg', heatmap_ori) 157 | # # ---------# 158 | # for c in range(17): 159 | # ch = heatmap[:, :, c] 160 | # # print (ch) 161 | # curmax = np.max(ch) 162 | # index = np.where(ch == curmax) 163 | # coorx = index[0][0] 164 | # coory = index[1][0] 165 | # cv2.circle(img_copy, (coory, coorx), 5, (0, 0, 255), -1) 166 | # cv2.putText(img_copy, str(c), (coory, coorx), cv2.FONT_HERSHEY_COMPLEX, 1, (255, 0, 0), 1) 167 | # cv2.imwrite('img_with_heat.jpg', img_copy) 168 | # #-----------# 169 | # img, heatmap = image_vertical_flipping(img, heatmap) 170 | # # img, heatmap = image_rotation(img, heatmap) 171 | # 172 | # cv2.imwrite('img_flip.jpg', img) 173 | # 174 | # #---------# 175 | # for c in range(17): 176 | # ch = heatmap[:, :, c] 177 | # # print (ch) 178 | # curmax = np.max(ch) 179 | # index = np.where(ch == curmax) 180 | # coorx = index[0][0] 181 | # coory = index[1][0] 182 | # cv2.circle(img, (coory, coorx), 5, (0, 0, 255), -1) 183 | # cv2.putText(img, str(c), (coory, coorx), cv2.FONT_HERSHEY_COMPLEX, 1, (255, 0, 0), 1) 184 | # cv2.imwrite('img_flip_with_heat.jpg', img) 185 | # #---------# 186 | # heatmap = np.sum(heatmap, axis=2, keepdims=True) * 255 187 | # 188 | # heatmap = cv2.cvtColor(heatmap, cv2.COLOR_GRAY2RGB) 189 | # cv2.imwrite('heat_flip.jpg', heatmap) 190 | 191 | 192 | 193 | 194 | if __name__ == '__main__': 195 | _test() 196 | -------------------------------------------------------------------------------- /keypoint_subnet/src/json_read.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: json_read.py 7 | @time: 18-8-28 下午3:11 8 | ''' 9 | import json 10 | 11 | def load_json(json_file): 12 | ''' 13 | load json file and return a dict, like ['id'] = [keypoints], typically used for ai_challenger format dataset 14 | :param json_file: 15 | :return: 16 | ''' 17 | f = open(json_file, encoding='utf-8') 18 | labels = json.load(f) 19 | label_dict = {} 20 | for label in labels: 21 | current_keypoints = [] 22 | for human, keypoints in label['keypoint_annotations'].items(): 23 | current_keypoints = current_keypoints + keypoints 24 | label_dict[label['image_id']] = current_keypoints 25 | return label_dict 26 | 27 | def load_coco_json(json_file): 28 | ''' 29 | 30 | :param json_file: 31 | :return: 32 | ''' 33 | f = open(json_file, encoding='utf-8') 34 | labels = json.load(f) 35 | return labels 36 | 37 | def dump_coco_data(json_file): 38 | ''' 39 | convert coco annotatinos json file, as like:[{'image_id":keypoints}] 40 | :param json_file: 41 | :return: 42 | ''' 43 | 44 | f = open(json_file, encoding='utf-8') 45 | labels = json.load(f) 46 | image_info = labels['images'] 47 | anno_info = labels['annotations'] 48 | label_dict = {} 49 | 50 | for image in image_info: 51 | image_name = image['file_name'].split('.')[0] 52 | image_id = image['id'] 53 | current_keypoints = [] 54 | for anno in anno_info: 55 | keypoints = anno['keypoints'] 56 | anno_image_id = anno['image_id'] 57 | anno_id = anno['id'] 58 | if anno_image_id == image_id: 59 | current_keypoints = current_keypoints + keypoints 60 | 61 | label_dict[image_name] = current_keypoints 62 | with open('coco_image_name_to_keypoints.json', 'w') as fw: 63 | json.dump(label_dict, fw) 64 | 65 | def 
convert_coco_instance_json(json_file): 66 | ''' 67 | convert coco annotatinos json file, as like:[{'image_id":[x1, y1, w, h, category_id] * n}] 68 | :param json_file: 69 | :return: 70 | ''' 71 | 72 | f = open(json_file, encoding='utf-8') 73 | labels = json.load(f) 74 | units = {} 75 | 76 | image_info = labels['images'] 77 | anno_info = labels['annotations'] 78 | print ('start reading json......') 79 | ll = len(image_info) 80 | count = 1 81 | for image in image_info: 82 | image_name = image['file_name'].split('.')[0] 83 | image_id = image['id'] 84 | height = image['height'] 85 | width = image['width'] 86 | current_bbox = [height, width] 87 | 88 | for anno in anno_info: 89 | bbox = anno['bbox'] 90 | anno_image_id = anno['image_id'] 91 | 92 | if anno_image_id == image_id: 93 | bbox.append(anno['category_id']) 94 | current_bbox = current_bbox + bbox 95 | units[image_name] = current_bbox 96 | 97 | if count % 1000 == 0: 98 | print ('Processing {}'.format(count/ll)) 99 | count += 1 100 | if count == 10: 101 | break 102 | 103 | is_save = True 104 | if is_save: 105 | save_json_file = 'coco-instance-imgid-bbox.json' 106 | 107 | with open(save_json_file, 'w') as fw: 108 | json.dump(units, fw) 109 | if __name__ == '__main__': 110 | convert_coco_instance_json('/media/ulsee/E/datasets/coco-annotations/instances_train2017.json') -------------------------------------------------------------------------------- /keypoint_subnet/src/model.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: keypoint_subnet.py 7 | @time: 18-9-28 上午11:23 8 | ''' 9 | 10 | from __future__ import absolute_import, division, print_function 11 | 12 | import tensorflow as tf 13 | 14 | import numpy as np 15 | 16 | import os, json 17 | from tensorflow.contrib.slim import nets 18 | from tensorflow.contrib.layers.python.layers import utils 19 | import tensorflow.contrib.slim as slim 20 | 21 | # from src.backbone import BackBone 22 | 23 | 24 | class Keypoint_Subnet(object): 25 | def __init__(self, inputs, img_size, fpn, num_classes, batch_size): 26 | self.inputs = inputs 27 | self.img_size = img_size 28 | self.feature_pyramid = fpn 29 | self.num_classes = num_classes 30 | self.batch_size = batch_size 31 | self.stddev = 0.01 32 | 33 | self.input_heats = tf.placeholder(tf.float32, [self.batch_size, self.img_size // 4, self.img_size // 4, self.num_classes]) 34 | 35 | # self.output, self.end_points = self.network() 36 | 37 | def forward(self): 38 | with tf.variable_scope('keypoint_subnet') as sc: 39 | end_points_collection = sc.original_name_scope + '_end_points' 40 | #---------------------------------build layer D--------------------------------# 41 | feature_d = {} 42 | for layer in range(2, 6, 1): 43 | cur_p = self.feature_pyramid['P' + str(layer)] 44 | d = slim.conv2d(cur_p, 45 | num_outputs=128, 46 | kernel_size=[3, 3], 47 | stride=1, 48 | weights_initializer=tf.random_normal_initializer(stddev=self.stddev), 49 | scope='build_feature_D%d_1' % layer) 50 | d = slim.conv2d(d, 51 | num_outputs=128, 52 | kernel_size=[3, 3], 53 | stride=1, 54 | weights_initializer=tf.random_normal_initializer(stddev=self.stddev), 55 | scope='build_feature_D%d_2' % layer) 56 | feature_d['D' + str(layer)] = d 57 | #--------------------------------concat part layer D---------------------------# 58 | concat_d = feature_d['D2'] 59 | up_shape = concat_d.get_shape() 60 | up_sample = 
tf.image.resize_nearest_neighbor(feature_d['D3'], [up_shape[2], up_shape[2]], 61 | name='upsamping_D3') 62 | concat_d = tf.concat([concat_d, up_sample], 3) 63 | 64 | up_sample = tf.image.resize_nearest_neighbor(feature_d['D4'], [up_shape[2], up_shape[2]], 65 | name='upsamping_D4') 66 | concat_d = tf.concat([concat_d, up_sample], 3) 67 | 68 | up_sample = tf.image.resize_nearest_neighbor(feature_d['D5'], [up_shape[2], up_shape[2]], 69 | name='upsamping_D5') 70 | concat_d = tf.concat([concat_d, up_sample], 3) 71 | #------------------------------via 3x3 conv and relu---------------------------# 72 | concat_d = slim.conv2d(concat_d, 73 | num_outputs=concat_d.get_shape()[3], 74 | kernel_size=[3, 3], 75 | activation_fn=tf.nn.relu, 76 | weights_initializer=tf.random_normal_initializer(stddev=self.stddev), 77 | scope='smoothed_concat_d_layer') 78 | 79 | #----------------------------------final output--------------------------------# 80 | output = slim.conv2d(concat_d, 81 | num_outputs=self.num_classes, 82 | kernel_size=[1, 1], 83 | activation_fn=None, 84 | weights_initializer=tf.random_normal_initializer(stddev=self.stddev), 85 | scope='output') 86 | 87 | end_points = utils.convert_collection_to_dict(end_points_collection) 88 | 89 | return output, end_points 90 | 91 | def net_loss(self): 92 | output, end_points = self.forward() 93 | out_all = [] 94 | #-------------------------------add intermediate output loss------------------------------# 95 | for index, layer in self.feature_pyramid.items(): 96 | layer = tf.image.resize_bicubic(layer, [self.feature_pyramid['P2'].get_shape()[1], self.feature_pyramid['P2'].get_shape()[1]], 97 | name='upsamling_layer_%s' % index) 98 | 99 | output_mid = slim.conv2d(layer, num_outputs=self.num_classes, 100 | kernel_size=[1, 1], 101 | activation_fn=None, 102 | weights_initializer=tf.random_normal_initializer(stddev=self.stddev), 103 | scope='mid_out_%s' % index 104 | ) 105 | 106 | out_all.append(output_mid) 107 | 108 | out_all.append(output) 109 | #---------------------------------------calculate losses----------------------------------# 110 | losses = [] 111 | for idx, pre_heat in enumerate(out_all): 112 | loss_l2 = tf.nn.l2_loss(tf.concat(pre_heat, axis=0) - self.input_heats, name='loss_%d' % idx) 113 | losses.append(loss_l2) 114 | 115 | total_loss = tf.reduce_sum(losses) / self.batch_size 116 | net_out_loss = tf.reduce_sum(loss_l2) / self.batch_size 117 | #-----------------------------------------add tf summary----------------------------------# 118 | # tf.summary.scalar('total_loss', total_loss) 119 | # tf.summary.scalar('net_loss', net_out_loss) 120 | # tf.summary.image('ori_image', self.inputs, max_outputs=2) 121 | 122 | 123 | return total_loss, net_out_loss, pre_heat 124 | 125 | # if __name__ == '__main__': 126 | # graph = tf.Graph() 127 | # with graph.as_default(): 128 | # batch_size = 1 129 | # height, width = 224, 224 130 | # inputs = tf.random_uniform((batch_size, height, width, 3), seed=1) 131 | # 132 | # backbone = BackBone(img_size = 224, batch_size=1) 133 | # fpn, _ = backbone.build_fpn_feature() 134 | # kp = Keypoint_Subnet(backbone.input_imgs, img_size=backbone.img_size, fpn=fpn, batch_size=backbone.batch_size, num_classes=14) 135 | # total_loss, net_loss, pre_heat = kp.net_loss() 136 | # init = tf.group( 137 | # tf.global_variables_initializer(), 138 | # tf.local_variables_initializer()) 139 | # 140 | # saver = tf.train.Saver() 141 | # 142 | # with tf.Session() as sess: 143 | # sess.run(init) 144 | # 145 | # writer = tf.summary.FileWriter('graph', 
tf.get_default_graph()) 146 | # writer.close() 147 | -------------------------------------------------------------------------------- /keypoint_subnet/src/reader.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: reader.py 7 | @time: 18-9-28 上午11:53 8 | ''' 9 | 10 | import tensorflow as tf 11 | import matplotlib.pyplot as plt 12 | import os, json 13 | import sys 14 | sys.path.append('../') 15 | 16 | 17 | 18 | class Keypoint_Reader: 19 | def __init__(self, tfrecord_file, img_size=56, batch_size=4, epochs = 1, capacity = 1000, num_threads=12, name=''): 20 | self.tfrecord_file = tfrecord_file 21 | self.img_size = img_size 22 | self.batch_size = batch_size 23 | self.capacity = capacity 24 | self.num_threads = num_threads 25 | self.name = name 26 | self.reader = tf.TFRecordReader() 27 | self.epochs = epochs 28 | 29 | def feed(self): 30 | with tf.name_scope(self.name): 31 | filename_queue = tf.train.string_input_producer([self.tfrecord_file], num_epochs=self.epochs) 32 | _, serialized_example = self.reader.read(filename_queue) 33 | features = tf.parse_single_example( 34 | serialized_example, 35 | features={ 36 | 'image':tf.FixedLenFeature([], tf.string), 37 | 'id': tf.FixedLenFeature([], tf.string), 38 | 'height': tf.FixedLenFeature([], tf.int64), 39 | 'width': tf.FixedLenFeature([], tf.int64) 40 | }) 41 | 42 | img = tf.image.decode_image(features['image'], channels=3) # tensor, [height, width, channels] 43 | img_id = features['id'] 44 | img_height = tf.cast(features['height'], tf.int32) 45 | img_width = tf.cast(features['width'], tf.int32) 46 | 47 | img = tf.reshape(img, shape=[img_height, img_width, 3]) 48 | img = self.image_preprocessing(img) 49 | 50 | img_batch, img_id_batch, img_height_batch, img_width_batch = tf.train.shuffle_batch( 51 | [img, img_id, img_height, img_width], 52 | batch_size=self.batch_size, 53 | num_threads=self.num_threads, 54 | capacity=self.capacity, 55 | min_after_dequeue=self.capacity // 10 56 | ) 57 | 58 | return img_batch, img_id_batch, img_height_batch, img_width_batch 59 | 60 | def image_preprocessing(self, image): 61 | 62 | img = tf.expand_dims(image, axis=0) 63 | img = tf.image.resize_nearest_neighbor(img, (self.img_size, self.img_size)) 64 | img = tf.squeeze(img, axis=0) 65 | return img 66 | 67 | def reader_test(): 68 | batch = 1 69 | epoch = 1 70 | reader = Keypoint_Reader(tfrecord_file='/media/ulsee/E/keypoint_subnet_tfrecord/coco_train2017-test.tfrecord', batch_size=batch, epochs=1) 71 | _1, _2, _3, _4 = reader.feed() 72 | # print (_2) 73 | # return 74 | # net_x = tf.reduce_sum(net_x, axis=3, keepdims=True) 75 | # label = tf.reduce_sum(label, axis=3, keepdims=True) 76 | 77 | with tf.Session() as sess: 78 | sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()]) 79 | 80 | coord = tf.train.Coordinator() 81 | threads = tf.train.start_queue_runners(coord=coord) 82 | 83 | try: 84 | step = 0 85 | while not coord.should_stop(): 86 | a,b,c,d = sess.run([_1,_2,_3,_4]) 87 | print (b.shape) 88 | step += 1 89 | except tf.errors.OutOfRangeError: 90 | print ('batch = {}, epoch = {}, total steps = {} '.format(batch, epoch, step)) 91 | finally: 92 | coord.request_stop() 93 | coord.join(threads) 94 | 95 | 96 | if __name__ == '__main__': 97 | reader_test() -------------------------------------------------------------------------------- /keypoint_subnet/train_log.md: 
-------------------------------------------------------------------------------- 1 | ## 2018-09-30: 2 | 3 | Started training the human keypoint detection network from scratch: input size 480x480, lr = 0.0001, batch size 4, Adam optimizer, dataset COCO 2017. -------------------------------------------------------------------------------- /person_detect/README.md: -------------------------------------------------------------------------------- 1 | Data format for the person detection network: 2 | 3 | The tfrecord used by the person detection network contains all the information needed for training, such as the image, the boxes and the labels. When a tfrecord is read 4 | in batches, every example in the batch is resized to the same shape, but different images contain different numbers of boxes, so the tfrecord pipeline 5 | automatically zero-pads the boxes of images that have fewer of them. In addition, because of how TensorFlow works, the concrete shape of such a zero-padded batch cannot be obtained before sess.run, 6 | so it cannot be manipulated beforehand. 7 | 8 | To handle this, we fix the number of boxes at 30 when generating the tfrecord and drop the all-zero boxes during training, so they do not take part in the training process. -------------------------------------------------------------------------------- /person_detect/__init__.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: __init__.py 7 | @time: 18-9-29 6:56 PM 8 | ''' -------------------------------------------------------------------------------- /person_detect/anchor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/murdockhou/MultiPoseNet-tensorflow/9ab52e5867d7f40233a63db8f344ca380c640164/person_detect/anchor/__init__.py -------------------------------------------------------------------------------- /person_detect/anchor/anchor_generator.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | 6 | from anchor import box_list_ops 7 | from anchor import box_list 8 | 9 | import tensorflow as tf 10 | import numpy as np 11 | 12 | class MultipleGridAnchorGenerator(): 13 | """Generate a grid of anchors for multiple CNN layers.""" 14 | 15 | def __init__(self, 16 | box_specs_list, 17 | base_anchor_sizes, 18 | clip_window=None): 19 | """Constructs a MultipleGridAnchorGenerator. 20 | 21 | To construct anchors, at multiple grid resolutions, one must provide a 22 | list of feature_map_shape_list (e.g., [(8, 8), (4, 4)]), and for each grid 23 | size, a corresponding list of (scale, aspect ratio) box specifications. 24 | 25 | For example: 26 | box_specs_list = [[(.1, 1.0), (.1, 2.0)], # for 8x8 grid 27 | [(.2, 1.0), (.3, 1.0), (.2, 2.0)]] # for 4x4 grid 28 | 29 | To support the fully convolutional setting, we pass grid sizes in at 30 | generation time, while scale and aspect ratios are fixed at construction 31 | time. 32 | 33 | Args: 34 | box_specs_list: list of list of (scale, aspect ratio) pairs with the 35 | outside list having the same number of entries as feature_map_shape_list 36 | (which is passed in at generation time). 37 | base_anchor_sizes: list of base anchor size in each layer 38 | clip_window: a tensor of shape [4] specifying a window to which all 39 | anchors should be clipped. If clip_window is None, then no clipping 40 | is performed.
41 | 42 | Raises: 43 | ValueError: if box_specs_list is not a list of list of pairs 44 | ValueError: if clip_window is not either None or a tensor of shape [4] 45 | """ 46 | if isinstance(box_specs_list, list) and all( 47 | [isinstance(list_item, list) for list_item in box_specs_list]): 48 | self._box_specs = box_specs_list 49 | else: 50 | raise ValueError('box_specs_list is expected to be a ' 51 | 'list of lists of pairs') 52 | if isinstance(base_anchor_sizes, list): 53 | self._base_anchor_sizes = base_anchor_sizes 54 | else: 55 | raise ValueError('base_anchor_list is expected to be a list of float') 56 | if clip_window is not None and clip_window.get_shape().as_list() != [4]: 57 | raise ValueError('clip_window must either be None or a shape [4] tensor') 58 | self._clip_window = clip_window 59 | self._scales = [] 60 | self._aspect_ratios = [] 61 | for box_spec in self._box_specs: 62 | if not all([isinstance(entry, tuple) and len(entry) == 2 63 | for entry in box_spec]): 64 | raise ValueError('box_specs_list is expected to be a ' 65 | 'list of lists of pairs') 66 | scales, aspect_ratios = zip(*box_spec) 67 | self._scales.append(scales) 68 | self._aspect_ratios.append(aspect_ratios) 69 | 70 | def name_scope(self): 71 | return 'MultipleGridAnchorGenerator' 72 | 73 | def num_anchors_per_location(self): 74 | """Returns the number of anchors per spatial location. 75 | 76 | Returns: 77 | a list of integers, one for each expected feature map to be passed to 78 | the Generate function. 79 | """ 80 | return [len(box_specs) for box_specs in self._box_specs] 81 | 82 | def generate(self, 83 | input_size, 84 | feature_map_shape_list, 85 | anchor_strides=None, 86 | anchor_offsets=None): 87 | """Generates a collection of bounding boxes to be used as anchors. 88 | 89 | The number of anchors generated for a single grid with shape MxM where we 90 | place k boxes over each grid center is k*M^2 and thus the total number of 91 | anchors is the sum over all grids. In our box_specs_list example 92 | (see the constructor docstring), we would place two boxes over each grid 93 | point on an 8x8 grid and three boxes over each grid point on a 4x4 grid and 94 | thus end up with 2*8^2 + 3*4^2 = 176 anchors in total. The layout of the 95 | output anchors follows the order of how the grid sizes and box_specs are 96 | specified (with box_spec index varying the fastest, followed by width 97 | index, then height index, then grid index). 98 | 99 | Args: 100 | input_size: input image size list with (width, height) 101 | feature_map_shape_list: list of pairs of conv net layer resolutions in the 102 | format [(height_0, width_0), (height_1, width_1), ...]. For example, 103 | setting feature_map_shape_list=[(8, 8), (7, 7)] asks for anchors that 104 | correspond to an 8x8 layer followed by a 7x7 layer. 105 | anchor_strides: list of pairs of strides (in y and x directions 106 | respectively). For example, setting 107 | anchor_strides=[(.25, .25), (.5, .5)] means that we want the anchors 108 | corresponding to the first layer to be strided by .25 and those in the 109 | second layer to be strided by .5 in both y and x directions. By 110 | default, if anchor_strides=None, then they are set to be the reciprocal 111 | of the corresponding grid sizes. The pairs can also be specified as 112 | dynamic tf.int or tf.float numbers, e.g. for variable shape input 113 | images. 114 | anchor_offsets: list of pairs of offsets (in y and x directions 115 | respectively). 
The offset specifies where we want the center of the 116 | (0, 0)-th anchor to lie for each layer. For example, setting 117 | anchor_offsets=[(.125, .125), (.25, .25)]) means that we want the 118 | (0, 0)-th anchor of the first layer to lie at (.125, .125) in image 119 | space and likewise that we want the (0, 0)-th anchor of the second 120 | layer to lie at (.25, .25) in image space. By default, if 121 | anchor_offsets=None, then they are set to be half of the corresponding 122 | anchor stride. The pairs can also be specified as dynamic tf.int or 123 | tf.float numbers, e.g. for variable shape input images. 124 | 125 | Returns: 126 | boxes: a BoxList holding a collection of N anchor boxes 127 | Raises: 128 | ValueError: if feature_map_shape_list, box_specs_list do not have the same 129 | length. 130 | ValueError: if feature_map_shape_list does not consist of pairs of 131 | integers 132 | """ 133 | if not (isinstance(feature_map_shape_list, list) 134 | and len(feature_map_shape_list) == len(self._box_specs)): 135 | raise ValueError('feature_map_shape_list must be a list with the same ' 136 | 'length as self._box_specs') 137 | if not all([isinstance(list_item, tuple) and len(list_item) == 2 138 | for list_item in feature_map_shape_list]): 139 | raise ValueError('feature_map_shape_list must be a list of pairs.') 140 | im_height, im_width = input_size[0], input_size[1] 141 | # anchor_strides = [(8.0, 8.0), (16.0, 16.0), (32.0, 32.0), (56.0, 56.0), (112.0, 112.0)] 142 | if not anchor_strides: 143 | anchor_strides = [(tf.to_float(im_height) / tf.to_float(pair[0]), 144 | tf.to_float(im_width) / tf.to_float(pair[1])) 145 | for pair in feature_map_shape_list] 146 | # anchor_offsets = [(4.0, 4.0), (8.0, 8.0), (16.0, 16.0), (28.0, 28.0), (56.0, 56.0)] 147 | if not anchor_offsets: 148 | anchor_offsets = [(0.5 * stride[0], 0.5 * stride[1]) 149 | for stride in anchor_strides] 150 | 151 | for arg, arg_name in zip([anchor_strides, anchor_offsets], 152 | ['anchor_strides', 'anchor_offsets']): 153 | if not (isinstance(arg, list) and len(arg) == len(self._box_specs)): 154 | raise ValueError('%s must be a list with the same length ' 155 | 'as self._box_specs' % arg_name) 156 | if not all([isinstance(list_item, tuple) and len(list_item) == 2 157 | for list_item in arg]): 158 | raise ValueError('%s must be a list of pairs.' 
% arg_name) 159 | 160 | anchor_grid_list = [] 161 | for grid_size, scales, aspect_ratios, stride, offset, base_anchor_size in zip( 162 | feature_map_shape_list, self._scales, self._aspect_ratios, 163 | anchor_strides, anchor_offsets, self._base_anchor_sizes): 164 | 165 | # print(grid_size, scales, aspect_ratios, stride, offset, base_anchor_size) 166 | 167 | anchor_grid_list.append( 168 | tile_anchors( 169 | grid_height=grid_size[0], 170 | grid_width=grid_size[1], 171 | scales=scales, 172 | aspect_ratios=aspect_ratios, 173 | base_anchor_size=base_anchor_size, 174 | anchor_stride=stride, 175 | anchor_offset=offset)) 176 | # break 177 | concatenated_anchors = box_list_ops.concatenate(anchor_grid_list) 178 | num_anchors = concatenated_anchors.num_boxes_static() 179 | # print (num_anchors) 180 | if num_anchors is None: 181 | num_anchors = concatenated_anchors.num_boxes() 182 | if self._clip_window is not None: 183 | clip_window = tf.multiply( 184 | tf.to_float([im_height, im_width, im_height, im_width]), 185 | self._clip_window) 186 | concatenated_anchors = box_list_ops.clip_to_window( 187 | concatenated_anchors, clip_window, filter_nonoverlapping=False) 188 | # TODO: make reshape an option for the clip_to_window op 189 | concatenated_anchors.set( 190 | tf.reshape(concatenated_anchors.get(), [num_anchors, 4])) 191 | 192 | stddevs_tensor = 0.01 * tf.ones( 193 | [num_anchors, 4], dtype=tf.float32, name='stddevs') 194 | concatenated_anchors.add_field('stddev', stddevs_tensor) 195 | return concatenated_anchors 196 | 197 | 198 | def tile_anchors(grid_height, 199 | grid_width, 200 | scales, 201 | aspect_ratios, 202 | base_anchor_size, 203 | anchor_stride, 204 | anchor_offset): 205 | """Create a tiled set of anchors strided along a grid in image space. 206 | 207 | This op creates a set of anchor boxes by placing a "basis" collection of 208 | boxes with user-specified scales and aspect ratios centered at evenly 209 | distributed points along a grid. The basis collection is specified via the 210 | scale and aspect_ratios arguments. For example, setting scales=[.1, .2, .2] 211 | and aspect ratios = [2,2,1/2] means that we create three boxes: one with scale 212 | .1, aspect ratio 2, one with scale .2, aspect ratio 2, and one with scale .2 213 | and aspect ratio 1/2. Each box is multiplied by "base_anchor_size" before 214 | placing it over its respective center. 215 | 216 | Grid points are specified via grid_height, grid_width parameters as well as 217 | the anchor_stride and anchor_offset parameters. 218 | 219 | Args: 220 | grid_height: size of the grid in the y direction (int or int scalar tensor) 221 | grid_width: size of the grid in the x direction (int or int scalar tensor) 222 | scales: a 1-d (float) tensor representing the scale of each box in the 223 | basis set. 224 | aspect_ratios: a 1-d (float) tensor representing the aspect ratio of each 225 | box in the basis set. The length of the scales and aspect_ratios tensors 226 | must be equal. 
227 | base_anchor_size: base anchor size in this layer as [height, width] 228 | (float tensor of shape [2]) 229 | anchor_stride: difference in centers between base anchors for adjacent grid 230 | positions (float tensor of shape [2]) 231 | anchor_offset: center of the anchor with scale and aspect ratio 1 for the 232 | upper left element of the grid, this should be zero for 233 | feature networks with only VALID padding and even receptive 234 | field size, but may need some additional calculation if other 235 | padding is used (float tensor of shape [2]) 236 | Returns: 237 | a BoxList holding a collection of N anchor boxes 238 | """ 239 | ratio_sqrts = tf.sqrt(aspect_ratios) 240 | # 根据base_anchor_size计算anchor在原图上本来的宽高 241 | heights = scales / ratio_sqrts * base_anchor_size 242 | # print ('heights == ', heights.get_shape()) 243 | widths = scales * ratio_sqrts * base_anchor_size 244 | # print ('widths == ', widths.get_shape()) 245 | # Get a grid of box centers 246 | y_centers = tf.to_float(tf.range(grid_height)) 247 | y_centers = y_centers * anchor_stride[0] + anchor_offset[0] 248 | # print ('y_centers before meshgrid === ', y_centers.get_shape()) 249 | x_centers = tf.to_float(tf.range(grid_width)) 250 | x_centers = x_centers * anchor_stride[1] + anchor_offset[1] 251 | # print('x_centers before meshgrid === ', x_centers.get_shape()) 252 | x_centers, y_centers = tf.meshgrid(x_centers, y_centers) 253 | 254 | # xcenters在和widths进行meshgrid之前,xcenters的shape是(grid_height * grid_width),只不过每一行都是0-(grid_width-1),widths长度为9,是总共要生成的 255 | # 9个anchors宽度列表,由前面计算得到。widths在和xcenters进行meshgrid之后,由于meshgrid是对维度为1的tensor进行操作,首先会把xcenters展开, 256 | # 变成一行,有(grid_height * grid_width)列,然后再进行meshgrid操作。meshgrid之后,widths_grid为 (grid_height * gird_width) × 9维矩阵,每一行都是9个anchor的宽度 257 | # xcenters_grid为(grid_height * grid_width) * 9矩阵,每一列都是grid_height个(0-grid_widht-1)数值。 258 | # 下面的heights和y_centers进行meshgrid最终得到的结果略有不同,heights_grid和widths_grid结果很一致,都是(grid_height * gird_width) × 9维矩阵,每一行都是9个anchor的高度, 259 | # 但y_centers_grid就略有变化,因为y_centers是每一列值都是 (0~grid_heigt-1),但每一行的值都是相同的,即每一行的值都是同一个值,meshgrid会将不是1维的矩阵变成一维,是按照行展开的, 260 | # 所以y_centers展开后就变成[1,1,1,1,1,..., 1,2,2,2,2,...,2,....,h,h,h,...h]这种形式,因此在和heights进行meshgrid之后,y_centers_grid每一列都变成了前面说的那个列表内的值 261 | 262 | widths_grid, x_centers_grid = tf.meshgrid(widths, x_centers) 263 | heights_grid, y_centers_grid = tf.meshgrid(heights, y_centers) 264 | 265 | # 在对y_centers_grid 和 x_centers_grid 进行axis=2的stack,x_centers_gird 和 y_centers_grid 维度均为 (grid_height*grid_width) * 9 维度,只不过数值不一样,按照 266 | # axis=2 进行stack,其实就是把x_centers_grid 和 y_centers_grid 里的值一一对应起来,最后变成 (grid_height * grid_width) * 9 * 2的三维矩阵,其实就是所有anchor对应的 267 | # 中心点在图像上的坐标,类似于[[[1,1]*9, [1,2]*9, ..., [7,7]*9]]这种形式,其实就是把图片上每个点的坐标拿出来,并重复9次,当做这个点生成的9个anchor的centers 268 | bbox_centers = tf.stack([y_centers_grid, x_centers_grid], axis=2) 269 | 270 | # 同理,对heights_grid 和 widths_grid 进行 axis=2 的stack, 也是得到一个(grid_height * grid_width) * 9 * 2的三维矩阵,只不过这个矩阵保存的是anchor的size,和前面的bbox_centers 271 | # 的值是一一对应的,即一个存了center的(x,y)坐标,一个存了bbox的宽高 272 | bbox_sizes = tf.stack([heights_grid, widths_grid], axis=2) 273 | 274 | # 接着对这两个矩阵进行reshape成 n*2 的二维矩阵,n是所有anchor的数量,为 (grid_heigt * grid_width * 9),bbox_centers每一行保存的是anchor的中心点坐标 275 | # bbox_sizes 保存的是anchor的对应的宽高 276 | bbox_centers = tf.reshape(bbox_centers, [-1, 2]) 277 | bbox_sizes = tf.reshape(bbox_sizes, [-1, 2]) 278 | # convert [ycenter, xcenter, height, width] to [ymin, xmin, ymax, xmax] 279 | bbox_corners = _center_size_bbox_to_corners_bbox(bbox_centers, bbox_sizes) 280 | 
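  # Worked example (added comment; the numbers are assumed for illustration, not
  # taken from the original code): with grid_height = grid_width = 2,
  # anchor_stride = (8., 8.), anchor_offset = (4., 4.) and k = 9 (scale, aspect_ratio)
  # pairs, y_centers and x_centers are both [4., 12.], the meshgrid/stack/reshape
  # steps give bbox_centers and bbox_sizes of shape [2*2*9, 2] = [36, 2], and
  # bbox_corners above ends up with shape [36, 4] in [ymin, xmin, ymax, xmax] order.
  # For the anchor centered at (4., 4.) with scale = 1.0, aspect_ratio = 1.0 and
  # base_anchor_size = 32.0, height = width = 32, so its corners are
  # [-12., -12., 20., 20.]; such out-of-image anchors are only clipped later, in
  # generate(), and only when clip_window is set.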
281 | # 需要注意的是,这个生成的anchor就是相对于原图上的位置在哪,并且通过上一行的函数,把box的表示方式变成了[ymin, xmin, ymax, xmax],最终的shape为(n, 4) 282 | # base_anchor_size 变得越来越大的原因是,随着featuremap维度不断增高,其上面的每一个点所能表示的原图的范围,也即是感受野也在不断增大 283 | return box_list.BoxList(bbox_corners) 284 | 285 | 286 | def _center_size_bbox_to_corners_bbox(centers, sizes): 287 | """Converts bbox center-size representation to corners representation. 288 | 289 | Args: 290 | centers: a tensor with shape [N, 2] representing bounding box centers 291 | sizes: a tensor with shape [N, 2] representing bounding boxes 292 | 293 | Returns: 294 | corners: tensor with shape [N, 4] representing bounding boxes in corners 295 | representation 296 | """ 297 | return tf.concat([centers - .5 * sizes, centers + .5 * sizes], 1) 298 | 299 | 300 | def create_retinanet_anchors( 301 | num_layers=5, 302 | scales=(1.0, pow(2, 1./3), pow(2, 2./3)), 303 | aspect_ratios=(0.5, 1.0, 2.0), 304 | base_anchor_sizes=(32.0, 64.0, 128.0, 256.0, 512.0) 305 | ): 306 | """Create a set of anchors walking along a grid in a collection of feature maps in RetinaNet. 307 | 308 | This op creates a set of anchor boxes by placing a basis collection of 309 | boxes with user-specified scales and aspect ratios centered at evenly 310 | distributed points along a grid. The basis Each box is multiplied by 311 | base_anchor_size before placing it over its respective center. 312 | 313 | Args: 314 | num_layers: The number of grid layers to create anchors 315 | scales: A list of scales 316 | aspect_ratios: A list of aspect ratios 317 | base_anchor_sizes: List of base anchor sizes in each layer 318 | Returns: 319 | A MultipleGridAnchorGenerator 320 | """ 321 | base_anchor_sizes = list(base_anchor_sizes) 322 | box_spec_list = [] 323 | for idx in range(num_layers): 324 | layer_spec_list = [] 325 | for scale in scales: 326 | for aspect_ratio in aspect_ratios: 327 | layer_spec_list.append((scale, aspect_ratio)) 328 | box_spec_list.append(layer_spec_list) 329 | 330 | # for val in box_spec_list: 331 | # print (val) 332 | # print (base_anchor_sizes) 333 | 334 | # box_spec_list = [[(1.0, 0.5), (1.0, 1.0), (1.0, 2.0), 335 | # (1.2599210498948732, 0.5), (1.2599210498948732, 1.0), (1.2599210498948732, 2.0), 336 | # (1.5874010519681994, 0.5), (1.5874010519681994, 1.0), (1.5874010519681994, 2.0)]] 337 | # base_anchor_sizes = [256.0] 338 | return MultipleGridAnchorGenerator(box_spec_list, base_anchor_sizes) 339 | 340 | 341 | def anchor_assign(anchors, gt_boxes, gt_labels, is_training=True): 342 | """ 343 | Assign generated anchors to boxes and labels 344 | Args: 345 | anchors: BoxList holding a collection of N anchors 346 | gt_boxes: BoxList holding a collection of groundtruth 2D box coordinates tensor/list [#object, 4] 347 | ([ymin, xmin, ymax, xmax], float type) of objects in given input image. 348 | gt_labels: Groundtruth 1D tensor/list [#object] (scalar int) of objects in given image. 
349 | is_training: is training or not 350 | 351 | returns: 352 | BoxList with anchor location and class fields 353 | """ 354 | pos_iou_thred = 0.5 355 | neg_iou_thred = 0.5 356 | if is_training: 357 | neg_iou_thred = 0.4 358 | if gt_boxes.get().get_shape()[0] != gt_labels.get_shape()[0]: 359 | raise ValueError('Boxs and labels number must be equal.') 360 | # box_iou: 总共有#anchors行,#gt_boxes列 (#anchors, #gtboxes),每一行表示当前anchor对于gt_boxes的iou值 361 | box_iou = box_list_ops.iou(anchors, gt_boxes) 362 | 363 | # anchor_max_iou: 返回每一个anchor相对于gt_boxes中最大的iou值, 364 | # 是一个tensor,维度为[#anchors,], 每一个值为这个anchor和所有gtbox最大的iou值, 365 | # 和下面的anchor_max_iou_indices相对应 366 | anchor_max_iou = tf.reduce_max(box_iou, axis=1) 367 | 368 | # box_iou是一个二维矩阵,每一行代表一个anchor相对于gtbox的iou值, 369 | # 对其进行axis=1的tf.argmax,就是找到这个anchor和哪个gtbox iou值最大,并返回其下标 370 | anchor_max_iou_indices = tf.argmax(box_iou, axis=1) 371 | 372 | # 根据前面的anchor_max_iou_indices,将gt_boxes里对于每一个anchor是最大iou的那个gt_box取出来, 373 | # 组成一个新的矩阵,维度为[#anchors, 4] 374 | anchor_gt_box = tf.gather(gt_boxes.get(), anchor_max_iou_indices) 375 | 376 | # 类似于anchor_gt_box, 将前面anchor对应的最大iou的gt_box的label取出来,组成新的矩阵,维度为[#anchors,] 377 | anchor_gt_cls = tf.gather(gt_labels, anchor_max_iou_indices) #[#saved_anchor_num], 1D 378 | # print ('anchor_gt_cls === ', anchor_gt_cls) 379 | 380 | # get remaining index with iou between 0.4 to 0.5 381 | # 对于每一个anchor,因为其都有一个相对于gtbox的最大iou值,判断其是否是正样本,如果当前anchor的max_iou值大于pos_iou_thred, 382 | # 将其class设为其原本对应的label,否则设置为-1,为下一步操作做准备 383 | anchor_gt_cls = tf.where(tf.greater(anchor_max_iou, pos_iou_thred), anchor_gt_cls, 0-tf.ones_like(anchor_gt_cls)) 384 | 385 | # 和上面的函数同理,如果anchor的max_iou小于neg_iou_thred,就将其class设置为0,否则就是原本的class 386 | # 因为已经考虑过其是否大于pos_iou_thred,所以执行完这个函数之后,最后得到的结果就是: 387 | # iou > 0.5的anchor认为是正样本,iou<0.4认为是负样本, iou在0.4和0.5之间设为-1,忽略掉 388 | anchor_gt_cls = tf.where(tf.less(anchor_max_iou, neg_iou_thred), tf.zeros_like(anchor_gt_cls), anchor_gt_cls) 389 | 390 | anchors.add_field('gt_boxes', anchor_gt_box) 391 | anchors.add_field('gt_labels', anchor_gt_cls) #[#saved_anchor_num], 1D 392 | return anchors 393 | 394 | def anchor_test(): 395 | input_size = [224,224] 396 | # feature_map = [(28, 28), (14, 14), (7, 7), (4, 4), (2, 2)] 397 | feature_maps = [(tf.ceil(input_size[0]/pow(2., i+3)), tf.ceil(input_size[1]/pow(2., i+3))) for i in range(5)] 398 | 399 | feature_map_list = [(tf.ceil(tf.multiply(tf.to_float(input_size[0]), 1 / pow(2., i + 3))), 400 | tf.ceil(tf.multiply(tf.to_float(input_size[1]), 1 / pow(2., i + 3)))) 401 | for i in range(5)] 402 | 403 | # feature_map_list = [(3,3)] 404 | anchor_generator = create_retinanet_anchors() 405 | # print ('scales = ', anchor_generator._scales) 406 | # print ('aspect ratio = ', anchor_generator._aspect_ratios) 407 | 408 | anchors = anchor_generator.generate(input_size, feature_map_list) 409 | anchors_before_assign = anchors.get() 410 | # return 411 | gt_boxes = box_list.BoxList(tf.convert_to_tensor([[0, 0, 210, 210], [200,203,205,206], [1,1,220,220]], dtype=tf.float32)) 412 | gt_labels = tf.convert_to_tensor([1, 1, 1]) 413 | anchors, box_iou = anchor_assign(anchors, gt_boxes, gt_labels) 414 | # x = tf.convert_to_tensor([[[1,2,3],[3,4,5],[5,6,7]],[[1,2,3],[3,4,5],[5,6,7]]]) 415 | result = anchors.get_field("gt_boxes") 416 | labels = anchors.get_field('gt_labels') 417 | print (labels.get_shape()) 418 | with tf.Session() as sess: 419 | print (sess.run(result).shape) 420 | print (sess.run(labels).shape) 421 | print (sess.run(box_iou)) 422 | # 
print(sess.run(tf.squeeze(tf.where(tf.greater(gt_labels, 1))))) 423 | # print(sess.run(tf.gather(x, tf.convert_to_tensor([0,1]), axis=1))) 424 | sess.close() 425 | 426 | if __name__ == "__main__": 427 | anchor_test() -------------------------------------------------------------------------------- /person_detect/anchor/box_coder.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """box coder following Faster RCNN procedure. 17 | 18 | Faster RCNN box coder follows the coding schema described below: 19 | ty = (y - ya) / ha 20 | tx = (x - xa) / wa 21 | th = log(h / ha) 22 | tw = log(w / wa) 23 | where x, y, w, h denote the box's center coordinates, width and height 24 | respectively. Similarly, xa, ya, wa, ha denote the anchor's center 25 | coordinates, width and height. tx, ty, tw and th denote the anchor-encoded 26 | center, width and height respectively. 27 | 28 | See http://arxiv.org/abs/1506.01497 for details. 29 | """ 30 | 31 | import tensorflow as tf 32 | 33 | from anchor import box_list 34 | 35 | EPSILON = 1e-8 36 | 37 | def get_center_coordinates_and_sizes(box_corners, scope=None): 38 | """Computes the center coordinates, height and width of the boxes. 39 | 40 | Args: 41 | box_corners: Tensor of N boxes 42 | scope: name scope of the function. 43 | 44 | Returns: 45 | a list of 4 1-D tensors [ycenter, xcenter, height, width]. 46 | """ 47 | with tf.name_scope(scope, 'get_center_coordinates_and_sizes'): 48 | ymin, xmin, ymax, xmax = tf.unstack(tf.transpose(box_corners)) 49 | width = xmax - xmin 50 | height = ymax - ymin 51 | ycenter = ymin + height / 2. 52 | xcenter = xmin + width / 2. 53 | return [ycenter, xcenter, height, width] 54 | 55 | 56 | class FasterRCNNBoxCoder(): 57 | """Faster RCNN box coder.""" 58 | 59 | def __init__(self, scale_factors=None): 60 | """Constructor for FasterRcnnBoxCoder. 61 | 62 | Args: 63 | scale_factors: List of 4 positive scalars to scale ty, tx, th and tw. 64 | If set to None, does not perform scaling. For Faster RCNN, 65 | the open-source implementation recommends using [10.0, 10.0, 5.0, 5.0]. 66 | """ 67 | if scale_factors: 68 | assert len(scale_factors) == 4 69 | for scalar in scale_factors: 70 | assert scalar > 0 71 | self._scale_factors = scale_factors 72 | 73 | @property 74 | def code_size(self): 75 | return 4 76 | 77 | def encode(self, boxes, anchors): 78 | """Encode a box collection with respect to anchor collection. 79 | 80 | Args: 81 | boxes: Tensor holding N boxes to be encoded. 82 | anchors: Tensor of corresponding N anchors. 83 | 84 | Returns: 85 | a tensor representing N anchor-encoded boxes of the format 86 | [ty, tx, th, tw]. 87 | """ 88 | # Convert anchors to the center coordinate representation. 
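    # Numeric sketch (added comment; the values are assumed for illustration): for
    # an anchor [0., 0., 100., 100.] and a matched box [10., 20., 90., 80.] (both in
    # [ymin, xmin, ymax, xmax]), the two centers coincide at (50., 50.), h/ha = 0.8
    # and w/wa = 0.6, so [ty, tx, th, tw] = [0., 0., log(0.8), log(0.6)]
    # ~= [0., 0., -0.223, -0.511] before scaling, or [0., 0., -1.116, -2.554] with
    # the commonly used scale_factors [10.0, 10.0, 5.0, 5.0]; decode() inverts this.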
89 | ycenter_a, xcenter_a, ha, wa = get_center_coordinates_and_sizes(anchors) 90 | ycenter, xcenter, h, w = get_center_coordinates_and_sizes(boxes) 91 | # Avoid NaN in division and log below. 92 | ha += EPSILON 93 | wa += EPSILON 94 | h += EPSILON 95 | w += EPSILON 96 | 97 | tx = (xcenter - xcenter_a) / wa 98 | ty = (ycenter - ycenter_a) / ha 99 | tw = tf.log(w / wa) 100 | th = tf.log(h / ha) 101 | # Scales location targets as used in paper for joint training. 102 | if self._scale_factors: 103 | ty *= self._scale_factors[0] 104 | tx *= self._scale_factors[1] 105 | th *= self._scale_factors[2] 106 | tw *= self._scale_factors[3] 107 | return tf.transpose(tf.stack([ty, tx, th, tw])) 108 | 109 | def decode(self, rel_codes, anchors): 110 | """Decode relative codes to boxes. 111 | 112 | Args: 113 | rel_codes: a tensor representing N anchor-encoded boxes. 114 | anchors: BoxList of anchors. 115 | 116 | Returns: 117 | boxes: BoxList holding N bounding boxes. 118 | """ 119 | ycenter_a, xcenter_a, ha, wa = get_center_coordinates_and_sizes(anchors) 120 | 121 | ty, tx, th, tw = tf.unstack(tf.transpose(rel_codes)) 122 | if self._scale_factors: 123 | ty /= self._scale_factors[0] 124 | tx /= self._scale_factors[1] 125 | th /= self._scale_factors[2] 126 | tw /= self._scale_factors[3] 127 | w = tf.exp(tw) * wa 128 | h = tf.exp(th) * ha 129 | ycenter = ty * ha + ycenter_a 130 | xcenter = tx * wa + xcenter_a 131 | ymin = ycenter - h / 2. 132 | xmin = xcenter - w / 2. 133 | ymax = ycenter + h / 2. 134 | xmax = xcenter + w / 2. 135 | return box_list.BoxList(tf.transpose(tf.stack([ymin, xmin, ymax, xmax]))) 136 | -------------------------------------------------------------------------------- /person_detect/anchor/box_list.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | 16 | """Bounding Box List definition. 17 | 18 | BoxList represents a list of bounding boxes as tensorflow 19 | tensors, where each bounding box is represented as a row of 4 numbers, 20 | [y_min, x_min, y_max, x_max]. It is assumed that all bounding boxes 21 | within a given list correspond to a single image. See also 22 | box_list_ops.py for common box related operations (such as area, iou, etc). 23 | 24 | Optionally, users can add additional related fields (such as weights). 25 | We assume the following things to be true about fields: 26 | * they correspond to boxes in the box_list along the 0th dimension 27 | * they have inferrable rank at graph construction time 28 | * all dimensions except for possibly the 0th can be inferred 29 | (i.e., not None) at graph construction time. 
30 | 31 | Some other notes: 32 | * Following tensorflow conventions, we use height, width ordering, 33 | and correspondingly, y,x (or ymin, xmin, ymax, xmax) ordering 34 | * Tensors are always provided as (flat) [N, 4] tensors. 35 | """ 36 | 37 | import tensorflow as tf 38 | 39 | 40 | class BoxList(object): 41 | """Box collection.""" 42 | 43 | def __init__(self, boxes): 44 | """Constructs box collection. 45 | 46 | Args: 47 | boxes: a tensor of shape [N, 4] representing box corners 48 | 49 | Raises: 50 | ValueError: if invalid dimensions for bbox data or if bbox data is not in 51 | float32 format. 52 | """ 53 | if len(boxes.get_shape()) != 2 or boxes.get_shape()[-1] != 4: 54 | raise ValueError('Invalid dimensions for box data.') 55 | if boxes.dtype != tf.float32: 56 | raise ValueError('Invalid tensor type: should be tf.float32') 57 | self.data = {'boxes': boxes} 58 | 59 | def num_boxes(self): 60 | """Returns number of boxes held in collection. 61 | 62 | Returns: 63 | a tensor representing the number of boxes held in the collection. 64 | """ 65 | return tf.shape(self.data['boxes'])[0] 66 | 67 | def num_boxes_static(self): 68 | """Returns number of boxes held in collection. 69 | 70 | This number is inferred at graph construction time rather than run-time. 71 | 72 | Returns: 73 | Number of boxes held in collection (integer) or None if this is not 74 | inferrable at graph construction time. 75 | """ 76 | return self.data['boxes'].get_shape()[0].value 77 | 78 | def get_all_fields(self): 79 | """Returns all fields.""" 80 | return self.data.keys() 81 | 82 | def get_extra_fields(self): 83 | """Returns all non-box fields (i.e., everything not named 'boxes').""" 84 | return [k for k in self.data.keys() if k != 'boxes'] 85 | 86 | def add_field(self, field, field_data): 87 | """Add field to box list. 88 | 89 | This method can be used to add related box data such as 90 | weights/labels, etc. 91 | 92 | Args: 93 | field: a string key to access the data via `get` 94 | field_data: a tensor containing the data to store in the BoxList 95 | """ 96 | self.data[field] = field_data 97 | 98 | def has_field(self, field): 99 | return field in self.data 100 | 101 | def get(self): 102 | """Convenience function for accessing box coordinates. 103 | 104 | Returns: 105 | a tensor with shape [N, 4] representing box coordinates following order [ymin, xmin, ymax, xmax] 106 | """ 107 | return self.get_field('boxes') 108 | 109 | def set(self, boxes): 110 | """Convenience function for setting box coordinates. 111 | 112 | Args: 113 | boxes: a tensor of shape [N, 4] representing box corners 114 | 115 | Raises: 116 | ValueError: if invalid dimensions for bbox data 117 | """ 118 | if len(boxes.get_shape()) != 2 or boxes.get_shape()[-1] != 4: 119 | raise ValueError('Invalid dimensions for box data.') 120 | self.data['boxes'] = boxes 121 | 122 | def get_field(self, field): 123 | """Accesses a box collection and associated fields. 124 | 125 | This function returns specified field with object; if no field is specified, 126 | it returns the box coordinates. 127 | 128 | Args: 129 | field: this optional string parameter can be used to specify 130 | a related field to be accessed. 131 | 132 | Returns: 133 | a tensor representing the box collection or an associated field. 
134 | 135 | Raises: 136 | ValueError: if invalid field 137 | """ 138 | if not self.has_field(field): 139 | raise ValueError('field ' + str(field) + ' does not exist') 140 | return self.data[field] 141 | 142 | def set_field(self, field, value): 143 | """Sets the value of a field. 144 | 145 | Updates the field of a box_list with a given value. 146 | 147 | Args: 148 | field: (string) name of the field to set value. 149 | value: the value to assign to the field. 150 | 151 | Raises: 152 | ValueError: if the box_list does not have specified field. 153 | """ 154 | if not self.has_field(field): 155 | raise ValueError('field %s does not exist' % field) 156 | self.data[field] = value 157 | 158 | def get_center_coordinates_and_sizes(self, scope=None): 159 | """Computes the center coordinates, height and width of the boxes. 160 | 161 | Args: 162 | scope: name scope of the function. 163 | 164 | Returns: 165 | a list of 4 1-D tensors [ycenter, xcenter, height, width]. 166 | """ 167 | with tf.name_scope(scope, 'get_center_coordinates_and_sizes'): 168 | box_corners = self.get() 169 | ymin, xmin, ymax, xmax = tf.unstack(tf.transpose(box_corners)) 170 | width = xmax - xmin 171 | height = ymax - ymin 172 | ycenter = ymin + height / 2. 173 | xcenter = xmin + width / 2. 174 | return [ycenter, xcenter, height, width] 175 | 176 | def transpose_coordinates(self, scope=None): 177 | """Transpose the coordinate representation in a boxlist. 178 | 179 | Args: 180 | scope: name scope of the function. 181 | """ 182 | with tf.name_scope(scope, 'transpose_coordinates'): 183 | y_min, x_min, y_max, x_max = tf.split( 184 | value=self.get(), num_or_size_splits=4, axis=1) 185 | self.set(tf.concat([x_min, y_min, x_max, y_max], 1)) 186 | 187 | def as_tensor_dict(self, fields=None): 188 | """Retrieves specified fields as a dictionary of tensors. 189 | 190 | Args: 191 | fields: (optional) list of fields to return in the dictionary. 192 | If None (default), all fields are returned. 193 | 194 | Returns: 195 | tensor_dict: A dictionary of tensors specified by fields. 196 | 197 | Raises: 198 | ValueError: if specified field is not contained in boxlist. 199 | """ 200 | tensor_dict = {} 201 | if fields is None: 202 | fields = self.get_all_fields() 203 | for field in fields: 204 | if not self.has_field(field): 205 | raise ValueError('boxlist must contain all specified fields') 206 | tensor_dict[field] = self.get_field(field) 207 | return tensor_dict 208 | -------------------------------------------------------------------------------- /person_detect/anchor/shape_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # ============================================================================== 15 | 16 | """Utils used to manipulate tensor shapes.""" 17 | 18 | import tensorflow as tf 19 | 20 | 21 | def _is_tensor(t): 22 | """Returns a boolean indicating whether the input is a tensor. 23 | 24 | Args: 25 | t: the input to be tested. 26 | 27 | Returns: 28 | a boolean that indicates whether t is a tensor. 29 | """ 30 | return isinstance(t, (tf.Tensor, tf.SparseTensor, tf.Variable)) 31 | 32 | 33 | def _set_dim_0(t, d0): 34 | """Sets the 0-th dimension of the input tensor. 35 | 36 | Args: 37 | t: the input tensor, assuming the rank is at least 1. 38 | d0: an integer indicating the 0-th dimension of the input tensor. 39 | 40 | Returns: 41 | the tensor t with the 0-th dimension set. 42 | """ 43 | t_shape = t.get_shape().as_list() 44 | t_shape[0] = d0 45 | t.set_shape(t_shape) 46 | return t 47 | 48 | 49 | def pad_tensor(t, length): 50 | """Pads the input tensor with 0s along the first dimension up to the length. 51 | 52 | Args: 53 | t: the input tensor, assuming the rank is at least 1. 54 | length: a tensor of shape [1] or an integer, indicating the first dimension 55 | of the input tensor t after padding, assuming length <= t.shape[0]. 56 | 57 | Returns: 58 | padded_t: the padded tensor, whose first dimension is length. If the length 59 | is an integer, the first dimension of padded_t is set to length 60 | statically. 61 | """ 62 | t_rank = tf.rank(t) 63 | t_shape = tf.shape(t) 64 | t_d0 = t_shape[0] 65 | pad_d0 = tf.expand_dims(length - t_d0, 0) 66 | pad_shape = tf.cond( 67 | tf.greater(t_rank, 1), lambda: tf.concat([pad_d0, t_shape[1:]], 0), 68 | lambda: tf.expand_dims(length - t_d0, 0)) 69 | padded_t = tf.concat([t, tf.zeros(pad_shape, dtype=t.dtype)], 0) 70 | if not _is_tensor(length): 71 | padded_t = _set_dim_0(padded_t, length) 72 | return padded_t 73 | 74 | 75 | def clip_tensor(t, length): 76 | """Clips the input tensor along the first dimension up to the length. 77 | 78 | Args: 79 | t: the input tensor, assuming the rank is at least 1. 80 | length: a tensor of shape [1] or an integer, indicating the first dimension 81 | of the input tensor t after clipping, assuming length <= t.shape[0]. 82 | 83 | Returns: 84 | clipped_t: the clipped tensor, whose first dimension is length. If the 85 | length is an integer, the first dimension of clipped_t is set to length 86 | statically. 87 | """ 88 | clipped_t = tf.gather(t, tf.range(length)) 89 | if not _is_tensor(length): 90 | clipped_t = _set_dim_0(clipped_t, length) 91 | return clipped_t 92 | 93 | 94 | def pad_or_clip_tensor(t, length): 95 | """Pad or clip the input tensor along the first dimension. 96 | 97 | Args: 98 | t: the input tensor, assuming the rank is at least 1. 99 | length: a tensor of shape [1] or an integer, indicating the first dimension 100 | of the input tensor t after processing. 101 | 102 | Returns: 103 | processed_t: the processed tensor, whose first dimension is length. If the 104 | length is an integer, the first dimension of the processed tensor is set 105 | to length statically. 106 | """ 107 | processed_t = tf.cond( 108 | tf.greater(tf.shape(t)[0], length), 109 | lambda: clip_tensor(t, length), 110 | lambda: pad_tensor(t, length)) 111 | if not _is_tensor(length): 112 | processed_t = _set_dim_0(processed_t, length) 113 | return processed_t 114 | 115 | 116 | def combined_static_and_dynamic_shape(tensor): 117 | """Returns a list containing static and dynamic values for the dimensions. 
118 | 119 | Returns a list of static and dynamic values for shape dimensions. This is 120 | useful to preserve static shapes when available in reshape operation. 121 | 122 | Args: 123 | tensor: A tensor of any type. 124 | 125 | Returns: 126 | A list of size tensor.shape.ndims containing integers or a scalar tensor. 127 | """ 128 | static_shape = tensor.shape.as_list() 129 | dynamic_shape = tf.shape(tensor) 130 | combined_shape = [] 131 | for index, dim in enumerate(static_shape): 132 | if dim is not None: 133 | combined_shape.append(dim) 134 | else: 135 | combined_shape.append(dynamic_shape[index]) 136 | return combined_shape 137 | -------------------------------------------------------------------------------- /person_detect/person_detect_test.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: person_detect_test.py 7 | @time: 18-10-9 下午5:14 8 | ''' 9 | 10 | # encoding: utf-8 11 | ''' 12 | @author: shiwei hou 13 | @contact: murdockhou@gmail.com 14 | @software: PyCharm 15 | @file: test_model.py 16 | @time: 18-9-21 下午1:31 17 | ''' 18 | 19 | import tensorflow as tf 20 | import numpy as np 21 | import json, cv2, os 22 | import logging 23 | import sys 24 | sys.path.append('../') 25 | 26 | from src.retinanet import RetinaNet 27 | from anchor.anchor_generator import create_retinanet_anchors 28 | from anchor.box_coder import FasterRCNNBoxCoder 29 | from keypoint_subnet.src.backbone import BackBone 30 | from src.draw_box_with_image import get_pred_boxs_with_img 31 | 32 | 33 | FLAGS = tf.flags.FLAGS 34 | tf.flags.DEFINE_string('model', '/media/ulsee/D/retinanet/20181019-2122/model.ckpt-209999', 35 | 'model path you want to test, e.g,. 
(/media/ulsee/D/retinanet/20180920-1337/model.ckpt-xxxx') 36 | tf.flags.DEFINE_string('img_path', '/media/ulsee/E/datasets/test', 37 | 'img path to test model') 38 | tf.flags.DEFINE_string('save_path', '/media/ulsee/E/retinanet/test', 39 | 'model test result to save') 40 | tf.flags.DEFINE_integer(name='batch_size', default=1, help='train batch size number') 41 | tf.flags.DEFINE_integer(name='img_size', default=480, help='net input size') 42 | tf.flags.DEFINE_boolean('is_single_channel', False, 'define the net cls_pred is single channel or not.') 43 | 44 | def draw_boxs(img, boxs, scores): 45 | 46 | for i in range(boxs.shape[0]): 47 | box = boxs[i] 48 | cv2.rectangle(img, (box[1], box[0]), (box[3], box[2]), (255 - 10*i,0,0), 1) 49 | cv2.putText(img, 'person: ' + str(scores[i]), (box[1], box[0]), cv2.FONT_HERSHEY_COMPLEX, 1, (0,0,255), 1) 50 | return img 51 | 52 | def net_(): 53 | if not os.path.exists(FLAGS.save_path): 54 | os.makedirs(FLAGS.save_path) 55 | 56 | graph = tf.Graph() 57 | with graph.as_default(): 58 | 59 | backbone = BackBone(img_size=FLAGS.img_size, batch_size=FLAGS.batch_size, is_training=False) 60 | fpn, _ = backbone.build_fpn_feature() 61 | 62 | net = RetinaNet(fpn=fpn, feature_map_dict=_, batch_size=backbone.batch_size, 63 | num_classes=2, is_training=False) 64 | loc_preds, cls_preds = net.forward() 65 | 66 | #-------------------------------generate anchor----------------------------------------# 67 | input_size = [tf.to_float(FLAGS.img_size), tf.to_float(FLAGS.img_size)] 68 | feature_map_list = [(tf.ceil(tf.multiply(input_size[0], 1 / pow(2., i + 3))), 69 | tf.ceil(tf.multiply(input_size[1], 1 / pow(2., i + 3)))) 70 | for i in range(5)] 71 | anchor_generator = create_retinanet_anchors() 72 | anchor = anchor_generator.generate(input_size, feature_map_list) 73 | 74 | # -------------------------------decode loc_pred---------------------------------------# 75 | current_loc_pred = loc_preds[0] 76 | # 根据anchor将网络的loc输出解码,表示为[ymin, xmin, ymax, xmax] 77 | current_box_list = FasterRCNNBoxCoder().decode(current_loc_pred, anchor.get()) 78 | current_decoded_loc_pred = current_box_list.get() 79 | # -------------------------------------NMS--------------------------------------------# 80 | box_score = tf.nn.softmax(cls_preds[0]) 81 | box_score = box_score[:, 1] 82 | top_k_score, top_k_indices = tf.nn.top_k(box_score, k=60) 83 | decode_boxes = tf.gather(current_decoded_loc_pred, top_k_indices) 84 | valid_indices = tf.image.non_max_suppression(boxes=decode_boxes, scores=top_k_score, max_output_size=6, 85 | iou_threshold=0.5) 86 | final_boxs = tf.gather(decode_boxes, valid_indices) 87 | final_scores = tf.gather(top_k_score, valid_indices) 88 | #----------------------------------------------------------------------------------------# 89 | _box = final_boxs / tf.to_float(FLAGS.img_size) 90 | _box = tf.expand_dims(_box, axis=0) 91 | _img_with_box = tf.image.draw_bounding_boxes(backbone.input_imgs, _box) 92 | #----------------------------------------------------------------------------------------# 93 | init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()) 94 | saver = tf.train.Saver() 95 | 96 | with tf.Session() as sess: 97 | sess.run(init_op) 98 | saver.restore(sess, FLAGS.model) 99 | logging.info('model restore successfully.') 100 | 101 | #----------------load img-----------------# 102 | img_num = 0 103 | for img in os.listdir(FLAGS.img_path): 104 | img_ori = cv2.imread(os.path.join(FLAGS.img_path, img), cv2.IMREAD_COLOR) 105 | img_copy = img_ori.copy() 
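            # Note (added comment): cv2.imread returns None for unreadable or
            # non-image files, so a guard such as "if img_ori is None: continue"
            # right after the imread call above would keep this loop from crashing
            # on stray files in FLAGS.img_path.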
106 | 107 | img_input = cv2.resize(img_copy, (FLAGS.img_size, FLAGS.img_size), 108 | interpolation=cv2.INTER_NEAREST) 109 | # boxs, [n, 4], n = [ymin, xmin, ymax, xmax] 110 | classes, locations, boxs, scores, img_boxs = sess.run([cls_preds, loc_preds, final_boxs, final_scores, _img_with_box], 111 | feed_dict={backbone.input_imgs:[img_input]}) 112 | 113 | #--------------------scale------------------# 114 | factorx = img_ori.shape[1] / img_input.shape[1] 115 | factory = img_ori.shape[0] / img_input.shape[0] 116 | boxs[:,0] = boxs[:,0] * factory 117 | boxs[:,2] = boxs[:,2] * factory 118 | boxs[:,1] = boxs[:,1] * factorx 119 | boxs[:,3] = boxs[:,3] * factorx 120 | #-------------------------------------------# 121 | img_save = draw_boxs(img_ori, boxs, scores) 122 | cv2.imwrite(os.path.join(FLAGS.save_path, img), img_save) 123 | cv2.imwrite(os.path.join(FLAGS.save_path, 'tf' + img), img_boxs[0]) 124 | img_num += 1 125 | logging.info('Testing imgs ... {}'.format(img_num)) 126 | 127 | if img_num > 100: 128 | break 129 | 130 | 131 | if __name__ == '__main__': 132 | logging.basicConfig(level=logging.INFO) 133 | net_() 134 | 135 | -------------------------------------------------------------------------------- /person_detect/person_detect_train.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: person_detect_train.py 7 | @time: 18-9-28 下午2:47 8 | ''' 9 | 10 | import tensorflow as tf 11 | import tensorflow.contrib.slim as slim 12 | import numpy as np 13 | import matplotlib.pyplot as plt 14 | import os, json, cv2, time 15 | from datetime import datetime 16 | 17 | import sys 18 | sys.path.append('../') 19 | 20 | from src.reader import Box_Reader 21 | from src.get_loss import get_loss 22 | from src.draw_box_with_image import get_gt_boxs_with_img, get_pred_boxs_with_img 23 | 24 | from src.retinanet import RetinaNet 25 | from keypoint_subnet.src.backbone import BackBone 26 | 27 | FLAGS = tf.flags.FLAGS 28 | tf.flags.DEFINE_integer('train_nums', 118280, 'train data nums, default: cocotrain2017--118280') 29 | tf.flags.DEFINE_integer('epochs', 10, 'train epochs') 30 | tf.flags.DEFINE_integer('num_classes', 1, '') 31 | tf.flags.DEFINE_integer('batch_size', 3, 'train batch size number') 32 | tf.flags.DEFINE_integer('img_size', 480, 'net input size') 33 | tf.flags.DEFINE_float('learning_rate', 5e-5, 'trian lr') 34 | tf.flags.DEFINE_float('decay_rate', 0.9, 'lr decay rate') 35 | tf.flags.DEFINE_integer('decay_steps', 10000, 'lr decay steps') 36 | tf.flags.DEFINE_string('pretrained_resnet', '/media/ulsee/D/keypoint_subnet/20181015-1711/model.ckpt-64999/model.ckpt-339999', 37 | 'keypoint subnet pretrained model') 38 | tf.flags.DEFINE_boolean('is_training', True, '') 39 | tf.flags.DEFINE_string('checkpoint_path', '/media/ulsee/D/retinanet', 'path to save training model') 40 | tf.flags.DEFINE_string('tfrecord_file', '/media/ulsee/E/person_subnet_tfrecord/coco-instance-5.tfrecord', '') 41 | tf.flags.DEFINE_string('finetuning',None, 42 | 'folder of saved model that you wish to continue training or testing(e.g. 
20180828-1803/model.ckpt-xxx), default: None') 43 | 44 | def person_detect_train(): 45 | os.environ['CUDA_VISIBLE_DEVICES'] = '0' 46 | 47 | # -------------------define where checkpoint path is-------------------------# 48 | current_time = datetime.now().strftime('%Y%m%d-%H%M') 49 | if FLAGS.finetuning is None: 50 | checkpoints_dir = os.path.join(FLAGS.checkpoint_path, current_time) 51 | if not os.path.exists(checkpoints_dir): 52 | try: 53 | os.makedirs(checkpoints_dir) 54 | except: 55 | pass 56 | else: 57 | checkpoints_dir = os.path.join(FLAGS.checkpoint_path, FLAGS.finetuning) 58 | print('checkpoints_dir == {}'.format(checkpoints_dir)) 59 | 60 | # ------------------------------define Graph --------------------------------# 61 | tf.reset_default_graph() 62 | graph = tf.Graph() 63 | with graph.as_default(): 64 | #-----------------------------tf.placeholder-----------------------------# 65 | gt_boxs_placeholder = tf.placeholder(tf.float32, shape=[FLAGS.batch_size, 30, 4]) 66 | gt_labels_placeholder = tf.placeholder(tf.int64, shape=[FLAGS.batch_size, 30,]) 67 | #-------------------------------reader-----------------------------------# 68 | reader = Box_Reader(tfrecord_file=FLAGS.tfrecord_file, img_size=FLAGS.img_size, batch_size=FLAGS.batch_size, epochs=FLAGS.epochs) 69 | img_batch, img_ids, img_height_batch, img_width_batch, gt_boxs, gt_labels = reader.feed() 70 | #--------------------------------net-------------------------------------# 71 | backbone = BackBone(img_size=FLAGS.img_size, batch_size=FLAGS.batch_size, is_training=FLAGS.is_training) 72 | fpn, _ = backbone.build_fpn_feature() 73 | net = RetinaNet(fpn=fpn, feature_map_dict=_, batch_size=backbone.batch_size, 74 | num_classes=FLAGS.num_classes+1, is_training=FLAGS.is_training) 75 | loc_pred, cls_pred = net.forward() 76 | #---------------------------------loss-----------------------------------# 77 | loss, decoded_loc_pred = get_loss(img_size=FLAGS.img_size, batch_size=FLAGS.batch_size, 78 | gt_boxes=gt_boxs_placeholder, loc_pred=loc_pred, 79 | gt_labels=gt_labels_placeholder, cls_pred=cls_pred, 80 | num_classes=FLAGS.num_classes, is_training=FLAGS.is_training) 81 | # -----------------------------learning rate-------------------------------# 82 | global_step = tf.Variable(0, trainable=False) 83 | learning_rate = tf.train.exponential_decay(FLAGS.learning_rate, global_step=global_step, 84 | decay_steps=FLAGS.decay_steps, decay_rate=FLAGS.decay_rate, 85 | staircase=True) 86 | opt = tf.train.AdamOptimizer(learning_rate, epsilon=1e-5) 87 | update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) 88 | with tf.control_dependencies(update_ops): 89 | train_op = opt.minimize(loss, global_step=global_step) 90 | #--------------------------------saver-----------------------------------# 91 | res50_var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='resnet_v2_50') 92 | restore_res50 = tf.train.Saver(var_list=res50_var_list) 93 | fpn_var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='build_fpn_feature') 94 | 95 | global_list = tf.global_variables() 96 | bn_moving_vars = [g for g in global_list if 'moving_mean' in g.name] 97 | bn_moving_vars += [g for g in global_list if 'moving_variance' in g.name] 98 | restore_share = tf.train.Saver(var_list=(res50_var_list+fpn_var_list+bn_moving_vars)) 99 | 100 | var_list = tf.trainable_variables() 101 | retina_var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='retina_net') 102 | saver = 
tf.train.Saver(var_list=(res50_var_list+fpn_var_list+bn_moving_vars+retina_var_list), max_to_keep=10)
103 | saver_alter = tf.train.Saver(max_to_keep=5)
104 | 
105 | #-------------------------------tf summary--------------------------------#
106 | gt_img_batch_with_box = get_gt_boxs_with_img(imgs=backbone.input_imgs, gt_boxs=gt_boxs_placeholder, gt_labels=gt_labels_placeholder,
107 | batch_size=FLAGS.batch_size, img_size=FLAGS.img_size)
108 | pred_img_batch_with_box = get_pred_boxs_with_img(imgs=backbone.input_imgs, decoded_boxs=decoded_loc_pred, cls_pred=cls_pred,
109 | batch_size=FLAGS.batch_size, img_size=FLAGS.img_size)
110 | gt_img_box_placeholder = tf.placeholder(tf.float32,
111 | shape=(FLAGS.batch_size, FLAGS.img_size, FLAGS.img_size, 3))
112 | pred_img_box_placeholder = tf.placeholder(tf.float32,
113 | shape=(FLAGS.batch_size, FLAGS.img_size, FLAGS.img_size, 3))
114 | img_ids_batch_placeholder = tf.placeholder(tf.string, shape=[FLAGS.batch_size,])
115 | tf.summary.text('img_ids', img_ids_batch_placeholder)
116 | tf.summary.image('gt_bbox', gt_img_box_placeholder, max_outputs=2)
117 | tf.summary.image('Pre_bbox', pred_img_box_placeholder, max_outputs=2)
118 | tf.summary.scalar('lr', learning_rate)
119 | tf.summary.scalar('loss', loss)
120 | 
121 | summary_op = tf.summary.merge_all()
122 | summary_writer = tf.summary.FileWriter(checkpoints_dir, graph)
123 | 
124 | #----------------------------------init-----------------------------------#
125 | init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())
126 | config = tf.ConfigProto()
127 | # config.gpu_options.per_process_gpu_memory_fraction = 0.7
128 | # sudo rm -f ~/.nv
129 | config.gpu_options.allow_growth = True
130 | step = 0
131 | #---------------------------------train------------------------------------#
132 | with tf.Session(graph=graph, config=config) as sess:
133 | sess.run(init_op)
134 | 
135 | if FLAGS.finetuning is not None:
136 | saver.restore(sess, checkpoints_dir)
137 | print('Successfully load finetuning model.')
138 | print('Global_step == {}, Step == {}'.format(sess.run(global_step), step))
139 | step = sess.run(global_step)
140 | 
141 | else:
142 | restore_share.restore(sess, FLAGS.pretrained_resnet)
143 | print ('Successfully load pre_trained model.')
144 | 
145 | coord = tf.train.Coordinator()
146 | threads = tf.train.start_queue_runners(sess=sess, coord=coord)
147 | 
148 | start_time = time.time()
149 | try:
150 | while not coord.should_stop():
151 | imgs, ids, heights, widths, boxes, labels = sess.run([img_batch, img_ids, img_height_batch, img_width_batch, gt_boxs, gt_labels])
152 | 
153 | gt_img_box, pre_img_box, \
154 | total_loss, box_pred_list, classes_pred, \
155 | _, lr= sess.run(
156 | [gt_img_batch_with_box, pred_img_batch_with_box,
157 | loss, decoded_loc_pred, cls_pred,
158 | train_op, learning_rate
159 | ], feed_dict={
160 | backbone.input_imgs: imgs,
161 | gt_boxs_placeholder: boxes,
162 | gt_labels_placeholder:labels,
163 | img_ids_batch_placeholder:ids
164 | }
165 | )
166 | # cur_time = time.time()
167 | # print ('sess run spend {}'.format(cur_time-pre_time))
168 | # pre_time = cur_time
169 | 
170 | #-------------------summary------------------------#
171 | # gt_img_box_placeholder: gt_img_box,
172 | merge_op = sess.run(summary_op,feed_dict={backbone.input_imgs: imgs,
173 | gt_boxs_placeholder: boxes,
174 | gt_labels_placeholder:labels,
175 | pred_img_box_placeholder:pre_img_box,
176 | gt_img_box_placeholder: gt_img_box,
177 | img_ids_batch_placeholder: ids}) 178 |
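# Note: because the image summaries read from gt_img_box_placeholder / pred_img_box_placeholder,
# summary_op is evaluated in a second sess.run here, which re-executes the backbone forward pass
# and the loss with the same feed. One possible simplification (a sketch only, not part of the
# original script) would be to point tf.summary.image directly at the drawn-box tensors, e.g.
#     tf.summary.image('gt_bbox', gt_img_batch_with_box, max_outputs=2)
#     tf.summary.image('Pre_bbox', pred_img_batch_with_box, max_outputs=2)
# and then fetch summary_op together with train_op in the first sess.run, avoiding the extra pass.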
summary_writer.add_summary(merge_op, step) 179 | summary_writer.flush() 180 | 181 | # cur_time = time.time() 182 | # print('merge op spend {}'.format(cur_time - pre_time)) 183 | # pre_time = cur_time 184 | 185 | if (step+1) % 10 == 0: 186 | cur_time = time.time() 187 | print('Step = {}, Total loss = {}, time spend = {}'.format(step, total_loss, cur_time-start_time)) 188 | start_time = cur_time 189 | 190 | if (step+1) % 2000 == 0: 191 | save_path = saver.save(sess, checkpoints_dir + '/model.ckpt', global_step=step) 192 | print('Model saved in file: %s' % save_path) 193 | save_path_alter = saver_alter.save(sess, checkpoints_dir + '/model_alter.ckpt', 194 | global_step=step) 195 | 196 | step += 1 197 | # print (step) 198 | # if step == 10: 199 | # break 200 | 201 | except KeyboardInterrupt: 202 | print ('Interrupted') 203 | coord.request_stop() 204 | 205 | except Exception as e: 206 | coord.request_stop(e) 207 | 208 | finally: 209 | save_path = saver.save(sess, checkpoints_dir + '/model.ckpt', global_step=step) 210 | print ('Model saved in file: %s' % save_path) 211 | save_path_alter = saver_alter.save(sess, checkpoints_dir + '/model_alter.ckpt', global_step=step) 212 | # When done, ask the threads to stop. 213 | coord.request_stop() 214 | coord.join(threads) 215 | 216 | 217 | 218 | 219 | if __name__ == '__main__': 220 | person_detect_train() -------------------------------------------------------------------------------- /person_detect/src/__init__.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: __init__.py 7 | @time: 18-9-28 下午2:45 8 | ''' -------------------------------------------------------------------------------- /person_detect/src/backbone.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: backbone.py 7 | @time: 18-9-28 上午11:03 8 | ''' 9 | 10 | from __future__ import absolute_import, division, print_function 11 | 12 | import tensorflow as tf 13 | 14 | from tensorflow.contrib.slim import nets 15 | from tensorflow.contrib.layers.python.layers import utils 16 | import tensorflow.contrib.slim as slim 17 | 18 | class BackBone(object): 19 | def __init__(self, inputs, img_size, batch_size, is_training=True): 20 | self.img_size = img_size 21 | self.batch_size = batch_size 22 | self.input_imgs = inputs 23 | self.is_training = is_training 24 | self.stddev = 0.01 25 | 26 | def get_feature_map(self): 27 | #-------------------resent---------------------# 28 | arg_scope = nets.resnet_v2.resnet_arg_scope() 29 | with slim. 
arg_scope(arg_scope): 30 | out, end_points = nets.resnet_v2.resnet_v2_50(inputs=self.input_imgs, num_classes=None, is_training=self.is_training) 31 | #---------------feature map dict---------------# 32 | feature_map_dict = { 33 | 'C2': end_points['resnet_v2_50/block1/unit_2/bottleneck_v2'], # input_size / 4 34 | 'C3': end_points['resnet_v2_50/block2/unit_3/bottleneck_v2'], # input_size / 8 35 | 'C4': end_points['resnet_v2_50/block3/unit_4/bottleneck_v2'], # input_size / 16 36 | 'C5': end_points['resnet_v2_50/block4'] # input_size / 32 37 | } 38 | return feature_map_dict 39 | 40 | def build_fpn_feature(self): 41 | feature_pyramid = {} 42 | feature_map_dict = self.get_feature_map() 43 | #------------------------------------------build fpn-------------------------------------------# 44 | with tf.variable_scope('build_fpn_feature'): 45 | with slim.arg_scope([slim.conv2d], weights_initializer=tf.random_normal_initializer(stddev=self.stddev)): 46 | feature_pyramid['P5'] = slim.conv2d(feature_map_dict['C5'], num_outputs=256, kernel_size=[1, 1], stride=1, 47 | scope='build_fpn_P5') 48 | 49 | #------------------ top-down pathway and lateral connections--------------------------# 50 | for layer in range(4, 1, -1): 51 | p = feature_pyramid['P' + str(layer + 1)] 52 | c = feature_map_dict['C' + str(layer)] 53 | 54 | #---------------------------------- upsample p -----------------------------------# 55 | up_shape = c.get_shape() 56 | up_sample = tf.image.resize_nearest_neighbor(p, [up_shape[2], up_shape[2]], 57 | name='upsampling_fpn_P%d' % layer) 58 | 59 | #----------------------------------- 1x1 conv ------------------------------------# 60 | c = slim.conv2d(c, num_outputs=256, kernel_size=[1, 1], stride=1, scope='fpn_1x1conv_C%d' % layer) 61 | p = up_sample + c 62 | 63 | #----------------------reduce aliasing effect of upsampling ----------------------# 64 | #---------------(in the third last paragraph, Section 3, Paper FPN)---------------# 65 | p = slim.conv2d(p, num_outputs=256, kernel_size=[3, 3], stride=1, padding='SAME', 66 | scope='build_fpn_P%d' % layer) 67 | 68 | feature_pyramid['P' + str(layer)] = p 69 | 70 | return feature_pyramid, feature_map_dict 71 | 72 | -------------------------------------------------------------------------------- /person_detect/src/convert_tfrecord.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: convert_tfrecord.py.py 7 | @time: 18-9-28 下午6:55 8 | ''' 9 | import tensorflow as tf 10 | import cv2, os, json 11 | import numpy as np 12 | import matplotlib.pyplot as plt 13 | 14 | FLAGS = tf.flags.FLAGS 15 | tf.flags.DEFINE_string('json_file', '/media/ulsee/E/datasets/coco/annotations2017/coco-instance-imgid-bbox.json', '') 16 | tf.flags.DEFINE_string('img_path', '/media/ulsee/E/datasets/coco/cocotrain2017', 'image dataset path need to convert to tfrecord') 17 | tf.flags.DEFINE_string('tfrecord_file', '/media/ulsee/E/person_subnet_tfrecord/coco-instance-with-ids.tfrecord', 'tfrecord file') 18 | 19 | def _int64_feature(value): 20 | ''' Wrapper for inserting int64 feature into Example proto''' 21 | if not isinstance(value, list): 22 | value = [value] 23 | return tf.train.Feature(int64_list = tf.train.Int64List(value=value)) 24 | 25 | def _float_feature(value): 26 | ''' Wrapper for inserting float feature into Example proto''' 27 | if not isinstance(value, list): 28 | value = [value] 29 | return 
tf.train.Feature(float_list=tf.train.FloatList(value=value)) 30 | 31 | def _bytes_feature(value): 32 | ''' Wrapper for inserting bytes feature into Example proto''' 33 | if not isinstance(value, list): 34 | value = [value] 35 | return tf.train.Feature(bytes_list = tf.train.BytesList(value=value)) 36 | 37 | def _string_feature(value): 38 | ''' Wrapper for inserting string (actually bytes) feature into Example proto''' 39 | if not isinstance(value, list): 40 | value = [value] 41 | return tf.train.Feature(bytes_list = tf.train.BytesList(value=value)) 42 | 43 | def _process_one_image(img_file, bboxes, person_id=1): 44 | ''' 45 | 46 | :param img_file: the img file that will be read and processing 47 | :param bboxes: a list, contains box and crossponding label, format is [xmin, ymin, w, h, category_id]*n, n is the number of annotationed person 48 | :param person_id: the category_id that person is 49 | :return: 50 | img_data: binary image file that reading from tf.gfile.FastGFile 51 | img_shape: [height, widht, channels] of img 52 | bboxs: a list, [ymin, xmin, ymax, xmax] * n 53 | labels: a list, [person_id,], size = n 54 | ''' 55 | 56 | # read img data 57 | img_data = tf.gfile.FastGFile(img_file, 'rb').read() 58 | img_shape = cv2.imread(img_file).shape 59 | 60 | # deal with bboxes 61 | bboxs = [] 62 | labels = [] 63 | box_num = len(bboxes) // 5 64 | 65 | for i in range(box_num): 66 | if bboxes[i*5+4] != person_id: 67 | continue 68 | box = bboxes[i*5:i*5+4] 69 | label = bboxes[i*5+4] 70 | box[2] += box[0] 71 | box[3] += box[1] 72 | #----convert box format [xmin, ymin, xmax, ymax] to [ymin, xmin, ymax, xmax]-------# 73 | tmp = box[0] 74 | box[0] = box[1] 75 | box[1] = tmp 76 | tmp = box[2] 77 | box[2] = box[3] 78 | box[3] = tmp 79 | #-----------------------------------------------------------------------------------# 80 | bboxs += box 81 | labels.append(label) 82 | 83 | return img_data, img_shape, bboxs, labels 84 | 85 | def convert_to_tfrecord(json_file, tfrecord_file): 86 | ''' 87 | especially reading coco-json file 88 | 89 | :param json_file: prepared json_file that contains coco dataset person annotations, the format is a map, which key is img_name without suffix, and value is 90 | a list contains person_num * 5 elements, the each five elements is like [xmin, ymin, w, h, category_id]. 
91 | :param tfrecord_file: the tfrecord file that we save 92 | :return: 93 | ''' 94 | 95 | tfrecord_dir = os.path.dirname(tfrecord_file) 96 | if not os.path.exists(tfrecord_dir): 97 | os.makedirs(tfrecord_dir) 98 | 99 | writer = tf.python_io.TFRecordWriter(tfrecord_file) 100 | f = open(json_file, encoding='utf-8') 101 | labels = json.load(f) 102 | 103 | total_img_nums = len(labels) 104 | count = 0 105 | count_zero = 0 106 | for key, value in labels.items(): 107 | img_name = key + '.jpg' 108 | img_data, shape, bboxs, labels = _process_one_image(os.path.join(FLAGS.img_path, img_name), value) 109 | if not bboxs: 110 | count_zero += 1 111 | continue 112 | 113 | # ----if len(bboxs)//4 < n (set n = 20), add zeros to make len(bboxs)//4 == n------------# 114 | n = 30 115 | if len(bboxs) < n * 4: 116 | last = n * 4 - len(bboxs) 117 | bboxs += list(np.zeros(last, dtype=np.float32)) 118 | labels += list(np.zeros(last // 4, dtype=np.int32)) 119 | else: 120 | bboxs = bboxs[:n * 4] 121 | labels = labels[:n] 122 | # ----------------------------------------------------------------------------------------# 123 | 124 | img_format = b'JPEG' 125 | example = tf.train.Example(features=tf.train.Features( 126 | feature = { 127 | 'image':_bytes_feature(img_data), 128 | 'id':_string_feature(bytes(key, encoding='utf-8')), 129 | 'height':_int64_feature(shape[0]), 130 | 'width':_int64_feature(shape[1]), 131 | 'format':_bytes_feature(img_format), 132 | 'channel':_int64_feature(shape[2]), 133 | 'boxes':_float_feature(bboxs), # [xmin, ymin, xmax, ymax] * 30 134 | 'labels':_int64_feature(labels) 135 | } 136 | )) 137 | writer.write(example.SerializeToString()) 138 | count += 1 139 | 140 | # if count == 5: 141 | # break 142 | 143 | if count % 1000 == 0: 144 | print ('Processing {}/{}'.format(count, total_img_nums)) 145 | print ('No human box imgs nums {}/{}'.format(count_zero, total_img_nums)) 146 | print('Converting tfrecord done.') 147 | writer.close() 148 | 149 | def convert_ai_challenger_tfrecord(tfrecord_file, json_file = '/media/ulsee/E/datasets/ai_challenger_keypoint_train_20170909/keypoint_train_annotations_20170909.json'): 150 | f = open(json_file, encoding='utf-8') 151 | labels = json.load(f) 152 | img_path = '/media/ulsee/E/datasets/ai_challenger_keypoint_train_20170909/keypoint_train_images_20170902' 153 | 154 | tfrecord_dir = os.path.dirname(tfrecord_file) 155 | if not os.path.exists(tfrecord_dir): 156 | os.makedirs(tfrecord_dir) 157 | 158 | writer = tf.python_io.TFRecordWriter(tfrecord_file) 159 | total_img_nums = len(labels) 160 | count = 0 161 | count_zero = 0 162 | 163 | for label in labels: 164 | img_file = os.path.join(img_path, label['image_id'] + '.jpg') 165 | bbox = [] 166 | category_id = [] 167 | annotations = label['human_annotations'] 168 | for key, value in annotations.items(): 169 | #------convert box format [xmin, ymin, xmax, ymax] into [ymin, xmin, ymax, xmax]-------# 170 | if len(value) != 4: 171 | raise ValueError('the box size must be equal to 4!!!!') 172 | tmp = value[0] 173 | value[0] = value[1] 174 | value[1] = tmp 175 | tmp = value[2] 176 | value[2] = value[3] 177 | value[3] = tmp 178 | #--------------------------------------------------------------------------------------# 179 | bbox += value 180 | category_id.append(1) 181 | if not bbox: 182 | count_zero += 1 183 | continue 184 | 185 | # ----if len(bboxs)//4 < n (set n = 20), add zeros to make len(bboxs)//4 == n------------# 186 | n = 30 187 | if len(bbox) < n * 4: 188 | last = n * 4 - len(bbox) 189 | bbox += list(np.zeros(last, 
dtype=np.float32)) 190 | category_id += list(np.zeros(last // 4, dtype=np.int32)) 191 | else: 192 | bbox = bbox[:n * 4] 193 | category_id = category_id[:n] 194 | # ----------------------------------------------------------------------------------------# 195 | 196 | img_data = tf.gfile.FastGFile(img_file, 'rb').read() 197 | img_dat = cv2.imread(img_file, cv2.IMREAD_COLOR) 198 | shape = img_dat.shape 199 | img_format = b'JPEG' 200 | 201 | # add to tfrecord 202 | example = tf.train.Example(features=tf.train.Features( 203 | feature={ 204 | 'image': _bytes_feature(img_data), 205 | 'height': _int64_feature(shape[0]), 206 | 'width': _int64_feature(shape[1]), 207 | 'format': _bytes_feature(img_format), 208 | 'channel': _int64_feature(shape[2]), 209 | 'boxes': _float_feature(bbox), # [xmin, ymin, xmax, ymax] * n 210 | 'labels': _int64_feature(category_id) 211 | } 212 | )) 213 | writer.write(example.SerializeToString()) 214 | count += 1 215 | 216 | # if count == 10: 217 | # break 218 | if count % 1000 == 0: 219 | print ('Processing {}/{}'.format(count, total_img_nums)) 220 | writer.close() 221 | print('Zeros box img nums {}'.format(count_zero)) 222 | print('Convert tfrecord done.') 223 | 224 | if __name__ == '__main__': 225 | 226 | convert_to_tfrecord(FLAGS.json_file, FLAGS.tfrecord_file) 227 | # convert_ai_challenger_tfrecord(FLAGS.tfrecord_file) -------------------------------------------------------------------------------- /person_detect/src/draw_box_with_image.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: draw_box_with_image.py 7 | @time: 18-9-28 下午3:07 8 | ''' 9 | 10 | import tensorflow as tf 11 | import numpy as np 12 | from src.get_loss import deal_zeros_box 13 | 14 | def get_gt_boxs_with_img(imgs, gt_boxs, gt_labels, batch_size, img_size): 15 | gt_img_batch_with_box = [] 16 | for i in range(batch_size): 17 | 18 | # remove zeros box [0,0,0,0] 19 | current_loc = gt_boxs[i] 20 | current_cls = gt_labels[i] 21 | current_loc, current_cls = deal_zeros_box(current_loc, current_cls) 22 | current_gt_box = current_loc / tf.to_float(img_size) 23 | 24 | # draw box on single image 25 | img_batch_i = imgs[i] 26 | output_box_batch_i = tf.expand_dims(current_gt_box, axis=0) 27 | img_batch_i = tf.expand_dims(img_batch_i, axis=0) 28 | 29 | img_batch_i_with_box = tf.image.draw_bounding_boxes(images=img_batch_i, boxes=output_box_batch_i) 30 | gt_img_batch_with_box.append(img_batch_i_with_box) 31 | 32 | gt_img_batch_with_box = tf.reshape(tf.concat(gt_img_batch_with_box, axis=0), 33 | shape=(batch_size, img_size, img_size, 3)) 34 | return gt_img_batch_with_box 35 | 36 | def get_pred_boxs_with_img(imgs, decoded_boxs, cls_pred, batch_size, img_size): 37 | 38 | batch_output_box = [] 39 | batch_output_box_score = [] 40 | for i in range(batch_size): 41 | box_score = tf.nn.softmax(cls_pred[i]) 42 | box_score = box_score[:, 1] 43 | top_k_score, top_k_indices = tf.nn.top_k(box_score, k=60) 44 | decode_boxes = tf.gather(decoded_boxs[i], top_k_indices) 45 | valid_indices = tf.image.non_max_suppression(boxes=decode_boxes, scores=top_k_score, max_output_size=6, 46 | iou_threshold=0.5) 47 | output_loc = tf.gather(decode_boxes, valid_indices) 48 | output_score = tf.gather(top_k_score, valid_indices) 49 | batch_output_box.append(output_loc) 50 | batch_output_box_score.append(output_score) 51 | 52 | pred_img_batch_with_box = [] 53 | for i in range(batch_size): 54 | 
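# tf.image.draw_bounding_boxes expects boxes as [ymin, xmin, ymax, xmax] normalized to [0, 1],
# while the boxes kept by the top-k (k=60) + NMS (max 6 boxes, IoU 0.5) selection above are
# still in pixel units (0..img_size), so each batch element is divided by img_size before drawing.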
output_box_batch_i = batch_output_box[i] / tf.to_float(img_size) 55 | img_batch_i = imgs[i] 56 | output_box_batch_i = tf.expand_dims(output_box_batch_i, axis=0) 57 | img_batch_i = tf.expand_dims(img_batch_i, axis=0) 58 | img_batch_i_with_box = tf.image.draw_bounding_boxes(images=img_batch_i, boxes=output_box_batch_i) 59 | pred_img_batch_with_box.append(img_batch_i_with_box) 60 | 61 | pred_img_batch_with_box = tf.reshape(tf.concat(pred_img_batch_with_box, axis=0), 62 | shape=(batch_size, img_size, img_size, 3)) 63 | return pred_img_batch_with_box 64 | -------------------------------------------------------------------------------- /person_detect/src/get_loss.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: get_loss.py 7 | @time: 18-9-28 下午2:53 8 | ''' 9 | 10 | import tensorflow as tf 11 | import numpy as np 12 | 13 | from anchor.anchor_generator import create_retinanet_anchors, anchor_assign 14 | from anchor.box_coder import FasterRCNNBoxCoder 15 | from anchor.box_list import BoxList 16 | 17 | from src.loss import focal_loss, regression_loss 18 | 19 | def get_loss(img_size, batch_size, gt_boxes, loc_pred, gt_labels, cls_pred, num_classes=1, is_training=True): 20 | 21 | #--------------------based gt get anchors_list------------------------# 22 | anchors_list = get_inputs(img_size=img_size, batch_size=batch_size, gt_boxes=gt_boxes, 23 | gt_labels=gt_labels, is_training=is_training) 24 | #-----------------------------net-------------------------------------# 25 | # backbone = BackBone(img_size, batch_size, is_training=is_training) 26 | # fpn = backbone.build_fpn_feature() 27 | # net = RetinaNet(fpn=fpn, batch_size=batch_size, num_classes=num_classes, is_training=is_training) 28 | # loc_pred, cls_pred = net.forward() 29 | 30 | # ----------------------decode pred_boxs-----------------------# 31 | # ----convert [ty, tx, th, tw] to [ymin, xmin, ymax, xmax]-----# 32 | decoded_loc_pred = [] 33 | for i in range(batch_size): 34 | anchor = anchors_list[i] 35 | current_loc_pred = loc_pred[i] 36 | # 根据anchor将网络的loc输出解码,表示为[ymin, xmin, ymax, xmax] 37 | current_box_list = FasterRCNNBoxCoder().decode(current_loc_pred, anchor.get()) 38 | current_decoded_loc_pred = current_box_list.get() 39 | decoded_loc_pred.append(current_decoded_loc_pred) 40 | 41 | #---------get num of anchor overlapped with ground truth box------------# 42 | cls_gt = [anchor.get_field('gt_labels') for anchor in 43 | anchors_list] # a list, contains batchs number tensor, each tensor is 1D contains #anchors label 44 | loc_gt = [anchor.get_field('gt_encoded_boxes') for anchor in 45 | anchors_list] # a list, contains batchs number tensor, each tensor (gt_encoded_boxes) shape is [-1, 4], 46 | # the format of gt_encoded_boxes is [ymin, xmin, ymax, xmax] 47 | #--------------------------calculate loss-------------------------------# 48 | total_loss = 0 49 | for i in range(batch_size): 50 | single_cls_gt = cls_gt[i] # [#anchors,] 51 | single_loc_gt = loc_gt[i] # [#anchors,4] 52 | single_cls_pred = cls_pred[i] # [#anchors,2] 53 | single_loc_pred = loc_pred[i] # [#anchors,4] 54 | 55 | # print(single_cls_pred.get_shape(), single_cls_gt.get_shape()) 56 | 57 | # focal loss, remove anchor which label equal to -1 58 | # 因为前面生成的gt_labels,会有的anchor在iou [0.4,0.5)之间,标签为-1,要忽略掉,所以要先把这些去掉 59 | valid_anchor_indices = tf.where(tf.greater_equal(single_cls_gt, 0)) 60 | valid_cls_gt = 
tf.gather_nd(single_cls_gt, valid_anchor_indices) 61 | valid_cls_pred = tf.gather_nd(single_cls_pred, valid_anchor_indices) 62 | cls_gt_onehot = tf.one_hot(valid_cls_gt, depth=num_classes + 1) # [#anchors, depth] 63 | floss = focal_loss(cls_gt_onehot, valid_cls_pred) 64 | 65 | # location regression loss, remove background which label == 0 66 | valid_cls_indices = tf.where(tf.greater(single_cls_gt, 0)) 67 | valid_loc_gt = tf.reshape(tf.gather_nd(single_loc_gt, valid_cls_indices), shape=(-1, 4)) 68 | valid_loc_preds = tf.reshape(tf.gather_nd(single_loc_pred, valid_cls_indices), shape=(-1, 4)) 69 | loc_loss = regression_loss(valid_loc_preds, valid_loc_gt) 70 | 71 | 72 | total_loss = total_loss + tf.reduce_sum(floss) + tf.reduce_sum(loc_loss) 73 | 74 | loss = tf.to_float(total_loss) / tf.to_float(batch_size) 75 | return loss, decoded_loc_pred 76 | 77 | 78 | def get_inputs(img_size, batch_size, gt_boxes, gt_labels, is_training=True): 79 | loc_gt = gt_boxes 80 | cls_gt = gt_labels #[batch_size, #gt_anchors_number] 81 | 82 | # print (loc_gt.get_shape(), cls_gt.get_shape()) 83 | # get anchors 84 | anchors_list = [] 85 | for i in range(batch_size): 86 | input_size = [tf.to_float(img_size), tf.to_float(img_size)] 87 | feature_map_list = [(tf.ceil(tf.multiply(input_size[0], 1/pow(2., i+3))), 88 | tf.ceil(tf.multiply(input_size[1], 1/pow(2., i+3)))) 89 | for i in range(5)] 90 | anchor_generator = create_retinanet_anchors() 91 | anchor = anchor_generator.generate(input_size, feature_map_list) 92 | 93 | current_loc_gt = loc_gt[i] #[#gt_anchors_number, 4] 94 | current_cls_gt = cls_gt[i] #[#gt_anchors_number] 95 | # print('Before remove zeros boxs, loc_gt shape = {}, cls_gt shape = {}'.format(current_loc_gt.get_shape(), current_cls_gt.get_shape())) 96 | current_loc_gt, current_cls_gt = deal_zeros_box(current_loc_gt, current_cls_gt) 97 | # print('After remove zeros boxs, loc_gt shape = {}, cls_gt shape = {}'.format(current_loc_gt.get_shape(), current_cls_gt.get_shape())) 98 | 99 | 100 | anchor = anchor_assign(anchor, gt_boxes=BoxList(current_loc_gt), gt_labels=current_cls_gt, is_training=is_training) 101 | 102 | # encode anchor boxes 103 | gt_boxes = anchor.get_field('gt_boxes') 104 | 105 | encoded_gt_boxes = FasterRCNNBoxCoder().encode(gt_boxes, anchor.get()) 106 | anchor.add_field('gt_encoded_boxes', encoded_gt_boxes) 107 | anchors_list.append(anchor) 108 | 109 | return anchors_list 110 | 111 | def deal_zeros_box(gt_boxes, gt_labels): 112 | ''' 113 | can not do anything, because one dim in gt_boxes and gt_labels is ? 114 | update: now, we set ? 
= 30 in tfrecord file, so we can deal with zeros boxs 115 | :param gt_boxes: [#boxs, 4] 116 | :param gt_labels: [#boxs] 117 | :return: 118 | ''' 119 | #------------------------deal boxs--------------------------------------------------------# 120 | gt_boxs = tf.unstack(gt_boxes, axis=0) # gt_boxs, a list contains nums boxs which has shape(4,) 121 | gt_box = tf.expand_dims(gt_boxes[0], axis=0) 122 | is_first = True # the first box is always non-zero box 123 | 124 | for box in gt_boxs: 125 | if is_first: 126 | is_first = False 127 | continue 128 | gt_box = tf.cond(tf.equal(tf.reduce_sum(box), tf.reduce_sum(tf.zeros_like(box))), lambda: gt_box, 129 | lambda: tf.concat([gt_box, tf.expand_dims(box, axis=0)], axis=0)) 130 | 131 | 132 | #---------------------------deal labels--------------------------------------------------# 133 | gt_labels = tf.unstack(gt_labels, axis=0) 134 | gt_label = tf.expand_dims(gt_labels[0], axis=0) 135 | is_first = True # the first label is always non-background 136 | for label in gt_labels: 137 | if is_first: 138 | is_first = False 139 | continue 140 | gt_label = tf.cond(tf.equal(tf.reduce_sum(label), tf.reduce_sum(tf.zeros_like(label))), lambda: gt_label, 141 | lambda : tf.concat([gt_label, tf.expand_dims(label, axis=0)], axis=0)) 142 | 143 | gt_label = tf.reshape(gt_label, shape=(-1,)) 144 | 145 | return gt_box, gt_label -------------------------------------------------------------------------------- /person_detect/src/loss.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: loss.py 7 | @time: 18-9-5 上午10:47 8 | ''' 9 | import tensorflow as tf 10 | 11 | slim = tf.contrib.slim 12 | 13 | def focal_loss(onehot_labels, cls_preds, 14 | alpha=0.25, gamma=2.0, name=None, scope=None): 15 | """Compute sigmoid focal loss between logits and onehot labels 16 | 17 | logits and onehot_labels must have same shape [batchsize, num_classes] and 18 | the same data type (float16, 32, 64) 19 | 20 | Args: 21 | onehot_labels: Each row labels[i] must be a valid probability distribution 22 | cls_preds: Unscaled log probabilities 23 | alpha: The hyperparameter for adjusting biased samples, default is 0.25 24 | gamma: The hyperparameter for penalizing the easy labeled samples 25 | name: A name for the operation (optional) 26 | 27 | Returns: 28 | A 1-D tensor of length batch_size of same type as logits with softmax focal loss 29 | """ 30 | 31 | with tf.name_scope(scope, 'focal_loss', [cls_preds, onehot_labels]) as sc: 32 | # logits = tf.convert_to_tensor(cls_preds) 33 | # onehot_labels = tf.convert_to_tensor(onehot_labels) 34 | 35 | # precise_logits = tf.cast(logits, tf.float32) if (logits.dtype == tf.float16) else logits 36 | 37 | onehot_labels = tf.cast(onehot_labels, cls_preds.dtype) 38 | 39 | 40 | predictions = tf.nn.softmax(cls_preds) 41 | 42 | predictions_pt = tf.where(tf.equal(onehot_labels, 1), predictions, 1.-predictions) 43 | # add small value to avoid 0 44 | epsilon = 1e-8 45 | alpha_t = tf.scalar_mul(alpha, tf.ones_like(onehot_labels, dtype=tf.float32)) 46 | alpha_t = tf.where(tf.equal(onehot_labels, 1.0), alpha_t, 1-alpha_t) 47 | losses = tf.reduce_sum(-alpha_t * tf.pow(1. - predictions_pt, gamma) * tf.log(predictions_pt+epsilon)) 48 | return losses 49 | 50 | 51 | def focal_loss_alt(x, y): 52 | """Focal loss alternative. 
53 | 54 | Args: 55 | x: (tensor) sized [N, D] 56 | y: (tensor) sized [N, D] 57 | num_classes: numbers of classes 58 | 59 | Return: 60 | (tensor) focal loss. 61 | """ 62 | alpha = 0.25 63 | t = y 64 | # t = t[:, 1:] 65 | 66 | xt = x * (2 * t - 1) # xt = x if t > 0 else -x 67 | pt = tf.log_sigmoid(2 * xt + 1) 68 | 69 | w = alpha * t + (1 - alpha) * (1 - t) 70 | loss = -w * pt / 2 71 | return tf.reduce_sum(loss) 72 | 73 | def regression_loss(pred_boxes, gt_boxes, weights=1.0): 74 | """ 75 | Regression loss (Smooth L1 loss: also known as huber loss) 76 | 77 | Args: 78 | pred_boxes: [# anchors, 4] 79 | gt_boxes: [# anchors, 4] 80 | weights: Tensor of weights multiplied by loss with shape [# anchors] 81 | """ 82 | loss = tf.losses.huber_loss(predictions=pred_boxes, labels=gt_boxes, 83 | weights=weights, scope='box_loss') 84 | 85 | return loss 86 | 87 | 88 | def loss_test(): 89 | logits = tf.convert_to_tensor([[0.2, 0.3, 0.4, 0.5], [0.5, 0.4, 0.3, 0.2]]) 90 | labels = slim.one_hot_encoding([1, 2], 4) 91 | bbox = tf.ones_like(logits) 92 | with tf.Session() as sess: 93 | print (sess.run(labels)) 94 | print (sess.run(logits)) 95 | print (sess.run(focal_loss(onehot_labels=labels, cls_preds=logits))) 96 | print (sess.run(regression_loss(logits, bbox, tf.expand_dims(1./tf.convert_to_tensor([2, 3], dtype=tf.float32), 1)))) 97 | sess.close() 98 | 99 | # test() 100 | if __name__ == '__main__': 101 | loss_test() -------------------------------------------------------------------------------- /person_detect/src/reader.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: reader.py 7 | @time: 18-9-28 下午2:39 8 | ''' 9 | 10 | import tensorflow as tf 11 | # from src.retinanet import RetinaNet 12 | 13 | class Box_Reader(object): 14 | def __init__(self, tfrecord_file, img_size=224, batch_size=1, epochs=1): 15 | self.img_size = img_size 16 | self.batch_size = batch_size 17 | self.epochs = epochs 18 | self.tfrecord_file = tfrecord_file 19 | self.reader = tf.TFRecordReader() 20 | 21 | def feed(self): 22 | filename_queue = tf.train.string_input_producer([self.tfrecord_file], num_epochs=self.epochs) 23 | reader = tf.TFRecordReader() 24 | _, serialized_example = reader.read(filename_queue) 25 | 26 | features = tf.parse_single_example( 27 | serialized_example, 28 | features={ 29 | 'image': tf.FixedLenFeature((), tf.string), 30 | 'id': tf.FixedLenFeature([], tf.string), 31 | 'format': tf.FixedLenFeature((), tf.string, 'jpeg'), 32 | 'height': tf.FixedLenFeature([], tf.int64), 33 | 'width': tf.FixedLenFeature([], tf.int64), 34 | 'channel': tf.FixedLenFeature([], tf.int64), 35 | 'boxes': tf.VarLenFeature(dtype=tf.float32), 36 | 'labels': tf.VarLenFeature(dtype=tf.int64) 37 | } 38 | ) 39 | channel = tf.cast(features['channel'], tf.int64) 40 | img = tf.image.decode_jpeg(features['image'], channels=3) # tensor, [height, width, channels] 41 | img_id = features['id'] 42 | # img = tf.decode_raw(features['image'], tf.uint8) 43 | img = tf.image.convert_image_dtype(img, dtype=tf.float32) 44 | img_height = tf.cast(features['height'], tf.int32) 45 | img_width = tf.cast(features['width'], tf.int32) 46 | 47 | # img = tf.reshape(img, shape=[img_height, img_width, 3]) 48 | # img = (img - 0) / 255 # network image input need to be float type 49 | # img = tf.to_float(img) 50 | 51 | # features['boxes'] && features['lables'] both SparseTensor type, to get real value stored, need get 
attribution 'values' 52 | boxs = features['boxes'].values 53 | label = features['labels'].values 54 | # must identify boxs shape and labels shape, otherwise program can not get the shape correctlly 55 | boxs = tf.reshape(boxs, shape=(30, 4)) 56 | label = tf.reshape(label, shape=(30,)) 57 | 58 | if True: 59 | img, boxs = self._pre_processing(img, img_height, img_width, boxs) 60 | 61 | imgs, img_ids, heights, widths, boxes, labels = tf.train.shuffle_batch( 62 | [img, img_id, img_height, img_width, boxs, label], 63 | batch_size=self.batch_size, 64 | num_threads=12, 65 | capacity=1000, 66 | min_after_dequeue=400 67 | ) 68 | 69 | return imgs, img_ids, heights, widths, boxes, labels 70 | 71 | def _pre_processing(self, img, height, width, bbox): 72 | img = tf.expand_dims(img, axis=0) 73 | img = tf.image.resize_nearest_neighbor(img, (self.img_size, self.img_size)) 74 | img = tf.squeeze(img, axis=0) 75 | 76 | factorx = tf.to_float(self.img_size) / tf.to_float(width) 77 | factory = tf.to_float(self.img_size) / tf.to_float(height) 78 | 79 | bbox = tf.concat([tf.reshape(bbox[:, 0] * factory, (-1, 1)), 80 | tf.reshape(bbox[:, 1] * factorx, (-1, 1)), 81 | tf.reshape(bbox[:, 2] * factory, (-1, 1)), 82 | tf.reshape(bbox[:, 3] * factorx, (-1, 1))], 83 | axis=1) 84 | return img, bbox 85 | 86 | def reader_test(): 87 | batch = 1 88 | epochs = 1 89 | reader = Box_Reader(tfrecord_file='/media/ulsee/E/person_subnet_tfrecord/coco-instance-5.tfrecord', batch_size=batch, epochs=epochs) 90 | imgs, ids, hs, ws, boxs, labels = reader.feed() 91 | init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()) 92 | with tf.Session() as sess: 93 | sess.run(init_op) 94 | step = 0 95 | coord = tf.train.Coordinator() 96 | threads = tf.train.start_queue_runners(coord=coord) 97 | 98 | try : 99 | while not coord.should_stop(): 100 | a, b, c, d, e, f = sess.run([imgs, ids, hs, ws, labels, boxs]) 101 | print (b) 102 | step +=1 103 | except tf.errors.OutOfRangeError: 104 | print ('batch = {}, epochs = {}, steps = {}'.format(batch, epochs, step)) 105 | finally: 106 | coord.request_stop() 107 | coord.join(threads) 108 | 109 | if __name__ == '__main__': 110 | reader_test() -------------------------------------------------------------------------------- /person_detect/src/retinanet.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: retinanet.py 7 | @time: 18-9-28 下午2:17 8 | ''' 9 | 10 | from __future__ import absolute_import, division, print_function 11 | 12 | import tensorflow as tf 13 | from tensorflow.contrib.slim import nets 14 | import tensorflow.contrib.slim as slim 15 | import math 16 | 17 | class RetinaNet(object): 18 | def __init__(self, fpn, feature_map_dict, batch_size, num_classes, num_anchors=9, is_training=True): 19 | self.feature_pyramid = fpn 20 | self.feature_map_dict = feature_map_dict 21 | self.batch_size = batch_size 22 | self.num_classes = num_classes 23 | self.num_anchors = num_anchors 24 | self.is_training = is_training 25 | self.stddev = 0.01 26 | self.pai = 0.01 27 | 28 | def add_fcn_head(self, inputs, outputs, head_offset): 29 | with slim.arg_scope([slim.conv2d], scope=str(head_offset), activation_fn=tf.nn.relu, 30 | weights_initializer=tf.random_normal_initializer(stddev=self.stddev)): 31 | net = slim.repeat(inputs, 4, slim.conv2d, 256, kernel_size=[3,3]) 32 | if str(head_offset)[-1] == 's': 33 | net = slim.conv2d(net, outputs, 
kernel_size=[3,3], scope=str(head_offset) +'_final', activation_fn=None, 34 | weights_initializer=tf.constant_initializer(0), 35 | biases_initializer=tf.constant_initializer(-(math.log((1-self.pai)/self.pai)))) 36 | else: 37 | net = slim.conv2d(net, outputs, kernel_size=[3,3], activation_fn=None, scope=str(head_offset) + '_final') 38 | 39 | return net 40 | 41 | def forward(self): 42 | loc_predictions = [] 43 | class_predictions = [] 44 | with tf.variable_scope('retina_net'): 45 | # add P6 and P7 as noticed in papar focal loss, page 4, annotation 2 46 | self.feature_pyramid['P6'] = slim.conv2d(self.feature_map_dict['C5'], num_outputs=256, kernel_size=[3, 3], 47 | stride=2, 48 | weights_initializer=tf.random_normal_initializer( 49 | stddev=self.stddev), 50 | activation_fn=None, 51 | scope='build_fpn_P6') 52 | self.feature_pyramid['P7'] = slim.conv2d(inputs=(tf.nn.relu(self.feature_pyramid['P6'])), 53 | num_outputs=256, kernel_size=[3, 3], stride=2, 54 | weights_initializer=tf.random_normal_initializer( 55 | stddev=self.stddev), 56 | activation_fn=None, 57 | scope='build_fpn_P7') 58 | # remove P2 59 | del self.feature_pyramid['P2'] 60 | 61 | for idx, feature_map in self.feature_pyramid.items(): 62 | # print ('idx {} crossponding feature map {}'.format(idx, feature_map.get_shape())) 63 | loc_prediction = self.add_fcn_head(feature_map, self.num_anchors * 4, str(idx) + '_bbox') 64 | class_prediction = self.add_fcn_head(feature_map, self.num_classes * self.num_anchors, 65 | str(idx) + '_class') 66 | 67 | loc_prediction = tf.reshape(loc_prediction, [self.batch_size, -1, 4]) 68 | class_prediction = tf.reshape(class_prediction, [self.batch_size, -1, self.num_classes]) 69 | 70 | loc_predictions.append(loc_prediction) 71 | class_predictions.append(class_prediction) 72 | 73 | return tf.concat(loc_predictions, axis=1), tf.concat(class_predictions, axis=1) -------------------------------------------------------------------------------- /pose_residual_network/README.md: -------------------------------------------------------------------------------- 1 | #PRN 网络理解: 2 | 3 | PRN网络的思想,就是对一个bounding box里,如果有多个相同部位的关键点出现在一个single box里,也就是有多个人,那么就很难判定这么多个关键点到底属于哪一个人。PRN的思想就是,一个single box就是一个人,一个人应该只有一种类型的关键点,前面keypoint Subnet网络得到的关键点位置,如果在这个single box范围内,那么就将这个single box范围内的关键点作为输入放进去PRN网络,经过计算之后,PRN网络对每个channel只输出一个关键点,并且认为这个关键点就是这个single box框起来的人的关键点。 4 | 5 | #PRN网络训练的数据构造 6 | 7 | PRN网络是对单独的一个一个box进行训练的,而不是一张图片。 8 | 9 | 10 | - label: PRN网络的label就是一个对gt_box处理过后的ROI。论文里将box缩放为高宽为56*36,height/width = 1.56的ROI。将此作为网络的输入、输出大小。如果有17个关键点要训练,那么label大小就是[56, 36, 17],每个channel的意义和使用keypoint subnet得到的输出意义一致。首先就是对gt_box里所有关键点进行处理,和keypoint subnet进行训练时一样,对每个关键点出现的位置,在label对应的位置上打上标签1,否则就是0,其实就是一个heatmap,只不过是以box生成的heatmap。 11 | - input: PRN网络的input就是预先设定好的box大小,首先将input全部设为0,然后对于这个box所在的图片上,所有出现过的关键点进行处理。和生成label过程一样,只不过处理的关键点不仅仅是原本属于gt_box的关键点了,而是这个图片上所有的出现在gt_box范围内的关键点,同样使用生成label的方法,生成网络的input。 12 | 13 | 14 | label和input生成之后,均进行高斯处理(sigma小于1显示比较明显),最后得到的结果才是PRN网络的输入和label。 15 | 16 | ### 训练结果: 17 | 18 | - 和官方提供的pytorch版本一致,训练参数一致,训练次数一致,在coco val2017的结果如下: 19 | 20 | ``` 21 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets= 20 ] = 0.886 22 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets= 20 ] = 0.977 23 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets= 20 ] = 0.920 24 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets= 20 ] = 0.874 25 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets= 20 ] = 0.912 26 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | 
maxDets= 20 ] = 0.911 27 | Average Recall (AR) @[ IoU=0.50 | area= all | maxDets= 20 ] = 0.980 28 | Average Recall (AR) @[ IoU=0.75 | area= all | maxDets= 20 ] = 0.933 29 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets= 20 ] = 0.893 30 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets= 20 ] = 0.943 31 | 32 | ``` 33 | 使用官方提供的pytorch版本训练,使用提供的参数和数据,在coco val2017的结果如下: 34 | 35 | ``` 36 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets= 20 ] = 0.888 37 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets= 20 ] = 0.977 38 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets= 20 ] = 0.920 39 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets= 20 ] = 0.876 40 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets= 20 ] = 0.910 41 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 20 ] = 0.913 42 | Average Recall (AR) @[ IoU=0.50 | area= all | maxDets= 20 ] = 0.981 43 | Average Recall (AR) @[ IoU=0.75 | area= all | maxDets= 20 ] = 0.933 44 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets= 20 ] = 0.894 45 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets= 20 ] = 0.943 46 | 47 | ``` 48 | 官方宣称能达到的精度如下: 49 | ``` 50 | Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets= 20 ] = 0.892 51 | Average Precision (AP) @[ IoU=0.50 | area= all | maxDets= 20 ] = 0.978 52 | Average Precision (AP) @[ IoU=0.75 | area= all | maxDets= 20 ] = 0.921 53 | Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets= 20 ] = 0.883 54 | Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets= 20 ] = 0.912 55 | Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 20 ] = 0.917 56 | Average Recall (AR) @[ IoU=0.50 | area= all | maxDets= 20 ] = 0.982 57 | Average Recall (AR) @[ IoU=0.75 | area= all | maxDets= 20 ] = 0.937 58 | Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets= 20 ] = 0.902 59 | Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets= 20 ] = 0.944 60 | 61 | ``` -------------------------------------------------------------------------------- /pose_residual_network/__init__.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: __init__.py 7 | @time: 18-9-29 下午6:56 8 | ''' -------------------------------------------------------------------------------- /pose_residual_network/prn_train.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: prn_train.py.py 7 | @time: 18-9-28 上午9:30 8 | ''' 9 | 10 | import tensorflow as tf 11 | import tensorflow.contrib.slim as slim 12 | import numpy as np 13 | import os, json, cv2, time 14 | import math 15 | 16 | from datetime import datetime 17 | 18 | from src.PRN import PRN 19 | from src.reader import PRN_READER 20 | 21 | import sys 22 | 23 | sys.path.append('../') 24 | 25 | from eval_test import eval 26 | 27 | 28 | FLAGS = tf.flags.FLAGS 29 | 30 | tf.flags.DEFINE_integer('train_nums', 262464, 'total train_data numbers in tfrecord file.') 31 | tf.flags.DEFINE_integer('batch_size', 4, '') 32 | tf.flags.DEFINE_float('learning_rate', 1e-3, '') 33 | tf.flags.DEFINE_integer('height', 56, '') 34 | tf.flags.DEFINE_integer('width', 36, '') 35 | tf.flags.DEFINE_integer('channels', 17, '') 36 | 
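# height=56, width=36, channels=17 match the PRN ROI described in the README above: every
# ground-truth box is rescaled to a 56x36 grid (height/width ~= 1.56) with one channel per
# COCO keypoint.
# A side note on the BCEloss defined below: it takes raw probabilities and applies tf.log
# without clamping, so an output that reaches exactly 0 or 1 yields -inf/NaN. A numerically
# safer variant (an illustrative sketch only, not used by this script, relying on the module's
# existing `import tensorflow as tf`) could clip first:
def stable_bce_loss(labels, probs, eps=1e-8):
    # keep probabilities strictly inside (0, 1) before taking logs
    probs = tf.clip_by_value(probs, eps, 1.0 - eps)
    return tf.reduce_mean(-(labels * tf.log(probs) + (1.0 - labels) * tf.log(1.0 - probs)))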
tf.flags.DEFINE_boolean('is_training', True, '') 37 | tf.flags.DEFINE_string('tfrecord_file', '/media/ulsee/E/pose_residual_net_tfrecord/coco_train2017.tfrecord', '') 38 | tf.flags.DEFINE_string('checkpoint_path', '/media/ulsee/D/PRN', 'path to save training model') 39 | tf.flags.DEFINE_string('finetuning', None, 40 | 'folder of saved model that you wish to continue training or testing(e.g. 20180828-1803/model.ckpt-xxx), default:None') 41 | 42 | def BCEloss(labels, inputs): 43 | return tf.reduce_mean( 44 | -(tf.multiply(labels, tf.log(inputs)) + 45 | tf.multiply((1-labels), tf.log(1-inputs))) 46 | ) 47 | 48 | def prn_train(): 49 | os.environ['CUDA_VISIBLE_DEVICES'] = '0' 50 | 51 | # -------------------define where checkpoint path is-------------------------# 52 | current_time = datetime.now().strftime('%Y%m%d-%H%M') 53 | if FLAGS.finetuning is None: 54 | checkpoints_dir = os.path.join(FLAGS.checkpoint_path, current_time) 55 | if not os.path.exists(checkpoints_dir): 56 | try: 57 | os.makedirs(checkpoints_dir) 58 | except: 59 | pass 60 | else: 61 | checkpoints_dir = os.path.join(FLAGS.checkpoint_path, FLAGS.finetuning) 62 | print('checkpoints_dir == {}'.format(checkpoints_dir)) 63 | 64 | # ------------------------------define Graph --------------------------------# 65 | tf.reset_default_graph() 66 | graph = tf.Graph() 67 | with graph.as_default(): 68 | # -----------------------------reader------------------------------------# 69 | reader = PRN_READER(batch_size=FLAGS.batch_size, height=FLAGS.height, width=FLAGS.width, 70 | channels=FLAGS.channels, 71 | tfrecord_file=FLAGS.tfrecord_file) 72 | inputs, label = reader.feed() 73 | # print (inputs.get_shape()) 74 | # print (label.get_shape()) 75 | # ----------------------------PRN Model----------------------------------# 76 | prn_inputs = tf.placeholder(tf.float32, shape=(FLAGS.batch_size, FLAGS.height, FLAGS.width, FLAGS.channels)) 77 | prn_label = tf.placeholder(tf.float32, shape=(FLAGS.batch_size, FLAGS.height, FLAGS.width, FLAGS.channels)) 78 | model = PRN(inputs=prn_inputs, output_node=FLAGS.height * FLAGS.width * FLAGS.channels, 79 | is_training=FLAGS.is_training) 80 | out = model.forward() 81 | # ------------------------------Saver------------------------------------# 82 | saver = tf.train.Saver(max_to_keep=10) 83 | # ------------------------------Loss-------------------------------------# 84 | # loss = tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(labels=label, logits=out)) / tf.to_float(FLAGS.batch_size) 85 | loss = BCEloss(labels=prn_label, inputs=out) 86 | # print (loss.get_shape()) 87 | # ---------------------------lr and gradient-----------------------------# 88 | global_step = tf.Variable(0) 89 | # learning_rate = tf.to_float(FLAGS.learning_rate) 90 | values = [FLAGS.learning_rate * math.pow(0.9, (epoch - 1) // 2) for epoch in range(1, 33, 2)] 91 | boundaries = [FLAGS.train_nums // FLAGS.batch_size * epoch for epoch in range(3, 33, 2)] 92 | 93 | # values = [0.01, 0.02, 0.03] 94 | # boundaries = [200, 500] 95 | learning_rate = tf.train.piecewise_constant(global_step, boundaries, values) 96 | opt = tf.train.AdamOptimizer(learning_rate) 97 | 98 | update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) 99 | with tf.control_dependencies(update_ops): 100 | train_op = opt.minimize(loss, global_step=global_step) 101 | # -----------------------------tf summary--------------------------------# 102 | # gt_label = tf.placeholder(tf.float32, shape=(FLAGS.batch_size, FLAGS.height, FLAGS.width, FLAGS.channels)) 103 | # pred_label = 
tf.placeholder(tf.float32, shape=(FLAGS.batch_size, FLAGS.height, FLAGS.width, FLAGS.channels)) 104 | tf.summary.scalar('lr', learning_rate) 105 | tf.summary.scalar('loss', loss) 106 | # tf.summary.image('label', tf.reshape(tf.transpose( 107 | # prn_label, [3, 0, 1, 2])[6], shape=(-1, FLAGS.height, FLAGS.width, 1)), max_outputs=4) 108 | # tf.summary.image('pred', tf.reshape(tf.transpose( 109 | # out, [3, 0, 1, 2])[6], shape=(-1, FLAGS.height, FLAGS.width, 1)), max_outputs=4) 110 | tf.summary.image('label', tf.reduce_sum(prn_label, axis=3, keep_dims=True), max_outputs=4) 111 | tf.summary.image('preds', tf.reduce_sum(out, axis=3, keep_dims=True), max_outputs=4) 112 | summary_op = tf.summary.merge_all() 113 | summary_writer = tf.summary.FileWriter(checkpoints_dir, graph) 114 | # --------------------------------init------------------------------------# 115 | init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer()) 116 | config = tf.ConfigProto() 117 | config.gpu_options.allow_growth = True 118 | # --------------------------------train------------------------------------# 119 | with tf.Session(graph=graph, config=config) as sess: 120 | sess.run(init_op) 121 | coord = tf.train.Coordinator() 122 | threads = tf.train.start_queue_runners(sess=sess, coord=coord) 123 | step = 0 124 | s_time = time.time() 125 | try: 126 | while not coord.should_stop(): 127 | net_x, y = sess.run([inputs, label]) 128 | 129 | _, net_loss, lr, merge_op = sess.run( 130 | [train_op, loss, learning_rate, summary_op], 131 | feed_dict={prn_label: y, prn_inputs: net_x} 132 | ) 133 | 134 | summary_writer.add_summary(merge_op, step) 135 | summary_writer.flush() 136 | 137 | if (step + 1) % (FLAGS.train_nums // FLAGS.batch_size) == 0: 138 | save_path = saver.save(sess, checkpoints_dir + '/model.ckpt', global_step=step) 139 | print('Model saved in {}'.format(save_path)) 140 | # eval(checkpoint=save_path) 141 | if (step + 1) % 200 == 0: 142 | cur_time = time.time() 143 | print('step {}: loss = {:.6f}, lr = {:.6f},time spend = {:.6f}'.format(step, net_loss, lr, 144 | cur_time - s_time)) 145 | s_time = cur_time 146 | 147 | step += 1 148 | # break 149 | 150 | except KeyboardInterrupt: 151 | print('Interrupted') 152 | coord.request_stop() 153 | except Exception as e: 154 | coord.request_stop(e) 155 | except tf.errors.OutOfRangeError: 156 | coord.request_stop() 157 | finally: 158 | save_path = saver.save(sess, checkpoints_dir + '/model.ckpt', global_step=step) 159 | print('Model saved in {}'.format(save_path)) 160 | coord.request_stop() 161 | coord.join(threads) 162 | 163 | if __name__ == '__main__': 164 | prn_train() 165 | 166 | 167 | 168 | 169 | 170 | -------------------------------------------------------------------------------- /pose_residual_network/src/PRN.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: PRN.py 7 | @time: 18-9-27 下午2:36 8 | ''' 9 | from __future__ import absolute_import, division, print_function 10 | 11 | import tensorflow as tf 12 | from tensorflow.contrib.slim import nets 13 | from tensorflow.contrib.layers.python.layers import utils 14 | import tensorflow.contrib.slim as slim 15 | 16 | import numpy as np 17 | import os 18 | 19 | class PRN(object): 20 | def __init__(self, inputs, output_node, is_training=True, hidden_node = 1024): 21 | self.x = inputs 22 | self.output_node = output_node 23 | self.hidden_node = hidden_node 24 | 
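# inputs is the (batch, 56, 36, 17) candidate-keypoint heatmap for one person box, output_node
# is 56*36*17, and hidden_node is the width of the two hidden fully-connected layers (1024 by
# default). forward() below flattens the input, applies two ReLU FC + dropout blocks, a third
# ReLU FC back to output_node, adds the flattened input as a residual connection, normalizes
# the result with a softmax, and reshapes it back to the input shape.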
self.is_training = is_training 25 | 26 | 27 | 28 | def forward(self): 29 | with tf.variable_scope('pose-residual-network'): 30 | flatten = slim.flatten(inputs=self.x) 31 | fc1 = slim.fully_connected(inputs=flatten, num_outputs=self.hidden_node, activation_fn=tf.nn.relu) 32 | dropout1 = slim.dropout(inputs=fc1, is_training=self.is_training) 33 | fc2 = slim.fully_connected(inputs=dropout1, num_outputs=self.hidden_node, activation_fn=tf.nn.relu) 34 | dropout2 = slim.dropout(inputs=fc2, is_training=self.is_training) 35 | fc3 = slim.fully_connected(inputs=dropout2, num_outputs=self.output_node, activation_fn=tf.nn.relu) 36 | # out = tf.nn.relu(dropout2) 37 | out = tf.add(flatten, fc3) 38 | out = tf.nn.softmax(out) 39 | out = tf.reshape(out, shape=self.x.get_shape()) 40 | 41 | return out 42 | -------------------------------------------------------------------------------- /pose_residual_network/src/__init__.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: __init__.py 7 | @time: 18-9-28 上午9:31 8 | ''' -------------------------------------------------------------------------------- /pose_residual_network/src/convert_tfrecord.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: convert_tfrecord.py 7 | @time: 18-9-27 下午3:15 8 | ''' 9 | import tensorflow as tf 10 | import cv2, os, json 11 | import numpy as np 12 | import matplotlib.pyplot as plt 13 | from skimage.filters import gaussian 14 | 15 | 16 | FLAGS = tf.flags.FLAGS 17 | tf.flags.DEFINE_string('json_file', '/media/ulsee/E/pose_residual_net_tfrecord/cocotrain2017_convert_ai.json', 18 | '') 19 | tf.flags.DEFINE_string('tfrecord_file', '/media/ulsee/E/pose_residual_net_tfrecord/coco_train2017.tfrecord', 20 | 'tfrecord file') 21 | tf.flags.DEFINE_integer('height', 56, 'prn net input height') 22 | tf.flags.DEFINE_integer('width', 36, 'prn net input width') 23 | tf.flags.DEFINE_integer('channels', 17, 'number of keypoints') 24 | 25 | def _int64_feature(value): 26 | ''' Wrapper for inserting int64 feature into Example proto''' 27 | if not isinstance(value, list): 28 | value = [value] 29 | return tf.train.Feature(int64_list = tf.train.Int64List(value=value)) 30 | 31 | def _float_feature(value): 32 | ''' Wrapper for inserting float feature into Example proto''' 33 | if not isinstance(value, list): 34 | value = [value] 35 | return tf.train.Feature(float_list=tf.train.FloatList(value=value)) 36 | 37 | def _bytes_feature(value): 38 | ''' Wrapper for inserting bytes feature into Example proto''' 39 | if not isinstance(value, list): 40 | value = [value] 41 | return tf.train.Feature(bytes_list = tf.train.BytesList(value=value)) 42 | 43 | def _string_feature(value): 44 | ''' Wrapper for inserting string (actually bytes) feature into Example proto''' 45 | if not isinstance(value, list): 46 | value = [value] 47 | return tf.train.Feature(bytes_list = tf.train.BytesList(value=value)) 48 | 49 | def convert_to_tfrecord(json_file, tfrecord_file): 50 | 51 | f = open(json_file, encoding='utf-8') 52 | labels = json.load(f) 53 | 54 | if isinstance(labels, dict): 55 | pass 56 | elif isinstance(labels, list): 57 | convert_ai_challenger(labels, tfrecord_file) 58 | else: 59 | raise ValueError('Json file format is wrong!!!') 60 | 61 | 62 | def 
convert_ai_challenger(labels, tfrecord_file): 63 | 64 | tfrecord_dir = os.path.dirname(tfrecord_file) 65 | if not os.path.exists(tfrecord_dir): 66 | os.makedirs(tfrecord_dir) 67 | 68 | writer = tf.python_io.TFRecordWriter(tfrecord_file) 69 | total_imgs = len(labels) 70 | deal_imgs = 0 71 | useless = 0 72 | for label in labels: 73 | # print (label['image_id']) 74 | 75 | kp_anno = label['keypoint_annotations'] 76 | human_anno = label['human_annotations'] 77 | humans = kp_anno.keys() 78 | all_keypoints = [kp for kp in kp_anno.values()] 79 | 80 | for human in humans: 81 | kp = kp_anno[human] 82 | kpv = kp[2::3] 83 | if np.sum(kpv>0) < 4: 84 | useless += 1 85 | continue 86 | box = human_anno[human] 87 | box[2] = box[2] - box[0] 88 | box[3] = box[3] - box[1] 89 | 90 | if box[2] == 0 or box[3] == 0: 91 | continue 92 | 93 | tf_label = get_label_for_single_box(kp, box) 94 | tf_inputs = get_input_for_single_box(all_keypoints, box) 95 | 96 | # 97 | # img1 = np.sum(tf_label, axis=2, keepdims=True) 98 | # cv2.imwrite('label.jpg', img1*255) 99 | # img2 = np.sum(tf_inputs, axis=2, keepdims=True) 100 | # cv2.imwrite('input.jpg', img2*255) 101 | # 102 | 103 | example = tf.train.Example(features=tf.train.Features( 104 | feature = { 105 | 'input':_float_feature(list(np.reshape(np.asarray(tf_inputs, dtype=np.float32), (-1, )))), 106 | 'label':_float_feature(list(np.reshape(np.asarray(tf_label, dtype=np.float32), (-1, )))) 107 | } 108 | )) 109 | 110 | writer.write(example.SerializeToString()) 111 | deal_imgs += 1 112 | 113 | # if deal_imgs == 2: 114 | # break 115 | 116 | if deal_imgs % 1000 == 0: 117 | print ('Processing {}/{}'.format(deal_imgs, total_imgs)) 118 | print ('Useless boxs {}'.format(useless)) 119 | 120 | writer.close() 121 | print ('Converting tf record done.') 122 | 123 | 124 | def get_label_for_single_box(keypoints, bbox): 125 | label = np.zeros((FLAGS.height, FLAGS.width, FLAGS.channels)) 126 | 127 | x = int(bbox[0]) 128 | y = int(bbox[1]) 129 | w = float(bbox[2]) 130 | h = float(bbox[3]) 131 | 132 | x_scale = float(FLAGS.width) / w 133 | y_scale = float(FLAGS.height) / h 134 | 135 | kpx = keypoints[0::3] 136 | kpy = keypoints[1::3] 137 | kpv = keypoints[2::3] 138 | 139 | for j in range(FLAGS.channels): 140 | if kpv[j] != 3 and kpv[j] != 0: 141 | x0 = int((kpx[j] - x) * x_scale) 142 | y0 = int((kpy[j] - y) * y_scale) 143 | 144 | if x0 >= FLAGS.width and y0 >= FLAGS.height: 145 | label[FLAGS.height-1, FLAGS.width-1, j] = 1 146 | elif x0 >= FLAGS.width: 147 | try: 148 | label[y0, FLAGS.width-1, j] = 1 149 | except: 150 | label[0, FLAGS.width-1, j] = 1 151 | elif y0 >= FLAGS.height: 152 | try: 153 | label[FLAGS.height-1, x0, j] = 1 154 | except: 155 | label[FLAGS.height-1, 0, j] = 1 156 | elif x0 < 0 and y0 < 0: 157 | label[0, 0, j] = 1 158 | elif x0 < 0: 159 | label[y0, 0, j] = 1 160 | elif y0 < 0: 161 | label[0, x0, j] = 1 162 | else: 163 | label[y0, x0, j] = 1 164 | 165 | # for c in range(FLAGS.channels): 166 | # label[:, :, c] = gaussian(label[:, :, c],sigma=0.5) 167 | label = gaussian(label, sigma=2, mode='constant', multichannel=True) 168 | return label 169 | 170 | def get_input_for_single_box(keypoints, bbox): 171 | inputs = np.zeros((FLAGS.height, FLAGS.width, FLAGS.channels)) 172 | threshold = 0.21 173 | 174 | x = int(bbox[0]) 175 | y = int(bbox[1]) 176 | w = float(bbox[2]) 177 | h = float(bbox[3]) 178 | 179 | 180 | x_scale = float(FLAGS.width) / w 181 | y_scale = float(FLAGS.height) / h 182 | 183 | for ann in keypoints: 184 | kpx = ann[0::3] 185 | kpy = ann[1::3] 186 | kpv = ann[2::3] 
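# For every annotated person on the image (not just the owner of this box), the loop below keeps
# each keypoint whose visibility flag is neither 0 nor 3 and whose (x, y) falls inside the box
# expanded by `threshold` (21%) of its width/height on each side. The point is then mapped onto
# the 56x36 grid with the same scaling used for the label, i.e. column = int((kpx - x) * 36 / w)
# and row = int((kpy - y) * 56 / h), and out-of-range points are clamped to the nearest border
# cell. Each channel is finally blurred with a Gaussian, similar to the label construction above.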
187 | 188 | 189 | for j in range(FLAGS.channels): 190 | if kpv[j] != 3 and kpv[j] != 0: 191 | if kpx[j] > bbox[0] - bbox[2] * threshold and kpx[j] < bbox[0] + bbox[2] * (1 + threshold): 192 | if kpy[j] > bbox[1] - bbox[3] * threshold and kpy[j] < bbox[1] + bbox[3] * (1 + threshold): 193 | 194 | x0 = int((kpx[j] - x) * x_scale) 195 | y0 = int((kpy[j] - y) * y_scale) 196 | 197 | if x0 >= FLAGS.width and y0 >= FLAGS.height: 198 | inputs[FLAGS.height - 1, FLAGS.width - 1, j] = 1 199 | elif x0 >= FLAGS.width: 200 | try: 201 | inputs[y0, FLAGS.width - 1, j] = 1 202 | except: 203 | inputs[0, FLAGS.width - 1, j] = 1 204 | elif y0 >= FLAGS.height: 205 | try: 206 | inputs[FLAGS.height - 1, x0, j] = 1 207 | except: 208 | inputs[FLAGS.height - 1, 0, j] = 1 209 | elif x0 < 0 and y0 < 0: 210 | inputs[0, 0, j] = 1 211 | elif x0 < 0: 212 | inputs[y0, 0, j] = 1 213 | elif y0 < 0: 214 | inputs[0, x0, j] = 1 215 | else: 216 | inputs[y0, x0, j] = 1 217 | 218 | 219 | for c in range(FLAGS.channels): 220 | inputs[:, :, c] = gaussian(inputs[:, :, c]) 221 | return inputs 222 | 223 | 224 | if __name__ == '__main__': 225 | convert_to_tfrecord(FLAGS.json_file, FLAGS.tfrecord_file) 226 | 227 | 228 | 229 | -------------------------------------------------------------------------------- /pose_residual_network/src/reader.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: reader.py 7 | @time: 18-9-27 下午5:05 8 | ''' 9 | 10 | import tensorflow as tf 11 | import numpy as np 12 | import cv2 13 | import os 14 | 15 | class PRN_READER(object): 16 | def __init__(self, batch_size, height, width, channels, tfrecord_file): 17 | self.batch_size = batch_size 18 | self.height = height 19 | self.width = width 20 | self.channles = channels 21 | self.reader = tf.TFRecordReader() 22 | self.tfrecord_file = tfrecord_file 23 | 24 | def feed(self): 25 | 26 | filename_queue = tf.train.string_input_producer([self.tfrecord_file], num_epochs=16) 27 | reader = self.reader 28 | _, serialized_example = reader.read(filename_queue) 29 | 30 | features = tf.parse_single_example( 31 | serialized_example, 32 | features={ 33 | 'input': tf.VarLenFeature(dtype=tf.float32), 34 | 'label': tf.VarLenFeature(dtype=tf.float32) 35 | } 36 | ) 37 | 38 | inputs = features['input'].values 39 | label = features['label'].values 40 | 41 | inputs = tf.reshape(inputs, shape=(self.height, self.width, self.channles)) 42 | label = tf.reshape(label, shape=(self.height, self.width, self.channles)) 43 | 44 | batch_input, batch_label = tf.train.shuffle_batch( 45 | [inputs, label], 46 | batch_size=self.batch_size, 47 | num_threads=4, 48 | capacity=1000, 49 | min_after_dequeue=100 50 | ) 51 | 52 | return batch_input, batch_label 53 | 54 | 55 | def reader_test(): 56 | os.environ['CUDA_VISIBLE_DEVICES'] = '0' 57 | reader = PRN_READER(batch_size=1, height=56, width=36, channels=17, 58 | tfrecord_file='/raid5/hswData/pose_residual_net_tfrecord/coco_train2017_6.tfrecord') 59 | net_x, label = reader.feed() 60 | # net_x = tf.reduce_sum(net_x, axis=3, keepdims=True) 61 | # label = tf.reduce_sum(label, axis=3, keepdims=True) 62 | 63 | with tf.Session() as sess: 64 | sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()]) 65 | coord = tf.train.Coordinator() 66 | threads = tf.train.start_queue_runners(coord=coord) 67 | 68 | step = 0 69 | try: 70 | while not coord.should_stop(): 71 | _1, _2 = sess.run([net_x, label]) 
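                # each sess.run() above pulls one shuffled batch from the tfrecord queue:
                # _1 is a (1, 56, 36, 17) input tensor and _2 the matching (1, 56, 36, 17) label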
72 | step += 1 73 | except tf.errors.OutOfRangeError: 74 | print('done. total step == ', step) 75 | finally: 76 | 77 | print ('batch = 1, epochs = 1, total step == ', step) 78 | coord.request_stop() 79 | coord.join(threads) 80 | 81 | 82 | if __name__ == '__main__': 83 | reader_test() -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: __init__.py 7 | @time: 18-9-28 下午2:19 8 | ''' -------------------------------------------------------------------------------- /utils/backbone.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: backbone.py 7 | @time: 18-9-28 上午11:03 8 | ''' 9 | 10 | from __future__ import absolute_import, division, print_function 11 | 12 | import tensorflow as tf 13 | 14 | from tensorflow.contrib.slim import nets 15 | from tensorflow.contrib.layers.python.layers import utils 16 | import tensorflow.contrib.slim as slim 17 | 18 | class BackBone(object): 19 | def __init__(self, img_size, batch_size, is_training=True): 20 | self.img_size = img_size 21 | self.batch_size = batch_size 22 | self.input_imgs = tf.placeholder(tf.float32, [self.batch_size, self.img_size, self.img_size, 3]) 23 | self.is_training = is_training 24 | self.stddev = 0.01 25 | 26 | def get_feature_map(self): 27 | #-------------------resent---------------------# 28 | arg_scope = nets.resnet_v2.resnet_arg_scope() 29 | with slim. arg_scope(arg_scope): 30 | out, end_points = nets.resnet_v2.resnet_v2_50(inputs=self.input_imgs, num_classes=None, is_training=self.is_training) 31 | #---------------feature map dict---------------# 32 | feature_map_dict = { 33 | 'C2': end_points['resnet_v2_50/block1/unit_2/bottleneck_v2'], # input_size / 4 34 | 'C3': end_points['resnet_v2_50/block2/unit_3/bottleneck_v2'], # input_size / 8 35 | 'C4': end_points['resnet_v2_50/block3/unit_4/bottleneck_v2'], # input_size / 16 36 | 'C5': end_points['resnet_v2_50/block4'] # input_size / 32 37 | } 38 | return feature_map_dict 39 | 40 | def build_fpn_feature(self): 41 | feature_pyramid = {} 42 | feature_map_dict = self.get_feature_map() 43 | #------------------------------------------build fpn-------------------------------------------# 44 | with tf.variable_scope('build_fpn_feature'): 45 | with slim.arg_scope([slim.conv2d], weights_initializer=tf.random_normal_initializer(stddev=self.stddev)): 46 | feature_pyramid['P5'] = slim.conv2d(feature_map_dict['C5'], num_outputs=256, kernel_size=[1, 1], stride=1, 47 | scope='build_fpn_P5') 48 | 49 | #------------------ top-down pathway and lateral connections--------------------------# 50 | for layer in range(4, 1, -1): 51 | p = feature_pyramid['P' + str(layer + 1)] 52 | c = feature_map_dict['C' + str(layer)] 53 | 54 | #---------------------------------- upsample p -----------------------------------# 55 | up_shape = c.get_shape() 56 | up_sample = tf.image.resize_nearest_neighbor(p, [up_shape[2], up_shape[2]], 57 | name='upsampling_fpn_P%d' % layer) 58 | 59 | #----------------------------------- 1x1 conv ------------------------------------# 60 | c = slim.conv2d(c, num_outputs=256, kernel_size=[1, 1], stride=1, scope='fpn_1x1conv_C%d' % layer) 61 | p = up_sample + c 62 | 63 | 
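                    # note: the nearest-neighbour resize above uses up_shape[2] for both target dimensions,
                    # so the upsampled P(layer+1) matches C(layer) only because the input image (and hence
                    # every feature map) is square; the element-wise addition is the FPN lateral connection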
#----------------------reduce aliasing effect of upsampling ----------------------# 64 | #---------------(in the third last paragraph, Section 3, Paper FPN)---------------# 65 | p = slim.conv2d(p, num_outputs=256, kernel_size=[3, 3], stride=1, padding='SAME', 66 | scope='build_fpn_P%d' % layer) 67 | 68 | feature_pyramid['P' + str(layer)] = p 69 | 70 | return feature_pyramid 71 | 72 | -------------------------------------------------------------------------------- /utils/coco_convert_ai_json.json: -------------------------------------------------------------------------------- 1 | [{"image_id": "000000397133", "keypoint_annotations": {"human0": [433, 94, 2, 434, 90, 2, 0, 0, 0, 443, 98, 2, 0, 0, 0, 420, 128, 2, 474, 133, 2, 396, 162, 2, 489, 173, 2, 0, 0, 0, 0, 0, 0, 419, 214, 2, 458, 215, 2, 411, 274, 2, 458, 273, 2, 402, 333, 2, 465, 334, 2], "human1": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 277, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}, "human_annotations": {"human0": [388.66, 69.92, 498.07000000000005, 347.54], "human1": [0, 262.81, 62.16, 299.58]}}, {"image_id": "000000252219", "keypoint_annotations": {"human0": [356, 198, 2, 358, 193, 2, 351, 194, 2, 364, 192, 2, 346, 194, 2, 375, 207, 2, 341, 211, 2, 388, 236, 2, 336, 238, 2, 392, 263, 2, 343, 242, 2, 373, 271, 2, 347, 272, 2, 372, 316, 2, 348, 318, 2, 372, 353, 2, 355, 354, 2], "human1": [100, 190, 2, 0, 0, 0, 96, 185, 2, 0, 0, 0, 86, 188, 2, 84, 208, 2, 71, 208, 2, 84, 245, 2, 59, 240, 2, 115, 263, 2, 66, 271, 2, 64, 268, 2, 71, 264, 2, 59, 324, 2, 99, 322, 2, 18, 363, 2, 101, 377, 2], "human2": [536, 192, 1, 538, 188, 2, 0, 0, 0, 552, 190, 2, 0, 0, 0, 568, 207, 2, 555, 208, 2, 559, 243, 2, 554, 246, 2, 542, 270, 2, 550, 277, 2, 573, 274, 2, 559, 274, 2, 589, 323, 2, 541, 322, 2, 617, 365, 2, 530, 361, 2]}, "human_annotations": {"human0": [326.28, 174.56, 397.52, 371.81], "human1": [9.79, 167.06, 131.73, 393.51], "human2": [510.44, 171.27, 634.1, 387.03]}}, {"image_id": "000000087038", "keypoint_annotations": {"human0": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human1": [0, 0, 0, 0, 0, 0, 0, 0, 0, 271, 233, 2, 0, 0, 0, 271, 239, 2, 287, 239, 2, 266, 257, 2, 0, 0, 0, 261, 268, 2, 0, 0, 0, 285, 261, 2, 298, 260, 2, 282, 285, 2, 284, 278, 2, 286, 311, 2, 291, 298, 2], "human2": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human3": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human4": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human5": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human6": [363, 226, 1, 363, 222, 2, 0, 0, 0, 369, 218, 2, 0, 0, 0, 380, 227, 2, 383, 213, 2, 400, 235, 2, 397, 199, 2, 390, 252, 2, 409, 183, 2, 399, 266, 2, 390, 259, 2, 384, 305, 2, 364, 270, 2, 376, 338, 2, 364, 290, 2], "human7": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human8": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human9": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human10": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human11": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human12": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human13": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}, "human_annotations": {"human0": [226.04, 229.31, 237.63, 259.72], "human1": [257.85, 224.48, 301.98, 321.48], "human2": [68.18, 238.19, 84.36000000000001, 281.07], "human3": [79.16, 232.26, 107.38, 283.38], "human4": [98.4, 234.28, 117.92, 280.74], "human5": [326.86, 223.46, 339.97, 262.13], "human6": [345.41, 173.41, 418.35, 358.82], "human7": [239.72, 225.38, 250.36, 258.44], "human8": [167.02, 234, 182.8, 271.46], "human9": [209.68, 231.08, 218.83, 265.61], "human10": [408.29, 231.25, 425.41, 266.22], "human11": [204.14, 229.02, 211.47, 263.98], "human12": [195.32, 228.06, 205.97, 265.24], "human13": [1, 190, 639, 291]}}, {"image_id": "000000480985", "keypoint_annotations": {"human0": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human1": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human2": [0, 0, 0, 0, 0, 0, 0, 0, 0, 326, 281, 2, 0, 0, 0, 338, 292, 2, 330, 292, 2, 340, 306, 2, 0, 0, 0, 334, 317, 2, 0, 0, 0, 337, 326, 2, 332, 325, 2, 338, 350, 2, 0, 0, 0, 340, 368, 2, 0, 0, 0], "human3": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human4": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 281, 299, 2, 0, 0, 0, 282, 320, 2, 0, 0, 0, 274, 329, 2, 0, 0, 0, 278, 327, 2, 270, 327, 1, 280, 355, 2, 273, 355, 2, 282, 373, 2, 273, 374, 2], "human5": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human6": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human7": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}, "human_annotations": {"human0": [47.19, 296.12, 75.49, 329.29], "human1": [32.75, 298.94, 49.269999999999996, 328.15999999999997], "human2": [320.16, 275.05, 347.22, 379.58000000000004], "human3": [10.05, 302.96, 23.75, 328.65], "human4": [266.37, 293.13, 290.34000000000003, 382.09], "human5": [369.5, 278.52, 375.0, 324.16999999999996], "human6": [290.03, 299.79, 305.27, 319.66], "human7": [302.2, 298.22, 314.93, 316.95000000000005]}}, {"image_id": "000000296649", "keypoint_annotations": {"human0": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 344, 324, 2, 331, 323, 2, 364, 337, 2, 346, 347, 2, 386, 350, 2, 370, 362, 1, 339, 367, 2, 329, 370, 2, 0, 0, 0, 361, 382, 2, 0, 0, 0, 347, 410, 1], "human1": [307, 308, 2, 0, 0, 0, 305, 305, 2, 0, 0, 0, 296, 306, 2, 299, 320, 2, 285, 326, 2, 0, 0, 0, 294, 355, 2, 0, 0, 0, 311, 340, 2, 0, 0, 0, 284, 368, 2, 0, 0, 0, 317, 372, 2, 0, 0, 0, 312, 408, 2], "human2": [46, 291, 2, 48, 287, 2, 43, 288, 2, 0, 0, 0, 0, 0, 0, 44, 307, 2, 15, 306, 2, 62, 322, 2, 28, 336, 2, 89, 339, 2, 63, 347, 2, 44, 354, 2, 15, 355, 2, 73, 362, 1, 60, 376, 2, 0, 0, 0, 52, 421, 2], "human3": [492, 299, 2, 0, 0, 0, 489, 295, 2, 0, 0, 0, 478, 297, 2, 458, 315, 2, 477, 325, 2, 0, 0, 0, 491, 362, 2, 486, 328, 2, 0, 0, 0, 448, 391, 2, 464, 391, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human4": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human5": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human6": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human7": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human8": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human9": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human10": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human11": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 541, 301, 2, 507, 317, 2, 534, 325, 2, 0, 0, 0, 562, 348, 2, 0, 0, 0, 593, 362, 1, 494, 383, 1, 521, 388, 1, 0, 0, 0, 562, 400, 1, 0, 0, 0, 0, 0, 0], "human12": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}, "human_annotations": {"human0": [322.57, 290.81, 387.65999999999997, 418.43], "human1": [273.64, 292.11, 324.59, 421.76], "human2": [1.92, 266.88, 116.37, 422.66999999999996], "human3": [424.12, 270.59, 531.59, 400.13], "human4": [259.27, 281, 285.28999999999996, 316.1], "human5": [281.06, 276.47, 296.57, 317.79], "human6": [104.73, 267.55, 123.54, 296.61], "human7": [120.86, 271.12, 137.14, 296.52], "human8": [257.14, 281.32, 269.94, 323.34], "human9": [269, 274.45, 277.89, 292.23], "human10": [556.28, 309.36, 588.37, 355.48], "human11": [494.93, 276.54, 587.01, 402.61], "human12": [300, 280, 325, 334]}}, {"image_id": "000000386912", "keypoint_annotations": {"human0": [305, 195, 2, 317, 181, 2, 296, 184, 2, 335, 187, 2, 0, 0, 0, 367, 252, 2, 271, 248, 2, 377, 354, 2, 250, 324, 2, 307, 381, 2, 232, 354, 2, 343, 402, 1, 271, 395, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}, "human_annotations": {"human0": [210.27, 143.29, 430.09000000000003, 419.43999999999994]}}, {"image_id": "000000348881", "keypoint_annotations": {"human0": [592, 279, 2, 0, 0, 0, 592, 278, 2, 0, 0, 0, 589, 278, 2, 588, 285, 2, 580, 285, 2, 0, 0, 0, 580, 301, 2, 0, 0, 0, 588, 304, 2, 582, 311, 2, 577, 310, 2, 579, 325, 2, 578, 324, 2, 576, 338, 2, 572, 341, 2], "human1": 
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}, "human_annotations": {"human0": [567.82, 273.1, 599.2, 347.21000000000004], "human1": [251.19, 106.42, 274.51, 168.14]}}, {"image_id": "000000522713", "keypoint_annotations": {"human0": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human1": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}, "human_annotations": {"human0": [306.78, 277.85, 317.52, 288.24], "human1": [490.68, 269.32, 494.16, 272.73]}}, {"image_id": "000000181666", "keypoint_annotations": {"human0": [306, 181, 2, 0, 0, 0, 303, 178, 2, 0, 0, 0, 297, 180, 2, 312, 193, 2, 288, 192, 2, 320, 215, 2, 277, 210, 2, 312, 219, 2, 278, 223, 2, 308, 231, 2, 291, 231, 2, 309, 254, 2, 290, 254, 2, 309, 280, 1, 290, 279, 1], "human1": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human2": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}, "human_annotations": {"human0": [272.5, 165.89, 324.88, 269.82], "human1": [51.39, 189.47, 75.47, 215.74], "human2": [0, 182.71, 15.86, 261.06]}}, {"image_id": "000000017627", "keypoint_annotations": {"human0": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human1": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human2": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "human3": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}, "human_annotations": {"human0": [150.04, 224, 169.04999999999998, 295.72], "human1": [259.18, 228.19, 273.3, 249.25], "human2": [172.88, 235.27, 182.88, 265.27], "human3": [187.03, 226.28, 203.04, 248.77]}}] -------------------------------------------------------------------------------- /utils/coco_json_convert.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | ''' 3 | @author: shiwei hou 4 | @contact: murdockhou@gmail.com 5 | @software: PyCharm 6 | @file: coco_json_convert.py 7 | @time: 18-9-27 下午6:02 8 | 9 | try to convert coco annotation json file into as like ai_challenger format 10 | [ 11 | { 12 | "image_id": "a0f6bdc065a602b7b84a67fb8d14ce403d902e0d", 13 | "human_annotations": 14 | { 15 | "human1": [178,250,290,522], 16 | "human2": [293,274,352,473], 17 | "human3": [315,236,389,495], 18 | ...}, 19 | "keypoint_annotations": 20 | { 21 | "human1": [261, 294, 1, 281, 328, 1, 259, 314, 2, 22 | 213, 295, 1, 208, 346, 1, 192, 335, 1, 23 | 245, 375, 1, 255, 432, 1, 244, 494, 1, 24 | 221, 379, 1, 219, 442, 1, 226, 491, 1, 25 | 226, 256, 1, 231, 284, 1], 26 | "human2": [313, 301, 1, 305, 337, 1, 321, 345, 1, 27 | 331, 316, 2, 331, 335, 2, 344, 343, 2, 28 | 313, 359, 1, 320, 409, 1, 311, 454, 1, 29 | 327, 356, 2, 330, 409, 1, 324, 
446, 1, 30 | 337, 284, 1, 327, 302, 1], 31 | "human3": [373, 304, 1, 346, 286, 1, 332, 263, 1, 32 | 363, 308, 2, 342, 327, 2, 345, 313, 1, 33 | 370, 385, 2, 368, 423, 2, 370, 466, 2, 34 | 363, 386, 1, 361, 424, 1, 361, 475, 1, 35 | 365, 273, 1, 369, 297, 1], 36 | ...} 37 | }, 38 | ... 39 | ] 40 | ''' 41 | 42 | import json 43 | import numpy as np 44 | 45 | coco_json_file = '/media/ulsee/E/datasets/coco/annotations2017/person_keypoints_val2017.json' 46 | 47 | f = open(coco_json_file, encoding='utf-8') 48 | labels = json.load(f) 49 | units = [] 50 | 51 | img_info = labels['images'] 52 | anno_info = labels['annotations'] 53 | 54 | print ('Start converting json file.....') 55 | ll = len(img_info) 56 | count = 0 57 | 58 | for img in img_info: 59 | unit = {} 60 | img_name = img['file_name'].split('.')[0] 61 | img_id = img['id'] 62 | height = img['height'] 63 | width = img['width'] 64 | 65 | keypoint_anno = {} 66 | human_anno = {} 67 | human_count = 0 68 | 69 | for anno in anno_info: 70 | bbox = anno['bbox'] 71 | anno_img_id = anno['image_id'] 72 | keypoints = anno['keypoints'] 73 | category_id = anno['category_id'] 74 | 75 | if anno_img_id == img_id: 76 | bbox[2] = bbox[0] + bbox[2] 77 | bbox[3] = bbox[1] + bbox[3] 78 | keypoint_anno['human'+str(human_count)] = keypoints 79 | human_anno['human'+str(human_count)] = bbox 80 | human_count += 1 81 | if human_count == 0: 82 | keypoint_anno['human0'] = [0 for i in range(17*3)] 83 | human_anno['human0'] = [0 for i in range(4)] 84 | unit['image_id'] = img_name 85 | unit['keypoint_annotations'] = keypoint_anno 86 | unit['human_annotations'] = human_anno 87 | unit['id'] = img_id 88 | 89 | units.append(unit) 90 | 91 | count += 1 92 | 93 | # if count == 10: 94 | # break 95 | 96 | if count % 100 == 0: 97 | print ('Processing {}/{}'.format(count, ll)) 98 | 99 | with open('/media/ulsee/E//coco_val2017_aiformat.json', 'w') as fw: 100 | json.dump(units, fw) 101 | print ('Convert done.') 102 | 103 | 104 | 105 | 106 | 107 | -------------------------------------------------------------------------------- /utils/gaussian.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from skimage.filters import gaussian 3 | 4 | sigmas = np.array([.26, .25, .25, .35, .35, .79, .79, .72, .72, .62, .62, 1.07, 1.07, .87, .87, .89, .89] * 100) 5 | 6 | 7 | def multivariate_gaussian(N, sigma=2): 8 | t = 4 9 | X = np.linspace(-t, t, N) 10 | Y = np.linspace(-t, t, N) 11 | X, Y = np.meshgrid(X, Y) 12 | pos = np.empty(X.shape + (2,)) 13 | pos[:, :, 0] = X 14 | pos[:, :, 1] = Y 15 | mu = np.array([0., 0.]) 16 | sigma = np.array([[sigma, 0], [0, sigma]]) 17 | n = mu.shape[0] 18 | Sigma_det = np.linalg.det(sigma) 19 | Sigma_inv = np.linalg.inv(sigma) 20 | N = np.sqrt((2 * np.pi) ** n * Sigma_det) 21 | fac = np.einsum('...k,kl,...l->...', pos - mu, Sigma_inv, pos - mu) 22 | return np.exp(-fac / 2) / N 23 | 24 | 25 | def crop_paste(img, c, N=13, sigma=2): 26 | Z = multivariate_gaussian(N, sigma) 27 | 28 | H = img.shape[1] 29 | W = img.shape[0] 30 | 31 | h = (Z.shape[0] - 1) / 2 32 | 33 | N = Z.shape[0] 34 | x1 = (c[0] - h) 35 | y1 = (c[1] - h) 36 | 37 | x2 = (c[0] + h) + 1 38 | y2 = (c[1] + h) + 1 39 | 40 | zx1 = 0 41 | zy1 = 0 42 | zx2 = N + 1 43 | zy2 = N + 1 44 | 45 | if x1 < 0: 46 | x1 = 0 47 | zx1 = 0 - (c[0] - h) 48 | 49 | if y1 < 0: 50 | y1 = 0 51 | zy1 = 0 - (c[1] - h) 52 | 53 | if x2 > W - 1: 54 | x2 = W - 1 55 | zx2 = x2 - x1 + 1 56 | x2 = W 57 | 58 | if y2 > H - 1: 59 | y2 = H - 1 60 | zy2 = y2 - y1 + 1 61 | y2 = H 62 | 63 | 
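    # paste the (possibly clipped) N x N Gaussian patch into the target window of `img`,
    # taking the element-wise maximum so peaks already drawn for nearby joints are not overwritten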
img[x1:x2, y1:y2] = np.maximum(Z[zx1:zx2, zy1:zy2], img[x1:x2, y1:y2]) 64 | 65 | 66 | ''' 67 | def gaussian(img, N = 13, sigma=2): 68 | cs = np.where(img==1) 69 | img = np.zeros_like(img) 70 | for c in zip(cs[0], cs[1]): 71 | crop_paste(img, c, N, sigma) 72 | return img 73 | ''' 74 | 75 | 76 | def gaussian_multi_input_mp(inp): 77 | ''' 78 | :param inp: Multi person ground truth heatmap input (17 ch) Each channel contains multiple joints. 79 | :return: out: Gaussian augmented output. Values are between 0. and 1. 80 | ''' 81 | 82 | h, w, ch = inp.shape 83 | out = np.zeros_like(inp) 84 | for i in range(ch): 85 | layer = inp[:, :, i] 86 | ind = np.argwhere(layer == 1) 87 | b = [] 88 | if len(ind) > 0: 89 | for j in ind: 90 | t = np.zeros((h, w)) 91 | t[j[0], j[1]] = 1 92 | t = gaussian(t, sigma=2, mode='constant') 93 | t = t * (1 / t.max()) 94 | b.append(t) 95 | 96 | out[:, :, i] = np.maximum.reduce(b) 97 | else: 98 | out[:, :, i] = np.zeros((h, w)) 99 | return out 100 | 101 | 102 | def gaussian_multi_output(inp): 103 | ''' 104 | :param inp: Single person ground truth heatmap input (17 ch) Each channel contains one joint. 105 | :return: out: Gaussian augmented output. Values are between 0. and 1. 106 | ''' 107 | h, w, ch = inp.shape 108 | out = np.zeros_like(inp) 109 | for i in range(ch): 110 | j = np.argwhere(inp[:, :, i] == 1) 111 | if len(j) == 0: 112 | out[:, :, i] = np.zeros((h, w)) 113 | continue 114 | j = j[0] 115 | t = np.zeros((h, w)) 116 | t[j[0], j[1]] = 1 117 | t = gaussian(t, sigma=5, mode='constant') 118 | out[:, :, i] = t * (1 / t.max()) 119 | return out 120 | 121 | 122 | def crop(img, c, N=13): 123 | H = img.shape[1] 124 | W = img.shape[0] 125 | 126 | h = (N - 1) / 2 127 | 128 | x1 = int(c[0] - h) 129 | y1 = int(c[1] - h) 130 | 131 | x2 = int(c[0] + h) + 1 132 | y2 = int(c[1] + h) + 1 133 | 134 | if x1 < 0: 135 | x1 = 0 136 | 137 | if y1 < 0: 138 | y1 = 0 139 | 140 | if x2 > W - 1: 141 | x2 = W 142 | 143 | if y2 > H - 1: 144 | y2 = H 145 | 146 | return img[x1:x2, y1:y2] 147 | 148 | --------------------------------------------------------------------------------
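The pieces above are enough to sketch how PRN training could be wired end to end. The snippet below is only an illustrative sketch, not the repo's prn_train.py: the PRN constructor signature PRN(inputs=..., output_node=..., is_training=...), the plain L2 loss, the Adam learning rate and the tfrecord path are all assumptions made for the example.

import tensorflow as tf

from pose_residual_network.src.PRN import PRN
from pose_residual_network.src.reader import PRN_READER

# hypothetical tfrecord path; replace with a file produced by convert_tfrecord.py
TFRECORD = '/path/to/coco_train2017.tfrecord'

reader = PRN_READER(batch_size=4, height=56, width=36, channels=17, tfrecord_file=TFRECORD)
batch_input, batch_label = reader.feed()          # both (4, 56, 36, 17)

prn = PRN(inputs=batch_input, output_node=56 * 36 * 17, is_training=True)
prn_out = prn.forward()                           # reshaped back to (4, 56, 36, 17)

# placeholder objective: plain L2 between the refined heatmaps and the single-person labels
loss = tf.reduce_mean(tf.square(prn_out - batch_label))
train_op = tf.train.AdamOptimizer(learning_rate=1e-4).minimize(loss)

with tf.Session() as sess:
    sess.run([tf.global_variables_initializer(), tf.local_variables_initializer()])
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)
    step = 0
    try:
        while not coord.should_stop():
            _, loss_val = sess.run([train_op, loss])
            step += 1
            if step % 100 == 0:
                print('step {}, loss {:.6f}'.format(step, loss_val))
    except tf.errors.OutOfRangeError:
        print('input queue exhausted after {} steps'.format(step))
    finally:
        coord.request_stop()
        coord.join(threads)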