├── README.md
├── Wonderseen_Handpose_cnn_depth
│   ├── Wonderseen_HandPose_cnn_depth.py
│   ├── result.png
│   └── utils
│       └── general.py
└── result
    ├── test.png
    ├── test1.png
    ├── test2.png
    ├── test3.png
    ├── test4.png
    ├── test5.png
    └── test6.png
/README.md:
--------------------------------------------------------------------------------
1 | Although I have recently had little time to maintain this repository, I wrote [a blog post (in Chinese)](https://blog.csdn.net/wonderseen/article/details/78341932) about the hand-pose investigation (both traditional and deep-learning approaches) carried out during this project, and I intermittently answer related questions in the comments section of that blog, so you may find more details of the project there.
2 | #
3 | 
4 | 
5 | # Handpose-WonderSeen-Net
6 | 
7 | 1. DATABASE: RHD_published_v2.
8 | 2. DATABASE INFO: RGBD, four channels, pixel-level labels.
9 | 3. CODE: To be updated within 2 months.
10 | 4. ADDITION: This script is not the final version of my gesture-recognition work. Because code management requires considerable effort, the script 'Wonderseen_HandPose_cnn_depth.py' is provided merely as a reference to show the main ideas of the pipeline.
11 | 
12 | 
13 | # Network-Result
14 | 
15 | ![image](result/test.png)
16 | 
17 | ![image](result/test1.png)
18 | 
19 | ![image](result/test2.png)
20 | 
21 | # Reference
22 | [1] Hand Gesture Recognition Based on Shape Parameters.
23 | 
24 | [2] Densely Connected Convolutional Networks.
25 | 
26 | [3] U-Net: Convolutional Networks for Biomedical Image Segmentation.
27 | 
28 | [4] ImageNet Classification with Deep Convolutional Neural Networks.
29 | 
--------------------------------------------------------------------------------
/Wonderseen_Handpose_cnn_depth/Wonderseen_HandPose_cnn_depth.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # This script is not the final version of my gesture-recognition work.
3 | # Because code management requires considerable effort, the script is provided merely as a reference to show the main process.
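#
# --------------------------------------------------------------------------------------------
# Illustrative aside (not part of the original script): the RHD_published_v2 depth PNGs used
# below store depth split across two uint8 channels, which this script decodes with
# ReadData.depth_two_uint8_to_float (the ReadData module itself is not included in this
# repository). A minimal sketch of such a decoder, assuming the usual high-byte/low-byte
# encoding normalized to [0, 1], could look like this:
#
#   import numpy as np
#
#   def depth_two_uint8_to_float(top_bits, bottom_bits):
#       # Combine the two 8-bit channels into one 16-bit value and normalize it.
#       depth_map = top_bits.astype('float32') * 2 ** 8 + bottom_bits.astype('float32')
#       return depth_map / float(2 ** 16 - 1)
# --------------------------------------------------------------------------------------------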
4 | # 5 | # Dataset: RHD_published_v2 [Alternative: NYU handpose dataset/BigHand2.2M] 6 | # 7 | # Segmentation: 8 | # Input:160×160×1 deep_img 9 | # output:80×80×1 mask_img 10 | # 11 | # Pose: 12 | # Input: image cropped on proposal region 13 | # Output: 21 scoremap of 2D key poses 14 | # 15 | # Classifier: 16 | # Input: key part of image cropped on proposal region or 21 key points' location 17 | # Output:type of gesture 18 | 19 | ############################################################################################## 20 | ############################################################################################## 21 | ## ## 22 | ## ## ## ##### ### # ##### ###### ###### ##### ##### ##### ### # ## 23 | ## ## # ## ####### # ## # # ## # ## ## ## # # # ## # ## 24 | ## ## ### ## ### ### # ## # # ## # ## ## ## # # # ## # ## 25 | ## ## ## ## ## ## ## # ## # # ## ###### ###### ##### ##### ##### # ## # ## 26 | ## ## ## ## ## ### ### # ## # # ## # ## ## ## # # # ## # ## 27 | ## #### #### ####### # ### # ## # ## ## ## # # # ### ## 28 | ## ## ## ##### # ## ##### ###### ## ## ##### ##### ##### # ## ## 29 | ## ## 30 | ############################################################################################## 31 | ############################################################################################## 32 | 33 | import tensorflow as tf 34 | import pickle 35 | import os 36 | import numpy as np 37 | import scipy.misc 38 | import matplotlib.pyplot as plt 39 | import random 40 | from mpl_toolkits.mplot3d import Axes3D 41 | import sys 42 | sys.path.append("~/Wonderseen_net/nets") 43 | sys.path.append("~/Wonderseen_Net/utils") 44 | sys.path.append("~/Wonderseen_Net/wonderseen_handpose_fcn/tools") 45 | 46 | import cv2 47 | from playsound import playsound 48 | import general 49 | import ReadData 50 | import PostTreatment 51 | 52 | # mode 53 | mode = 'predict' # train or predict 54 | 55 | # get data 56 | set = 'training'# 'training' 'evaluation' 57 | fatherdic = 'RHD_published_v2/' + set 58 | 59 | # Train Para 60 | channel = 1 61 | IMAGE_HEIGHT = 320 62 | IMAGE_WIDTH = 320 63 | trainstep = 50 64 | savestep = 400 65 | start_step = 101200 66 | start_lr = 1e-3 67 | net = general.NetworkOps 68 | saver_restore_addr = '/root/pose-model/handposetemp-model.ckpt-101200' 69 | X = tf.placeholder(tf.float32, [None, IMAGE_HEIGHT*IMAGE_WIDTH*channel/4], name='INPUT_IMAGE_HEIGHT_MULTI_WIDTH') 70 | realMask = tf.placeholder(tf.float32, [None, IMAGE_HEIGHT*IMAGE_WIDTH*channel/16], name='realMask') 71 | keep_prob = tf.placeholder(tf.float32) 72 | 73 | # Classifier Para 74 | CL_graph = tf.Graph() 75 | CLASSIFIER_IMAGE_HEIGHT = 50 76 | CLASSIFIER_IMAGE_WIDTH = 50 77 | HAND_NUM = 1 78 | GESTURE_CLASSES = 17 79 | saver_restore_addr_classifier = '/root/clasiffier-model/handposetemp-model.ckpt-4250' 80 | 81 | 82 | # write data into memory 83 | if mode == 'train': 84 | depth_pred = [] 85 | hand_mask_pred = [] 86 | for x in range(0,40000): 87 | sample_id = random.randint(0,40000) 88 | # read mask / deep 89 | mask = scipy.misc.imread(os.path.join(fatherdic, 'mask', '%.5d.png' % sample_id)).astype('float32') 90 | depth = scipy.misc.imread(os.path.join(fatherdic, 'depth', '%.5d.png' % sample_id)) 91 | depth = ReadData.depth_two_uint8_to_float(depth[:, :, 0], depth[:, :, 1]) 92 | depth = cv2.resize(depth,(IMAGE_WIDTH/2,IMAGE_HEIGHT/2)) 93 | 94 | print 'load_data',sample_id, x 95 | mask = cv2.resize(mask,(IMAGE_WIDTH/4,IMAGE_HEIGHT/4)).astype('float32') 96 | for i in range(0, len(mask)): 97 | for j in range(0, len(mask[0])): 98 
| if mask[i][j] <= 1: 99 | mask[i][j] = 0 100 | else: 101 | mask[i][j] = 1 102 | all = sum(sum(mask)) + 1e-4 103 | mask /= all 104 | depth = depth.reshape(IMAGE_WIDTH // 2 * IMAGE_HEIGHT // 2 * channel) 105 | depth_pred.append(depth) 106 | hand_mask_pred.append(mask.reshape(IMAGE_WIDTH//4*IMAGE_HEIGHT//4*channel)) 107 | 108 | if mode == 'predict': 109 | pass 110 | 111 | # train 112 | def train_handpose_depth_cnn(continueflag): 113 | global_step = tf.Variable(0, trainable=False) 114 | add_global = global_step.assign_add(1) 115 | return_global = global_step.assign(start_step) 116 | learning_rate = tf.train.exponential_decay(learning_rate = start_lr, global_step=global_step,decay_steps = 10000, decay_rate = 0.97)#,staircase=True) 117 | 118 | # Start TF 119 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.8) 120 | sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) 121 | tf.train.start_queue_runners(sess=sess) 122 | 123 | # Net-Output 124 | hand_scoremap = depth_handpose_fcn() 125 | 126 | # Loss 127 | loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=hand_scoremap, labels=realMask)) 128 | optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss) 129 | 130 | # Predict 131 | predict = tf.reshape(hand_scoremap, [-1, IMAGE_HEIGHT, IMAGE_WIDTH]) 132 | max_idx_p = tf.argmax(predict, 2) 133 | max_idx_l = tf.argmax(tf.reshape(realMask, [-1, IMAGE_HEIGHT, IMAGE_WIDTH]), 2) 134 | correct_pred = tf.equal(max_idx_p, max_idx_l) 135 | accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32)) 136 | 137 | saver = tf.train.Saver() 138 | if continueflag == True: 139 | saver.restore(sess, saver_restore_addr) 140 | sess.run([return_global]) 141 | else: 142 | sess.run(tf.initialize_all_variables()) 143 | 144 | # training loop 145 | lossy = [[],[]] 146 | plt.figure(figsize=(7,4)) 147 | accuracy = [] 148 | while True: 149 | step, lr = sess.run([add_global, learning_rate]) 150 | batch_x , batch_y = get_next_data(batch_size=32) 151 | _, train_loss = sess.run([optimizer, loss], feed_dict={X: batch_x, realMask: batch_y, keep_prob: 0.5}) 152 | if step % trainstep == 0: 153 | batch_x, batch_y = get_next_data(batch_size=1) 154 | hand_scoremap1 = sess.run([hand_scoremap], feed_dict={X: batch_x, keep_prob: 1}) 155 | hand_scoremap1 = np.array(hand_scoremap1).reshape(1, 80, 80) 156 | [batch_x, batch_y] = [np.array(batch_x).reshape(1,160,160), np.array(batch_y).reshape(1,80,80,1)] 157 | 158 | for i in range(0, hand_scoremap1.shape[0]): 159 | fig = plt.figure(1) 160 | ax1 = fig.add_subplot('211') 161 | ax2 = fig.add_subplot('212') 162 | ax1.imshow(batch_x[i]) 163 | ax2.imshow(hand_scoremap1[i]) 164 | plt.pause(3) 165 | 166 | if step % savestep == 0: 167 | saver.save(sess, "./mycnnmodel/handposetemp-model.ckpt", global_step=step) 168 | tf.train.write_graph(sess.graph_def, "./mycnnmodel/","nn_model.pbtxt", False)#as_text=True) 169 | 170 | # simple evaluation on the accuracy of pixel-prediction result 171 | accuracy.append(cacul_accuracy(hand_scoremap1[0], batch_y[0])) 172 | print 'accuracy = ', accuracy[-1] 173 | print 'step,mean-accuracy = ', step, np.mean(accuracy) 174 | 175 | lossy[0].append(step) 176 | lossy[1].append(train_loss) 177 | print 'step= ',step, 'train_loss= ',train_loss 178 | plt.clf() 179 | plt.plot(lossy[0], lossy[1], color='blue') 180 | plt.xlabel('/Step', fontsize=15) 181 | plt.ylabel('/LOSS', fontsize=15) 182 | plt.title('FCN Training Loss Iteration', fontsize=18) 183 | plt.ylim(0, 1.0) 184 | plt.grid(True, linestyle="-.", color="black", linewidth="1") 
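# --------------------------------------------------------------------------------------------
# Illustrative aside (not part of the original training loop): the train_loss plotted here is
# tf.nn.softmax_cross_entropy_with_logits applied to the flattened 80x80 output, with the
# ground-truth mask normalized to sum to 1 and used as the label distribution. A minimal NumPy
# sketch of that per-sample loss, assuming `logits` and `labels` are flat vectors of length
# 80*80 = 6400:
#
#   import numpy as np
#
#   def softmax_cross_entropy(logits, labels):
#       # Numerically stable log-softmax, then the cross entropy against the label distribution.
#       z = logits - np.max(logits)
#       log_softmax = z - np.log(np.sum(np.exp(z)))
#       return -np.sum(labels * log_softmax)
# --------------------------------------------------------------------------------------------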
185 | plt.pause(0.01) 186 | 187 | # FCN 188 | def depth_handpose_fcn(w_alpha=0.01, b_alpha=0.1): 189 | x = tf.reshape(X, shape=[-1, IMAGE_HEIGHT/2, IMAGE_WIDTH/2, channel]) 190 | w_c1 = tf.Variable(w_alpha * tf.random_normal([3, 3, 1, 64])) 191 | b_c1 = tf.Variable(b_alpha * tf.random_normal([64])) 192 | conv1 = tf.nn.relu(tf.nn.bias_add(tf.nn.conv2d(x, w_c1, strides=[1, 1, 1, 1], padding='SAME'), b_c1)) 193 | 194 | w_c2 = tf.Variable(w_alpha * tf.random_normal([7, 7, 64, 128])) 195 | b_c2 = tf.Variable(b_alpha * tf.random_normal([128])) 196 | conv2 = tf.nn.relu(tf.nn.bias_add(tf.nn.conv2d(conv1, w_c2,strides=[1, 1, 1, 1], padding='SAME'), b_c2)) 197 | 198 | w_c2 = tf.Variable(w_alpha * tf.random_normal([7, 7, 128, 256])) 199 | b_c2 = tf.Variable(b_alpha * tf.random_normal([256])) 200 | conv2_1 = tf.nn.relu(tf.nn.bias_add(tf.nn.conv2d(conv2, w_c2,strides=[1, 1, 1, 1], padding='SAME'), b_c2)) 201 | maxpool2 = tf.nn.max_pool(conv2_1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME') 202 | 203 | w_c3 = tf.Variable(w_alpha * tf.random_normal([3, 3, 256, 128])) 204 | b_c3 = tf.Variable(b_alpha * tf.random_normal([128])) 205 | conv3 = tf.nn.relu(tf.nn.bias_add(tf.nn.conv2d(maxpool2, w_c3, 206 | strides=[1, 1, 1, 1], padding='SAME'), b_c3)) 207 | 208 | w_c3_1 = tf.Variable(w_alpha * tf.random_normal([3, 3, 128, 128])) 209 | b_c3_1 = tf.Variable(b_alpha * tf.random_normal([128])) 210 | conv3_1 = tf.nn.relu(tf.nn.bias_add(tf.nn.conv2d(conv3, w_c3_1, 211 | strides=[1, 1, 1, 1], padding='SAME'), b_c3_1)) 212 | dropout3 = tf.nn.dropout(conv3_1, keep_prob) 213 | 214 | w_c3_2 = tf.Variable(w_alpha * tf.random_normal([3, 3, 128, 32])) 215 | b_c3_2 = tf.Variable(b_alpha * tf.random_normal([32])) 216 | conv3_2 = tf.nn.relu(tf.nn.bias_add(tf.nn.conv2d(dropout3, w_c3_2, 217 | strides=[1, 1, 1, 1], padding='SAME'), b_c3_2)) 218 | 219 | w_c4 = tf.Variable(w_alpha * tf.random_normal([3, 3, 32, 16])) 220 | b_c4 = tf.Variable(b_alpha * tf.random_normal([16])) 221 | conv4 = tf.nn.leaky_relu(tf.nn.bias_add(tf.nn.conv2d(conv3_2, w_c4, strides=[1, 1, 1, 1], padding='SAME'), b_c4)) 222 | maxpool4 = tf.nn.max_pool(conv4, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME') 223 | 224 | w_f = tf.Variable(w_alpha * tf.random_normal([40*40*16, 80*80*1])) 225 | b_f = tf.Variable(b_alpha * tf.random_normal([80*80*1])) 226 | dense = tf.reshape(maxpool4, [-1, w_f.get_shape().as_list()[0]]) 227 | conv_f = tf.nn.leaky_relu(tf.add(tf.matmul(dense, w_f), b_f)) 228 | hand_scoremap = net.fully_connected_relu(conv_f, 'hand_scoremap', 80*80*1) 229 | return hand_scoremap 230 | 231 | # classifier 232 | def gesture_classifier_cnn(w_alpha=0.01, b_alpha=0.1): 233 | with CL_graph.as_default(): 234 | x = tf.reshape(XX, shape=[-1, CLASSIFIER_IMAGE_HEIGHT, CLASSIFIER_IMAGE_WIDTH, 1]) 235 | w_c1 = tf.Variable(w_alpha * tf.random_normal([5, 5, 1, 64])) 236 | b_c1 = tf.Variable(b_alpha * tf.random_normal([64])) 237 | conv1 = tf.nn.relu(tf.nn.bias_add(tf.nn.conv2d(x, w_c1, strides=[1, 1, 1, 1], padding='SAME'), b_c1)) 238 | 239 | maxpool1 = tf.nn.max_pool(conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME') 240 | dropout1 = tf.nn.dropout(maxpool1, kkeep_prob) 241 | 242 | w_c2 = tf.Variable(w_alpha * tf.random_normal([3, 3, 64, 64])) 243 | b_c2 = tf.Variable(b_alpha * tf.random_normal([64])) 244 | conv2 = tf.nn.relu(tf.nn.bias_add(tf.nn.conv2d(dropout1, w_c2, 245 | strides=[1, 1, 1, 1], padding='SAME'), b_c2)) 246 | 247 | ww_c3 = tf.Variable(w_alpha * tf.random_normal([3, 3, 64, 128])) 248 | bb_c3 = tf.Variable(b_alpha * 
tf.random_normal([128])) 249 | conv3 = tf.nn.relu(tf.nn.bias_add(tf.nn.conv2d(conv2, ww_c3, 250 | strides=[1, 1, 1, 1], padding='SAME'), bb_c3)) 251 | 252 | w_f1 = tf.Variable(w_alpha * tf.random_normal([25 * 25 * 128, 1024])) 253 | b_f1 = tf.Variable(b_alpha * tf.random_normal([1024])) 254 | h_f1 = tf.reshape(conv3,[-1,25*25*128]) 255 | h_fc1 = tf.nn.relu(tf.matmul(h_f1,w_f1)+b_f1) 256 | h_f_drop1 = tf.nn.dropout(h_fc1, kkeep_prob) 257 | 258 | # Fully connected layer 259 | w_f2 = tf.Variable(w_alpha * tf.random_normal([1024, 170])) 260 | b_f2 = tf.Variable(b_alpha * tf.random_normal([170])) 261 | dense = tf.reshape(h_f_drop1, [-1, w_f2.get_shape().as_list()[0]]) 262 | 263 | dense = tf.nn.relu(tf.add(tf.matmul(dense, w_f2), b_f2)) 264 | w_out = tf.Variable(w_alpha * tf.random_normal([170, HAND_NUM * GESTURE_CLASSES])) 265 | b_out = tf.Variable(b_alpha * tf.random_normal([HAND_NUM * GESTURE_CLASSES])) 266 | out = tf.add(tf.matmul(dense, w_out), b_out) 267 | return out 268 | 269 | with CL_graph.as_default(): 270 | XX = tf.placeholder(tf.float32, [None, CLASSIFIER_IMAGE_HEIGHT * CLASSIFIER_IMAGE_WIDTH], name='INPUT_IMAGE_HEIGHT_MULTI_WIDTH') 271 | YY = tf.placeholder(tf.float32, [None, CLASSIFIER_IMAGE_HEIGHT * CLASSIFIER_IMAGE_WIDTH], name='OUTPUT_ONE_HOTS') 272 | kkeep_prob = tf.placeholder(tf.float32) 273 | sess1 = tf.Session() 274 | classifier = gesture_classifier_cnn() 275 | saver1 = tf.train.Saver() 276 | saver1.restore(sess1, saver_restore_addr_classifier) 277 | 278 | def get_next_data(batch_size = 60): 279 | depth_pred_batch = [] 280 | hand_mask_pred_batch = [] 281 | for i in range(0,batch_size): 282 | sample_id = random.randint(0,len(depth_pred)-1) 283 | depth_pred_batch.append(depth_pred[sample_id]) 284 | hand_mask_pred_batch.append(hand_mask_pred[sample_id]) 285 | return depth_pred_batch, hand_mask_pred_batch 286 | 287 | 288 | def predict_handscoremap(): 289 | global_step = tf.Variable(0, trainable=False) 290 | add_global = global_step.assign_add(1) 291 | return_global = global_step.assign(start_step) 292 | start_lr = 1e-3 293 | learning_rate = tf.train.exponential_decay(learning_rate=start_lr, global_step=global_step, decay_steps=10000, 294 | decay_rate=0.97) # ,staircase=True) 295 | # Start TF 296 | gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.8) 297 | sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) 298 | tf.train.start_queue_runners(sess=sess) 299 | hand_scoremap = depth_handpose_fcn() 300 | 301 | # Loss 302 | saver = tf.train.Saver() 303 | saver.restore(sess, saver_restore_addr) 304 | while True: 305 | depth_pre,_ = ReadData.get_one_sample_form_RHD(depth=True,fatherdic=fatherdic) 306 | 307 | # Test 308 | hand_scoremap1 = sess.run([hand_scoremap], feed_dict={X: depth_pre, keep_prob: 0.5}) 309 | 310 | hand_scoremap1 = np.array(hand_scoremap1).reshape(1, 80, 80) 311 | [depth_pre, hand_scoremap1] = [np.array(depth_pre).reshape(1, 160, 160), 312 | np.array(hand_scoremap1).reshape(1, 80, 80)] 313 | # upsample 314 | hand_scoremap1 = cv2.resize(hand_scoremap1[0], (160,160)) 315 | hand_scoremap_cp, hand_scoremap1_show = PostTreatment.eliminate_bkground_from_handscoremap(hand_scoremap1, 316 | depth_pre, 317 | threshold=0.25, 318 | block_half_size=3) 319 | hand_depth_crop, box = PostTreatment.crop_mask(hand_scoremap_cp, uv_cood_noise = 5, dominate=True) 320 | 321 | hand_depth_crop = cv2.resize(hand_depth_crop,(CLASSIFIER_IMAGE_HEIGHT, CLASSIFIER_IMAGE_WIDTH)) 322 | crop = [] 323 | scale = 1000. 
324 | crop.append(PostTreatment.PreTreatment(hand_depth_crop*scale)) 325 | result = predict_classifier(np.array(crop)) 326 | 327 | # Visualization 328 | plt.close() 329 | fig = plt.figure(dpi=100,figsize=(10,10)) 330 | ax1 = fig.add_subplot('221') 331 | import matplotlib.patches as mpatches 332 | rect = mpatches.Rectangle((box[0], box[1]), box[2] - box[0], box[3] - box[1], fill=False, edgecolor='red', linewidth=2) 333 | ax1.add_patch(rect) 334 | ax2 = fig.add_subplot('222') 335 | ax3 = fig.add_subplot('223') 336 | ax4 = fig.add_subplot('224') 337 | ax1.imshow(depth_pre[0]+hand_scoremap_cp*10.) 338 | ax2.imshow(hand_scoremap1_show) 339 | ax3.imshow(hand_scoremap_cp) 340 | ax4.imshow(crop[0]) 341 | 342 | plt.show() 343 | 344 | def predict_classifier(hand_score_crop): 345 | hand_score_crop = hand_score_crop.reshape(1,2500) 346 | with CL_graph.as_default(): 347 | predict = tf.reshape(classifier, [-1, HAND_NUM, GESTURE_CLASSES]) 348 | max_idx_p = tf.argmax(predict, axis=2) 349 | gesture_classifier_result, score = sess1.run([max_idx_p, predict], feed_dict={XX: hand_score_crop, kkeep_prob: 1.}) 350 | print 'predict result:', gesture_classifier_result[0][0], 'score=', score[0,0,int(gesture_classifier_result[0][0])] 351 | return gesture_classifier_result[0][0] 352 | 353 | def cacul_accuracy(hand_scoremap, mask_raw): 354 | # mask 355 | max = np.max(hand_scoremap) 356 | hand_scoremap /= max 357 | for j in range(0,len(hand_scoremap)): 358 | for k in range(0,len(hand_scoremap[0])): 359 | if hand_scoremap[j][k] < 0.8: 360 | hand_scoremap[j][k] = 0 361 | else: 362 | hand_scoremap[j][k] = 1 363 | 364 | # calculate 365 | accuracy = 0. 366 | handscore_pre = hand_scoremap.reshape(6400) 367 | mask_raw = mask_raw.reshape(6400) 368 | for i in range(0, handscore_pre.shape[0]): 369 | if handscore_pre[i] == 0. and mask_raw[i] == 0.: 370 | accuracy += 1. 371 | if handscore_pre[i] != 0. and mask_raw[i] != 0.: 372 | accuracy += 1. 373 | return accuracy/(80*80) 374 | 375 | 376 | if __name__ == '__main__': 377 | if mode == 'train': 378 | train_handpose_depth_cnn(continueflag= True) 379 | if mode == 'predict': 380 | predict_handscoremap() 381 | -------------------------------------------------------------------------------- /Wonderseen_Handpose_cnn_depth/result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonderseen/Handpose-WonderSeen-Net/6a50925ea80b4c5a0ec05cf6ef22796ce42f17c6/Wonderseen_Handpose_cnn_depth/result.png -------------------------------------------------------------------------------- /Wonderseen_Handpose_cnn_depth/utils/general.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # ColorHandPose3DNetwork - Network for estimating 3D Hand Pose from a single RGB Image 3 | # Copyright (C) 2017 Christian Zimmermann 4 | # This program is free software: you can redistribute it and/or modify 5 | # it under the terms of the GNU General Public License as published by 6 | # the Free Software Foundation, either version 2 of the License, or 7 | # (at your option) any later version. 8 | # 9 | # This program is distributed in the hope that it will be useful, 10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | # GNU General Public License for more details. 13 | # 14 | # You should have received a copy of the GNU General Public License 15 | # along with this program. If not, see . 
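# --------------------------------------------------------------------------------------------
# Illustrative aside (not part of this file): the cacul_accuracy routine in the script above
# compares the thresholded scoremap with the ground-truth mask pixel by pixel using Python
# loops. A minimal vectorized sketch of the same per-pixel agreement score, assuming both
# inputs are 80x80 NumPy arrays, could look like this:
#
#   import numpy as np
#
#   def pixel_agreement(scoremap, mask, threshold=0.8):
#       # Normalize the scoremap, binarize both maps, and measure how often they agree.
#       pred_fg = (scoremap / (np.max(scoremap) + 1e-8)) >= threshold
#       gt_fg = mask != 0.
#       return np.mean(pred_fg == gt_fg)
# --------------------------------------------------------------------------------------------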
16 | 17 | from __future__ import print_function, unicode_literals 18 | import tensorflow as tf 19 | from tensorflow.python import pywrap_tensorflow 20 | import numpy as np 21 | import math 22 | 23 | class NetworkOps(object): 24 | neg_slope_of_relu = 0.01 25 | 26 | @classmethod 27 | def leaky_relu(cls, tensor, name='relu'): 28 | out_tensor = tf.maximum(tensor, cls.neg_slope_of_relu*tensor, name=name) 29 | return out_tensor 30 | 31 | @classmethod 32 | def conv(cls, in_tensor, layer_name, kernel_size, stride, out_chan, trainable=True): 33 | with tf.variable_scope(layer_name): 34 | in_size = in_tensor.get_shape().as_list() 35 | 36 | strides = [1, stride, stride, 1] 37 | kernel_shape = [kernel_size, kernel_size, in_size[3], out_chan] # 38 | 39 | # conv 40 | kernel = tf.get_variable('weights', kernel_shape, tf.float32, 41 | tf.contrib.layers.xavier_initializer_conv2d(), trainable=trainable, collections=['wd', 'variables', 'filters']) 42 | tmp_result = tf.nn.conv2d(in_tensor, kernel, strides, padding='SAME') 43 | 44 | # bias 45 | biases = tf.get_variable('biases', [kernel_shape[3]], tf.float32, 46 | tf.constant_initializer(0.0001), trainable=trainable, collections=['wd', 'variables', 'biases']) 47 | out_tensor = tf.nn.bias_add(tmp_result, biases, name='out') 48 | 49 | return out_tensor 50 | 51 | @classmethod 52 | def conv_relu(cls, in_tensor, layer_name, kernel_size, stride, out_chan, trainable=True): 53 | tensor = cls.conv(in_tensor, layer_name, kernel_size, stride, out_chan, trainable) 54 | out_tensor = cls.leaky_relu(tensor, name='out') 55 | return out_tensor 56 | 57 | @classmethod 58 | def max_pool(cls, bottom, name='pool'): 59 | pooled = tf.nn.max_pool(bottom, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], 60 | padding='VALID', name=name) 61 | return pooled 62 | 63 | @classmethod 64 | def upconv(cls, in_tensor, layer_name, output_shape, kernel_size, stride, trainable=True): 65 | with tf.variable_scope(layer_name): 66 | in_size = in_tensor.get_shape().as_list() 67 | 68 | kernel_shape = [kernel_size, kernel_size, in_size[3], in_size[3]] 69 | strides = [1, stride, stride, 1] 70 | 71 | # conv 72 | kernel = cls.get_deconv_filter(kernel_shape, trainable) 73 | tmp_result = tf.nn.conv2d_transpose(value=in_tensor, filter=kernel, output_shape=output_shape, 74 | strides=strides, padding='SAME') 75 | 76 | # bias 77 | biases = tf.get_variable('biases', [kernel_shape[2]], tf.float32, 78 | tf.constant_initializer(0.0), trainable=trainable, collections=['wd', 'variables', 'biases']) 79 | out_tensor = tf.nn.bias_add(tmp_result, biases) 80 | return out_tensor 81 | 82 | @classmethod 83 | def upconv_relu(cls, in_tensor, layer_name, output_shape, kernel_size, stride, trainable=True): 84 | tensor = cls.upconv(in_tensor, layer_name, output_shape, kernel_size, stride, trainable) 85 | out_tensor = cls.leaky_relu(tensor, name='out') 86 | return out_tensor 87 | 88 | @staticmethod 89 | def get_deconv_filter(f_shape, trainable): 90 | width = f_shape[0] 91 | height = f_shape[1] 92 | f = math.ceil(width/2.0) 93 | c = (2 * f - 1 - f % 2) / (2.0 * f) 94 | bilinear = np.zeros([f_shape[0], f_shape[1]]) 95 | for x in range(width): 96 | for y in range(height): 97 | value = (1 - abs(x / f - c)) * (1 - abs(y / f - c)) 98 | bilinear[x, y] = value 99 | weights = np.zeros(f_shape) 100 | for i in range(f_shape[2]): 101 | weights[:, :, i, i] = bilinear 102 | 103 | init = tf.constant_initializer(value=weights, 104 | dtype=tf.float32) 105 | return tf.get_variable(name="weights", initializer=init, 106 | shape=weights.shape, 
trainable=trainable, collections=['wd', 'variables', 'filters']) 107 | 108 | @staticmethod 109 | def fully_connected(in_tensor, layer_name, out_chan, trainable=True): 110 | with tf.variable_scope(layer_name): 111 | in_size = in_tensor.get_shape().as_list() 112 | assert len(in_size) == 2, 'Input to a fully connected layer must be a vector.' 113 | weights_shape = [in_size[1], out_chan] 114 | 115 | # weight matrix 116 | weights = tf.get_variable('weights', weights_shape, tf.float32, 117 | tf.contrib.layers.xavier_initializer(), trainable=trainable) 118 | weights = tf.check_numerics(weights, 'weights: %s' % layer_name) 119 | 120 | # bias 121 | biases = tf.get_variable('biases', [out_chan], tf.float32, 122 | tf.constant_initializer(0.0001), trainable=trainable) 123 | biases = tf.check_numerics(biases, 'biases: %s' % layer_name) 124 | 125 | out_tensor = tf.matmul(in_tensor, weights) + biases 126 | return out_tensor 127 | 128 | @classmethod 129 | def fully_connected_relu(cls, in_tensor, layer_name, out_chan, trainable=True): 130 | tensor = cls.fully_connected(in_tensor, layer_name, out_chan, trainable) 131 | out_tensor = tf.maximum(tensor, cls.neg_slope_of_relu*tensor, name='out') 132 | return out_tensor 133 | 134 | @staticmethod 135 | def dropout(in_tensor, keep_prob, evaluation): 136 | """ Dropout: Each neuron is dropped independently. """ 137 | with tf.variable_scope('dropout'): 138 | tensor_shape = in_tensor.get_shape().as_list() 139 | out_tensor = tf.cond(evaluation, 140 | lambda: tf.nn.dropout(in_tensor, 1.0, 141 | noise_shape=tensor_shape), 142 | lambda: tf.nn.dropout(in_tensor, keep_prob, 143 | noise_shape=tensor_shape)) 144 | return out_tensor 145 | 146 | @staticmethod 147 | def spatial_dropout(in_tensor, keep_prob, evaluation): 148 | """ Spatial dropout: Not each neuron is dropped independently, but feature map wise. """ 149 | with tf.variable_scope('spatial_dropout'): 150 | tensor_shape = in_tensor.get_shape().as_list() 151 | out_tensor = tf.cond(evaluation, 152 | lambda: tf.nn.dropout(in_tensor, 1.0, 153 | noise_shape=tensor_shape), 154 | lambda: tf.nn.dropout(in_tensor, keep_prob, 155 | noise_shape=[tensor_shape[0], 1, 1, tensor_shape[3]])) 156 | return out_tensor 157 | 158 | 159 | def crop_image_from_xy(image, crop_location, crop_size, scale=1.0): 160 | """ 161 | Crops an image. When factor is not given does an central crop. 
162 | 163 | Inputs: 164 | image: 4D tensor, [batch, height, width, channels] which will be cropped in height and width dimension 165 | crop_location: tensor, [batch, 2] which represent the height and width location of the crop 166 | crop_size: int, describes the extension of the crop 167 | Outputs: 168 | image_crop: 4D tensor, [batch, crop_size, crop_size, channels] 169 | """ 170 | with tf.name_scope('crop_image_from_xy'): 171 | s = image.get_shape().as_list() 172 | assert len(s) == 4, "Image needs to be of shape [batch, width, height, channel]" 173 | scale = tf.reshape(scale, [-1]) 174 | crop_location = tf.cast(crop_location, tf.float32) 175 | crop_location = tf.reshape(crop_location, [s[0], 2]) 176 | crop_size = tf.cast(crop_size, tf.float32) 177 | 178 | crop_size_scaled = crop_size / scale 179 | y1 = crop_location[:, 0] - crop_size_scaled//2 180 | y2 = y1 + crop_size_scaled 181 | x1 = crop_location[:, 1] - crop_size_scaled//2 182 | x2 = x1 + crop_size_scaled 183 | y1 /= s[1] 184 | y2 /= s[1] 185 | x1 /= s[2] 186 | x2 /= s[2] 187 | boxes = tf.stack([y1, x1, y2, x2], -1) 188 | 189 | crop_size = tf.cast(tf.stack([crop_size, crop_size]), tf.int32) 190 | box_ind = tf.range(s[0]) 191 | image_c = tf.image.crop_and_resize(tf.cast(image, tf.float32), boxes, box_ind, crop_size, name='crop') 192 | return image_c 193 | 194 | 195 | def find_max_location(scoremap): 196 | """ Returns the coordinates of the given scoremap with maximum value. """ 197 | with tf.variable_scope('find_max_location'): 198 | s = scoremap.get_shape().as_list() 199 | if len(s) == 4: 200 | scoremap = tf.squeeze(scoremap, [3]) 201 | if len(s) == 2: 202 | scoremap = tf.expand_dims(scoremap, 0) 203 | 204 | s = scoremap.get_shape().as_list() 205 | assert len(s) == 3, "Scoremap must be 3D." 206 | assert (s[0] < s[1]) and (s[0] < s[2]), "Scoremap must be [Batch, Width, Height]" 207 | 208 | # my meshgrid 209 | x_range = tf.expand_dims(tf.range(s[1]), 1) 210 | y_range = tf.expand_dims(tf.range(s[2]), 0) 211 | X = tf.tile(x_range, [1, s[2]]) 212 | Y = tf.tile(y_range, [s[1], 1]) 213 | 214 | x_vec = tf.reshape(X, [-1]) 215 | y_vec = tf.reshape(Y, [-1]) 216 | scoremap_vec = tf.reshape(scoremap, [s[0], -1]) 217 | max_ind_vec = tf.cast(tf.argmax(scoremap_vec, dimension=1), tf.int32) 218 | 219 | xy_loc = list() 220 | for i in range(s[0]): 221 | x_loc = tf.reshape(x_vec[max_ind_vec[i]], [1]) 222 | y_loc = tf.reshape(y_vec[max_ind_vec[i]], [1]) 223 | xy_loc.append(tf.concat([x_loc, y_loc], 0)) 224 | 225 | xy_loc = tf.stack(xy_loc, 0) 226 | return xy_loc 227 | 228 | 229 | def single_obj_scoremap(scoremap): 230 | """ Applies my algorithm to figure out the most likely object from a given segmentation scoremap. """ 231 | with tf.variable_scope('single_obj_scoremap'): 232 | filter_size = 21 233 | s = scoremap.get_shape().as_list() 234 | assert len(s) == 4, "Scoremap must be 4D." 
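# --------------------------------------------------------------------------------------------
# Illustrative aside (not part of this file): find_max_location above builds a meshgrid and
# gathers the argmax per batch entry inside the TensorFlow graph. Outside the graph, the same
# 2D location of a scoremap maximum can be recovered with plain NumPy (the same pattern that
# detect_keypoints further below uses), e.g. for a single [height, width] scoremap:
#
#   import numpy as np
#
#   def max_location_np(scoremap):
#       # Flat index of the largest value, converted back to (row, col) coordinates.
#       v, u = np.unravel_index(np.argmax(scoremap), scoremap.shape)
#       return v, u
# --------------------------------------------------------------------------------------------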
235 | 236 | scoremap_softmax = tf.nn.softmax(scoremap) #B, H, W, C --> normalizes across last dimension 237 | scoremap_fg = tf.reduce_max(scoremap_softmax[:, :, :, 1:], 3) # B, H, W 238 | detmap_fg = tf.round(scoremap_fg) # B, H, W 239 | 240 | # find maximum in the fg scoremap 241 | max_loc = find_max_location(scoremap_fg) 242 | 243 | # use maximum to start "growing" our objectmap 244 | objectmap_list = list() 245 | kernel_dil = tf.ones((filter_size, filter_size, 1)) / float(filter_size*filter_size) 246 | for i in range(s[0]): 247 | # create initial objectmap (put a one at the maximum) 248 | sparse_ind = tf.reshape(max_loc[i, :], [1, 2]) # reshape that its one point with 2dim) 249 | objectmap = tf.sparse_to_dense(sparse_ind, [s[1], s[2]], 1.0) 250 | 251 | # grow the map by dilation and pixelwise and 252 | num_passes = max(s[1], s[2]) // (filter_size//2) # number of passes needes to make sure the map can spread over the whole image 253 | for j in range(num_passes): 254 | objectmap = tf.reshape(objectmap, [1, s[1], s[2], 1]) 255 | objectmap_dil = tf.nn.dilation2d(objectmap, kernel_dil, [1, 1, 1, 1], [1, 1, 1, 1], 'SAME') 256 | objectmap_dil = tf.reshape(objectmap_dil, [s[1], s[2]]) 257 | objectmap = tf.round(tf.multiply(detmap_fg[i, :, :], objectmap_dil)) 258 | 259 | objectmap = tf.reshape(objectmap, [s[1], s[2], 1]) 260 | objectmap_list.append(objectmap) 261 | 262 | objectmap = tf.stack(objectmap_list) 263 | 264 | return objectmap 265 | 266 | 267 | def calc_center_bb(binary_class_mask): 268 | """ Returns the center of mass coordinates for the given binary_class_mask. """ 269 | with tf.variable_scope('calc_center_bb'): 270 | binary_class_mask = tf.cast(binary_class_mask, tf.int32) 271 | binary_class_mask = tf.equal(binary_class_mask, 1) 272 | s = binary_class_mask.get_shape().as_list() 273 | if len(s) == 4: 274 | binary_class_mask = tf.squeeze(binary_class_mask, [3]) 275 | 276 | s = binary_class_mask.get_shape().as_list() 277 | assert len(s) == 3, "binary_class_mask must be 3D." 
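# --------------------------------------------------------------------------------------------
# Illustrative aside (not part of this file): single_obj_scoremap above keeps only the
# foreground blob containing the global maximum by repeatedly dilating a seed map and masking
# it with the rounded foreground map. Outside the graph, a comparable result for a single 2D
# foreground scoremap can be obtained with connected-component labelling; a sketch, assuming
# SciPy is available:
#
#   import numpy as np
#   from scipy.ndimage import label
#
#   def keep_blob_with_max(scoremap_fg):
#       # Binarize, label connected components, and keep the component containing the maximum.
#       binary = np.round(scoremap_fg)
#       labels, _ = label(binary)
#       v, u = np.unravel_index(np.argmax(scoremap_fg), scoremap_fg.shape)
#       if labels[v, u] == 0:  # the maximum fell on a background pixel
#           return np.zeros_like(binary)
#       return (labels == labels[v, u]).astype(np.float32)
# --------------------------------------------------------------------------------------------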
278 | assert (s[0] < s[1]) and (s[0] < s[2]), "binary_class_mask must be [Batch, Width, Height]" 279 | 280 | # my meshgrid 281 | x_range = tf.expand_dims(tf.range(s[1]), 1) 282 | y_range = tf.expand_dims(tf.range(s[2]), 0) 283 | X = tf.tile(x_range, [1, s[2]]) 284 | Y = tf.tile(y_range, [s[1], 1]) 285 | 286 | bb_list = list() 287 | center_list = list() 288 | crop_size_list = list() 289 | for i in range(s[0]): 290 | X_masked = tf.cast(tf.boolean_mask(X, binary_class_mask[i, :, :]), tf.float32) 291 | Y_masked = tf.cast(tf.boolean_mask(Y, binary_class_mask[i, :, :]), tf.float32) 292 | 293 | x_min = tf.reduce_min(X_masked) 294 | x_max = tf.reduce_max(X_masked) 295 | y_min = tf.reduce_min(Y_masked) 296 | y_max = tf.reduce_max(Y_masked) 297 | 298 | start = tf.stack([x_min, y_min]) 299 | end = tf.stack([x_max, y_max]) 300 | bb = tf.stack([start, end], 1) 301 | bb_list.append(bb) 302 | 303 | center_x = 0.5*(x_max + x_min) 304 | center_y = 0.5*(y_max + y_min) 305 | center = tf.stack([center_x, center_y], 0) 306 | 307 | center = tf.cond(tf.reduce_all(tf.is_finite(center)), lambda: center, 308 | lambda: tf.constant([160.0, 160.0])) 309 | center.set_shape([2]) 310 | center_list.append(center) 311 | 312 | crop_size_x = x_max - x_min 313 | crop_size_y = y_max - y_min 314 | crop_size = tf.expand_dims(tf.maximum(crop_size_x, crop_size_y), 0) 315 | crop_size = tf.cond(tf.reduce_all(tf.is_finite(crop_size)), lambda: crop_size, 316 | lambda: tf.constant([100.0])) 317 | crop_size.set_shape([1]) 318 | crop_size_list.append(crop_size) 319 | 320 | bb = tf.stack(bb_list) 321 | center = tf.stack(center_list) 322 | crop_size = tf.stack(crop_size_list) 323 | 324 | return center, bb, crop_size 325 | 326 | 327 | def detect_keypoints(scoremaps): 328 | """ Performs detection per scoremap for the hands keypoints. """ 329 | if len(scoremaps.shape) == 4: 330 | scoremaps = np.squeeze(scoremaps) 331 | s = scoremaps.shape 332 | assert len(s) == 3, "This function was only designed for 3D Scoremaps." 333 | assert (s[2] < s[1]) and (s[2] < s[0]), "Probably the input is not correct, because [H, W, C] is expected." 334 | 335 | keypoint_coords = np.zeros((s[2], 2)) 336 | for i in range(s[2]): 337 | v, u = np.unravel_index(np.argmax(scoremaps[:, :, i]), (s[0], s[1])) 338 | keypoint_coords[i, 0] = v 339 | keypoint_coords[i, 1] = u 340 | return keypoint_coords 341 | 342 | 343 | def trafo_coords(keypoints_crop_coords, centers, scale, crop_size): 344 | """ Transforms coords into global image coordinates. """ 345 | keypoints_coords = np.copy(keypoints_crop_coords) 346 | 347 | keypoints_coords -= crop_size // 2 348 | 349 | keypoints_coords /= scale 350 | 351 | keypoints_coords += centers 352 | 353 | return keypoints_coords 354 | 355 | 356 | def plot_hand(coords_hw, axis, color_fixed=None, linewidth='1'): 357 | """ Plots a hand stick figure into a matplotlib figure. 
""" 358 | colors = np.array([[0., 0., 0.5], 359 | [0., 0., 0.73172906], 360 | [0., 0., 0.96345811], 361 | [0., 0.12745098, 1.], 362 | [0., 0.33137255, 1.], 363 | [0., 0.55098039, 1.], 364 | [0., 0.75490196, 1.], 365 | [0.06008855, 0.9745098, 0.90765338], 366 | [0.22454143, 1., 0.74320051], 367 | [0.40164453, 1., 0.56609741], 368 | [0.56609741, 1., 0.40164453], 369 | [0.74320051, 1., 0.22454143], 370 | [0.90765338, 1., 0.06008855], 371 | [1., 0.82861293, 0.], 372 | [1., 0.63979666, 0.], 373 | [1., 0.43645606, 0.], 374 | [1., 0.2476398, 0.], 375 | [0.96345811, 0.0442992, 0.], 376 | [0.73172906, 0., 0.], 377 | [0.5, 0., 0.]]) 378 | 379 | # define connections and colors of the bones 380 | bones = [((0, 4), colors[0, :]), 381 | ((4, 3), colors[1, :]), 382 | ((3, 2), colors[2, :]), 383 | ((2, 1), colors[3, :]), 384 | 385 | ((0, 8), colors[4, :]), 386 | ((8, 7), colors[5, :]), 387 | ((7, 6), colors[6, :]), 388 | ((6, 5), colors[7, :]), 389 | 390 | ((0, 12), colors[8, :]), 391 | ((12, 11), colors[9, :]), 392 | ((11, 10), colors[10, :]), 393 | ((10, 9), colors[11, :]), 394 | 395 | ((0, 16), colors[12, :]), 396 | ((16, 15), colors[13, :]), 397 | ((15, 14), colors[14, :]), 398 | ((14, 13), colors[15, :]), 399 | 400 | ((0, 20), colors[16, :]), 401 | ((20, 19), colors[17, :]), 402 | ((19, 18), colors[18, :]), 403 | ((18, 17), colors[19, :])] 404 | 405 | for connection, color in bones: 406 | coord1 = coords_hw[connection[0], :] 407 | coord2 = coords_hw[connection[1], :] 408 | coords = np.stack([coord1, coord2]) 409 | if color_fixed is None: 410 | axis.plot(coords[:, 1], coords[:, 0], color=color, linewidth=linewidth) 411 | else: 412 | axis.plot(coords[:, 1], coords[:, 0], color_fixed, linewidth=linewidth) 413 | 414 | 415 | def plot_hand_3d(coords_xyz, axis, color_fixed=None, linewidth='1'): 416 | """ Plots a hand stick figure into a matplotlib figure. 
""" 417 | colors = np.array([[0., 0., 0.5], 418 | [0., 0., 0.73172906], 419 | [0., 0., 0.96345811], 420 | [0., 0.12745098, 1.], 421 | [0., 0.33137255, 1.], 422 | [0., 0.55098039, 1.], 423 | [0., 0.75490196, 1.], 424 | [0.06008855, 0.9745098, 0.90765338], 425 | [0.22454143, 1., 0.74320051], 426 | [0.40164453, 1., 0.56609741], 427 | [0.56609741, 1., 0.40164453], 428 | [0.74320051, 1., 0.22454143], 429 | [0.90765338, 1., 0.06008855], 430 | [1., 0.82861293, 0.], 431 | [1., 0.63979666, 0.], 432 | [1., 0.43645606, 0.], 433 | [1., 0.2476398, 0.], 434 | [0.96345811, 0.0442992, 0.], 435 | [0.73172906, 0., 0.], 436 | [0.5, 0., 0.]]) 437 | 438 | # define connections and colors of the bones 439 | bones = [((0, 4), colors[0, :]), 440 | ((4, 3), colors[1, :]), 441 | ((3, 2), colors[2, :]), 442 | ((2, 1), colors[3, :]), 443 | 444 | ((0, 8), colors[4, :]), 445 | ((8, 7), colors[5, :]), 446 | ((7, 6), colors[6, :]), 447 | ((6, 5), colors[7, :]), 448 | 449 | ((0, 12), colors[8, :]), 450 | ((12, 11), colors[9, :]), 451 | ((11, 10), colors[10, :]), 452 | ((10, 9), colors[11, :]), 453 | 454 | ((0, 16), colors[12, :]), 455 | ((16, 15), colors[13, :]), 456 | ((15, 14), colors[14, :]), 457 | ((14, 13), colors[15, :]), 458 | 459 | ((0, 20), colors[16, :]), 460 | ((20, 19), colors[17, :]), 461 | ((19, 18), colors[18, :]), 462 | ((18, 17), colors[19, :])] 463 | 464 | for connection, color in bones: 465 | coord1 = coords_xyz[connection[0], :] 466 | coord2 = coords_xyz[connection[1], :] 467 | coords = np.stack([coord1, coord2]) 468 | if color_fixed is None: 469 | axis.plot(coords[:, 0], coords[:, 1], coords[:, 2], color=color, linewidth=linewidth) 470 | else: 471 | axis.plot(coords[:, 0], coords[:, 1], coords[:, 2], color_fixed, linewidth=linewidth) 472 | 473 | axis.view_init(azim=-90., elev=90.) 474 | 475 | 476 | class LearningRateScheduler: 477 | """ 478 | Provides scalar tensors at certain iteration as is needed for a multistep learning rate schedule. 479 | 根据用户定制在不同的step,对学习率进行调整 480 | """ 481 | def __init__(self, steps, values): 482 | self.steps = steps 483 | self.values = values 484 | 485 | assert len(steps)+1 == len(values), "There must be one more element in value as step." 486 | 487 | def get_lr(self, global_step): 488 | with tf.name_scope('lr_scheduler'): 489 | 490 | if len(self.values) == 1: #1 value -> no step 491 | learning_rate = tf.constant(self.values[0]) 492 | elif len(self.values) == 2: #2 values -> one step 493 | cond = tf.greater(global_step, self.steps[0]) 494 | learning_rate = tf.where(cond, self.values[1], self.values[0]) 495 | else: # n values -> n-1 steps 496 | cond_first = tf.less(global_step, self.steps[0]) 497 | 498 | cond_between = list() 499 | for ind, step in enumerate(range(0, len(self.steps)-1)): 500 | cond_between.append(tf.logical_and(tf.less(global_step, self.steps[ind+1]), 501 | tf.greater_equal(global_step, self.steps[ind]))) 502 | 503 | cond_last = tf.greater_equal(global_step, self.steps[-1]) 504 | 505 | cond_full = [cond_first] 506 | cond_full.extend(cond_between) 507 | cond_full.append(cond_last) 508 | 509 | cond_vec = tf.stack(cond_full) 510 | lr_vec = tf.stack(self.values) 511 | 512 | learning_rate = tf.where(cond_vec, lr_vec, tf.zeros_like(lr_vec)) 513 | 514 | learning_rate = tf.reduce_sum(learning_rate) 515 | 516 | return learning_rate 517 | 518 | 519 | class EvalUtil: 520 | """ Util class for evaluation networks. 
521 | """ 522 | def __init__(self, num_kp=21): 523 | # init empty data storage 524 | self.data = list() 525 | self.num_kp = num_kp 526 | for _ in range(num_kp): 527 | self.data.append(list()) 528 | 529 | def feed(self, keypoint_gt, keypoint_vis, keypoint_pred): 530 | """ Used to feed data to the class. Stores the euclidean distance between gt and pred, when it is visible. """ 531 | keypoint_gt = np.squeeze(keypoint_gt) 532 | keypoint_pred = np.squeeze(keypoint_pred) 533 | keypoint_vis = np.squeeze(keypoint_vis).astype('bool') 534 | 535 | assert len(keypoint_gt.shape) == 2 536 | assert len(keypoint_pred.shape) == 2 537 | assert len(keypoint_vis.shape) == 1 538 | 539 | # calc euclidean distance 540 | diff = keypoint_gt - keypoint_pred 541 | euclidean_dist = np.sqrt(np.sum(np.square(diff), axis=1)) 542 | 543 | num_kp = keypoint_gt.shape[0] 544 | for i in range(num_kp): 545 | if keypoint_vis[i]: 546 | self.data[i].append(euclidean_dist[i]) 547 | 548 | def _get_pck(self, kp_id, threshold): 549 | """ Returns pck for one keypoint for the given threshold. """ 550 | if len(self.data[kp_id]) == 0: 551 | return None 552 | 553 | data = np.array(self.data[kp_id]) 554 | pck = np.mean((data <= threshold).astype('float')) 555 | return pck 556 | 557 | def _get_epe(self, kp_id): 558 | """ Returns end point error for one keypoint. """ 559 | if len(self.data[kp_id]) == 0: 560 | return None, None 561 | 562 | data = np.array(self.data[kp_id]) 563 | epe_mean = np.mean(data) 564 | epe_median = np.median(data) 565 | return epe_mean, epe_median 566 | 567 | def get_measures(self, val_min, val_max, steps): 568 | """ Outputs the average mean and median error as well as the pck score. """ 569 | thresholds = np.linspace(val_min, val_max, steps) 570 | thresholds = np.array(thresholds) 571 | norm_factor = np.trapz(np.ones_like(thresholds), thresholds) 572 | 573 | # init mean measures 574 | epe_mean_all = list() 575 | epe_median_all = list() 576 | auc_all = list() 577 | pck_curve_all = list() 578 | 579 | # Create one plot for each part 580 | for part_id in range(self.num_kp): 581 | # mean/median error 582 | mean, median = self._get_epe(part_id) 583 | 584 | if mean is None: 585 | # there was no valid measurement for this keypoint 586 | continue 587 | 588 | epe_mean_all.append(mean) 589 | epe_median_all.append(median) 590 | 591 | # pck/auc 592 | pck_curve = list() 593 | for t in thresholds: 594 | pck = self._get_pck(part_id, t) 595 | pck_curve.append(pck) 596 | 597 | pck_curve = np.array(pck_curve) 598 | pck_curve_all.append(pck_curve) 599 | auc = np.trapz(pck_curve, thresholds) 600 | auc /= norm_factor 601 | auc_all.append(auc) 602 | 603 | epe_mean_all = np.mean(np.array(epe_mean_all)) 604 | epe_median_all = np.mean(np.array(epe_median_all)) 605 | auc_all = np.mean(np.array(auc_all)) 606 | pck_curve_all = np.mean(np.array(pck_curve_all), 0) # mean only over keypoints 607 | 608 | return epe_mean_all, epe_median_all, auc_all, pck_curve_all, thresholds 609 | 610 | 611 | def load_weights_from_snapshot(session, checkpoint_path, discard_list=None, rename_dict=None): 612 | """ Loads weights from a snapshot except the ones indicated with discard_list. Others are possibly renamed. 
""" 613 | reader = pywrap_tensorflow.NewCheckpointReader(checkpoint_path) 614 | var_to_shape_map = reader.get_variable_to_shape_map() 615 | 616 | # Remove everything from the discard list 617 | if discard_list is not None: 618 | num_disc = 0 619 | var_to_shape_map_new = dict() 620 | for k, v in var_to_shape_map.items(): 621 | good = True 622 | for dis_str in discard_list: 623 | if dis_str in k: 624 | good = False 625 | 626 | if good: 627 | var_to_shape_map_new[k] = v 628 | else: 629 | num_disc += 1 630 | var_to_shape_map = dict(var_to_shape_map_new) 631 | print('Discarded %d items' % num_disc) 632 | 633 | # rename everything according to rename_dict 634 | num_rename = 0 635 | var_to_shape_map_new = dict() 636 | for name in var_to_shape_map.keys(): 637 | new_name = name 638 | if rename_dict is not None: 639 | for rename_str in rename_dict.keys(): 640 | if rename_str in name: 641 | new_name = new_name.replace(rename_str, rename_dict[rename_str]) 642 | num_rename += 1 643 | var_to_shape_map_new[new_name] = reader.get_tensor(name) 644 | var_to_shape_map = dict(var_to_shape_map_new) 645 | 646 | init_op, init_feed = tf.contrib.framework.assign_from_values(var_to_shape_map) 647 | session.run(init_op, init_feed) 648 | print('Initialized %d variables from %s.' % (len(var_to_shape_map), checkpoint_path)) 649 | 650 | 651 | def calc_auc(x, y): 652 | """ Given x and y values it calculates the approx. integral and normalizes it: area under curve""" 653 | integral = np.trapz(y, x) 654 | norm = np.trapz(np.ones_like(y), x) 655 | 656 | return integral / norm 657 | 658 | 659 | def get_stb_ref_curves(): 660 | """ 661 | Returns results of various baseline methods on the Stereo Tracking Benchmark Dataset reported by: 662 | Zhang et al., ‘3d Hand Pose Tracking and Estimation Using Stereo Matching’, 2016 663 | """ 664 | curve_list = list() 665 | thresh_mm = np.array([20.0, 25, 30, 35, 40, 45, 50]) 666 | pso_b1 = np.array([0.32236842, 0.53947368, 0.67434211, 0.75657895, 0.80921053, 0.86513158, 0.89473684]) 667 | curve_list.append((thresh_mm, pso_b1, 'PSO (AUC=%.3f)' % calc_auc(thresh_mm, pso_b1))) 668 | icppso_b1 = np.array([ 0.51973684, 0.64473684, 0.71710526, 0.77302632, 0.80921053, 0.84868421, 0.86842105]) 669 | curve_list.append((thresh_mm, icppso_b1, 'ICPPSO (AUC=%.3f)' % calc_auc(thresh_mm, icppso_b1))) 670 | chpr_b1 = np.array([ 0.56578947, 0.71710526, 0.82236842, 0.88157895, 0.91447368, 0.9375, 0.96052632]) 671 | curve_list.append((thresh_mm, chpr_b1, 'CHPR (AUC=%.3f)' % calc_auc(thresh_mm, chpr_b1))) 672 | return curve_list 673 | -------------------------------------------------------------------------------- /result/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonderseen/Handpose-WonderSeen-Net/6a50925ea80b4c5a0ec05cf6ef22796ce42f17c6/result/test.png -------------------------------------------------------------------------------- /result/test1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonderseen/Handpose-WonderSeen-Net/6a50925ea80b4c5a0ec05cf6ef22796ce42f17c6/result/test1.png -------------------------------------------------------------------------------- /result/test2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonderseen/Handpose-WonderSeen-Net/6a50925ea80b4c5a0ec05cf6ef22796ce42f17c6/result/test2.png 
-------------------------------------------------------------------------------- /result/test3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonderseen/Handpose-WonderSeen-Net/6a50925ea80b4c5a0ec05cf6ef22796ce42f17c6/result/test3.png -------------------------------------------------------------------------------- /result/test4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonderseen/Handpose-WonderSeen-Net/6a50925ea80b4c5a0ec05cf6ef22796ce42f17c6/result/test4.png -------------------------------------------------------------------------------- /result/test5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonderseen/Handpose-WonderSeen-Net/6a50925ea80b4c5a0ec05cf6ef22796ce42f17c6/result/test5.png -------------------------------------------------------------------------------- /result/test6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonderseen/Handpose-WonderSeen-Net/6a50925ea80b4c5a0ec05cf6ef22796ce42f17c6/result/test6.png --------------------------------------------------------------------------------
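The EvalUtil class defined in Wonderseen_Handpose_cnn_depth/utils/general.py accumulates per-keypoint errors and summarizes them as end-point error, PCK curves, and a normalized area under the curve. A minimal usage sketch with toy keypoints (the random data and the import path are illustrative assumptions; it presumes the utils package is importable from the Wonderseen_Handpose_cnn_depth directory):

import numpy as np
from utils.general import EvalUtil

# Accumulate errors for a few toy frames of 21 2D keypoints.
util = EvalUtil(num_kp=21)
for _ in range(10):
    kp_gt = np.random.rand(21, 2) * 80.0        # ground-truth keypoints in pixels
    kp_pred = kp_gt + np.random.randn(21, 2)    # noisy predictions
    kp_vis = np.ones(21, dtype=bool)            # all keypoints marked visible
    util.feed(kp_gt, kp_vis, kp_pred)

# Mean/median end-point error plus the PCK curve and its normalized AUC for thresholds 0..30 px.
epe_mean, epe_median, auc, pck_curve, thresholds = util.get_measures(0.0, 30.0, 20)
print('mean EPE: %.2f px, AUC: %.3f' % (epe_mean, auc))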