├── .gitignore ├── README.md ├── demo ├── Openpose.py ├── SqueezeNet.ipynb └── img_keypoint_show.py ├── image ├── Loss.png ├── com.png ├── front-back.jpg ├── hand.jpeg ├── hand.jpg ├── hand.png ├── right-frontal.jpg └── unnamed.png ├── main ├── data │ └── dataset.py ├── demo.py └── train.py └── other ├── CMakeLists.txt ├── Hand_Caffe ├── 1_F_deploy.prototxt ├── 1_F_solver.prototxt ├── 1_F_train.prototxt ├── create_txt.py ├── getBox.py ├── hand.jpeg ├── level1.py ├── read_im_json.py └── utils.py ├── Hand_Detection ├── README.md ├── data │ ├── create_annoset.py │ ├── create_data.sh │ ├── create_txt.py │ ├── egohands │ │ ├── _screenshot_17.04.2018.png │ │ ├── egohands_data.txt │ │ ├── generate_egohands.py │ │ └── getInfo.m │ ├── gth │ │ └── .gitkeep │ ├── labelmap_voc.prototxt │ └── stanfordhands │ │ └── generate_stanfordhands.py ├── model │ ├── deploy.prototxt │ ├── generate_model.py │ ├── snapshot │ │ └── .gitkeep │ ├── solver.prototxt │ └── train.prototxt ├── old_README.md ├── pic │ ├── demo.jpg │ ├── example_image.jpg │ └── width_and_height.png ├── ssd_camera.py ├── ssd_detection.py └── utils │ ├── __init__.py │ ├── mAP.py │ ├── output.py │ ├── plot_loss.py │ ├── score.py │ ├── ssd_net.py │ └── utils.py ├── Openpose-Keras ├── .gitignore ├── README.md ├── StolenOpenPoseHandTracking.ipynb └── images │ └── test_image.png ├── asl.mp4 ├── front-back.jpg ├── getModels.sh ├── hand.jpg ├── hand.png ├── handPose-Notebook.ipynb ├── handPoseImage.cpp ├── handPoseImage.py ├── handPoseVideo.cpp └── handPoseVideo.py /.gitignore: -------------------------------------------------------------------------------- 1 | # TOTALLY IGNORE THE MODEL FILES 2 | .ipynb_checkpoints 3 | *.h5 4 | *.h5py 5 | *.npy 6 | *.zip 7 | /.idea 8 | Push.sh 9 | /data 10 | /文献 11 | 12 | # Byte-compiled / optimized / DLL files 13 | __pycache__/ 14 | *.py[cod] 15 | *$py.class 16 | 17 | # C extensions 18 | *.so 19 | 20 | # Distribution / packaging 21 | .Python 22 | build/ 23 | develop-eggs/ 24 | dist/ 25 | downloads/ 26 | eggs/ 27 | .eggs/ 28 | lib/ 29 | lib64/ 30 | parts/ 31 | sdist/ 32 | var/ 33 | wheels/ 34 | *.egg-info/ 35 | .installed.cfg 36 | *.egg 37 | MANIFEST 38 | 39 | # PyInstaller 40 | # Usually these files are written by a python script from a template 41 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
42 | *.manifest 43 | *.spec 44 | 45 | # Installer logs 46 | pip-log.txt 47 | pip-delete-this-directory.txt 48 | 49 | # Unit test / coverage reports 50 | htmlcov/ 51 | .tox/ 52 | .coverage 53 | .coverage.* 54 | .cache 55 | nosetests.xml 56 | coverage.xml 57 | *.cover 58 | .hypothesis/ 59 | .pytest_cache/ 60 | 61 | # Translations 62 | *.mo 63 | *.pot 64 | 65 | # Django stuff: 66 | *.log 67 | local_settings.py 68 | db.sqlite3 69 | 70 | # Flask stuff: 71 | instance/ 72 | .webassets-cache 73 | 74 | # Scrapy stuff: 75 | .scrapy 76 | 77 | # Sphinx documentation 78 | docs/_build/ 79 | 80 | # PyBuilder 81 | target/ 82 | 83 | # Jupyter Notebook 84 | .ipynb_checkpoints 85 | 86 | # pyenv 87 | .python-version 88 | 89 | # celery beat schedule file 90 | celerybeat-schedule 91 | 92 | # SageMath parsed files 93 | *.sage.py 94 | 95 | # Environments 96 | .env 97 | .venv 98 | env/ 99 | venv/ 100 | ENV/ 101 | env.bak/ 102 | venv.bak/ 103 | 104 | # Spyder project settings 105 | .spyderproject 106 | .spyproject 107 | 108 | # Rope project settings 109 | .ropeproject 110 | 111 | # mkdocs documentation 112 | /site 113 | 114 | # mypy 115 | .mypy_cache/ 116 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Hand-Keypoint-Estimation 2 | 3 | ## Introduction 4 | 5 |
unnamed
6 | 7 | 21-keypoint hand pose estimation 8 | 9 |
unnamed
10 | 11 | ## TODO 12 | 13 | - [x] ResNet34+Finetune 14 | - [x] SqueezeNet+Finetune 15 | - [ ] Hourglass 16 | - [ ] Openpose+Design Loss 17 | - [ ] Better visualization of results 18 | - [ ] Robustness to occlusion 19 | 20 | ## Dataset 21 | 22 | [CMU hand dataset](http://domedb.perception.cs.cmu.edu/handdb.html) (the occlusions are quite extreme) 23 | 24 | ``` 25 | Hands from Synthetic Data (6546 + 3243 + 2348 + 2124 = 14261 annotations) 26 | └─hand_labels_synth 27 | ├─output_viz_synth 28 | ├─synth1 (the JSON files are missing the 5 fingertip keypoints) 29 | ├─synth2 30 | ├─synth3 31 | └─synth4 32 | ``` 33 | 34 | ## Benchmarking 35 | 36 | **SqueezeNet+Finetune** 37 | 38 | ``` 39 | Finetune = nn.Sequential( 40 | Flatten(), 41 | nn.ReLU(), 42 | nn.Dropout(0.5), 43 | nn.Linear(247808, 256), 44 | # ReLU must come after BatchNorm here; placing it before BN skews the variance BN computes 45 | nn.BatchNorm1d(256), 46 | nn.ReLU(), 47 | nn.Dropout(0.5), 48 | nn.Linear(256, 42), 49 | Reshape(-1,21,2), 50 | nn.Tanh() 51 | ) 52 | Total params: 64,172,906 53 | Total trainable params: 64,172,906 54 | Total non-trainable params: 0 55 | Loss function : MSELoss 56 | Epoch : 200 57 | LR : 0.01->0.0001 58 | Train Loss end : 0.010500 59 | Valid Loss end : 0.012454 60 | ``` 61 |
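The block above lists only the regression head. Below is a minimal sketch of how such a head could sit on top of a torchvision SqueezeNet backbone; the `squeezenet1_1` backbone and the 368x368 input size are assumptions made for illustration (512 x 22 x 22 = 247808, which matches the first `nn.Linear` above), not the repository's exact training code.

```
import torch
import torch.nn as nn
from torchvision import models


class Flatten(nn.Module):
    def forward(self, x):
        return x.view(x.size(0), -1)


class Reshape(nn.Module):
    def __init__(self, *shape):
        super().__init__()
        self.shape = shape

    def forward(self, x):
        return x.view(self.shape)


# Assumed backbone: squeezenet1_1 features give a 512 x 22 x 22 map for a 368 x 368 input.
backbone = models.squeezenet1_1(pretrained=True).features
head = nn.Sequential(
    Flatten(),
    nn.ReLU(),
    nn.Dropout(0.5),
    nn.Linear(247808, 256),
    nn.BatchNorm1d(256),
    nn.ReLU(),
    nn.Dropout(0.5),
    nn.Linear(256, 42),
    Reshape(-1, 21, 2),
    nn.Tanh(),
)
model = nn.Sequential(backbone, head)

x = torch.randn(2, 3, 368, 368)  # dummy batch
print(model(x).shape)            # torch.Size([2, 21, 2])
```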
unnamed
63 | 64 | CPU上0.0234s一张图片 65 | 66 | GPU-2070Ti上0.00727s一张图片 67 | 68 | -------------------------------------------------------------------------------- /demo/Openpose.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | from collections import OrderedDict 4 | from torch.autograd import Variable 5 | 6 | import torch 7 | import torch.nn as nn 8 | from fastai.vision import * 9 | from fastai import * 10 | import os 11 | os.environ['CUDA_VISIBLE_DEVICES'] = '3' 12 | 13 | class Reshape(nn.Module): 14 | def __init__(self, *args): 15 | super(Reshape, self).__init__() 16 | self.shape = args 17 | 18 | def forward(self, x): 19 | return x.view(self.shape) 20 | 21 | def make_layers(block, no_relu_layers): 22 | layers = [] 23 | for layer_name, v in block.items(): 24 | if 'pool' in layer_name: 25 | layer = nn.MaxPool2d(kernel_size=v[0], stride=v[1], 26 | padding=v[2]) 27 | layers.append((layer_name, layer)) 28 | else: 29 | conv2d = nn.Conv2d(in_channels=v[0], out_channels=v[1], 30 | kernel_size=v[2], stride=v[3], 31 | padding=v[4]) 32 | layers.append((layer_name, conv2d)) 33 | if layer_name not in no_relu_layers: 34 | layers.append(('relu_' + layer_name, nn.ReLU(inplace=True))) 35 | 36 | return nn.Sequential(OrderedDict(layers)) 37 | 38 | 39 | class handpose_model(nn.Module): 40 | def __init__(self): 41 | super().__init__() 42 | 43 | # these layers have no relu layer 44 | no_relu_layers = ['conv6_2_CPM', 'Mconv7_stage2', 'Mconv7_stage3', \ 45 | 'Mconv7_stage4', 'Mconv7_stage5', 'Mconv7_stage6'] 46 | # stage 1 47 | block1_0 = OrderedDict({ 48 | 'conv1_1': [3, 64, 3, 1, 1], 49 | 'conv1_2': [64, 64, 3, 1, 1], 50 | 'pool1_stage1': [2, 2, 0], 51 | 'conv2_1': [64, 128, 3, 1, 1], 52 | 'conv2_2': [128, 128, 3, 1, 1], 53 | 'pool2_stage1': [2, 2, 0], 54 | 'conv3_1': [128, 256, 3, 1, 1], 55 | 'conv3_2': [256, 256, 3, 1, 1], 56 | 'conv3_3': [256, 256, 3, 1, 1], 57 | 'conv3_4': [256, 256, 3, 1, 1], 58 | 'pool3_stage1': [2, 2, 0], 59 | 'conv4_1': [256, 512, 3, 1, 1], 60 | 'conv4_2': [512, 512, 3, 1, 1], 61 | 'conv4_3': [512, 512, 3, 1, 1], 62 | 'conv4_4': [512, 512, 3, 1, 1], 63 | 'conv5_1': [512, 512, 3, 1, 1], 64 | 'conv5_2': [512, 512, 3, 1, 1], 65 | 'conv5_3_CPM': [512, 128, 3, 1, 1]}) 66 | 67 | block1_1 = OrderedDict({ 68 | 'conv6_1_CPM': [128, 512, 1, 1, 0], 69 | 'conv6_2_CPM': [512, 22, 1, 1, 0] 70 | }) 71 | 72 | blocks = {} 73 | blocks['block1_0'] = block1_0 74 | blocks['block1_1'] = block1_1 75 | 76 | # stage 2-6 77 | for i in range(2, 7): 78 | blocks['block%d' % i] = OrderedDict({ 79 | 'Mconv1_stage%d' % i: [150, 128, 7, 1, 3], 80 | 'Mconv2_stage%d' % i: [128, 128, 7, 1, 3], 81 | 'Mconv3_stage%d' % i: [128, 128, 7, 1, 3], 82 | 'Mconv4_stage%d' % i: [128, 128, 7, 1, 3], 83 | 'Mconv5_stage%d' % i: [128, 128, 7, 1, 3], 84 | 'Mconv6_stage%d' % i: [128, 128, 1, 1, 0], 85 | 'Mconv7_stage%d' % i: [128, 22, 1, 1, 0]}) 86 | 87 | for k in blocks.keys(): 88 | blocks[k] = make_layers(blocks[k], no_relu_layers) 89 | 90 | self.model1_0 = blocks['block1_0'] 91 | self.model1_1 = blocks['block1_1'] 92 | self.model2 = blocks['block2'] 93 | self.model3 = blocks['block3'] 94 | self.model4 = blocks['block4'] 95 | self.model5 = blocks['block5'] 96 | self.model6 = blocks['block6'] 97 | self.head_reg = nn.Sequential( 98 | Flatten(), 99 | nn.ReLU(), 100 | nn.Dropout(0.5), 101 | nn.Linear(22*46*46, 256), 102 | nn.ReLU(), 103 | nn.Dropout(0.5), 104 | nn.Linear(256, 42), 105 | Reshape(-1, 21, 2), 106 | nn.Tanh()) 107 | self._initialize_weights() 108 | 109 | 
def forward(self, x): 110 | out1_0 = self.model1_0(x) 111 | out1_1 = self.model1_1(out1_0) 112 | concat_stage2 = torch.cat([out1_1, out1_0], 1) 113 | out_stage2 = self.model2(concat_stage2) 114 | concat_stage3 = torch.cat([out_stage2, out1_0], 1) 115 | out_stage3 = self.model3(concat_stage3) 116 | concat_stage4 = torch.cat([out_stage3, out1_0], 1) 117 | out_stage4 = self.model4(concat_stage4) 118 | concat_stage5 = torch.cat([out_stage4, out1_0], 1) 119 | out_stage5 = self.model5(concat_stage5) 120 | concat_stage6 = torch.cat([out_stage5, out1_0], 1) 121 | out_stage6 = self.model6(concat_stage6) 122 | x = self.head_reg(out_stage6) 123 | return x 124 | 125 | def _initialize_weights(self): 126 | for m in self.modules(): 127 | if isinstance(m, nn.Conv2d): 128 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 129 | if m.bias is not None: 130 | nn.init.constant_(m.bias, 0) 131 | elif isinstance(m, nn.BatchNorm2d): 132 | nn.init.constant_(m.weight, 1) 133 | nn.init.constant_(m.bias, 0) 134 | elif isinstance(m, nn.Linear): 135 | nn.init.normal_(m.weight, 0, 0.01) 136 | nn.init.constant_(m.bias, 0) 137 | 138 | 139 | image_path = '/home/hanwei-1/data/hand_labels_synth/synth2_3' 140 | 141 | 142 | transforms = get_transforms(do_flip=False, max_zoom=1.05, max_warp=0.01,max_rotate=3, p_lighting=1) 143 | 144 | def get_y_func(x): 145 | pre, ext = os.path.splitext(x) 146 | hand_data_out = [] 147 | # pre = pre.replace('synth2', 'synth2_json') 148 | hand_data = json.load(open(pre + '.json')) 149 | for i in range(21): 150 | hand_tem_xy = hand_data['hand_pts'][i][:2] 151 | hand_tem_xy.reverse() 152 | hand_data_out.append(hand_tem_xy) 153 | return Tensor(hand_data_out) 154 | 155 | 156 | data = (PointsItemList.from_folder(path=image_path, extensions=['.jpg'], presort=True) 157 | .split_by_rand_pct() 158 | .label_from_func(get_y_func) 159 | .transform(transforms, size=368, tfm_y=True, remove_out=False, 160 | padding_mode='border', resize_method=ResizeMethod.PAD) 161 | .databunch(bs=32) 162 | .normalize(imagenet_stats)) 163 | 164 | 165 | class MSELossFlat(nn.MSELoss): 166 | def forward(self, input:Tensor, target:Tensor): 167 | return super().forward(input.view(-1), target.view(-1)) 168 | 169 | 170 | mse_loss_flat = MSELossFlat() 171 | 172 | 173 | class L2Loss(torch.nn.Module): 174 | def __init__(self, batch_size): 175 | super(L2Loss, self).__init__() 176 | self.batch_size = batch_size 177 | 178 | def forward(self, x: Variable, y: Variable, weights: Variable = None): 179 | if weights is not None: 180 | val = (x-y) * weights[:x.data.shape[0], :, :, :] # Slice by shape[n,..] 
for batch size (last batch < batch_size) 181 | else: 182 | val = x-y 183 | l = torch.sum(val ** 2) / self.batch_size / 2 184 | return l 185 | 186 | 187 | l2loss = L2Loss(batch_size=8) 188 | 189 | net = handpose_model() 190 | 191 | 192 | learn = Learner(data, net, loss_func=mse_loss_flat) 193 | learn.fit_one_cycle(cyc_len=200, max_lr=0.0001) 194 | learn.recorder.plot() 195 | plt.show() 196 | learn.lr_find() 197 | learn.recorder.plot() 198 | plt.show() 199 | -------------------------------------------------------------------------------- /demo/img_keypoint_show.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import matplotlib.pyplot as plt 4 | import cv2 5 | 6 | im_dir = 'Path' 7 | json_dir = 'Path' 8 | hand_data_out = {} 9 | 10 | with open(json_dir, 'r') as f: 11 | hand_data = json.load(f) 12 | 13 | for i in range(21): 14 | hand_data_out[i] = hand_data['hand_pts'][i][:2] 15 | 16 | for j in range(21): 17 | for i in range(2): 18 | hand_data_out[j][i] = int(hand_data_out[j][i]) 19 | 20 | 21 | def get_json_point(json_path): 22 | hand_data_out = {} 23 | hand_return = {} 24 | str_point = '' 25 | with open(json_dir, 'r') as f: 26 | hand_data = json.load(f) 27 | 28 | for i in range(21): 29 | hand_data_out[i] = hand_data['hand_pts'][i][:2] 30 | 31 | for j in range(21): 32 | for i in range(2): 33 | hand_data_out[j][i] = int(hand_data_out[j][i]) 34 | 35 | hand_return[0] = hand_data_out[1] 36 | hand_return[1] = hand_data_out[7] 37 | hand_return[2] = hand_data_out[11] 38 | hand_return[3] = hand_data_out[15] 39 | hand_return[4] = hand_data_out[19] 40 | for key, value in hand_return.items(): 41 | for i in range(2): 42 | str_point += str(value[i]) 43 | str_point += ' ' 44 | 45 | return hand_data_out 46 | 47 | 48 | data = get_json_point(json_dir) 49 | 50 | output = cv2.imread(im_dir) 51 | for i in range(21): 52 | cv2.circle(output, tuple(data[i]), 2, (0, 0, 255), 1) 53 | plt.imshow(output) 54 | plt.show() 55 | 56 | -------------------------------------------------------------------------------- /image/Loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/image/Loss.png -------------------------------------------------------------------------------- /image/com.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/image/com.png -------------------------------------------------------------------------------- /image/front-back.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/image/front-back.jpg -------------------------------------------------------------------------------- /image/hand.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/image/hand.jpeg -------------------------------------------------------------------------------- /image/hand.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/image/hand.jpg 
-------------------------------------------------------------------------------- /image/hand.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/image/hand.png -------------------------------------------------------------------------------- /image/right-frontal.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/image/right-frontal.jpg -------------------------------------------------------------------------------- /image/unnamed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/image/unnamed.png -------------------------------------------------------------------------------- /main/data/dataset.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @File : dataset.py 4 | @Time : 2019/9/13 16:34 5 | @Author : KeyForce 6 | @Email : july.master@outlook.com 7 | """ 8 | import os 9 | import torch 10 | import pandas as pd 11 | from skimage import io, transform 12 | import numpy as np 13 | import matplotlib.pyplot as plt 14 | from torch.utils.data import Dataset, DataLoader 15 | from torchvision import transforms, utils 16 | import json 17 | 18 | 19 | class ReadJsonPoint: 20 | """读取CMU手部21点关键点数据""" 21 | def __init__(self, json_path): 22 | self.json_path = json_path 23 | self.hand_point = [] 24 | 25 | def read(self): 26 | with open(self.json_path, 'r') as f: 27 | hand_data = json.load(f) 28 | 29 | for i in range(21): 30 | # 这边要注意不要xy坐标搞混 31 | hand_tem_xy = hand_data['hand_pts'][i][:2] 32 | hand_tem_xy = list(map(int, hand_tem_xy)) 33 | self.hand_point.append(hand_tem_xy) 34 | 35 | # hand_point = list(map(int, hand_point)) 36 | 37 | return np.array(self.hand_point) 38 | 39 | 40 | class CMUHandPointDataset(Dataset): 41 | """读取CMU手部关键点数据""" 42 | 43 | def __init__(self, root_dir, transform=None): 44 | self.root_dir = root_dir 45 | self.transform = transform 46 | self.image_name = [] 47 | 48 | # 分离目录下的jpg和json 49 | file_list = os.listdir(root_dir) 50 | for i in file_list: 51 | if os.path.splitext(i)[1] == '.jpg': 52 | self.image_name.append(i) 53 | 54 | def __getitem__(self, item): 55 | if torch.is_tensor(item): 56 | item = item.tolist() 57 | 58 | img_path = os.path.join(self.root_dir, 59 | self.image_name[item]) 60 | image = io.imread(img_path) 61 | json_path = os.path.join(img_path.replace('.jpg', '.json')) 62 | # 调用read方法读取数据 63 | landmarks = ReadJsonPoint(json_path).read() 64 | sample = {'image': image, 'landmarks': landmarks} 65 | 66 | if self.transform: 67 | sample = self.transform(sample) 68 | 69 | return sample 70 | 71 | def __len__(self): 72 | return len(self.image_name) 73 | 74 | 75 | class Rescale(object): 76 | """Rescale the image in a sample to a given size. 77 | 78 | Args: 79 | output_size (tuple or int): Desired output size. If tuple, output is 80 | matched to output_size. If int, smaller of image edges is matched 81 | to output_size keeping aspect ratio the same. 
82 | """ 83 | 84 | def __init__(self, output_size): 85 | assert isinstance(output_size, (int, tuple)) 86 | self.output_size = output_size 87 | 88 | def __call__(self, sample): 89 | image, landmarks = sample['image'], sample['landmarks'] 90 | 91 | h, w = image.shape[:2] 92 | if isinstance(self.output_size, int): 93 | if h > w: 94 | new_h, new_w = self.output_size * h / w, self.output_size 95 | else: 96 | new_h, new_w = self.output_size, self.output_size * w / h 97 | else: 98 | new_h, new_w = self.output_size 99 | 100 | new_h, new_w = int(new_h), int(new_w) 101 | 102 | img = transform.resize(image, (new_h, new_w)) 103 | 104 | # h and w are swapped for landmarks because for images, 105 | # x and y axes are axis 1 and 0 respectively 106 | landmarks = landmarks * [new_w / w, new_h / h] 107 | 108 | 109 | return {'image': img, 'landmarks': landmarks} 110 | 111 | 112 | class RandomCrop(object): 113 | """Crop randomly the image in a sample. 114 | 115 | Args: 116 | output_size (tuple or int): Desired output size. If int, square crop 117 | is made. 118 | """ 119 | 120 | def __init__(self, output_size): 121 | assert isinstance(output_size, (int, tuple)) 122 | if isinstance(output_size, int): 123 | self.output_size = (output_size, output_size) 124 | else: 125 | assert len(output_size) == 2 126 | self.output_size = output_size 127 | 128 | def __call__(self, sample): 129 | image, landmarks = sample['image'], sample['landmarks'] 130 | 131 | h, w = image.shape[:2] 132 | new_h, new_w = self.output_size 133 | 134 | top = np.random.randint(0, h - new_h) 135 | left = np.random.randint(0, w - new_w) 136 | 137 | image = image[top: top + new_h, 138 | left: left + new_w] 139 | 140 | landmarks = landmarks - [left, top] 141 | 142 | return {'image': image, 'landmarks': landmarks} 143 | 144 | 145 | class ToTensor(object): 146 | """Convert ndarrays in sample to Tensors.""" 147 | 148 | def __call__(self, sample): 149 | image, landmarks = sample['image'], sample['landmarks'] 150 | 151 | # swap color axis because 152 | # numpy image: H x W x C 153 | # torch image: C X H X W 154 | image = image.transpose((2, 0, 1)) 155 | return {'image': torch.from_numpy(image), 156 | 'landmarks': torch.from_numpy(landmarks)} 157 | 158 | 159 | def show_landmarks(image, landmarks): 160 | """显示landmark,以方便检查数据""" 161 | plt.imshow(image) 162 | x = [] 163 | y = [] 164 | for i in range(21): 165 | x.append(landmarks[i][0]) 166 | y.append(landmarks[i][1]) 167 | plt.scatter(x, y, s=10, marker='.', c='r') 168 | 169 | 170 | if __name__ == '__main__': 171 | root_dir = '/home/wild/Hand-Keypoint-Estimation/data/Hands from Synthetic Data (6546 + 3243 + 2348 ' \ 172 | '+ 2124 = 14261 annotations)/hand_labels_synth/synth2' 173 | 174 | composed = transforms.Compose([Rescale(368), 175 | ToTensor()]) 176 | 177 | Data = CMUHandPointDataset(root_dir, composed) 178 | 179 | for i in range(8): 180 | sample = Data[i] 181 | 182 | print(i, sample['image'].shape) 183 | print('First 4 Landmarks: {}'.format(sample['landmarks'][:4])) 184 | ax = plt.subplot(2, 4, i + 1) 185 | plt.imshow(sample['image'].permute(1, 2, 0)) 186 | x = [] 187 | y = [] 188 | for i in range(21): 189 | x.append(np.array(sample['landmarks'][i][0])) 190 | y.append(np.array(sample['landmarks'][i][1])) 191 | plt.scatter(x, y, s=10, marker='.', c='r') 192 | 193 | plt.show() 194 | -------------------------------------------------------------------------------- /main/demo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @File : 
demo.py 4 | @Time : 2019/9/14 11:41 5 | @Author : KeyForce 6 | @Email : july.master@outlook.com 7 | """ 8 | -------------------------------------------------------------------------------- /main/train.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @File : train.py 4 | @Time : 2019/9/13 16:33 5 | @Author : KeyForce 6 | @Email : july.master@outlook.com 7 | """ 8 | import matplotlib.pyplot as plt 9 | import torch 10 | import torch.optim as optim 11 | import numpy as np 12 | import torch.nn as nn 13 | 14 | 15 | def Train(model, train_loader, criterion, optimizer, device, metrics=None, lr_scheduler=None, epoch=30): 16 | """ 17 | Train the model 18 | :param model: the model to train 19 | :param train_loader: training data loader 20 | :param criterion: loss function 21 | :param optimizer: optimizer 22 | :param device: GPU or CPU 23 | :param metrics: evaluation metrics 24 | :param lr_scheduler: learning-rate scheduler 25 | :param epoch: current epoch (used for logging) 26 | :return: 27 | """ 28 | model.train() 29 | for batch_idx, (image, label) in enumerate(train_loader): 30 | image, label = image.to(device), label.to(device) 31 | optimizer.zero_grad() 32 | output = model(image) 33 | label = label.long() 34 | loss = criterion(output, label) 35 | loss.backward() 36 | optimizer.step() 37 | # Log 38 | if batch_idx % 10 == 0: 39 | print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'. 40 | format(epoch, 41 | batch_idx * len(image), 42 | len(train_loader.dataset), 43 | 100. * batch_idx / len(train_loader), 44 | loss.item()) 45 | ) 46 | 47 | 48 | def Test(model, test_loader, criterion, device, epoch): 49 | """ 50 | Evaluate the model 51 | :param model: the model to evaluate 52 | :param test_loader: test data loader 53 | :param criterion: loss function 54 | :param device: GPU or CPU 55 | :param epoch: 56 | :return: 57 | """ 58 | model.eval() 59 | test_loss = 0 60 | correct = 0 61 | confusion_matrix = np.zeros((21, 21)) 62 | flag = 0 63 | with torch.no_grad(): 64 | for image, label in test_loader: 65 | image, label = image.to(device), label.to(device) 66 | output = model(image) 67 | label = label.long() 68 | loss = criterion(output, label) 69 | test_loss += loss.item() 70 | pred = output.argmax(dim=1, keepdim=True) 71 | # PA: pixel accuracy 72 | num_class = 21 73 | pre_image = pred.squeeze(1).cpu().numpy() 74 | 75 | gt_image = label.cpu().numpy() 76 | 77 | confusion_matrix = fast_hist(gt_image, pre_image, num_class) 78 | # plt.close() 79 | PA = np.diag(confusion_matrix).sum() / confusion_matrix.sum() 80 | test_loss /= len(test_loader.dataset) 81 | 82 | print('\nTest set: Average loss: {:.4f}, PA: {}\n'.
83 | format(loss, 84 | PA, 85 | ) 86 | ) 87 | 88 | 89 | def fast_hist(a, b, n): 90 | k = (a >= 0) & (a < n) 91 | return np.bincount(n * a[k].astype(int) + b[k], minlength=n ** 2).reshape(n, n) 92 | 93 | 94 | def main(): 95 | # 加载数据 96 | 97 | 98 | # 使用drop_last让Batch能够整除 99 | train_loader = torch.utils.data.DataLoader(train_data, batch_size=16, drop_last=True) 100 | test_loader = torch.utils.data.DataLoader(test_data, batch_size=16, drop_last=True) 101 | 102 | # 设置GPU 103 | torch.cuda.set_device(0) 104 | device = torch.device("cuda") 105 | # 初始化模型,损失,优化器 106 | model = 107 | loss = nn.CrossEntropyLoss(ignore_index=255, reduction='mean').to(device) 108 | optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4) 109 | # optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.8, weight_decay=5e-4) 110 | # 开始训练 111 | for epoch in range(40): 112 | Train(model, train_loader=train_loader, 113 | criterion=loss, optimizer=optimizer, 114 | device=device, epoch=epoch) 115 | Test(model, test_loader, loss, device, epoch) 116 | 117 | 118 | if __name__ == '__main__': 119 | main() -------------------------------------------------------------------------------- /other/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8.12) 2 | 3 | PROJECT(handPose) 4 | 5 | find_package( OpenCV REQUIRED ) 6 | 7 | include_directories( ${OpenCV_INCLUDE_DIRS}) 8 | 9 | MACRO(add_example name) 10 | ADD_EXECUTABLE(${name} ${name}.cpp) 11 | TARGET_LINK_LIBRARIES(${name} ${OpenCV_LIBS}) 12 | ENDMACRO() 13 | 14 | 15 | add_example(handPoseImage) 16 | add_example(handPoseVideo) 17 | -------------------------------------------------------------------------------- /other/Hand_Caffe/1_F_deploy.prototxt: -------------------------------------------------------------------------------- 1 | # This file gives the CNN model to predict all landmark in LEVEL-1 2 | name: "landmark_1_F" 3 | layer { 4 | name: "data" 5 | type: "MemoryData" 6 | top: "data" 7 | top: "landmark" 8 | 9 | memory_data_param { 10 | batch_size: 1 11 | channels: 1 12 | height: 39 13 | width: 39 14 | } 15 | transform_param { 16 | scale: 0.00390625 17 | } 18 | } 19 | layer { 20 | name: "conv1" 21 | type: "Convolution" 22 | bottom: "data" 23 | top: "conv1" 24 | param { 25 | lr_mult: 1 26 | } 27 | param { 28 | lr_mult: 2 29 | } 30 | convolution_param { 31 | num_output: 20 32 | kernel_size: 4 33 | weight_filler { 34 | type: "xavier" 35 | } 36 | bias_filler { 37 | type: "constant" 38 | } 39 | } 40 | } 41 | layer { 42 | name: "relu1" 43 | type: "ReLU" 44 | bottom: "conv1" 45 | top: "conv1" 46 | } 47 | layer { 48 | name: "pool1" 49 | type: "Pooling" 50 | bottom: "conv1" 51 | top: "pool1" 52 | pooling_param { 53 | pool: MAX 54 | kernel_size: 2 55 | stride: 2 56 | } 57 | } 58 | layer { 59 | name: "conv2" 60 | type: "Convolution" 61 | bottom: "pool1" 62 | top: "conv2" 63 | param { 64 | lr_mult: 1 65 | } 66 | param { 67 | lr_mult: 2 68 | } 69 | convolution_param { 70 | num_output: 40 71 | kernel_size: 3 72 | weight_filler { 73 | type: "xavier" 74 | } 75 | bias_filler { 76 | type: "constant" 77 | } 78 | } 79 | } 80 | layer { 81 | name: "relu2" 82 | type: "ReLU" 83 | bottom: "conv2" 84 | top: "conv2" 85 | } 86 | layer { 87 | name: "pool2" 88 | type: "Pooling" 89 | bottom: "conv2" 90 | top: "pool2" 91 | pooling_param { 92 | pool: MAX 93 | kernel_size: 2 94 | stride: 2 95 | } 96 | } 97 | layer { 98 | name: "conv3" 99 | type: "Convolution" 100 | bottom: "pool2" 101 | top: "conv3" 102 | param 
{ 103 | lr_mult: 1 104 | } 105 | param { 106 | lr_mult: 2 107 | } 108 | convolution_param { 109 | num_output: 60 110 | kernel_size: 3 111 | weight_filler { 112 | type: "xavier" 113 | } 114 | bias_filler { 115 | type: "constant" 116 | } 117 | } 118 | } 119 | layer { 120 | name: "relu3" 121 | type: "ReLU" 122 | bottom: "conv3" 123 | top: "conv3" 124 | } 125 | layer { 126 | name: "pool3" 127 | type: "Pooling" 128 | bottom: "conv3" 129 | top: "pool3" 130 | pooling_param { 131 | pool: MAX 132 | kernel_size: 2 133 | stride: 2 134 | } 135 | } 136 | layer { 137 | name: "conv4" 138 | type: "Convolution" 139 | bottom: "pool3" 140 | top: "conv4" 141 | param { 142 | lr_mult: 1 143 | } 144 | param { 145 | lr_mult: 2 146 | } 147 | convolution_param { 148 | num_output: 80 149 | kernel_size: 2 150 | weight_filler { 151 | type: "xavier" 152 | } 153 | bias_filler { 154 | type: "constant" 155 | } 156 | } 157 | } 158 | layer { 159 | name: "relu4" 160 | type: "ReLU" 161 | bottom: "conv4" 162 | top: "conv4" 163 | } 164 | layer { 165 | name: "pool3_flat" 166 | type: "Flatten" 167 | bottom: "pool3" 168 | top: "pool3_flat" 169 | } 170 | layer { 171 | name: "conv4_flat" 172 | type: "Flatten" 173 | bottom: "conv4" 174 | top: "conv4_flat" 175 | } 176 | layer { 177 | name: "concat" 178 | type: "Concat" 179 | bottom: "pool3_flat" 180 | bottom: "conv4_flat" 181 | top: "faker" 182 | concat_param { 183 | concat_dim: 1 184 | } 185 | } 186 | layer { 187 | name: "fc1" 188 | type: "InnerProduct" 189 | bottom: "faker" 190 | top: "fc1" 191 | param { 192 | lr_mult: 1 193 | } 194 | param { 195 | lr_mult: 2 196 | } 197 | inner_product_param { 198 | num_output: 120 199 | weight_filler { 200 | type: "xavier" 201 | } 202 | bias_filler { 203 | type: "constant" 204 | } 205 | } 206 | } 207 | layer { 208 | name: "relu_fc1" 209 | type: "ReLU" 210 | bottom: "fc1" 211 | top: "fc1" 212 | } 213 | layer { 214 | name: "fc2" 215 | type: "InnerProduct" 216 | bottom: "fc1" 217 | top: "fc2" 218 | param { 219 | lr_mult: 1 220 | } 221 | param { 222 | lr_mult: 2 223 | } 224 | inner_product_param { 225 | num_output: 10 226 | weight_filler { 227 | type: "xavier" 228 | } 229 | bias_filler { 230 | type: "constant" 231 | } 232 | } 233 | } 234 | layer { 235 | name: "relu_fc2" 236 | type: "ReLU" 237 | bottom: "fc2" 238 | top: "fc2" 239 | } 240 | -------------------------------------------------------------------------------- /other/Hand_Caffe/1_F_solver.prototxt: -------------------------------------------------------------------------------- 1 | net: "/home/wild/Face_Landmark/Hand_Test/1_F_train.prototxt" 2 | 3 | test_iter: 25 4 | test_interval: 1000 5 | 6 | base_lr: 0.001 7 | momentum: 0.9 8 | weight_decay: 0.0005 9 | 10 | lr_policy: "inv" 11 | gamma: 0.0001 12 | power: 0.75 13 | 14 | #lr_policy: "step" 15 | #gamma: 0.1 16 | #stepsize: 50000 17 | 18 | display: 200 19 | 20 | max_iter: 50000 21 | 22 | snapshot: 50000 23 | snapshot_prefix: "/home/wild/Face_Landmark/Hand_Test/" 24 | 25 | test_compute_loss: true 26 | 27 | solver_mode: GPU -------------------------------------------------------------------------------- /other/Hand_Caffe/1_F_train.prototxt: -------------------------------------------------------------------------------- 1 | # This file gives the CNN model to predict all landmark in LEVEL-1 2 | name: "landmark_1_F" 3 | layer { 4 | name: "hdf5_train_data" 5 | type: "HDF5Data" 6 | top: "data" 7 | top: "landmark" 8 | include { 9 | phase: TRAIN 10 | } 11 | hdf5_data_param { 12 | source: "/home/wild/Face_Landmark/Hand_Test/Mytrain/1_F/train.txt" 13 | 
batch_size: 64 14 | } 15 | } 16 | layer { 17 | name: "hdf5_test_data" 18 | type: "HDF5Data" 19 | top: "data" 20 | top: "landmark" 21 | include { 22 | phase: TEST 23 | } 24 | hdf5_data_param { 25 | source: "/home/wild/Face_Landmark/Hand_Test/Mytrain/1_F/test.txt" 26 | batch_size: 64 27 | } 28 | } 29 | layer { 30 | name: "conv1" 31 | type: "Convolution" 32 | bottom: "data" 33 | top: "conv1" 34 | param { 35 | lr_mult: 1 36 | } 37 | param { 38 | lr_mult: 2 39 | } 40 | convolution_param { 41 | num_output: 20 42 | kernel_size: 4 43 | weight_filler { 44 | type: "xavier" 45 | } 46 | bias_filler { 47 | type: "constant" 48 | } 49 | } 50 | } 51 | layer { 52 | name: "relu1" 53 | type: "ReLU" 54 | bottom: "conv1" 55 | top: "conv1" 56 | } 57 | layer { 58 | name: "pool1" 59 | type: "Pooling" 60 | bottom: "conv1" 61 | top: "pool1" 62 | pooling_param { 63 | pool: MAX 64 | kernel_size: 2 65 | stride: 2 66 | } 67 | } 68 | layer { 69 | name: "conv2" 70 | type: "Convolution" 71 | bottom: "pool1" 72 | top: "conv2" 73 | param { 74 | lr_mult: 1 75 | } 76 | param { 77 | lr_mult: 2 78 | } 79 | convolution_param { 80 | num_output: 40 81 | kernel_size: 3 82 | weight_filler { 83 | type: "xavier" 84 | } 85 | bias_filler { 86 | type: "constant" 87 | } 88 | } 89 | } 90 | layer { 91 | name: "relu2" 92 | type: "ReLU" 93 | bottom: "conv2" 94 | top: "conv2" 95 | } 96 | layer { 97 | name: "pool2" 98 | type: "Pooling" 99 | bottom: "conv2" 100 | top: "pool2" 101 | pooling_param { 102 | pool: MAX 103 | kernel_size: 2 104 | stride: 2 105 | } 106 | } 107 | layer { 108 | name: "conv3" 109 | type: "Convolution" 110 | bottom: "pool2" 111 | top: "conv3" 112 | param { 113 | lr_mult: 1 114 | } 115 | param { 116 | lr_mult: 2 117 | } 118 | convolution_param { 119 | num_output: 60 120 | kernel_size: 3 121 | weight_filler { 122 | type: "xavier" 123 | } 124 | bias_filler { 125 | type: "constant" 126 | } 127 | } 128 | } 129 | layer { 130 | name: "relu3" 131 | type: "ReLU" 132 | bottom: "conv3" 133 | top: "conv3" 134 | } 135 | layer { 136 | name: "pool3" 137 | type: "Pooling" 138 | bottom: "conv3" 139 | top: "pool3" 140 | pooling_param { 141 | pool: MAX 142 | kernel_size: 2 143 | stride: 2 144 | } 145 | } 146 | layer { 147 | name: "conv4" 148 | type: "Convolution" 149 | bottom: "pool3" 150 | top: "conv4" 151 | param { 152 | lr_mult: 1 153 | } 154 | param { 155 | lr_mult: 2 156 | } 157 | convolution_param { 158 | num_output: 80 159 | kernel_size: 2 160 | weight_filler { 161 | type: "xavier" 162 | } 163 | bias_filler { 164 | type: "constant" 165 | } 166 | } 167 | } 168 | layer { 169 | name: "relu4" 170 | type: "ReLU" 171 | bottom: "conv4" 172 | top: "conv4" 173 | } 174 | layer { 175 | name: "pool3_flat" 176 | type: "Flatten" 177 | bottom: "pool3" 178 | top: "pool3_flat" 179 | } 180 | layer { 181 | name: "conv4_flat" 182 | type: "Flatten" 183 | bottom: "conv4" 184 | top: "conv4_flat" 185 | } 186 | layer { 187 | name: "concat" 188 | type: "Concat" 189 | bottom: "pool3_flat" 190 | bottom: "conv4_flat" 191 | top: "faker" 192 | concat_param { 193 | concat_dim: 1 194 | } 195 | } 196 | layer { 197 | name: "fc1" 198 | type: "InnerProduct" 199 | bottom: "faker" 200 | top: "fc1" 201 | param { 202 | lr_mult: 1 203 | } 204 | param { 205 | lr_mult: 2 206 | } 207 | inner_product_param { 208 | num_output: 120 209 | weight_filler { 210 | type: "xavier" 211 | } 212 | bias_filler { 213 | type: "constant" 214 | } 215 | } 216 | } 217 | layer { 218 | name: "relu_fc1" 219 | type: "ReLU" 220 | bottom: "fc1" 221 | top: "fc1" 222 | } 223 | layer { 224 | name: "fc2" 225 
| type: "InnerProduct" 226 | bottom: "fc1" 227 | top: "fc2" 228 | param { 229 | lr_mult: 1 230 | } 231 | param { 232 | lr_mult: 2 233 | } 234 | inner_product_param { 235 | num_output: 10 236 | weight_filler { 237 | type: "xavier" 238 | } 239 | bias_filler { 240 | type: "constant" 241 | } 242 | } 243 | } 244 | layer { 245 | name: "relu_fc2" 246 | type: "ReLU" 247 | bottom: "fc2" 248 | top: "fc2" 249 | } 250 | layer { 251 | name: "error" 252 | type: "EuclideanLoss" 253 | bottom: "fc2" 254 | bottom: "landmark" 255 | top: "error" 256 | include { 257 | phase: TEST 258 | } 259 | } 260 | layer { 261 | name: "loss" 262 | type: "EuclideanLoss" 263 | bottom: "fc2" 264 | bottom: "landmark" 265 | top: "loss" 266 | include { 267 | phase: TRAIN 268 | } 269 | } 270 | -------------------------------------------------------------------------------- /other/Hand_Caffe/create_txt.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import cv2 4 | import numpy 5 | 6 | def make_bbox_bigger(data, xR, yR, wR, hR): 7 | 8 | xDelta = data[0] * xR 9 | yDelta = data[1] * yR 10 | wDelta = data[2] * wR 11 | hDelta = data[3] * hR 12 | 13 | x = data[0] + xDelta 14 | y = data[1] + yDelta 15 | w = data[2] + wDelta 16 | h = data[3] + hDelta 17 | return [int(x), int(y), int(w), int(h)] 18 | 19 | def get_json_point(json_path): 20 | hand_data_out = {} 21 | hand_return = {} 22 | str_point = '' 23 | cnt = numpy.zeros((21, 2), dtype=int) 24 | with open(json_path, 'r') as f: 25 | hand_data = json.load(f) 26 | 27 | for i in range(21): 28 | hand_data_out[i] = hand_data['hand_pts'][i][:2] 29 | 30 | for j in range(21): 31 | for i in range(2): 32 | hand_data_out[j][i] = int(hand_data_out[j][i]) 33 | 34 | for i in range(21): 35 | cnt[i] = numpy.array(hand_data_out[i]) 36 | 37 | index = [4, 8, 12, 16, 20] 38 | new_a = numpy.delete(cnt, index, axis=0) 39 | x, y, w, h = cv2.boundingRect(new_a) 40 | x, y, w, h = make_bbox_bigger([x, y, w, h], -0.08, -0.08, 0.8, 0.8) 41 | 42 | hand_return[0] = hand_data_out[1] 43 | hand_return[1] = hand_data_out[7] 44 | hand_return[2] = hand_data_out[11] 45 | hand_return[3] = hand_data_out[15] 46 | hand_return[4] = hand_data_out[19] 47 | 48 | # box 49 | hand_return[5] = [x, y] 50 | hand_return[6] = [w, h] 51 | for key, value in hand_return.items(): 52 | for i in range(2): 53 | str_point += str(value[i]) 54 | str_point += ' ' 55 | 56 | return str_point 57 | 58 | 59 | if __name__ == '__main__': 60 | data_sources = ['synth1', 'synth2', 'synth3', 'synth4'] 61 | root_dir = '/home/wild/Hand-Keypoint-Estimation/Hands from Synthetic Data (6546 + 3243 + 2348 + 2124 = 14261 annotations)/hand_labels_synth' 62 | 63 | data = [] 64 | 65 | for data_source in data_sources: 66 | im_dir = os.path.join(root_dir, data_source) 67 | for im_file in os.listdir(im_dir): 68 | if '.jpg' in im_file: 69 | name = im_file.rstrip('.jpg') 70 | json_file_path = os.path.join(root_dir, data_source, name + '.json') 71 | im_file_path = os.path.join(data_source, name + '.jpg') 72 | point = get_json_point(json_file_path) 73 | data.append(" ".join([im_file_path, point])) 74 | 75 | with open('{}/data.txt'.format(root_dir), 'w') as f: 76 | for image_point in data: 77 | f.write('{}\r\n'.format(image_point)) 78 | 79 | train = data[:int(len(data) * 0.7)] 80 | test = data[int(len(data) * 0.7):] 81 | 82 | with open('{}/train.txt'.format(root_dir), 'w') as f: 83 | for image_point in data: 84 | f.write('{}\r\n'.format(image_point)) 85 | 86 | with open('{}/test.txt'.format(root_dir), 'w') as f: 
87 | for image_point in data: 88 | f.write('{}\r\n'.format(image_point)) 89 | 90 | 91 | # random.shuffle(test_data) 92 | # random.shuffle(test_data) 93 | # random.shuffle(train_data) 94 | # random.shuffle(train_data) 95 | # 96 | # with open('test.txt', 'w') as f: 97 | # f.write('\n'.join(test_data)) 98 | # with open('trainval.txt', 'w') as f: 99 | # f.write('\n'.join(train_data)) 100 | -------------------------------------------------------------------------------- /other/Hand_Caffe/getBox.py: -------------------------------------------------------------------------------- 1 | import json 2 | import cv2 3 | import numpy 4 | import matplotlib.pyplot as plt 5 | 6 | im_dir = '/home/wild/Hand-Keypoint-Estimation/Hands from Synthetic Data (6546 + 3243 + 2348 + 2124 = 14261 annotations)/hand_labels_synth/synth1/0001.jpg' 7 | json_dir = '/home/wild/Hand-Keypoint-Estimation/Hands from Synthetic Data (6546 + 3243 + 2348 + 2124 = 14261 annotations)/hand_labels_synth/synth1/0001.json' 8 | hand_data_out = {} 9 | 10 | hand_data_out = {} 11 | cnt = numpy.zeros((21, 2), dtype=int) 12 | with open(json_dir, 'r') as f: 13 | hand_data = json.load(f) 14 | 15 | for i in range(21): 16 | hand_data_out[i] = hand_data['hand_pts'][i][:2] 17 | 18 | for j in range(21): 19 | for i in range(2): 20 | hand_data_out[j][i] = int(hand_data_out[j][i]) 21 | 22 | for i in range(21): 23 | cnt[i] = numpy.array(hand_data_out[i]) 24 | 25 | index = [4, 8, 12, 16, 20] 26 | new_a = numpy.delete(cnt, index, axis=0) 27 | img = cv2.imread(im_dir) 28 | x, y, w, h = cv2.boundingRect(new_a) 29 | 30 | 31 | def make_bbox_bigger(data, xR, yR, wR, hR): 32 | 33 | xDelta = data[0] * xR 34 | yDelta = data[1] * yR 35 | wDelta = data[2] * wR 36 | hDelta = data[3] * hR 37 | 38 | x = data[0] + xDelta 39 | y = data[1] + yDelta 40 | w = data[2] + wDelta 41 | h = data[3] + hDelta 42 | return [int(x), int(y), int(w), int(h)] 43 | 44 | x, y, w, h = make_bbox_bigger([x, y, w, h], -0.08, -0.08, 0.08, 0.08) 45 | 46 | cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 1) 47 | 48 | cv2.imwrite('hand.jpeg', img) 49 | plt.imshow(img) 50 | plt.show() 51 | 52 | 53 | -------------------------------------------------------------------------------- /other/Hand_Caffe/hand.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/other/Hand_Caffe/hand.jpeg -------------------------------------------------------------------------------- /other/Hand_Caffe/level1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | # coding: utf-8 3 | 4 | 5 | import os 6 | from os.path import join, exists 7 | 8 | import cv2 9 | import h5py 10 | import numpy as np 11 | import matplotlib.pyplot as plt 12 | from utils import getDataFromTxt 13 | from utils import shuffle_in_unison_scary, logger, createDir, processImage 14 | 15 | TRAIN = '/home/wild/Hand-Keypoint-Estimation/Hands from Synthetic Data (6546 + 3243 + 2348 + 2124 = 14261 annotations)/hand_labels_synth' 16 | OUTPUT = '/home/wild/Face_Landmark/Hand_Test/Mytrain' 17 | if not exists(OUTPUT): 18 | os.mkdir(OUTPUT) 19 | assert(exists(TRAIN) and exists(OUTPUT)) 20 | 21 | 22 | def generate_hdf5(ftxt, output, fname, argument=False): 23 | 24 | data = getDataFromTxt(ftxt) 25 | F_imgs = [] 26 | F_landmarks = [] 27 | 28 | for (imgPath, landmarkGt, bbox) in data: 29 | img = cv2.imread(imgPath, cv2.IMREAD_GRAYSCALE) 30 | assert(img is not 
None) 31 | logger("process %s" % imgPath) 32 | # plt.imshow(img) 33 | # plt.show() 34 | 35 | f_face = img[int(bbox[0]):int(bbox[2]), int(bbox[1]):int(bbox[3])] 36 | plt.imshow(f_face) 37 | plt.show() 38 | 39 | f_face = cv2.resize(f_face, (39, 39)) 40 | 41 | f_face = f_face.reshape((1, 39, 39)) 42 | 43 | f_landmark = landmarkGt.reshape((10)) 44 | F_imgs.append(f_face) 45 | F_landmarks.append(f_landmark) 46 | 47 | 48 | 49 | F_imgs, F_landmarks = np.asarray(F_imgs), np.asarray(F_landmarks) 50 | 51 | 52 | F_imgs = processImage(F_imgs) 53 | shuffle_in_unison_scary(F_imgs, F_landmarks) 54 | 55 | 56 | # full face 57 | base = join(OUTPUT, '1_F') 58 | createDir(base) 59 | output = join(base, fname) 60 | logger("generate %s" % output) 61 | 62 | 63 | with h5py.File(output, 'w') as h5: 64 | h5['data'] = F_imgs.astype(np.float32) 65 | h5['landmark'] = F_landmarks.astype(np.float32) 66 | 67 | 68 | 69 | if __name__ == '__main__': 70 | 71 | h5_path = '/home/wild/Face_Landmark/Hand_Test/Mytrain' 72 | # 训练集 73 | train_txt = join(TRAIN, 'train.txt') 74 | generate_hdf5(train_txt, OUTPUT, 'train.h5', argument=True) 75 | # 测试集 76 | test_txt = join(TRAIN, 'test.txt') 77 | generate_hdf5(test_txt, OUTPUT, 'test.h5') 78 | 79 | with open(join(OUTPUT, '1_F/train.txt'), 'w') as fd: 80 | fd.write(h5_path+'/1_F/train.h5') 81 | 82 | with open(join(OUTPUT, '1_F/test.txt'), 'w') as fd: 83 | fd.write(h5_path+'/1_F/test.h5') 84 | 85 | print 'ok' -------------------------------------------------------------------------------- /other/Hand_Caffe/read_im_json.py: -------------------------------------------------------------------------------- 1 | import json 2 | import cv2 3 | im_dir = '/home/wild/Hand-Keypoint-Estimation/Hands from Synthetic Data (6546 + 3243 + 2348 + 2124 = 14261 annotations)/hand_labels_synth/synth1/0001.jpg' 4 | json_dir = '/home/wild/Hand-Keypoint-Estimation/Hands from Synthetic Data (6546 + 3243 + 2348 + 2124 = 14261 annotations)/hand_labels_synth/synth1/0001.json' 5 | hand_data_out = {} 6 | with open(json_dir, 'r') as f: 7 | hand_data = json.load(f) 8 | 9 | for i in range(21): 10 | hand_data_out[i] = hand_data['hand_pts'][i][:2] 11 | 12 | for j in range(21): 13 | for i in range(2): 14 | hand_data_out[j][i] = int(hand_data_out[j][i]) 15 | 16 | def get_json_point(json_path): 17 | hand_data_out = {} 18 | hand_return = {} 19 | str_point = '' 20 | with open(json_dir, 'r') as f: 21 | hand_data = json.load(f) 22 | 23 | for i in range(21): 24 | hand_data_out[i] = hand_data['hand_pts'][i][:2] 25 | 26 | for j in range(21): 27 | for i in range(2): 28 | hand_data_out[j][i] = int(hand_data_out[j][i]) 29 | 30 | hand_return[0] = hand_data_out[1] 31 | hand_return[1] = hand_data_out[7] 32 | hand_return[2] = hand_data_out[11] 33 | hand_return[3] = hand_data_out[15] 34 | hand_return[4] = hand_data_out[19] 35 | for key, value in hand_return.items(): 36 | for i in range(2): 37 | str_point += str(value[i]) 38 | str_point += ' ' 39 | 40 | return str_point 41 | 42 | data = get_json_point(json_dir) 43 | 44 | # output = cv2.imread(im_dir) 45 | # for i in range(5): 46 | # cv2.circle(output, tuple(data[i]), 2, (0, 0, 255), 1) 47 | # cv2.imshow("capture", output) 48 | # while True: 49 | # if cv2.waitKey(1) == 27: 50 | # break # esc to quit -------------------------------------------------------------------------------- /other/Hand_Caffe/utils.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import os 4 | import time 5 | 6 | import cv2 7 | import numpy as np 8 | 9 | 10 | 
def logger(msg): 11 | """ 12 | log message 13 | """ 14 | now = time.ctime() 15 | print("[%s] %s" % (now, msg)) 16 | 17 | 18 | def createDir(p): 19 | if not os.path.exists(p): 20 | os.mkdir(p) 21 | 22 | 23 | def shuffle_in_unison_scary(a, b): 24 | rng_state = np.random.get_state() 25 | np.random.shuffle(a) 26 | np.random.set_state(rng_state) 27 | np.random.shuffle(b) 28 | 29 | 30 | def drawLandmark(img, bbox, landmark): 31 | cv2.rectangle(img, (bbox.left, bbox.top), (bbox.right, bbox.bottom), (0, 0, 255), 2) 32 | for x, y in landmark: 33 | cv2.circle(img, (int(x), int(y)), 2, (0, 255, 0), -1) 34 | return img 35 | 36 | 37 | def getDataFromTxt(txt, with_landmark=True): 38 | """ 39 | Generate data from txt file 40 | return [(img_path, landmark)] 41 | landmark: [(x1, y1), (x2, y2), ...] 42 | """ 43 | dirname = os.path.dirname(txt) 44 | with open(txt, 'r') as fd: 45 | lines = fd.readlines() 46 | 47 | result = [] 48 | for line in lines: 49 | line = line.strip() 50 | components = line.split(' ') 51 | img_path = os.path.join(dirname, components[0].replace('\\', '/')) # file path 52 | # bounding box, (x, y, w, h) 53 | bbox = (components[11], components[12], int(components[11]) + int(components[13]), int(components[12]) + int(components[14])) 54 | # 将字符串转换成整型 55 | bbox = [int(_) for _ in bbox] 56 | # landmark 57 | if not with_landmark: 58 | result.append((img_path, BBox(bbox))) 59 | continue 60 | landmark = np.zeros((5, 2)) 61 | # 读取关键点坐标 62 | for index in range(0, 5): 63 | rv = (float(components[1 + 2 * index]), float(components[1 + 2 * index + 1])) 64 | landmark[index] = rv 65 | for index, one in enumerate(landmark): 66 | rv = ((one[0] - bbox[0]) / (bbox[1] - bbox[0]), (one[1] - bbox[2]) / (bbox[3] - bbox[2])) 67 | landmark[index] = rv 68 | result.append((img_path, landmark, bbox)) 69 | return result 70 | 71 | 72 | def getPatch(img, bbox, point, padding): 73 | """ 74 | Get a patch iamge around the given point in bbox with padding 75 | point: relative_point in [0, 1] in bbox 76 | """ 77 | point_x = bbox.x + point[0] * bbox.w 78 | point_y = bbox.y + point[1] * bbox.h 79 | patch_left = point_x - bbox.w * padding 80 | patch_right = point_x + bbox.w * padding 81 | patch_top = point_y - bbox.h * padding 82 | patch_bottom = point_y + bbox.h * padding 83 | patch = img[patch_top: patch_bottom + 1, patch_left: patch_right + 1] 84 | patch_bbox = BBox([patch_left, patch_right, patch_top, patch_bottom]) 85 | return patch, patch_bbox 86 | 87 | 88 | def processImage(imgs): 89 | """ 90 | process images before feeding to CNNs 91 | imgs: N x 1 x W x H 92 | """ 93 | imgs = imgs.astype(np.float32) 94 | for i, img in enumerate(imgs): 95 | m = img.mean() 96 | s = img.std() 97 | imgs[i] = (img - m) / s 98 | return imgs 99 | 100 | 101 | def dataArgument(data): 102 | """ 103 | dataArguments 104 | data: 105 | imgs: N x 1 x W x H 106 | bbox: N x BBox 107 | landmarks: N x 10 108 | """ 109 | pass 110 | 111 | 112 | class BBox(object): 113 | """ 114 | Bounding Box of face 115 | """ 116 | 117 | def __init__(self, bbox): 118 | self.left = bbox[0] 119 | self.right = bbox[1] 120 | self.top = bbox[2] 121 | self.bottom = bbox[3] 122 | self.x = bbox[0] 123 | self.y = bbox[1] 124 | self.w = bbox[2] - bbox[0] 125 | self.h = bbox[3] - bbox[1] 126 | 127 | def expand(self, scale=0.05): 128 | bbox = [self.left, self.right, self.top, self.bottom] 129 | bbox[0] -= int(self.w * scale) 130 | bbox[1] += int(self.w * scale) 131 | bbox[2] -= int(self.h * scale) 132 | bbox[3] += int(self.h * scale) 133 | return BBox(bbox) 134 | 135 | def 
project(self, point): 136 | x = (point[0] - self.x) / self.w 137 | y = (point[1] - self.y) / self.h 138 | return np.asarray([x, y]) 139 | 140 | def reproject(self, point): 141 | x = self.x + self.w * point[0] 142 | y = self.y + self.h * point[1] 143 | return np.asarray([x, y]) 144 | 145 | def reprojectLandmark(self, landmark): 146 | p = np.zeros((len(landmark), 2)) 147 | for i in range(len(landmark)): 148 | p[i] = self.reproject(landmark[i]) 149 | return p 150 | 151 | def projectLandmark(self, landmark): 152 | p = np.zeros((len(landmark), 2)) 153 | for i in range(len(landmark)): 154 | p[i] = self.project(landmark[i]) 155 | return p 156 | 157 | # Expand the bounding box 158 | def subBBox(self, leftR, rightR, topR, bottomR): 159 | leftDelta = self.w * leftR 160 | rightDelta = self.w * rightR 161 | topDelta = self.h * topR 162 | bottomDelta = self.h * bottomR 163 | left = self.left + leftDelta 164 | right = self.left + rightDelta 165 | top = self.top + topDelta 166 | bottom = self.top + bottomDelta 167 | return BBox([left, right, top, bottom]) 168 | -------------------------------------------------------------------------------- /other/Hand_Detection/README.md: -------------------------------------------------------------------------------- 1 | ### [SSD-Hand-Detection](https://github.com/weiliu89/caffe/tree/ssd) 2 | #### Dataset 3 | * [egohands](http://vision.soic.indiana.edu/projects/egohands/) 4 | * [stanfordhands](http://www.robots.ox.ac.uk/~vgg/data/hands/) 5 | 6 | #### Preprocess 7 | * Keep only hands with `min(hand width, hand height) > threshold`: for egohands, `threshold=40`; for stanfordhands, `threshold=20` (see the sketch after the Train section below). 8 | * The cleaned dataset can be downloaded from [onedrive](). 9 | * Run `create_txt.py` to generate `test.txt` and `trainval.txt`. 10 | * Finally, run `./create_data.sh` to generate the lmdb files in the `data/lmdb` folder. 11 | 12 | #### Train 13 | * A pretrained model is provided by the SSD author, trained on [PASCAL VOC 2007 and 2012](http://host.robots.ox.ac.uk/pascal/VOC/). [Download Link]().
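For clarity, the size filter described under Preprocess above amounts to the following (a hypothetical helper for illustration, not code from this repository):

```
# Keep a hand box only if its shorter side exceeds the per-dataset threshold:
# 40 px for egohands, 20 px for stanfordhands. Names are illustrative.
def keep_hand_box(width, height, dataset):
    threshold = 40 if dataset == 'egohands' else 20  # 'stanfordhands'
    return min(width, height) > threshold
```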
14 | 15 | #### demo 16 | ![](pic/demo.jpg) -------------------------------------------------------------------------------- /other/Hand_Detection/data/create_annoset.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import shutil 4 | import subprocess 5 | import sys 6 | # get caffe root directory 7 | caffe_root = '../caffe' 8 | sys.path.insert(0, os.path.join(caffe_root, 'python')) 9 | from caffe.proto import caffe_pb2 10 | from google.protobuf import text_format 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser(description="Create AnnotatedDatum database") 14 | parser.add_argument("root", 15 | help="The root directory which contains the images and annotations.") 16 | parser.add_argument("listfile", 17 | help="The file which contains image paths and annotation info.") 18 | parser.add_argument("outdir", 19 | help="The output directory which stores the database file.") 20 | parser.add_argument("exampledir", 21 | help="The directory to store the link of the database files.") 22 | parser.add_argument("--redo", default = False, action = "store_true", 23 | help="Recreate the database.") 24 | parser.add_argument("--anno-type", default = "classification", 25 | help="The type of annotation {classification, detection}.") 26 | parser.add_argument("--label-type", default = "xml", 27 | help="The type of label file format for detection {xml, json, txt}.") 28 | parser.add_argument("--backend", default = "lmdb", 29 | help="The backend {lmdb, leveldb} for storing the result") 30 | parser.add_argument("--check-size", default = False, action = "store_true", 31 | help="Check that all the datum have the same size.") 32 | parser.add_argument("--encode-type", default = "", 33 | help="What type should we encode the image as ('png','jpg',...).") 34 | parser.add_argument("--encoded", default = False, action = "store_true", 35 | help="The encoded image will be save in datum.") 36 | parser.add_argument("--gray", default = False, action = "store_true", 37 | help="Treat images as grayscale ones.") 38 | parser.add_argument("--label-map-file", default = "", 39 | help="A file with LabelMap protobuf message.") 40 | parser.add_argument("--min-dim", default = 0, type = int, 41 | help="Minimum dimension images are resized to.") 42 | parser.add_argument("--max-dim", default = 0, type = int, 43 | help="Maximum dimension images are resized to.") 44 | parser.add_argument("--resize-height", default = 0, type = int, 45 | help="Height images are resized to.") 46 | parser.add_argument("--resize-width", default = 0, type = int, 47 | help="Width images are resized to.") 48 | parser.add_argument("--shuffle", default = False, action = "store_true", 49 | help="Randomly shuffle the order of images and their labels.") 50 | parser.add_argument("--check-label", default = False, action = "store_true", 51 | help="Check that there is no duplicated name/label.") 52 | 53 | args = parser.parse_args() 54 | root_dir = args.root 55 | list_file = args.listfile 56 | out_dir = args.outdir 57 | example_dir = args.exampledir 58 | 59 | redo = args.redo 60 | anno_type = args.anno_type 61 | label_type = args.label_type 62 | backend = args.backend 63 | check_size = args.check_size 64 | encode_type = args.encode_type 65 | encoded = args.encoded 66 | gray = args.gray 67 | label_map_file = args.label_map_file 68 | min_dim = args.min_dim 69 | max_dim = args.max_dim 70 | resize_height = args.resize_height 71 | resize_width = args.resize_width 72 | shuffle = 
args.shuffle 73 | check_label = args.check_label 74 | 75 | # check if root directory exists 76 | if not os.path.exists(root_dir): 77 | print("root directory: {} does not exist".format(root_dir)) 78 | sys.exit() 79 | # add "/" to root directory if needed 80 | if root_dir[-1] != "/": 81 | root_dir += "/" 82 | # check if list file exists 83 | if not os.path.exists(list_file): 84 | print("list file: {} does not exist".format(list_file)) 85 | sys.exit() 86 | # check list file format is correct 87 | with open(list_file, "r") as lf: 88 | for line in lf.readlines(): 89 | img_file, anno = line.strip("\n").split(" ") 90 | if not os.path.exists(root_dir + img_file): 91 | print("image file: {} does not exist".format(root_dir + img_file)) 92 | if anno_type == "classification": 93 | if not anno.isdigit(): 94 | print("annotation: {} is not an integer".format(anno)) 95 | elif anno_type == "detection": 96 | if not os.path.exists(root_dir + anno): 97 | print("annofation file: {} does not exist".format(root_dir + anno)) 98 | sys.exit() 99 | break 100 | # check if label map file exist 101 | if anno_type == "detection": 102 | if not os.path.exists(label_map_file): 103 | print("label map file: {} does not exist".format(label_map_file)) 104 | sys.exit() 105 | label_map = caffe_pb2.LabelMap() 106 | lmf = open(label_map_file, "r") 107 | try: 108 | text_format.Merge(str(lmf.read()), label_map) 109 | except: 110 | print("Cannot parse label map file: {}".format(label_map_file)) 111 | sys.exit() 112 | out_parent_dir = os.path.dirname(out_dir) 113 | if not os.path.exists(out_parent_dir): 114 | os.makedirs(out_parent_dir) 115 | if os.path.exists(out_dir) and not redo: 116 | print("{} already exists and I do not hear redo".format(out_dir)) 117 | sys.exit() 118 | if os.path.exists(out_dir): 119 | shutil.rmtree(out_dir) 120 | 121 | 122 | if anno_type == "detection": 123 | cmd = "{}/build/tools/convert_annoset" \ 124 | " --anno_type={}" \ 125 | " --label_type={}" \ 126 | " --label_map_file={}" \ 127 | " --check_label={}" \ 128 | " --min_dim={}" \ 129 | " --max_dim={}" \ 130 | " --resize_height={}" \ 131 | " --resize_width={}" \ 132 | " --backend={}" \ 133 | " --shuffle={}" \ 134 | " --check_size={}" \ 135 | " --encode_type={}" \ 136 | " --encoded={}" \ 137 | " --gray={}" \ 138 | " {} {} {}" \ 139 | .format(caffe_root, anno_type, label_type, label_map_file, check_label, 140 | min_dim, max_dim, resize_height, resize_width, backend, shuffle, 141 | check_size, encode_type, encoded, gray, root_dir, list_file, out_dir) 142 | elif anno_type == "classification": 143 | cmd = "{}/build/tools/convert_annoset" \ 144 | " --anno_type={}" \ 145 | " --min_dim={}" \ 146 | " --max_dim={}" \ 147 | " --resize_height={}" \ 148 | " --resize_width={}" \ 149 | " --backend={}" \ 150 | " --shuffle={}" \ 151 | " --check_size={}" \ 152 | " --encode_type={}" \ 153 | " --encoded={}" \ 154 | " --gray={}" \ 155 | " {} {} {}" \ 156 | .format(caffe_root, anno_type, min_dim, max_dim, resize_height, 157 | resize_width, backend, shuffle, check_size, encode_type, encoded, 158 | gray, root_dir, list_file, out_dir) 159 | print(cmd) 160 | process = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE) 161 | output = process.communicate()[0] 162 | 163 | if not os.path.exists(example_dir): 164 | os.makedirs(example_dir) 165 | # link_dir = os.path.join(example_dir, os.path.basename(out_dir)) 166 | # if os.path.exists(link_dir): 167 | # os.unlink(link_dir) 168 | # os.symlink(out_dir, link_dir) 169 | 
-------------------------------------------------------------------------------- /other/Hand_Detection/data/create_data.sh: -------------------------------------------------------------------------------- 1 | redo=1 2 | data_root_dir="." 3 | mapfile="labelmap_voc.prototxt" 4 | anno_type="detection" 5 | db="lmdb" 6 | min_dim=0 7 | max_dim=0 8 | width=0 9 | height=0 10 | 11 | extra_cmd="--encode-type=jpg --encoded" 12 | if [ $redo ] 13 | then 14 | extra_cmd="$extra_cmd --redo" 15 | fi 16 | for subset in test trainval 17 | do 18 | python create_annoset.py --anno-type=$anno_type --label-map-file=$mapfile --min-dim=$min_dim --max-dim=$max_dim --resize-width=$width --resize-height=$height --check-label $extra_cmd $data_root_dir $subset.txt $data_root_dir/$db/$subset"_"$db '.' 19 | done 20 | -------------------------------------------------------------------------------- /other/Hand_Detection/data/create_txt.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | data_sources = ['egohands', 'stanfordhands'] 4 | root_dir = os.path.dirname(os.path.abspath(__file__)) 5 | test_data = [] 6 | train_data = [] 7 | 8 | for data_source in data_sources: 9 | test_im_dir = os.path.join(root_dir, data_source, 'test', 'JPEGImages') 10 | train_im_dir = os.path.join(root_dir, data_source, 'trainval', 'JPEGImages') 11 | for im_file in os.listdir(test_im_dir): 12 | name = im_file.rstrip('.jpg') 13 | xml_file_path = os.path.join(data_source, 'test', 'Annotations', name+'.xml') 14 | im_file_path = os.path.join(data_source, 'test', 'JPEGImages', name+'.jpg') 15 | test_data.append(" ".join([im_file_path, xml_file_path])) 16 | for im_file in os.listdir(train_im_dir): 17 | name = im_file.rstrip('.jpg') 18 | xml_file_path = os.path.join(data_source, 'trainval', 'Annotations', name+'.xml') 19 | im_file_path = os.path.join(data_source, 'trainval', 'JPEGImages', name+'.jpg') 20 | train_data.append(" ".join([im_file_path, xml_file_path])) 21 | 22 | 23 | 24 | random.shuffle(test_data) 25 | random.shuffle(test_data) 26 | random.shuffle(train_data) 27 | random.shuffle(train_data) 28 | 29 | with open('test.txt', 'w') as f: 30 | f.write('\n'.join(test_data)) 31 | with open('trainval.txt', 'w') as f: 32 | f.write('\n'.join(train_data)) 33 | -------------------------------------------------------------------------------- /other/Hand_Detection/data/egohands/_screenshot_17.04.2018.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/other/Hand_Detection/data/egohands/_screenshot_17.04.2018.png -------------------------------------------------------------------------------- /other/Hand_Detection/data/egohands/generate_egohands.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from lxml.etree import Element, SubElement, tostring 4 | import random 5 | import cv2 6 | import shutil 7 | import tqdm 8 | 9 | data_root = '/Users/hzzone/Downloads/egohands_data/_LABELLED_SAMPLES' 10 | with open("egohands_data.txt") as f: 11 | data = f.readlines() 12 | 13 | 14 | random.shuffle(data) 15 | random.shuffle(data) 16 | 17 | test_data = random.sample(data, int(len(data)*0.2)) 18 | train_data = list(set(data) - set(test_data)) 19 | 20 | def trans(data, set_name): 21 | 22 | curr_dir = os.path.dirname(os.path.abspath(__file__)) 23 | os.mkdir(os.path.join(curr_dir, set_name)) 24 
| Annotations_dir = os.path.join(curr_dir, set_name, 'Annotations') 25 | JPEGImages_dir = os.path.join(curr_dir, set_name, 'JPEGImages') 26 | os.mkdir(Annotations_dir) 27 | os.mkdir(JPEGImages_dir) 28 | 29 | for each_pic_data in tqdm.tqdm(data): 30 | # for each_pic_data in data: 31 | data_list = each_pic_data.strip().split() 32 | video_id = data_list[0] 33 | frame_num = str(data_list[1]).zfill(4) 34 | new_img_name = '{}_{}'.format(video_id, frame_num) 35 | frame_num = 'frame_{}.jpg'.format(frame_num) 36 | 37 | 38 | im_path = os.path.join(data_root, video_id, frame_num) 39 | 40 | boxes = np.reshape(np.array(map(int, data_list[2:])), (-1, 4)) 41 | node_root = Element('annotation') 42 | 43 | node_folder = SubElement(node_root, 'folder') 44 | node_folder.text = 'egohands' 45 | 46 | node_filename = SubElement(node_root, 'filename') 47 | node_filename.text = new_img_name 48 | # 49 | node_size = SubElement(node_root, 'size') 50 | node_segmented = SubElement(node_root, 'segmented') 51 | node_segmented.text = '0' 52 | node_width = SubElement(node_size, 'width') 53 | im_height, im_width, channel = cv2.imread(im_path).shape 54 | node_width.text = str(im_width) 55 | # 56 | node_height = SubElement(node_size, 'height') 57 | node_height.text = str(im_height) 58 | # 59 | node_depth = SubElement(node_size, 'depth') 60 | node_depth.text = str(channel) 61 | # 62 | # im = cv2.imread(im_path) 63 | # for index in range(boxes.shape[0]): 64 | # minx, miny, w, h = boxes[index] 65 | # cv2.namedWindow("", 0) 66 | # cv2.resizeWindow('', 300, 300) 67 | # cv2.rectangle(im, (minx, miny), (minx+w-1, miny+h-1), (0, 255, 0), thickness=2) 68 | # print(w, h) 69 | # cv2.imshow('', im) 70 | # cv2.waitKey(0) 71 | 72 | effective_hands = 0 73 | for index in range(boxes.shape[0]): 74 | minx, miny, w, h = boxes[index] 75 | maxx = minx+w-1 76 | maxy = miny+h-1 77 | maxx = im_width if maxx > im_width else maxx 78 | maxy = im_height if maxy > im_height else maxy 79 | minx = 0 if minx < 0 else minx 80 | miny = 0 if miny < 0 else miny 81 | w = maxx-minx+1 82 | h = maxy-miny+1 83 | if min(w, h) < 40: 84 | continue 85 | if maxx <= minx or maxy <= miny: 86 | print(minx, miny) 87 | 88 | effective_hands = effective_hands + 1 89 | node_object = SubElement(node_root, 'object') 90 | node_name = SubElement(node_object, 'name') 91 | node_name.text = 'hand' 92 | node_difficult = SubElement(node_object, 'difficult') 93 | node_difficult.text = '0' 94 | node_bndbox = SubElement(node_object, 'bndbox') 95 | node_xmin = SubElement(node_bndbox, 'xmin') 96 | node_xmin.text = str(minx) 97 | node_ymin = SubElement(node_bndbox, 'ymin') 98 | node_ymin.text = str(miny) 99 | node_xmax = SubElement(node_bndbox, 'xmax') 100 | node_xmax.text = str(maxx) 101 | node_ymax = SubElement(node_bndbox, 'ymax') 102 | node_ymax.text = str(maxy) 103 | 104 | xml = tostring(node_root, pretty_print=True) 105 | # if effective_hands == 0: 106 | # print(im_path) 107 | if effective_hands != 0: 108 | # print(im_path) 109 | with open(Annotations_dir + "/" + new_img_name+'.xml', 'w') as f: 110 | f.write(xml) 111 | shutil.copy(im_path, JPEGImages_dir + '/' + new_img_name + '.jpg') 112 | 113 | trans(train_data, 'trainval') 114 | trans(test_data, 'test') 115 | -------------------------------------------------------------------------------- /other/Hand_Detection/data/egohands/getInfo.m: -------------------------------------------------------------------------------- 1 | video = getMetaBy(); 2 | fid = fopen('egohands_data.txt','w'); 3 | for i=1:1:48 4 | video_id = video(i).video_id; 5 | 
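% Needs the EgoHands "labelled samples" toolkit (getMetaBy / getBoundingBoxes)
% on the MATLAB path. For each of the 48 videos and its 100 labelled frames,
% one line is written to egohands_data.txt:
%   <video_id> <frame_num> <x> <y> <w> <h> ...  (one quadruple per hand, up to 4)
% generate_egohands.py then converts these lines into VOC-style XML annotations.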
for j=1:1:100 6 | fprintf(fid,'%s ', video_id); 7 | frame_num = video(i).labelled_frames(j).frame_num; 8 | fprintf(fid,'%s ', num2str(frame_num)); 9 | boxes = getBoundingBoxes(video(i), j); 10 | for x=1:4 11 | if sum(boxes(x, :)) ~=0 12 | box = boxes(x, :); 13 | fprintf(fid,'%d %d %d %d ', box(1), box(2), box(3), box(4)); 14 | end 15 | end 16 | fprintf(fid,'\n'); 17 | end 18 | end 19 | fclose(fid); 20 | 21 | -------------------------------------------------------------------------------- /other/Hand_Detection/data/gth/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/other/Hand_Detection/data/gth/.gitkeep -------------------------------------------------------------------------------- /other/Hand_Detection/data/labelmap_voc.prototxt: -------------------------------------------------------------------------------- 1 | item { 2 | name: "none_of_the_above" 3 | label: 0 4 | display_name: "background" 5 | } 6 | item { 7 | name: "hand" 8 | label: 1 9 | display_name: "hand" 10 | } -------------------------------------------------------------------------------- /other/Hand_Detection/data/stanfordhands/generate_stanfordhands.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | import scipy.io as sio 3 | import os 4 | import numpy as np 5 | import cv2 6 | from lxml.etree import Element, SubElement, tostring 7 | import shutil 8 | 9 | test_data = ['/Users/hzzone/Downloads/hand_dataset/test_dataset/test_data'] 10 | trainval_data = ['/Users/hzzone/Downloads/hand_dataset/training_dataset/training_data', '/Users/hzzone/Downloads/hand_dataset/validation_dataset/validation_data'] 11 | def trans(data_sources, set_name): 12 | curr_dir = os.path.dirname(os.path.abspath(__file__)) 13 | os.mkdir(os.path.join(curr_dir, set_name)) 14 | Annotations_dir = os.path.join(curr_dir, set_name, 'Annotations') 15 | JPEGImages_dir = os.path.join(curr_dir, set_name, 'JPEGImages') 16 | os.mkdir(Annotations_dir) 17 | os.mkdir(JPEGImages_dir) 18 | # cv2.namedWindow("", 0) 19 | # cv2.resizeWindow('', 300, 300) 20 | for each_source in data_sources: 21 | annotations_source = osp.join(each_source, 'annotations') 22 | img_source = osp.join(each_source, 'images') 23 | for mat_file in os.listdir(annotations_source): 24 | mat_file_path = osp.join(annotations_source, mat_file) 25 | # print(mat_file_path) 26 | img_file_path = osp.join(img_source, mat_file.rstrip('.mat'))+'.jpg' 27 | img = cv2.imread(img_file_path) 28 | boxes_data = sio.loadmat(mat_file_path)["boxes"].flatten() 29 | 30 | 31 | node_root = Element('annotation') 32 | 33 | node_folder = SubElement(node_root, 'folder') 34 | node_folder.text = 'egohands' 35 | 36 | node_filename = SubElement(node_root, 'filename') 37 | node_filename.text = mat_file.strip('.mat')+'.jpg' 38 | # 39 | node_size = SubElement(node_root, 'size') 40 | node_segmented = SubElement(node_root, 'segmented') 41 | node_segmented.text = '0' 42 | node_width = SubElement(node_size, 'width') 43 | im_height, im_width, channel = img.shape 44 | node_width.text = str(im_width) 45 | # 46 | node_height = SubElement(node_size, 'height') 47 | node_height.text = str(im_height) 48 | # 49 | node_depth = SubElement(node_size, 'depth') 50 | node_depth.text = str(channel) 51 | 52 | effective_hands = 0 53 | for box in boxes_data: 54 | tmp = np.reshape(box[0, 0].tolist()[:4], (-1, 2)) 55 | y1 = int(round(min(tmp[:, 0]), 
0)) 56 | y2 = int(round(max(tmp[:, 0]), 0)) 57 | x1 = int(round(min(tmp[:, 1]), 0)) 58 | x2 = int(round(max(tmp[:, 1]), 0)) 59 | # cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), thickness=2) 60 | x2 = im_width if x2 > im_width else x2 61 | y2 = im_height if y2 > im_height else y2 62 | x1 = 0 if x1 < 0 else x1 63 | y1 = 0 if y1 < 0 else y1 64 | 65 | width = x2-x1+1 66 | height = y2-y1+1 67 | 68 | if(min(width, height)<20): 69 | continue 70 | 71 | # if x2>im_width or x1<0 or y2>im_height or y1<0: 72 | # print(x1, x2, y1, y2, width, height, im_height, im_width) 73 | # cv2.imshow("", img) 74 | # cv2.waitKey(0) 75 | if x2<=x1 or y2<=y1: 76 | print(x1, y1) 77 | 78 | 79 | effective_hands = effective_hands + 1 80 | node_object = SubElement(node_root, 'object') 81 | node_name = SubElement(node_object, 'name') 82 | node_name.text = 'hand' 83 | node_difficult = SubElement(node_object, 'difficult') 84 | node_difficult.text = '0' 85 | node_bndbox = SubElement(node_object, 'bndbox') 86 | node_xmin = SubElement(node_bndbox, 'xmin') 87 | node_xmin.text = str(x1) 88 | node_ymin = SubElement(node_bndbox, 'ymin') 89 | node_ymin.text = str(y1) 90 | node_xmax = SubElement(node_bndbox, 'xmax') 91 | node_xmax.text = str(x2) 92 | node_ymax = SubElement(node_bndbox, 'ymax') 93 | node_ymax.text = str(y2) 94 | xml = tostring(node_root, pretty_print=True) 95 | if effective_hands != 0: 96 | with open(Annotations_dir + "/" + mat_file.rstrip('.mat') +'.xml', 'w') as f: 97 | f.write(xml) 98 | shutil.copy(img_file_path, JPEGImages_dir + '/' + mat_file.rstrip('.mat') + '.jpg') 99 | 100 | 101 | trans(trainval_data, 'trainval') 102 | trans(test_data, 'test') -------------------------------------------------------------------------------- /other/Hand_Detection/model/deploy.prototxt: -------------------------------------------------------------------------------- 1 | name: "VGG_HAND_SSD_300x300_deploy" 2 | input: "data" 3 | input_shape { 4 | dim: 1 5 | dim: 3 6 | dim: 300 7 | dim: 300 8 | } 9 | layer { 10 | name: "conv1_1" 11 | type: "Convolution" 12 | bottom: "data" 13 | top: "conv1_1" 14 | param { 15 | lr_mult: 1.0 16 | decay_mult: 1.0 17 | } 18 | param { 19 | lr_mult: 2.0 20 | decay_mult: 0.0 21 | } 22 | convolution_param { 23 | num_output: 64 24 | pad: 1 25 | kernel_size: 3 26 | weight_filler { 27 | type: "xavier" 28 | } 29 | bias_filler { 30 | type: "constant" 31 | value: 0.0 32 | } 33 | } 34 | } 35 | layer { 36 | name: "relu1_1" 37 | type: "ReLU" 38 | bottom: "conv1_1" 39 | top: "conv1_1" 40 | } 41 | layer { 42 | name: "conv1_2" 43 | type: "Convolution" 44 | bottom: "conv1_1" 45 | top: "conv1_2" 46 | param { 47 | lr_mult: 1.0 48 | decay_mult: 1.0 49 | } 50 | param { 51 | lr_mult: 2.0 52 | decay_mult: 0.0 53 | } 54 | convolution_param { 55 | num_output: 64 56 | pad: 1 57 | kernel_size: 3 58 | weight_filler { 59 | type: "xavier" 60 | } 61 | bias_filler { 62 | type: "constant" 63 | value: 0.0 64 | } 65 | } 66 | } 67 | layer { 68 | name: "relu1_2" 69 | type: "ReLU" 70 | bottom: "conv1_2" 71 | top: "conv1_2" 72 | } 73 | layer { 74 | name: "pool1" 75 | type: "Pooling" 76 | bottom: "conv1_2" 77 | top: "pool1" 78 | pooling_param { 79 | pool: MAX 80 | kernel_size: 2 81 | stride: 2 82 | } 83 | } 84 | layer { 85 | name: "conv2_1" 86 | type: "Convolution" 87 | bottom: "pool1" 88 | top: "conv2_1" 89 | param { 90 | lr_mult: 1.0 91 | decay_mult: 1.0 92 | } 93 | param { 94 | lr_mult: 2.0 95 | decay_mult: 0.0 96 | } 97 | convolution_param { 98 | num_output: 128 99 | pad: 1 100 | kernel_size: 3 101 | weight_filler { 102 | type: 
"xavier" 103 | } 104 | bias_filler { 105 | type: "constant" 106 | value: 0.0 107 | } 108 | } 109 | } 110 | layer { 111 | name: "relu2_1" 112 | type: "ReLU" 113 | bottom: "conv2_1" 114 | top: "conv2_1" 115 | } 116 | layer { 117 | name: "conv2_2" 118 | type: "Convolution" 119 | bottom: "conv2_1" 120 | top: "conv2_2" 121 | param { 122 | lr_mult: 1.0 123 | decay_mult: 1.0 124 | } 125 | param { 126 | lr_mult: 2.0 127 | decay_mult: 0.0 128 | } 129 | convolution_param { 130 | num_output: 128 131 | pad: 1 132 | kernel_size: 3 133 | weight_filler { 134 | type: "xavier" 135 | } 136 | bias_filler { 137 | type: "constant" 138 | value: 0.0 139 | } 140 | } 141 | } 142 | layer { 143 | name: "relu2_2" 144 | type: "ReLU" 145 | bottom: "conv2_2" 146 | top: "conv2_2" 147 | } 148 | layer { 149 | name: "pool2" 150 | type: "Pooling" 151 | bottom: "conv2_2" 152 | top: "pool2" 153 | pooling_param { 154 | pool: MAX 155 | kernel_size: 2 156 | stride: 2 157 | } 158 | } 159 | layer { 160 | name: "conv3_1" 161 | type: "Convolution" 162 | bottom: "pool2" 163 | top: "conv3_1" 164 | param { 165 | lr_mult: 1.0 166 | decay_mult: 1.0 167 | } 168 | param { 169 | lr_mult: 2.0 170 | decay_mult: 0.0 171 | } 172 | convolution_param { 173 | num_output: 256 174 | pad: 1 175 | kernel_size: 3 176 | weight_filler { 177 | type: "xavier" 178 | } 179 | bias_filler { 180 | type: "constant" 181 | value: 0.0 182 | } 183 | } 184 | } 185 | layer { 186 | name: "relu3_1" 187 | type: "ReLU" 188 | bottom: "conv3_1" 189 | top: "conv3_1" 190 | } 191 | layer { 192 | name: "conv3_2" 193 | type: "Convolution" 194 | bottom: "conv3_1" 195 | top: "conv3_2" 196 | param { 197 | lr_mult: 1.0 198 | decay_mult: 1.0 199 | } 200 | param { 201 | lr_mult: 2.0 202 | decay_mult: 0.0 203 | } 204 | convolution_param { 205 | num_output: 256 206 | pad: 1 207 | kernel_size: 3 208 | weight_filler { 209 | type: "xavier" 210 | } 211 | bias_filler { 212 | type: "constant" 213 | value: 0.0 214 | } 215 | } 216 | } 217 | layer { 218 | name: "relu3_2" 219 | type: "ReLU" 220 | bottom: "conv3_2" 221 | top: "conv3_2" 222 | } 223 | layer { 224 | name: "conv3_3" 225 | type: "Convolution" 226 | bottom: "conv3_2" 227 | top: "conv3_3" 228 | param { 229 | lr_mult: 1.0 230 | decay_mult: 1.0 231 | } 232 | param { 233 | lr_mult: 2.0 234 | decay_mult: 0.0 235 | } 236 | convolution_param { 237 | num_output: 256 238 | pad: 1 239 | kernel_size: 3 240 | weight_filler { 241 | type: "xavier" 242 | } 243 | bias_filler { 244 | type: "constant" 245 | value: 0.0 246 | } 247 | } 248 | } 249 | layer { 250 | name: "relu3_3" 251 | type: "ReLU" 252 | bottom: "conv3_3" 253 | top: "conv3_3" 254 | } 255 | layer { 256 | name: "pool3" 257 | type: "Pooling" 258 | bottom: "conv3_3" 259 | top: "pool3" 260 | pooling_param { 261 | pool: MAX 262 | kernel_size: 2 263 | stride: 2 264 | } 265 | } 266 | layer { 267 | name: "conv4_1" 268 | type: "Convolution" 269 | bottom: "pool3" 270 | top: "conv4_1" 271 | param { 272 | lr_mult: 1.0 273 | decay_mult: 1.0 274 | } 275 | param { 276 | lr_mult: 2.0 277 | decay_mult: 0.0 278 | } 279 | convolution_param { 280 | num_output: 512 281 | pad: 1 282 | kernel_size: 3 283 | weight_filler { 284 | type: "xavier" 285 | } 286 | bias_filler { 287 | type: "constant" 288 | value: 0.0 289 | } 290 | } 291 | } 292 | layer { 293 | name: "relu4_1" 294 | type: "ReLU" 295 | bottom: "conv4_1" 296 | top: "conv4_1" 297 | } 298 | layer { 299 | name: "conv4_2" 300 | type: "Convolution" 301 | bottom: "conv4_1" 302 | top: "conv4_2" 303 | param { 304 | lr_mult: 1.0 305 | decay_mult: 1.0 306 | } 307 | 
param { 308 | lr_mult: 2.0 309 | decay_mult: 0.0 310 | } 311 | convolution_param { 312 | num_output: 512 313 | pad: 1 314 | kernel_size: 3 315 | weight_filler { 316 | type: "xavier" 317 | } 318 | bias_filler { 319 | type: "constant" 320 | value: 0.0 321 | } 322 | } 323 | } 324 | layer { 325 | name: "relu4_2" 326 | type: "ReLU" 327 | bottom: "conv4_2" 328 | top: "conv4_2" 329 | } 330 | layer { 331 | name: "conv4_3" 332 | type: "Convolution" 333 | bottom: "conv4_2" 334 | top: "conv4_3" 335 | param { 336 | lr_mult: 1.0 337 | decay_mult: 1.0 338 | } 339 | param { 340 | lr_mult: 2.0 341 | decay_mult: 0.0 342 | } 343 | convolution_param { 344 | num_output: 512 345 | pad: 1 346 | kernel_size: 3 347 | weight_filler { 348 | type: "xavier" 349 | } 350 | bias_filler { 351 | type: "constant" 352 | value: 0.0 353 | } 354 | } 355 | } 356 | layer { 357 | name: "relu4_3" 358 | type: "ReLU" 359 | bottom: "conv4_3" 360 | top: "conv4_3" 361 | } 362 | layer { 363 | name: "pool4" 364 | type: "Pooling" 365 | bottom: "conv4_3" 366 | top: "pool4" 367 | pooling_param { 368 | pool: MAX 369 | kernel_size: 2 370 | stride: 2 371 | } 372 | } 373 | layer { 374 | name: "conv5_1" 375 | type: "Convolution" 376 | bottom: "pool4" 377 | top: "conv5_1" 378 | param { 379 | lr_mult: 1.0 380 | decay_mult: 1.0 381 | } 382 | param { 383 | lr_mult: 2.0 384 | decay_mult: 0.0 385 | } 386 | convolution_param { 387 | num_output: 512 388 | pad: 1 389 | kernel_size: 3 390 | weight_filler { 391 | type: "xavier" 392 | } 393 | bias_filler { 394 | type: "constant" 395 | value: 0.0 396 | } 397 | dilation: 1 398 | } 399 | } 400 | layer { 401 | name: "relu5_1" 402 | type: "ReLU" 403 | bottom: "conv5_1" 404 | top: "conv5_1" 405 | } 406 | layer { 407 | name: "conv5_2" 408 | type: "Convolution" 409 | bottom: "conv5_1" 410 | top: "conv5_2" 411 | param { 412 | lr_mult: 1.0 413 | decay_mult: 1.0 414 | } 415 | param { 416 | lr_mult: 2.0 417 | decay_mult: 0.0 418 | } 419 | convolution_param { 420 | num_output: 512 421 | pad: 1 422 | kernel_size: 3 423 | weight_filler { 424 | type: "xavier" 425 | } 426 | bias_filler { 427 | type: "constant" 428 | value: 0.0 429 | } 430 | dilation: 1 431 | } 432 | } 433 | layer { 434 | name: "relu5_2" 435 | type: "ReLU" 436 | bottom: "conv5_2" 437 | top: "conv5_2" 438 | } 439 | layer { 440 | name: "conv5_3" 441 | type: "Convolution" 442 | bottom: "conv5_2" 443 | top: "conv5_3" 444 | param { 445 | lr_mult: 1.0 446 | decay_mult: 1.0 447 | } 448 | param { 449 | lr_mult: 2.0 450 | decay_mult: 0.0 451 | } 452 | convolution_param { 453 | num_output: 512 454 | pad: 1 455 | kernel_size: 3 456 | weight_filler { 457 | type: "xavier" 458 | } 459 | bias_filler { 460 | type: "constant" 461 | value: 0.0 462 | } 463 | dilation: 1 464 | } 465 | } 466 | layer { 467 | name: "relu5_3" 468 | type: "ReLU" 469 | bottom: "conv5_3" 470 | top: "conv5_3" 471 | } 472 | layer { 473 | name: "pool5" 474 | type: "Pooling" 475 | bottom: "conv5_3" 476 | top: "pool5" 477 | pooling_param { 478 | pool: MAX 479 | kernel_size: 3 480 | stride: 1 481 | pad: 1 482 | } 483 | } 484 | layer { 485 | name: "fc6" 486 | type: "Convolution" 487 | bottom: "pool5" 488 | top: "fc6" 489 | param { 490 | lr_mult: 1.0 491 | decay_mult: 1.0 492 | } 493 | param { 494 | lr_mult: 2.0 495 | decay_mult: 0.0 496 | } 497 | convolution_param { 498 | num_output: 1024 499 | pad: 6 500 | kernel_size: 3 501 | weight_filler { 502 | type: "xavier" 503 | } 504 | bias_filler { 505 | type: "constant" 506 | value: 0.0 507 | } 508 | dilation: 6 509 | } 510 | } 511 | layer { 512 | name: "relu6" 
513 | type: "ReLU" 514 | bottom: "fc6" 515 | top: "fc6" 516 | } 517 | layer { 518 | name: "fc7" 519 | type: "Convolution" 520 | bottom: "fc6" 521 | top: "fc7" 522 | param { 523 | lr_mult: 1.0 524 | decay_mult: 1.0 525 | } 526 | param { 527 | lr_mult: 2.0 528 | decay_mult: 0.0 529 | } 530 | convolution_param { 531 | num_output: 1024 532 | kernel_size: 1 533 | weight_filler { 534 | type: "xavier" 535 | } 536 | bias_filler { 537 | type: "constant" 538 | value: 0.0 539 | } 540 | } 541 | } 542 | layer { 543 | name: "relu7" 544 | type: "ReLU" 545 | bottom: "fc7" 546 | top: "fc7" 547 | } 548 | layer { 549 | name: "conv6_1" 550 | type: "Convolution" 551 | bottom: "fc7" 552 | top: "conv6_1" 553 | param { 554 | lr_mult: 1.0 555 | decay_mult: 1.0 556 | } 557 | param { 558 | lr_mult: 2.0 559 | decay_mult: 0.0 560 | } 561 | convolution_param { 562 | num_output: 256 563 | pad: 0 564 | kernel_size: 1 565 | stride: 1 566 | weight_filler { 567 | type: "xavier" 568 | } 569 | bias_filler { 570 | type: "constant" 571 | value: 0.0 572 | } 573 | } 574 | } 575 | layer { 576 | name: "conv6_1_relu" 577 | type: "ReLU" 578 | bottom: "conv6_1" 579 | top: "conv6_1" 580 | } 581 | layer { 582 | name: "conv6_2" 583 | type: "Convolution" 584 | bottom: "conv6_1" 585 | top: "conv6_2" 586 | param { 587 | lr_mult: 1.0 588 | decay_mult: 1.0 589 | } 590 | param { 591 | lr_mult: 2.0 592 | decay_mult: 0.0 593 | } 594 | convolution_param { 595 | num_output: 512 596 | pad: 1 597 | kernel_size: 3 598 | stride: 2 599 | weight_filler { 600 | type: "xavier" 601 | } 602 | bias_filler { 603 | type: "constant" 604 | value: 0.0 605 | } 606 | } 607 | } 608 | layer { 609 | name: "conv6_2_relu" 610 | type: "ReLU" 611 | bottom: "conv6_2" 612 | top: "conv6_2" 613 | } 614 | layer { 615 | name: "conv7_1" 616 | type: "Convolution" 617 | bottom: "conv6_2" 618 | top: "conv7_1" 619 | param { 620 | lr_mult: 1.0 621 | decay_mult: 1.0 622 | } 623 | param { 624 | lr_mult: 2.0 625 | decay_mult: 0.0 626 | } 627 | convolution_param { 628 | num_output: 128 629 | pad: 0 630 | kernel_size: 1 631 | stride: 1 632 | weight_filler { 633 | type: "xavier" 634 | } 635 | bias_filler { 636 | type: "constant" 637 | value: 0.0 638 | } 639 | } 640 | } 641 | layer { 642 | name: "conv7_1_relu" 643 | type: "ReLU" 644 | bottom: "conv7_1" 645 | top: "conv7_1" 646 | } 647 | layer { 648 | name: "conv7_2" 649 | type: "Convolution" 650 | bottom: "conv7_1" 651 | top: "conv7_2" 652 | param { 653 | lr_mult: 1.0 654 | decay_mult: 1.0 655 | } 656 | param { 657 | lr_mult: 2.0 658 | decay_mult: 0.0 659 | } 660 | convolution_param { 661 | num_output: 256 662 | pad: 1 663 | kernel_size: 3 664 | stride: 2 665 | weight_filler { 666 | type: "xavier" 667 | } 668 | bias_filler { 669 | type: "constant" 670 | value: 0.0 671 | } 672 | } 673 | } 674 | layer { 675 | name: "conv7_2_relu" 676 | type: "ReLU" 677 | bottom: "conv7_2" 678 | top: "conv7_2" 679 | } 680 | layer { 681 | name: "conv8_1" 682 | type: "Convolution" 683 | bottom: "conv7_2" 684 | top: "conv8_1" 685 | param { 686 | lr_mult: 1.0 687 | decay_mult: 1.0 688 | } 689 | param { 690 | lr_mult: 2.0 691 | decay_mult: 0.0 692 | } 693 | convolution_param { 694 | num_output: 128 695 | pad: 0 696 | kernel_size: 1 697 | stride: 1 698 | weight_filler { 699 | type: "xavier" 700 | } 701 | bias_filler { 702 | type: "constant" 703 | value: 0.0 704 | } 705 | } 706 | } 707 | layer { 708 | name: "conv8_1_relu" 709 | type: "ReLU" 710 | bottom: "conv8_1" 711 | top: "conv8_1" 712 | } 713 | layer { 714 | name: "conv8_2" 715 | type: "Convolution" 716 | bottom: 
"conv8_1" 717 | top: "conv8_2" 718 | param { 719 | lr_mult: 1.0 720 | decay_mult: 1.0 721 | } 722 | param { 723 | lr_mult: 2.0 724 | decay_mult: 0.0 725 | } 726 | convolution_param { 727 | num_output: 256 728 | pad: 0 729 | kernel_size: 3 730 | stride: 1 731 | weight_filler { 732 | type: "xavier" 733 | } 734 | bias_filler { 735 | type: "constant" 736 | value: 0.0 737 | } 738 | } 739 | } 740 | layer { 741 | name: "conv8_2_relu" 742 | type: "ReLU" 743 | bottom: "conv8_2" 744 | top: "conv8_2" 745 | } 746 | layer { 747 | name: "conv9_1" 748 | type: "Convolution" 749 | bottom: "conv8_2" 750 | top: "conv9_1" 751 | param { 752 | lr_mult: 1.0 753 | decay_mult: 1.0 754 | } 755 | param { 756 | lr_mult: 2.0 757 | decay_mult: 0.0 758 | } 759 | convolution_param { 760 | num_output: 128 761 | pad: 0 762 | kernel_size: 1 763 | stride: 1 764 | weight_filler { 765 | type: "xavier" 766 | } 767 | bias_filler { 768 | type: "constant" 769 | value: 0.0 770 | } 771 | } 772 | } 773 | layer { 774 | name: "conv9_1_relu" 775 | type: "ReLU" 776 | bottom: "conv9_1" 777 | top: "conv9_1" 778 | } 779 | layer { 780 | name: "conv9_2" 781 | type: "Convolution" 782 | bottom: "conv9_1" 783 | top: "conv9_2" 784 | param { 785 | lr_mult: 1.0 786 | decay_mult: 1.0 787 | } 788 | param { 789 | lr_mult: 2.0 790 | decay_mult: 0.0 791 | } 792 | convolution_param { 793 | num_output: 256 794 | pad: 0 795 | kernel_size: 3 796 | stride: 1 797 | weight_filler { 798 | type: "xavier" 799 | } 800 | bias_filler { 801 | type: "constant" 802 | value: 0.0 803 | } 804 | } 805 | } 806 | layer { 807 | name: "conv9_2_relu" 808 | type: "ReLU" 809 | bottom: "conv9_2" 810 | top: "conv9_2" 811 | } 812 | layer { 813 | name: "conv4_3_norm" 814 | type: "Normalize" 815 | bottom: "conv4_3" 816 | top: "conv4_3_norm" 817 | norm_param { 818 | across_spatial: false 819 | scale_filler { 820 | type: "constant" 821 | value: 20.0 822 | } 823 | channel_shared: false 824 | } 825 | } 826 | layer { 827 | name: "conv4_3_norm_mbox_loc" 828 | type: "Convolution" 829 | bottom: "conv4_3_norm" 830 | top: "conv4_3_norm_mbox_loc" 831 | param { 832 | lr_mult: 1.0 833 | decay_mult: 1.0 834 | } 835 | param { 836 | lr_mult: 2.0 837 | decay_mult: 0.0 838 | } 839 | convolution_param { 840 | num_output: 16 841 | pad: 1 842 | kernel_size: 3 843 | stride: 1 844 | weight_filler { 845 | type: "xavier" 846 | } 847 | bias_filler { 848 | type: "constant" 849 | value: 0.0 850 | } 851 | } 852 | } 853 | layer { 854 | name: "conv4_3_norm_mbox_loc_perm" 855 | type: "Permute" 856 | bottom: "conv4_3_norm_mbox_loc" 857 | top: "conv4_3_norm_mbox_loc_perm" 858 | permute_param { 859 | order: 0 860 | order: 2 861 | order: 3 862 | order: 1 863 | } 864 | } 865 | layer { 866 | name: "conv4_3_norm_mbox_loc_flat" 867 | type: "Flatten" 868 | bottom: "conv4_3_norm_mbox_loc_perm" 869 | top: "conv4_3_norm_mbox_loc_flat" 870 | flatten_param { 871 | axis: 1 872 | } 873 | } 874 | layer { 875 | name: "conv4_3_norm_mbox_conf_hand_detection" 876 | type: "Convolution" 877 | bottom: "conv4_3_norm" 878 | top: "conv4_3_norm_mbox_conf_hand_detection" 879 | param { 880 | lr_mult: 1.0 881 | decay_mult: 1.0 882 | } 883 | param { 884 | lr_mult: 2.0 885 | decay_mult: 0.0 886 | } 887 | convolution_param { 888 | num_output: 8 889 | pad: 1 890 | kernel_size: 3 891 | stride: 1 892 | weight_filler { 893 | type: "xavier" 894 | } 895 | bias_filler { 896 | type: "constant" 897 | value: 0.0 898 | } 899 | } 900 | } 901 | layer { 902 | name: "conv4_3_norm_mbox_conf_hand_detection_perm" 903 | type: "Permute" 904 | bottom: 
"conv4_3_norm_mbox_conf_hand_detection" 905 | top: "conv4_3_norm_mbox_conf_hand_detection_perm" 906 | permute_param { 907 | order: 0 908 | order: 2 909 | order: 3 910 | order: 1 911 | } 912 | } 913 | layer { 914 | name: "conv4_3_norm_mbox_conf_hand_detection_flat" 915 | type: "Flatten" 916 | bottom: "conv4_3_norm_mbox_conf_hand_detection_perm" 917 | top: "conv4_3_norm_mbox_conf_hand_detection_flat" 918 | flatten_param { 919 | axis: 1 920 | } 921 | } 922 | layer { 923 | name: "conv4_3_norm_mbox_priorbox" 924 | type: "PriorBox" 925 | bottom: "conv4_3_norm" 926 | bottom: "data" 927 | top: "conv4_3_norm_mbox_priorbox" 928 | prior_box_param { 929 | min_size: 30.0 930 | max_size: 60.0 931 | aspect_ratio: 2.0 932 | flip: true 933 | clip: false 934 | variance: 0.10000000149 935 | variance: 0.10000000149 936 | variance: 0.20000000298 937 | variance: 0.20000000298 938 | step: 8.0 939 | offset: 0.5 940 | } 941 | } 942 | layer { 943 | name: "fc7_mbox_loc" 944 | type: "Convolution" 945 | bottom: "fc7" 946 | top: "fc7_mbox_loc" 947 | param { 948 | lr_mult: 1.0 949 | decay_mult: 1.0 950 | } 951 | param { 952 | lr_mult: 2.0 953 | decay_mult: 0.0 954 | } 955 | convolution_param { 956 | num_output: 24 957 | pad: 1 958 | kernel_size: 3 959 | stride: 1 960 | weight_filler { 961 | type: "xavier" 962 | } 963 | bias_filler { 964 | type: "constant" 965 | value: 0.0 966 | } 967 | } 968 | } 969 | layer { 970 | name: "fc7_mbox_loc_perm" 971 | type: "Permute" 972 | bottom: "fc7_mbox_loc" 973 | top: "fc7_mbox_loc_perm" 974 | permute_param { 975 | order: 0 976 | order: 2 977 | order: 3 978 | order: 1 979 | } 980 | } 981 | layer { 982 | name: "fc7_mbox_loc_flat" 983 | type: "Flatten" 984 | bottom: "fc7_mbox_loc_perm" 985 | top: "fc7_mbox_loc_flat" 986 | flatten_param { 987 | axis: 1 988 | } 989 | } 990 | layer { 991 | name: "fc7_mbox_conf_hand_detection" 992 | type: "Convolution" 993 | bottom: "fc7" 994 | top: "fc7_mbox_conf_hand_detection" 995 | param { 996 | lr_mult: 1.0 997 | decay_mult: 1.0 998 | } 999 | param { 1000 | lr_mult: 2.0 1001 | decay_mult: 0.0 1002 | } 1003 | convolution_param { 1004 | num_output: 12 1005 | pad: 1 1006 | kernel_size: 3 1007 | stride: 1 1008 | weight_filler { 1009 | type: "xavier" 1010 | } 1011 | bias_filler { 1012 | type: "constant" 1013 | value: 0.0 1014 | } 1015 | } 1016 | } 1017 | layer { 1018 | name: "fc7_mbox_conf_hand_detection_perm" 1019 | type: "Permute" 1020 | bottom: "fc7_mbox_conf_hand_detection" 1021 | top: "fc7_mbox_conf_hand_detection_perm" 1022 | permute_param { 1023 | order: 0 1024 | order: 2 1025 | order: 3 1026 | order: 1 1027 | } 1028 | } 1029 | layer { 1030 | name: "fc7_mbox_conf_hand_detection_flat" 1031 | type: "Flatten" 1032 | bottom: "fc7_mbox_conf_hand_detection_perm" 1033 | top: "fc7_mbox_conf_hand_detection_flat" 1034 | flatten_param { 1035 | axis: 1 1036 | } 1037 | } 1038 | layer { 1039 | name: "fc7_mbox_priorbox" 1040 | type: "PriorBox" 1041 | bottom: "fc7" 1042 | bottom: "data" 1043 | top: "fc7_mbox_priorbox" 1044 | prior_box_param { 1045 | min_size: 60.0 1046 | max_size: 111.0 1047 | aspect_ratio: 2.0 1048 | aspect_ratio: 3.0 1049 | flip: true 1050 | clip: false 1051 | variance: 0.10000000149 1052 | variance: 0.10000000149 1053 | variance: 0.20000000298 1054 | variance: 0.20000000298 1055 | step: 16.0 1056 | offset: 0.5 1057 | } 1058 | } 1059 | layer { 1060 | name: "conv6_2_mbox_loc" 1061 | type: "Convolution" 1062 | bottom: "conv6_2" 1063 | top: "conv6_2_mbox_loc" 1064 | param { 1065 | lr_mult: 1.0 1066 | decay_mult: 1.0 1067 | } 1068 | param { 1069 | 
lr_mult: 2.0 1070 | decay_mult: 0.0 1071 | } 1072 | convolution_param { 1073 | num_output: 24 1074 | pad: 1 1075 | kernel_size: 3 1076 | stride: 1 1077 | weight_filler { 1078 | type: "xavier" 1079 | } 1080 | bias_filler { 1081 | type: "constant" 1082 | value: 0.0 1083 | } 1084 | } 1085 | } 1086 | layer { 1087 | name: "conv6_2_mbox_loc_perm" 1088 | type: "Permute" 1089 | bottom: "conv6_2_mbox_loc" 1090 | top: "conv6_2_mbox_loc_perm" 1091 | permute_param { 1092 | order: 0 1093 | order: 2 1094 | order: 3 1095 | order: 1 1096 | } 1097 | } 1098 | layer { 1099 | name: "conv6_2_mbox_loc_flat" 1100 | type: "Flatten" 1101 | bottom: "conv6_2_mbox_loc_perm" 1102 | top: "conv6_2_mbox_loc_flat" 1103 | flatten_param { 1104 | axis: 1 1105 | } 1106 | } 1107 | layer { 1108 | name: "conv6_2_mbox_conf_hand_detection" 1109 | type: "Convolution" 1110 | bottom: "conv6_2" 1111 | top: "conv6_2_mbox_conf_hand_detection" 1112 | param { 1113 | lr_mult: 1.0 1114 | decay_mult: 1.0 1115 | } 1116 | param { 1117 | lr_mult: 2.0 1118 | decay_mult: 0.0 1119 | } 1120 | convolution_param { 1121 | num_output: 12 1122 | pad: 1 1123 | kernel_size: 3 1124 | stride: 1 1125 | weight_filler { 1126 | type: "xavier" 1127 | } 1128 | bias_filler { 1129 | type: "constant" 1130 | value: 0.0 1131 | } 1132 | } 1133 | } 1134 | layer { 1135 | name: "conv6_2_mbox_conf_hand_detection_perm" 1136 | type: "Permute" 1137 | bottom: "conv6_2_mbox_conf_hand_detection" 1138 | top: "conv6_2_mbox_conf_hand_detection_perm" 1139 | permute_param { 1140 | order: 0 1141 | order: 2 1142 | order: 3 1143 | order: 1 1144 | } 1145 | } 1146 | layer { 1147 | name: "conv6_2_mbox_conf_hand_detection_flat" 1148 | type: "Flatten" 1149 | bottom: "conv6_2_mbox_conf_hand_detection_perm" 1150 | top: "conv6_2_mbox_conf_hand_detection_flat" 1151 | flatten_param { 1152 | axis: 1 1153 | } 1154 | } 1155 | layer { 1156 | name: "conv6_2_mbox_priorbox" 1157 | type: "PriorBox" 1158 | bottom: "conv6_2" 1159 | bottom: "data" 1160 | top: "conv6_2_mbox_priorbox" 1161 | prior_box_param { 1162 | min_size: 111.0 1163 | max_size: 162.0 1164 | aspect_ratio: 2.0 1165 | aspect_ratio: 3.0 1166 | flip: true 1167 | clip: false 1168 | variance: 0.10000000149 1169 | variance: 0.10000000149 1170 | variance: 0.20000000298 1171 | variance: 0.20000000298 1172 | step: 32.0 1173 | offset: 0.5 1174 | } 1175 | } 1176 | layer { 1177 | name: "conv7_2_mbox_loc" 1178 | type: "Convolution" 1179 | bottom: "conv7_2" 1180 | top: "conv7_2_mbox_loc" 1181 | param { 1182 | lr_mult: 1.0 1183 | decay_mult: 1.0 1184 | } 1185 | param { 1186 | lr_mult: 2.0 1187 | decay_mult: 0.0 1188 | } 1189 | convolution_param { 1190 | num_output: 24 1191 | pad: 1 1192 | kernel_size: 3 1193 | stride: 1 1194 | weight_filler { 1195 | type: "xavier" 1196 | } 1197 | bias_filler { 1198 | type: "constant" 1199 | value: 0.0 1200 | } 1201 | } 1202 | } 1203 | layer { 1204 | name: "conv7_2_mbox_loc_perm" 1205 | type: "Permute" 1206 | bottom: "conv7_2_mbox_loc" 1207 | top: "conv7_2_mbox_loc_perm" 1208 | permute_param { 1209 | order: 0 1210 | order: 2 1211 | order: 3 1212 | order: 1 1213 | } 1214 | } 1215 | layer { 1216 | name: "conv7_2_mbox_loc_flat" 1217 | type: "Flatten" 1218 | bottom: "conv7_2_mbox_loc_perm" 1219 | top: "conv7_2_mbox_loc_flat" 1220 | flatten_param { 1221 | axis: 1 1222 | } 1223 | } 1224 | layer { 1225 | name: "conv7_2_mbox_conf_hand_detection" 1226 | type: "Convolution" 1227 | bottom: "conv7_2" 1228 | top: "conv7_2_mbox_conf_hand_detection" 1229 | param { 1230 | lr_mult: 1.0 1231 | decay_mult: 1.0 1232 | } 1233 | param { 1234 | 
lr_mult: 2.0 1235 | decay_mult: 0.0 1236 | } 1237 | convolution_param { 1238 | num_output: 12 1239 | pad: 1 1240 | kernel_size: 3 1241 | stride: 1 1242 | weight_filler { 1243 | type: "xavier" 1244 | } 1245 | bias_filler { 1246 | type: "constant" 1247 | value: 0.0 1248 | } 1249 | } 1250 | } 1251 | layer { 1252 | name: "conv7_2_mbox_conf_hand_detection_perm" 1253 | type: "Permute" 1254 | bottom: "conv7_2_mbox_conf_hand_detection" 1255 | top: "conv7_2_mbox_conf_hand_detection_perm" 1256 | permute_param { 1257 | order: 0 1258 | order: 2 1259 | order: 3 1260 | order: 1 1261 | } 1262 | } 1263 | layer { 1264 | name: "conv7_2_mbox_conf_hand_detection_flat" 1265 | type: "Flatten" 1266 | bottom: "conv7_2_mbox_conf_hand_detection_perm" 1267 | top: "conv7_2_mbox_conf_hand_detection_flat" 1268 | flatten_param { 1269 | axis: 1 1270 | } 1271 | } 1272 | layer { 1273 | name: "conv7_2_mbox_priorbox" 1274 | type: "PriorBox" 1275 | bottom: "conv7_2" 1276 | bottom: "data" 1277 | top: "conv7_2_mbox_priorbox" 1278 | prior_box_param { 1279 | min_size: 162.0 1280 | max_size: 213.0 1281 | aspect_ratio: 2.0 1282 | aspect_ratio: 3.0 1283 | flip: true 1284 | clip: false 1285 | variance: 0.10000000149 1286 | variance: 0.10000000149 1287 | variance: 0.20000000298 1288 | variance: 0.20000000298 1289 | step: 64.0 1290 | offset: 0.5 1291 | } 1292 | } 1293 | layer { 1294 | name: "conv8_2_mbox_loc" 1295 | type: "Convolution" 1296 | bottom: "conv8_2" 1297 | top: "conv8_2_mbox_loc" 1298 | param { 1299 | lr_mult: 1.0 1300 | decay_mult: 1.0 1301 | } 1302 | param { 1303 | lr_mult: 2.0 1304 | decay_mult: 0.0 1305 | } 1306 | convolution_param { 1307 | num_output: 16 1308 | pad: 1 1309 | kernel_size: 3 1310 | stride: 1 1311 | weight_filler { 1312 | type: "xavier" 1313 | } 1314 | bias_filler { 1315 | type: "constant" 1316 | value: 0.0 1317 | } 1318 | } 1319 | } 1320 | layer { 1321 | name: "conv8_2_mbox_loc_perm" 1322 | type: "Permute" 1323 | bottom: "conv8_2_mbox_loc" 1324 | top: "conv8_2_mbox_loc_perm" 1325 | permute_param { 1326 | order: 0 1327 | order: 2 1328 | order: 3 1329 | order: 1 1330 | } 1331 | } 1332 | layer { 1333 | name: "conv8_2_mbox_loc_flat" 1334 | type: "Flatten" 1335 | bottom: "conv8_2_mbox_loc_perm" 1336 | top: "conv8_2_mbox_loc_flat" 1337 | flatten_param { 1338 | axis: 1 1339 | } 1340 | } 1341 | layer { 1342 | name: "conv8_2_mbox_conf_hand_detection" 1343 | type: "Convolution" 1344 | bottom: "conv8_2" 1345 | top: "conv8_2_mbox_conf_hand_detection" 1346 | param { 1347 | lr_mult: 1.0 1348 | decay_mult: 1.0 1349 | } 1350 | param { 1351 | lr_mult: 2.0 1352 | decay_mult: 0.0 1353 | } 1354 | convolution_param { 1355 | num_output: 8 1356 | pad: 1 1357 | kernel_size: 3 1358 | stride: 1 1359 | weight_filler { 1360 | type: "xavier" 1361 | } 1362 | bias_filler { 1363 | type: "constant" 1364 | value: 0.0 1365 | } 1366 | } 1367 | } 1368 | layer { 1369 | name: "conv8_2_mbox_conf_hand_detection_perm" 1370 | type: "Permute" 1371 | bottom: "conv8_2_mbox_conf_hand_detection" 1372 | top: "conv8_2_mbox_conf_hand_detection_perm" 1373 | permute_param { 1374 | order: 0 1375 | order: 2 1376 | order: 3 1377 | order: 1 1378 | } 1379 | } 1380 | layer { 1381 | name: "conv8_2_mbox_conf_hand_detection_flat" 1382 | type: "Flatten" 1383 | bottom: "conv8_2_mbox_conf_hand_detection_perm" 1384 | top: "conv8_2_mbox_conf_hand_detection_flat" 1385 | flatten_param { 1386 | axis: 1 1387 | } 1388 | } 1389 | layer { 1390 | name: "conv8_2_mbox_priorbox" 1391 | type: "PriorBox" 1392 | bottom: "conv8_2" 1393 | bottom: "data" 1394 | top: 
"conv8_2_mbox_priorbox" 1395 | prior_box_param { 1396 | min_size: 213.0 1397 | max_size: 264.0 1398 | aspect_ratio: 2.0 1399 | flip: true 1400 | clip: false 1401 | variance: 0.10000000149 1402 | variance: 0.10000000149 1403 | variance: 0.20000000298 1404 | variance: 0.20000000298 1405 | step: 100.0 1406 | offset: 0.5 1407 | } 1408 | } 1409 | layer { 1410 | name: "conv9_2_mbox_loc" 1411 | type: "Convolution" 1412 | bottom: "conv9_2" 1413 | top: "conv9_2_mbox_loc" 1414 | param { 1415 | lr_mult: 1.0 1416 | decay_mult: 1.0 1417 | } 1418 | param { 1419 | lr_mult: 2.0 1420 | decay_mult: 0.0 1421 | } 1422 | convolution_param { 1423 | num_output: 16 1424 | pad: 1 1425 | kernel_size: 3 1426 | stride: 1 1427 | weight_filler { 1428 | type: "xavier" 1429 | } 1430 | bias_filler { 1431 | type: "constant" 1432 | value: 0.0 1433 | } 1434 | } 1435 | } 1436 | layer { 1437 | name: "conv9_2_mbox_loc_perm" 1438 | type: "Permute" 1439 | bottom: "conv9_2_mbox_loc" 1440 | top: "conv9_2_mbox_loc_perm" 1441 | permute_param { 1442 | order: 0 1443 | order: 2 1444 | order: 3 1445 | order: 1 1446 | } 1447 | } 1448 | layer { 1449 | name: "conv9_2_mbox_loc_flat" 1450 | type: "Flatten" 1451 | bottom: "conv9_2_mbox_loc_perm" 1452 | top: "conv9_2_mbox_loc_flat" 1453 | flatten_param { 1454 | axis: 1 1455 | } 1456 | } 1457 | layer { 1458 | name: "conv9_2_mbox_conf_hand_detection" 1459 | type: "Convolution" 1460 | bottom: "conv9_2" 1461 | top: "conv9_2_mbox_conf_hand_detection" 1462 | param { 1463 | lr_mult: 1.0 1464 | decay_mult: 1.0 1465 | } 1466 | param { 1467 | lr_mult: 2.0 1468 | decay_mult: 0.0 1469 | } 1470 | convolution_param { 1471 | num_output: 8 1472 | pad: 1 1473 | kernel_size: 3 1474 | stride: 1 1475 | weight_filler { 1476 | type: "xavier" 1477 | } 1478 | bias_filler { 1479 | type: "constant" 1480 | value: 0.0 1481 | } 1482 | } 1483 | } 1484 | layer { 1485 | name: "conv9_2_mbox_conf_hand_detection_perm" 1486 | type: "Permute" 1487 | bottom: "conv9_2_mbox_conf_hand_detection" 1488 | top: "conv9_2_mbox_conf_hand_detection_perm" 1489 | permute_param { 1490 | order: 0 1491 | order: 2 1492 | order: 3 1493 | order: 1 1494 | } 1495 | } 1496 | layer { 1497 | name: "conv9_2_mbox_conf_hand_detection_flat" 1498 | type: "Flatten" 1499 | bottom: "conv9_2_mbox_conf_hand_detection_perm" 1500 | top: "conv9_2_mbox_conf_hand_detection_flat" 1501 | flatten_param { 1502 | axis: 1 1503 | } 1504 | } 1505 | layer { 1506 | name: "conv9_2_mbox_priorbox" 1507 | type: "PriorBox" 1508 | bottom: "conv9_2" 1509 | bottom: "data" 1510 | top: "conv9_2_mbox_priorbox" 1511 | prior_box_param { 1512 | min_size: 264.0 1513 | max_size: 315.0 1514 | aspect_ratio: 2.0 1515 | flip: true 1516 | clip: false 1517 | variance: 0.10000000149 1518 | variance: 0.10000000149 1519 | variance: 0.20000000298 1520 | variance: 0.20000000298 1521 | step: 300.0 1522 | offset: 0.5 1523 | } 1524 | } 1525 | layer { 1526 | name: "mbox_loc" 1527 | type: "Concat" 1528 | bottom: "conv4_3_norm_mbox_loc_flat" 1529 | bottom: "fc7_mbox_loc_flat" 1530 | bottom: "conv6_2_mbox_loc_flat" 1531 | bottom: "conv7_2_mbox_loc_flat" 1532 | bottom: "conv8_2_mbox_loc_flat" 1533 | bottom: "conv9_2_mbox_loc_flat" 1534 | top: "mbox_loc" 1535 | concat_param { 1536 | axis: 1 1537 | } 1538 | } 1539 | layer { 1540 | name: "mbox_conf" 1541 | type: "Concat" 1542 | bottom: "conv4_3_norm_mbox_conf_hand_detection_flat" 1543 | bottom: "fc7_mbox_conf_hand_detection_flat" 1544 | bottom: "conv6_2_mbox_conf_hand_detection_flat" 1545 | bottom: "conv7_2_mbox_conf_hand_detection_flat" 1546 | bottom: 
"conv8_2_mbox_conf_hand_detection_flat" 1547 | bottom: "conv9_2_mbox_conf_hand_detection_flat" 1548 | top: "mbox_conf" 1549 | concat_param { 1550 | axis: 1 1551 | } 1552 | } 1553 | layer { 1554 | name: "mbox_priorbox" 1555 | type: "Concat" 1556 | bottom: "conv4_3_norm_mbox_priorbox" 1557 | bottom: "fc7_mbox_priorbox" 1558 | bottom: "conv6_2_mbox_priorbox" 1559 | bottom: "conv7_2_mbox_priorbox" 1560 | bottom: "conv8_2_mbox_priorbox" 1561 | bottom: "conv9_2_mbox_priorbox" 1562 | top: "mbox_priorbox" 1563 | concat_param { 1564 | axis: 2 1565 | } 1566 | } 1567 | layer { 1568 | name: "mbox_conf_reshape" 1569 | type: "Reshape" 1570 | bottom: "mbox_conf" 1571 | top: "mbox_conf_reshape" 1572 | reshape_param { 1573 | shape { 1574 | dim: 0 1575 | dim: -1 1576 | dim: 2 1577 | } 1578 | } 1579 | } 1580 | layer { 1581 | name: "mbox_conf_softmax" 1582 | type: "Softmax" 1583 | bottom: "mbox_conf_reshape" 1584 | top: "mbox_conf_softmax" 1585 | softmax_param { 1586 | axis: 2 1587 | } 1588 | } 1589 | layer { 1590 | name: "mbox_conf_flatten" 1591 | type: "Flatten" 1592 | bottom: "mbox_conf_softmax" 1593 | top: "mbox_conf_flatten" 1594 | flatten_param { 1595 | axis: 1 1596 | } 1597 | } 1598 | layer { 1599 | name: "detection_out" 1600 | type: "DetectionOutput" 1601 | bottom: "mbox_loc" 1602 | bottom: "mbox_conf_flatten" 1603 | bottom: "mbox_priorbox" 1604 | top: "detection_out" 1605 | include { 1606 | phase: TEST 1607 | } 1608 | detection_output_param { 1609 | num_classes: 2 1610 | share_location: true 1611 | background_label_id: 0 1612 | nms_param { 1613 | nms_threshold: 0.449999988079 1614 | top_k: 400 1615 | } 1616 | code_type: CENTER_SIZE 1617 | keep_top_k: 200 1618 | confidence_threshold: 0.00999999977648 1619 | } 1620 | } 1621 | 1622 | -------------------------------------------------------------------------------- /other/Hand_Detection/model/generate_model.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import sys 3 | sys.path.insert(0, '../caffe/python') 4 | import caffe 5 | from caffe.model_libs import * 6 | from google.protobuf import text_format 7 | 8 | import math 9 | import os 10 | import shutil 11 | import stat 12 | import subprocess 13 | 14 | # Add extra layers on top of a "base" network (e.g. VGGNet or Inception). 15 | def AddExtraLayers(net, use_batchnorm=True, lr_mult=1): 16 | use_relu = True 17 | 18 | # Add additional convolutional layers. 19 | # 19 x 19 20 | from_layer = net.keys()[-1] 21 | 22 | # TODO(weiliu89): Construct the name using the last layer to avoid duplication. 
23 | # 10 x 10 24 | out_layer = "conv6_1" 25 | ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 256, 1, 0, 1, 26 | lr_mult=lr_mult) 27 | 28 | from_layer = out_layer 29 | out_layer = "conv6_2" 30 | ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 512, 3, 1, 2, 31 | lr_mult=lr_mult) 32 | 33 | # 5 x 5 34 | from_layer = out_layer 35 | out_layer = "conv7_1" 36 | ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 128, 1, 0, 1, 37 | lr_mult=lr_mult) 38 | 39 | from_layer = out_layer 40 | out_layer = "conv7_2" 41 | ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 256, 3, 1, 2, 42 | lr_mult=lr_mult) 43 | 44 | # 3 x 3 45 | from_layer = out_layer 46 | out_layer = "conv8_1" 47 | ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 128, 1, 0, 1, 48 | lr_mult=lr_mult) 49 | 50 | from_layer = out_layer 51 | out_layer = "conv8_2" 52 | ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 256, 3, 0, 1, 53 | lr_mult=lr_mult) 54 | 55 | # 1 x 1 56 | from_layer = out_layer 57 | out_layer = "conv9_1" 58 | ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 128, 1, 0, 1, 59 | lr_mult=lr_mult) 60 | 61 | from_layer = out_layer 62 | out_layer = "conv9_2" 63 | ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 256, 3, 0, 1, 64 | lr_mult=lr_mult) 65 | 66 | return net 67 | 68 | 69 | 70 | 71 | 72 | 73 | # The database file for training data. Created by data/VOC0712/create_data.sh 74 | train_data = "../data/lmdb/trainval_lmdb" 75 | model_name = "VGG_HAND_SSD_300x300" 76 | # The database file for testing data. Created by data/VOC0712/create_data.sh 77 | # Specify the batch sampler. 78 | resize_width = 300 79 | resize_height = 300 80 | resize = "{}x{}".format(resize_width, resize_height) 81 | batch_sampler = [ 82 | { 83 | 'sampler': { 84 | }, 85 | 'max_trials': 1, 86 | 'max_sample': 1, 87 | }, 88 | { 89 | 'sampler': { 90 | 'min_scale': 0.3, 91 | 'max_scale': 1.0, 92 | 'min_aspect_ratio': 0.5, 93 | 'max_aspect_ratio': 2.0, 94 | }, 95 | 'sample_constraint': { 96 | 'min_jaccard_overlap': 0.1, 97 | }, 98 | 'max_trials': 50, 99 | 'max_sample': 1, 100 | }, 101 | { 102 | 'sampler': { 103 | 'min_scale': 0.3, 104 | 'max_scale': 1.0, 105 | 'min_aspect_ratio': 0.5, 106 | 'max_aspect_ratio': 2.0, 107 | }, 108 | 'sample_constraint': { 109 | 'min_jaccard_overlap': 0.3, 110 | }, 111 | 'max_trials': 50, 112 | 'max_sample': 1, 113 | }, 114 | { 115 | 'sampler': { 116 | 'min_scale': 0.3, 117 | 'max_scale': 1.0, 118 | 'min_aspect_ratio': 0.5, 119 | 'max_aspect_ratio': 2.0, 120 | }, 121 | 'sample_constraint': { 122 | 'min_jaccard_overlap': 0.5, 123 | }, 124 | 'max_trials': 50, 125 | 'max_sample': 1, 126 | }, 127 | { 128 | 'sampler': { 129 | 'min_scale': 0.3, 130 | 'max_scale': 1.0, 131 | 'min_aspect_ratio': 0.5, 132 | 'max_aspect_ratio': 2.0, 133 | }, 134 | 'sample_constraint': { 135 | 'min_jaccard_overlap': 0.7, 136 | }, 137 | 'max_trials': 50, 138 | 'max_sample': 1, 139 | }, 140 | { 141 | 'sampler': { 142 | 'min_scale': 0.3, 143 | 'max_scale': 1.0, 144 | 'min_aspect_ratio': 0.5, 145 | 'max_aspect_ratio': 2.0, 146 | }, 147 | 'sample_constraint': { 148 | 'min_jaccard_overlap': 0.9, 149 | }, 150 | 'max_trials': 50, 151 | 'max_sample': 1, 152 | }, 153 | { 154 | 'sampler': { 155 | 'min_scale': 0.3, 156 | 'max_scale': 1.0, 157 | 'min_aspect_ratio': 0.5, 158 | 'max_aspect_ratio': 2.0, 159 | }, 160 | 'sample_constraint': { 161 | 'max_jaccard_overlap': 1.0, 162 | }, 163 | 'max_trials': 50, 164 | 'max_sample': 1, 165 | }, 166 | ] 167 | 
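# Data augmentation: the samplers above either keep the full image or draw a
# random crop (scale 0.3-1.0, aspect ratio 0.5-2.0) under Jaccard-overlap
# constraints with the ground-truth boxes (minimum overlaps 0.1-0.9, plus one
# sampler capped at 1.0). train_transform_param below adds mirroring,
# photometric distortion and up-to-4x canvas expansion before warping every
# sample to 300x300.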
train_transform_param = { 168 | 'mirror': True, 169 | 'mean_value': [127.0, 127.0, 127.0], 170 | 'resize_param': { 171 | 'prob': 1, 172 | 'resize_mode': P.Resize.WARP, 173 | 'height': resize_height, 174 | 'width': resize_width, 175 | 'interp_mode': [ 176 | P.Resize.LINEAR, 177 | P.Resize.AREA, 178 | P.Resize.NEAREST, 179 | P.Resize.CUBIC, 180 | P.Resize.LANCZOS4, 181 | ], 182 | }, 183 | 'distort_param': { 184 | 'brightness_prob': 0.5, 185 | 'brightness_delta': 32, 186 | 'contrast_prob': 0.5, 187 | 'contrast_lower': 0.5, 188 | 'contrast_upper': 1.5, 189 | 'hue_prob': 0.5, 190 | 'hue_delta': 18, 191 | 'saturation_prob': 0.5, 192 | 'saturation_lower': 0.5, 193 | 'saturation_upper': 1.5, 194 | 'random_order_prob': 0.0, 195 | }, 196 | 'expand_param': { 197 | 'prob': 0.5, 198 | 'max_expand_ratio': 4.0, 199 | }, 200 | 'emit_constraint': { 201 | 'emit_type': caffe_pb2.EmitConstraint.CENTER, 202 | } 203 | } 204 | 205 | # If true, use batch norm for all newly added layers. 206 | # Currently only the non batch norm version has been tested. 207 | use_batchnorm = False 208 | lr_mult = 1 209 | 210 | # model definition files. 211 | train_net_file = "train.prototxt" 212 | deploy_net_file = "deploy.prototxt" 213 | solver_file = "solver.prototxt" 214 | # snapshot prefix. 215 | snapshot_prefix = "snapshot/VGG_HAND_SSD_300x300_iter_" 216 | # Stores LabelMapItem. 217 | label_map_file = "../data/labelmap_voc.prototxt" 218 | 219 | # MultiBoxLoss parameters. 220 | num_classes = 2 221 | share_location = True 222 | background_label_id=0 223 | train_on_diff_gt = True 224 | normalization_mode = P.Loss.VALID 225 | code_type = P.PriorBox.CENTER_SIZE 226 | ignore_cross_boundary_bbox = False 227 | mining_type = P.MultiBoxLoss.MAX_NEGATIVE 228 | neg_pos_ratio = 3. 229 | loc_weight = (neg_pos_ratio + 1.) / 4. 230 | multibox_loss_param = { 231 | 'loc_loss_type': P.MultiBoxLoss.SMOOTH_L1, 232 | 'conf_loss_type': P.MultiBoxLoss.SOFTMAX, 233 | 'loc_weight': loc_weight, 234 | 'num_classes': num_classes, 235 | 'share_location': share_location, 236 | 'match_type': P.MultiBoxLoss.PER_PREDICTION, 237 | 'overlap_threshold': 0.5, 238 | 'use_prior_for_matching': True, 239 | 'background_label_id': background_label_id, 240 | 'use_difficult_gt': train_on_diff_gt, 241 | 'mining_type': mining_type, 242 | 'neg_pos_ratio': neg_pos_ratio, 243 | 'neg_overlap': 0.5, 244 | 'code_type': code_type, 245 | 'ignore_cross_boundary_bbox': ignore_cross_boundary_bbox, 246 | } 247 | loss_param = { 248 | 'normalization': normalization_mode, 249 | } 250 | 251 | # parameters for generating priors. 252 | # minimum dimension of input image 253 | min_dim = 300 254 | # conv4_3 ==> 38 x 38 255 | # fc7 ==> 19 x 19 256 | # conv6_2 ==> 10 x 10 257 | # conv7_2 ==> 5 x 5 258 | # conv8_2 ==> 3 x 3 259 | # conv9_2 ==> 1 x 1 260 | mbox_source_layers = ['conv4_3', 'fc7', 'conv6_2', 'conv7_2', 'conv8_2', 'conv9_2'] 261 | # in percent % 262 | min_ratio = 20 263 | max_ratio = 90 264 | step = int(math.floor((max_ratio - min_ratio) / (len(mbox_source_layers) - 2))) 265 | min_sizes = [] 266 | max_sizes = [] 267 | for ratio in xrange(min_ratio, max_ratio + 1, step): 268 | min_sizes.append(min_dim * ratio / 100.) 269 | max_sizes.append(min_dim * (ratio + step) / 100.) 270 | min_sizes = [min_dim * 10 / 100.] + min_sizes 271 | max_sizes = [min_dim * 20 / 100.] + max_sizes 272 | steps = [8, 16, 32, 64, 100, 300] 273 | aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]] 274 | # L2 normalize conv4_3. 
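# With min_dim=300, min_ratio=20, max_ratio=90 and step=17, the loop above gives
# min_sizes [30, 60, 111, 162, 213, 264] and max_sizes [60, 111, 162, 213, 264, 315]
# pixels, matching the values hard-coded in deploy.prototxt. In the list below,
# 20 is the initial Normalize scale applied to conv4_3 and -1 disables
# normalization for the other source layers.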
275 | normalizations = [20, -1, -1, -1, -1, -1] 276 | # variance used to encode/decode prior bboxes. 277 | if code_type == P.PriorBox.CENTER_SIZE: 278 | prior_variance = [0.1, 0.1, 0.2, 0.2] 279 | else: 280 | prior_variance = [0.1] 281 | flip = True 282 | clip = False 283 | 284 | 285 | # Divide the mini-batch to different GPUs. 286 | batch_size = 2 287 | num_gpus = 1 288 | iter_size = 200000 289 | device_id = 0 290 | batch_size_per_device = batch_size 291 | 292 | batch_size_per_device = int(math.ceil(float(batch_size) / num_gpus)) 293 | solver_mode = P.Solver.GPU 294 | 295 | base_lr = 0.0001 296 | 297 | solver_param = { 298 | # Train parameters 299 | 'base_lr': base_lr, 300 | 'weight_decay': 0.0005, 301 | 'lr_policy': "multistep", 302 | 'stepvalue': [80000, 100000, 120000], 303 | 'gamma': 0.1, 304 | 'momentum': 0.9, 305 | 'iter_size': iter_size, 306 | 'max_iter': 120000, 307 | 'snapshot': 80000, 308 | 'display': 10, 309 | 'average_loss': 10, 310 | 'type': "SGD", 311 | 'solver_mode': solver_mode, 312 | 'device_id': device_id, 313 | 'debug_info': False, 314 | 'snapshot_after_train': True 315 | } 316 | 317 | # Create train net. 318 | net = caffe.NetSpec() 319 | net.data, net.label = CreateAnnotatedDataLayer(train_data, batch_size=batch_size_per_device, 320 | train=True, output_label=True, label_map_file=label_map_file, 321 | transform_param=train_transform_param, batch_sampler=batch_sampler) 322 | 323 | VGGNetBody(net, from_layer='data', fully_conv=True, reduced=True, dilated=True, 324 | dropout=False) 325 | 326 | AddExtraLayers(net, use_batchnorm, lr_mult=lr_mult) 327 | 328 | # New 329 | mbox_layers = CreateMultiBoxHead(net, data_layer='data', from_layers=mbox_source_layers, 330 | use_batchnorm=use_batchnorm, min_sizes=min_sizes, max_sizes=max_sizes, 331 | aspect_ratios=aspect_ratios, steps=steps, normalizations=normalizations, 332 | num_classes=num_classes, share_location=share_location, flip=flip, clip=clip, 333 | prior_variance=prior_variance, kernel_size=3, pad=1, lr_mult=lr_mult, conf_postfix='_hand_detection') 334 | 335 | 336 | # ### initial 337 | # mbox_layers = CreateMultiBoxHead(net, data_layer='data', from_layers=mbox_source_layers, 338 | # use_batchnorm=use_batchnorm, min_sizes=min_sizes, max_sizes=max_sizes, 339 | # aspect_ratios=aspect_ratios, steps=steps, normalizations=normalizations, 340 | # num_classes=num_classes, share_location=share_location, flip=flip, clip=clip, 341 | # prior_variance=prior_variance, kernel_size=3, pad=1, lr_mult=lr_mult) 342 | 343 | # Create the MultiBoxLossLayer. 344 | name = "mbox_loss" 345 | mbox_layers.append(net.label) 346 | net[name] = L.MultiBoxLoss(*mbox_layers, multibox_loss_param=multibox_loss_param, 347 | loss_param=loss_param, include=dict(phase=caffe_pb2.Phase.Value('TRAIN')), 348 | propagate_down=[True, True, False, False]) 349 | 350 | with open(train_net_file, 'w') as f: 351 | print('name: "{}_train"'.format(model_name), file=f) 352 | print(net.to_proto(), file=f) 353 | 354 | # Create deploy net. 355 | # Remove the first and last layer from test net. 356 | ######### 357 | 358 | 359 | # parameters for generating detection output. 
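# These settings build the DetectionOutput layer of the deploy net: NMS at
# IoU 0.45 keeping the top 400 candidates, at most 200 detections per image
# after merging, and a 0.01 confidence threshold, matching the detection_out
# layer in deploy.prototxt.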
360 | det_out_param = { 361 | 'num_classes': num_classes, 362 | 'share_location': share_location, 363 | 'background_label_id': background_label_id, 364 | 'nms_param': {'nms_threshold': 0.45, 'top_k': 400}, 365 | 'keep_top_k': 200, 366 | 'confidence_threshold': 0.01, 367 | 'code_type': code_type, 368 | } 369 | 370 | 371 | conf_name = "mbox_conf" 372 | if multibox_loss_param["conf_loss_type"] == P.MultiBoxLoss.SOFTMAX: 373 | reshape_name = "{}_reshape".format(conf_name) 374 | net[reshape_name] = L.Reshape(net[conf_name], shape=dict(dim=[0, -1, num_classes])) 375 | softmax_name = "{}_softmax".format(conf_name) 376 | net[softmax_name] = L.Softmax(net[reshape_name], axis=2) 377 | flatten_name = "{}_flatten".format(conf_name) 378 | net[flatten_name] = L.Flatten(net[softmax_name], axis=1) 379 | mbox_layers[1] = net[flatten_name] 380 | elif multibox_loss_param["conf_loss_type"] == P.MultiBoxLoss.LOGISTIC: 381 | sigmoid_name = "{}_sigmoid".format(conf_name) 382 | net[sigmoid_name] = L.Sigmoid(net[conf_name]) 383 | mbox_layers[1] = net[sigmoid_name] 384 | 385 | net.detection_out = L.DetectionOutput(*mbox_layers, 386 | detection_output_param=det_out_param, 387 | include=dict(phase=caffe_pb2.Phase.Value('TEST'))) 388 | 389 | 390 | 391 | deploy_net = net 392 | with open(deploy_net_file, 'w') as f: 393 | net_param = deploy_net.to_proto() 394 | # Remove the first (AnnotatedData) and last (DetectionEvaluate) layer from test net. 395 | del net_param.layer[0] 396 | del net_param.layer[-5] 397 | del net_param.layer[-1].bottom[-1] 398 | net_param.name = '{}_deploy'.format(model_name) 399 | net_param.input.extend(['data']) 400 | net_param.input_shape.extend([ 401 | caffe_pb2.BlobShape(dim=[1, 3, resize_height, resize_width])]) 402 | print(net_param, file=f) 403 | 404 | # Create solver. 
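# Serialize the solver: SGD with base_lr 1e-4, momentum 0.9, weight decay 5e-4,
# multistep decay (gamma 0.1) at iterations 80k/100k/120k, max_iter 120k and a
# snapshot every 80k iterations, bundled with train_net="train.prototxt" and the
# snapshot prefix above, then written to solver.prototxt.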
405 | solver = caffe_pb2.SolverParameter( 406 | train_net=train_net_file, 407 | snapshot_prefix=snapshot_prefix, 408 | **solver_param) 409 | 410 | with open(solver_file, 'w') as f: 411 | print(solver, file=f) 412 | -------------------------------------------------------------------------------- /other/Hand_Detection/model/snapshot/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/other/Hand_Detection/model/snapshot/.gitkeep -------------------------------------------------------------------------------- /other/Hand_Detection/model/solver.prototxt: -------------------------------------------------------------------------------- 1 | train_net: "train.prototxt" 2 | base_lr: 0.0001 3 | display: 20 4 | max_iter: 200000 5 | lr_policy: "multistep" 6 | gamma: 0.10000000149 7 | momentum: 0.899999976158 8 | weight_decay: 0.000500000023749 9 | snapshot: 80000 10 | snapshot_prefix: "snapshot/VGG_HAND_SSD_300x300_iter_" 11 | solver_mode: GPU 12 | device_id: 0 13 | debug_info: false 14 | snapshot_after_train: true 15 | average_loss: 10 16 | stepvalue: 60000 17 | stepvalue: 100000 18 | stepvalue: 140000 19 | type: "SGD" 20 | 21 | -------------------------------------------------------------------------------- /other/Hand_Detection/model/train.prototxt: -------------------------------------------------------------------------------- 1 | name: "VGG_HAND_SSD_300x300_train" 2 | layer { 3 | name: "data" 4 | type: "AnnotatedData" 5 | top: "data" 6 | top: "label" 7 | include { 8 | phase: TRAIN 9 | } 10 | transform_param { 11 | mirror: true 12 | mean_value: 127.0 13 | mean_value: 127.0 14 | mean_value: 127.0 15 | resize_param { 16 | prob: 1.0 17 | resize_mode: WARP 18 | height: 300 19 | width: 300 20 | interp_mode: LINEAR 21 | interp_mode: AREA 22 | interp_mode: NEAREST 23 | interp_mode: CUBIC 24 | interp_mode: LANCZOS4 25 | } 26 | emit_constraint { 27 | emit_type: CENTER 28 | } 29 | distort_param { 30 | brightness_prob: 0.5 31 | brightness_delta: 32.0 32 | contrast_prob: 0.5 33 | contrast_lower: 0.5 34 | contrast_upper: 1.5 35 | hue_prob: 0.5 36 | hue_delta: 18.0 37 | saturation_prob: 0.5 38 | saturation_lower: 0.5 39 | saturation_upper: 1.5 40 | random_order_prob: 0.0 41 | } 42 | expand_param { 43 | prob: 0.5 44 | max_expand_ratio: 4.0 45 | } 46 | } 47 | data_param { 48 | source: "../data/lmdb/trainval_lmdb" 49 | batch_size: 2 50 | backend: LMDB 51 | } 52 | annotated_data_param { 53 | batch_sampler { 54 | max_sample: 1 55 | max_trials: 1 56 | } 57 | batch_sampler { 58 | sampler { 59 | min_scale: 0.300000011921 60 | max_scale: 1.0 61 | min_aspect_ratio: 0.5 62 | max_aspect_ratio: 2.0 63 | } 64 | sample_constraint { 65 | min_jaccard_overlap: 0.10000000149 66 | } 67 | max_sample: 1 68 | max_trials: 50 69 | } 70 | batch_sampler { 71 | sampler { 72 | min_scale: 0.300000011921 73 | max_scale: 1.0 74 | min_aspect_ratio: 0.5 75 | max_aspect_ratio: 2.0 76 | } 77 | sample_constraint { 78 | min_jaccard_overlap: 0.300000011921 79 | } 80 | max_sample: 1 81 | max_trials: 50 82 | } 83 | batch_sampler { 84 | sampler { 85 | min_scale: 0.300000011921 86 | max_scale: 1.0 87 | min_aspect_ratio: 0.5 88 | max_aspect_ratio: 2.0 89 | } 90 | sample_constraint { 91 | min_jaccard_overlap: 0.5 92 | } 93 | max_sample: 1 94 | max_trials: 50 95 | } 96 | batch_sampler { 97 | sampler { 98 | min_scale: 0.300000011921 99 | max_scale: 1.0 100 | min_aspect_ratio: 0.5 101 | max_aspect_ratio: 2.0 102 
| } 103 | sample_constraint { 104 | min_jaccard_overlap: 0.699999988079 105 | } 106 | max_sample: 1 107 | max_trials: 50 108 | } 109 | batch_sampler { 110 | sampler { 111 | min_scale: 0.300000011921 112 | max_scale: 1.0 113 | min_aspect_ratio: 0.5 114 | max_aspect_ratio: 2.0 115 | } 116 | sample_constraint { 117 | min_jaccard_overlap: 0.899999976158 118 | } 119 | max_sample: 1 120 | max_trials: 50 121 | } 122 | batch_sampler { 123 | sampler { 124 | min_scale: 0.300000011921 125 | max_scale: 1.0 126 | min_aspect_ratio: 0.5 127 | max_aspect_ratio: 2.0 128 | } 129 | sample_constraint { 130 | max_jaccard_overlap: 1.0 131 | } 132 | max_sample: 1 133 | max_trials: 50 134 | } 135 | label_map_file: "../data/labelmap_voc.prototxt" 136 | } 137 | } 138 | layer { 139 | name: "conv1_1" 140 | type: "Convolution" 141 | bottom: "data" 142 | top: "conv1_1" 143 | param { 144 | lr_mult: 1.0 145 | decay_mult: 1.0 146 | } 147 | param { 148 | lr_mult: 2.0 149 | decay_mult: 0.0 150 | } 151 | convolution_param { 152 | num_output: 64 153 | pad: 1 154 | kernel_size: 3 155 | weight_filler { 156 | type: "xavier" 157 | } 158 | bias_filler { 159 | type: "constant" 160 | value: 0.0 161 | } 162 | } 163 | } 164 | layer { 165 | name: "relu1_1" 166 | type: "ReLU" 167 | bottom: "conv1_1" 168 | top: "conv1_1" 169 | } 170 | layer { 171 | name: "conv1_2" 172 | type: "Convolution" 173 | bottom: "conv1_1" 174 | top: "conv1_2" 175 | param { 176 | lr_mult: 1.0 177 | decay_mult: 1.0 178 | } 179 | param { 180 | lr_mult: 2.0 181 | decay_mult: 0.0 182 | } 183 | convolution_param { 184 | num_output: 64 185 | pad: 1 186 | kernel_size: 3 187 | weight_filler { 188 | type: "xavier" 189 | } 190 | bias_filler { 191 | type: "constant" 192 | value: 0.0 193 | } 194 | } 195 | } 196 | layer { 197 | name: "relu1_2" 198 | type: "ReLU" 199 | bottom: "conv1_2" 200 | top: "conv1_2" 201 | } 202 | layer { 203 | name: "pool1" 204 | type: "Pooling" 205 | bottom: "conv1_2" 206 | top: "pool1" 207 | pooling_param { 208 | pool: MAX 209 | kernel_size: 2 210 | stride: 2 211 | } 212 | } 213 | layer { 214 | name: "conv2_1" 215 | type: "Convolution" 216 | bottom: "pool1" 217 | top: "conv2_1" 218 | param { 219 | lr_mult: 1.0 220 | decay_mult: 1.0 221 | } 222 | param { 223 | lr_mult: 2.0 224 | decay_mult: 0.0 225 | } 226 | convolution_param { 227 | num_output: 128 228 | pad: 1 229 | kernel_size: 3 230 | weight_filler { 231 | type: "xavier" 232 | } 233 | bias_filler { 234 | type: "constant" 235 | value: 0.0 236 | } 237 | } 238 | } 239 | layer { 240 | name: "relu2_1" 241 | type: "ReLU" 242 | bottom: "conv2_1" 243 | top: "conv2_1" 244 | } 245 | layer { 246 | name: "conv2_2" 247 | type: "Convolution" 248 | bottom: "conv2_1" 249 | top: "conv2_2" 250 | param { 251 | lr_mult: 1.0 252 | decay_mult: 1.0 253 | } 254 | param { 255 | lr_mult: 2.0 256 | decay_mult: 0.0 257 | } 258 | convolution_param { 259 | num_output: 128 260 | pad: 1 261 | kernel_size: 3 262 | weight_filler { 263 | type: "xavier" 264 | } 265 | bias_filler { 266 | type: "constant" 267 | value: 0.0 268 | } 269 | } 270 | } 271 | layer { 272 | name: "relu2_2" 273 | type: "ReLU" 274 | bottom: "conv2_2" 275 | top: "conv2_2" 276 | } 277 | layer { 278 | name: "pool2" 279 | type: "Pooling" 280 | bottom: "conv2_2" 281 | top: "pool2" 282 | pooling_param { 283 | pool: MAX 284 | kernel_size: 2 285 | stride: 2 286 | } 287 | } 288 | layer { 289 | name: "conv3_1" 290 | type: "Convolution" 291 | bottom: "pool2" 292 | top: "conv3_1" 293 | param { 294 | lr_mult: 1.0 295 | decay_mult: 1.0 296 | } 297 | param { 298 | lr_mult: 2.0 
299 | decay_mult: 0.0 300 | } 301 | convolution_param { 302 | num_output: 256 303 | pad: 1 304 | kernel_size: 3 305 | weight_filler { 306 | type: "xavier" 307 | } 308 | bias_filler { 309 | type: "constant" 310 | value: 0.0 311 | } 312 | } 313 | } 314 | layer { 315 | name: "relu3_1" 316 | type: "ReLU" 317 | bottom: "conv3_1" 318 | top: "conv3_1" 319 | } 320 | layer { 321 | name: "conv3_2" 322 | type: "Convolution" 323 | bottom: "conv3_1" 324 | top: "conv3_2" 325 | param { 326 | lr_mult: 1.0 327 | decay_mult: 1.0 328 | } 329 | param { 330 | lr_mult: 2.0 331 | decay_mult: 0.0 332 | } 333 | convolution_param { 334 | num_output: 256 335 | pad: 1 336 | kernel_size: 3 337 | weight_filler { 338 | type: "xavier" 339 | } 340 | bias_filler { 341 | type: "constant" 342 | value: 0.0 343 | } 344 | } 345 | } 346 | layer { 347 | name: "relu3_2" 348 | type: "ReLU" 349 | bottom: "conv3_2" 350 | top: "conv3_2" 351 | } 352 | layer { 353 | name: "conv3_3" 354 | type: "Convolution" 355 | bottom: "conv3_2" 356 | top: "conv3_3" 357 | param { 358 | lr_mult: 1.0 359 | decay_mult: 1.0 360 | } 361 | param { 362 | lr_mult: 2.0 363 | decay_mult: 0.0 364 | } 365 | convolution_param { 366 | num_output: 256 367 | pad: 1 368 | kernel_size: 3 369 | weight_filler { 370 | type: "xavier" 371 | } 372 | bias_filler { 373 | type: "constant" 374 | value: 0.0 375 | } 376 | } 377 | } 378 | layer { 379 | name: "relu3_3" 380 | type: "ReLU" 381 | bottom: "conv3_3" 382 | top: "conv3_3" 383 | } 384 | layer { 385 | name: "pool3" 386 | type: "Pooling" 387 | bottom: "conv3_3" 388 | top: "pool3" 389 | pooling_param { 390 | pool: MAX 391 | kernel_size: 2 392 | stride: 2 393 | } 394 | } 395 | layer { 396 | name: "conv4_1" 397 | type: "Convolution" 398 | bottom: "pool3" 399 | top: "conv4_1" 400 | param { 401 | lr_mult: 1.0 402 | decay_mult: 1.0 403 | } 404 | param { 405 | lr_mult: 2.0 406 | decay_mult: 0.0 407 | } 408 | convolution_param { 409 | num_output: 512 410 | pad: 1 411 | kernel_size: 3 412 | weight_filler { 413 | type: "xavier" 414 | } 415 | bias_filler { 416 | type: "constant" 417 | value: 0.0 418 | } 419 | } 420 | } 421 | layer { 422 | name: "relu4_1" 423 | type: "ReLU" 424 | bottom: "conv4_1" 425 | top: "conv4_1" 426 | } 427 | layer { 428 | name: "conv4_2" 429 | type: "Convolution" 430 | bottom: "conv4_1" 431 | top: "conv4_2" 432 | param { 433 | lr_mult: 1.0 434 | decay_mult: 1.0 435 | } 436 | param { 437 | lr_mult: 2.0 438 | decay_mult: 0.0 439 | } 440 | convolution_param { 441 | num_output: 512 442 | pad: 1 443 | kernel_size: 3 444 | weight_filler { 445 | type: "xavier" 446 | } 447 | bias_filler { 448 | type: "constant" 449 | value: 0.0 450 | } 451 | } 452 | } 453 | layer { 454 | name: "relu4_2" 455 | type: "ReLU" 456 | bottom: "conv4_2" 457 | top: "conv4_2" 458 | } 459 | layer { 460 | name: "conv4_3" 461 | type: "Convolution" 462 | bottom: "conv4_2" 463 | top: "conv4_3" 464 | param { 465 | lr_mult: 1.0 466 | decay_mult: 1.0 467 | } 468 | param { 469 | lr_mult: 2.0 470 | decay_mult: 0.0 471 | } 472 | convolution_param { 473 | num_output: 512 474 | pad: 1 475 | kernel_size: 3 476 | weight_filler { 477 | type: "xavier" 478 | } 479 | bias_filler { 480 | type: "constant" 481 | value: 0.0 482 | } 483 | } 484 | } 485 | layer { 486 | name: "relu4_3" 487 | type: "ReLU" 488 | bottom: "conv4_3" 489 | top: "conv4_3" 490 | } 491 | layer { 492 | name: "pool4" 493 | type: "Pooling" 494 | bottom: "conv4_3" 495 | top: "pool4" 496 | pooling_param { 497 | pool: MAX 498 | kernel_size: 2 499 | stride: 2 500 | } 501 | } 502 | layer { 503 | name: 
"conv5_1" 504 | type: "Convolution" 505 | bottom: "pool4" 506 | top: "conv5_1" 507 | param { 508 | lr_mult: 1.0 509 | decay_mult: 1.0 510 | } 511 | param { 512 | lr_mult: 2.0 513 | decay_mult: 0.0 514 | } 515 | convolution_param { 516 | num_output: 512 517 | pad: 1 518 | kernel_size: 3 519 | weight_filler { 520 | type: "xavier" 521 | } 522 | bias_filler { 523 | type: "constant" 524 | value: 0.0 525 | } 526 | dilation: 1 527 | } 528 | } 529 | layer { 530 | name: "relu5_1" 531 | type: "ReLU" 532 | bottom: "conv5_1" 533 | top: "conv5_1" 534 | } 535 | layer { 536 | name: "conv5_2" 537 | type: "Convolution" 538 | bottom: "conv5_1" 539 | top: "conv5_2" 540 | param { 541 | lr_mult: 1.0 542 | decay_mult: 1.0 543 | } 544 | param { 545 | lr_mult: 2.0 546 | decay_mult: 0.0 547 | } 548 | convolution_param { 549 | num_output: 512 550 | pad: 1 551 | kernel_size: 3 552 | weight_filler { 553 | type: "xavier" 554 | } 555 | bias_filler { 556 | type: "constant" 557 | value: 0.0 558 | } 559 | dilation: 1 560 | } 561 | } 562 | layer { 563 | name: "relu5_2" 564 | type: "ReLU" 565 | bottom: "conv5_2" 566 | top: "conv5_2" 567 | } 568 | layer { 569 | name: "conv5_3" 570 | type: "Convolution" 571 | bottom: "conv5_2" 572 | top: "conv5_3" 573 | param { 574 | lr_mult: 1.0 575 | decay_mult: 1.0 576 | } 577 | param { 578 | lr_mult: 2.0 579 | decay_mult: 0.0 580 | } 581 | convolution_param { 582 | num_output: 512 583 | pad: 1 584 | kernel_size: 3 585 | weight_filler { 586 | type: "xavier" 587 | } 588 | bias_filler { 589 | type: "constant" 590 | value: 0.0 591 | } 592 | dilation: 1 593 | } 594 | } 595 | layer { 596 | name: "relu5_3" 597 | type: "ReLU" 598 | bottom: "conv5_3" 599 | top: "conv5_3" 600 | } 601 | layer { 602 | name: "pool5" 603 | type: "Pooling" 604 | bottom: "conv5_3" 605 | top: "pool5" 606 | pooling_param { 607 | pool: MAX 608 | kernel_size: 3 609 | stride: 1 610 | pad: 1 611 | } 612 | } 613 | layer { 614 | name: "fc6" 615 | type: "Convolution" 616 | bottom: "pool5" 617 | top: "fc6" 618 | param { 619 | lr_mult: 1.0 620 | decay_mult: 1.0 621 | } 622 | param { 623 | lr_mult: 2.0 624 | decay_mult: 0.0 625 | } 626 | convolution_param { 627 | num_output: 1024 628 | pad: 6 629 | kernel_size: 3 630 | weight_filler { 631 | type: "xavier" 632 | } 633 | bias_filler { 634 | type: "constant" 635 | value: 0.0 636 | } 637 | dilation: 6 638 | } 639 | } 640 | layer { 641 | name: "relu6" 642 | type: "ReLU" 643 | bottom: "fc6" 644 | top: "fc6" 645 | } 646 | layer { 647 | name: "fc7" 648 | type: "Convolution" 649 | bottom: "fc6" 650 | top: "fc7" 651 | param { 652 | lr_mult: 1.0 653 | decay_mult: 1.0 654 | } 655 | param { 656 | lr_mult: 2.0 657 | decay_mult: 0.0 658 | } 659 | convolution_param { 660 | num_output: 1024 661 | kernel_size: 1 662 | weight_filler { 663 | type: "xavier" 664 | } 665 | bias_filler { 666 | type: "constant" 667 | value: 0.0 668 | } 669 | } 670 | } 671 | layer { 672 | name: "relu7" 673 | type: "ReLU" 674 | bottom: "fc7" 675 | top: "fc7" 676 | } 677 | layer { 678 | name: "conv6_1" 679 | type: "Convolution" 680 | bottom: "fc7" 681 | top: "conv6_1" 682 | param { 683 | lr_mult: 1.0 684 | decay_mult: 1.0 685 | } 686 | param { 687 | lr_mult: 2.0 688 | decay_mult: 0.0 689 | } 690 | convolution_param { 691 | num_output: 256 692 | pad: 0 693 | kernel_size: 1 694 | stride: 1 695 | weight_filler { 696 | type: "xavier" 697 | } 698 | bias_filler { 699 | type: "constant" 700 | value: 0.0 701 | } 702 | } 703 | } 704 | layer { 705 | name: "conv6_1_relu" 706 | type: "ReLU" 707 | bottom: "conv6_1" 708 | top: "conv6_1" 709 
| } 710 | layer { 711 | name: "conv6_2" 712 | type: "Convolution" 713 | bottom: "conv6_1" 714 | top: "conv6_2" 715 | param { 716 | lr_mult: 1.0 717 | decay_mult: 1.0 718 | } 719 | param { 720 | lr_mult: 2.0 721 | decay_mult: 0.0 722 | } 723 | convolution_param { 724 | num_output: 512 725 | pad: 1 726 | kernel_size: 3 727 | stride: 2 728 | weight_filler { 729 | type: "xavier" 730 | } 731 | bias_filler { 732 | type: "constant" 733 | value: 0.0 734 | } 735 | } 736 | } 737 | layer { 738 | name: "conv6_2_relu" 739 | type: "ReLU" 740 | bottom: "conv6_2" 741 | top: "conv6_2" 742 | } 743 | layer { 744 | name: "conv7_1" 745 | type: "Convolution" 746 | bottom: "conv6_2" 747 | top: "conv7_1" 748 | param { 749 | lr_mult: 1.0 750 | decay_mult: 1.0 751 | } 752 | param { 753 | lr_mult: 2.0 754 | decay_mult: 0.0 755 | } 756 | convolution_param { 757 | num_output: 128 758 | pad: 0 759 | kernel_size: 1 760 | stride: 1 761 | weight_filler { 762 | type: "xavier" 763 | } 764 | bias_filler { 765 | type: "constant" 766 | value: 0.0 767 | } 768 | } 769 | } 770 | layer { 771 | name: "conv7_1_relu" 772 | type: "ReLU" 773 | bottom: "conv7_1" 774 | top: "conv7_1" 775 | } 776 | layer { 777 | name: "conv7_2" 778 | type: "Convolution" 779 | bottom: "conv7_1" 780 | top: "conv7_2" 781 | param { 782 | lr_mult: 1.0 783 | decay_mult: 1.0 784 | } 785 | param { 786 | lr_mult: 2.0 787 | decay_mult: 0.0 788 | } 789 | convolution_param { 790 | num_output: 256 791 | pad: 1 792 | kernel_size: 3 793 | stride: 2 794 | weight_filler { 795 | type: "xavier" 796 | } 797 | bias_filler { 798 | type: "constant" 799 | value: 0.0 800 | } 801 | } 802 | } 803 | layer { 804 | name: "conv7_2_relu" 805 | type: "ReLU" 806 | bottom: "conv7_2" 807 | top: "conv7_2" 808 | } 809 | layer { 810 | name: "conv8_1" 811 | type: "Convolution" 812 | bottom: "conv7_2" 813 | top: "conv8_1" 814 | param { 815 | lr_mult: 1.0 816 | decay_mult: 1.0 817 | } 818 | param { 819 | lr_mult: 2.0 820 | decay_mult: 0.0 821 | } 822 | convolution_param { 823 | num_output: 128 824 | pad: 0 825 | kernel_size: 1 826 | stride: 1 827 | weight_filler { 828 | type: "xavier" 829 | } 830 | bias_filler { 831 | type: "constant" 832 | value: 0.0 833 | } 834 | } 835 | } 836 | layer { 837 | name: "conv8_1_relu" 838 | type: "ReLU" 839 | bottom: "conv8_1" 840 | top: "conv8_1" 841 | } 842 | layer { 843 | name: "conv8_2" 844 | type: "Convolution" 845 | bottom: "conv8_1" 846 | top: "conv8_2" 847 | param { 848 | lr_mult: 1.0 849 | decay_mult: 1.0 850 | } 851 | param { 852 | lr_mult: 2.0 853 | decay_mult: 0.0 854 | } 855 | convolution_param { 856 | num_output: 256 857 | pad: 0 858 | kernel_size: 3 859 | stride: 1 860 | weight_filler { 861 | type: "xavier" 862 | } 863 | bias_filler { 864 | type: "constant" 865 | value: 0.0 866 | } 867 | } 868 | } 869 | layer { 870 | name: "conv8_2_relu" 871 | type: "ReLU" 872 | bottom: "conv8_2" 873 | top: "conv8_2" 874 | } 875 | layer { 876 | name: "conv9_1" 877 | type: "Convolution" 878 | bottom: "conv8_2" 879 | top: "conv9_1" 880 | param { 881 | lr_mult: 1.0 882 | decay_mult: 1.0 883 | } 884 | param { 885 | lr_mult: 2.0 886 | decay_mult: 0.0 887 | } 888 | convolution_param { 889 | num_output: 128 890 | pad: 0 891 | kernel_size: 1 892 | stride: 1 893 | weight_filler { 894 | type: "xavier" 895 | } 896 | bias_filler { 897 | type: "constant" 898 | value: 0.0 899 | } 900 | } 901 | } 902 | layer { 903 | name: "conv9_1_relu" 904 | type: "ReLU" 905 | bottom: "conv9_1" 906 | top: "conv9_1" 907 | } 908 | layer { 909 | name: "conv9_2" 910 | type: "Convolution" 911 | bottom: 
"conv9_1" 912 | top: "conv9_2" 913 | param { 914 | lr_mult: 1.0 915 | decay_mult: 1.0 916 | } 917 | param { 918 | lr_mult: 2.0 919 | decay_mult: 0.0 920 | } 921 | convolution_param { 922 | num_output: 256 923 | pad: 0 924 | kernel_size: 3 925 | stride: 1 926 | weight_filler { 927 | type: "xavier" 928 | } 929 | bias_filler { 930 | type: "constant" 931 | value: 0.0 932 | } 933 | } 934 | } 935 | layer { 936 | name: "conv9_2_relu" 937 | type: "ReLU" 938 | bottom: "conv9_2" 939 | top: "conv9_2" 940 | } 941 | layer { 942 | name: "conv4_3_norm" 943 | type: "Normalize" 944 | bottom: "conv4_3" 945 | top: "conv4_3_norm" 946 | norm_param { 947 | across_spatial: false 948 | scale_filler { 949 | type: "constant" 950 | value: 20.0 951 | } 952 | channel_shared: false 953 | } 954 | } 955 | layer { 956 | name: "conv4_3_norm_mbox_loc" 957 | type: "Convolution" 958 | bottom: "conv4_3_norm" 959 | top: "conv4_3_norm_mbox_loc" 960 | param { 961 | lr_mult: 1.0 962 | decay_mult: 1.0 963 | } 964 | param { 965 | lr_mult: 2.0 966 | decay_mult: 0.0 967 | } 968 | convolution_param { 969 | num_output: 16 970 | pad: 1 971 | kernel_size: 3 972 | stride: 1 973 | weight_filler { 974 | type: "xavier" 975 | } 976 | bias_filler { 977 | type: "constant" 978 | value: 0.0 979 | } 980 | } 981 | } 982 | layer { 983 | name: "conv4_3_norm_mbox_loc_perm" 984 | type: "Permute" 985 | bottom: "conv4_3_norm_mbox_loc" 986 | top: "conv4_3_norm_mbox_loc_perm" 987 | permute_param { 988 | order: 0 989 | order: 2 990 | order: 3 991 | order: 1 992 | } 993 | } 994 | layer { 995 | name: "conv4_3_norm_mbox_loc_flat" 996 | type: "Flatten" 997 | bottom: "conv4_3_norm_mbox_loc_perm" 998 | top: "conv4_3_norm_mbox_loc_flat" 999 | flatten_param { 1000 | axis: 1 1001 | } 1002 | } 1003 | layer { 1004 | name: "conv4_3_norm_mbox_conf_hand_detection" 1005 | type: "Convolution" 1006 | bottom: "conv4_3_norm" 1007 | top: "conv4_3_norm_mbox_conf_hand_detection" 1008 | param { 1009 | lr_mult: 1.0 1010 | decay_mult: 1.0 1011 | } 1012 | param { 1013 | lr_mult: 2.0 1014 | decay_mult: 0.0 1015 | } 1016 | convolution_param { 1017 | num_output: 8 1018 | pad: 1 1019 | kernel_size: 3 1020 | stride: 1 1021 | weight_filler { 1022 | type: "xavier" 1023 | } 1024 | bias_filler { 1025 | type: "constant" 1026 | value: 0.0 1027 | } 1028 | } 1029 | } 1030 | layer { 1031 | name: "conv4_3_norm_mbox_conf_hand_detection_perm" 1032 | type: "Permute" 1033 | bottom: "conv4_3_norm_mbox_conf_hand_detection" 1034 | top: "conv4_3_norm_mbox_conf_hand_detection_perm" 1035 | permute_param { 1036 | order: 0 1037 | order: 2 1038 | order: 3 1039 | order: 1 1040 | } 1041 | } 1042 | layer { 1043 | name: "conv4_3_norm_mbox_conf_hand_detection_flat" 1044 | type: "Flatten" 1045 | bottom: "conv4_3_norm_mbox_conf_hand_detection_perm" 1046 | top: "conv4_3_norm_mbox_conf_hand_detection_flat" 1047 | flatten_param { 1048 | axis: 1 1049 | } 1050 | } 1051 | layer { 1052 | name: "conv4_3_norm_mbox_priorbox" 1053 | type: "PriorBox" 1054 | bottom: "conv4_3_norm" 1055 | bottom: "data" 1056 | top: "conv4_3_norm_mbox_priorbox" 1057 | prior_box_param { 1058 | min_size: 30.0 1059 | max_size: 60.0 1060 | aspect_ratio: 2.0 1061 | flip: true 1062 | clip: false 1063 | variance: 0.10000000149 1064 | variance: 0.10000000149 1065 | variance: 0.20000000298 1066 | variance: 0.20000000298 1067 | step: 8.0 1068 | offset: 0.5 1069 | } 1070 | } 1071 | layer { 1072 | name: "fc7_mbox_loc" 1073 | type: "Convolution" 1074 | bottom: "fc7" 1075 | top: "fc7_mbox_loc" 1076 | param { 1077 | lr_mult: 1.0 1078 | decay_mult: 1.0 1079 | } 
1080 | param { 1081 | lr_mult: 2.0 1082 | decay_mult: 0.0 1083 | } 1084 | convolution_param { 1085 | num_output: 24 1086 | pad: 1 1087 | kernel_size: 3 1088 | stride: 1 1089 | weight_filler { 1090 | type: "xavier" 1091 | } 1092 | bias_filler { 1093 | type: "constant" 1094 | value: 0.0 1095 | } 1096 | } 1097 | } 1098 | layer { 1099 | name: "fc7_mbox_loc_perm" 1100 | type: "Permute" 1101 | bottom: "fc7_mbox_loc" 1102 | top: "fc7_mbox_loc_perm" 1103 | permute_param { 1104 | order: 0 1105 | order: 2 1106 | order: 3 1107 | order: 1 1108 | } 1109 | } 1110 | layer { 1111 | name: "fc7_mbox_loc_flat" 1112 | type: "Flatten" 1113 | bottom: "fc7_mbox_loc_perm" 1114 | top: "fc7_mbox_loc_flat" 1115 | flatten_param { 1116 | axis: 1 1117 | } 1118 | } 1119 | layer { 1120 | name: "fc7_mbox_conf_hand_detection" 1121 | type: "Convolution" 1122 | bottom: "fc7" 1123 | top: "fc7_mbox_conf_hand_detection" 1124 | param { 1125 | lr_mult: 1.0 1126 | decay_mult: 1.0 1127 | } 1128 | param { 1129 | lr_mult: 2.0 1130 | decay_mult: 0.0 1131 | } 1132 | convolution_param { 1133 | num_output: 12 1134 | pad: 1 1135 | kernel_size: 3 1136 | stride: 1 1137 | weight_filler { 1138 | type: "xavier" 1139 | } 1140 | bias_filler { 1141 | type: "constant" 1142 | value: 0.0 1143 | } 1144 | } 1145 | } 1146 | layer { 1147 | name: "fc7_mbox_conf_hand_detection_perm" 1148 | type: "Permute" 1149 | bottom: "fc7_mbox_conf_hand_detection" 1150 | top: "fc7_mbox_conf_hand_detection_perm" 1151 | permute_param { 1152 | order: 0 1153 | order: 2 1154 | order: 3 1155 | order: 1 1156 | } 1157 | } 1158 | layer { 1159 | name: "fc7_mbox_conf_hand_detection_flat" 1160 | type: "Flatten" 1161 | bottom: "fc7_mbox_conf_hand_detection_perm" 1162 | top: "fc7_mbox_conf_hand_detection_flat" 1163 | flatten_param { 1164 | axis: 1 1165 | } 1166 | } 1167 | layer { 1168 | name: "fc7_mbox_priorbox" 1169 | type: "PriorBox" 1170 | bottom: "fc7" 1171 | bottom: "data" 1172 | top: "fc7_mbox_priorbox" 1173 | prior_box_param { 1174 | min_size: 60.0 1175 | max_size: 111.0 1176 | aspect_ratio: 2.0 1177 | aspect_ratio: 3.0 1178 | flip: true 1179 | clip: false 1180 | variance: 0.10000000149 1181 | variance: 0.10000000149 1182 | variance: 0.20000000298 1183 | variance: 0.20000000298 1184 | step: 16.0 1185 | offset: 0.5 1186 | } 1187 | } 1188 | layer { 1189 | name: "conv6_2_mbox_loc" 1190 | type: "Convolution" 1191 | bottom: "conv6_2" 1192 | top: "conv6_2_mbox_loc" 1193 | param { 1194 | lr_mult: 1.0 1195 | decay_mult: 1.0 1196 | } 1197 | param { 1198 | lr_mult: 2.0 1199 | decay_mult: 0.0 1200 | } 1201 | convolution_param { 1202 | num_output: 24 1203 | pad: 1 1204 | kernel_size: 3 1205 | stride: 1 1206 | weight_filler { 1207 | type: "xavier" 1208 | } 1209 | bias_filler { 1210 | type: "constant" 1211 | value: 0.0 1212 | } 1213 | } 1214 | } 1215 | layer { 1216 | name: "conv6_2_mbox_loc_perm" 1217 | type: "Permute" 1218 | bottom: "conv6_2_mbox_loc" 1219 | top: "conv6_2_mbox_loc_perm" 1220 | permute_param { 1221 | order: 0 1222 | order: 2 1223 | order: 3 1224 | order: 1 1225 | } 1226 | } 1227 | layer { 1228 | name: "conv6_2_mbox_loc_flat" 1229 | type: "Flatten" 1230 | bottom: "conv6_2_mbox_loc_perm" 1231 | top: "conv6_2_mbox_loc_flat" 1232 | flatten_param { 1233 | axis: 1 1234 | } 1235 | } 1236 | layer { 1237 | name: "conv6_2_mbox_conf_hand_detection" 1238 | type: "Convolution" 1239 | bottom: "conv6_2" 1240 | top: "conv6_2_mbox_conf_hand_detection" 1241 | param { 1242 | lr_mult: 1.0 1243 | decay_mult: 1.0 1244 | } 1245 | param { 1246 | lr_mult: 2.0 1247 | decay_mult: 0.0 1248 | } 1249 | 
convolution_param { 1250 | num_output: 12 1251 | pad: 1 1252 | kernel_size: 3 1253 | stride: 1 1254 | weight_filler { 1255 | type: "xavier" 1256 | } 1257 | bias_filler { 1258 | type: "constant" 1259 | value: 0.0 1260 | } 1261 | } 1262 | } 1263 | layer { 1264 | name: "conv6_2_mbox_conf_hand_detection_perm" 1265 | type: "Permute" 1266 | bottom: "conv6_2_mbox_conf_hand_detection" 1267 | top: "conv6_2_mbox_conf_hand_detection_perm" 1268 | permute_param { 1269 | order: 0 1270 | order: 2 1271 | order: 3 1272 | order: 1 1273 | } 1274 | } 1275 | layer { 1276 | name: "conv6_2_mbox_conf_hand_detection_flat" 1277 | type: "Flatten" 1278 | bottom: "conv6_2_mbox_conf_hand_detection_perm" 1279 | top: "conv6_2_mbox_conf_hand_detection_flat" 1280 | flatten_param { 1281 | axis: 1 1282 | } 1283 | } 1284 | layer { 1285 | name: "conv6_2_mbox_priorbox" 1286 | type: "PriorBox" 1287 | bottom: "conv6_2" 1288 | bottom: "data" 1289 | top: "conv6_2_mbox_priorbox" 1290 | prior_box_param { 1291 | min_size: 111.0 1292 | max_size: 162.0 1293 | aspect_ratio: 2.0 1294 | aspect_ratio: 3.0 1295 | flip: true 1296 | clip: false 1297 | variance: 0.10000000149 1298 | variance: 0.10000000149 1299 | variance: 0.20000000298 1300 | variance: 0.20000000298 1301 | step: 32.0 1302 | offset: 0.5 1303 | } 1304 | } 1305 | layer { 1306 | name: "conv7_2_mbox_loc" 1307 | type: "Convolution" 1308 | bottom: "conv7_2" 1309 | top: "conv7_2_mbox_loc" 1310 | param { 1311 | lr_mult: 1.0 1312 | decay_mult: 1.0 1313 | } 1314 | param { 1315 | lr_mult: 2.0 1316 | decay_mult: 0.0 1317 | } 1318 | convolution_param { 1319 | num_output: 24 1320 | pad: 1 1321 | kernel_size: 3 1322 | stride: 1 1323 | weight_filler { 1324 | type: "xavier" 1325 | } 1326 | bias_filler { 1327 | type: "constant" 1328 | value: 0.0 1329 | } 1330 | } 1331 | } 1332 | layer { 1333 | name: "conv7_2_mbox_loc_perm" 1334 | type: "Permute" 1335 | bottom: "conv7_2_mbox_loc" 1336 | top: "conv7_2_mbox_loc_perm" 1337 | permute_param { 1338 | order: 0 1339 | order: 2 1340 | order: 3 1341 | order: 1 1342 | } 1343 | } 1344 | layer { 1345 | name: "conv7_2_mbox_loc_flat" 1346 | type: "Flatten" 1347 | bottom: "conv7_2_mbox_loc_perm" 1348 | top: "conv7_2_mbox_loc_flat" 1349 | flatten_param { 1350 | axis: 1 1351 | } 1352 | } 1353 | layer { 1354 | name: "conv7_2_mbox_conf_hand_detection" 1355 | type: "Convolution" 1356 | bottom: "conv7_2" 1357 | top: "conv7_2_mbox_conf_hand_detection" 1358 | param { 1359 | lr_mult: 1.0 1360 | decay_mult: 1.0 1361 | } 1362 | param { 1363 | lr_mult: 2.0 1364 | decay_mult: 0.0 1365 | } 1366 | convolution_param { 1367 | num_output: 12 1368 | pad: 1 1369 | kernel_size: 3 1370 | stride: 1 1371 | weight_filler { 1372 | type: "xavier" 1373 | } 1374 | bias_filler { 1375 | type: "constant" 1376 | value: 0.0 1377 | } 1378 | } 1379 | } 1380 | layer { 1381 | name: "conv7_2_mbox_conf_hand_detection_perm" 1382 | type: "Permute" 1383 | bottom: "conv7_2_mbox_conf_hand_detection" 1384 | top: "conv7_2_mbox_conf_hand_detection_perm" 1385 | permute_param { 1386 | order: 0 1387 | order: 2 1388 | order: 3 1389 | order: 1 1390 | } 1391 | } 1392 | layer { 1393 | name: "conv7_2_mbox_conf_hand_detection_flat" 1394 | type: "Flatten" 1395 | bottom: "conv7_2_mbox_conf_hand_detection_perm" 1396 | top: "conv7_2_mbox_conf_hand_detection_flat" 1397 | flatten_param { 1398 | axis: 1 1399 | } 1400 | } 1401 | layer { 1402 | name: "conv7_2_mbox_priorbox" 1403 | type: "PriorBox" 1404 | bottom: "conv7_2" 1405 | bottom: "data" 1406 | top: "conv7_2_mbox_priorbox" 1407 | prior_box_param { 1408 | min_size: 162.0 
1409 | max_size: 213.0 1410 | aspect_ratio: 2.0 1411 | aspect_ratio: 3.0 1412 | flip: true 1413 | clip: false 1414 | variance: 0.10000000149 1415 | variance: 0.10000000149 1416 | variance: 0.20000000298 1417 | variance: 0.20000000298 1418 | step: 64.0 1419 | offset: 0.5 1420 | } 1421 | } 1422 | layer { 1423 | name: "conv8_2_mbox_loc" 1424 | type: "Convolution" 1425 | bottom: "conv8_2" 1426 | top: "conv8_2_mbox_loc" 1427 | param { 1428 | lr_mult: 1.0 1429 | decay_mult: 1.0 1430 | } 1431 | param { 1432 | lr_mult: 2.0 1433 | decay_mult: 0.0 1434 | } 1435 | convolution_param { 1436 | num_output: 16 1437 | pad: 1 1438 | kernel_size: 3 1439 | stride: 1 1440 | weight_filler { 1441 | type: "xavier" 1442 | } 1443 | bias_filler { 1444 | type: "constant" 1445 | value: 0.0 1446 | } 1447 | } 1448 | } 1449 | layer { 1450 | name: "conv8_2_mbox_loc_perm" 1451 | type: "Permute" 1452 | bottom: "conv8_2_mbox_loc" 1453 | top: "conv8_2_mbox_loc_perm" 1454 | permute_param { 1455 | order: 0 1456 | order: 2 1457 | order: 3 1458 | order: 1 1459 | } 1460 | } 1461 | layer { 1462 | name: "conv8_2_mbox_loc_flat" 1463 | type: "Flatten" 1464 | bottom: "conv8_2_mbox_loc_perm" 1465 | top: "conv8_2_mbox_loc_flat" 1466 | flatten_param { 1467 | axis: 1 1468 | } 1469 | } 1470 | layer { 1471 | name: "conv8_2_mbox_conf_hand_detection" 1472 | type: "Convolution" 1473 | bottom: "conv8_2" 1474 | top: "conv8_2_mbox_conf_hand_detection" 1475 | param { 1476 | lr_mult: 1.0 1477 | decay_mult: 1.0 1478 | } 1479 | param { 1480 | lr_mult: 2.0 1481 | decay_mult: 0.0 1482 | } 1483 | convolution_param { 1484 | num_output: 8 1485 | pad: 1 1486 | kernel_size: 3 1487 | stride: 1 1488 | weight_filler { 1489 | type: "xavier" 1490 | } 1491 | bias_filler { 1492 | type: "constant" 1493 | value: 0.0 1494 | } 1495 | } 1496 | } 1497 | layer { 1498 | name: "conv8_2_mbox_conf_hand_detection_perm" 1499 | type: "Permute" 1500 | bottom: "conv8_2_mbox_conf_hand_detection" 1501 | top: "conv8_2_mbox_conf_hand_detection_perm" 1502 | permute_param { 1503 | order: 0 1504 | order: 2 1505 | order: 3 1506 | order: 1 1507 | } 1508 | } 1509 | layer { 1510 | name: "conv8_2_mbox_conf_hand_detection_flat" 1511 | type: "Flatten" 1512 | bottom: "conv8_2_mbox_conf_hand_detection_perm" 1513 | top: "conv8_2_mbox_conf_hand_detection_flat" 1514 | flatten_param { 1515 | axis: 1 1516 | } 1517 | } 1518 | layer { 1519 | name: "conv8_2_mbox_priorbox" 1520 | type: "PriorBox" 1521 | bottom: "conv8_2" 1522 | bottom: "data" 1523 | top: "conv8_2_mbox_priorbox" 1524 | prior_box_param { 1525 | min_size: 213.0 1526 | max_size: 264.0 1527 | aspect_ratio: 2.0 1528 | flip: true 1529 | clip: false 1530 | variance: 0.10000000149 1531 | variance: 0.10000000149 1532 | variance: 0.20000000298 1533 | variance: 0.20000000298 1534 | step: 100.0 1535 | offset: 0.5 1536 | } 1537 | } 1538 | layer { 1539 | name: "conv9_2_mbox_loc" 1540 | type: "Convolution" 1541 | bottom: "conv9_2" 1542 | top: "conv9_2_mbox_loc" 1543 | param { 1544 | lr_mult: 1.0 1545 | decay_mult: 1.0 1546 | } 1547 | param { 1548 | lr_mult: 2.0 1549 | decay_mult: 0.0 1550 | } 1551 | convolution_param { 1552 | num_output: 16 1553 | pad: 1 1554 | kernel_size: 3 1555 | stride: 1 1556 | weight_filler { 1557 | type: "xavier" 1558 | } 1559 | bias_filler { 1560 | type: "constant" 1561 | value: 0.0 1562 | } 1563 | } 1564 | } 1565 | layer { 1566 | name: "conv9_2_mbox_loc_perm" 1567 | type: "Permute" 1568 | bottom: "conv9_2_mbox_loc" 1569 | top: "conv9_2_mbox_loc_perm" 1570 | permute_param { 1571 | order: 0 1572 | order: 2 1573 | order: 3 1574 | 
order: 1 1575 | } 1576 | } 1577 | layer { 1578 | name: "conv9_2_mbox_loc_flat" 1579 | type: "Flatten" 1580 | bottom: "conv9_2_mbox_loc_perm" 1581 | top: "conv9_2_mbox_loc_flat" 1582 | flatten_param { 1583 | axis: 1 1584 | } 1585 | } 1586 | layer { 1587 | name: "conv9_2_mbox_conf_hand_detection" 1588 | type: "Convolution" 1589 | bottom: "conv9_2" 1590 | top: "conv9_2_mbox_conf_hand_detection" 1591 | param { 1592 | lr_mult: 1.0 1593 | decay_mult: 1.0 1594 | } 1595 | param { 1596 | lr_mult: 2.0 1597 | decay_mult: 0.0 1598 | } 1599 | convolution_param { 1600 | num_output: 8 1601 | pad: 1 1602 | kernel_size: 3 1603 | stride: 1 1604 | weight_filler { 1605 | type: "xavier" 1606 | } 1607 | bias_filler { 1608 | type: "constant" 1609 | value: 0.0 1610 | } 1611 | } 1612 | } 1613 | layer { 1614 | name: "conv9_2_mbox_conf_hand_detection_perm" 1615 | type: "Permute" 1616 | bottom: "conv9_2_mbox_conf_hand_detection" 1617 | top: "conv9_2_mbox_conf_hand_detection_perm" 1618 | permute_param { 1619 | order: 0 1620 | order: 2 1621 | order: 3 1622 | order: 1 1623 | } 1624 | } 1625 | layer { 1626 | name: "conv9_2_mbox_conf_hand_detection_flat" 1627 | type: "Flatten" 1628 | bottom: "conv9_2_mbox_conf_hand_detection_perm" 1629 | top: "conv9_2_mbox_conf_hand_detection_flat" 1630 | flatten_param { 1631 | axis: 1 1632 | } 1633 | } 1634 | layer { 1635 | name: "conv9_2_mbox_priorbox" 1636 | type: "PriorBox" 1637 | bottom: "conv9_2" 1638 | bottom: "data" 1639 | top: "conv9_2_mbox_priorbox" 1640 | prior_box_param { 1641 | min_size: 264.0 1642 | max_size: 315.0 1643 | aspect_ratio: 2.0 1644 | flip: true 1645 | clip: false 1646 | variance: 0.10000000149 1647 | variance: 0.10000000149 1648 | variance: 0.20000000298 1649 | variance: 0.20000000298 1650 | step: 300.0 1651 | offset: 0.5 1652 | } 1653 | } 1654 | layer { 1655 | name: "mbox_loc" 1656 | type: "Concat" 1657 | bottom: "conv4_3_norm_mbox_loc_flat" 1658 | bottom: "fc7_mbox_loc_flat" 1659 | bottom: "conv6_2_mbox_loc_flat" 1660 | bottom: "conv7_2_mbox_loc_flat" 1661 | bottom: "conv8_2_mbox_loc_flat" 1662 | bottom: "conv9_2_mbox_loc_flat" 1663 | top: "mbox_loc" 1664 | concat_param { 1665 | axis: 1 1666 | } 1667 | } 1668 | layer { 1669 | name: "mbox_conf" 1670 | type: "Concat" 1671 | bottom: "conv4_3_norm_mbox_conf_hand_detection_flat" 1672 | bottom: "fc7_mbox_conf_hand_detection_flat" 1673 | bottom: "conv6_2_mbox_conf_hand_detection_flat" 1674 | bottom: "conv7_2_mbox_conf_hand_detection_flat" 1675 | bottom: "conv8_2_mbox_conf_hand_detection_flat" 1676 | bottom: "conv9_2_mbox_conf_hand_detection_flat" 1677 | top: "mbox_conf" 1678 | concat_param { 1679 | axis: 1 1680 | } 1681 | } 1682 | layer { 1683 | name: "mbox_priorbox" 1684 | type: "Concat" 1685 | bottom: "conv4_3_norm_mbox_priorbox" 1686 | bottom: "fc7_mbox_priorbox" 1687 | bottom: "conv6_2_mbox_priorbox" 1688 | bottom: "conv7_2_mbox_priorbox" 1689 | bottom: "conv8_2_mbox_priorbox" 1690 | bottom: "conv9_2_mbox_priorbox" 1691 | top: "mbox_priorbox" 1692 | concat_param { 1693 | axis: 2 1694 | } 1695 | } 1696 | layer { 1697 | name: "mbox_loss" 1698 | type: "MultiBoxLoss" 1699 | bottom: "mbox_loc" 1700 | bottom: "mbox_conf" 1701 | bottom: "mbox_priorbox" 1702 | bottom: "label" 1703 | top: "mbox_loss" 1704 | include { 1705 | phase: TRAIN 1706 | } 1707 | propagate_down: true 1708 | propagate_down: true 1709 | propagate_down: false 1710 | propagate_down: false 1711 | loss_param { 1712 | normalization: VALID 1713 | } 1714 | multibox_loss_param { 1715 | loc_loss_type: SMOOTH_L1 1716 | conf_loss_type: SOFTMAX 1717 | loc_weight: 
1.0 1718 | num_classes: 2 1719 | share_location: true 1720 | match_type: PER_PREDICTION 1721 | overlap_threshold: 0.5 1722 | use_prior_for_matching: true 1723 | background_label_id: 0 1724 | use_difficult_gt: true 1725 | neg_pos_ratio: 3.0 1726 | neg_overlap: 0.5 1727 | code_type: CENTER_SIZE 1728 | ignore_cross_boundary_bbox: false 1729 | mining_type: MAX_NEGATIVE 1730 | } 1731 | } 1732 | 1733 | -------------------------------------------------------------------------------- /other/Hand_Detection/old_README.md: -------------------------------------------------------------------------------- 1 | ### Roadmap of hand keypoint detection 2 | * Step 1 3 | The pipeline is split into three networks with input sizes proposal-net: 12, refine-net: 24, and output-net: 48. proposal-net and refine-net are trained on the hand detection datasets, while output-net is trained on the hand keypoint dataset. 4 | The outputs of the 12 and 24 networks are mined for hard examples, which are added to the original data as the input of the next network, as follows: 5 | 12-net (generates the 12-sized dataset) ---> 24-net (hard examples from 12-net + the generated 24-sized dataset) ---> 48-net 6 | * Data generation 7 | 8 | When generating data, any ground-truth box whose height or width is smaller than 40 is treated as not being a hand, i.e. a wrong label. NMS (non-maximum suppression) is applied to the outputs of 12-net and 24-net to remove duplicate boxes, which also reduces computation. 9 | The [mtcnn-caffe](https://github.com/CongWeilin/mtcnn-caffe) re-implementation customizes the data layer; I prefer not to do that and instead generate HDF5 files, which is more flexible and lets both training and test phases be added. 10 | 11 | * Labels 12 | 13 | During data preparation, every image is annotated with 15 label values: 14 | 15 | 1. Column 1: the sample flag, with 1 for positive, 0 for negative, -1 for partial samples, and 3 for keypoint samples 16 | 17 | 2. Columns 2-5: bounding-box offsets (float); all set to -1 for samples without box information 18 | 19 | 3. The 42 keypoint-offset columns (float); all set to -1 for samples without the corresponding annotation 20 | 21 | 22 | > Modify softmax_loss_layer.cpp with an extra check so that the loss is computed only for labels 1 and 0 23 | Modify euclidean_loss_layer.cpp with an extra check so that entries set to -1 are excluded from the loss 24 | 25 | In other words, the HDF5 file has four blocks: besides data, there are label (positive/negative/partial flag), points (21 keypoints, hence 42 columns), and roi (box information: two corner points, four columns). 26 | 27 | * Custom layers 28 | 1. A custom fc layer that only forwards samples whose label is not -1, so there is no need to distinguish which of the networks is being trained, nor to write several scripts to generate the datasets. This is equivalent to modifying softmax_loss_layer 29 | 2. A custom euclidean_loss_layer that, in the same way, skips entries set to -1 30 | 31 | ### Datasets 32 | Gesture recognition involves two problems, hand detection and hand keypoint detection: the hand is first extracted from the original image and the keypoints are then regressed, so two types of datasets are required. 33 | 1. Hand detection datasets, which mark the position of each hand in the image with a rectangle and annotate the rectangle's coordinates in the original image. 34 | 2. Hand keypoint detection datasets, which annotate the coordinates of keypoints such as the palm, joints, and fingertips. 35 | 36 | The datasets adopted are therefore the following: 37 | #### Hand detection datasets 38 | * [Hand Dataset by Arpit Mittal, Andrew Zisserman and Phil Torr](http://www.robots.ox.ac.uk/~vgg/data/hands/) 39 | A comprehensive dataset of hand images collected from a variety of public image dataset sources. A total of 13,050 instances are annotated. Hand instances larger than a fixed bounding area (1500 square pixels) are considered "big enough" for detection and are used for evaluation, giving around 4,170 high-quality hand instances. No restriction was imposed on the pose or visibility of people, nor on the environment, when collecting the data. In each image, all hands that a human can clearly perceive are annotated. Each annotation consists of a bounding rectangle, which does not have to be axis-aligned but is oriented with respect to the wrist. 40 |
**Training Dataset**

| Source | #Instances | #Big Instances |
| --- | --- | --- |
| Buffy Stickman | 887 | 438 |
| INRIA pedestrian | 1343 | 137 |
| Poselet (H3D) | 1355 | 580 |
| Skin Dataset [2] | 703 | 139 |
| PASCAL VOC 2007 train and val set | 1867 | 507 |
| PASCAL VOC 2010 train and val set (except human layout set) | 3008 | 1060 |
| Total | 9163 | 2861 |

**Validation Dataset**

| Source | #Instances | #Big Instances |
| --- | --- | --- |
| Movie Dataset* | 1856 | 649 |
| Total | 1856 | 649 |

**Test Dataset**

| Source | #Instances | #Big Instances |
| --- | --- | --- |
| PASCAL VOC 2007 test set | 1626 | 562 |
| PASCAL VOC 2010 human layout val set | 405 | 98 |
| Total | 2031 | 660 |
159 | Dataset preview: 160 |
161 | 162 |
163 | 164 | * [VIVA Hand Detection Dataset](http://cvrr.ucsd.edu/vivachallenge/index.php/hands/hand-detection/) 165 | This dataset annotates the hands of drivers and passengers with 2D bounding boxes. It consists of 54 videos collected during naturalistic driving, covering illumination changes, large hand movements, and frequent occlusion. Some of the data was captured from the recording platform and some was contributed from YouTube. 166 | Dataset preview: 167 |
168 | 169 |
170 | #### Hand keypoint detection datasets 171 | * [CMU Hand Database](http://domedb.perception.cs.cmu.edu/handdb.html) 172 | This dataset was collected by CMU from several public datasets, with the hand keypoints annotated manually. Additional hand-detection data is generated from enlarged rectangles that enclose the keypoints. 173 | Dataset preview: 174 |
175 | 176 |
177 | ### Preprocessing 178 | We mainly use the Hand detection datasets and the Hand keypoint detection dataset. The Hand detection datasets contain hand bounding-box annotations and are used mainly to train the detection task; the Hand keypoint detection dataset contains both bounding-box annotations and keypoint information and is used mainly to train the keypoints. The training set is divided into four kinds of samples: negative, positive, partial, and keypoint samples, generated at a ratio of $3:1:1:2$. 179 | #### Extracting positive, negative, and partial samples 180 | 1. Randomly crop boxes from the Hand detection dataset and compute the $IoU$ against the annotated boxes: above $0.65$ is a positive sample, between $0.4$ and $0.65$ a partial sample, and below $0.4$ a negative sample. 181 | $IoU$: in short, the overlap ratio between the window produced by the model and the originally annotated window, i.e. the intersection of the detection result and the ground truth divided by their union: 182 | $$ IoU = \frac{\text{DetectionResult} \cap \text{GroundTruth}}{\text{DetectionResult} \cup \text{GroundTruth}} $$ 183 | 2. Compute the bounding-box offsets. For a ground-truth box, $(x1,y1)$ is the top-left corner and $(x2,y2)$ the bottom-right corner; the newly cropped box is described by 184 | $(xn1,yn1)$, $(xn2,yn2)$, $width$, $height$. Then 185 | $$ \text{offset}_{x1} = (x1 - xn1)/width $$ 186 | and the offsets of the other three coordinates are computed in the same way. 187 | 188 | 3. Positive and partial samples both carry bounding-box information; negative samples do not need it. 189 | 190 | #### Extracting keypoint samples 191 | Keypoint samples are extracted from the Hand keypoint detection dataset: based on the annotated bounding box, crops satisfying the positive-sample criterion are taken at random and the keypoint coordinates are adjusted to the crop. 192 | 193 | ### Roadmap 194 | #### Loss modifications 195 | Training has to compute 3 losses at the same time, but each task needs a different loss, so during data preparation every image is annotated with 15 label values: 196 | 1. Column 1: the sample flag, with 1 for positive, 0 for negative, 2 for partial, and 3 for keypoint samples 197 | 2. Columns 2-5: bounding-box offsets (float); all set to -1 for samples without box information 198 | 3. Columns 6-15: keypoint offsets (float); all set to -1 for samples without the corresponding annotation 199 | 200 | Once the labels are ready, the following measures are taken during training: 201 | 1. A custom softmax_loss with an extra check that computes the loss only for labels 1 and 0. 202 | 2. A custom euclidean_loss with an extra check that skips entries set to -1. 203 | 3. Hard example selection: the classification task uses online hard example mining, i.e. during training the computed loss values are sorted and only the hardest 70% of samples (those with the largest losses) are back-propagated. 204 | 205 | #### Network description 206 | The pipeline is divided into three stages covering classification, bounding-box regression, and landmark detection. 207 | 1. stage1: on top of an image pyramid, a fully convolutional network performs detection, while bounding-box regression and non-maximum suppression (NMS) merge highly overlapping candidates. This stage yields candidate windows for hand regions together with bounding-box regression vectors, and the regression is used to calibrate the candidate windows. 208 | 2. stage2: all windows that pass stage1 are fed in for further classification, and bounding-box regression and NMS again remove false-positive regions. 209 | 3. stage3: similar in purpose to stage2, but stage3 applies more supervision and a stronger constraint to the hand region, namely the hand keypoints, so stage3 additionally outputs the hand keypoints. -------------------------------------------------------------------------------- /other/Hand_Detection/pic/demo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/other/Hand_Detection/pic/demo.jpg -------------------------------------------------------------------------------- /other/Hand_Detection/pic/example_image.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/other/Hand_Detection/pic/example_image.jpg -------------------------------------------------------------------------------- /other/Hand_Detection/pic/width_and_height.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/other/Hand_Detection/pic/width_and_height.png -------------------------------------------------------------------------------- /other/Hand_Detection/ssd_camera.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import sys 5 | sys.path.insert(0, 'caffe/python') 6 | import caffe 7 | from utils.ssd_net import * 8 | import time 9 | import urllib 10 | 11 | 12 | ## Use local camera 13 | # cap = cv2.VideoCapture(0) 14 | # # width = 720 15 | # # height = 480 16 | width = 640 17 | height = 480 18 | # cap.set(cv2.CAP_PROP_FRAME_WIDTH, width) 19 | # cap.set(cv2.CAP_PROP_FRAME_HEIGHT, height) 20 | 21 | ## Use ipcam 22 |
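# Note: two IP-camera options are shown below. The commented-out cv2.VideoCapture(url) would read
# the MJPEG /videofeed stream directly, while the active code polls the IP Webcam app's shot.jpg
# endpoint with urllib and decodes each JPEG frame via cv2.imdecode. urllib.urlopen is Python 2
# syntax; under Python 3 it would be urllib.request.urlopen.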
# url = r"http://192.168.1.190:8080/videofeed" 23 | # capture = cv2.VideoCapture(url) 24 | 25 | # Replace the URL with your own IPwebcam shot.jpg IP:port 26 | url = 'http://192.168.1.190:8080/shot.jpg' 27 | 28 | 29 | model_def = 'model/deploy.prototxt' 30 | model_weights = 'model/snapshot/VGG_HAND_SSD_300x300_iter_50000.caffemodel' 31 | 32 | ssd_net = SSD_NET(model_weights, model_def, GPU_MODE=True, threshold=0.7) 33 | 34 | while True: 35 | # get a frame 36 | # start_time = time.time() 37 | # ret, frame = capture.read() 38 | 39 | # Use urllib to get the image from the IP camera 40 | imgResp = urllib.urlopen(url) 41 | 42 | # Numpy to convert into a array 43 | imgNp = np.array(bytearray(imgResp.read()),dtype=np.uint8) 44 | 45 | # Finally decode the array to OpenCV usable format ;) 46 | frame = cv2.imdecode(imgNp,-1) 47 | 48 | start_time = time.time() 49 | 50 | # show a frame 51 | try: 52 | image_np = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) 53 | except: 54 | print("Error converting to RGB") 55 | 56 | top_label_indices, top_conf, top_xmin, top_ymin, top_xmax, top_ymax = ssd_net.detect(image_np/255.0) 57 | print(image_np.shape) 58 | 59 | print(top_conf) 60 | print(top_label_indices) 61 | for i in range(len(top_conf)): 62 | xmin = int(round(top_xmin[i] * width)) 63 | ymin = int(round(top_ymin[i] * height)) 64 | xmax = int(round(top_xmax[i] * width)) 65 | ymax = int(round(top_ymax[i] * height)) 66 | print(xmin, ymin, xmax, ymax, top_conf[i]) 67 | # if np.sum(top_xmin[i]<0) > 0 or np.sum(top_xmax[i]<0) > 0 or np.sum(top_ymin[i]<0) > 0 or np.sum(top_ymax[i]<0) > 0: 68 | # continue 69 | cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), (0, 255, 0), 2) 70 | # # time.sleep(0.1) 71 | fps = 1/(time.time() - start_time) 72 | cv2.putText(frame, 'FPS: %d' % fps, (20, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2) 73 | 74 | cv2.imshow("capture", frame) 75 | 76 | if cv2.waitKey(1) == 27: 77 | break # esc to quit 78 | 79 | # capture.release() 80 | cv2.destroyAllWindows() -------------------------------------------------------------------------------- /other/Hand_Detection/ssd_detection.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import sys 4 | sys.path.insert(0, 'caffe/python') 5 | import caffe 6 | from utils.ssd_net import * 7 | 8 | plt.rcParams['figure.figsize'] = (10, 10) 9 | plt.rcParams['image.interpolation'] = 'nearest' 10 | plt.rcParams['image.cmap'] = 'gray' 11 | 12 | model_def = 'model/deploy.prototxt' 13 | model_weights = 'model/snapshot/VGG_HAND_SSD_300x300_iter_50000.caffemodel' 14 | 15 | ssd_net = SSD_NET(model_weights, model_def, GPU_MODE=True, threshold=0.5) 16 | 17 | # image = caffe.io.load_image('/Users/hzzone/Desktop/CARDS_COURTYARD_B_T_0324.jpg') 18 | image = caffe.io.load_image('/home/hzzone/Desktop/2.jpg') 19 | 20 | top_label_indices, top_conf, top_xmin, top_ymin, top_xmax, top_ymax = ssd_net.detect(image) 21 | 22 | # print(top_label_indices, top_conf, top_xmin, top_ymin, top_xmax, top_ymax) 23 | 24 | colors = plt.cm.hsv(np.linspace(0, 1, 21)).tolist() 25 | 26 | plt.imshow(image) 27 | currentAxis = plt.gca() 28 | 29 | for i in xrange(top_conf.shape[0]): 30 | xmin = int(round(top_xmin[i] * image.shape[1])) 31 | ymin = int(round(top_ymin[i] * image.shape[0])) 32 | xmax = int(round(top_xmax[i] * image.shape[1])) 33 | ymax = int(round(top_ymax[i] * image.shape[0])) 34 | score = top_conf[i] 35 | label = int(top_label_indices[i]) 36 | # label_name = top_labels[i] 37 | label_name = label 38 | 
display_txt = '%s: %.2f' % ('hand', score) 39 | coords = (xmin, ymin), xmax-xmin+1, ymax-ymin+1 40 | color = colors[label] 41 | currentAxis.add_patch(plt.Rectangle(*coords, fill=False, edgecolor=color, linewidth=2)) 42 | currentAxis.text(xmin, ymin, display_txt, bbox={'facecolor':color, 'alpha':0.5}) 43 | 44 | plt.show() 45 | 46 | -------------------------------------------------------------------------------- /other/Hand_Detection/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/other/Hand_Detection/utils/__init__.py -------------------------------------------------------------------------------- /other/Hand_Detection/utils/mAP.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | def eval_mAP(predict_file, ground_truth_file): 4 | predict_dict = dict() 5 | ground_truth_dict = dict() 6 | def get_info(info_file, info_dict): 7 | bbox_num = 0 8 | first_line = True 9 | with open(info_file) as fr: 10 | for line in fr: 11 | if first_line: 12 | first_line = False 13 | continue 14 | if len(line.strip().split(',')) == 6: 15 | line_data = line.strip().split(',') 16 | im_id = line_data[0] 17 | xmin,ymin,xmax,ymax,score = map(float, line_data[1:]) 18 | else: 19 | im_id,xmin,ymin,xmax,ymax = map(float, line.strip().split(',')) 20 | score = 1. 21 | if not im_id in info_dict: 22 | info_dict[im_id] = list() 23 | info_dict[im_id].append([xmin,ymin,xmax,ymax,score]) 24 | bbox_num+=1 25 | return bbox_num 26 | 27 | predict_bbox_num = get_info(predict_file, predict_dict) 28 | ground_truth_bbox_num = get_info(ground_truth_file, ground_truth_dict) 29 | score_list = list() 30 | match_list = list() 31 | 32 | def iou(predict_bbox, ground_truth_bbox): 33 | predict_area = (predict_bbox[2] - predict_bbox[0])*(predict_bbox[3] - predict_bbox[1]) 34 | ground_truth_area = (ground_truth_bbox[2] - ground_truth_bbox[0])*(ground_truth_bbox[3] - ground_truth_bbox[1]) 35 | inter_x = min(predict_bbox[2],ground_truth_bbox[2]) - max(predict_bbox[0],ground_truth_bbox[0]) 36 | inter_y = min(predict_bbox[3],ground_truth_bbox[3]) - max(predict_bbox[1],ground_truth_bbox[1]) 37 | if inter_x<=0 or inter_y<=0: 38 | return 0 39 | inter_area = inter_x*inter_y 40 | return inter_area / (predict_area+ground_truth_area-inter_area) 41 | 42 | def compare(predict_list, ground_truth_list, score_list, match_list): 43 | ground_truth_unuse = [True for i in range(len(ground_truth_list))] 44 | for predict_bbox in predict_list: 45 | match = False 46 | for i in range(len(ground_truth_list)): 47 | if ground_truth_unuse[i]: 48 | if iou(predict_bbox, ground_truth_list[i])>0.5: 49 | match = True 50 | ground_truth_unuse[i] = False 51 | break 52 | score_list.append(predict_bbox[-1]) 53 | match_list.append(int(match)) 54 | 55 | for key in predict_dict.keys(): 56 | compare(predict_dict[key], ground_truth_dict[key], score_list, match_list) 57 | 58 | p = list() 59 | r = list() 60 | predict_num = 0 61 | truth_num = 0 62 | score_match_list = list(zip(score_list, match_list)) 63 | score_match_list.sort(key=lambda x:x[0], reverse = True) 64 | for item in score_match_list: 65 | predict_num+=1 66 | truth_num+=item[1] 67 | p.append(float(truth_num)/ground_truth_bbox_num) 68 | r.append(float(truth_num)/predict_num) 69 | mAP = 0 70 | for i in range(1,len(p)): 71 | mAP += (r[i-1]+r[i])/2*(p[i]-p[i-1]) 72 | return p, r, mAP 73 | 74 | if __name__ == "__main__": 75 | p, r, mAP = 
eval_mAP("/Users/hzzone/Downloads/object_detection_mAP-master/example/val_pred.csv", 76 | "/Users/hzzone/Downloads/object_detection_mAP-master/example/val_gt.csv") 77 | print(mAP) 78 | -------------------------------------------------------------------------------- /other/Hand_Detection/utils/output.py: -------------------------------------------------------------------------------- 1 | import os 2 | from ssd_net import * 3 | import sys 4 | sys.path.insert(0, '../caffe/python') 5 | import xml.dom.minidom 6 | import csv 7 | import re 8 | import time 9 | 10 | data_dir = '../data' 11 | 12 | def read_xmlfile(file_path): 13 | DomTree = xml.dom.minidom.parse(file_path) 14 | annotation = DomTree.documentElement 15 | objectlist = annotation.getElementsByTagName('object') 16 | label = file_path.split(os.sep)[-1].strip('.xml') 17 | boxes = [] 18 | for objects in objectlist: 19 | bndbox = objects.getElementsByTagName('bndbox')[0] 20 | xmin = int(bndbox.getElementsByTagName('xmin')[0].childNodes[0].data) 21 | ymin = int(bndbox.getElementsByTagName('ymin')[0].childNodes[0].data) 22 | xmax = int(bndbox.getElementsByTagName('xmax')[0].childNodes[0].data) 23 | ymax = int(bndbox.getElementsByTagName('ymax')[0].childNodes[0].data) 24 | print(xmin, ymin, xmax, ymax) 25 | boxes.append([label, xmin, ymin, xmax, ymax, 1]) 26 | # print(bndbox) 27 | return boxes 28 | 29 | 30 | def output_gt_label(datatset_name): 31 | anno_path = os.path.join(data_dir, datatset_name, 'test', 'Annotations') 32 | # img_dir = os.path.join(data_dir, datatset_name, 'test', 'JPEGImages') 33 | all_boxes = [['id', 'x1', 'y1', 'x2', 'y2', 'score'], ] 34 | for root, dirs, files in os.walk(anno_path): 35 | for xml_file in files: 36 | xml_file_path = os.path.join(root, xml_file) 37 | all_boxes.extend(read_xmlfile(xml_file_path)) 38 | with open('../data/gth/{}.csv'.format(datatset_name), 'wb') as csvfile: 39 | csvwriter = csv.writer(csvfile, delimiter=',') 40 | for box in all_boxes: 41 | csvwriter.writerow(box) 42 | 43 | def output(model_def, model_weights, datatset_name): 44 | 45 | img_dir = os.path.join(data_dir, datatset_name, 'test', 'JPEGImages') 46 | ssd_net = SSD_NET(model_weights, model_def, GPU_MODE=True, threshold=0.2) 47 | 48 | output_boxes = [['id', 'x1', 'y1', 'x2', 'y2', 'score'], ] 49 | 50 | 51 | total_time = 0.0 52 | 53 | for img_name in os.listdir(img_dir): 54 | img_path = os.path.join(img_dir, img_name) 55 | img_name = img_name.strip('.jpg') 56 | 57 | image = caffe.io.load_image(img_path) 58 | 59 | start = time.time() 60 | 61 | top_label_indices, top_conf, top_xmin, top_ymin, top_xmax, top_ymax = ssd_net.detect(image) 62 | 63 | total_time = total_time + time.time() - start 64 | 65 | print(img_path) 66 | 67 | for i in xrange(top_conf.shape[0]): 68 | xmin = int(round(top_xmin[i] * image.shape[1])) 69 | ymin = int(round(top_ymin[i] * image.shape[0])) 70 | xmax = int(round(top_xmax[i] * image.shape[1])) 71 | ymax = int(round(top_ymax[i] * image.shape[0])) 72 | score = top_conf[i] 73 | label_indice = top_label_indices[i] 74 | 75 | output_boxes.append([img_name, xmin, ymin, xmax, ymax, score]) 76 | 77 | assert label_indice == 1.0 78 | 79 | 80 | iter_times = re.findall('VGG_HAND_SSD_300x300_(.*?).caffemodel', model_weights.split(os.sep)[-1])[0] 81 | print(iter_times) 82 | output_dir = '../output/{}'.format(iter_times) 83 | if not os.path.exists(output_dir): 84 | os.makedirs(output_dir) 85 | output_file = os.path.join(output_dir, '{}.csv'.format(datatset_name)) 86 | with open(output_file, 'wb') as csvfile: 87 | csvwriter = 
csv.writer(csvfile, delimiter=',') 88 | for box in output_boxes: 89 | csvwriter.writerow(box) 90 | return total_time/len(os.listdir(img_dir)) 91 | 92 | 93 | 94 | model_def = '../model/deploy.prototxt' 95 | model_weights = '../model/snapshot/VGG_HAND_SSD_300x300_iter_50000.caffemodel' 96 | # model_path = '../model/snapshot' 97 | # total_time = [] 98 | # for model_weights in os.listdir(model_path): 99 | # if model_weights.endswith('.caffemodel'): 100 | # total_time.append(output(model_def, os.path.join(model_path, model_weights), 'stanfordhands')) 101 | # total_time.append(output(model_def, os.path.join(model_path, model_weights), 'egohands')) 102 | 103 | print(output(model_def, model_weights, 'stanfordhands')) 104 | print(output(model_def, model_weights, 'egohands')) 105 | 106 | # print(total_time) 107 | # output_gt_label('egohands') 108 | # output_gt_label('stanfordhands') 109 | # read_xmlfile('/Users/hzzone/Desktop/Hand-Keypoint-Detection/data/stanfordhands/test/Annotations/VOC2007_1.xml') 110 | 111 | -------------------------------------------------------------------------------- /other/Hand_Detection/utils/plot_loss.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import re 3 | 4 | with open('../model/train.log') as f: 5 | data = f.read() 6 | 7 | pattern = re.compile(r''' 8 | I0(.*?)solver.cpp:243] Iteration (.*?), loss = (.*?) 9 | I0(.*?)solver.cpp:259] Train net output #0: mbox_loss = (.*?) \(\* 1 = (.*?) loss\) 10 | I0(.*?)sgd_solver.cpp:138] Iteration (.*?), lr = (.*?) 11 | ''') 12 | results = re.findall(pattern, data) 13 | iter_num = [] 14 | total_loss = [] 15 | mbox_loss = [] 16 | learning_rate = [] 17 | print(results) 18 | 19 | for result in results: 20 | iter_num.append(int(result[1])) 21 | total_loss.append(float(result[2])) 22 | mbox_loss.append(float(result[4])) 23 | learning_rate.append(float(result[-1])) 24 | 25 | plt.subplot(311) 26 | plt.plot(iter_num, total_loss) 27 | plt.subplot(312) 28 | plt.plot(iter_num, mbox_loss) 29 | plt.subplot(313) 30 | plt.plot(iter_num, learning_rate) 31 | 32 | plt.show() 33 | 34 | -------------------------------------------------------------------------------- /other/Hand_Detection/utils/score.py: -------------------------------------------------------------------------------- 1 | import mAP 2 | import os 3 | 4 | p, r, AP = mAP.eval_mAP('/home/hzzone/Hand-Keypoint-Detection/output/iter_50000/egohands.csv', '/home/hzzone/Hand-Keypoint-Detection/data/gth/egohands.csv') 5 | print(AP) 6 | p, r, AP = mAP.eval_mAP('/home/hzzone/Hand-Keypoint-Detection/output/iter_50000/stanfordhands.csv', '/home/hzzone/Hand-Keypoint-Detection/data/gth/stanfordhands.csv') 7 | print(AP) 8 | # for test_data in ['egohands', 'stanfordhands']: 9 | # gth_path = '../data/gth/{}.csv'.format(test_data) 10 | # output_path = [os.path.join('../output', iter_num) for iter_num in os.listdir('../output')] 11 | # for iter_num_output in output_path: 12 | # p, r, AP = mAP.eval_mAP('{}/{}.csv'.format(iter_num_output, test_data), gth_path) 13 | # print(iter_num_output, AP) 14 | -------------------------------------------------------------------------------- /other/Hand_Detection/utils/ssd_net.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.insert(0, '../caffe/python') 3 | import caffe 4 | import numpy as np 5 | from google.protobuf import text_format 6 | from caffe.proto import caffe_pb2 7 | 8 | def get_labelname(labelmap, labels): 9 | num_labels 
= len(labelmap.item) 10 | print(labelmap.item[0]) 11 | print(num_labels) 12 | labelnames = [] 13 | if type(labels) is not list: 14 | labels = [labels] 15 | for label in labels: 16 | found = False 17 | for i in xrange(0, num_labels): 18 | if label == labelmap.item[i].label: 19 | found = True 20 | labelnames.append(labelmap.item[i].display_name) 21 | break 22 | assert found == True 23 | return labelnames 24 | 25 | class SSD_NET(object): 26 | 27 | def __init__(self, model_weights, model_def, threshold=0.5, GPU_MODE=False): 28 | if GPU_MODE: 29 | caffe.set_device(0) 30 | caffe.set_mode_gpu() 31 | else: 32 | caffe.set_mode_cpu() 33 | self.net = caffe.Net(model_def, # defines the structure of the model 34 | model_weights, # contains the trained weights 35 | caffe.TEST) # use test mode (e.g., don't perform dropout) 36 | self.threshold = threshold 37 | self.transformer = caffe.io.Transformer({'data': self.net.blobs['data'].data.shape}) 38 | self.transformer.set_transpose('data', (2, 0, 1)) 39 | self.transformer.set_mean('data', np.array([127.0, 127.0, 127.0])) # mean pixel 40 | self.transformer.set_raw_scale('data', 41 | 255) # the reference model operates on images in [0,255] range instead of [0,1] 42 | self.transformer.set_channel_swap('data', (2, 1, 0)) # the reference model has channels in BGR order instead of RGB 43 | image_resize = 300 44 | self.net.blobs['data'].reshape(1, 3, image_resize, image_resize) 45 | 46 | 47 | def detect(self, img): 48 | transformed_image = self.transformer.preprocess('data', img) 49 | self.net.blobs['data'].data[...] = transformed_image 50 | detections = self.net.forward()['detection_out'] 51 | # Parse the outputs. 52 | det_label = detections[0, 0, :, 1] 53 | det_conf = detections[0, 0, :, 2] 54 | det_xmin = detections[0, 0, :, 3] 55 | det_ymin = detections[0, 0, :, 4] 56 | det_xmax = detections[0, 0, :, 5] 57 | det_ymax = detections[0, 0, :, 6] 58 | # Get detections with confidence higher than 0.6. 
59 | # print(det_conf) 60 | top_indices = [i for i, conf in enumerate(det_conf) if conf >= self.threshold] 61 | 62 | top_conf = det_conf[top_indices] 63 | top_label_indices = det_label[top_indices].tolist() 64 | top_xmin = det_xmin[top_indices] 65 | top_ymin = det_ymin[top_indices] 66 | top_xmax = det_xmax[top_indices] 67 | top_ymax = det_ymax[top_indices] 68 | 69 | return top_label_indices, top_conf, top_xmin, top_ymin, top_xmax, top_ymax 70 | 71 | 72 | 73 | -------------------------------------------------------------------------------- /other/Hand_Detection/utils/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def IoU(box, boxes): 4 | """Compute IoU between detect box and gt boxes 5 | 6 | Parameters: 7 | ---------- 8 | box: numpy array , shape (5, ): x1, y1, x2, y2, score 9 | input box 10 | boxes: numpy array, shape (n, 4): x1, y1, x2, y2 11 | input ground truth boxes 12 | 13 | Returns: 14 | ------- 15 | ovr: numpy.array, shape (n, ) 16 | IoU 17 | """ 18 | box_area = (box[2] - box[0] + 1) * (box[3] - box[1] + 1) 19 | area = (boxes[:, 2] - boxes[:, 0] + 1) * (boxes[:, 3] - boxes[:, 1] + 1) 20 | xx1 = np.maximum(box[0], boxes[:, 0]) 21 | yy1 = np.maximum(box[1], boxes[:, 1]) 22 | xx2 = np.minimum(box[2], boxes[:, 2]) 23 | yy2 = np.minimum(box[3], boxes[:, 3]) 24 | 25 | # compute the width and height of the bounding box 26 | w = np.maximum(0, xx2 - xx1 + 1) 27 | h = np.maximum(0, yy2 - yy1 + 1) 28 | 29 | inter = w * h 30 | ovr = inter / (box_area + area - inter) 31 | return ovr 32 | 33 | 34 | def convert_to_square(bbox): 35 | """Convert bbox to square 36 | 37 | Parameters: 38 | ---------- 39 | bbox: numpy array , shape n x 5 40 | input bbox 41 | 42 | Returns: 43 | ------- 44 | square bbox 45 | """ 46 | square_bbox = bbox.copy() 47 | 48 | h = bbox[:, 3] - bbox[:, 1] + 1 49 | w = bbox[:, 2] - bbox[:, 0] + 1 50 | max_size = np.maximum(h, w) 51 | square_bbox[:, 0] = bbox[:, 0] + w*0.5 - max_size*0.5 52 | square_bbox[:, 1] = bbox[:, 1] + h*0.5 - max_size*0.5 53 | square_bbox[:, 2] = square_bbox[:, 0] + max_size - 1 54 | square_bbox[:, 3] = square_bbox[:, 1] + max_size - 1 55 | return square_bbox 56 | -------------------------------------------------------------------------------- /other/Openpose-Keras/.gitignore: -------------------------------------------------------------------------------- 1 | # TOTALLY IGNORE THE MODEL FILES 2 | .ipynb_checkpoints 3 | *.h5 4 | *.h5py 5 | *.npy -------------------------------------------------------------------------------- /other/Openpose-Keras/README.md: -------------------------------------------------------------------------------- 1 | # OpenPose-Keras 2 | A little bit of play with OpenPose without using their API but allowing to build / prototype pre and post-processing steps in Keras. Please keep in mind that this is more of a toy project and not anything even close to any production applications. 
If you are looking for something more *useful*, please invest some time and get the actual OpenPose up and running :) 3 | 4 | Table of contents: 5 | - Body keypoint estimation network (coming soon) 6 | - Face keypoint estimation network (coming soon) 7 | - [Hand keypoint estimation network](#handKeypointEstimationNetwork) 8 | * [Model description](#handKeypointEstimationNetwork_modelDescription) 9 | * [Input format and pre-processing](#handKeypointEstimationNetwork_inputFormatAndPreProcessing) 10 | * [Post-processing](#handKeypointEstimationNetwork_postProcessing) 11 | * [Discovered issues](#handKeypointEstimationNetwork_issues) 12 | - [External resources](#externalResources) 13 | 14 | 15 | ## Hand keypoint estimation network 16 | [![IMAGE ALT TEXT HERE](https://img.youtube.com/vi/FnoI8ufwhbs/0.jpg)](https://www.youtube.com/watch?v=FnoI8ufwhbs) 17 | Please check out the demo on YouTube: https://www.youtube.com/watch?v=FnoI8ufwhbs 18 | 19 | 20 | ### Model description 21 | The original model can be found on [OpenPose's github](https://github.com/CMU-Perceptual-Computing-Lab/openpose). Model weights converted from the Caffe model definition are available for download: https://drive.google.com/file/d/1yPQFrCrDltqzYAnWBl__O7oZxGL0sQlu/view 22 | The readme on the main page describes the "hand keypoint detection" as 2x21 keypoint estimation. The network itself outputs 22 channels (21 keypoints + background). The final layer feeds from the 128-channel convolutional layer (Mconv6_stage6) and is defined as follows (in [models/hand/pose_deploy.prototxt](https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/models/hand/pose_deploy.prototxt)): 23 | ``` 24 | layer { 25 | name: "Mconv7_stage6" 26 | type: "Convolution" 27 | bottom: "Mconv6_stage6" 28 | top: "net_output" 29 | param { 30 | lr_mult: 4.0 31 | decay_mult: 1 32 | } 33 | param { 34 | lr_mult: 8.0 35 | decay_mult: 0 36 | } 37 | convolution_param { 38 | num_output: 22 39 | pad: 0 40 | kernel_size: 1 41 | weight_filler { 42 | type: "gaussian" 43 | std: 0.01 44 | } 45 | bias_filler { 46 | type: "constant" 47 | } 48 | dilation: 1 49 | } 50 | } 51 | ``` 52 | Keep in mind that this particular network does NOT produce any part affinity fields, just finger keypoints. OpenPose's documentation contains the following picture describing the keypoint channel ids: 53 | 54 | 55 | ### Input format and pre-processing 56 | I believe that the natural resolution of the input images (i.e. the standard network input size) is 368 pixels on the width, with the height following from the aspect ratio. From the papers presenting this method one can figure out that the authors use multi-scale inputs: basically, they run the image through scales from 0.5 to 1.5 and average the resulting heatmaps. The network accepts 3-channel RGB images with 32-bit floating-point values scaled to the range -0.5 <= x <= 0.5 (a minimal pre-processing sketch is included below). 57 | 58 | 59 | ### Post-processing 60 | I haven't studied the code of the OpenPose library very well (yet!), but I noticed that the returned heatmaps seem to have bi-modal distributions, e.g. some detection values are strongly negative and some are strongly positive. I understood that this may be their way of distinguishing the left hand from the right one. I still need to investigate that. A simple peak-extraction sketch is included below as well. 61 | 62 | 63 | ### Discovered issues 64 | - It seems like the model does not capture hand keypoints when exposed to images of people wearing gloves. I haven't figured out how exactly the network was trained, but I can imagine that there was no emphasis on glove-wearing targets.
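
### Pre-processing sketch

A minimal sketch of the pre-processing described above, not taken from this repository: it assumes the converted Keras model takes a channel-last batch, fixes the width at 368 while deriving the height from the aspect ratio, and scales pixels to [-0.5, 0.5]. The `preprocess` name and the `hand_keypoints.h5` filename in the usage comment are placeholders.

```python
import cv2
import numpy as np

def preprocess(image_bgr, target_width=368):
    """Resize a BGR image for the hand keypoint network and scale it to [-0.5, 0.5]."""
    h, w = image_bgr.shape[:2]
    # Fix the width at the working resolution and let the height follow the aspect ratio.
    target_height = int(round(h * target_width / w))
    resized = cv2.resize(image_bgr, (target_width, target_height))
    # OpenCV loads images as BGR; the network expects RGB.
    rgb = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)
    # 32-bit floats scaled from [0, 255] down to [-0.5, 0.5].
    scaled = rgb.astype(np.float32) / 255.0 - 0.5
    # Add a batch dimension: (1, height, width, 3), channel-last as Keras expects by default.
    return scaled[np.newaxis, ...]

# Example usage (hypothetical weights file; test_image.png ships with this repo):
# model = keras.models.load_model("hand_keypoints.h5")
# blob = preprocess(cv2.imread("images/test_image.png"))
# heatmaps = model.predict(blob)[0]   # expected shape: (H', W', 22)
```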
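
### Peak-extraction sketch

The OpenCV demos elsewhere in this repository (`handPoseImage.py`, `handPoseVideo.py`) reduce each heatmap channel to its global maximum; the sketch below applies the same idea to the Keras output, assuming channel-last heatmaps of shape (H', W', 22). The `extract_keypoints` helper and the 0.1 threshold are illustrative choices, not part of the original code.

```python
import cv2
import numpy as np

def extract_keypoints(heatmaps, frame_width, frame_height, threshold=0.1):
    """Reduce each of the first 21 heatmap channels to its global maximum.

    `heatmaps` is assumed to be channel-last with shape (H', W', 22); the
    22nd (background) channel is ignored. Returns one (x, y) tuple or None
    per keypoint, in original-image coordinates.
    """
    points = []
    for i in range(21):
        # Upsample the coarse heatmap back to the original image resolution.
        prob_map = cv2.resize(np.ascontiguousarray(heatmaps[:, :, i]),
                              (frame_width, frame_height))
        # Keep the location of the strongest response if it is confident enough.
        _, prob, _, point = cv2.minMaxLoc(prob_map)
        points.append((int(point[0]), int(point[1])) if prob > threshold else None)
    return points
```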
65 | 66 | 67 | 68 | # External resources 69 | 1. OpenPose GitHub repo: https://github.com/CMU-Perceptual-Computing-Lab/openpose 70 | 2. Origin of OpenPose: https://github.com/ZheC/Realtime_Multi-Person_Pose_Estimation 71 | 3. Paper describing the method: https://arxiv.org/abs/1611.08050 72 | 4. Keras implementation of the Realtime Multi-Person Pose Estimation (my major inspiration): https://github.com/michalfaber/keras_Realtime_Multi-Person_Pose_Estimation 73 | -------------------------------------------------------------------------------- /other/Openpose-Keras/images/test_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/other/Openpose-Keras/images/test_image.png -------------------------------------------------------------------------------- /other/asl.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/other/asl.mp4 -------------------------------------------------------------------------------- /other/front-back.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/other/front-back.jpg -------------------------------------------------------------------------------- /other/getModels.sh: -------------------------------------------------------------------------------- 1 | # ------------------------- BODY, FACE AND HAND MODELS ------------------------- 2 | # Downloading body pose (COCO and MPI), face and hand models 3 | OPENPOSE_URL="http://posefs1.perception.cs.cmu.edu/OpenPose/models/" 4 | HAND_FOLDER="hand/" 5 | 6 | # "------------------------- HAND MODELS -------------------------" 7 | # Hand 8 | HAND_MODEL=$HAND_FOLDER"pose_iter_102000.caffemodel" 9 | wget -c ${OPENPOSE_URL}${HAND_MODEL} -P ${HAND_FOLDER} 10 | -------------------------------------------------------------------------------- /other/hand.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/other/hand.jpg -------------------------------------------------------------------------------- /other/hand.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/other/hand.png -------------------------------------------------------------------------------- /other/handPoseImage.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | using namespace std; 7 | using namespace cv; 8 | using namespace cv::dnn; 9 | 10 | 11 | const int POSE_PAIRS[20][2] = 12 | { 13 | {0,1}, {1,2}, {2,3}, {3,4}, // thumb 14 | {0,5}, {5,6}, {6,7}, {7,8}, // index 15 | {0,9}, {9,10}, {10,11}, {11,12}, // middle 16 | {0,13}, {13,14}, {14,15}, {15,16}, // ring 17 | {0,17}, {17,18}, {18,19}, {19,20} // small 18 | }; 19 | 20 | string protoFile = "hand/pose_deploy.prototxt"; 21 | string weightsFile = "hand/pose_iter_102000.caffemodel"; 22 | 23 | int nPoints = 22; 24 | 25 | int main(int argc, char **argv) 26 | { 27 | 28 | cout << "USAGE : ./handPoseImage " << endl; 29 | 30 | 
string imageFile = "right-frontal.jpg"; 31 | // Take arguments from commmand line 32 | if (argc == 2) 33 | { 34 | imageFile = argv[1]; 35 | } 36 | 37 | float thresh = 0.01; 38 | 39 | Mat frame = imread(imageFile); 40 | Mat frameCopy = frame.clone(); 41 | int frameWidth = frame.cols; 42 | int frameHeight = frame.rows; 43 | 44 | float aspect_ratio = frameWidth/(float)frameHeight; 45 | int inHeight = 368; 46 | int inWidth = (int(aspect_ratio*inHeight) * 8) / 8; 47 | 48 | cout << "inWidth = " << inWidth << " ; inHeight = " << inHeight << endl; 49 | 50 | double t = (double) cv::getTickCount(); 51 | Net net = readNetFromCaffe(protoFile, weightsFile); 52 | 53 | Mat inpBlob = blobFromImage(frame, 1.0 / 255, Size(inWidth, inHeight), Scalar(0, 0, 0), false, false); 54 | 55 | net.setInput(inpBlob); 56 | 57 | Mat output = net.forward(); 58 | 59 | int H = output.size[2]; 60 | int W = output.size[3]; 61 | 62 | // find the position of the body parts 63 | vector points(nPoints); 64 | for (int n=0; n < nPoints; n++) 65 | { 66 | // Probability map of corresponding body's part. 67 | Mat probMap(H, W, CV_32F, output.ptr(0,n)); 68 | resize(probMap, probMap, Size(frameWidth, frameHeight)); 69 | 70 | Point maxLoc; 71 | double prob; 72 | minMaxLoc(probMap, 0, &prob, 0, &maxLoc); 73 | if (prob > thresh) 74 | { 75 | circle(frameCopy, cv::Point((int)maxLoc.x, (int)maxLoc.y), 8, Scalar(0,255,255), -1); 76 | cv::putText(frameCopy, cv::format("%d", n), cv::Point((int)maxLoc.x, (int)maxLoc.y), cv::FONT_HERSHEY_COMPLEX, 1, cv::Scalar(0, 0, 255), 2); 77 | 78 | } 79 | points[n] = maxLoc; 80 | } 81 | 82 | int nPairs = sizeof(POSE_PAIRS)/sizeof(POSE_PAIRS[0]); 83 | 84 | for (int n = 0; n < nPairs; n++) 85 | { 86 | // lookup 2 connected body/hand parts 87 | Point2f partA = points[POSE_PAIRS[n][0]]; 88 | Point2f partB = points[POSE_PAIRS[n][1]]; 89 | 90 | if (partA.x<=0 || partA.y<=0 || partB.x<=0 || partB.y<=0) 91 | continue; 92 | 93 | line(frame, partA, partB, Scalar(0,255,255), 8); 94 | circle(frame, partA, 8, Scalar(0,0,255), -1); 95 | circle(frame, partB, 8, Scalar(0,0,255), -1); 96 | } 97 | 98 | t = ((double)cv::getTickCount() - t)/cv::getTickFrequency(); 99 | cout << "Time Taken = " << t << endl; 100 | imshow("Output-Keypoints", frameCopy); 101 | imshow("Output-Skeleton", frame); 102 | imwrite("Output-Skeleton.jpg", frame); 103 | 104 | waitKey(); 105 | 106 | return 0; 107 | } 108 | -------------------------------------------------------------------------------- /other/handPoseImage.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import cv2 3 | import time 4 | import numpy as np 5 | 6 | protoFile = "hand/pose_deploy.prototxt" 7 | weightsFile = "hand/pose_iter_102000.caffemodel" 8 | nPoints = 22 9 | POSE_PAIRS = [ [0,1],[1,2],[2,3],[3,4],[0,5],[5,6],[6,7],[7,8],[0,9],[9,10],[10,11],[11,12],[0,13],[13,14],[14,15],[15,16],[0,17],[17,18],[18,19],[19,20] ] 10 | net = cv2.dnn.readNetFromCaffe(protoFile, weightsFile) 11 | 12 | frame = cv2.imread("right-frontal.jpg") 13 | frameCopy = np.copy(frame) 14 | frameWidth = frame.shape[1] 15 | frameHeight = frame.shape[0] 16 | aspect_ratio = frameWidth/frameHeight 17 | 18 | threshold = 0.1 19 | 20 | t = time.time() 21 | # input image dimensions for the network 22 | inHeight = 368 23 | inWidth = int(((aspect_ratio*inHeight)*8)//8) 24 | inpBlob = cv2.dnn.blobFromImage(frame, 1.0 / 255, (inWidth, inHeight), (0, 0, 0), swapRB=False, crop=False) 25 | 26 | net.setInput(inpBlob) 27 | 28 | output = net.forward() 29 | 
print("time taken by network : {:.3f}".format(time.time() - t)) 30 | 31 | # Empty list to store the detected keypoints 32 | points = [] 33 | 34 | for i in range(nPoints): 35 | # confidence map of corresponding body's part. 36 | probMap = output[0, i, :, :] 37 | probMap = cv2.resize(probMap, (frameWidth, frameHeight)) 38 | 39 | # Find global maxima of the probMap. 40 | minVal, prob, minLoc, point = cv2.minMaxLoc(probMap) 41 | 42 | if prob > threshold : 43 | cv2.circle(frameCopy, (int(point[0]), int(point[1])), 8, (0, 255, 255), thickness=-1, lineType=cv2.FILLED) 44 | cv2.putText(frameCopy, "{}".format(i), (int(point[0]), int(point[1])), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2, lineType=cv2.LINE_AA) 45 | 46 | # Add the point to the list if the probability is greater than the threshold 47 | points.append((int(point[0]), int(point[1]))) 48 | else : 49 | points.append(None) 50 | 51 | # Draw Skeleton 52 | for pair in POSE_PAIRS: 53 | partA = pair[0] 54 | partB = pair[1] 55 | 56 | if points[partA] and points[partB]: 57 | cv2.line(frame, points[partA], points[partB], (0, 255, 255), 2) 58 | cv2.circle(frame, points[partA], 8, (0, 0, 255), thickness=-1, lineType=cv2.FILLED) 59 | cv2.circle(frame, points[partB], 8, (0, 0, 255), thickness=-1, lineType=cv2.FILLED) 60 | 61 | 62 | cv2.imshow('Output-Keypoints', frameCopy) 63 | cv2.imshow('Output-Skeleton', frame) 64 | 65 | 66 | cv2.imwrite('Output-Keypoints.jpg', frameCopy) 67 | cv2.imwrite('Output-Skeleton.jpg', frame) 68 | 69 | print("Total time taken : {:.3f}".format(time.time() - t)) 70 | 71 | cv2.waitKey(0) 72 | -------------------------------------------------------------------------------- /other/handPoseVideo.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | using namespace std; 7 | using namespace cv; 8 | using namespace cv::dnn; 9 | 10 | const int POSE_PAIRS[20][2] = 11 | { 12 | {0,1}, {1,2}, {2,3}, {3,4}, // thumb 13 | {0,5}, {5,6}, {6,7}, {7,8}, // index 14 | {0,9}, {9,10}, {10,11}, {11,12}, // middle 15 | {0,13}, {13,14}, {14,15}, {15,16}, // ring 16 | {0,17}, {17,18}, {18,19}, {19,20} // small 17 | }; 18 | 19 | string protoFile = "hand/pose_deploy.prototxt"; 20 | string weightsFile = "hand/pose_iter_102000.caffemodel"; 21 | 22 | int nPoints = 22; 23 | 24 | int main(int argc, char **argv) 25 | { 26 | float thresh = 0.01; 27 | 28 | cv::VideoCapture cap("asl.mp4"); 29 | 30 | if (!cap.isOpened()) 31 | { 32 | cerr << "Unable to connect to camera" << endl; 33 | return 1; 34 | } 35 | 36 | Mat frame, frameCopy; 37 | int frameWidth = cap.get(CAP_PROP_FRAME_WIDTH); 38 | int frameHeight = cap.get(CAP_PROP_FRAME_HEIGHT); 39 | float aspect_ratio = frameWidth/(float)frameHeight; 40 | int inHeight = 368; 41 | int inWidth = (int(aspect_ratio*inHeight) * 8) / 8; 42 | 43 | cout << "inWidth = " << inWidth << " ; inHeight = " << inHeight << endl; 44 | 45 | VideoWriter video("Output-Skeleton.avi",VideoWriter::fourcc('M','J','P','G'), 10, Size(frameWidth,frameHeight)); 46 | 47 | Net net = readNetFromCaffe(protoFile, weightsFile); 48 | 49 | double t=0; 50 | while(1) 51 | { 52 | double t = (double) cv::getTickCount(); 53 | 54 | cap >> frame; 55 | frameCopy = frame.clone(); 56 | Mat inpBlob = blobFromImage(frame, 1.0 / 255, Size(inWidth, inHeight), Scalar(0, 0, 0), false, false); 57 | 58 | net.setInput(inpBlob); 59 | 60 | Mat output = net.forward(); 61 | 62 | int H = output.size[2]; 63 | int W = output.size[3]; 64 | 65 | // find the position of the body parts 66 | 
vector points(nPoints); 67 | for (int n=0; n < nPoints; n++) 68 | { 69 | // Probability map of corresponding body's part. 70 | Mat probMap(H, W, CV_32F, output.ptr(0,n)); 71 | resize(probMap, probMap, Size(frameWidth, frameHeight)); 72 | 73 | Point maxLoc; 74 | double prob; 75 | minMaxLoc(probMap, 0, &prob, 0, &maxLoc); 76 | if (prob > thresh) 77 | { 78 | circle(frameCopy, cv::Point((int)maxLoc.x, (int)maxLoc.y), 8, Scalar(0,255,255), -1); 79 | cv::putText(frameCopy, cv::format("%d", n), cv::Point((int)maxLoc.x, (int)maxLoc.y), cv::FONT_HERSHEY_COMPLEX, 1, cv::Scalar(0, 0, 255), 2); 80 | 81 | } 82 | points[n] = maxLoc; 83 | } 84 | 85 | int nPairs = sizeof(POSE_PAIRS)/sizeof(POSE_PAIRS[0]); 86 | 87 | for (int n = 0; n < nPairs; n++) 88 | { 89 | // lookup 2 connected body/hand parts 90 | Point2f partA = points[POSE_PAIRS[n][0]]; 91 | Point2f partB = points[POSE_PAIRS[n][1]]; 92 | 93 | if (partA.x<=0 || partA.y<=0 || partB.x<=0 || partB.y<=0) 94 | continue; 95 | 96 | line(frame, partA, partB, Scalar(0,255,255), 8); 97 | circle(frame, partA, 8, Scalar(0,0,255), -1); 98 | circle(frame, partB, 8, Scalar(0,0,255), -1); 99 | } 100 | 101 | t = ((double)cv::getTickCount() - t)/cv::getTickFrequency(); 102 | cout << "Time Taken for frame = " << t << endl; 103 | cv::putText(frame, cv::format("time taken = %.2f sec", t), cv::Point(50, 50), cv::FONT_HERSHEY_COMPLEX, .8, cv::Scalar(255, 50, 0), 2); 104 | // imshow("Output-Keypoints", frameCopy); 105 | imshow("Output-Skeleton", frame); 106 | video.write(frame); 107 | char key = waitKey(1); 108 | if (key==27) 109 | break; 110 | } 111 | // When everything done, release the video capture and write object 112 | cap.release(); 113 | video.release(); 114 | 115 | return 0; 116 | } 117 | -------------------------------------------------------------------------------- /other/handPoseVideo.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import time 3 | import numpy as np 4 | 5 | 6 | protoFile = "hand/pose_deploy.prototxt" 7 | weightsFile = "hand/pose_iter_102000.caffemodel" 8 | nPoints = 22 9 | POSE_PAIRS = [ [0,1],[1,2],[2,3],[3,4],[0,5],[5,6],[6,7],[7,8],[0,9],[9,10],[10,11],[11,12],[0,13],[13,14],[14,15],[15,16],[0,17],[17,18],[18,19],[19,20] ] 10 | 11 | threshold = 0.2 12 | 13 | 14 | input_source = "asl.mp4" 15 | cap = cv2.VideoCapture(input_source) 16 | hasFrame, frame = cap.read() 17 | 18 | frameWidth = frame.shape[1] 19 | frameHeight = frame.shape[0] 20 | 21 | aspect_ratio = frameWidth/frameHeight 22 | 23 | inHeight = 368 24 | inWidth = int(((aspect_ratio*inHeight)*8)//8) 25 | 26 | vid_writer = cv2.VideoWriter('output.avi',cv2.VideoWriter_fourcc('M','J','P','G'), 15, (frame.shape[1],frame.shape[0])) 27 | 28 | net = cv2.dnn.readNetFromCaffe(protoFile, weightsFile) 29 | k = 0 30 | while 1: 31 | k+=1 32 | t = time.time() 33 | hasFrame, frame = cap.read() 34 | frameCopy = np.copy(frame) 35 | if not hasFrame: 36 | cv2.waitKey() 37 | break 38 | 39 | inpBlob = cv2.dnn.blobFromImage(frame, 1.0 / 255, (inWidth, inHeight), 40 | (0, 0, 0), swapRB=False, crop=False) 41 | 42 | net.setInput(inpBlob) 43 | 44 | output = net.forward() 45 | 46 | print("forward = {}".format(time.time() - t)) 47 | 48 | # Empty list to store the detected keypoints 49 | points = [] 50 | 51 | for i in range(nPoints): 52 | # confidence map of corresponding body's part. 53 | probMap = output[0, i, :, :] 54 | probMap = cv2.resize(probMap, (frameWidth, frameHeight)) 55 | 56 | # Find global maxima of the probMap. 
57 | minVal, prob, minLoc, point = cv2.minMaxLoc(probMap) 58 | 59 | if prob > threshold : 60 | cv2.circle(frameCopy, (int(point[0]), int(point[1])), 6, (0, 255, 255), thickness=-1, lineType=cv2.FILLED) 61 | cv2.putText(frameCopy, "{}".format(i), (int(point[0]), int(point[1])), cv2.FONT_HERSHEY_SIMPLEX, .8, (0, 0, 255), 2, lineType=cv2.LINE_AA) 62 | 63 | # Add the point to the list if the probability is greater than the threshold 64 | points.append((int(point[0]), int(point[1]))) 65 | else : 66 | points.append(None) 67 | 68 | # Draw Skeleton 69 | for pair in POSE_PAIRS: 70 | partA = pair[0] 71 | partB = pair[1] 72 | 73 | if points[partA] and points[partB]: 74 | cv2.line(frame, points[partA], points[partB], (0, 255, 255), 2, lineType=cv2.LINE_AA) 75 | cv2.circle(frame, points[partA], 5, (0, 0, 255), thickness=-1, lineType=cv2.FILLED) 76 | cv2.circle(frame, points[partB], 5, (0, 0, 255), thickness=-1, lineType=cv2.FILLED) 77 | 78 | print("Time Taken for frame = {}".format(time.time() - t)) 79 | 80 | # cv2.putText(frame, "time taken = {:.2f} sec".format(time.time() - t), (50, 50), cv2.FONT_HERSHEY_COMPLEX, .8, (255, 50, 0), 2, lineType=cv2.LINE_AA) 81 | # cv2.putText(frame, "Hand Pose using OpenCV", (50, 50), cv2.FONT_HERSHEY_COMPLEX, 1, (255, 50, 0), 2, lineType=cv2.LINE_AA) 82 | cv2.imshow('Output-Skeleton', frame) 83 | # cv2.imwrite("video_output/{:03d}.jpg".format(k), frame) 84 | key = cv2.waitKey(1) 85 | if key == 27: 86 | break 87 | 88 | print("total = {}".format(time.time() - t)) 89 | 90 | vid_writer.write(frame) 91 | 92 | vid_writer.release() 93 | --------------------------------------------------------------------------------