├── .gitignore
├── README.md
├── demo
│   ├── Openpose.py
│   ├── SqueezeNet.ipynb
│   └── img_keypoint_show.py
├── image
│   ├── Loss.png
│   ├── com.png
│   ├── front-back.jpg
│   ├── hand.jpeg
│   ├── hand.jpg
│   ├── hand.png
│   ├── right-frontal.jpg
│   └── unnamed.png
├── main
│   ├── data
│   │   └── dataset.py
│   ├── demo.py
│   └── train.py
└── other
    ├── CMakeLists.txt
    ├── Hand_Caffe
    │   ├── 1_F_deploy.prototxt
    │   ├── 1_F_solver.prototxt
    │   ├── 1_F_train.prototxt
    │   ├── create_txt.py
    │   ├── getBox.py
    │   ├── hand.jpeg
    │   ├── level1.py
    │   ├── read_im_json.py
    │   └── utils.py
    ├── Hand_Detection
    │   ├── README.md
    │   ├── data
    │   │   ├── create_annoset.py
    │   │   ├── create_data.sh
    │   │   ├── create_txt.py
    │   │   ├── egohands
    │   │   │   ├── _screenshot_17.04.2018.png
    │   │   │   ├── egohands_data.txt
    │   │   │   ├── generate_egohands.py
    │   │   │   └── getInfo.m
    │   │   ├── gth
    │   │   │   └── .gitkeep
    │   │   ├── labelmap_voc.prototxt
    │   │   └── stanfordhands
    │   │       └── generate_stanfordhands.py
    │   ├── model
    │   │   ├── deploy.prototxt
    │   │   ├── generate_model.py
    │   │   ├── snapshot
    │   │   │   └── .gitkeep
    │   │   ├── solver.prototxt
    │   │   └── train.prototxt
    │   ├── old_README.md
    │   ├── pic
    │   │   ├── demo.jpg
    │   │   ├── example_image.jpg
    │   │   └── width_and_height.png
    │   ├── ssd_camera.py
    │   ├── ssd_detection.py
    │   └── utils
    │       ├── __init__.py
    │       ├── mAP.py
    │       ├── output.py
    │       ├── plot_loss.py
    │       ├── score.py
    │       ├── ssd_net.py
    │       └── utils.py
    ├── Openpose-Keras
    │   ├── .gitignore
    │   ├── README.md
    │   ├── StolenOpenPoseHandTracking.ipynb
    │   └── images
    │       └── test_image.png
    ├── asl.mp4
    ├── front-back.jpg
    ├── getModels.sh
    ├── hand.jpg
    ├── hand.png
    ├── handPose-Notebook.ipynb
    ├── handPoseImage.cpp
    ├── handPoseImage.py
    ├── handPoseVideo.cpp
    └── handPoseVideo.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # TOTALLY IGNORE THE MODEL FILES
2 | .ipynb_checkpoints
3 | *.h5
4 | *.h5py
5 | *.npy
6 | *.zip
7 | /.idea
8 | Push.sh
9 | /data
10 | /文献
11 |
12 | # Byte-compiled / optimized / DLL files
13 | __pycache__/
14 | *.py[cod]
15 | *$py.class
16 |
17 | # C extensions
18 | *.so
19 |
20 | # Distribution / packaging
21 | .Python
22 | build/
23 | develop-eggs/
24 | dist/
25 | downloads/
26 | eggs/
27 | .eggs/
28 | lib/
29 | lib64/
30 | parts/
31 | sdist/
32 | var/
33 | wheels/
34 | *.egg-info/
35 | .installed.cfg
36 | *.egg
37 | MANIFEST
38 |
39 | # PyInstaller
40 | # Usually these files are written by a python script from a template
41 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
42 | *.manifest
43 | *.spec
44 |
45 | # Installer logs
46 | pip-log.txt
47 | pip-delete-this-directory.txt
48 |
49 | # Unit test / coverage reports
50 | htmlcov/
51 | .tox/
52 | .coverage
53 | .coverage.*
54 | .cache
55 | nosetests.xml
56 | coverage.xml
57 | *.cover
58 | .hypothesis/
59 | .pytest_cache/
60 |
61 | # Translations
62 | *.mo
63 | *.pot
64 |
65 | # Django stuff:
66 | *.log
67 | local_settings.py
68 | db.sqlite3
69 |
70 | # Flask stuff:
71 | instance/
72 | .webassets-cache
73 |
74 | # Scrapy stuff:
75 | .scrapy
76 |
77 | # Sphinx documentation
78 | docs/_build/
79 |
80 | # PyBuilder
81 | target/
82 |
83 | # Jupyter Notebook
84 | .ipynb_checkpoints
85 |
86 | # pyenv
87 | .python-version
88 |
89 | # celery beat schedule file
90 | celerybeat-schedule
91 |
92 | # SageMath parsed files
93 | *.sage.py
94 |
95 | # Environments
96 | .env
97 | .venv
98 | env/
99 | venv/
100 | ENV/
101 | env.bak/
102 | venv.bak/
103 |
104 | # Spyder project settings
105 | .spyderproject
106 | .spyproject
107 |
108 | # Rope project settings
109 | .ropeproject
110 |
111 | # mkdocs documentation
112 | /site
113 |
114 | # mypy
115 | .mypy_cache/
116 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Hand-Keypoint-Estimation
2 |
3 | ## Introduction
4 |
5 |

6 |
7 | 21-point hand keypoint estimation
8 |
9 | 
10 |
11 | ## TODO
12 |
13 | - [x] ResNet34+Finetune
14 | - [x] SqueezeNet+Finetune
15 | - [ ] Hourglass
16 | - [ ] Openpose+Design Loss
17 | - [ ] Better visualization of results
18 | - [ ] Robustness to occlusion
19 |
20 | ## Dataset
21 |
22 | [CMU hand dataset](http://domedb.perception.cs.cmu.edu/handdb.html) (the occlusions are quite extreme)
23 |
24 | ```
25 | Hands from Synthetic Data (6546 + 3243 + 2348 + 2124 = 14261 annotations)
26 | └─hand_labels_synth
27 | ├─output_viz_synth
28 | ├─synth1 (the JSON annotations are missing the 5 fingertip keypoints)
29 | ├─synth2
30 | ├─synth3
31 | └─synth4
32 | ```
33 |
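Each `.json` annotation stores the 21 points under a `hand_pts` key. A minimal loading sketch (the sample path below is hypothetical, and the third value of each point is ignored, just as in `main/data/dataset.py`):

```
import json
import numpy as np

# Hypothetical sample path; point it at your local copy of hand_labels_synth.
json_path = 'hand_labels_synth/synth2/0001.json'

with open(json_path, 'r') as f:
    anno = json.load(f)

# Each entry of 'hand_pts' is [x, y, <third value>]; only x and y are used here.
points = np.array([p[:2] for p in anno['hand_pts'][:21]], dtype=np.float32)
print(points.shape)  # (21, 2)
```
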
34 | ## Benchmarking
35 |
36 | **SqueezeNet+Finetune**
37 |
38 | ```
39 | Finetune = nn.Sequential(
40 |     Flatten(),
41 |     nn.ReLU(),
42 |     nn.Dropout(0.5),
43 |     nn.Linear(247808, 256),
44 |     # ReLU must not come before BatchNorm, otherwise the BN variance is computed incorrectly
45 |     nn.BatchNorm1d(256),
46 |     nn.ReLU(),
47 |     nn.Dropout(0.5),
48 |     nn.Linear(256, 42),
49 |     Reshape(-1, 21, 2),
50 |     nn.Tanh()
51 | )
52 | Total params: 64,172,906
53 | Total trainable params: 64,172,906
54 | Total non-trainable params: 0
55 | Loss function : MSELoss
56 | Epoch : 200
57 | LR : 0.01->0.0001
58 | Train Loss end : 0.010500
59 | Valid Loss end : 0.012454
60 | ```
61 |
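`Flatten` and `Reshape` in the head above are small helper modules (in `demo/Openpose.py`, `Flatten` comes from fastai and `Reshape` is defined locally). A self-contained sketch of equivalent modules:

```
import torch.nn as nn

class Flatten(nn.Module):
    """Collapse everything except the batch dimension: (N, C, H, W) -> (N, C*H*W)."""
    def forward(self, x):
        return x.view(x.size(0), -1)

class Reshape(nn.Module):
    """View the input with a fixed target shape, e.g. Reshape(-1, 21, 2)."""
    def __init__(self, *shape):
        super().__init__()
        self.shape = shape

    def forward(self, x):
        return x.view(self.shape)
```
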
62 | 
63 |
64 | 0.0234 s per image on CPU
65 |
66 | 0.00727 s per image on a 2070Ti GPU
67 |
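For reference, a generic way to reproduce this kind of per-image timing (a sketch only, not the author's benchmark script; `nn.Identity()` stands in for the trained network):

```
import time
import torch
import torch.nn as nn

model = nn.Identity()              # stand-in; substitute the trained model here
model.eval()
x = torch.randn(1, 3, 368, 368)    # one 368x368 RGB image, the training input size
with torch.no_grad():
    start = time.perf_counter()
    for _ in range(100):
        _ = model(x)
print('{:.4f} s per image'.format((time.perf_counter() - start) / 100))
```
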
68 |
--------------------------------------------------------------------------------
/demo/Openpose.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import json
3 | from collections import OrderedDict
4 | from torch.autograd import Variable
5 |
6 | import torch
7 | import torch.nn as nn
8 | from fastai.vision import *
9 | from fastai import *
10 | import os
11 | os.environ['CUDA_VISIBLE_DEVICES'] = '3'
12 |
13 | class Reshape(nn.Module):
14 | def __init__(self, *args):
15 | super(Reshape, self).__init__()
16 | self.shape = args
17 |
18 | def forward(self, x):
19 | return x.view(self.shape)
20 |
21 | def make_layers(block, no_relu_layers):  # conv entries: [in_ch, out_ch, kernel, stride, pad]; pool entries: [kernel, stride, pad]
22 | layers = []
23 | for layer_name, v in block.items():
24 | if 'pool' in layer_name:
25 | layer = nn.MaxPool2d(kernel_size=v[0], stride=v[1],
26 | padding=v[2])
27 | layers.append((layer_name, layer))
28 | else:
29 | conv2d = nn.Conv2d(in_channels=v[0], out_channels=v[1],
30 | kernel_size=v[2], stride=v[3],
31 | padding=v[4])
32 | layers.append((layer_name, conv2d))
33 | if layer_name not in no_relu_layers:
34 | layers.append(('relu_' + layer_name, nn.ReLU(inplace=True)))
35 |
36 | return nn.Sequential(OrderedDict(layers))
37 |
38 |
39 | class handpose_model(nn.Module):
40 | def __init__(self):
41 | super().__init__()
42 |
43 | # these layers have no relu layer
44 | no_relu_layers = ['conv6_2_CPM', 'Mconv7_stage2', 'Mconv7_stage3', \
45 | 'Mconv7_stage4', 'Mconv7_stage5', 'Mconv7_stage6']
46 | # stage 1
47 | block1_0 = OrderedDict({
48 | 'conv1_1': [3, 64, 3, 1, 1],
49 | 'conv1_2': [64, 64, 3, 1, 1],
50 | 'pool1_stage1': [2, 2, 0],
51 | 'conv2_1': [64, 128, 3, 1, 1],
52 | 'conv2_2': [128, 128, 3, 1, 1],
53 | 'pool2_stage1': [2, 2, 0],
54 | 'conv3_1': [128, 256, 3, 1, 1],
55 | 'conv3_2': [256, 256, 3, 1, 1],
56 | 'conv3_3': [256, 256, 3, 1, 1],
57 | 'conv3_4': [256, 256, 3, 1, 1],
58 | 'pool3_stage1': [2, 2, 0],
59 | 'conv4_1': [256, 512, 3, 1, 1],
60 | 'conv4_2': [512, 512, 3, 1, 1],
61 | 'conv4_3': [512, 512, 3, 1, 1],
62 | 'conv4_4': [512, 512, 3, 1, 1],
63 | 'conv5_1': [512, 512, 3, 1, 1],
64 | 'conv5_2': [512, 512, 3, 1, 1],
65 | 'conv5_3_CPM': [512, 128, 3, 1, 1]})
66 |
67 | block1_1 = OrderedDict({
68 | 'conv6_1_CPM': [128, 512, 1, 1, 0],
69 | 'conv6_2_CPM': [512, 22, 1, 1, 0]
70 | })
71 |
72 | blocks = {}
73 | blocks['block1_0'] = block1_0
74 | blocks['block1_1'] = block1_1
75 |
76 | # stage 2-6
77 | for i in range(2, 7):
78 | blocks['block%d' % i] = OrderedDict({
79 | 'Mconv1_stage%d' % i: [150, 128, 7, 1, 3],
80 | 'Mconv2_stage%d' % i: [128, 128, 7, 1, 3],
81 | 'Mconv3_stage%d' % i: [128, 128, 7, 1, 3],
82 | 'Mconv4_stage%d' % i: [128, 128, 7, 1, 3],
83 | 'Mconv5_stage%d' % i: [128, 128, 7, 1, 3],
84 | 'Mconv6_stage%d' % i: [128, 128, 1, 1, 0],
85 | 'Mconv7_stage%d' % i: [128, 22, 1, 1, 0]})
86 |
87 | for k in blocks.keys():
88 | blocks[k] = make_layers(blocks[k], no_relu_layers)
89 |
90 | self.model1_0 = blocks['block1_0']
91 | self.model1_1 = blocks['block1_1']
92 | self.model2 = blocks['block2']
93 | self.model3 = blocks['block3']
94 | self.model4 = blocks['block4']
95 | self.model5 = blocks['block5']
96 | self.model6 = blocks['block6']
97 | self.head_reg = nn.Sequential(
98 | Flatten(),
99 | nn.ReLU(),
100 | nn.Dropout(0.5),
101 | nn.Linear(22*46*46, 256),
102 | nn.ReLU(),
103 | nn.Dropout(0.5),
104 | nn.Linear(256, 42),
105 | Reshape(-1, 21, 2),
106 | nn.Tanh())
107 | self._initialize_weights()
108 |
109 | def forward(self, x):
110 | out1_0 = self.model1_0(x)
111 | out1_1 = self.model1_1(out1_0)
112 | concat_stage2 = torch.cat([out1_1, out1_0], 1)
113 | out_stage2 = self.model2(concat_stage2)
114 | concat_stage3 = torch.cat([out_stage2, out1_0], 1)
115 | out_stage3 = self.model3(concat_stage3)
116 | concat_stage4 = torch.cat([out_stage3, out1_0], 1)
117 | out_stage4 = self.model4(concat_stage4)
118 | concat_stage5 = torch.cat([out_stage4, out1_0], 1)
119 | out_stage5 = self.model5(concat_stage5)
120 | concat_stage6 = torch.cat([out_stage5, out1_0], 1)
121 | out_stage6 = self.model6(concat_stage6)
122 | x = self.head_reg(out_stage6)
123 | return x
124 |
125 | def _initialize_weights(self):
126 | for m in self.modules():
127 | if isinstance(m, nn.Conv2d):
128 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
129 | if m.bias is not None:
130 | nn.init.constant_(m.bias, 0)
131 | elif isinstance(m, nn.BatchNorm2d):
132 | nn.init.constant_(m.weight, 1)
133 | nn.init.constant_(m.bias, 0)
134 | elif isinstance(m, nn.Linear):
135 | nn.init.normal_(m.weight, 0, 0.01)
136 | nn.init.constant_(m.bias, 0)
137 |
138 |
139 | image_path = '/home/hanwei-1/data/hand_labels_synth/synth2_3'
140 |
141 |
142 | transforms = get_transforms(do_flip=False, max_zoom=1.05, max_warp=0.01,max_rotate=3, p_lighting=1)
143 |
144 | def get_y_func(x):
145 | pre, ext = os.path.splitext(x)
146 | hand_data_out = []
147 | # pre = pre.replace('synth2', 'synth2_json')
148 | hand_data = json.load(open(pre + '.json'))
149 | for i in range(21):
150 | hand_tem_xy = hand_data['hand_pts'][i][:2]
151 |         hand_tem_xy.reverse()  # fastai ImagePoints store coordinates as (y, x), hence the reverse
152 | hand_data_out.append(hand_tem_xy)
153 | return Tensor(hand_data_out)
154 |
155 |
156 | data = (PointsItemList.from_folder(path=image_path, extensions=['.jpg'], presort=True)
157 | .split_by_rand_pct()
158 | .label_from_func(get_y_func)
159 | .transform(transforms, size=368, tfm_y=True, remove_out=False,
160 | padding_mode='border', resize_method=ResizeMethod.PAD)
161 | .databunch(bs=32)
162 | .normalize(imagenet_stats))
163 |
164 |
165 | class MSELossFlat(nn.MSELoss):
166 | def forward(self, input:Tensor, target:Tensor):
167 | return super().forward(input.view(-1), target.view(-1))
168 |
169 |
170 | mse_loss_flat = MSELossFlat()
171 |
172 |
173 | class L2Loss(torch.nn.Module):
174 | def __init__(self, batch_size):
175 | super(L2Loss, self).__init__()
176 | self.batch_size = batch_size
177 |
178 | def forward(self, x: Variable, y: Variable, weights: Variable = None):
179 | if weights is not None:
180 | val = (x-y) * weights[:x.data.shape[0], :, :, :] # Slice by shape[n,..] for batch size (last batch < batch_size)
181 | else:
182 | val = x-y
183 | l = torch.sum(val ** 2) / self.batch_size / 2
184 | return l
185 |
186 |
187 | l2loss = L2Loss(batch_size=8)
188 |
189 | net = handpose_model()
190 |
191 |
192 | learn = Learner(data, net, loss_func=mse_loss_flat)
193 | learn.fit_one_cycle(cyc_len=200, max_lr=0.0001)
194 | learn.recorder.plot()
195 | plt.show()
196 | learn.lr_find()
197 | learn.recorder.plot()
198 | plt.show()
199 |
--------------------------------------------------------------------------------
/demo/img_keypoint_show.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import json
3 | import matplotlib.pyplot as plt
4 | import cv2
5 |
6 | im_dir = 'Path'
7 | json_dir = 'Path'
8 | hand_data_out = {}
9 |
10 | with open(json_dir, 'r') as f:
11 | hand_data = json.load(f)
12 |
13 | for i in range(21):
14 | hand_data_out[i] = hand_data['hand_pts'][i][:2]
15 |
16 | for j in range(21):
17 | for i in range(2):
18 | hand_data_out[j][i] = int(hand_data_out[j][i])
19 |
20 |
21 | def get_json_point(json_path):
22 | hand_data_out = {}
23 | hand_return = {}
24 | str_point = ''
25 |     with open(json_path, 'r') as f:
26 | hand_data = json.load(f)
27 |
28 | for i in range(21):
29 | hand_data_out[i] = hand_data['hand_pts'][i][:2]
30 |
31 | for j in range(21):
32 | for i in range(2):
33 | hand_data_out[j][i] = int(hand_data_out[j][i])
34 |
35 | hand_return[0] = hand_data_out[1]
36 | hand_return[1] = hand_data_out[7]
37 | hand_return[2] = hand_data_out[11]
38 | hand_return[3] = hand_data_out[15]
39 | hand_return[4] = hand_data_out[19]
40 | for key, value in hand_return.items():
41 | for i in range(2):
42 | str_point += str(value[i])
43 | str_point += ' '
44 |
45 | return hand_data_out
46 |
47 |
48 | data = get_json_point(json_dir)
49 |
50 | output = cv2.imread(im_dir)
51 | for i in range(21):
52 | cv2.circle(output, tuple(data[i]), 2, (0, 0, 255), 1)
53 | plt.imshow(output)
54 | plt.show()
55 |
56 |
--------------------------------------------------------------------------------
/image/Loss.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/image/Loss.png
--------------------------------------------------------------------------------
/image/com.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/image/com.png
--------------------------------------------------------------------------------
/image/front-back.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/image/front-back.jpg
--------------------------------------------------------------------------------
/image/hand.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/image/hand.jpeg
--------------------------------------------------------------------------------
/image/hand.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/image/hand.jpg
--------------------------------------------------------------------------------
/image/hand.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/image/hand.png
--------------------------------------------------------------------------------
/image/right-frontal.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/image/right-frontal.jpg
--------------------------------------------------------------------------------
/image/unnamed.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/image/unnamed.png
--------------------------------------------------------------------------------
/main/data/dataset.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | @File : dataset.py
4 | @Time : 2019/9/13 16:34
5 | @Author : KeyForce
6 | @Email : july.master@outlook.com
7 | """
8 | import os
9 | import torch
10 | import pandas as pd
11 | from skimage import io, transform
12 | import numpy as np
13 | import matplotlib.pyplot as plt
14 | from torch.utils.data import Dataset, DataLoader
15 | from torchvision import transforms, utils
16 | import json
17 |
18 |
19 | class ReadJsonPoint:
20 | """读取CMU手部21点关键点数据"""
21 | def __init__(self, json_path):
22 | self.json_path = json_path
23 | self.hand_point = []
24 |
25 | def read(self):
26 | with open(self.json_path, 'r') as f:
27 | hand_data = json.load(f)
28 |
29 | for i in range(21):
30 |             # Note: be careful not to mix up the x and y coordinates here
31 | hand_tem_xy = hand_data['hand_pts'][i][:2]
32 | hand_tem_xy = list(map(int, hand_tem_xy))
33 | self.hand_point.append(hand_tem_xy)
34 |
35 | # hand_point = list(map(int, hand_point))
36 |
37 | return np.array(self.hand_point)
38 |
39 |
40 | class CMUHandPointDataset(Dataset):
41 | """读取CMU手部关键点数据"""
42 |
43 | def __init__(self, root_dir, transform=None):
44 | self.root_dir = root_dir
45 | self.transform = transform
46 | self.image_name = []
47 |
48 |         # Separate the .jpg images from the .json annotations in the directory
49 | file_list = os.listdir(root_dir)
50 | for i in file_list:
51 | if os.path.splitext(i)[1] == '.jpg':
52 | self.image_name.append(i)
53 |
54 | def __getitem__(self, item):
55 | if torch.is_tensor(item):
56 | item = item.tolist()
57 |
58 | img_path = os.path.join(self.root_dir,
59 | self.image_name[item])
60 | image = io.imread(img_path)
61 | json_path = os.path.join(img_path.replace('.jpg', '.json'))
62 |         # Read the annotation via ReadJsonPoint.read()
63 | landmarks = ReadJsonPoint(json_path).read()
64 | sample = {'image': image, 'landmarks': landmarks}
65 |
66 | if self.transform:
67 | sample = self.transform(sample)
68 |
69 | return sample
70 |
71 | def __len__(self):
72 | return len(self.image_name)
73 |
74 |
75 | class Rescale(object):
76 | """Rescale the image in a sample to a given size.
77 |
78 | Args:
79 | output_size (tuple or int): Desired output size. If tuple, output is
80 | matched to output_size. If int, smaller of image edges is matched
81 | to output_size keeping aspect ratio the same.
82 | """
83 |
84 | def __init__(self, output_size):
85 | assert isinstance(output_size, (int, tuple))
86 | self.output_size = output_size
87 |
88 | def __call__(self, sample):
89 | image, landmarks = sample['image'], sample['landmarks']
90 |
91 | h, w = image.shape[:2]
92 | if isinstance(self.output_size, int):
93 | if h > w:
94 | new_h, new_w = self.output_size * h / w, self.output_size
95 | else:
96 | new_h, new_w = self.output_size, self.output_size * w / h
97 | else:
98 | new_h, new_w = self.output_size
99 |
100 | new_h, new_w = int(new_h), int(new_w)
101 |
102 | img = transform.resize(image, (new_h, new_w))
103 |
104 | # h and w are swapped for landmarks because for images,
105 | # x and y axes are axis 1 and 0 respectively
106 | landmarks = landmarks * [new_w / w, new_h / h]
107 |
108 |
109 | return {'image': img, 'landmarks': landmarks}
110 |
111 |
112 | class RandomCrop(object):
113 | """Crop randomly the image in a sample.
114 |
115 | Args:
116 | output_size (tuple or int): Desired output size. If int, square crop
117 | is made.
118 | """
119 |
120 | def __init__(self, output_size):
121 | assert isinstance(output_size, (int, tuple))
122 | if isinstance(output_size, int):
123 | self.output_size = (output_size, output_size)
124 | else:
125 | assert len(output_size) == 2
126 | self.output_size = output_size
127 |
128 | def __call__(self, sample):
129 | image, landmarks = sample['image'], sample['landmarks']
130 |
131 | h, w = image.shape[:2]
132 | new_h, new_w = self.output_size
133 |
134 | top = np.random.randint(0, h - new_h)
135 | left = np.random.randint(0, w - new_w)
136 |
137 | image = image[top: top + new_h,
138 | left: left + new_w]
139 |
140 | landmarks = landmarks - [left, top]
141 |
142 | return {'image': image, 'landmarks': landmarks}
143 |
144 |
145 | class ToTensor(object):
146 | """Convert ndarrays in sample to Tensors."""
147 |
148 | def __call__(self, sample):
149 | image, landmarks = sample['image'], sample['landmarks']
150 |
151 | # swap color axis because
152 | # numpy image: H x W x C
153 | # torch image: C X H X W
154 | image = image.transpose((2, 0, 1))
155 | return {'image': torch.from_numpy(image),
156 | 'landmarks': torch.from_numpy(landmarks)}
157 |
158 |
159 | def show_landmarks(image, landmarks):
160 | """显示landmark,以方便检查数据"""
161 | plt.imshow(image)
162 | x = []
163 | y = []
164 | for i in range(21):
165 | x.append(landmarks[i][0])
166 | y.append(landmarks[i][1])
167 | plt.scatter(x, y, s=10, marker='.', c='r')
168 |
169 |
170 | if __name__ == '__main__':
171 | root_dir = '/home/wild/Hand-Keypoint-Estimation/data/Hands from Synthetic Data (6546 + 3243 + 2348 ' \
172 | '+ 2124 = 14261 annotations)/hand_labels_synth/synth2'
173 |
174 | composed = transforms.Compose([Rescale(368),
175 | ToTensor()])
176 |
177 | Data = CMUHandPointDataset(root_dir, composed)
178 |
179 | for i in range(8):
180 | sample = Data[i]
181 |
182 | print(i, sample['image'].shape)
183 | print('First 4 Landmarks: {}'.format(sample['landmarks'][:4]))
184 | ax = plt.subplot(2, 4, i + 1)
185 | plt.imshow(sample['image'].permute(1, 2, 0))
186 | x = []
187 | y = []
188 | for i in range(21):
189 | x.append(np.array(sample['landmarks'][i][0]))
190 | y.append(np.array(sample['landmarks'][i][1]))
191 | plt.scatter(x, y, s=10, marker='.', c='r')
192 |
193 | plt.show()
194 |
--------------------------------------------------------------------------------
/main/demo.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | @File : demo.py
4 | @Time : 2019/9/14 11:41
5 | @Author : KeyForce
6 | @Email : july.master@outlook.com
7 | """
8 |
--------------------------------------------------------------------------------
/main/train.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | @File : train.py
4 | @Time : 2019/9/13 16:33
5 | @Author : KeyForce
6 | @Email : july.master@outlook.com
7 | """
8 | import matplotlib.pyplot as plt
9 | import torch
10 | import torch.optim as optim
11 | import numpy as np
12 | import torch.nn as nn
13 |
14 |
15 | def Train(model, train_loader, criterion, optimizer, device, metrics=None, lr_scheduler=None, epoch=30):
16 | """
17 |     Train the model
18 |     :param model: the model
19 |     :param train_loader: training data loader
20 |     :param criterion: loss function
21 |     :param optimizer: optimizer
22 |     :param device: GPU or CPU
23 |     :param metrics: evaluation metric
24 |     :param lr_scheduler: learning-rate scheduler
25 |     :param epoch: number of epochs
26 |     :return:
27 | """
28 | model.train()
29 | for batch_idx, (image, label) in enumerate(train_loader):
30 | image, label = image.to(device), label.to(device)
31 | optimizer.zero_grad()
32 | output = model(image)
33 | label = label.long()
34 | loss = criterion(output, label)
35 | loss.backward()
36 | optimizer.step()
37 | # Log
38 | if batch_idx % 10 == 0:
39 | print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.
40 | format(epoch,
41 | batch_idx * len(image),
42 | len(train_loader.dataset),
43 | 100. * batch_idx / len(train_loader),
44 | loss.item())
45 | )
46 |
47 |
48 | def Test(model, test_loader, criterion, device, epoch):
49 | """
50 |     Evaluate the model on the test set
51 |     :param model: the model
52 |     :param test_loader: test data loader
53 |     :param criterion: loss function
54 |     :param device: GPU or CPU
55 |     :param epoch:
56 |     :return:
57 | """
58 | model.eval()
59 | test_loss = 0
60 | correct = 0
61 | confusion_matrix = np.zeros((21, 21))
62 | flag = 0
63 | with torch.no_grad():
64 | for image, label in test_loader:
65 | image, label = image.to(device), label.to(device)
66 | output = model(image)
67 | label = label.long()
68 | loss = criterion(output, label)
69 | test_loss += loss.item()
70 | pred = output.argmax(dim=1, keepdim=True)
71 |             # PA (pixel accuracy)
72 | num_class = 21
73 | pre_image = pred.squeeze(1).cpu().numpy()
74 |
75 | gt_image = label.cpu().numpy()
76 |
77 | confusion_matrix = fast_hist(gt_image, pre_image, num_class)
78 | # plt.close()
79 | PA = np.diag(confusion_matrix).sum() / confusion_matrix.sum()
80 | test_loss /= len(test_loader.dataset)
81 |
82 |     print('\nTest set: Average loss: {:.4f}, PA: {}\n'.
83 |           format(test_loss,
84 |                  PA,
85 |                  )
86 |           )
87 |
88 |
89 | def fast_hist(a, b, n):  # n x n confusion matrix from ground-truth labels a and predictions b
90 | k = (a >= 0) & (a < n)
91 | return np.bincount(n * a[k].astype(int) + b[k], minlength=n ** 2).reshape(n, n)
92 |
93 |
94 | def main():
95 |     # Load the data (build train_data and test_data here)
96 |
97 |
98 |     # Use drop_last so every batch is a full batch
99 | train_loader = torch.utils.data.DataLoader(train_data, batch_size=16, drop_last=True)
100 | test_loader = torch.utils.data.DataLoader(test_data, batch_size=16, drop_last=True)
101 |
102 |     # Select the GPU
103 | torch.cuda.set_device(0)
104 | device = torch.device("cuda")
105 |     # Initialize the model, loss, and optimizer
106 | model =
107 | loss = nn.CrossEntropyLoss(ignore_index=255, reduction='mean').to(device)
108 | optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)
109 | # optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.8, weight_decay=5e-4)
110 |     # Start training
111 | for epoch in range(40):
112 | Train(model, train_loader=train_loader,
113 | criterion=loss, optimizer=optimizer,
114 | device=device, epoch=epoch)
115 | Test(model, test_loader, loss, device, epoch)
116 |
117 |
118 | if __name__ == '__main__':
119 | main()
--------------------------------------------------------------------------------
/other/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 2.8.12)
2 |
3 | PROJECT(handPose)
4 |
5 | find_package( OpenCV REQUIRED )
6 |
7 | include_directories( ${OpenCV_INCLUDE_DIRS})
8 |
9 | MACRO(add_example name)
10 | ADD_EXECUTABLE(${name} ${name}.cpp)
11 | TARGET_LINK_LIBRARIES(${name} ${OpenCV_LIBS})
12 | ENDMACRO()
13 |
14 |
15 | add_example(handPoseImage)
16 | add_example(handPoseVideo)
17 |
--------------------------------------------------------------------------------
/other/Hand_Caffe/1_F_deploy.prototxt:
--------------------------------------------------------------------------------
1 | # This file gives the CNN model that predicts all landmarks in LEVEL-1
2 | name: "landmark_1_F"
3 | layer {
4 | name: "data"
5 | type: "MemoryData"
6 | top: "data"
7 | top: "landmark"
8 |
9 | memory_data_param {
10 | batch_size: 1
11 | channels: 1
12 | height: 39
13 | width: 39
14 | }
15 | transform_param {
16 | scale: 0.00390625
17 | }
18 | }
19 | layer {
20 | name: "conv1"
21 | type: "Convolution"
22 | bottom: "data"
23 | top: "conv1"
24 | param {
25 | lr_mult: 1
26 | }
27 | param {
28 | lr_mult: 2
29 | }
30 | convolution_param {
31 | num_output: 20
32 | kernel_size: 4
33 | weight_filler {
34 | type: "xavier"
35 | }
36 | bias_filler {
37 | type: "constant"
38 | }
39 | }
40 | }
41 | layer {
42 | name: "relu1"
43 | type: "ReLU"
44 | bottom: "conv1"
45 | top: "conv1"
46 | }
47 | layer {
48 | name: "pool1"
49 | type: "Pooling"
50 | bottom: "conv1"
51 | top: "pool1"
52 | pooling_param {
53 | pool: MAX
54 | kernel_size: 2
55 | stride: 2
56 | }
57 | }
58 | layer {
59 | name: "conv2"
60 | type: "Convolution"
61 | bottom: "pool1"
62 | top: "conv2"
63 | param {
64 | lr_mult: 1
65 | }
66 | param {
67 | lr_mult: 2
68 | }
69 | convolution_param {
70 | num_output: 40
71 | kernel_size: 3
72 | weight_filler {
73 | type: "xavier"
74 | }
75 | bias_filler {
76 | type: "constant"
77 | }
78 | }
79 | }
80 | layer {
81 | name: "relu2"
82 | type: "ReLU"
83 | bottom: "conv2"
84 | top: "conv2"
85 | }
86 | layer {
87 | name: "pool2"
88 | type: "Pooling"
89 | bottom: "conv2"
90 | top: "pool2"
91 | pooling_param {
92 | pool: MAX
93 | kernel_size: 2
94 | stride: 2
95 | }
96 | }
97 | layer {
98 | name: "conv3"
99 | type: "Convolution"
100 | bottom: "pool2"
101 | top: "conv3"
102 | param {
103 | lr_mult: 1
104 | }
105 | param {
106 | lr_mult: 2
107 | }
108 | convolution_param {
109 | num_output: 60
110 | kernel_size: 3
111 | weight_filler {
112 | type: "xavier"
113 | }
114 | bias_filler {
115 | type: "constant"
116 | }
117 | }
118 | }
119 | layer {
120 | name: "relu3"
121 | type: "ReLU"
122 | bottom: "conv3"
123 | top: "conv3"
124 | }
125 | layer {
126 | name: "pool3"
127 | type: "Pooling"
128 | bottom: "conv3"
129 | top: "pool3"
130 | pooling_param {
131 | pool: MAX
132 | kernel_size: 2
133 | stride: 2
134 | }
135 | }
136 | layer {
137 | name: "conv4"
138 | type: "Convolution"
139 | bottom: "pool3"
140 | top: "conv4"
141 | param {
142 | lr_mult: 1
143 | }
144 | param {
145 | lr_mult: 2
146 | }
147 | convolution_param {
148 | num_output: 80
149 | kernel_size: 2
150 | weight_filler {
151 | type: "xavier"
152 | }
153 | bias_filler {
154 | type: "constant"
155 | }
156 | }
157 | }
158 | layer {
159 | name: "relu4"
160 | type: "ReLU"
161 | bottom: "conv4"
162 | top: "conv4"
163 | }
164 | layer {
165 | name: "pool3_flat"
166 | type: "Flatten"
167 | bottom: "pool3"
168 | top: "pool3_flat"
169 | }
170 | layer {
171 | name: "conv4_flat"
172 | type: "Flatten"
173 | bottom: "conv4"
174 | top: "conv4_flat"
175 | }
176 | layer {
177 | name: "concat"
178 | type: "Concat"
179 | bottom: "pool3_flat"
180 | bottom: "conv4_flat"
181 | top: "faker"
182 | concat_param {
183 | concat_dim: 1
184 | }
185 | }
186 | layer {
187 | name: "fc1"
188 | type: "InnerProduct"
189 | bottom: "faker"
190 | top: "fc1"
191 | param {
192 | lr_mult: 1
193 | }
194 | param {
195 | lr_mult: 2
196 | }
197 | inner_product_param {
198 | num_output: 120
199 | weight_filler {
200 | type: "xavier"
201 | }
202 | bias_filler {
203 | type: "constant"
204 | }
205 | }
206 | }
207 | layer {
208 | name: "relu_fc1"
209 | type: "ReLU"
210 | bottom: "fc1"
211 | top: "fc1"
212 | }
213 | layer {
214 | name: "fc2"
215 | type: "InnerProduct"
216 | bottom: "fc1"
217 | top: "fc2"
218 | param {
219 | lr_mult: 1
220 | }
221 | param {
222 | lr_mult: 2
223 | }
224 | inner_product_param {
225 | num_output: 10
226 | weight_filler {
227 | type: "xavier"
228 | }
229 | bias_filler {
230 | type: "constant"
231 | }
232 | }
233 | }
234 | layer {
235 | name: "relu_fc2"
236 | type: "ReLU"
237 | bottom: "fc2"
238 | top: "fc2"
239 | }
240 |
--------------------------------------------------------------------------------
/other/Hand_Caffe/1_F_solver.prototxt:
--------------------------------------------------------------------------------
1 | net: "/home/wild/Face_Landmark/Hand_Test/1_F_train.prototxt"
2 |
3 | test_iter: 25
4 | test_interval: 1000
5 |
6 | base_lr: 0.001
7 | momentum: 0.9
8 | weight_decay: 0.0005
9 |
10 | lr_policy: "inv"
11 | gamma: 0.0001
12 | power: 0.75
13 |
14 | #lr_policy: "step"
15 | #gamma: 0.1
16 | #stepsize: 50000
17 |
18 | display: 200
19 |
20 | max_iter: 50000
21 |
22 | snapshot: 50000
23 | snapshot_prefix: "/home/wild/Face_Landmark/Hand_Test/"
24 |
25 | test_compute_loss: true
26 |
27 | solver_mode: GPU
--------------------------------------------------------------------------------
/other/Hand_Caffe/1_F_train.prototxt:
--------------------------------------------------------------------------------
1 | # This file gives the CNN model that predicts all landmarks in LEVEL-1
2 | name: "landmark_1_F"
3 | layer {
4 | name: "hdf5_train_data"
5 | type: "HDF5Data"
6 | top: "data"
7 | top: "landmark"
8 | include {
9 | phase: TRAIN
10 | }
11 | hdf5_data_param {
12 | source: "/home/wild/Face_Landmark/Hand_Test/Mytrain/1_F/train.txt"
13 | batch_size: 64
14 | }
15 | }
16 | layer {
17 | name: "hdf5_test_data"
18 | type: "HDF5Data"
19 | top: "data"
20 | top: "landmark"
21 | include {
22 | phase: TEST
23 | }
24 | hdf5_data_param {
25 | source: "/home/wild/Face_Landmark/Hand_Test/Mytrain/1_F/test.txt"
26 | batch_size: 64
27 | }
28 | }
29 | layer {
30 | name: "conv1"
31 | type: "Convolution"
32 | bottom: "data"
33 | top: "conv1"
34 | param {
35 | lr_mult: 1
36 | }
37 | param {
38 | lr_mult: 2
39 | }
40 | convolution_param {
41 | num_output: 20
42 | kernel_size: 4
43 | weight_filler {
44 | type: "xavier"
45 | }
46 | bias_filler {
47 | type: "constant"
48 | }
49 | }
50 | }
51 | layer {
52 | name: "relu1"
53 | type: "ReLU"
54 | bottom: "conv1"
55 | top: "conv1"
56 | }
57 | layer {
58 | name: "pool1"
59 | type: "Pooling"
60 | bottom: "conv1"
61 | top: "pool1"
62 | pooling_param {
63 | pool: MAX
64 | kernel_size: 2
65 | stride: 2
66 | }
67 | }
68 | layer {
69 | name: "conv2"
70 | type: "Convolution"
71 | bottom: "pool1"
72 | top: "conv2"
73 | param {
74 | lr_mult: 1
75 | }
76 | param {
77 | lr_mult: 2
78 | }
79 | convolution_param {
80 | num_output: 40
81 | kernel_size: 3
82 | weight_filler {
83 | type: "xavier"
84 | }
85 | bias_filler {
86 | type: "constant"
87 | }
88 | }
89 | }
90 | layer {
91 | name: "relu2"
92 | type: "ReLU"
93 | bottom: "conv2"
94 | top: "conv2"
95 | }
96 | layer {
97 | name: "pool2"
98 | type: "Pooling"
99 | bottom: "conv2"
100 | top: "pool2"
101 | pooling_param {
102 | pool: MAX
103 | kernel_size: 2
104 | stride: 2
105 | }
106 | }
107 | layer {
108 | name: "conv3"
109 | type: "Convolution"
110 | bottom: "pool2"
111 | top: "conv3"
112 | param {
113 | lr_mult: 1
114 | }
115 | param {
116 | lr_mult: 2
117 | }
118 | convolution_param {
119 | num_output: 60
120 | kernel_size: 3
121 | weight_filler {
122 | type: "xavier"
123 | }
124 | bias_filler {
125 | type: "constant"
126 | }
127 | }
128 | }
129 | layer {
130 | name: "relu3"
131 | type: "ReLU"
132 | bottom: "conv3"
133 | top: "conv3"
134 | }
135 | layer {
136 | name: "pool3"
137 | type: "Pooling"
138 | bottom: "conv3"
139 | top: "pool3"
140 | pooling_param {
141 | pool: MAX
142 | kernel_size: 2
143 | stride: 2
144 | }
145 | }
146 | layer {
147 | name: "conv4"
148 | type: "Convolution"
149 | bottom: "pool3"
150 | top: "conv4"
151 | param {
152 | lr_mult: 1
153 | }
154 | param {
155 | lr_mult: 2
156 | }
157 | convolution_param {
158 | num_output: 80
159 | kernel_size: 2
160 | weight_filler {
161 | type: "xavier"
162 | }
163 | bias_filler {
164 | type: "constant"
165 | }
166 | }
167 | }
168 | layer {
169 | name: "relu4"
170 | type: "ReLU"
171 | bottom: "conv4"
172 | top: "conv4"
173 | }
174 | layer {
175 | name: "pool3_flat"
176 | type: "Flatten"
177 | bottom: "pool3"
178 | top: "pool3_flat"
179 | }
180 | layer {
181 | name: "conv4_flat"
182 | type: "Flatten"
183 | bottom: "conv4"
184 | top: "conv4_flat"
185 | }
186 | layer {
187 | name: "concat"
188 | type: "Concat"
189 | bottom: "pool3_flat"
190 | bottom: "conv4_flat"
191 | top: "faker"
192 | concat_param {
193 | concat_dim: 1
194 | }
195 | }
196 | layer {
197 | name: "fc1"
198 | type: "InnerProduct"
199 | bottom: "faker"
200 | top: "fc1"
201 | param {
202 | lr_mult: 1
203 | }
204 | param {
205 | lr_mult: 2
206 | }
207 | inner_product_param {
208 | num_output: 120
209 | weight_filler {
210 | type: "xavier"
211 | }
212 | bias_filler {
213 | type: "constant"
214 | }
215 | }
216 | }
217 | layer {
218 | name: "relu_fc1"
219 | type: "ReLU"
220 | bottom: "fc1"
221 | top: "fc1"
222 | }
223 | layer {
224 | name: "fc2"
225 | type: "InnerProduct"
226 | bottom: "fc1"
227 | top: "fc2"
228 | param {
229 | lr_mult: 1
230 | }
231 | param {
232 | lr_mult: 2
233 | }
234 | inner_product_param {
235 | num_output: 10
236 | weight_filler {
237 | type: "xavier"
238 | }
239 | bias_filler {
240 | type: "constant"
241 | }
242 | }
243 | }
244 | layer {
245 | name: "relu_fc2"
246 | type: "ReLU"
247 | bottom: "fc2"
248 | top: "fc2"
249 | }
250 | layer {
251 | name: "error"
252 | type: "EuclideanLoss"
253 | bottom: "fc2"
254 | bottom: "landmark"
255 | top: "error"
256 | include {
257 | phase: TEST
258 | }
259 | }
260 | layer {
261 | name: "loss"
262 | type: "EuclideanLoss"
263 | bottom: "fc2"
264 | bottom: "landmark"
265 | top: "loss"
266 | include {
267 | phase: TRAIN
268 | }
269 | }
270 |
--------------------------------------------------------------------------------
/other/Hand_Caffe/create_txt.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import cv2
4 | import numpy
5 |
6 | def make_bbox_bigger(data, xR, yR, wR, hR):
7 |
8 | xDelta = data[0] * xR
9 | yDelta = data[1] * yR
10 | wDelta = data[2] * wR
11 | hDelta = data[3] * hR
12 |
13 | x = data[0] + xDelta
14 | y = data[1] + yDelta
15 | w = data[2] + wDelta
16 | h = data[3] + hDelta
17 | return [int(x), int(y), int(w), int(h)]
18 |
19 | def get_json_point(json_path):
20 | hand_data_out = {}
21 | hand_return = {}
22 | str_point = ''
23 | cnt = numpy.zeros((21, 2), dtype=int)
24 | with open(json_path, 'r') as f:
25 | hand_data = json.load(f)
26 |
27 | for i in range(21):
28 | hand_data_out[i] = hand_data['hand_pts'][i][:2]
29 |
30 | for j in range(21):
31 | for i in range(2):
32 | hand_data_out[j][i] = int(hand_data_out[j][i])
33 |
34 | for i in range(21):
35 | cnt[i] = numpy.array(hand_data_out[i])
36 |
37 | index = [4, 8, 12, 16, 20]
38 | new_a = numpy.delete(cnt, index, axis=0)
39 | x, y, w, h = cv2.boundingRect(new_a)
40 | x, y, w, h = make_bbox_bigger([x, y, w, h], -0.08, -0.08, 0.8, 0.8)
41 |
42 | hand_return[0] = hand_data_out[1]
43 | hand_return[1] = hand_data_out[7]
44 | hand_return[2] = hand_data_out[11]
45 | hand_return[3] = hand_data_out[15]
46 | hand_return[4] = hand_data_out[19]
47 |
48 | # box
49 | hand_return[5] = [x, y]
50 | hand_return[6] = [w, h]
51 | for key, value in hand_return.items():
52 | for i in range(2):
53 | str_point += str(value[i])
54 | str_point += ' '
55 |
56 | return str_point
57 |
58 |
59 | if __name__ == '__main__':
60 | data_sources = ['synth1', 'synth2', 'synth3', 'synth4']
61 | root_dir = '/home/wild/Hand-Keypoint-Estimation/Hands from Synthetic Data (6546 + 3243 + 2348 + 2124 = 14261 annotations)/hand_labels_synth'
62 |
63 | data = []
64 |
65 | for data_source in data_sources:
66 | im_dir = os.path.join(root_dir, data_source)
67 | for im_file in os.listdir(im_dir):
68 | if '.jpg' in im_file:
69 | name = im_file.rstrip('.jpg')
70 | json_file_path = os.path.join(root_dir, data_source, name + '.json')
71 | im_file_path = os.path.join(data_source, name + '.jpg')
72 | point = get_json_point(json_file_path)
73 | data.append(" ".join([im_file_path, point]))
74 |
75 | with open('{}/data.txt'.format(root_dir), 'w') as f:
76 | for image_point in data:
77 | f.write('{}\r\n'.format(image_point))
78 |
79 | train = data[:int(len(data) * 0.7)]
80 | test = data[int(len(data) * 0.7):]
81 |
82 |     with open('{}/train.txt'.format(root_dir), 'w') as f:
83 |         for image_point in train:
84 |             f.write('{}\r\n'.format(image_point))
85 |
86 |     with open('{}/test.txt'.format(root_dir), 'w') as f:
87 |         for image_point in test:
88 |             f.write('{}\r\n'.format(image_point))
89 |
90 |
91 | # random.shuffle(test_data)
92 | # random.shuffle(test_data)
93 | # random.shuffle(train_data)
94 | # random.shuffle(train_data)
95 | #
96 | # with open('test.txt', 'w') as f:
97 | # f.write('\n'.join(test_data))
98 | # with open('trainval.txt', 'w') as f:
99 | # f.write('\n'.join(train_data))
100 |
--------------------------------------------------------------------------------
/other/Hand_Caffe/getBox.py:
--------------------------------------------------------------------------------
1 | import json
2 | import cv2
3 | import numpy
4 | import matplotlib.pyplot as plt
5 |
6 | im_dir = '/home/wild/Hand-Keypoint-Estimation/Hands from Synthetic Data (6546 + 3243 + 2348 + 2124 = 14261 annotations)/hand_labels_synth/synth1/0001.jpg'
7 | json_dir = '/home/wild/Hand-Keypoint-Estimation/Hands from Synthetic Data (6546 + 3243 + 2348 + 2124 = 14261 annotations)/hand_labels_synth/synth1/0001.json'
8 | hand_data_out = {}
9 |
10 | hand_data_out = {}
11 | cnt = numpy.zeros((21, 2), dtype=int)
12 | with open(json_dir, 'r') as f:
13 | hand_data = json.load(f)
14 |
15 | for i in range(21):
16 | hand_data_out[i] = hand_data['hand_pts'][i][:2]
17 |
18 | for j in range(21):
19 | for i in range(2):
20 | hand_data_out[j][i] = int(hand_data_out[j][i])
21 |
22 | for i in range(21):
23 | cnt[i] = numpy.array(hand_data_out[i])
24 |
25 | index = [4, 8, 12, 16, 20]
26 | new_a = numpy.delete(cnt, index, axis=0)
27 | img = cv2.imread(im_dir)
28 | x, y, w, h = cv2.boundingRect(new_a)
29 |
30 |
31 | def make_bbox_bigger(data, xR, yR, wR, hR):
32 |
33 | xDelta = data[0] * xR
34 | yDelta = data[1] * yR
35 | wDelta = data[2] * wR
36 | hDelta = data[3] * hR
37 |
38 | x = data[0] + xDelta
39 | y = data[1] + yDelta
40 | w = data[2] + wDelta
41 | h = data[3] + hDelta
42 | return [int(x), int(y), int(w), int(h)]
43 |
44 | x, y, w, h = make_bbox_bigger([x, y, w, h], -0.08, -0.08, 0.08, 0.08)
45 |
46 | cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 1)
47 |
48 | cv2.imwrite('hand.jpeg', img)
49 | plt.imshow(img)
50 | plt.show()
51 |
52 |
53 |
--------------------------------------------------------------------------------
/other/Hand_Caffe/hand.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/other/Hand_Caffe/hand.jpeg
--------------------------------------------------------------------------------
/other/Hand_Caffe/level1.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2.7
2 | # coding: utf-8
3 |
4 |
5 | import os
6 | from os.path import join, exists
7 |
8 | import cv2
9 | import h5py
10 | import numpy as np
11 | import matplotlib.pyplot as plt
12 | from utils import getDataFromTxt
13 | from utils import shuffle_in_unison_scary, logger, createDir, processImage
14 |
15 | TRAIN = '/home/wild/Hand-Keypoint-Estimation/Hands from Synthetic Data (6546 + 3243 + 2348 + 2124 = 14261 annotations)/hand_labels_synth'
16 | OUTPUT = '/home/wild/Face_Landmark/Hand_Test/Mytrain'
17 | if not exists(OUTPUT):
18 | os.mkdir(OUTPUT)
19 | assert(exists(TRAIN) and exists(OUTPUT))
20 |
21 |
22 | def generate_hdf5(ftxt, output, fname, argument=False):
23 |
24 | data = getDataFromTxt(ftxt)
25 | F_imgs = []
26 | F_landmarks = []
27 |
28 | for (imgPath, landmarkGt, bbox) in data:
29 | img = cv2.imread(imgPath, cv2.IMREAD_GRAYSCALE)
30 | assert(img is not None)
31 | logger("process %s" % imgPath)
32 | # plt.imshow(img)
33 | # plt.show()
34 |
35 | f_face = img[int(bbox[0]):int(bbox[2]), int(bbox[1]):int(bbox[3])]
36 | plt.imshow(f_face)
37 | plt.show()
38 |
39 | f_face = cv2.resize(f_face, (39, 39))
40 |
41 | f_face = f_face.reshape((1, 39, 39))
42 |
43 | f_landmark = landmarkGt.reshape((10))
44 | F_imgs.append(f_face)
45 | F_landmarks.append(f_landmark)
46 |
47 |
48 |
49 | F_imgs, F_landmarks = np.asarray(F_imgs), np.asarray(F_landmarks)
50 |
51 |
52 | F_imgs = processImage(F_imgs)
53 | shuffle_in_unison_scary(F_imgs, F_landmarks)
54 |
55 |
56 | # full face
57 | base = join(OUTPUT, '1_F')
58 | createDir(base)
59 | output = join(base, fname)
60 | logger("generate %s" % output)
61 |
62 |
63 | with h5py.File(output, 'w') as h5:
64 | h5['data'] = F_imgs.astype(np.float32)
65 | h5['landmark'] = F_landmarks.astype(np.float32)
66 |
67 |
68 |
69 | if __name__ == '__main__':
70 |
71 | h5_path = '/home/wild/Face_Landmark/Hand_Test/Mytrain'
72 |     # training set
73 | train_txt = join(TRAIN, 'train.txt')
74 | generate_hdf5(train_txt, OUTPUT, 'train.h5', argument=True)
75 |     # test set
76 | test_txt = join(TRAIN, 'test.txt')
77 | generate_hdf5(test_txt, OUTPUT, 'test.h5')
78 |
79 | with open(join(OUTPUT, '1_F/train.txt'), 'w') as fd:
80 | fd.write(h5_path+'/1_F/train.h5')
81 |
82 | with open(join(OUTPUT, '1_F/test.txt'), 'w') as fd:
83 | fd.write(h5_path+'/1_F/test.h5')
84 |
85 | print 'ok'
--------------------------------------------------------------------------------
/other/Hand_Caffe/read_im_json.py:
--------------------------------------------------------------------------------
1 | import json
2 | import cv2
3 | im_dir = '/home/wild/Hand-Keypoint-Estimation/Hands from Synthetic Data (6546 + 3243 + 2348 + 2124 = 14261 annotations)/hand_labels_synth/synth1/0001.jpg'
4 | json_dir = '/home/wild/Hand-Keypoint-Estimation/Hands from Synthetic Data (6546 + 3243 + 2348 + 2124 = 14261 annotations)/hand_labels_synth/synth1/0001.json'
5 | hand_data_out = {}
6 | with open(json_dir, 'r') as f:
7 | hand_data = json.load(f)
8 |
9 | for i in range(21):
10 | hand_data_out[i] = hand_data['hand_pts'][i][:2]
11 |
12 | for j in range(21):
13 | for i in range(2):
14 | hand_data_out[j][i] = int(hand_data_out[j][i])
15 |
16 | def get_json_point(json_path):
17 | hand_data_out = {}
18 | hand_return = {}
19 | str_point = ''
20 |     with open(json_path, 'r') as f:
21 | hand_data = json.load(f)
22 |
23 | for i in range(21):
24 | hand_data_out[i] = hand_data['hand_pts'][i][:2]
25 |
26 | for j in range(21):
27 | for i in range(2):
28 | hand_data_out[j][i] = int(hand_data_out[j][i])
29 |
30 | hand_return[0] = hand_data_out[1]
31 | hand_return[1] = hand_data_out[7]
32 | hand_return[2] = hand_data_out[11]
33 | hand_return[3] = hand_data_out[15]
34 | hand_return[4] = hand_data_out[19]
35 | for key, value in hand_return.items():
36 | for i in range(2):
37 | str_point += str(value[i])
38 | str_point += ' '
39 |
40 | return str_point
41 |
42 | data = get_json_point(json_dir)
43 |
44 | # output = cv2.imread(im_dir)
45 | # for i in range(5):
46 | # cv2.circle(output, tuple(data[i]), 2, (0, 0, 255), 1)
47 | # cv2.imshow("capture", output)
48 | # while True:
49 | # if cv2.waitKey(1) == 27:
50 | # break # esc to quit
--------------------------------------------------------------------------------
/other/Hand_Caffe/utils.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | import os
4 | import time
5 |
6 | import cv2
7 | import numpy as np
8 |
9 |
10 | def logger(msg):
11 | """
12 | log message
13 | """
14 | now = time.ctime()
15 | print("[%s] %s" % (now, msg))
16 |
17 |
18 | def createDir(p):
19 | if not os.path.exists(p):
20 | os.mkdir(p)
21 |
22 |
23 | def shuffle_in_unison_scary(a, b):
24 | rng_state = np.random.get_state()
25 | np.random.shuffle(a)
26 | np.random.set_state(rng_state)
27 | np.random.shuffle(b)
28 |
29 |
30 | def drawLandmark(img, bbox, landmark):
31 | cv2.rectangle(img, (bbox.left, bbox.top), (bbox.right, bbox.bottom), (0, 0, 255), 2)
32 | for x, y in landmark:
33 | cv2.circle(img, (int(x), int(y)), 2, (0, 255, 0), -1)
34 | return img
35 |
36 |
37 | def getDataFromTxt(txt, with_landmark=True):
38 | """
39 | Generate data from txt file
40 | return [(img_path, landmark)]
41 | landmark: [(x1, y1), (x2, y2), ...]
42 | """
43 | dirname = os.path.dirname(txt)
44 | with open(txt, 'r') as fd:
45 | lines = fd.readlines()
46 |
47 | result = []
48 | for line in lines:
49 | line = line.strip()
50 | components = line.split(' ')
51 | img_path = os.path.join(dirname, components[0].replace('\\', '/')) # file path
52 | # bounding box, (x, y, w, h)
53 | bbox = (components[11], components[12], int(components[11]) + int(components[13]), int(components[12]) + int(components[14]))
54 |         # convert the strings to integers
55 | bbox = [int(_) for _ in bbox]
56 | # landmark
57 | if not with_landmark:
58 | result.append((img_path, BBox(bbox)))
59 | continue
60 | landmark = np.zeros((5, 2))
61 |         # read the keypoint coordinates
62 | for index in range(0, 5):
63 | rv = (float(components[1 + 2 * index]), float(components[1 + 2 * index + 1]))
64 | landmark[index] = rv
65 | for index, one in enumerate(landmark):
66 | rv = ((one[0] - bbox[0]) / (bbox[1] - bbox[0]), (one[1] - bbox[2]) / (bbox[3] - bbox[2]))
67 | landmark[index] = rv
68 | result.append((img_path, landmark, bbox))
69 | return result
70 |
71 |
72 | def getPatch(img, bbox, point, padding):
73 | """
74 |     Get a patch image around the given point in bbox with padding
75 | point: relative_point in [0, 1] in bbox
76 | """
77 | point_x = bbox.x + point[0] * bbox.w
78 | point_y = bbox.y + point[1] * bbox.h
79 | patch_left = point_x - bbox.w * padding
80 | patch_right = point_x + bbox.w * padding
81 | patch_top = point_y - bbox.h * padding
82 | patch_bottom = point_y + bbox.h * padding
83 | patch = img[patch_top: patch_bottom + 1, patch_left: patch_right + 1]
84 | patch_bbox = BBox([patch_left, patch_right, patch_top, patch_bottom])
85 | return patch, patch_bbox
86 |
87 |
88 | def processImage(imgs):
89 | """
90 | process images before feeding to CNNs
91 | imgs: N x 1 x W x H
92 | """
93 | imgs = imgs.astype(np.float32)
94 | for i, img in enumerate(imgs):
95 | m = img.mean()
96 | s = img.std()
97 | imgs[i] = (img - m) / s
98 | return imgs
99 |
100 |
101 | def dataArgument(data):
102 | """
103 | dataArguments
104 | data:
105 | imgs: N x 1 x W x H
106 | bbox: N x BBox
107 | landmarks: N x 10
108 | """
109 | pass
110 |
111 |
112 | class BBox(object):
113 | """
114 | Bounding Box of face
115 | """
116 |
117 | def __init__(self, bbox):
118 | self.left = bbox[0]
119 | self.right = bbox[1]
120 | self.top = bbox[2]
121 | self.bottom = bbox[3]
122 | self.x = bbox[0]
123 | self.y = bbox[1]
124 | self.w = bbox[2] - bbox[0]
125 | self.h = bbox[3] - bbox[1]
126 |
127 | def expand(self, scale=0.05):
128 | bbox = [self.left, self.right, self.top, self.bottom]
129 | bbox[0] -= int(self.w * scale)
130 | bbox[1] += int(self.w * scale)
131 | bbox[2] -= int(self.h * scale)
132 | bbox[3] += int(self.h * scale)
133 | return BBox(bbox)
134 |
135 | def project(self, point):
136 | x = (point[0] - self.x) / self.w
137 | y = (point[1] - self.y) / self.h
138 | return np.asarray([x, y])
139 |
140 | def reproject(self, point):
141 | x = self.x + self.w * point[0]
142 | y = self.y + self.h * point[1]
143 | return np.asarray([x, y])
144 |
145 | def reprojectLandmark(self, landmark):
146 | p = np.zeros((len(landmark), 2))
147 | for i in range(len(landmark)):
148 | p[i] = self.reproject(landmark[i])
149 | return p
150 |
151 | def projectLandmark(self, landmark):
152 | p = np.zeros((len(landmark), 2))
153 | for i in range(len(landmark)):
154 | p[i] = self.project(landmark[i])
155 | return p
156 |
157 |     # enlarge the box
158 | def subBBox(self, leftR, rightR, topR, bottomR):
159 | leftDelta = self.w * leftR
160 | rightDelta = self.w * rightR
161 | topDelta = self.h * topR
162 | bottomDelta = self.h * bottomR
163 | left = self.left + leftDelta
164 | right = self.left + rightDelta
165 | top = self.top + topDelta
166 | bottom = self.top + bottomDelta
167 | return BBox([left, right, top, bottom])
168 |
--------------------------------------------------------------------------------
/other/Hand_Detection/README.md:
--------------------------------------------------------------------------------
1 | ### [SSD-Hand-Detection](https://github.com/weiliu89/caffe/tree/ssd)
2 | #### Dataset
3 | * [egohands](http://vision.soic.indiana.edu/projects/egohands/)
4 | * [stanfordhands](http://www.robots.ox.ac.uk/~vgg/data/hands/)
5 |
6 | #### Preprocess
7 | * Keep only hands with `min(width, height) > threshold`: `threshold=40` for egohands, `threshold=20` for stanfordhands (see the sketch below).
8 | * The cleaned dataset can be downloaded from [onedrive]().
9 | * Run `create_txt.py` to generate `test.txt` and `trainval.txt`.
10 | * Finally, run `./create_data.sh` to generate the LMDB files in the `data/lmdb` folder.
11 |
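A rough illustration of the size filter described above (a sketch only; the `(xmin, ymin, xmax, ymax)` box format and the helper name are assumptions, not the repo's actual generate scripts):

```
def keep_hand(box, threshold):
    """Keep a hand box (xmin, ymin, xmax, ymax) only if its smaller side exceeds threshold."""
    xmin, ymin, xmax, ymax = box
    return min(xmax - xmin, ymax - ymin) > threshold

# threshold=40 for egohands, threshold=20 for stanfordhands (see above)
boxes = [(10, 20, 80, 90), (5, 5, 30, 25)]
kept = [b for b in boxes if keep_hand(b, threshold=40)]
```
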
12 | #### Train
13 | * A pretrained model is provided by the author, trained on [PASCAL VOC 2007, 2012](http://host.robots.ox.ac.uk/pascal/VOC/). [Download Link]().
14 |
15 | #### Demo
16 | 
--------------------------------------------------------------------------------
/other/Hand_Detection/data/create_annoset.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | import shutil
4 | import subprocess
5 | import sys
6 | # get caffe root directory
7 | caffe_root = '../caffe'
8 | sys.path.insert(0, os.path.join(caffe_root, 'python'))
9 | from caffe.proto import caffe_pb2
10 | from google.protobuf import text_format
11 |
12 | if __name__ == "__main__":
13 | parser = argparse.ArgumentParser(description="Create AnnotatedDatum database")
14 | parser.add_argument("root",
15 | help="The root directory which contains the images and annotations.")
16 | parser.add_argument("listfile",
17 | help="The file which contains image paths and annotation info.")
18 | parser.add_argument("outdir",
19 | help="The output directory which stores the database file.")
20 | parser.add_argument("exampledir",
21 | help="The directory to store the link of the database files.")
22 | parser.add_argument("--redo", default = False, action = "store_true",
23 | help="Recreate the database.")
24 | parser.add_argument("--anno-type", default = "classification",
25 | help="The type of annotation {classification, detection}.")
26 | parser.add_argument("--label-type", default = "xml",
27 | help="The type of label file format for detection {xml, json, txt}.")
28 | parser.add_argument("--backend", default = "lmdb",
29 | help="The backend {lmdb, leveldb} for storing the result")
30 | parser.add_argument("--check-size", default = False, action = "store_true",
31 | help="Check that all the datum have the same size.")
32 | parser.add_argument("--encode-type", default = "",
33 | help="What type should we encode the image as ('png','jpg',...).")
34 | parser.add_argument("--encoded", default = False, action = "store_true",
35 | help="The encoded image will be save in datum.")
36 | parser.add_argument("--gray", default = False, action = "store_true",
37 | help="Treat images as grayscale ones.")
38 | parser.add_argument("--label-map-file", default = "",
39 | help="A file with LabelMap protobuf message.")
40 | parser.add_argument("--min-dim", default = 0, type = int,
41 | help="Minimum dimension images are resized to.")
42 | parser.add_argument("--max-dim", default = 0, type = int,
43 | help="Maximum dimension images are resized to.")
44 | parser.add_argument("--resize-height", default = 0, type = int,
45 | help="Height images are resized to.")
46 | parser.add_argument("--resize-width", default = 0, type = int,
47 | help="Width images are resized to.")
48 | parser.add_argument("--shuffle", default = False, action = "store_true",
49 | help="Randomly shuffle the order of images and their labels.")
50 | parser.add_argument("--check-label", default = False, action = "store_true",
51 | help="Check that there is no duplicated name/label.")
52 |
53 | args = parser.parse_args()
54 | root_dir = args.root
55 | list_file = args.listfile
56 | out_dir = args.outdir
57 | example_dir = args.exampledir
58 |
59 | redo = args.redo
60 | anno_type = args.anno_type
61 | label_type = args.label_type
62 | backend = args.backend
63 | check_size = args.check_size
64 | encode_type = args.encode_type
65 | encoded = args.encoded
66 | gray = args.gray
67 | label_map_file = args.label_map_file
68 | min_dim = args.min_dim
69 | max_dim = args.max_dim
70 | resize_height = args.resize_height
71 | resize_width = args.resize_width
72 | shuffle = args.shuffle
73 | check_label = args.check_label
74 |
75 | # check if root directory exists
76 | if not os.path.exists(root_dir):
77 | print("root directory: {} does not exist".format(root_dir))
78 | sys.exit()
79 | # add "/" to root directory if needed
80 | if root_dir[-1] != "/":
81 | root_dir += "/"
82 | # check if list file exists
83 | if not os.path.exists(list_file):
84 | print("list file: {} does not exist".format(list_file))
85 | sys.exit()
86 | # check list file format is correct
87 | with open(list_file, "r") as lf:
88 | for line in lf.readlines():
89 | img_file, anno = line.strip("\n").split(" ")
90 | if not os.path.exists(root_dir + img_file):
91 | print("image file: {} does not exist".format(root_dir + img_file))
92 | if anno_type == "classification":
93 | if not anno.isdigit():
94 | print("annotation: {} is not an integer".format(anno))
95 | elif anno_type == "detection":
96 | if not os.path.exists(root_dir + anno):
97 | print("annofation file: {} does not exist".format(root_dir + anno))
98 | sys.exit()
99 | break
100 | # check if label map file exist
101 | if anno_type == "detection":
102 | if not os.path.exists(label_map_file):
103 | print("label map file: {} does not exist".format(label_map_file))
104 | sys.exit()
105 | label_map = caffe_pb2.LabelMap()
106 | lmf = open(label_map_file, "r")
107 | try:
108 | text_format.Merge(str(lmf.read()), label_map)
109 | except:
110 | print("Cannot parse label map file: {}".format(label_map_file))
111 | sys.exit()
112 | out_parent_dir = os.path.dirname(out_dir)
113 | if not os.path.exists(out_parent_dir):
114 | os.makedirs(out_parent_dir)
115 | if os.path.exists(out_dir) and not redo:
116 | print("{} already exists and I do not hear redo".format(out_dir))
117 | sys.exit()
118 | if os.path.exists(out_dir):
119 | shutil.rmtree(out_dir)
120 |
121 |
122 | if anno_type == "detection":
123 | cmd = "{}/build/tools/convert_annoset" \
124 | " --anno_type={}" \
125 | " --label_type={}" \
126 | " --label_map_file={}" \
127 | " --check_label={}" \
128 | " --min_dim={}" \
129 | " --max_dim={}" \
130 | " --resize_height={}" \
131 | " --resize_width={}" \
132 | " --backend={}" \
133 | " --shuffle={}" \
134 | " --check_size={}" \
135 | " --encode_type={}" \
136 | " --encoded={}" \
137 | " --gray={}" \
138 | " {} {} {}" \
139 | .format(caffe_root, anno_type, label_type, label_map_file, check_label,
140 | min_dim, max_dim, resize_height, resize_width, backend, shuffle,
141 | check_size, encode_type, encoded, gray, root_dir, list_file, out_dir)
142 | elif anno_type == "classification":
143 | cmd = "{}/build/tools/convert_annoset" \
144 | " --anno_type={}" \
145 | " --min_dim={}" \
146 | " --max_dim={}" \
147 | " --resize_height={}" \
148 | " --resize_width={}" \
149 | " --backend={}" \
150 | " --shuffle={}" \
151 | " --check_size={}" \
152 | " --encode_type={}" \
153 | " --encoded={}" \
154 | " --gray={}" \
155 | " {} {} {}" \
156 | .format(caffe_root, anno_type, min_dim, max_dim, resize_height,
157 | resize_width, backend, shuffle, check_size, encode_type, encoded,
158 | gray, root_dir, list_file, out_dir)
159 | print(cmd)
160 | process = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE)
161 | output = process.communicate()[0]
162 |
163 | if not os.path.exists(example_dir):
164 | os.makedirs(example_dir)
165 | # link_dir = os.path.join(example_dir, os.path.basename(out_dir))
166 | # if os.path.exists(link_dir):
167 | # os.unlink(link_dir)
168 | # os.symlink(out_dir, link_dir)
169 |
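170 | # Note (assumption, mirroring the command built above): caffe_root, defined earlier
171 | # in this script, must point at a Caffe build that provides build/tools/convert_annoset
172 | # (available in the SSD fork of Caffe). The tool writes the encoded images and their
173 | # annotations into out_dir as an LMDB or LevelDB, depending on --backend.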
--------------------------------------------------------------------------------
/other/Hand_Detection/data/create_data.sh:
--------------------------------------------------------------------------------
1 | redo=1
2 | data_root_dir="."
3 | mapfile="labelmap_voc.prototxt"
4 | anno_type="detection"
5 | db="lmdb"
6 | min_dim=0
7 | max_dim=0
8 | width=0
9 | height=0
10 |
11 | extra_cmd="--encode-type=jpg --encoded"
12 | if [ $redo -eq 1 ]
13 | then
14 | extra_cmd="$extra_cmd --redo"
15 | fi
16 | for subset in test trainval
17 | do
18 | python create_annoset.py --anno-type=$anno_type --label-map-file=$mapfile --min-dim=$min_dim --max-dim=$max_dim --resize-width=$width --resize-height=$height --check-label $extra_cmd $data_root_dir $subset.txt $data_root_dir/$db/$subset"_"$db '.'
19 | done
20 |
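21 | # The resulting databases are written to ./lmdb/test_lmdb and ./lmdb/trainval_lmdb;
22 | # model/generate_model.py reads the training set from ../data/lmdb/trainval_lmdb.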
--------------------------------------------------------------------------------
/other/Hand_Detection/data/create_txt.py:
--------------------------------------------------------------------------------
1 | import os
2 | import random
3 | data_sources = ['egohands', 'stanfordhands']
4 | root_dir = os.path.dirname(os.path.abspath(__file__))
5 | test_data = []
6 | train_data = []
7 |
8 | for data_source in data_sources:
9 | test_im_dir = os.path.join(root_dir, data_source, 'test', 'JPEGImages')
10 | train_im_dir = os.path.join(root_dir, data_source, 'trainval', 'JPEGImages')
11 | for im_file in os.listdir(test_im_dir):
12 | name = os.path.splitext(im_file)[0]  # splitext: rstrip('.jpg') strips characters, not the suffix
13 | xml_file_path = os.path.join(data_source, 'test', 'Annotations', name+'.xml')
14 | im_file_path = os.path.join(data_source, 'test', 'JPEGImages', name+'.jpg')
15 | test_data.append(" ".join([im_file_path, xml_file_path]))
16 | for im_file in os.listdir(train_im_dir):
17 | name = os.path.splitext(im_file)[0]
18 | xml_file_path = os.path.join(data_source, 'trainval', 'Annotations', name+'.xml')
19 | im_file_path = os.path.join(data_source, 'trainval', 'JPEGImages', name+'.jpg')
20 | train_data.append(" ".join([im_file_path, xml_file_path]))
21 |
22 |
23 |
24 | random.shuffle(test_data)
25 | random.shuffle(test_data)
26 | random.shuffle(train_data)
27 | random.shuffle(train_data)
28 |
29 | with open('test.txt', 'w') as f:
30 | f.write('\n'.join(test_data))
31 | with open('trainval.txt', 'w') as f:
32 | f.write('\n'.join(train_data))
33 |
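34 | # Each output line is "<relative image path> <relative xml path>", e.g.
35 | # "egohands/test/JPEGImages/CARDS_COURTYARD_B_T_0011.jpg egohands/test/Annotations/CARDS_COURTYARD_B_T_0011.xml"
36 | # (illustrative name); this space-separated pairing is what create_annoset.py splits and validates.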
--------------------------------------------------------------------------------
/other/Hand_Detection/data/egohands/_screenshot_17.04.2018.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/other/Hand_Detection/data/egohands/_screenshot_17.04.2018.png
--------------------------------------------------------------------------------
/other/Hand_Detection/data/egohands/generate_egohands.py:
--------------------------------------------------------------------------------
1 | import os
2 | import numpy as np
3 | from lxml.etree import Element, SubElement, tostring
4 | import random
5 | import cv2
6 | import shutil
7 | import tqdm
8 |
9 | data_root = '/Users/hzzone/Downloads/egohands_data/_LABELLED_SAMPLES'
10 | with open("egohands_data.txt") as f:
11 | data = f.readlines()
12 |
13 |
14 | random.shuffle(data)
15 | random.shuffle(data)
16 |
17 | test_data = random.sample(data, int(len(data)*0.2))
18 | train_data = list(set(data) - set(test_data))
19 |
20 | def trans(data, set_name):
21 |
22 | curr_dir = os.path.dirname(os.path.abspath(__file__))
23 | os.mkdir(os.path.join(curr_dir, set_name))
24 | Annotations_dir = os.path.join(curr_dir, set_name, 'Annotations')
25 | JPEGImages_dir = os.path.join(curr_dir, set_name, 'JPEGImages')
26 | os.mkdir(Annotations_dir)
27 | os.mkdir(JPEGImages_dir)
28 |
29 | for each_pic_data in tqdm.tqdm(data):
30 | # for each_pic_data in data:
31 | data_list = each_pic_data.strip().split()
32 | video_id = data_list[0]
33 | frame_num = str(data_list[1]).zfill(4)
34 | new_img_name = '{}_{}'.format(video_id, frame_num)
35 | frame_num = 'frame_{}.jpg'.format(frame_num)
36 |
37 |
38 | im_path = os.path.join(data_root, video_id, frame_num)
39 |
40 | boxes = np.reshape(np.array(map(int, data_list[2:])), (-1, 4))
41 | node_root = Element('annotation')
42 |
43 | node_folder = SubElement(node_root, 'folder')
44 | node_folder.text = 'egohands'
45 |
46 | node_filename = SubElement(node_root, 'filename')
47 | node_filename.text = new_img_name
48 | #
49 | node_size = SubElement(node_root, 'size')
50 | node_segmented = SubElement(node_root, 'segmented')
51 | node_segmented.text = '0'
52 | node_width = SubElement(node_size, 'width')
53 | im_height, im_width, channel = cv2.imread(im_path).shape
54 | node_width.text = str(im_width)
55 | #
56 | node_height = SubElement(node_size, 'height')
57 | node_height.text = str(im_height)
58 | #
59 | node_depth = SubElement(node_size, 'depth')
60 | node_depth.text = str(channel)
61 | #
62 | # im = cv2.imread(im_path)
63 | # for index in range(boxes.shape[0]):
64 | # minx, miny, w, h = boxes[index]
65 | # cv2.namedWindow("", 0)
66 | # cv2.resizeWindow('', 300, 300)
67 | # cv2.rectangle(im, (minx, miny), (minx+w-1, miny+h-1), (0, 255, 0), thickness=2)
68 | # print(w, h)
69 | # cv2.imshow('', im)
70 | # cv2.waitKey(0)
71 |
72 | effective_hands = 0
73 | for index in range(boxes.shape[0]):
74 | minx, miny, w, h = boxes[index]
75 | maxx = minx+w-1
76 | maxy = miny+h-1
77 | maxx = im_width if maxx > im_width else maxx
78 | maxy = im_height if maxy > im_height else maxy
79 | minx = 0 if minx < 0 else minx
80 | miny = 0 if miny < 0 else miny
81 | w = maxx-minx+1
82 | h = maxy-miny+1
83 | if min(w, h) < 40:
84 | continue
85 | if maxx <= minx or maxy <= miny:
86 | print(minx, miny)
87 |
88 | effective_hands = effective_hands + 1
89 | node_object = SubElement(node_root, 'object')
90 | node_name = SubElement(node_object, 'name')
91 | node_name.text = 'hand'
92 | node_difficult = SubElement(node_object, 'difficult')
93 | node_difficult.text = '0'
94 | node_bndbox = SubElement(node_object, 'bndbox')
95 | node_xmin = SubElement(node_bndbox, 'xmin')
96 | node_xmin.text = str(minx)
97 | node_ymin = SubElement(node_bndbox, 'ymin')
98 | node_ymin.text = str(miny)
99 | node_xmax = SubElement(node_bndbox, 'xmax')
100 | node_xmax.text = str(maxx)
101 | node_ymax = SubElement(node_bndbox, 'ymax')
102 | node_ymax.text = str(maxy)
103 |
104 | xml = tostring(node_root, pretty_print=True)
105 | # if effective_hands == 0:
106 | # print(im_path)
107 | if effective_hands != 0:
108 | # print(im_path)
109 | with open(Annotations_dir + "/" + new_img_name+'.xml', 'w') as f:
110 | f.write(xml)
111 | shutil.copy(im_path, JPEGImages_dir + '/' + new_img_name + '.jpg')
112 |
113 | trans(train_data, 'trainval')
114 | trans(test_data, 'test')
115 |
--------------------------------------------------------------------------------
/other/Hand_Detection/data/egohands/getInfo.m:
--------------------------------------------------------------------------------
1 | video = getMetaBy();
2 | fid = fopen('egohands_data.txt','w');
3 | for i=1:1:48
4 | video_id = video(i).video_id;
5 | for j=1:1:100
6 | fprintf(fid,'%s ', video_id);
7 | frame_num = video(i).labelled_frames(j).frame_num;
8 | fprintf(fid,'%s ', num2str(frame_num));
9 | boxes = getBoundingBoxes(video(i), j);
10 | for x=1:4
11 | if sum(boxes(x, :)) ~=0
12 | box = boxes(x, :);
13 | fprintf(fid,'%d %d %d %d ', box(1), box(2), box(3), box(4));
14 | end
15 | end
16 | fprintf(fid,'\n');
17 | end
18 | end
19 | fclose(fid);
20 |
21 |
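22 | % Each line of egohands_data.txt is "video_id frame_num x y w h [x y w h ...]",
23 | % one box per labelled hand in that frame; generate_egohands.py reshapes the numbers
24 | % after the first two fields into an N x 4 array of (minx, miny, w, h) boxes.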
--------------------------------------------------------------------------------
/other/Hand_Detection/data/gth/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/other/Hand_Detection/data/gth/.gitkeep
--------------------------------------------------------------------------------
/other/Hand_Detection/data/labelmap_voc.prototxt:
--------------------------------------------------------------------------------
1 | item {
2 | name: "none_of_the_above"
3 | label: 0
4 | display_name: "background"
5 | }
6 | item {
7 | name: "hand"
8 | label: 1
9 | display_name: "hand"
10 | }
--------------------------------------------------------------------------------
/other/Hand_Detection/data/stanfordhands/generate_stanfordhands.py:
--------------------------------------------------------------------------------
1 | import os.path as osp
2 | import scipy.io as sio
3 | import os
4 | import numpy as np
5 | import cv2
6 | from lxml.etree import Element, SubElement, tostring
7 | import shutil
8 |
9 | test_data = ['/Users/hzzone/Downloads/hand_dataset/test_dataset/test_data']
10 | trainval_data = ['/Users/hzzone/Downloads/hand_dataset/training_dataset/training_data', '/Users/hzzone/Downloads/hand_dataset/validation_dataset/validation_data']
11 | def trans(data_sources, set_name):
12 | curr_dir = os.path.dirname(os.path.abspath(__file__))
13 | os.mkdir(os.path.join(curr_dir, set_name))
14 | Annotations_dir = os.path.join(curr_dir, set_name, 'Annotations')
15 | JPEGImages_dir = os.path.join(curr_dir, set_name, 'JPEGImages')
16 | os.mkdir(Annotations_dir)
17 | os.mkdir(JPEGImages_dir)
18 | # cv2.namedWindow("", 0)
19 | # cv2.resizeWindow('', 300, 300)
20 | for each_source in data_sources:
21 | annotations_source = osp.join(each_source, 'annotations')
22 | img_source = osp.join(each_source, 'images')
23 | for mat_file in os.listdir(annotations_source):
24 | mat_file_path = osp.join(annotations_source, mat_file)
25 | # print(mat_file_path)
26 | img_file_path = osp.join(img_source, osp.splitext(mat_file)[0]) + '.jpg'  # splitext, not rstrip, to drop the .mat suffix safely
27 | img = cv2.imread(img_file_path)
28 | boxes_data = sio.loadmat(mat_file_path)["boxes"].flatten()
29 |
30 |
31 | node_root = Element('annotation')
32 |
33 | node_folder = SubElement(node_root, 'folder')
34 | node_folder.text = 'stanfordhands'
35 |
36 | node_filename = SubElement(node_root, 'filename')
37 | node_filename.text = osp.splitext(mat_file)[0] + '.jpg'
38 | #
39 | node_size = SubElement(node_root, 'size')
40 | node_segmented = SubElement(node_root, 'segmented')
41 | node_segmented.text = '0'
42 | node_width = SubElement(node_size, 'width')
43 | im_height, im_width, channel = img.shape
44 | node_width.text = str(im_width)
45 | #
46 | node_height = SubElement(node_size, 'height')
47 | node_height.text = str(im_height)
48 | #
49 | node_depth = SubElement(node_size, 'depth')
50 | node_depth.text = str(channel)
51 |
52 | effective_hands = 0
53 | for box in boxes_data:
54 | tmp = np.reshape(box[0, 0].tolist()[:4], (-1, 2))
55 | y1 = int(round(min(tmp[:, 0]), 0))
56 | y2 = int(round(max(tmp[:, 0]), 0))
57 | x1 = int(round(min(tmp[:, 1]), 0))
58 | x2 = int(round(max(tmp[:, 1]), 0))
59 | # cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), thickness=2)
60 | x2 = im_width if x2 > im_width else x2
61 | y2 = im_height if y2 > im_height else y2
62 | x1 = 0 if x1 < 0 else x1
63 | y1 = 0 if y1 < 0 else y1
64 |
65 | width = x2-x1+1
66 | height = y2-y1+1
67 |
68 | if(min(width, height)<20):
69 | continue
70 |
71 | # if x2>im_width or x1<0 or y2>im_height or y1<0:
72 | # print(x1, x2, y1, y2, width, height, im_height, im_width)
73 | # cv2.imshow("", img)
74 | # cv2.waitKey(0)
75 | if x2<=x1 or y2<=y1:
76 | print(x1, y1)
77 |
78 |
79 | effective_hands = effective_hands + 1
80 | node_object = SubElement(node_root, 'object')
81 | node_name = SubElement(node_object, 'name')
82 | node_name.text = 'hand'
83 | node_difficult = SubElement(node_object, 'difficult')
84 | node_difficult.text = '0'
85 | node_bndbox = SubElement(node_object, 'bndbox')
86 | node_xmin = SubElement(node_bndbox, 'xmin')
87 | node_xmin.text = str(x1)
88 | node_ymin = SubElement(node_bndbox, 'ymin')
89 | node_ymin.text = str(y1)
90 | node_xmax = SubElement(node_bndbox, 'xmax')
91 | node_xmax.text = str(x2)
92 | node_ymax = SubElement(node_bndbox, 'ymax')
93 | node_ymax.text = str(y2)
94 | xml = tostring(node_root, pretty_print=True)
95 | if effective_hands != 0:
96 | with open(Annotations_dir + "/" + osp.splitext(mat_file)[0] + '.xml', 'w') as f:
97 | f.write(xml)
98 | shutil.copy(img_file_path, JPEGImages_dir + '/' + osp.splitext(mat_file)[0] + '.jpg')
99 |
100 |
101 | trans(trainval_data, 'trainval')
102 | trans(test_data, 'test')
--------------------------------------------------------------------------------
/other/Hand_Detection/model/deploy.prototxt:
--------------------------------------------------------------------------------
1 | name: "VGG_HAND_SSD_300x300_deploy"
2 | input: "data"
3 | input_shape {
4 | dim: 1
5 | dim: 3
6 | dim: 300
7 | dim: 300
8 | }
9 | layer {
10 | name: "conv1_1"
11 | type: "Convolution"
12 | bottom: "data"
13 | top: "conv1_1"
14 | param {
15 | lr_mult: 1.0
16 | decay_mult: 1.0
17 | }
18 | param {
19 | lr_mult: 2.0
20 | decay_mult: 0.0
21 | }
22 | convolution_param {
23 | num_output: 64
24 | pad: 1
25 | kernel_size: 3
26 | weight_filler {
27 | type: "xavier"
28 | }
29 | bias_filler {
30 | type: "constant"
31 | value: 0.0
32 | }
33 | }
34 | }
35 | layer {
36 | name: "relu1_1"
37 | type: "ReLU"
38 | bottom: "conv1_1"
39 | top: "conv1_1"
40 | }
41 | layer {
42 | name: "conv1_2"
43 | type: "Convolution"
44 | bottom: "conv1_1"
45 | top: "conv1_2"
46 | param {
47 | lr_mult: 1.0
48 | decay_mult: 1.0
49 | }
50 | param {
51 | lr_mult: 2.0
52 | decay_mult: 0.0
53 | }
54 | convolution_param {
55 | num_output: 64
56 | pad: 1
57 | kernel_size: 3
58 | weight_filler {
59 | type: "xavier"
60 | }
61 | bias_filler {
62 | type: "constant"
63 | value: 0.0
64 | }
65 | }
66 | }
67 | layer {
68 | name: "relu1_2"
69 | type: "ReLU"
70 | bottom: "conv1_2"
71 | top: "conv1_2"
72 | }
73 | layer {
74 | name: "pool1"
75 | type: "Pooling"
76 | bottom: "conv1_2"
77 | top: "pool1"
78 | pooling_param {
79 | pool: MAX
80 | kernel_size: 2
81 | stride: 2
82 | }
83 | }
84 | layer {
85 | name: "conv2_1"
86 | type: "Convolution"
87 | bottom: "pool1"
88 | top: "conv2_1"
89 | param {
90 | lr_mult: 1.0
91 | decay_mult: 1.0
92 | }
93 | param {
94 | lr_mult: 2.0
95 | decay_mult: 0.0
96 | }
97 | convolution_param {
98 | num_output: 128
99 | pad: 1
100 | kernel_size: 3
101 | weight_filler {
102 | type: "xavier"
103 | }
104 | bias_filler {
105 | type: "constant"
106 | value: 0.0
107 | }
108 | }
109 | }
110 | layer {
111 | name: "relu2_1"
112 | type: "ReLU"
113 | bottom: "conv2_1"
114 | top: "conv2_1"
115 | }
116 | layer {
117 | name: "conv2_2"
118 | type: "Convolution"
119 | bottom: "conv2_1"
120 | top: "conv2_2"
121 | param {
122 | lr_mult: 1.0
123 | decay_mult: 1.0
124 | }
125 | param {
126 | lr_mult: 2.0
127 | decay_mult: 0.0
128 | }
129 | convolution_param {
130 | num_output: 128
131 | pad: 1
132 | kernel_size: 3
133 | weight_filler {
134 | type: "xavier"
135 | }
136 | bias_filler {
137 | type: "constant"
138 | value: 0.0
139 | }
140 | }
141 | }
142 | layer {
143 | name: "relu2_2"
144 | type: "ReLU"
145 | bottom: "conv2_2"
146 | top: "conv2_2"
147 | }
148 | layer {
149 | name: "pool2"
150 | type: "Pooling"
151 | bottom: "conv2_2"
152 | top: "pool2"
153 | pooling_param {
154 | pool: MAX
155 | kernel_size: 2
156 | stride: 2
157 | }
158 | }
159 | layer {
160 | name: "conv3_1"
161 | type: "Convolution"
162 | bottom: "pool2"
163 | top: "conv3_1"
164 | param {
165 | lr_mult: 1.0
166 | decay_mult: 1.0
167 | }
168 | param {
169 | lr_mult: 2.0
170 | decay_mult: 0.0
171 | }
172 | convolution_param {
173 | num_output: 256
174 | pad: 1
175 | kernel_size: 3
176 | weight_filler {
177 | type: "xavier"
178 | }
179 | bias_filler {
180 | type: "constant"
181 | value: 0.0
182 | }
183 | }
184 | }
185 | layer {
186 | name: "relu3_1"
187 | type: "ReLU"
188 | bottom: "conv3_1"
189 | top: "conv3_1"
190 | }
191 | layer {
192 | name: "conv3_2"
193 | type: "Convolution"
194 | bottom: "conv3_1"
195 | top: "conv3_2"
196 | param {
197 | lr_mult: 1.0
198 | decay_mult: 1.0
199 | }
200 | param {
201 | lr_mult: 2.0
202 | decay_mult: 0.0
203 | }
204 | convolution_param {
205 | num_output: 256
206 | pad: 1
207 | kernel_size: 3
208 | weight_filler {
209 | type: "xavier"
210 | }
211 | bias_filler {
212 | type: "constant"
213 | value: 0.0
214 | }
215 | }
216 | }
217 | layer {
218 | name: "relu3_2"
219 | type: "ReLU"
220 | bottom: "conv3_2"
221 | top: "conv3_2"
222 | }
223 | layer {
224 | name: "conv3_3"
225 | type: "Convolution"
226 | bottom: "conv3_2"
227 | top: "conv3_3"
228 | param {
229 | lr_mult: 1.0
230 | decay_mult: 1.0
231 | }
232 | param {
233 | lr_mult: 2.0
234 | decay_mult: 0.0
235 | }
236 | convolution_param {
237 | num_output: 256
238 | pad: 1
239 | kernel_size: 3
240 | weight_filler {
241 | type: "xavier"
242 | }
243 | bias_filler {
244 | type: "constant"
245 | value: 0.0
246 | }
247 | }
248 | }
249 | layer {
250 | name: "relu3_3"
251 | type: "ReLU"
252 | bottom: "conv3_3"
253 | top: "conv3_3"
254 | }
255 | layer {
256 | name: "pool3"
257 | type: "Pooling"
258 | bottom: "conv3_3"
259 | top: "pool3"
260 | pooling_param {
261 | pool: MAX
262 | kernel_size: 2
263 | stride: 2
264 | }
265 | }
266 | layer {
267 | name: "conv4_1"
268 | type: "Convolution"
269 | bottom: "pool3"
270 | top: "conv4_1"
271 | param {
272 | lr_mult: 1.0
273 | decay_mult: 1.0
274 | }
275 | param {
276 | lr_mult: 2.0
277 | decay_mult: 0.0
278 | }
279 | convolution_param {
280 | num_output: 512
281 | pad: 1
282 | kernel_size: 3
283 | weight_filler {
284 | type: "xavier"
285 | }
286 | bias_filler {
287 | type: "constant"
288 | value: 0.0
289 | }
290 | }
291 | }
292 | layer {
293 | name: "relu4_1"
294 | type: "ReLU"
295 | bottom: "conv4_1"
296 | top: "conv4_1"
297 | }
298 | layer {
299 | name: "conv4_2"
300 | type: "Convolution"
301 | bottom: "conv4_1"
302 | top: "conv4_2"
303 | param {
304 | lr_mult: 1.0
305 | decay_mult: 1.0
306 | }
307 | param {
308 | lr_mult: 2.0
309 | decay_mult: 0.0
310 | }
311 | convolution_param {
312 | num_output: 512
313 | pad: 1
314 | kernel_size: 3
315 | weight_filler {
316 | type: "xavier"
317 | }
318 | bias_filler {
319 | type: "constant"
320 | value: 0.0
321 | }
322 | }
323 | }
324 | layer {
325 | name: "relu4_2"
326 | type: "ReLU"
327 | bottom: "conv4_2"
328 | top: "conv4_2"
329 | }
330 | layer {
331 | name: "conv4_3"
332 | type: "Convolution"
333 | bottom: "conv4_2"
334 | top: "conv4_3"
335 | param {
336 | lr_mult: 1.0
337 | decay_mult: 1.0
338 | }
339 | param {
340 | lr_mult: 2.0
341 | decay_mult: 0.0
342 | }
343 | convolution_param {
344 | num_output: 512
345 | pad: 1
346 | kernel_size: 3
347 | weight_filler {
348 | type: "xavier"
349 | }
350 | bias_filler {
351 | type: "constant"
352 | value: 0.0
353 | }
354 | }
355 | }
356 | layer {
357 | name: "relu4_3"
358 | type: "ReLU"
359 | bottom: "conv4_3"
360 | top: "conv4_3"
361 | }
362 | layer {
363 | name: "pool4"
364 | type: "Pooling"
365 | bottom: "conv4_3"
366 | top: "pool4"
367 | pooling_param {
368 | pool: MAX
369 | kernel_size: 2
370 | stride: 2
371 | }
372 | }
373 | layer {
374 | name: "conv5_1"
375 | type: "Convolution"
376 | bottom: "pool4"
377 | top: "conv5_1"
378 | param {
379 | lr_mult: 1.0
380 | decay_mult: 1.0
381 | }
382 | param {
383 | lr_mult: 2.0
384 | decay_mult: 0.0
385 | }
386 | convolution_param {
387 | num_output: 512
388 | pad: 1
389 | kernel_size: 3
390 | weight_filler {
391 | type: "xavier"
392 | }
393 | bias_filler {
394 | type: "constant"
395 | value: 0.0
396 | }
397 | dilation: 1
398 | }
399 | }
400 | layer {
401 | name: "relu5_1"
402 | type: "ReLU"
403 | bottom: "conv5_1"
404 | top: "conv5_1"
405 | }
406 | layer {
407 | name: "conv5_2"
408 | type: "Convolution"
409 | bottom: "conv5_1"
410 | top: "conv5_2"
411 | param {
412 | lr_mult: 1.0
413 | decay_mult: 1.0
414 | }
415 | param {
416 | lr_mult: 2.0
417 | decay_mult: 0.0
418 | }
419 | convolution_param {
420 | num_output: 512
421 | pad: 1
422 | kernel_size: 3
423 | weight_filler {
424 | type: "xavier"
425 | }
426 | bias_filler {
427 | type: "constant"
428 | value: 0.0
429 | }
430 | dilation: 1
431 | }
432 | }
433 | layer {
434 | name: "relu5_2"
435 | type: "ReLU"
436 | bottom: "conv5_2"
437 | top: "conv5_2"
438 | }
439 | layer {
440 | name: "conv5_3"
441 | type: "Convolution"
442 | bottom: "conv5_2"
443 | top: "conv5_3"
444 | param {
445 | lr_mult: 1.0
446 | decay_mult: 1.0
447 | }
448 | param {
449 | lr_mult: 2.0
450 | decay_mult: 0.0
451 | }
452 | convolution_param {
453 | num_output: 512
454 | pad: 1
455 | kernel_size: 3
456 | weight_filler {
457 | type: "xavier"
458 | }
459 | bias_filler {
460 | type: "constant"
461 | value: 0.0
462 | }
463 | dilation: 1
464 | }
465 | }
466 | layer {
467 | name: "relu5_3"
468 | type: "ReLU"
469 | bottom: "conv5_3"
470 | top: "conv5_3"
471 | }
472 | layer {
473 | name: "pool5"
474 | type: "Pooling"
475 | bottom: "conv5_3"
476 | top: "pool5"
477 | pooling_param {
478 | pool: MAX
479 | kernel_size: 3
480 | stride: 1
481 | pad: 1
482 | }
483 | }
484 | layer {
485 | name: "fc6"
486 | type: "Convolution"
487 | bottom: "pool5"
488 | top: "fc6"
489 | param {
490 | lr_mult: 1.0
491 | decay_mult: 1.0
492 | }
493 | param {
494 | lr_mult: 2.0
495 | decay_mult: 0.0
496 | }
497 | convolution_param {
498 | num_output: 1024
499 | pad: 6
500 | kernel_size: 3
501 | weight_filler {
502 | type: "xavier"
503 | }
504 | bias_filler {
505 | type: "constant"
506 | value: 0.0
507 | }
508 | dilation: 6
509 | }
510 | }
511 | layer {
512 | name: "relu6"
513 | type: "ReLU"
514 | bottom: "fc6"
515 | top: "fc6"
516 | }
517 | layer {
518 | name: "fc7"
519 | type: "Convolution"
520 | bottom: "fc6"
521 | top: "fc7"
522 | param {
523 | lr_mult: 1.0
524 | decay_mult: 1.0
525 | }
526 | param {
527 | lr_mult: 2.0
528 | decay_mult: 0.0
529 | }
530 | convolution_param {
531 | num_output: 1024
532 | kernel_size: 1
533 | weight_filler {
534 | type: "xavier"
535 | }
536 | bias_filler {
537 | type: "constant"
538 | value: 0.0
539 | }
540 | }
541 | }
542 | layer {
543 | name: "relu7"
544 | type: "ReLU"
545 | bottom: "fc7"
546 | top: "fc7"
547 | }
548 | layer {
549 | name: "conv6_1"
550 | type: "Convolution"
551 | bottom: "fc7"
552 | top: "conv6_1"
553 | param {
554 | lr_mult: 1.0
555 | decay_mult: 1.0
556 | }
557 | param {
558 | lr_mult: 2.0
559 | decay_mult: 0.0
560 | }
561 | convolution_param {
562 | num_output: 256
563 | pad: 0
564 | kernel_size: 1
565 | stride: 1
566 | weight_filler {
567 | type: "xavier"
568 | }
569 | bias_filler {
570 | type: "constant"
571 | value: 0.0
572 | }
573 | }
574 | }
575 | layer {
576 | name: "conv6_1_relu"
577 | type: "ReLU"
578 | bottom: "conv6_1"
579 | top: "conv6_1"
580 | }
581 | layer {
582 | name: "conv6_2"
583 | type: "Convolution"
584 | bottom: "conv6_1"
585 | top: "conv6_2"
586 | param {
587 | lr_mult: 1.0
588 | decay_mult: 1.0
589 | }
590 | param {
591 | lr_mult: 2.0
592 | decay_mult: 0.0
593 | }
594 | convolution_param {
595 | num_output: 512
596 | pad: 1
597 | kernel_size: 3
598 | stride: 2
599 | weight_filler {
600 | type: "xavier"
601 | }
602 | bias_filler {
603 | type: "constant"
604 | value: 0.0
605 | }
606 | }
607 | }
608 | layer {
609 | name: "conv6_2_relu"
610 | type: "ReLU"
611 | bottom: "conv6_2"
612 | top: "conv6_2"
613 | }
614 | layer {
615 | name: "conv7_1"
616 | type: "Convolution"
617 | bottom: "conv6_2"
618 | top: "conv7_1"
619 | param {
620 | lr_mult: 1.0
621 | decay_mult: 1.0
622 | }
623 | param {
624 | lr_mult: 2.0
625 | decay_mult: 0.0
626 | }
627 | convolution_param {
628 | num_output: 128
629 | pad: 0
630 | kernel_size: 1
631 | stride: 1
632 | weight_filler {
633 | type: "xavier"
634 | }
635 | bias_filler {
636 | type: "constant"
637 | value: 0.0
638 | }
639 | }
640 | }
641 | layer {
642 | name: "conv7_1_relu"
643 | type: "ReLU"
644 | bottom: "conv7_1"
645 | top: "conv7_1"
646 | }
647 | layer {
648 | name: "conv7_2"
649 | type: "Convolution"
650 | bottom: "conv7_1"
651 | top: "conv7_2"
652 | param {
653 | lr_mult: 1.0
654 | decay_mult: 1.0
655 | }
656 | param {
657 | lr_mult: 2.0
658 | decay_mult: 0.0
659 | }
660 | convolution_param {
661 | num_output: 256
662 | pad: 1
663 | kernel_size: 3
664 | stride: 2
665 | weight_filler {
666 | type: "xavier"
667 | }
668 | bias_filler {
669 | type: "constant"
670 | value: 0.0
671 | }
672 | }
673 | }
674 | layer {
675 | name: "conv7_2_relu"
676 | type: "ReLU"
677 | bottom: "conv7_2"
678 | top: "conv7_2"
679 | }
680 | layer {
681 | name: "conv8_1"
682 | type: "Convolution"
683 | bottom: "conv7_2"
684 | top: "conv8_1"
685 | param {
686 | lr_mult: 1.0
687 | decay_mult: 1.0
688 | }
689 | param {
690 | lr_mult: 2.0
691 | decay_mult: 0.0
692 | }
693 | convolution_param {
694 | num_output: 128
695 | pad: 0
696 | kernel_size: 1
697 | stride: 1
698 | weight_filler {
699 | type: "xavier"
700 | }
701 | bias_filler {
702 | type: "constant"
703 | value: 0.0
704 | }
705 | }
706 | }
707 | layer {
708 | name: "conv8_1_relu"
709 | type: "ReLU"
710 | bottom: "conv8_1"
711 | top: "conv8_1"
712 | }
713 | layer {
714 | name: "conv8_2"
715 | type: "Convolution"
716 | bottom: "conv8_1"
717 | top: "conv8_2"
718 | param {
719 | lr_mult: 1.0
720 | decay_mult: 1.0
721 | }
722 | param {
723 | lr_mult: 2.0
724 | decay_mult: 0.0
725 | }
726 | convolution_param {
727 | num_output: 256
728 | pad: 0
729 | kernel_size: 3
730 | stride: 1
731 | weight_filler {
732 | type: "xavier"
733 | }
734 | bias_filler {
735 | type: "constant"
736 | value: 0.0
737 | }
738 | }
739 | }
740 | layer {
741 | name: "conv8_2_relu"
742 | type: "ReLU"
743 | bottom: "conv8_2"
744 | top: "conv8_2"
745 | }
746 | layer {
747 | name: "conv9_1"
748 | type: "Convolution"
749 | bottom: "conv8_2"
750 | top: "conv9_1"
751 | param {
752 | lr_mult: 1.0
753 | decay_mult: 1.0
754 | }
755 | param {
756 | lr_mult: 2.0
757 | decay_mult: 0.0
758 | }
759 | convolution_param {
760 | num_output: 128
761 | pad: 0
762 | kernel_size: 1
763 | stride: 1
764 | weight_filler {
765 | type: "xavier"
766 | }
767 | bias_filler {
768 | type: "constant"
769 | value: 0.0
770 | }
771 | }
772 | }
773 | layer {
774 | name: "conv9_1_relu"
775 | type: "ReLU"
776 | bottom: "conv9_1"
777 | top: "conv9_1"
778 | }
779 | layer {
780 | name: "conv9_2"
781 | type: "Convolution"
782 | bottom: "conv9_1"
783 | top: "conv9_2"
784 | param {
785 | lr_mult: 1.0
786 | decay_mult: 1.0
787 | }
788 | param {
789 | lr_mult: 2.0
790 | decay_mult: 0.0
791 | }
792 | convolution_param {
793 | num_output: 256
794 | pad: 0
795 | kernel_size: 3
796 | stride: 1
797 | weight_filler {
798 | type: "xavier"
799 | }
800 | bias_filler {
801 | type: "constant"
802 | value: 0.0
803 | }
804 | }
805 | }
806 | layer {
807 | name: "conv9_2_relu"
808 | type: "ReLU"
809 | bottom: "conv9_2"
810 | top: "conv9_2"
811 | }
812 | layer {
813 | name: "conv4_3_norm"
814 | type: "Normalize"
815 | bottom: "conv4_3"
816 | top: "conv4_3_norm"
817 | norm_param {
818 | across_spatial: false
819 | scale_filler {
820 | type: "constant"
821 | value: 20.0
822 | }
823 | channel_shared: false
824 | }
825 | }
826 | layer {
827 | name: "conv4_3_norm_mbox_loc"
828 | type: "Convolution"
829 | bottom: "conv4_3_norm"
830 | top: "conv4_3_norm_mbox_loc"
831 | param {
832 | lr_mult: 1.0
833 | decay_mult: 1.0
834 | }
835 | param {
836 | lr_mult: 2.0
837 | decay_mult: 0.0
838 | }
839 | convolution_param {
840 | num_output: 16
841 | pad: 1
842 | kernel_size: 3
843 | stride: 1
844 | weight_filler {
845 | type: "xavier"
846 | }
847 | bias_filler {
848 | type: "constant"
849 | value: 0.0
850 | }
851 | }
852 | }
853 | layer {
854 | name: "conv4_3_norm_mbox_loc_perm"
855 | type: "Permute"
856 | bottom: "conv4_3_norm_mbox_loc"
857 | top: "conv4_3_norm_mbox_loc_perm"
858 | permute_param {
859 | order: 0
860 | order: 2
861 | order: 3
862 | order: 1
863 | }
864 | }
865 | layer {
866 | name: "conv4_3_norm_mbox_loc_flat"
867 | type: "Flatten"
868 | bottom: "conv4_3_norm_mbox_loc_perm"
869 | top: "conv4_3_norm_mbox_loc_flat"
870 | flatten_param {
871 | axis: 1
872 | }
873 | }
874 | layer {
875 | name: "conv4_3_norm_mbox_conf_hand_detection"
876 | type: "Convolution"
877 | bottom: "conv4_3_norm"
878 | top: "conv4_3_norm_mbox_conf_hand_detection"
879 | param {
880 | lr_mult: 1.0
881 | decay_mult: 1.0
882 | }
883 | param {
884 | lr_mult: 2.0
885 | decay_mult: 0.0
886 | }
887 | convolution_param {
888 | num_output: 8
889 | pad: 1
890 | kernel_size: 3
891 | stride: 1
892 | weight_filler {
893 | type: "xavier"
894 | }
895 | bias_filler {
896 | type: "constant"
897 | value: 0.0
898 | }
899 | }
900 | }
901 | layer {
902 | name: "conv4_3_norm_mbox_conf_hand_detection_perm"
903 | type: "Permute"
904 | bottom: "conv4_3_norm_mbox_conf_hand_detection"
905 | top: "conv4_3_norm_mbox_conf_hand_detection_perm"
906 | permute_param {
907 | order: 0
908 | order: 2
909 | order: 3
910 | order: 1
911 | }
912 | }
913 | layer {
914 | name: "conv4_3_norm_mbox_conf_hand_detection_flat"
915 | type: "Flatten"
916 | bottom: "conv4_3_norm_mbox_conf_hand_detection_perm"
917 | top: "conv4_3_norm_mbox_conf_hand_detection_flat"
918 | flatten_param {
919 | axis: 1
920 | }
921 | }
922 | layer {
923 | name: "conv4_3_norm_mbox_priorbox"
924 | type: "PriorBox"
925 | bottom: "conv4_3_norm"
926 | bottom: "data"
927 | top: "conv4_3_norm_mbox_priorbox"
928 | prior_box_param {
929 | min_size: 30.0
930 | max_size: 60.0
931 | aspect_ratio: 2.0
932 | flip: true
933 | clip: false
934 | variance: 0.10000000149
935 | variance: 0.10000000149
936 | variance: 0.20000000298
937 | variance: 0.20000000298
938 | step: 8.0
939 | offset: 0.5
940 | }
941 | }
942 | layer {
943 | name: "fc7_mbox_loc"
944 | type: "Convolution"
945 | bottom: "fc7"
946 | top: "fc7_mbox_loc"
947 | param {
948 | lr_mult: 1.0
949 | decay_mult: 1.0
950 | }
951 | param {
952 | lr_mult: 2.0
953 | decay_mult: 0.0
954 | }
955 | convolution_param {
956 | num_output: 24
957 | pad: 1
958 | kernel_size: 3
959 | stride: 1
960 | weight_filler {
961 | type: "xavier"
962 | }
963 | bias_filler {
964 | type: "constant"
965 | value: 0.0
966 | }
967 | }
968 | }
969 | layer {
970 | name: "fc7_mbox_loc_perm"
971 | type: "Permute"
972 | bottom: "fc7_mbox_loc"
973 | top: "fc7_mbox_loc_perm"
974 | permute_param {
975 | order: 0
976 | order: 2
977 | order: 3
978 | order: 1
979 | }
980 | }
981 | layer {
982 | name: "fc7_mbox_loc_flat"
983 | type: "Flatten"
984 | bottom: "fc7_mbox_loc_perm"
985 | top: "fc7_mbox_loc_flat"
986 | flatten_param {
987 | axis: 1
988 | }
989 | }
990 | layer {
991 | name: "fc7_mbox_conf_hand_detection"
992 | type: "Convolution"
993 | bottom: "fc7"
994 | top: "fc7_mbox_conf_hand_detection"
995 | param {
996 | lr_mult: 1.0
997 | decay_mult: 1.0
998 | }
999 | param {
1000 | lr_mult: 2.0
1001 | decay_mult: 0.0
1002 | }
1003 | convolution_param {
1004 | num_output: 12
1005 | pad: 1
1006 | kernel_size: 3
1007 | stride: 1
1008 | weight_filler {
1009 | type: "xavier"
1010 | }
1011 | bias_filler {
1012 | type: "constant"
1013 | value: 0.0
1014 | }
1015 | }
1016 | }
1017 | layer {
1018 | name: "fc7_mbox_conf_hand_detection_perm"
1019 | type: "Permute"
1020 | bottom: "fc7_mbox_conf_hand_detection"
1021 | top: "fc7_mbox_conf_hand_detection_perm"
1022 | permute_param {
1023 | order: 0
1024 | order: 2
1025 | order: 3
1026 | order: 1
1027 | }
1028 | }
1029 | layer {
1030 | name: "fc7_mbox_conf_hand_detection_flat"
1031 | type: "Flatten"
1032 | bottom: "fc7_mbox_conf_hand_detection_perm"
1033 | top: "fc7_mbox_conf_hand_detection_flat"
1034 | flatten_param {
1035 | axis: 1
1036 | }
1037 | }
1038 | layer {
1039 | name: "fc7_mbox_priorbox"
1040 | type: "PriorBox"
1041 | bottom: "fc7"
1042 | bottom: "data"
1043 | top: "fc7_mbox_priorbox"
1044 | prior_box_param {
1045 | min_size: 60.0
1046 | max_size: 111.0
1047 | aspect_ratio: 2.0
1048 | aspect_ratio: 3.0
1049 | flip: true
1050 | clip: false
1051 | variance: 0.10000000149
1052 | variance: 0.10000000149
1053 | variance: 0.20000000298
1054 | variance: 0.20000000298
1055 | step: 16.0
1056 | offset: 0.5
1057 | }
1058 | }
1059 | layer {
1060 | name: "conv6_2_mbox_loc"
1061 | type: "Convolution"
1062 | bottom: "conv6_2"
1063 | top: "conv6_2_mbox_loc"
1064 | param {
1065 | lr_mult: 1.0
1066 | decay_mult: 1.0
1067 | }
1068 | param {
1069 | lr_mult: 2.0
1070 | decay_mult: 0.0
1071 | }
1072 | convolution_param {
1073 | num_output: 24
1074 | pad: 1
1075 | kernel_size: 3
1076 | stride: 1
1077 | weight_filler {
1078 | type: "xavier"
1079 | }
1080 | bias_filler {
1081 | type: "constant"
1082 | value: 0.0
1083 | }
1084 | }
1085 | }
1086 | layer {
1087 | name: "conv6_2_mbox_loc_perm"
1088 | type: "Permute"
1089 | bottom: "conv6_2_mbox_loc"
1090 | top: "conv6_2_mbox_loc_perm"
1091 | permute_param {
1092 | order: 0
1093 | order: 2
1094 | order: 3
1095 | order: 1
1096 | }
1097 | }
1098 | layer {
1099 | name: "conv6_2_mbox_loc_flat"
1100 | type: "Flatten"
1101 | bottom: "conv6_2_mbox_loc_perm"
1102 | top: "conv6_2_mbox_loc_flat"
1103 | flatten_param {
1104 | axis: 1
1105 | }
1106 | }
1107 | layer {
1108 | name: "conv6_2_mbox_conf_hand_detection"
1109 | type: "Convolution"
1110 | bottom: "conv6_2"
1111 | top: "conv6_2_mbox_conf_hand_detection"
1112 | param {
1113 | lr_mult: 1.0
1114 | decay_mult: 1.0
1115 | }
1116 | param {
1117 | lr_mult: 2.0
1118 | decay_mult: 0.0
1119 | }
1120 | convolution_param {
1121 | num_output: 12
1122 | pad: 1
1123 | kernel_size: 3
1124 | stride: 1
1125 | weight_filler {
1126 | type: "xavier"
1127 | }
1128 | bias_filler {
1129 | type: "constant"
1130 | value: 0.0
1131 | }
1132 | }
1133 | }
1134 | layer {
1135 | name: "conv6_2_mbox_conf_hand_detection_perm"
1136 | type: "Permute"
1137 | bottom: "conv6_2_mbox_conf_hand_detection"
1138 | top: "conv6_2_mbox_conf_hand_detection_perm"
1139 | permute_param {
1140 | order: 0
1141 | order: 2
1142 | order: 3
1143 | order: 1
1144 | }
1145 | }
1146 | layer {
1147 | name: "conv6_2_mbox_conf_hand_detection_flat"
1148 | type: "Flatten"
1149 | bottom: "conv6_2_mbox_conf_hand_detection_perm"
1150 | top: "conv6_2_mbox_conf_hand_detection_flat"
1151 | flatten_param {
1152 | axis: 1
1153 | }
1154 | }
1155 | layer {
1156 | name: "conv6_2_mbox_priorbox"
1157 | type: "PriorBox"
1158 | bottom: "conv6_2"
1159 | bottom: "data"
1160 | top: "conv6_2_mbox_priorbox"
1161 | prior_box_param {
1162 | min_size: 111.0
1163 | max_size: 162.0
1164 | aspect_ratio: 2.0
1165 | aspect_ratio: 3.0
1166 | flip: true
1167 | clip: false
1168 | variance: 0.10000000149
1169 | variance: 0.10000000149
1170 | variance: 0.20000000298
1171 | variance: 0.20000000298
1172 | step: 32.0
1173 | offset: 0.5
1174 | }
1175 | }
1176 | layer {
1177 | name: "conv7_2_mbox_loc"
1178 | type: "Convolution"
1179 | bottom: "conv7_2"
1180 | top: "conv7_2_mbox_loc"
1181 | param {
1182 | lr_mult: 1.0
1183 | decay_mult: 1.0
1184 | }
1185 | param {
1186 | lr_mult: 2.0
1187 | decay_mult: 0.0
1188 | }
1189 | convolution_param {
1190 | num_output: 24
1191 | pad: 1
1192 | kernel_size: 3
1193 | stride: 1
1194 | weight_filler {
1195 | type: "xavier"
1196 | }
1197 | bias_filler {
1198 | type: "constant"
1199 | value: 0.0
1200 | }
1201 | }
1202 | }
1203 | layer {
1204 | name: "conv7_2_mbox_loc_perm"
1205 | type: "Permute"
1206 | bottom: "conv7_2_mbox_loc"
1207 | top: "conv7_2_mbox_loc_perm"
1208 | permute_param {
1209 | order: 0
1210 | order: 2
1211 | order: 3
1212 | order: 1
1213 | }
1214 | }
1215 | layer {
1216 | name: "conv7_2_mbox_loc_flat"
1217 | type: "Flatten"
1218 | bottom: "conv7_2_mbox_loc_perm"
1219 | top: "conv7_2_mbox_loc_flat"
1220 | flatten_param {
1221 | axis: 1
1222 | }
1223 | }
1224 | layer {
1225 | name: "conv7_2_mbox_conf_hand_detection"
1226 | type: "Convolution"
1227 | bottom: "conv7_2"
1228 | top: "conv7_2_mbox_conf_hand_detection"
1229 | param {
1230 | lr_mult: 1.0
1231 | decay_mult: 1.0
1232 | }
1233 | param {
1234 | lr_mult: 2.0
1235 | decay_mult: 0.0
1236 | }
1237 | convolution_param {
1238 | num_output: 12
1239 | pad: 1
1240 | kernel_size: 3
1241 | stride: 1
1242 | weight_filler {
1243 | type: "xavier"
1244 | }
1245 | bias_filler {
1246 | type: "constant"
1247 | value: 0.0
1248 | }
1249 | }
1250 | }
1251 | layer {
1252 | name: "conv7_2_mbox_conf_hand_detection_perm"
1253 | type: "Permute"
1254 | bottom: "conv7_2_mbox_conf_hand_detection"
1255 | top: "conv7_2_mbox_conf_hand_detection_perm"
1256 | permute_param {
1257 | order: 0
1258 | order: 2
1259 | order: 3
1260 | order: 1
1261 | }
1262 | }
1263 | layer {
1264 | name: "conv7_2_mbox_conf_hand_detection_flat"
1265 | type: "Flatten"
1266 | bottom: "conv7_2_mbox_conf_hand_detection_perm"
1267 | top: "conv7_2_mbox_conf_hand_detection_flat"
1268 | flatten_param {
1269 | axis: 1
1270 | }
1271 | }
1272 | layer {
1273 | name: "conv7_2_mbox_priorbox"
1274 | type: "PriorBox"
1275 | bottom: "conv7_2"
1276 | bottom: "data"
1277 | top: "conv7_2_mbox_priorbox"
1278 | prior_box_param {
1279 | min_size: 162.0
1280 | max_size: 213.0
1281 | aspect_ratio: 2.0
1282 | aspect_ratio: 3.0
1283 | flip: true
1284 | clip: false
1285 | variance: 0.10000000149
1286 | variance: 0.10000000149
1287 | variance: 0.20000000298
1288 | variance: 0.20000000298
1289 | step: 64.0
1290 | offset: 0.5
1291 | }
1292 | }
1293 | layer {
1294 | name: "conv8_2_mbox_loc"
1295 | type: "Convolution"
1296 | bottom: "conv8_2"
1297 | top: "conv8_2_mbox_loc"
1298 | param {
1299 | lr_mult: 1.0
1300 | decay_mult: 1.0
1301 | }
1302 | param {
1303 | lr_mult: 2.0
1304 | decay_mult: 0.0
1305 | }
1306 | convolution_param {
1307 | num_output: 16
1308 | pad: 1
1309 | kernel_size: 3
1310 | stride: 1
1311 | weight_filler {
1312 | type: "xavier"
1313 | }
1314 | bias_filler {
1315 | type: "constant"
1316 | value: 0.0
1317 | }
1318 | }
1319 | }
1320 | layer {
1321 | name: "conv8_2_mbox_loc_perm"
1322 | type: "Permute"
1323 | bottom: "conv8_2_mbox_loc"
1324 | top: "conv8_2_mbox_loc_perm"
1325 | permute_param {
1326 | order: 0
1327 | order: 2
1328 | order: 3
1329 | order: 1
1330 | }
1331 | }
1332 | layer {
1333 | name: "conv8_2_mbox_loc_flat"
1334 | type: "Flatten"
1335 | bottom: "conv8_2_mbox_loc_perm"
1336 | top: "conv8_2_mbox_loc_flat"
1337 | flatten_param {
1338 | axis: 1
1339 | }
1340 | }
1341 | layer {
1342 | name: "conv8_2_mbox_conf_hand_detection"
1343 | type: "Convolution"
1344 | bottom: "conv8_2"
1345 | top: "conv8_2_mbox_conf_hand_detection"
1346 | param {
1347 | lr_mult: 1.0
1348 | decay_mult: 1.0
1349 | }
1350 | param {
1351 | lr_mult: 2.0
1352 | decay_mult: 0.0
1353 | }
1354 | convolution_param {
1355 | num_output: 8
1356 | pad: 1
1357 | kernel_size: 3
1358 | stride: 1
1359 | weight_filler {
1360 | type: "xavier"
1361 | }
1362 | bias_filler {
1363 | type: "constant"
1364 | value: 0.0
1365 | }
1366 | }
1367 | }
1368 | layer {
1369 | name: "conv8_2_mbox_conf_hand_detection_perm"
1370 | type: "Permute"
1371 | bottom: "conv8_2_mbox_conf_hand_detection"
1372 | top: "conv8_2_mbox_conf_hand_detection_perm"
1373 | permute_param {
1374 | order: 0
1375 | order: 2
1376 | order: 3
1377 | order: 1
1378 | }
1379 | }
1380 | layer {
1381 | name: "conv8_2_mbox_conf_hand_detection_flat"
1382 | type: "Flatten"
1383 | bottom: "conv8_2_mbox_conf_hand_detection_perm"
1384 | top: "conv8_2_mbox_conf_hand_detection_flat"
1385 | flatten_param {
1386 | axis: 1
1387 | }
1388 | }
1389 | layer {
1390 | name: "conv8_2_mbox_priorbox"
1391 | type: "PriorBox"
1392 | bottom: "conv8_2"
1393 | bottom: "data"
1394 | top: "conv8_2_mbox_priorbox"
1395 | prior_box_param {
1396 | min_size: 213.0
1397 | max_size: 264.0
1398 | aspect_ratio: 2.0
1399 | flip: true
1400 | clip: false
1401 | variance: 0.10000000149
1402 | variance: 0.10000000149
1403 | variance: 0.20000000298
1404 | variance: 0.20000000298
1405 | step: 100.0
1406 | offset: 0.5
1407 | }
1408 | }
1409 | layer {
1410 | name: "conv9_2_mbox_loc"
1411 | type: "Convolution"
1412 | bottom: "conv9_2"
1413 | top: "conv9_2_mbox_loc"
1414 | param {
1415 | lr_mult: 1.0
1416 | decay_mult: 1.0
1417 | }
1418 | param {
1419 | lr_mult: 2.0
1420 | decay_mult: 0.0
1421 | }
1422 | convolution_param {
1423 | num_output: 16
1424 | pad: 1
1425 | kernel_size: 3
1426 | stride: 1
1427 | weight_filler {
1428 | type: "xavier"
1429 | }
1430 | bias_filler {
1431 | type: "constant"
1432 | value: 0.0
1433 | }
1434 | }
1435 | }
1436 | layer {
1437 | name: "conv9_2_mbox_loc_perm"
1438 | type: "Permute"
1439 | bottom: "conv9_2_mbox_loc"
1440 | top: "conv9_2_mbox_loc_perm"
1441 | permute_param {
1442 | order: 0
1443 | order: 2
1444 | order: 3
1445 | order: 1
1446 | }
1447 | }
1448 | layer {
1449 | name: "conv9_2_mbox_loc_flat"
1450 | type: "Flatten"
1451 | bottom: "conv9_2_mbox_loc_perm"
1452 | top: "conv9_2_mbox_loc_flat"
1453 | flatten_param {
1454 | axis: 1
1455 | }
1456 | }
1457 | layer {
1458 | name: "conv9_2_mbox_conf_hand_detection"
1459 | type: "Convolution"
1460 | bottom: "conv9_2"
1461 | top: "conv9_2_mbox_conf_hand_detection"
1462 | param {
1463 | lr_mult: 1.0
1464 | decay_mult: 1.0
1465 | }
1466 | param {
1467 | lr_mult: 2.0
1468 | decay_mult: 0.0
1469 | }
1470 | convolution_param {
1471 | num_output: 8
1472 | pad: 1
1473 | kernel_size: 3
1474 | stride: 1
1475 | weight_filler {
1476 | type: "xavier"
1477 | }
1478 | bias_filler {
1479 | type: "constant"
1480 | value: 0.0
1481 | }
1482 | }
1483 | }
1484 | layer {
1485 | name: "conv9_2_mbox_conf_hand_detection_perm"
1486 | type: "Permute"
1487 | bottom: "conv9_2_mbox_conf_hand_detection"
1488 | top: "conv9_2_mbox_conf_hand_detection_perm"
1489 | permute_param {
1490 | order: 0
1491 | order: 2
1492 | order: 3
1493 | order: 1
1494 | }
1495 | }
1496 | layer {
1497 | name: "conv9_2_mbox_conf_hand_detection_flat"
1498 | type: "Flatten"
1499 | bottom: "conv9_2_mbox_conf_hand_detection_perm"
1500 | top: "conv9_2_mbox_conf_hand_detection_flat"
1501 | flatten_param {
1502 | axis: 1
1503 | }
1504 | }
1505 | layer {
1506 | name: "conv9_2_mbox_priorbox"
1507 | type: "PriorBox"
1508 | bottom: "conv9_2"
1509 | bottom: "data"
1510 | top: "conv9_2_mbox_priorbox"
1511 | prior_box_param {
1512 | min_size: 264.0
1513 | max_size: 315.0
1514 | aspect_ratio: 2.0
1515 | flip: true
1516 | clip: false
1517 | variance: 0.10000000149
1518 | variance: 0.10000000149
1519 | variance: 0.20000000298
1520 | variance: 0.20000000298
1521 | step: 300.0
1522 | offset: 0.5
1523 | }
1524 | }
1525 | layer {
1526 | name: "mbox_loc"
1527 | type: "Concat"
1528 | bottom: "conv4_3_norm_mbox_loc_flat"
1529 | bottom: "fc7_mbox_loc_flat"
1530 | bottom: "conv6_2_mbox_loc_flat"
1531 | bottom: "conv7_2_mbox_loc_flat"
1532 | bottom: "conv8_2_mbox_loc_flat"
1533 | bottom: "conv9_2_mbox_loc_flat"
1534 | top: "mbox_loc"
1535 | concat_param {
1536 | axis: 1
1537 | }
1538 | }
1539 | layer {
1540 | name: "mbox_conf"
1541 | type: "Concat"
1542 | bottom: "conv4_3_norm_mbox_conf_hand_detection_flat"
1543 | bottom: "fc7_mbox_conf_hand_detection_flat"
1544 | bottom: "conv6_2_mbox_conf_hand_detection_flat"
1545 | bottom: "conv7_2_mbox_conf_hand_detection_flat"
1546 | bottom: "conv8_2_mbox_conf_hand_detection_flat"
1547 | bottom: "conv9_2_mbox_conf_hand_detection_flat"
1548 | top: "mbox_conf"
1549 | concat_param {
1550 | axis: 1
1551 | }
1552 | }
1553 | layer {
1554 | name: "mbox_priorbox"
1555 | type: "Concat"
1556 | bottom: "conv4_3_norm_mbox_priorbox"
1557 | bottom: "fc7_mbox_priorbox"
1558 | bottom: "conv6_2_mbox_priorbox"
1559 | bottom: "conv7_2_mbox_priorbox"
1560 | bottom: "conv8_2_mbox_priorbox"
1561 | bottom: "conv9_2_mbox_priorbox"
1562 | top: "mbox_priorbox"
1563 | concat_param {
1564 | axis: 2
1565 | }
1566 | }
1567 | layer {
1568 | name: "mbox_conf_reshape"
1569 | type: "Reshape"
1570 | bottom: "mbox_conf"
1571 | top: "mbox_conf_reshape"
1572 | reshape_param {
1573 | shape {
1574 | dim: 0
1575 | dim: -1
1576 | dim: 2
1577 | }
1578 | }
1579 | }
1580 | layer {
1581 | name: "mbox_conf_softmax"
1582 | type: "Softmax"
1583 | bottom: "mbox_conf_reshape"
1584 | top: "mbox_conf_softmax"
1585 | softmax_param {
1586 | axis: 2
1587 | }
1588 | }
1589 | layer {
1590 | name: "mbox_conf_flatten"
1591 | type: "Flatten"
1592 | bottom: "mbox_conf_softmax"
1593 | top: "mbox_conf_flatten"
1594 | flatten_param {
1595 | axis: 1
1596 | }
1597 | }
1598 | layer {
1599 | name: "detection_out"
1600 | type: "DetectionOutput"
1601 | bottom: "mbox_loc"
1602 | bottom: "mbox_conf_flatten"
1603 | bottom: "mbox_priorbox"
1604 | top: "detection_out"
1605 | include {
1606 | phase: TEST
1607 | }
1608 | detection_output_param {
1609 | num_classes: 2
1610 | share_location: true
1611 | background_label_id: 0
1612 | nms_param {
1613 | nms_threshold: 0.449999988079
1614 | top_k: 400
1615 | }
1616 | code_type: CENTER_SIZE
1617 | keep_top_k: 200
1618 | confidence_threshold: 0.00999999977648
1619 | }
1620 | }
1621 |
1622 |
--------------------------------------------------------------------------------
/other/Hand_Detection/model/generate_model.py:
--------------------------------------------------------------------------------
1 | from __future__ import print_function
2 | import sys
3 | sys.path.insert(0, '../caffe/python')
4 | import caffe
5 | from caffe.model_libs import *
6 | from google.protobuf import text_format
7 |
8 | import math
9 | import os
10 | import shutil
11 | import stat
12 | import subprocess
13 |
14 | # Add extra layers on top of a "base" network (e.g. VGGNet or Inception).
15 | def AddExtraLayers(net, use_batchnorm=True, lr_mult=1):
16 | use_relu = True
17 |
18 | # Add additional convolutional layers.
19 | # 19 x 19
20 | from_layer = net.keys()[-1]
21 |
22 | # TODO(weiliu89): Construct the name using the last layer to avoid duplication.
23 | # 10 x 10
24 | out_layer = "conv6_1"
25 | ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 256, 1, 0, 1,
26 | lr_mult=lr_mult)
27 |
28 | from_layer = out_layer
29 | out_layer = "conv6_2"
30 | ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 512, 3, 1, 2,
31 | lr_mult=lr_mult)
32 |
33 | # 5 x 5
34 | from_layer = out_layer
35 | out_layer = "conv7_1"
36 | ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 128, 1, 0, 1,
37 | lr_mult=lr_mult)
38 |
39 | from_layer = out_layer
40 | out_layer = "conv7_2"
41 | ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 256, 3, 1, 2,
42 | lr_mult=lr_mult)
43 |
44 | # 3 x 3
45 | from_layer = out_layer
46 | out_layer = "conv8_1"
47 | ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 128, 1, 0, 1,
48 | lr_mult=lr_mult)
49 |
50 | from_layer = out_layer
51 | out_layer = "conv8_2"
52 | ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 256, 3, 0, 1,
53 | lr_mult=lr_mult)
54 |
55 | # 1 x 1
56 | from_layer = out_layer
57 | out_layer = "conv9_1"
58 | ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 128, 1, 0, 1,
59 | lr_mult=lr_mult)
60 |
61 | from_layer = out_layer
62 | out_layer = "conv9_2"
63 | ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 256, 3, 0, 1,
64 | lr_mult=lr_mult)
65 |
66 | return net
67 |
68 |
69 |
70 |
71 |
72 |
73 | # The database file for training data. Created by data/create_data.sh
74 | train_data = "../data/lmdb/trainval_lmdb"
75 | model_name = "VGG_HAND_SSD_300x300"
76 | # The database file for testing data is created by the same script but is not used here.
77 | # Specify the batch sampler.
78 | resize_width = 300
79 | resize_height = 300
80 | resize = "{}x{}".format(resize_width, resize_height)
81 | batch_sampler = [
82 | {
83 | 'sampler': {
84 | },
85 | 'max_trials': 1,
86 | 'max_sample': 1,
87 | },
88 | {
89 | 'sampler': {
90 | 'min_scale': 0.3,
91 | 'max_scale': 1.0,
92 | 'min_aspect_ratio': 0.5,
93 | 'max_aspect_ratio': 2.0,
94 | },
95 | 'sample_constraint': {
96 | 'min_jaccard_overlap': 0.1,
97 | },
98 | 'max_trials': 50,
99 | 'max_sample': 1,
100 | },
101 | {
102 | 'sampler': {
103 | 'min_scale': 0.3,
104 | 'max_scale': 1.0,
105 | 'min_aspect_ratio': 0.5,
106 | 'max_aspect_ratio': 2.0,
107 | },
108 | 'sample_constraint': {
109 | 'min_jaccard_overlap': 0.3,
110 | },
111 | 'max_trials': 50,
112 | 'max_sample': 1,
113 | },
114 | {
115 | 'sampler': {
116 | 'min_scale': 0.3,
117 | 'max_scale': 1.0,
118 | 'min_aspect_ratio': 0.5,
119 | 'max_aspect_ratio': 2.0,
120 | },
121 | 'sample_constraint': {
122 | 'min_jaccard_overlap': 0.5,
123 | },
124 | 'max_trials': 50,
125 | 'max_sample': 1,
126 | },
127 | {
128 | 'sampler': {
129 | 'min_scale': 0.3,
130 | 'max_scale': 1.0,
131 | 'min_aspect_ratio': 0.5,
132 | 'max_aspect_ratio': 2.0,
133 | },
134 | 'sample_constraint': {
135 | 'min_jaccard_overlap': 0.7,
136 | },
137 | 'max_trials': 50,
138 | 'max_sample': 1,
139 | },
140 | {
141 | 'sampler': {
142 | 'min_scale': 0.3,
143 | 'max_scale': 1.0,
144 | 'min_aspect_ratio': 0.5,
145 | 'max_aspect_ratio': 2.0,
146 | },
147 | 'sample_constraint': {
148 | 'min_jaccard_overlap': 0.9,
149 | },
150 | 'max_trials': 50,
151 | 'max_sample': 1,
152 | },
153 | {
154 | 'sampler': {
155 | 'min_scale': 0.3,
156 | 'max_scale': 1.0,
157 | 'min_aspect_ratio': 0.5,
158 | 'max_aspect_ratio': 2.0,
159 | },
160 | 'sample_constraint': {
161 | 'max_jaccard_overlap': 1.0,
162 | },
163 | 'max_trials': 50,
164 | 'max_sample': 1,
165 | },
166 | ]
167 | train_transform_param = {
168 | 'mirror': True,
169 | 'mean_value': [127.0, 127.0, 127.0],
170 | 'resize_param': {
171 | 'prob': 1,
172 | 'resize_mode': P.Resize.WARP,
173 | 'height': resize_height,
174 | 'width': resize_width,
175 | 'interp_mode': [
176 | P.Resize.LINEAR,
177 | P.Resize.AREA,
178 | P.Resize.NEAREST,
179 | P.Resize.CUBIC,
180 | P.Resize.LANCZOS4,
181 | ],
182 | },
183 | 'distort_param': {
184 | 'brightness_prob': 0.5,
185 | 'brightness_delta': 32,
186 | 'contrast_prob': 0.5,
187 | 'contrast_lower': 0.5,
188 | 'contrast_upper': 1.5,
189 | 'hue_prob': 0.5,
190 | 'hue_delta': 18,
191 | 'saturation_prob': 0.5,
192 | 'saturation_lower': 0.5,
193 | 'saturation_upper': 1.5,
194 | 'random_order_prob': 0.0,
195 | },
196 | 'expand_param': {
197 | 'prob': 0.5,
198 | 'max_expand_ratio': 4.0,
199 | },
200 | 'emit_constraint': {
201 | 'emit_type': caffe_pb2.EmitConstraint.CENTER,
202 | }
203 | }
204 |
205 | # If true, use batch norm for all newly added layers.
206 | # Currently only the non batch norm version has been tested.
207 | use_batchnorm = False
208 | lr_mult = 1
209 |
210 | # model definition files.
211 | train_net_file = "train.prototxt"
212 | deploy_net_file = "deploy.prototxt"
213 | solver_file = "solver.prototxt"
214 | # snapshot prefix.
215 | snapshot_prefix = "snapshot/VGG_HAND_SSD_300x300_iter_"
216 | # Stores LabelMapItem.
217 | label_map_file = "../data/labelmap_voc.prototxt"
218 |
219 | # MultiBoxLoss parameters.
220 | num_classes = 2
221 | share_location = True
222 | background_label_id=0
223 | train_on_diff_gt = True
224 | normalization_mode = P.Loss.VALID
225 | code_type = P.PriorBox.CENTER_SIZE
226 | ignore_cross_boundary_bbox = False
227 | mining_type = P.MultiBoxLoss.MAX_NEGATIVE
228 | neg_pos_ratio = 3.
229 | loc_weight = (neg_pos_ratio + 1.) / 4.
230 | multibox_loss_param = {
231 | 'loc_loss_type': P.MultiBoxLoss.SMOOTH_L1,
232 | 'conf_loss_type': P.MultiBoxLoss.SOFTMAX,
233 | 'loc_weight': loc_weight,
234 | 'num_classes': num_classes,
235 | 'share_location': share_location,
236 | 'match_type': P.MultiBoxLoss.PER_PREDICTION,
237 | 'overlap_threshold': 0.5,
238 | 'use_prior_for_matching': True,
239 | 'background_label_id': background_label_id,
240 | 'use_difficult_gt': train_on_diff_gt,
241 | 'mining_type': mining_type,
242 | 'neg_pos_ratio': neg_pos_ratio,
243 | 'neg_overlap': 0.5,
244 | 'code_type': code_type,
245 | 'ignore_cross_boundary_bbox': ignore_cross_boundary_bbox,
246 | }
247 | loss_param = {
248 | 'normalization': normalization_mode,
249 | }
250 |
251 | # parameters for generating priors.
252 | # minimum dimension of input image
253 | min_dim = 300
254 | # conv4_3 ==> 38 x 38
255 | # fc7 ==> 19 x 19
256 | # conv6_2 ==> 10 x 10
257 | # conv7_2 ==> 5 x 5
258 | # conv8_2 ==> 3 x 3
259 | # conv9_2 ==> 1 x 1
260 | mbox_source_layers = ['conv4_3', 'fc7', 'conv6_2', 'conv7_2', 'conv8_2', 'conv9_2']
261 | # in percent %
262 | min_ratio = 20
263 | max_ratio = 90
264 | step = int(math.floor((max_ratio - min_ratio) / (len(mbox_source_layers) - 2)))
265 | min_sizes = []
266 | max_sizes = []
267 | for ratio in xrange(min_ratio, max_ratio + 1, step):
268 | min_sizes.append(min_dim * ratio / 100.)
269 | max_sizes.append(min_dim * (ratio + step) / 100.)
270 | min_sizes = [min_dim * 10 / 100.] + min_sizes
271 | max_sizes = [min_dim * 20 / 100.] + max_sizes
272 | steps = [8, 16, 32, 64, 100, 300]
273 | aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]]
274 | # L2 normalize conv4_3.
275 | normalizations = [20, -1, -1, -1, -1, -1]
276 | # variance used to encode/decode prior bboxes.
277 | if code_type == P.PriorBox.CENTER_SIZE:
278 | prior_variance = [0.1, 0.1, 0.2, 0.2]
279 | else:
280 | prior_variance = [0.1]
281 | flip = True
282 | clip = False
283 |
284 |
285 | # Divide the mini-batch to different GPUs.
286 | batch_size = 2
287 | num_gpus = 1
288 | iter_size = 200000
289 | device_id = 0
290 | batch_size_per_device = batch_size
291 |
292 | batch_size_per_device = int(math.ceil(float(batch_size) / num_gpus))
293 | solver_mode = P.Solver.GPU
294 |
295 | base_lr = 0.0001
296 |
297 | solver_param = {
298 | # Train parameters
299 | 'base_lr': base_lr,
300 | 'weight_decay': 0.0005,
301 | 'lr_policy': "multistep",
302 | 'stepvalue': [80000, 100000, 120000],
303 | 'gamma': 0.1,
304 | 'momentum': 0.9,
305 | 'iter_size': iter_size,
306 | 'max_iter': 120000,
307 | 'snapshot': 80000,
308 | 'display': 10,
309 | 'average_loss': 10,
310 | 'type': "SGD",
311 | 'solver_mode': solver_mode,
312 | 'device_id': device_id,
313 | 'debug_info': False,
314 | 'snapshot_after_train': True
315 | }
316 |
317 | # Create train net.
318 | net = caffe.NetSpec()
319 | net.data, net.label = CreateAnnotatedDataLayer(train_data, batch_size=batch_size_per_device,
320 | train=True, output_label=True, label_map_file=label_map_file,
321 | transform_param=train_transform_param, batch_sampler=batch_sampler)
322 |
323 | VGGNetBody(net, from_layer='data', fully_conv=True, reduced=True, dilated=True,
324 | dropout=False)
325 |
326 | AddExtraLayers(net, use_batchnorm, lr_mult=lr_mult)
327 |
328 | # New
329 | mbox_layers = CreateMultiBoxHead(net, data_layer='data', from_layers=mbox_source_layers,
330 | use_batchnorm=use_batchnorm, min_sizes=min_sizes, max_sizes=max_sizes,
331 | aspect_ratios=aspect_ratios, steps=steps, normalizations=normalizations,
332 | num_classes=num_classes, share_location=share_location, flip=flip, clip=clip,
333 | prior_variance=prior_variance, kernel_size=3, pad=1, lr_mult=lr_mult, conf_postfix='_hand_detection')
334 |
335 |
336 | # ### initial
337 | # mbox_layers = CreateMultiBoxHead(net, data_layer='data', from_layers=mbox_source_layers,
338 | # use_batchnorm=use_batchnorm, min_sizes=min_sizes, max_sizes=max_sizes,
339 | # aspect_ratios=aspect_ratios, steps=steps, normalizations=normalizations,
340 | # num_classes=num_classes, share_location=share_location, flip=flip, clip=clip,
341 | # prior_variance=prior_variance, kernel_size=3, pad=1, lr_mult=lr_mult)
342 |
343 | # Create the MultiBoxLossLayer.
344 | name = "mbox_loss"
345 | mbox_layers.append(net.label)
346 | net[name] = L.MultiBoxLoss(*mbox_layers, multibox_loss_param=multibox_loss_param,
347 | loss_param=loss_param, include=dict(phase=caffe_pb2.Phase.Value('TRAIN')),
348 | propagate_down=[True, True, False, False])
349 |
350 | with open(train_net_file, 'w') as f:
351 | print('name: "{}_train"'.format(model_name), file=f)
352 | print(net.to_proto(), file=f)
353 |
354 | # Create deploy net.
355 | # Strip the training-only layers below to turn the train net into the deploy net.
356 | #########
357 |
358 |
359 | # parameters for generating detection output.
360 | det_out_param = {
361 | 'num_classes': num_classes,
362 | 'share_location': share_location,
363 | 'background_label_id': background_label_id,
364 | 'nms_param': {'nms_threshold': 0.45, 'top_k': 400},
365 | 'keep_top_k': 200,
366 | 'confidence_threshold': 0.01,
367 | 'code_type': code_type,
368 | }
369 |
370 |
371 | conf_name = "mbox_conf"
372 | if multibox_loss_param["conf_loss_type"] == P.MultiBoxLoss.SOFTMAX:
373 | reshape_name = "{}_reshape".format(conf_name)
374 | net[reshape_name] = L.Reshape(net[conf_name], shape=dict(dim=[0, -1, num_classes]))
375 | softmax_name = "{}_softmax".format(conf_name)
376 | net[softmax_name] = L.Softmax(net[reshape_name], axis=2)
377 | flatten_name = "{}_flatten".format(conf_name)
378 | net[flatten_name] = L.Flatten(net[softmax_name], axis=1)
379 | mbox_layers[1] = net[flatten_name]
380 | elif multibox_loss_param["conf_loss_type"] == P.MultiBoxLoss.LOGISTIC:
381 | sigmoid_name = "{}_sigmoid".format(conf_name)
382 | net[sigmoid_name] = L.Sigmoid(net[conf_name])
383 | mbox_layers[1] = net[sigmoid_name]
384 |
385 | net.detection_out = L.DetectionOutput(*mbox_layers,
386 | detection_output_param=det_out_param,
387 | include=dict(phase=caffe_pb2.Phase.Value('TEST')))
388 |
389 |
390 |
391 | deploy_net = net
392 | with open(deploy_net_file, 'w') as f:
393 | net_param = deploy_net.to_proto()
394 |     # Remove the AnnotatedData layer, the MultiBoxLoss layer, and the trailing label bottom of DetectionOutput.
395 | del net_param.layer[0]
396 | del net_param.layer[-5]
397 | del net_param.layer[-1].bottom[-1]
398 | net_param.name = '{}_deploy'.format(model_name)
399 | net_param.input.extend(['data'])
400 | net_param.input_shape.extend([
401 | caffe_pb2.BlobShape(dim=[1, 3, resize_height, resize_width])])
402 | print(net_param, file=f)
403 |
404 | # Create solver.
405 | solver = caffe_pb2.SolverParameter(
406 | train_net=train_net_file,
407 | snapshot_prefix=snapshot_prefix,
408 | **solver_param)
409 |
410 | with open(solver_file, 'w') as f:
411 | print(solver, file=f)
412 |
--------------------------------------------------------------------------------
/other/Hand_Detection/model/snapshot/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/other/Hand_Detection/model/snapshot/.gitkeep
--------------------------------------------------------------------------------
/other/Hand_Detection/model/solver.prototxt:
--------------------------------------------------------------------------------
1 | train_net: "train.prototxt"
2 | base_lr: 0.0001
3 | display: 20
4 | max_iter: 200000
5 | lr_policy: "multistep"
6 | gamma: 0.10000000149
7 | momentum: 0.899999976158
8 | weight_decay: 0.000500000023749
9 | snapshot: 80000
10 | snapshot_prefix: "snapshot/VGG_HAND_SSD_300x300_iter_"
11 | solver_mode: GPU
12 | device_id: 0
13 | debug_info: false
14 | snapshot_after_train: true
15 | average_loss: 10
16 | stepvalue: 60000
17 | stepvalue: 100000
18 | stepvalue: 140000
19 | type: "SGD"
20 |
21 |
--------------------------------------------------------------------------------
/other/Hand_Detection/model/train.prototxt:
--------------------------------------------------------------------------------
1 | name: "VGG_HAND_SSD_300x300_train"
2 | layer {
3 | name: "data"
4 | type: "AnnotatedData"
5 | top: "data"
6 | top: "label"
7 | include {
8 | phase: TRAIN
9 | }
10 | transform_param {
11 | mirror: true
12 | mean_value: 127.0
13 | mean_value: 127.0
14 | mean_value: 127.0
15 | resize_param {
16 | prob: 1.0
17 | resize_mode: WARP
18 | height: 300
19 | width: 300
20 | interp_mode: LINEAR
21 | interp_mode: AREA
22 | interp_mode: NEAREST
23 | interp_mode: CUBIC
24 | interp_mode: LANCZOS4
25 | }
26 | emit_constraint {
27 | emit_type: CENTER
28 | }
29 | distort_param {
30 | brightness_prob: 0.5
31 | brightness_delta: 32.0
32 | contrast_prob: 0.5
33 | contrast_lower: 0.5
34 | contrast_upper: 1.5
35 | hue_prob: 0.5
36 | hue_delta: 18.0
37 | saturation_prob: 0.5
38 | saturation_lower: 0.5
39 | saturation_upper: 1.5
40 | random_order_prob: 0.0
41 | }
42 | expand_param {
43 | prob: 0.5
44 | max_expand_ratio: 4.0
45 | }
46 | }
47 | data_param {
48 | source: "../data/lmdb/trainval_lmdb"
49 | batch_size: 2
50 | backend: LMDB
51 | }
52 | annotated_data_param {
53 | batch_sampler {
54 | max_sample: 1
55 | max_trials: 1
56 | }
57 | batch_sampler {
58 | sampler {
59 | min_scale: 0.300000011921
60 | max_scale: 1.0
61 | min_aspect_ratio: 0.5
62 | max_aspect_ratio: 2.0
63 | }
64 | sample_constraint {
65 | min_jaccard_overlap: 0.10000000149
66 | }
67 | max_sample: 1
68 | max_trials: 50
69 | }
70 | batch_sampler {
71 | sampler {
72 | min_scale: 0.300000011921
73 | max_scale: 1.0
74 | min_aspect_ratio: 0.5
75 | max_aspect_ratio: 2.0
76 | }
77 | sample_constraint {
78 | min_jaccard_overlap: 0.300000011921
79 | }
80 | max_sample: 1
81 | max_trials: 50
82 | }
83 | batch_sampler {
84 | sampler {
85 | min_scale: 0.300000011921
86 | max_scale: 1.0
87 | min_aspect_ratio: 0.5
88 | max_aspect_ratio: 2.0
89 | }
90 | sample_constraint {
91 | min_jaccard_overlap: 0.5
92 | }
93 | max_sample: 1
94 | max_trials: 50
95 | }
96 | batch_sampler {
97 | sampler {
98 | min_scale: 0.300000011921
99 | max_scale: 1.0
100 | min_aspect_ratio: 0.5
101 | max_aspect_ratio: 2.0
102 | }
103 | sample_constraint {
104 | min_jaccard_overlap: 0.699999988079
105 | }
106 | max_sample: 1
107 | max_trials: 50
108 | }
109 | batch_sampler {
110 | sampler {
111 | min_scale: 0.300000011921
112 | max_scale: 1.0
113 | min_aspect_ratio: 0.5
114 | max_aspect_ratio: 2.0
115 | }
116 | sample_constraint {
117 | min_jaccard_overlap: 0.899999976158
118 | }
119 | max_sample: 1
120 | max_trials: 50
121 | }
122 | batch_sampler {
123 | sampler {
124 | min_scale: 0.300000011921
125 | max_scale: 1.0
126 | min_aspect_ratio: 0.5
127 | max_aspect_ratio: 2.0
128 | }
129 | sample_constraint {
130 | max_jaccard_overlap: 1.0
131 | }
132 | max_sample: 1
133 | max_trials: 50
134 | }
135 | label_map_file: "../data/labelmap_voc.prototxt"
136 | }
137 | }
138 | layer {
139 | name: "conv1_1"
140 | type: "Convolution"
141 | bottom: "data"
142 | top: "conv1_1"
143 | param {
144 | lr_mult: 1.0
145 | decay_mult: 1.0
146 | }
147 | param {
148 | lr_mult: 2.0
149 | decay_mult: 0.0
150 | }
151 | convolution_param {
152 | num_output: 64
153 | pad: 1
154 | kernel_size: 3
155 | weight_filler {
156 | type: "xavier"
157 | }
158 | bias_filler {
159 | type: "constant"
160 | value: 0.0
161 | }
162 | }
163 | }
164 | layer {
165 | name: "relu1_1"
166 | type: "ReLU"
167 | bottom: "conv1_1"
168 | top: "conv1_1"
169 | }
170 | layer {
171 | name: "conv1_2"
172 | type: "Convolution"
173 | bottom: "conv1_1"
174 | top: "conv1_2"
175 | param {
176 | lr_mult: 1.0
177 | decay_mult: 1.0
178 | }
179 | param {
180 | lr_mult: 2.0
181 | decay_mult: 0.0
182 | }
183 | convolution_param {
184 | num_output: 64
185 | pad: 1
186 | kernel_size: 3
187 | weight_filler {
188 | type: "xavier"
189 | }
190 | bias_filler {
191 | type: "constant"
192 | value: 0.0
193 | }
194 | }
195 | }
196 | layer {
197 | name: "relu1_2"
198 | type: "ReLU"
199 | bottom: "conv1_2"
200 | top: "conv1_2"
201 | }
202 | layer {
203 | name: "pool1"
204 | type: "Pooling"
205 | bottom: "conv1_2"
206 | top: "pool1"
207 | pooling_param {
208 | pool: MAX
209 | kernel_size: 2
210 | stride: 2
211 | }
212 | }
213 | layer {
214 | name: "conv2_1"
215 | type: "Convolution"
216 | bottom: "pool1"
217 | top: "conv2_1"
218 | param {
219 | lr_mult: 1.0
220 | decay_mult: 1.0
221 | }
222 | param {
223 | lr_mult: 2.0
224 | decay_mult: 0.0
225 | }
226 | convolution_param {
227 | num_output: 128
228 | pad: 1
229 | kernel_size: 3
230 | weight_filler {
231 | type: "xavier"
232 | }
233 | bias_filler {
234 | type: "constant"
235 | value: 0.0
236 | }
237 | }
238 | }
239 | layer {
240 | name: "relu2_1"
241 | type: "ReLU"
242 | bottom: "conv2_1"
243 | top: "conv2_1"
244 | }
245 | layer {
246 | name: "conv2_2"
247 | type: "Convolution"
248 | bottom: "conv2_1"
249 | top: "conv2_2"
250 | param {
251 | lr_mult: 1.0
252 | decay_mult: 1.0
253 | }
254 | param {
255 | lr_mult: 2.0
256 | decay_mult: 0.0
257 | }
258 | convolution_param {
259 | num_output: 128
260 | pad: 1
261 | kernel_size: 3
262 | weight_filler {
263 | type: "xavier"
264 | }
265 | bias_filler {
266 | type: "constant"
267 | value: 0.0
268 | }
269 | }
270 | }
271 | layer {
272 | name: "relu2_2"
273 | type: "ReLU"
274 | bottom: "conv2_2"
275 | top: "conv2_2"
276 | }
277 | layer {
278 | name: "pool2"
279 | type: "Pooling"
280 | bottom: "conv2_2"
281 | top: "pool2"
282 | pooling_param {
283 | pool: MAX
284 | kernel_size: 2
285 | stride: 2
286 | }
287 | }
288 | layer {
289 | name: "conv3_1"
290 | type: "Convolution"
291 | bottom: "pool2"
292 | top: "conv3_1"
293 | param {
294 | lr_mult: 1.0
295 | decay_mult: 1.0
296 | }
297 | param {
298 | lr_mult: 2.0
299 | decay_mult: 0.0
300 | }
301 | convolution_param {
302 | num_output: 256
303 | pad: 1
304 | kernel_size: 3
305 | weight_filler {
306 | type: "xavier"
307 | }
308 | bias_filler {
309 | type: "constant"
310 | value: 0.0
311 | }
312 | }
313 | }
314 | layer {
315 | name: "relu3_1"
316 | type: "ReLU"
317 | bottom: "conv3_1"
318 | top: "conv3_1"
319 | }
320 | layer {
321 | name: "conv3_2"
322 | type: "Convolution"
323 | bottom: "conv3_1"
324 | top: "conv3_2"
325 | param {
326 | lr_mult: 1.0
327 | decay_mult: 1.0
328 | }
329 | param {
330 | lr_mult: 2.0
331 | decay_mult: 0.0
332 | }
333 | convolution_param {
334 | num_output: 256
335 | pad: 1
336 | kernel_size: 3
337 | weight_filler {
338 | type: "xavier"
339 | }
340 | bias_filler {
341 | type: "constant"
342 | value: 0.0
343 | }
344 | }
345 | }
346 | layer {
347 | name: "relu3_2"
348 | type: "ReLU"
349 | bottom: "conv3_2"
350 | top: "conv3_2"
351 | }
352 | layer {
353 | name: "conv3_3"
354 | type: "Convolution"
355 | bottom: "conv3_2"
356 | top: "conv3_3"
357 | param {
358 | lr_mult: 1.0
359 | decay_mult: 1.0
360 | }
361 | param {
362 | lr_mult: 2.0
363 | decay_mult: 0.0
364 | }
365 | convolution_param {
366 | num_output: 256
367 | pad: 1
368 | kernel_size: 3
369 | weight_filler {
370 | type: "xavier"
371 | }
372 | bias_filler {
373 | type: "constant"
374 | value: 0.0
375 | }
376 | }
377 | }
378 | layer {
379 | name: "relu3_3"
380 | type: "ReLU"
381 | bottom: "conv3_3"
382 | top: "conv3_3"
383 | }
384 | layer {
385 | name: "pool3"
386 | type: "Pooling"
387 | bottom: "conv3_3"
388 | top: "pool3"
389 | pooling_param {
390 | pool: MAX
391 | kernel_size: 2
392 | stride: 2
393 | }
394 | }
395 | layer {
396 | name: "conv4_1"
397 | type: "Convolution"
398 | bottom: "pool3"
399 | top: "conv4_1"
400 | param {
401 | lr_mult: 1.0
402 | decay_mult: 1.0
403 | }
404 | param {
405 | lr_mult: 2.0
406 | decay_mult: 0.0
407 | }
408 | convolution_param {
409 | num_output: 512
410 | pad: 1
411 | kernel_size: 3
412 | weight_filler {
413 | type: "xavier"
414 | }
415 | bias_filler {
416 | type: "constant"
417 | value: 0.0
418 | }
419 | }
420 | }
421 | layer {
422 | name: "relu4_1"
423 | type: "ReLU"
424 | bottom: "conv4_1"
425 | top: "conv4_1"
426 | }
427 | layer {
428 | name: "conv4_2"
429 | type: "Convolution"
430 | bottom: "conv4_1"
431 | top: "conv4_2"
432 | param {
433 | lr_mult: 1.0
434 | decay_mult: 1.0
435 | }
436 | param {
437 | lr_mult: 2.0
438 | decay_mult: 0.0
439 | }
440 | convolution_param {
441 | num_output: 512
442 | pad: 1
443 | kernel_size: 3
444 | weight_filler {
445 | type: "xavier"
446 | }
447 | bias_filler {
448 | type: "constant"
449 | value: 0.0
450 | }
451 | }
452 | }
453 | layer {
454 | name: "relu4_2"
455 | type: "ReLU"
456 | bottom: "conv4_2"
457 | top: "conv4_2"
458 | }
459 | layer {
460 | name: "conv4_3"
461 | type: "Convolution"
462 | bottom: "conv4_2"
463 | top: "conv4_3"
464 | param {
465 | lr_mult: 1.0
466 | decay_mult: 1.0
467 | }
468 | param {
469 | lr_mult: 2.0
470 | decay_mult: 0.0
471 | }
472 | convolution_param {
473 | num_output: 512
474 | pad: 1
475 | kernel_size: 3
476 | weight_filler {
477 | type: "xavier"
478 | }
479 | bias_filler {
480 | type: "constant"
481 | value: 0.0
482 | }
483 | }
484 | }
485 | layer {
486 | name: "relu4_3"
487 | type: "ReLU"
488 | bottom: "conv4_3"
489 | top: "conv4_3"
490 | }
491 | layer {
492 | name: "pool4"
493 | type: "Pooling"
494 | bottom: "conv4_3"
495 | top: "pool4"
496 | pooling_param {
497 | pool: MAX
498 | kernel_size: 2
499 | stride: 2
500 | }
501 | }
502 | layer {
503 | name: "conv5_1"
504 | type: "Convolution"
505 | bottom: "pool4"
506 | top: "conv5_1"
507 | param {
508 | lr_mult: 1.0
509 | decay_mult: 1.0
510 | }
511 | param {
512 | lr_mult: 2.0
513 | decay_mult: 0.0
514 | }
515 | convolution_param {
516 | num_output: 512
517 | pad: 1
518 | kernel_size: 3
519 | weight_filler {
520 | type: "xavier"
521 | }
522 | bias_filler {
523 | type: "constant"
524 | value: 0.0
525 | }
526 | dilation: 1
527 | }
528 | }
529 | layer {
530 | name: "relu5_1"
531 | type: "ReLU"
532 | bottom: "conv5_1"
533 | top: "conv5_1"
534 | }
535 | layer {
536 | name: "conv5_2"
537 | type: "Convolution"
538 | bottom: "conv5_1"
539 | top: "conv5_2"
540 | param {
541 | lr_mult: 1.0
542 | decay_mult: 1.0
543 | }
544 | param {
545 | lr_mult: 2.0
546 | decay_mult: 0.0
547 | }
548 | convolution_param {
549 | num_output: 512
550 | pad: 1
551 | kernel_size: 3
552 | weight_filler {
553 | type: "xavier"
554 | }
555 | bias_filler {
556 | type: "constant"
557 | value: 0.0
558 | }
559 | dilation: 1
560 | }
561 | }
562 | layer {
563 | name: "relu5_2"
564 | type: "ReLU"
565 | bottom: "conv5_2"
566 | top: "conv5_2"
567 | }
568 | layer {
569 | name: "conv5_3"
570 | type: "Convolution"
571 | bottom: "conv5_2"
572 | top: "conv5_3"
573 | param {
574 | lr_mult: 1.0
575 | decay_mult: 1.0
576 | }
577 | param {
578 | lr_mult: 2.0
579 | decay_mult: 0.0
580 | }
581 | convolution_param {
582 | num_output: 512
583 | pad: 1
584 | kernel_size: 3
585 | weight_filler {
586 | type: "xavier"
587 | }
588 | bias_filler {
589 | type: "constant"
590 | value: 0.0
591 | }
592 | dilation: 1
593 | }
594 | }
595 | layer {
596 | name: "relu5_3"
597 | type: "ReLU"
598 | bottom: "conv5_3"
599 | top: "conv5_3"
600 | }
601 | layer {
602 | name: "pool5"
603 | type: "Pooling"
604 | bottom: "conv5_3"
605 | top: "pool5"
606 | pooling_param {
607 | pool: MAX
608 | kernel_size: 3
609 | stride: 1
610 | pad: 1
611 | }
612 | }
613 | layer {
614 | name: "fc6"
615 | type: "Convolution"
616 | bottom: "pool5"
617 | top: "fc6"
618 | param {
619 | lr_mult: 1.0
620 | decay_mult: 1.0
621 | }
622 | param {
623 | lr_mult: 2.0
624 | decay_mult: 0.0
625 | }
626 | convolution_param {
627 | num_output: 1024
628 | pad: 6
629 | kernel_size: 3
630 | weight_filler {
631 | type: "xavier"
632 | }
633 | bias_filler {
634 | type: "constant"
635 | value: 0.0
636 | }
637 | dilation: 6
638 | }
639 | }
640 | layer {
641 | name: "relu6"
642 | type: "ReLU"
643 | bottom: "fc6"
644 | top: "fc6"
645 | }
646 | layer {
647 | name: "fc7"
648 | type: "Convolution"
649 | bottom: "fc6"
650 | top: "fc7"
651 | param {
652 | lr_mult: 1.0
653 | decay_mult: 1.0
654 | }
655 | param {
656 | lr_mult: 2.0
657 | decay_mult: 0.0
658 | }
659 | convolution_param {
660 | num_output: 1024
661 | kernel_size: 1
662 | weight_filler {
663 | type: "xavier"
664 | }
665 | bias_filler {
666 | type: "constant"
667 | value: 0.0
668 | }
669 | }
670 | }
671 | layer {
672 | name: "relu7"
673 | type: "ReLU"
674 | bottom: "fc7"
675 | top: "fc7"
676 | }
677 | layer {
678 | name: "conv6_1"
679 | type: "Convolution"
680 | bottom: "fc7"
681 | top: "conv6_1"
682 | param {
683 | lr_mult: 1.0
684 | decay_mult: 1.0
685 | }
686 | param {
687 | lr_mult: 2.0
688 | decay_mult: 0.0
689 | }
690 | convolution_param {
691 | num_output: 256
692 | pad: 0
693 | kernel_size: 1
694 | stride: 1
695 | weight_filler {
696 | type: "xavier"
697 | }
698 | bias_filler {
699 | type: "constant"
700 | value: 0.0
701 | }
702 | }
703 | }
704 | layer {
705 | name: "conv6_1_relu"
706 | type: "ReLU"
707 | bottom: "conv6_1"
708 | top: "conv6_1"
709 | }
710 | layer {
711 | name: "conv6_2"
712 | type: "Convolution"
713 | bottom: "conv6_1"
714 | top: "conv6_2"
715 | param {
716 | lr_mult: 1.0
717 | decay_mult: 1.0
718 | }
719 | param {
720 | lr_mult: 2.0
721 | decay_mult: 0.0
722 | }
723 | convolution_param {
724 | num_output: 512
725 | pad: 1
726 | kernel_size: 3
727 | stride: 2
728 | weight_filler {
729 | type: "xavier"
730 | }
731 | bias_filler {
732 | type: "constant"
733 | value: 0.0
734 | }
735 | }
736 | }
737 | layer {
738 | name: "conv6_2_relu"
739 | type: "ReLU"
740 | bottom: "conv6_2"
741 | top: "conv6_2"
742 | }
743 | layer {
744 | name: "conv7_1"
745 | type: "Convolution"
746 | bottom: "conv6_2"
747 | top: "conv7_1"
748 | param {
749 | lr_mult: 1.0
750 | decay_mult: 1.0
751 | }
752 | param {
753 | lr_mult: 2.0
754 | decay_mult: 0.0
755 | }
756 | convolution_param {
757 | num_output: 128
758 | pad: 0
759 | kernel_size: 1
760 | stride: 1
761 | weight_filler {
762 | type: "xavier"
763 | }
764 | bias_filler {
765 | type: "constant"
766 | value: 0.0
767 | }
768 | }
769 | }
770 | layer {
771 | name: "conv7_1_relu"
772 | type: "ReLU"
773 | bottom: "conv7_1"
774 | top: "conv7_1"
775 | }
776 | layer {
777 | name: "conv7_2"
778 | type: "Convolution"
779 | bottom: "conv7_1"
780 | top: "conv7_2"
781 | param {
782 | lr_mult: 1.0
783 | decay_mult: 1.0
784 | }
785 | param {
786 | lr_mult: 2.0
787 | decay_mult: 0.0
788 | }
789 | convolution_param {
790 | num_output: 256
791 | pad: 1
792 | kernel_size: 3
793 | stride: 2
794 | weight_filler {
795 | type: "xavier"
796 | }
797 | bias_filler {
798 | type: "constant"
799 | value: 0.0
800 | }
801 | }
802 | }
803 | layer {
804 | name: "conv7_2_relu"
805 | type: "ReLU"
806 | bottom: "conv7_2"
807 | top: "conv7_2"
808 | }
809 | layer {
810 | name: "conv8_1"
811 | type: "Convolution"
812 | bottom: "conv7_2"
813 | top: "conv8_1"
814 | param {
815 | lr_mult: 1.0
816 | decay_mult: 1.0
817 | }
818 | param {
819 | lr_mult: 2.0
820 | decay_mult: 0.0
821 | }
822 | convolution_param {
823 | num_output: 128
824 | pad: 0
825 | kernel_size: 1
826 | stride: 1
827 | weight_filler {
828 | type: "xavier"
829 | }
830 | bias_filler {
831 | type: "constant"
832 | value: 0.0
833 | }
834 | }
835 | }
836 | layer {
837 | name: "conv8_1_relu"
838 | type: "ReLU"
839 | bottom: "conv8_1"
840 | top: "conv8_1"
841 | }
842 | layer {
843 | name: "conv8_2"
844 | type: "Convolution"
845 | bottom: "conv8_1"
846 | top: "conv8_2"
847 | param {
848 | lr_mult: 1.0
849 | decay_mult: 1.0
850 | }
851 | param {
852 | lr_mult: 2.0
853 | decay_mult: 0.0
854 | }
855 | convolution_param {
856 | num_output: 256
857 | pad: 0
858 | kernel_size: 3
859 | stride: 1
860 | weight_filler {
861 | type: "xavier"
862 | }
863 | bias_filler {
864 | type: "constant"
865 | value: 0.0
866 | }
867 | }
868 | }
869 | layer {
870 | name: "conv8_2_relu"
871 | type: "ReLU"
872 | bottom: "conv8_2"
873 | top: "conv8_2"
874 | }
875 | layer {
876 | name: "conv9_1"
877 | type: "Convolution"
878 | bottom: "conv8_2"
879 | top: "conv9_1"
880 | param {
881 | lr_mult: 1.0
882 | decay_mult: 1.0
883 | }
884 | param {
885 | lr_mult: 2.0
886 | decay_mult: 0.0
887 | }
888 | convolution_param {
889 | num_output: 128
890 | pad: 0
891 | kernel_size: 1
892 | stride: 1
893 | weight_filler {
894 | type: "xavier"
895 | }
896 | bias_filler {
897 | type: "constant"
898 | value: 0.0
899 | }
900 | }
901 | }
902 | layer {
903 | name: "conv9_1_relu"
904 | type: "ReLU"
905 | bottom: "conv9_1"
906 | top: "conv9_1"
907 | }
908 | layer {
909 | name: "conv9_2"
910 | type: "Convolution"
911 | bottom: "conv9_1"
912 | top: "conv9_2"
913 | param {
914 | lr_mult: 1.0
915 | decay_mult: 1.0
916 | }
917 | param {
918 | lr_mult: 2.0
919 | decay_mult: 0.0
920 | }
921 | convolution_param {
922 | num_output: 256
923 | pad: 0
924 | kernel_size: 3
925 | stride: 1
926 | weight_filler {
927 | type: "xavier"
928 | }
929 | bias_filler {
930 | type: "constant"
931 | value: 0.0
932 | }
933 | }
934 | }
935 | layer {
936 | name: "conv9_2_relu"
937 | type: "ReLU"
938 | bottom: "conv9_2"
939 | top: "conv9_2"
940 | }
941 | layer {
942 | name: "conv4_3_norm"
943 | type: "Normalize"
944 | bottom: "conv4_3"
945 | top: "conv4_3_norm"
946 | norm_param {
947 | across_spatial: false
948 | scale_filler {
949 | type: "constant"
950 | value: 20.0
951 | }
952 | channel_shared: false
953 | }
954 | }
955 | layer {
956 | name: "conv4_3_norm_mbox_loc"
957 | type: "Convolution"
958 | bottom: "conv4_3_norm"
959 | top: "conv4_3_norm_mbox_loc"
960 | param {
961 | lr_mult: 1.0
962 | decay_mult: 1.0
963 | }
964 | param {
965 | lr_mult: 2.0
966 | decay_mult: 0.0
967 | }
968 | convolution_param {
969 | num_output: 16
970 | pad: 1
971 | kernel_size: 3
972 | stride: 1
973 | weight_filler {
974 | type: "xavier"
975 | }
976 | bias_filler {
977 | type: "constant"
978 | value: 0.0
979 | }
980 | }
981 | }
982 | layer {
983 | name: "conv4_3_norm_mbox_loc_perm"
984 | type: "Permute"
985 | bottom: "conv4_3_norm_mbox_loc"
986 | top: "conv4_3_norm_mbox_loc_perm"
987 | permute_param {
988 | order: 0
989 | order: 2
990 | order: 3
991 | order: 1
992 | }
993 | }
994 | layer {
995 | name: "conv4_3_norm_mbox_loc_flat"
996 | type: "Flatten"
997 | bottom: "conv4_3_norm_mbox_loc_perm"
998 | top: "conv4_3_norm_mbox_loc_flat"
999 | flatten_param {
1000 | axis: 1
1001 | }
1002 | }
1003 | layer {
1004 | name: "conv4_3_norm_mbox_conf_hand_detection"
1005 | type: "Convolution"
1006 | bottom: "conv4_3_norm"
1007 | top: "conv4_3_norm_mbox_conf_hand_detection"
1008 | param {
1009 | lr_mult: 1.0
1010 | decay_mult: 1.0
1011 | }
1012 | param {
1013 | lr_mult: 2.0
1014 | decay_mult: 0.0
1015 | }
1016 | convolution_param {
1017 | num_output: 8
1018 | pad: 1
1019 | kernel_size: 3
1020 | stride: 1
1021 | weight_filler {
1022 | type: "xavier"
1023 | }
1024 | bias_filler {
1025 | type: "constant"
1026 | value: 0.0
1027 | }
1028 | }
1029 | }
1030 | layer {
1031 | name: "conv4_3_norm_mbox_conf_hand_detection_perm"
1032 | type: "Permute"
1033 | bottom: "conv4_3_norm_mbox_conf_hand_detection"
1034 | top: "conv4_3_norm_mbox_conf_hand_detection_perm"
1035 | permute_param {
1036 | order: 0
1037 | order: 2
1038 | order: 3
1039 | order: 1
1040 | }
1041 | }
1042 | layer {
1043 | name: "conv4_3_norm_mbox_conf_hand_detection_flat"
1044 | type: "Flatten"
1045 | bottom: "conv4_3_norm_mbox_conf_hand_detection_perm"
1046 | top: "conv4_3_norm_mbox_conf_hand_detection_flat"
1047 | flatten_param {
1048 | axis: 1
1049 | }
1050 | }
1051 | layer {
1052 | name: "conv4_3_norm_mbox_priorbox"
1053 | type: "PriorBox"
1054 | bottom: "conv4_3_norm"
1055 | bottom: "data"
1056 | top: "conv4_3_norm_mbox_priorbox"
1057 | prior_box_param {
1058 | min_size: 30.0
1059 | max_size: 60.0
1060 | aspect_ratio: 2.0
1061 | flip: true
1062 | clip: false
1063 | variance: 0.10000000149
1064 | variance: 0.10000000149
1065 | variance: 0.20000000298
1066 | variance: 0.20000000298
1067 | step: 8.0
1068 | offset: 0.5
1069 | }
1070 | }
1071 | layer {
1072 | name: "fc7_mbox_loc"
1073 | type: "Convolution"
1074 | bottom: "fc7"
1075 | top: "fc7_mbox_loc"
1076 | param {
1077 | lr_mult: 1.0
1078 | decay_mult: 1.0
1079 | }
1080 | param {
1081 | lr_mult: 2.0
1082 | decay_mult: 0.0
1083 | }
1084 | convolution_param {
1085 | num_output: 24
1086 | pad: 1
1087 | kernel_size: 3
1088 | stride: 1
1089 | weight_filler {
1090 | type: "xavier"
1091 | }
1092 | bias_filler {
1093 | type: "constant"
1094 | value: 0.0
1095 | }
1096 | }
1097 | }
1098 | layer {
1099 | name: "fc7_mbox_loc_perm"
1100 | type: "Permute"
1101 | bottom: "fc7_mbox_loc"
1102 | top: "fc7_mbox_loc_perm"
1103 | permute_param {
1104 | order: 0
1105 | order: 2
1106 | order: 3
1107 | order: 1
1108 | }
1109 | }
1110 | layer {
1111 | name: "fc7_mbox_loc_flat"
1112 | type: "Flatten"
1113 | bottom: "fc7_mbox_loc_perm"
1114 | top: "fc7_mbox_loc_flat"
1115 | flatten_param {
1116 | axis: 1
1117 | }
1118 | }
1119 | layer {
1120 | name: "fc7_mbox_conf_hand_detection"
1121 | type: "Convolution"
1122 | bottom: "fc7"
1123 | top: "fc7_mbox_conf_hand_detection"
1124 | param {
1125 | lr_mult: 1.0
1126 | decay_mult: 1.0
1127 | }
1128 | param {
1129 | lr_mult: 2.0
1130 | decay_mult: 0.0
1131 | }
1132 | convolution_param {
1133 | num_output: 12
1134 | pad: 1
1135 | kernel_size: 3
1136 | stride: 1
1137 | weight_filler {
1138 | type: "xavier"
1139 | }
1140 | bias_filler {
1141 | type: "constant"
1142 | value: 0.0
1143 | }
1144 | }
1145 | }
1146 | layer {
1147 | name: "fc7_mbox_conf_hand_detection_perm"
1148 | type: "Permute"
1149 | bottom: "fc7_mbox_conf_hand_detection"
1150 | top: "fc7_mbox_conf_hand_detection_perm"
1151 | permute_param {
1152 | order: 0
1153 | order: 2
1154 | order: 3
1155 | order: 1
1156 | }
1157 | }
1158 | layer {
1159 | name: "fc7_mbox_conf_hand_detection_flat"
1160 | type: "Flatten"
1161 | bottom: "fc7_mbox_conf_hand_detection_perm"
1162 | top: "fc7_mbox_conf_hand_detection_flat"
1163 | flatten_param {
1164 | axis: 1
1165 | }
1166 | }
1167 | layer {
1168 | name: "fc7_mbox_priorbox"
1169 | type: "PriorBox"
1170 | bottom: "fc7"
1171 | bottom: "data"
1172 | top: "fc7_mbox_priorbox"
1173 | prior_box_param {
1174 | min_size: 60.0
1175 | max_size: 111.0
1176 | aspect_ratio: 2.0
1177 | aspect_ratio: 3.0
1178 | flip: true
1179 | clip: false
1180 | variance: 0.10000000149
1181 | variance: 0.10000000149
1182 | variance: 0.20000000298
1183 | variance: 0.20000000298
1184 | step: 16.0
1185 | offset: 0.5
1186 | }
1187 | }
1188 | layer {
1189 | name: "conv6_2_mbox_loc"
1190 | type: "Convolution"
1191 | bottom: "conv6_2"
1192 | top: "conv6_2_mbox_loc"
1193 | param {
1194 | lr_mult: 1.0
1195 | decay_mult: 1.0
1196 | }
1197 | param {
1198 | lr_mult: 2.0
1199 | decay_mult: 0.0
1200 | }
1201 | convolution_param {
1202 | num_output: 24
1203 | pad: 1
1204 | kernel_size: 3
1205 | stride: 1
1206 | weight_filler {
1207 | type: "xavier"
1208 | }
1209 | bias_filler {
1210 | type: "constant"
1211 | value: 0.0
1212 | }
1213 | }
1214 | }
1215 | layer {
1216 | name: "conv6_2_mbox_loc_perm"
1217 | type: "Permute"
1218 | bottom: "conv6_2_mbox_loc"
1219 | top: "conv6_2_mbox_loc_perm"
1220 | permute_param {
1221 | order: 0
1222 | order: 2
1223 | order: 3
1224 | order: 1
1225 | }
1226 | }
1227 | layer {
1228 | name: "conv6_2_mbox_loc_flat"
1229 | type: "Flatten"
1230 | bottom: "conv6_2_mbox_loc_perm"
1231 | top: "conv6_2_mbox_loc_flat"
1232 | flatten_param {
1233 | axis: 1
1234 | }
1235 | }
1236 | layer {
1237 | name: "conv6_2_mbox_conf_hand_detection"
1238 | type: "Convolution"
1239 | bottom: "conv6_2"
1240 | top: "conv6_2_mbox_conf_hand_detection"
1241 | param {
1242 | lr_mult: 1.0
1243 | decay_mult: 1.0
1244 | }
1245 | param {
1246 | lr_mult: 2.0
1247 | decay_mult: 0.0
1248 | }
1249 | convolution_param {
1250 | num_output: 12
1251 | pad: 1
1252 | kernel_size: 3
1253 | stride: 1
1254 | weight_filler {
1255 | type: "xavier"
1256 | }
1257 | bias_filler {
1258 | type: "constant"
1259 | value: 0.0
1260 | }
1261 | }
1262 | }
1263 | layer {
1264 | name: "conv6_2_mbox_conf_hand_detection_perm"
1265 | type: "Permute"
1266 | bottom: "conv6_2_mbox_conf_hand_detection"
1267 | top: "conv6_2_mbox_conf_hand_detection_perm"
1268 | permute_param {
1269 | order: 0
1270 | order: 2
1271 | order: 3
1272 | order: 1
1273 | }
1274 | }
1275 | layer {
1276 | name: "conv6_2_mbox_conf_hand_detection_flat"
1277 | type: "Flatten"
1278 | bottom: "conv6_2_mbox_conf_hand_detection_perm"
1279 | top: "conv6_2_mbox_conf_hand_detection_flat"
1280 | flatten_param {
1281 | axis: 1
1282 | }
1283 | }
1284 | layer {
1285 | name: "conv6_2_mbox_priorbox"
1286 | type: "PriorBox"
1287 | bottom: "conv6_2"
1288 | bottom: "data"
1289 | top: "conv6_2_mbox_priorbox"
1290 | prior_box_param {
1291 | min_size: 111.0
1292 | max_size: 162.0
1293 | aspect_ratio: 2.0
1294 | aspect_ratio: 3.0
1295 | flip: true
1296 | clip: false
1297 | variance: 0.10000000149
1298 | variance: 0.10000000149
1299 | variance: 0.20000000298
1300 | variance: 0.20000000298
1301 | step: 32.0
1302 | offset: 0.5
1303 | }
1304 | }
1305 | layer {
1306 | name: "conv7_2_mbox_loc"
1307 | type: "Convolution"
1308 | bottom: "conv7_2"
1309 | top: "conv7_2_mbox_loc"
1310 | param {
1311 | lr_mult: 1.0
1312 | decay_mult: 1.0
1313 | }
1314 | param {
1315 | lr_mult: 2.0
1316 | decay_mult: 0.0
1317 | }
1318 | convolution_param {
1319 | num_output: 24
1320 | pad: 1
1321 | kernel_size: 3
1322 | stride: 1
1323 | weight_filler {
1324 | type: "xavier"
1325 | }
1326 | bias_filler {
1327 | type: "constant"
1328 | value: 0.0
1329 | }
1330 | }
1331 | }
1332 | layer {
1333 | name: "conv7_2_mbox_loc_perm"
1334 | type: "Permute"
1335 | bottom: "conv7_2_mbox_loc"
1336 | top: "conv7_2_mbox_loc_perm"
1337 | permute_param {
1338 | order: 0
1339 | order: 2
1340 | order: 3
1341 | order: 1
1342 | }
1343 | }
1344 | layer {
1345 | name: "conv7_2_mbox_loc_flat"
1346 | type: "Flatten"
1347 | bottom: "conv7_2_mbox_loc_perm"
1348 | top: "conv7_2_mbox_loc_flat"
1349 | flatten_param {
1350 | axis: 1
1351 | }
1352 | }
1353 | layer {
1354 | name: "conv7_2_mbox_conf_hand_detection"
1355 | type: "Convolution"
1356 | bottom: "conv7_2"
1357 | top: "conv7_2_mbox_conf_hand_detection"
1358 | param {
1359 | lr_mult: 1.0
1360 | decay_mult: 1.0
1361 | }
1362 | param {
1363 | lr_mult: 2.0
1364 | decay_mult: 0.0
1365 | }
1366 | convolution_param {
1367 | num_output: 12
1368 | pad: 1
1369 | kernel_size: 3
1370 | stride: 1
1371 | weight_filler {
1372 | type: "xavier"
1373 | }
1374 | bias_filler {
1375 | type: "constant"
1376 | value: 0.0
1377 | }
1378 | }
1379 | }
1380 | layer {
1381 | name: "conv7_2_mbox_conf_hand_detection_perm"
1382 | type: "Permute"
1383 | bottom: "conv7_2_mbox_conf_hand_detection"
1384 | top: "conv7_2_mbox_conf_hand_detection_perm"
1385 | permute_param {
1386 | order: 0
1387 | order: 2
1388 | order: 3
1389 | order: 1
1390 | }
1391 | }
1392 | layer {
1393 | name: "conv7_2_mbox_conf_hand_detection_flat"
1394 | type: "Flatten"
1395 | bottom: "conv7_2_mbox_conf_hand_detection_perm"
1396 | top: "conv7_2_mbox_conf_hand_detection_flat"
1397 | flatten_param {
1398 | axis: 1
1399 | }
1400 | }
1401 | layer {
1402 | name: "conv7_2_mbox_priorbox"
1403 | type: "PriorBox"
1404 | bottom: "conv7_2"
1405 | bottom: "data"
1406 | top: "conv7_2_mbox_priorbox"
1407 | prior_box_param {
1408 | min_size: 162.0
1409 | max_size: 213.0
1410 | aspect_ratio: 2.0
1411 | aspect_ratio: 3.0
1412 | flip: true
1413 | clip: false
1414 | variance: 0.10000000149
1415 | variance: 0.10000000149
1416 | variance: 0.20000000298
1417 | variance: 0.20000000298
1418 | step: 64.0
1419 | offset: 0.5
1420 | }
1421 | }
1422 | layer {
1423 | name: "conv8_2_mbox_loc"
1424 | type: "Convolution"
1425 | bottom: "conv8_2"
1426 | top: "conv8_2_mbox_loc"
1427 | param {
1428 | lr_mult: 1.0
1429 | decay_mult: 1.0
1430 | }
1431 | param {
1432 | lr_mult: 2.0
1433 | decay_mult: 0.0
1434 | }
1435 | convolution_param {
1436 | num_output: 16
1437 | pad: 1
1438 | kernel_size: 3
1439 | stride: 1
1440 | weight_filler {
1441 | type: "xavier"
1442 | }
1443 | bias_filler {
1444 | type: "constant"
1445 | value: 0.0
1446 | }
1447 | }
1448 | }
1449 | layer {
1450 | name: "conv8_2_mbox_loc_perm"
1451 | type: "Permute"
1452 | bottom: "conv8_2_mbox_loc"
1453 | top: "conv8_2_mbox_loc_perm"
1454 | permute_param {
1455 | order: 0
1456 | order: 2
1457 | order: 3
1458 | order: 1
1459 | }
1460 | }
1461 | layer {
1462 | name: "conv8_2_mbox_loc_flat"
1463 | type: "Flatten"
1464 | bottom: "conv8_2_mbox_loc_perm"
1465 | top: "conv8_2_mbox_loc_flat"
1466 | flatten_param {
1467 | axis: 1
1468 | }
1469 | }
1470 | layer {
1471 | name: "conv8_2_mbox_conf_hand_detection"
1472 | type: "Convolution"
1473 | bottom: "conv8_2"
1474 | top: "conv8_2_mbox_conf_hand_detection"
1475 | param {
1476 | lr_mult: 1.0
1477 | decay_mult: 1.0
1478 | }
1479 | param {
1480 | lr_mult: 2.0
1481 | decay_mult: 0.0
1482 | }
1483 | convolution_param {
1484 | num_output: 8
1485 | pad: 1
1486 | kernel_size: 3
1487 | stride: 1
1488 | weight_filler {
1489 | type: "xavier"
1490 | }
1491 | bias_filler {
1492 | type: "constant"
1493 | value: 0.0
1494 | }
1495 | }
1496 | }
1497 | layer {
1498 | name: "conv8_2_mbox_conf_hand_detection_perm"
1499 | type: "Permute"
1500 | bottom: "conv8_2_mbox_conf_hand_detection"
1501 | top: "conv8_2_mbox_conf_hand_detection_perm"
1502 | permute_param {
1503 | order: 0
1504 | order: 2
1505 | order: 3
1506 | order: 1
1507 | }
1508 | }
1509 | layer {
1510 | name: "conv8_2_mbox_conf_hand_detection_flat"
1511 | type: "Flatten"
1512 | bottom: "conv8_2_mbox_conf_hand_detection_perm"
1513 | top: "conv8_2_mbox_conf_hand_detection_flat"
1514 | flatten_param {
1515 | axis: 1
1516 | }
1517 | }
1518 | layer {
1519 | name: "conv8_2_mbox_priorbox"
1520 | type: "PriorBox"
1521 | bottom: "conv8_2"
1522 | bottom: "data"
1523 | top: "conv8_2_mbox_priorbox"
1524 | prior_box_param {
1525 | min_size: 213.0
1526 | max_size: 264.0
1527 | aspect_ratio: 2.0
1528 | flip: true
1529 | clip: false
1530 | variance: 0.10000000149
1531 | variance: 0.10000000149
1532 | variance: 0.20000000298
1533 | variance: 0.20000000298
1534 | step: 100.0
1535 | offset: 0.5
1536 | }
1537 | }
1538 | layer {
1539 | name: "conv9_2_mbox_loc"
1540 | type: "Convolution"
1541 | bottom: "conv9_2"
1542 | top: "conv9_2_mbox_loc"
1543 | param {
1544 | lr_mult: 1.0
1545 | decay_mult: 1.0
1546 | }
1547 | param {
1548 | lr_mult: 2.0
1549 | decay_mult: 0.0
1550 | }
1551 | convolution_param {
1552 | num_output: 16
1553 | pad: 1
1554 | kernel_size: 3
1555 | stride: 1
1556 | weight_filler {
1557 | type: "xavier"
1558 | }
1559 | bias_filler {
1560 | type: "constant"
1561 | value: 0.0
1562 | }
1563 | }
1564 | }
1565 | layer {
1566 | name: "conv9_2_mbox_loc_perm"
1567 | type: "Permute"
1568 | bottom: "conv9_2_mbox_loc"
1569 | top: "conv9_2_mbox_loc_perm"
1570 | permute_param {
1571 | order: 0
1572 | order: 2
1573 | order: 3
1574 | order: 1
1575 | }
1576 | }
1577 | layer {
1578 | name: "conv9_2_mbox_loc_flat"
1579 | type: "Flatten"
1580 | bottom: "conv9_2_mbox_loc_perm"
1581 | top: "conv9_2_mbox_loc_flat"
1582 | flatten_param {
1583 | axis: 1
1584 | }
1585 | }
1586 | layer {
1587 | name: "conv9_2_mbox_conf_hand_detection"
1588 | type: "Convolution"
1589 | bottom: "conv9_2"
1590 | top: "conv9_2_mbox_conf_hand_detection"
1591 | param {
1592 | lr_mult: 1.0
1593 | decay_mult: 1.0
1594 | }
1595 | param {
1596 | lr_mult: 2.0
1597 | decay_mult: 0.0
1598 | }
1599 | convolution_param {
1600 | num_output: 8
1601 | pad: 1
1602 | kernel_size: 3
1603 | stride: 1
1604 | weight_filler {
1605 | type: "xavier"
1606 | }
1607 | bias_filler {
1608 | type: "constant"
1609 | value: 0.0
1610 | }
1611 | }
1612 | }
1613 | layer {
1614 | name: "conv9_2_mbox_conf_hand_detection_perm"
1615 | type: "Permute"
1616 | bottom: "conv9_2_mbox_conf_hand_detection"
1617 | top: "conv9_2_mbox_conf_hand_detection_perm"
1618 | permute_param {
1619 | order: 0
1620 | order: 2
1621 | order: 3
1622 | order: 1
1623 | }
1624 | }
1625 | layer {
1626 | name: "conv9_2_mbox_conf_hand_detection_flat"
1627 | type: "Flatten"
1628 | bottom: "conv9_2_mbox_conf_hand_detection_perm"
1629 | top: "conv9_2_mbox_conf_hand_detection_flat"
1630 | flatten_param {
1631 | axis: 1
1632 | }
1633 | }
1634 | layer {
1635 | name: "conv9_2_mbox_priorbox"
1636 | type: "PriorBox"
1637 | bottom: "conv9_2"
1638 | bottom: "data"
1639 | top: "conv9_2_mbox_priorbox"
1640 | prior_box_param {
1641 | min_size: 264.0
1642 | max_size: 315.0
1643 | aspect_ratio: 2.0
1644 | flip: true
1645 | clip: false
1646 | variance: 0.10000000149
1647 | variance: 0.10000000149
1648 | variance: 0.20000000298
1649 | variance: 0.20000000298
1650 | step: 300.0
1651 | offset: 0.5
1652 | }
1653 | }
1654 | layer {
1655 | name: "mbox_loc"
1656 | type: "Concat"
1657 | bottom: "conv4_3_norm_mbox_loc_flat"
1658 | bottom: "fc7_mbox_loc_flat"
1659 | bottom: "conv6_2_mbox_loc_flat"
1660 | bottom: "conv7_2_mbox_loc_flat"
1661 | bottom: "conv8_2_mbox_loc_flat"
1662 | bottom: "conv9_2_mbox_loc_flat"
1663 | top: "mbox_loc"
1664 | concat_param {
1665 | axis: 1
1666 | }
1667 | }
1668 | layer {
1669 | name: "mbox_conf"
1670 | type: "Concat"
1671 | bottom: "conv4_3_norm_mbox_conf_hand_detection_flat"
1672 | bottom: "fc7_mbox_conf_hand_detection_flat"
1673 | bottom: "conv6_2_mbox_conf_hand_detection_flat"
1674 | bottom: "conv7_2_mbox_conf_hand_detection_flat"
1675 | bottom: "conv8_2_mbox_conf_hand_detection_flat"
1676 | bottom: "conv9_2_mbox_conf_hand_detection_flat"
1677 | top: "mbox_conf"
1678 | concat_param {
1679 | axis: 1
1680 | }
1681 | }
1682 | layer {
1683 | name: "mbox_priorbox"
1684 | type: "Concat"
1685 | bottom: "conv4_3_norm_mbox_priorbox"
1686 | bottom: "fc7_mbox_priorbox"
1687 | bottom: "conv6_2_mbox_priorbox"
1688 | bottom: "conv7_2_mbox_priorbox"
1689 | bottom: "conv8_2_mbox_priorbox"
1690 | bottom: "conv9_2_mbox_priorbox"
1691 | top: "mbox_priorbox"
1692 | concat_param {
1693 | axis: 2
1694 | }
1695 | }
1696 | layer {
1697 | name: "mbox_loss"
1698 | type: "MultiBoxLoss"
1699 | bottom: "mbox_loc"
1700 | bottom: "mbox_conf"
1701 | bottom: "mbox_priorbox"
1702 | bottom: "label"
1703 | top: "mbox_loss"
1704 | include {
1705 | phase: TRAIN
1706 | }
1707 | propagate_down: true
1708 | propagate_down: true
1709 | propagate_down: false
1710 | propagate_down: false
1711 | loss_param {
1712 | normalization: VALID
1713 | }
1714 | multibox_loss_param {
1715 | loc_loss_type: SMOOTH_L1
1716 | conf_loss_type: SOFTMAX
1717 | loc_weight: 1.0
1718 | num_classes: 2
1719 | share_location: true
1720 | match_type: PER_PREDICTION
1721 | overlap_threshold: 0.5
1722 | use_prior_for_matching: true
1723 | background_label_id: 0
1724 | use_difficult_gt: true
1725 | neg_pos_ratio: 3.0
1726 | neg_overlap: 0.5
1727 | code_type: CENTER_SIZE
1728 | ignore_cross_boundary_bbox: false
1729 | mining_type: MAX_NEGATIVE
1730 | }
1731 | }
1732 |
1733 |
--------------------------------------------------------------------------------
/other/Hand_Detection/old_README.md:
--------------------------------------------------------------------------------
1 | ### Roadmap of hand keypoint detection
2 | * Step 1
3 | Split the task into three networks with input sizes of proposal-net: 12, refine-net: 24, and output-net: 48. proposal-net and refine-net are trained on the hand detection datasets, while output-net uses the hand keypoint dataset.
4 | After the 12- and 24-sized networks produce their outputs, hard examples are generated and fed, together with the original data, into the next network. The idea is:
5 | 12-net (generates the 12-sized dataset) ---> 24-net (hard examples from 12-net + the generated 24-sized dataset) ---> 48-net
6 | * Data generation
7 |
8 | When generating data, a ground-truth box whose height or width is smaller than 40 is treated as a wrong label rather than a hand. NMS (non-maximum suppression) is applied to the outputs of 12-net and 24-net to remove duplicate boxes, which also reduces computation.
9 | The [mtcnn-caffe](https://github.com/CongWeilin/mtcnn-caffe) reimplementation defines a custom data layer; I would rather not do that. I generate HDF5 files instead, which is more flexible and can cover both the training and testing phases.
10 |
11 | * Labels
12 |
13 | Therefore, when preparing the data, each image is annotated with 15 label columns:
14 |
15 | 1. Column 1: sample flag; 1 positive, 0 negative, -1 partial sample, 3 keypoint sample
16 |
17 | 2. Columns 2-5: bounding-box offsets (float); set to -1 for samples without box annotations
18 |
19 | 3. 42 columns: keypoint offsets (float); set to -1 for samples without this annotation
20 |
21 |
22 | > Modify softmax_loss_layer.cpp to add a check so that the loss is only computed for labels 1 and 0
23 | Modify euclidean_loss_layer.cpp to add a check so that entries set to -1 are excluded from the loss
24 |
25 | In other words, the HDF5 file holds four blocks: besides data there are label (positive/negative/partial flags), points (21 keypoints, hence 42 columns), and roi (box information, two corner points, four columns). A minimal sketch of this layout follows.
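The following is a minimal, hypothetical sketch of that HDF5 layout; the array names `data`, `label`, `roi`, `points` come from the description above, while the shapes, sample count, and file name are assumptions rather than the repository's actual generation script:

```python
# Hypothetical sketch of the HDF5 layout described above; shapes and file name are assumed.
import h5py
import numpy as np

N = 16                                           # assumed number of samples
data   = np.zeros((N, 3, 48, 48), np.float32)    # image crops
label  = np.full((N, 1), -1, np.float32)         # 1 pos, 0 neg, -1 partial, 3 keypoint
roi    = np.full((N, 4), -1, np.float32)         # box offsets, -1 where unused
points = np.full((N, 42), -1, np.float32)        # 21 keypoints -> 42 offset columns, -1 where unused

with h5py.File('train_48.h5', 'w') as f:         # assumed file name
    f.create_dataset('data', data=data)
    f.create_dataset('label', data=label)
    f.create_dataset('roi', data=roi)
    f.create_dataset('points', data=points)
```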
26 |
27 | * Custom layers
28 | 1. Define a custom fc layer that only forward-propagates samples whose label is not -1. This way there is no need to distinguish which of the three networks is being trained, and no separate scripts are needed to generate the datasets. It is effectively a modified softmax_loss_layer.
29 | 2. Define a custom euclidean_loss_layer that, likewise, skips entries set to -1.
30 |
31 | ### Datasets
32 | Hand pose estimation involves two problems: hand detection and hand keypoint detection. The hand is first extracted from the original image and the keypoints are then regressed, so two types of datasets are needed.
33 | 1. Hand detection datasets: a rectangle marks the position of the hand in the image, annotated with the rectangle's coordinates in the original image.
34 | 2. Hand keypoint detection datasets: annotated with the coordinates of keypoints such as the palm center, joints, and fingertips.
35 |
36 | The datasets we adopt are therefore the following:
37 | #### Hand detection datasets
38 | * [Hand Dataset by Arpit Mittal, Andrew Zisserman and Phil Torr](http://www.robots.ox.ac.uk/~vgg/data/hands/)
39 | A comprehensive dataset of hand images collected from a variety of public image dataset sources. In total 13,050 hand instances are annotated. Hand instances larger than a fixed bounding-box area (1,500 sq. pixels) are considered "big enough" for detection and are used for evaluation, giving around 4,170 high-quality hand instances. No restriction was placed on people's pose or visibility, nor on the environment, while collecting the data. In each image, every hand that a human can clearly perceive is annotated. The annotation is a bounding rectangle, which does not have to be axis-aligned but is oriented with respect to the wrist.
40 |
41 |
159 | Dataset preview:
160 |
161 |

162 |
163 |
164 | * [VIVA Hand Detection Dataset](http://cvrr.ucsd.edu/vivachallenge/index.php/hands/hand-detection/)
165 | Driver and passenger hands are annotated with 2D bounding boxes. The dataset consists of 54 videos collected in naturalistic driving settings, covering illumination changes, large hand movements, and common occlusion. Some of the data was captured from the collection platform, and some was contributed from YouTube.
166 | Dataset preview:
167 |
168 |

169 |
170 | #### Hand keypoint detection datasets
171 | * [CMU Hand Database](http://domedb.perception.cs.cmu.edu/handdb.html)
172 | Collected by CMU from several public datasets, with hand keypoints annotated manually. Additional hand detection data is also generated from enlarged rectangles that enclose the keypoints.
173 | Dataset preview:
174 |
175 |

176 |
177 | ### Preprocessing
178 | We mainly use the hand detection datasets and the hand keypoint detection datasets. The hand detection datasets contain hand bounding-box annotations and are used for training the detection task; the hand keypoint detection datasets contain both box annotations and keypoint information and are used for training the keypoints. The training set is divided into four kinds of samples: negative, positive, partial, and keypoint samples, in a ratio of $3:1:1:2$.
179 | #### Extracting positive, negative, and partial samples
180 | 1. Randomly sample boxes from the hand detection dataset and compute the $IOU$ with the annotations: above $0.65$ is a positive sample, between $0.4$ and $0.65$ a partial sample, and below $0.4$ a negative sample.
181 | $IOU$: simply put, the overlap ratio between the window produced by the model and the originally annotated window, i.e. the intersection of the detection result and the ground truth over their union:
182 | $$ IOU = \frac{DetectionResult \cap GroundTruth}{DetectionResult \cup GroundTruth} $$
183 | 2. Compute the box offsets. For a box, $(x_1, y_1)$ is the top-left corner and $(x_2, y_2)$ the bottom-right corner; the newly cropped box is
184 | $(x_{n1}, y_{n1})$, $(x_{n2}, y_{n2})$ with $width$, $height$. Then
185 | $$ offset_{x1} = (x_1 - x_{n1}) / width $$
186 | The offsets of the other three coordinates are computed in the same way (see the sketch after this list).
187 |
188 | 3. Positive and partial samples both carry box information; negative samples do not need it.
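The following is a minimal sketch of the sampling rule above, using hypothetical box coordinates; only the 0.65/0.4 thresholds and the offset formula come from the description:

```python
# Sketch of the IoU thresholds and offset computation described above (hypothetical boxes).
def iou(box_a, box_b):
    """box = (x1, y1, x2, y2); returns intersection over union."""
    ix1, iy1 = max(box_a[0], box_b[0]), max(box_a[1], box_b[1])
    ix2, iy2 = min(box_a[2], box_b[2]), min(box_a[3], box_b[3])
    inter = max(0, ix2 - ix1) * max(0, iy2 - iy1)
    area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
    area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
    return inter / float(area_a + area_b - inter)

gt   = (50, 60, 150, 180)          # hypothetical ground-truth hand box
crop = (60, 70, 160, 190)          # hypothetical randomly cropped box
width, height = crop[2] - crop[0], crop[3] - crop[1]

ov = iou(crop, gt)
label = 1 if ov > 0.65 else (-1 if ov > 0.4 else 0)   # positive / partial / negative
# Box offsets relative to the crop, normalized by the crop size:
offsets = ((gt[0] - crop[0]) / float(width), (gt[1] - crop[1]) / float(height),
           (gt[2] - crop[2]) / float(width), (gt[3] - crop[3]) / float(height))
print(label, offsets)
```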
189 |
190 | #### Extracting keypoint samples
191 | These are extracted from the hand keypoint detection dataset: based on the annotated box, crops are sampled randomly subject to the positive-sample requirement, and the keypoint coordinates are adjusted accordingly.
192 |
193 | ### Roadmap
194 | #### Loss modifications
195 | Training computes three losses at the same time, but each task needs a different loss, so when preparing the data every image is annotated with 15 label columns:
196 | 1. Column 1: sample flag; 1 positive, 0 negative, 2 partial sample, 3 keypoint sample
197 | 2. Columns 2-5: box offsets (float); set to -1 for samples without box annotations
198 | 3. Columns 6-15: keypoint offsets (float); set to -1 for samples without keypoint annotations
199 |
200 | Once the labels are prepared, the following measures are taken during training (a sketch of the masked losses follows this list):
201 | 1. A custom softmax_loss with an added check so that the loss is only computed for labels 1 and 0.
202 | 2. A custom euclidean_loss with an added check so that entries set to -1 are excluded from the loss.
203 | 3. Hard example selection: for the classification task, online hard example mining is used, i.e. during training the per-sample losses are sorted and back-propagation is performed for only 70% of the samples.
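The following is a minimal numpy sketch of the masking in items 1-2, under assumed label conventions and with made-up values; the repository implements this inside custom Caffe loss layers, not in Python:

```python
# Sketch of the masked losses described above (assumed label conventions, hypothetical values).
import numpy as np

labels   = np.array([1, 0, -1, 3])                 # positive, negative, partial, keypoint samples
cls_prob = np.array([0.9, 0.2, 0.6, 0.7])          # predicted P(hand) per sample
roi_pred = np.random.rand(4, 4)                    # predicted box offsets
roi_gt   = np.array([[0.1, 0.0, -0.1, 0.2],
                     [-1.0, -1.0, -1.0, -1.0],     # rows of -1 carry no box annotation
                     [0.0, 0.1, 0.1, 0.0],
                     [-1.0, -1.0, -1.0, -1.0]])

# Classification (cross-entropy) loss only where the label is 0 or 1:
cls_mask = (labels == 0) | (labels == 1)
eps = 1e-12
y = labels[cls_mask].astype(float)
p = cls_prob[cls_mask]
cls_loss = -np.mean(y * np.log(p + eps) + (1 - y) * np.log(1 - p + eps))

# Euclidean (box-regression) loss only for rows that are not all -1:
roi_mask = ~np.all(roi_gt == -1, axis=1)
roi_loss = 0.5 * np.mean(np.sum((roi_pred[roi_mask] - roi_gt[roi_mask]) ** 2, axis=1))
print(cls_loss, roi_loss)
```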
204 |
205 | #### Network description
206 | There are three stages: classification, bounding-box regression, and landmark detection.
207 | 1. stage1: on top of an image pyramid, a fully convolutional network performs detection, and bounding-box regression together with non-maximum suppression (NMS) merges highly overlapping candidates. This step produces candidate windows for hand regions together with box regression vectors, and the boxes are used to calibrate the candidate windows.
208 | 2. stage2: all windows passed on by stage1 are examined further, and bounding-box regression and NMS again remove false-positive regions.
209 | 3. stage3: similar in role to stage2, but stage3 applies more supervision and a stronger constraint to the hand region, namely the hand keypoints, so stage3 also outputs the hand keypoints. A small NMS sketch follows this list.
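The following is a minimal greedy NMS sketch, as referenced in the stages above; the boxes, scores, and 0.5 threshold are hypothetical, and this is not the repository's implementation:

```python
# Greedy non-maximum suppression sketch (hypothetical inputs).
import numpy as np

def nms(boxes, scores, iou_thresh=0.5):
    """boxes: (N, 4) array of x1, y1, x2, y2; returns indices of the kept boxes."""
    x1, y1, x2, y2 = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3]
    areas = (x2 - x1) * (y2 - y1)
    order = scores.argsort()[::-1]            # highest score first
    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(int(i))
        ix1 = np.maximum(x1[i], x1[order[1:]])
        iy1 = np.maximum(y1[i], y1[order[1:]])
        ix2 = np.minimum(x2[i], x2[order[1:]])
        iy2 = np.minimum(y2[i], y2[order[1:]])
        inter = np.maximum(0.0, ix2 - ix1) * np.maximum(0.0, iy2 - iy1)
        iou = inter / (areas[i] + areas[order[1:]] - inter)
        order = order[1:][iou <= iou_thresh]  # drop boxes that overlap the kept one too much
    return keep

boxes  = np.array([[10, 10, 60, 60], [12, 12, 62, 62], [100, 100, 150, 150]], dtype=float)
scores = np.array([0.9, 0.8, 0.7])
print(nms(boxes, scores))  # expected: [0, 2]
```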
210 |
--------------------------------------------------------------------------------
/other/Hand_Detection/pic/demo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/other/Hand_Detection/pic/demo.jpg
--------------------------------------------------------------------------------
/other/Hand_Detection/pic/example_image.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/other/Hand_Detection/pic/example_image.jpg
--------------------------------------------------------------------------------
/other/Hand_Detection/pic/width_and_height.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/other/Hand_Detection/pic/width_and_height.png
--------------------------------------------------------------------------------
/other/Hand_Detection/ssd_camera.py:
--------------------------------------------------------------------------------
1 | import cv2
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 | import sys
5 | sys.path.insert(0, 'caffe/python')
6 | import caffe
7 | from utils.ssd_net import *
8 | import time
9 | import urllib
10 |
11 |
12 | ## Use local camera
13 | # cap = cv2.VideoCapture(0)
14 | # # width = 720
15 | # # height = 480
16 | width = 640
17 | height = 480
18 | # cap.set(cv2.CAP_PROP_FRAME_WIDTH, width)
19 | # cap.set(cv2.CAP_PROP_FRAME_HEIGHT, height)
20 |
21 | ## Use ipcam
22 | # url = r"http://192.168.1.190:8080/videofeed"
23 | # capture = cv2.VideoCapture(url)
24 |
25 | # Replace the URL with your own IPwebcam shot.jpg IP:port
26 | url = 'http://192.168.1.190:8080/shot.jpg'
27 |
28 |
29 | model_def = 'model/deploy.prototxt'
30 | model_weights = 'model/snapshot/VGG_HAND_SSD_300x300_iter_50000.caffemodel'
31 |
32 | ssd_net = SSD_NET(model_weights, model_def, GPU_MODE=True, threshold=0.7)
33 |
34 | while True:
35 | # get a frame
36 | # start_time = time.time()
37 | # ret, frame = capture.read()
38 |
39 | # Use urllib to get the image from the IP camera
40 | imgResp = urllib.urlopen(url)
41 |
42 |     # Use numpy to convert the response bytes into an array
43 | imgNp = np.array(bytearray(imgResp.read()),dtype=np.uint8)
44 |
45 |     # Finally decode the array into an OpenCV-usable image
46 | frame = cv2.imdecode(imgNp,-1)
47 |
48 | start_time = time.time()
49 |
50 |     # convert the frame and run detection
51 | try:
52 | image_np = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
53 | except:
54 | print("Error converting to RGB")
55 |
56 | top_label_indices, top_conf, top_xmin, top_ymin, top_xmax, top_ymax = ssd_net.detect(image_np/255.0)
57 | print(image_np.shape)
58 |
59 | print(top_conf)
60 | print(top_label_indices)
61 | for i in range(len(top_conf)):
62 | xmin = int(round(top_xmin[i] * width))
63 | ymin = int(round(top_ymin[i] * height))
64 | xmax = int(round(top_xmax[i] * width))
65 | ymax = int(round(top_ymax[i] * height))
66 | print(xmin, ymin, xmax, ymax, top_conf[i])
67 | # if np.sum(top_xmin[i]<0) > 0 or np.sum(top_xmax[i]<0) > 0 or np.sum(top_ymin[i]<0) > 0 or np.sum(top_ymax[i]<0) > 0:
68 | # continue
69 | cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), (0, 255, 0), 2)
70 | # # time.sleep(0.1)
71 | fps = 1/(time.time() - start_time)
72 | cv2.putText(frame, 'FPS: %d' % fps, (20, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
73 |
74 | cv2.imshow("capture", frame)
75 |
76 | if cv2.waitKey(1) == 27:
77 | break # esc to quit
78 |
79 | # capture.release()
80 | cv2.destroyAllWindows()
--------------------------------------------------------------------------------
/other/Hand_Detection/ssd_detection.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | import sys
4 | sys.path.insert(0, 'caffe/python')
5 | import caffe
6 | from utils.ssd_net import *
7 |
8 | plt.rcParams['figure.figsize'] = (10, 10)
9 | plt.rcParams['image.interpolation'] = 'nearest'
10 | plt.rcParams['image.cmap'] = 'gray'
11 |
12 | model_def = 'model/deploy.prototxt'
13 | model_weights = 'model/snapshot/VGG_HAND_SSD_300x300_iter_50000.caffemodel'
14 |
15 | ssd_net = SSD_NET(model_weights, model_def, GPU_MODE=True, threshold=0.5)
16 |
17 | # image = caffe.io.load_image('/Users/hzzone/Desktop/CARDS_COURTYARD_B_T_0324.jpg')
18 | image = caffe.io.load_image('/home/hzzone/Desktop/2.jpg')
19 |
20 | top_label_indices, top_conf, top_xmin, top_ymin, top_xmax, top_ymax = ssd_net.detect(image)
21 |
22 | # print(top_label_indices, top_conf, top_xmin, top_ymin, top_xmax, top_ymax)
23 |
24 | colors = plt.cm.hsv(np.linspace(0, 1, 21)).tolist()
25 |
26 | plt.imshow(image)
27 | currentAxis = plt.gca()
28 |
29 | for i in xrange(top_conf.shape[0]):
30 | xmin = int(round(top_xmin[i] * image.shape[1]))
31 | ymin = int(round(top_ymin[i] * image.shape[0]))
32 | xmax = int(round(top_xmax[i] * image.shape[1]))
33 | ymax = int(round(top_ymax[i] * image.shape[0]))
34 | score = top_conf[i]
35 | label = int(top_label_indices[i])
36 | # label_name = top_labels[i]
37 | label_name = label
38 | display_txt = '%s: %.2f' % ('hand', score)
39 | coords = (xmin, ymin), xmax-xmin+1, ymax-ymin+1
40 | color = colors[label]
41 | currentAxis.add_patch(plt.Rectangle(*coords, fill=False, edgecolor=color, linewidth=2))
42 | currentAxis.text(xmin, ymin, display_txt, bbox={'facecolor':color, 'alpha':0.5})
43 |
44 | plt.show()
45 |
46 |
--------------------------------------------------------------------------------
/other/Hand_Detection/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/other/Hand_Detection/utils/__init__.py
--------------------------------------------------------------------------------
/other/Hand_Detection/utils/mAP.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | def eval_mAP(predict_file, ground_truth_file):
4 | predict_dict = dict()
5 | ground_truth_dict = dict()
6 | def get_info(info_file, info_dict):
7 | bbox_num = 0
8 | first_line = True
9 | with open(info_file) as fr:
10 | for line in fr:
11 | if first_line:
12 | first_line = False
13 | continue
14 | if len(line.strip().split(',')) == 6:
15 | line_data = line.strip().split(',')
16 | im_id = line_data[0]
17 | xmin,ymin,xmax,ymax,score = map(float, line_data[1:])
18 | else:
19 | im_id,xmin,ymin,xmax,ymax = map(float, line.strip().split(','))
20 | score = 1.
21 | if not im_id in info_dict:
22 | info_dict[im_id] = list()
23 | info_dict[im_id].append([xmin,ymin,xmax,ymax,score])
24 | bbox_num+=1
25 | return bbox_num
26 |
27 | predict_bbox_num = get_info(predict_file, predict_dict)
28 | ground_truth_bbox_num = get_info(ground_truth_file, ground_truth_dict)
29 | score_list = list()
30 | match_list = list()
31 |
32 | def iou(predict_bbox, ground_truth_bbox):
33 | predict_area = (predict_bbox[2] - predict_bbox[0])*(predict_bbox[3] - predict_bbox[1])
34 | ground_truth_area = (ground_truth_bbox[2] - ground_truth_bbox[0])*(ground_truth_bbox[3] - ground_truth_bbox[1])
35 | inter_x = min(predict_bbox[2],ground_truth_bbox[2]) - max(predict_bbox[0],ground_truth_bbox[0])
36 | inter_y = min(predict_bbox[3],ground_truth_bbox[3]) - max(predict_bbox[1],ground_truth_bbox[1])
37 | if inter_x<=0 or inter_y<=0:
38 | return 0
39 | inter_area = inter_x*inter_y
40 | return inter_area / (predict_area+ground_truth_area-inter_area)
41 |
42 | def compare(predict_list, ground_truth_list, score_list, match_list):
43 | ground_truth_unuse = [True for i in range(len(ground_truth_list))]
44 | for predict_bbox in predict_list:
45 | match = False
46 | for i in range(len(ground_truth_list)):
47 | if ground_truth_unuse[i]:
48 | if iou(predict_bbox, ground_truth_list[i])>0.5:
49 | match = True
50 | ground_truth_unuse[i] = False
51 | break
52 | score_list.append(predict_bbox[-1])
53 | match_list.append(int(match))
54 |
55 | for key in predict_dict.keys():
56 | compare(predict_dict[key], ground_truth_dict[key], score_list, match_list)
57 |
58 | p = list()
59 | r = list()
60 | predict_num = 0
61 | truth_num = 0
62 | score_match_list = list(zip(score_list, match_list))
63 | score_match_list.sort(key=lambda x:x[0], reverse = True)
64 | for item in score_match_list:
65 | predict_num+=1
66 | truth_num+=item[1]
67 |         p.append(float(truth_num)/ground_truth_bbox_num)  # recall up to this prediction
68 |         r.append(float(truth_num)/predict_num)  # precision up to this prediction
69 | mAP = 0
70 | for i in range(1,len(p)):
71 | mAP += (r[i-1]+r[i])/2*(p[i]-p[i-1])
72 | return p, r, mAP
73 |
74 | if __name__ == "__main__":
75 | p, r, mAP = eval_mAP("/Users/hzzone/Downloads/object_detection_mAP-master/example/val_pred.csv",
76 | "/Users/hzzone/Downloads/object_detection_mAP-master/example/val_gt.csv")
77 | print(mAP)
78 |
--------------------------------------------------------------------------------
/other/Hand_Detection/utils/output.py:
--------------------------------------------------------------------------------
1 | import os
2 | from ssd_net import *
3 | import sys
4 | sys.path.insert(0, '../caffe/python')
5 | import xml.dom.minidom
6 | import csv
7 | import re
8 | import time
9 |
10 | data_dir = '../data'
11 |
12 | def read_xmlfile(file_path):
13 | DomTree = xml.dom.minidom.parse(file_path)
14 | annotation = DomTree.documentElement
15 | objectlist = annotation.getElementsByTagName('object')
16 |     label = os.path.splitext(os.path.basename(file_path))[0]  # image id without the .xml extension
17 | boxes = []
18 | for objects in objectlist:
19 | bndbox = objects.getElementsByTagName('bndbox')[0]
20 | xmin = int(bndbox.getElementsByTagName('xmin')[0].childNodes[0].data)
21 | ymin = int(bndbox.getElementsByTagName('ymin')[0].childNodes[0].data)
22 | xmax = int(bndbox.getElementsByTagName('xmax')[0].childNodes[0].data)
23 | ymax = int(bndbox.getElementsByTagName('ymax')[0].childNodes[0].data)
24 | print(xmin, ymin, xmax, ymax)
25 | boxes.append([label, xmin, ymin, xmax, ymax, 1])
26 | # print(bndbox)
27 | return boxes
28 |
29 |
30 | def output_gt_label(datatset_name):
31 | anno_path = os.path.join(data_dir, datatset_name, 'test', 'Annotations')
32 | # img_dir = os.path.join(data_dir, datatset_name, 'test', 'JPEGImages')
33 | all_boxes = [['id', 'x1', 'y1', 'x2', 'y2', 'score'], ]
34 | for root, dirs, files in os.walk(anno_path):
35 | for xml_file in files:
36 | xml_file_path = os.path.join(root, xml_file)
37 | all_boxes.extend(read_xmlfile(xml_file_path))
38 | with open('../data/gth/{}.csv'.format(datatset_name), 'wb') as csvfile:
39 | csvwriter = csv.writer(csvfile, delimiter=',')
40 | for box in all_boxes:
41 | csvwriter.writerow(box)
42 |
43 | def output(model_def, model_weights, datatset_name):
44 |
45 | img_dir = os.path.join(data_dir, datatset_name, 'test', 'JPEGImages')
46 | ssd_net = SSD_NET(model_weights, model_def, GPU_MODE=True, threshold=0.2)
47 |
48 | output_boxes = [['id', 'x1', 'y1', 'x2', 'y2', 'score'], ]
49 |
50 |
51 | total_time = 0.0
52 |
53 | for img_name in os.listdir(img_dir):
54 | img_path = os.path.join(img_dir, img_name)
55 |         img_name = os.path.splitext(img_name)[0]  # drop the .jpg extension
56 |
57 | image = caffe.io.load_image(img_path)
58 |
59 | start = time.time()
60 |
61 | top_label_indices, top_conf, top_xmin, top_ymin, top_xmax, top_ymax = ssd_net.detect(image)
62 |
63 | total_time = total_time + time.time() - start
64 |
65 | print(img_path)
66 |
67 | for i in xrange(top_conf.shape[0]):
68 | xmin = int(round(top_xmin[i] * image.shape[1]))
69 | ymin = int(round(top_ymin[i] * image.shape[0]))
70 | xmax = int(round(top_xmax[i] * image.shape[1]))
71 | ymax = int(round(top_ymax[i] * image.shape[0]))
72 | score = top_conf[i]
73 | label_indice = top_label_indices[i]
74 |
75 | output_boxes.append([img_name, xmin, ymin, xmax, ymax, score])
76 |
77 | assert label_indice == 1.0
78 |
79 |
80 | iter_times = re.findall('VGG_HAND_SSD_300x300_(.*?).caffemodel', model_weights.split(os.sep)[-1])[0]
81 | print(iter_times)
82 | output_dir = '../output/{}'.format(iter_times)
83 | if not os.path.exists(output_dir):
84 | os.makedirs(output_dir)
85 | output_file = os.path.join(output_dir, '{}.csv'.format(datatset_name))
86 | with open(output_file, 'wb') as csvfile:
87 | csvwriter = csv.writer(csvfile, delimiter=',')
88 | for box in output_boxes:
89 | csvwriter.writerow(box)
90 | return total_time/len(os.listdir(img_dir))
91 |
92 |
93 |
94 | model_def = '../model/deploy.prototxt'
95 | model_weights = '../model/snapshot/VGG_HAND_SSD_300x300_iter_50000.caffemodel'
96 | # model_path = '../model/snapshot'
97 | # total_time = []
98 | # for model_weights in os.listdir(model_path):
99 | # if model_weights.endswith('.caffemodel'):
100 | # total_time.append(output(model_def, os.path.join(model_path, model_weights), 'stanfordhands'))
101 | # total_time.append(output(model_def, os.path.join(model_path, model_weights), 'egohands'))
102 |
103 | print(output(model_def, model_weights, 'stanfordhands'))
104 | print(output(model_def, model_weights, 'egohands'))
105 |
106 | # print(total_time)
107 | # output_gt_label('egohands')
108 | # output_gt_label('stanfordhands')
109 | # read_xmlfile('/Users/hzzone/Desktop/Hand-Keypoint-Detection/data/stanfordhands/test/Annotations/VOC2007_1.xml')
110 |
111 |
--------------------------------------------------------------------------------
/other/Hand_Detection/utils/plot_loss.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import re
3 |
4 | with open('../model/train.log') as f:
5 | data = f.read()
6 |
7 | pattern = re.compile(r'''
8 | I0(.*?)solver.cpp:243] Iteration (.*?), loss = (.*?)
9 | I0(.*?)solver.cpp:259] Train net output #0: mbox_loss = (.*?) \(\* 1 = (.*?) loss\)
10 | I0(.*?)sgd_solver.cpp:138] Iteration (.*?), lr = (.*?)
11 | ''')
12 | results = re.findall(pattern, data)
13 | iter_num = []
14 | total_loss = []
15 | mbox_loss = []
16 | learning_rate = []
17 | print(results)
18 |
19 | for result in results:
20 | iter_num.append(int(result[1]))
21 | total_loss.append(float(result[2]))
22 | mbox_loss.append(float(result[4]))
23 | learning_rate.append(float(result[-1]))
24 |
25 | plt.subplot(311)
26 | plt.plot(iter_num, total_loss)
27 | plt.subplot(312)
28 | plt.plot(iter_num, mbox_loss)
29 | plt.subplot(313)
30 | plt.plot(iter_num, learning_rate)
31 |
32 | plt.show()
33 |
34 |
--------------------------------------------------------------------------------
/other/Hand_Detection/utils/score.py:
--------------------------------------------------------------------------------
1 | import mAP
2 | import os
3 |
4 | p, r, AP = mAP.eval_mAP('/home/hzzone/Hand-Keypoint-Detection/output/iter_50000/egohands.csv', '/home/hzzone/Hand-Keypoint-Detection/data/gth/egohands.csv')
5 | print(AP)
6 | p, r, AP = mAP.eval_mAP('/home/hzzone/Hand-Keypoint-Detection/output/iter_50000/stanfordhands.csv', '/home/hzzone/Hand-Keypoint-Detection/data/gth/stanfordhands.csv')
7 | print(AP)
8 | # for test_data in ['egohands', 'stanfordhands']:
9 | # gth_path = '../data/gth/{}.csv'.format(test_data)
10 | # output_path = [os.path.join('../output', iter_num) for iter_num in os.listdir('../output')]
11 | # for iter_num_output in output_path:
12 | # p, r, AP = mAP.eval_mAP('{}/{}.csv'.format(iter_num_output, test_data), gth_path)
13 | # print(iter_num_output, AP)
14 |
--------------------------------------------------------------------------------
/other/Hand_Detection/utils/ssd_net.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.insert(0, '../caffe/python')
3 | import caffe
4 | import numpy as np
5 | from google.protobuf import text_format
6 | from caffe.proto import caffe_pb2
7 |
8 | def get_labelname(labelmap, labels):
9 | num_labels = len(labelmap.item)
10 | print(labelmap.item[0])
11 | print(num_labels)
12 | labelnames = []
13 | if type(labels) is not list:
14 | labels = [labels]
15 | for label in labels:
16 | found = False
17 |         for i in range(num_labels):
18 | if label == labelmap.item[i].label:
19 | found = True
20 | labelnames.append(labelmap.item[i].display_name)
21 | break
22 |         assert found, 'label {} not found in labelmap'.format(label)
23 | return labelnames
24 |
25 | class SSD_NET(object):
26 |
27 | def __init__(self, model_weights, model_def, threshold=0.5, GPU_MODE=False):
28 | if GPU_MODE:
29 | caffe.set_device(0)
30 | caffe.set_mode_gpu()
31 | else:
32 | caffe.set_mode_cpu()
33 | self.net = caffe.Net(model_def, # defines the structure of the model
34 | model_weights, # contains the trained weights
35 | caffe.TEST) # use test mode (e.g., don't perform dropout)
36 | self.threshold = threshold
37 | self.transformer = caffe.io.Transformer({'data': self.net.blobs['data'].data.shape})
38 | self.transformer.set_transpose('data', (2, 0, 1))
39 | self.transformer.set_mean('data', np.array([127.0, 127.0, 127.0])) # mean pixel
40 | self.transformer.set_raw_scale('data',
41 | 255) # the reference model operates on images in [0,255] range instead of [0,1]
42 | self.transformer.set_channel_swap('data', (2, 1, 0)) # the reference model has channels in BGR order instead of RGB
43 | image_resize = 300
44 | self.net.blobs['data'].reshape(1, 3, image_resize, image_resize)
45 |
46 |
47 | def detect(self, img):
48 | transformed_image = self.transformer.preprocess('data', img)
49 | self.net.blobs['data'].data[...] = transformed_image
50 | detections = self.net.forward()['detection_out']
51 | # Parse the outputs.
52 | det_label = detections[0, 0, :, 1]
53 | det_conf = detections[0, 0, :, 2]
54 | det_xmin = detections[0, 0, :, 3]
55 | det_ymin = detections[0, 0, :, 4]
56 | det_xmax = detections[0, 0, :, 5]
57 | det_ymax = detections[0, 0, :, 6]
58 | # Get detections with confidence higher than 0.6.
59 | # print(det_conf)
60 | top_indices = [i for i, conf in enumerate(det_conf) if conf >= self.threshold]
61 |
62 | top_conf = det_conf[top_indices]
63 | top_label_indices = det_label[top_indices].tolist()
64 | top_xmin = det_xmin[top_indices]
65 | top_ymin = det_ymin[top_indices]
66 | top_xmax = det_xmax[top_indices]
67 | top_ymax = det_ymax[top_indices]
68 |
69 | return top_label_indices, top_conf, top_xmin, top_ymin, top_xmax, top_ymax
70 |
71 |
72 |
73 |
--------------------------------------------------------------------------------
/other/Hand_Detection/utils/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | def IoU(box, boxes):
4 | """Compute IoU between detect box and gt boxes
5 |
6 | Parameters:
7 | ----------
8 | box: numpy array , shape (5, ): x1, y1, x2, y2, score
9 | input box
10 | boxes: numpy array, shape (n, 4): x1, y1, x2, y2
11 | input ground truth boxes
12 |
13 | Returns:
14 | -------
15 | ovr: numpy.array, shape (n, )
16 | IoU
17 | """
18 | box_area = (box[2] - box[0] + 1) * (box[3] - box[1] + 1)
19 | area = (boxes[:, 2] - boxes[:, 0] + 1) * (boxes[:, 3] - boxes[:, 1] + 1)
20 | xx1 = np.maximum(box[0], boxes[:, 0])
21 | yy1 = np.maximum(box[1], boxes[:, 1])
22 | xx2 = np.minimum(box[2], boxes[:, 2])
23 | yy2 = np.minimum(box[3], boxes[:, 3])
24 |
25 | # compute the width and height of the bounding box
26 | w = np.maximum(0, xx2 - xx1 + 1)
27 | h = np.maximum(0, yy2 - yy1 + 1)
28 |
29 | inter = w * h
30 | ovr = inter / (box_area + area - inter)
31 | return ovr
32 |
33 |
34 | def convert_to_square(bbox):
35 | """Convert bbox to square
36 |
37 | Parameters:
38 | ----------
39 | bbox: numpy array , shape n x 5
40 | input bbox
41 |
42 | Returns:
43 | -------
44 | square bbox
45 | """
46 | square_bbox = bbox.copy()
47 |
48 | h = bbox[:, 3] - bbox[:, 1] + 1
49 | w = bbox[:, 2] - bbox[:, 0] + 1
50 | max_size = np.maximum(h, w)
51 | square_bbox[:, 0] = bbox[:, 0] + w*0.5 - max_size*0.5
52 | square_bbox[:, 1] = bbox[:, 1] + h*0.5 - max_size*0.5
53 | square_bbox[:, 2] = square_bbox[:, 0] + max_size - 1
54 | square_bbox[:, 3] = square_bbox[:, 1] + max_size - 1
55 | return square_bbox
56 |
--------------------------------------------------------------------------------
/other/Openpose-Keras/.gitignore:
--------------------------------------------------------------------------------
1 | # TOTALLY IGNORE THE MODEL FILES
2 | .ipynb_checkpoints
3 | *.h5
4 | *.h5py
5 | *.npy
--------------------------------------------------------------------------------
/other/Openpose-Keras/README.md:
--------------------------------------------------------------------------------
1 | # OpenPose-Keras
2 | A little bit of play with OpenPose without using their API, which allows building / prototyping pre- and post-processing steps in Keras. Please keep in mind that this is more of a toy project and not anything close to a production application. If you are looking for something more *useful*, please invest some time and get the actual OpenPose up and running :)
3 |
4 | Table of contents:
5 | - Body keypoint estimation network (coming soon)
6 | - Face keypoint estimation network (coming soon)
7 | - [Hand keypoint estimation network](#handKeypointEstimationNetwork)
8 | * [Model description](#handKeypointEstimationNetwork_modelDescription)
9 | * [Input format and pre-processing](#handKeypointEstimationNetwork_inputFormatAndPreProcessing)
10 | * [Post-processing](#handKeypointEstimationNetwork_postProcessing)
11 | * [Discovered issues](#handKeypointEstimationNetwork_issues)
12 | - [External resources](#externalResources)
13 |
14 |
15 | ## Hand keypoint estimation network
16 | [Demo video on YouTube](https://www.youtube.com/watch?v=FnoI8ufwhbs)
17 | Please check out the demo on yt: https://www.youtube.com/watch?v=FnoI8ufwhbs
18 |
19 |
20 | ### Model description
21 | The original model can be found in the [OpenPose GitHub repository](https://github.com/CMU-Perceptual-Computing-Lab/openpose). Model weights converted from the Caffe model definition are available for download: https://drive.google.com/file/d/1yPQFrCrDltqzYAnWBl__O7oZxGL0sQlu/view
22 | The readme on the main page describes "hand keypoint detection" as 2x21-keypoint estimation. The network itself outputs 22 channels (21 keypoints + background). The final layer feeds from the 128-deep convolutional layer (Mconv6_stage6) and is defined as follows (see [models/hand/pose_deploy.prototxt](https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/models/hand/pose_deploy.prototxt)):
23 | ```
24 | layer {
25 | name: "Mconv7_stage6"
26 | type: "Convolution"
27 | bottom: "Mconv6_stage6"
28 | top: "net_output"
29 | param {
30 | lr_mult: 4.0
31 | decay_mult: 1
32 | }
33 | param {
34 | lr_mult: 8.0
35 | decay_mult: 0
36 | }
37 | convolution_param {
38 | num_output: 22
39 | pad: 0
40 | kernel_size: 1
41 | weight_filler {
42 | type: "gaussian"
43 | std: 0.01
44 | }
45 | bias_filler {
46 | type: "constant"
47 | }
48 | dilation: 1
49 | }
50 | }
51 | ```
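For the Keras side, a minimal sketch of an equivalent final layer (tf.keras used here; the layer name and the surrounding architecture are assumptions, not necessarily how the converted weights are organised):
```python
import tensorflow as tf
from tensorflow.keras import layers

# Hypothetical Keras equivalent of Mconv7_stage6: a 1x1 convolution mapping the
# 128-deep feature map to 22 heatmap channels (21 keypoints + background).
features = layers.Input(shape=(None, None, 128), name='Mconv6_stage6')
heatmaps = layers.Conv2D(22, kernel_size=1, padding='valid', name='Mconv7_stage6')(features)
final_stage = tf.keras.Model(features, heatmaps, name='hand_output_head')
final_stage.summary()
```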
52 | Keep in mind that this particular network does NOT produce any part affinity fields, just finger keypoints. OpenPose's documentation contains a picture describing the keypoint channel ids.
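The picture itself is not reproduced here, but judging from the keypoint pairs used by the demo scripts later in this repository, the channel layout is roughly the following (channel 21 being the background map); treat this as my reading rather than an official reference:
```python
# Approximate keypoint channel ids (0-20); channel 21 is the background heatmap.
HAND_KEYPOINT_CHANNELS = {
    'wrist':  [0],
    'thumb':  [1, 2, 3, 4],
    'index':  [5, 6, 7, 8],
    'middle': [9, 10, 11, 12],
    'ring':   [13, 14, 15, 16],
    'little': [17, 18, 19, 20],
}
```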
53 |
54 |
55 | ### Input format and pre-processing
56 | I believe that the natural resolution of the input images (i.e. the standard network input size) is 368 on the width and whatever the aspect ratio dictates on the height. From the papers presenting this method one can figure out that the authors use multi-scale inputs: basically they run the network at scales from 0.5 to 1.5 and average the resulting heatmaps. The network accepts 3-channel RGB images with 32-bit floating point values scaled to -0.5 <= x <= 0.5.
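A minimal single-scale pre-processing sketch based on the description above (the OpenCV-based resize policy and the helper name are my assumptions, not the original pipeline):
```python
import cv2
import numpy as np

def preprocess(image_bgr, target_width=368):
    """Resize so the width is 368 px (height follows the aspect ratio) and
    rescale 8-bit pixels to the [-0.5, 0.5] range the network expects."""
    h, w = image_bgr.shape[:2]
    target_height = int(round(h * target_width / float(w)))
    resized = cv2.resize(image_bgr, (target_width, target_height))
    rgb = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)    # network expects RGB
    scaled = rgb.astype(np.float32) / 255.0 - 0.5     # [0, 255] -> [-0.5, 0.5]
    return scaled[np.newaxis]                         # add batch dimension
```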
57 |
58 |
59 | ### Post-processing
60 | I haven't studied the code of the OpenPose library very well (yet!), but I noticed that the returned heatmaps seem to have bi-modal distributions, e.g. some detection values are strongly negative and some are strongly positive. My understanding is that this may be their way of distinguishing the left hand from the right one; I still need to investigate that.
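As a simple baseline (not the OpenPose library's actual post-processing), one can take the location of the maximum response in each keypoint channel and drop peaks below a threshold:
```python
import numpy as np

def heatmaps_to_keypoints(heatmaps, threshold=0.1):
    """heatmaps: (H, W, 22) array -- 21 keypoint channels followed by background.
    Returns one (x, y) tuple per keypoint, or None where the peak is too weak."""
    keypoints = []
    for c in range(21):                              # skip the background channel
        channel = heatmaps[:, :, c]
        y, x = np.unravel_index(np.argmax(channel), channel.shape)
        keypoints.append((int(x), int(y)) if channel[y, x] > threshold else None)
    return keypoints
```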
61 |
62 |
63 | ### Discovered issues
64 | - It seems like the model does not capture hand keypoints when exposed to images of people wearing gloves. I haven't figured out how exactly the network was trained, but I can imagine that there was no emphasis on glove-wearing targets.
65 |
66 |
67 |
68 | # External resources
69 | 1. OpenPose GitHub repo: https://github.com/CMU-Perceptual-Computing-Lab/openpose
70 | 2. Origin of OpenPose: https://github.com/ZheC/Realtime_Multi-Person_Pose_Estimation
71 | 3. Paper describing the method: https://arxiv.org/abs/1611.08050
72 | 4. Keras implementation of the Realtime Multi-Person Pose Estimation (my major inspiration): https://github.com/michalfaber/keras_Realtime_Multi-Person_Pose_Estimation
73 |
--------------------------------------------------------------------------------
/other/Openpose-Keras/images/test_image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/other/Openpose-Keras/images/test_image.png
--------------------------------------------------------------------------------
/other/asl.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/other/asl.mp4
--------------------------------------------------------------------------------
/other/front-back.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/other/front-back.jpg
--------------------------------------------------------------------------------
/other/getModels.sh:
--------------------------------------------------------------------------------
1 | # ------------------------- BODY, FACE AND HAND MODELS -------------------------
2 | # Downloading the hand pose model only (body and face models are not fetched by this script)
3 | OPENPOSE_URL="http://posefs1.perception.cs.cmu.edu/OpenPose/models/"
4 | HAND_FOLDER="hand/"
5 |
6 | # "------------------------- HAND MODELS -------------------------"
7 | # Hand
8 | HAND_MODEL=$HAND_FOLDER"pose_iter_102000.caffemodel"
9 | wget -c ${OPENPOSE_URL}${HAND_MODEL} -P ${HAND_FOLDER}
10 |
--------------------------------------------------------------------------------
/other/hand.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/other/hand.jpg
--------------------------------------------------------------------------------
/other/hand.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/other/hand.png
--------------------------------------------------------------------------------
/other/handPoseImage.cpp:
--------------------------------------------------------------------------------
1 | #include <opencv2/dnn.hpp>
2 | #include <opencv2/imgproc.hpp>
3 | #include <opencv2/highgui.hpp>
4 | #include <iostream>
5 |
6 | using namespace std;
7 | using namespace cv;
8 | using namespace cv::dnn;
9 |
10 |
11 | const int POSE_PAIRS[20][2] =
12 | {
13 | {0,1}, {1,2}, {2,3}, {3,4}, // thumb
14 | {0,5}, {5,6}, {6,7}, {7,8}, // index
15 | {0,9}, {9,10}, {10,11}, {11,12}, // middle
16 | {0,13}, {13,14}, {14,15}, {15,16}, // ring
17 | {0,17}, {17,18}, {18,19}, {19,20} // small
18 | };
19 |
20 | string protoFile = "hand/pose_deploy.prototxt";
21 | string weightsFile = "hand/pose_iter_102000.caffemodel";
22 |
23 | int nPoints = 22;
24 |
25 | int main(int argc, char **argv)
26 | {
27 |
28 |     cout << "USAGE : ./handPoseImage <imageFile>" << endl;
29 |
30 | string imageFile = "right-frontal.jpg";
31 | // Take arguments from commmand line
32 | if (argc == 2)
33 | {
34 | imageFile = argv[1];
35 | }
36 |
37 | float thresh = 0.01;
38 |
39 | Mat frame = imread(imageFile);
40 | Mat frameCopy = frame.clone();
41 | int frameWidth = frame.cols;
42 | int frameHeight = frame.rows;
43 |
44 | float aspect_ratio = frameWidth/(float)frameHeight;
45 | int inHeight = 368;
46 | int inWidth = (int(aspect_ratio*inHeight) * 8) / 8;
47 |
48 | cout << "inWidth = " << inWidth << " ; inHeight = " << inHeight << endl;
49 |
50 | double t = (double) cv::getTickCount();
51 | Net net = readNetFromCaffe(protoFile, weightsFile);
52 |
53 | Mat inpBlob = blobFromImage(frame, 1.0 / 255, Size(inWidth, inHeight), Scalar(0, 0, 0), false, false);
54 |
55 | net.setInput(inpBlob);
56 |
57 | Mat output = net.forward();
58 |
59 | int H = output.size[2];
60 | int W = output.size[3];
61 |
62 | // find the position of the body parts
63 |     vector<Point> points(nPoints);
64 | for (int n=0; n < nPoints; n++)
65 | {
66 | // Probability map of corresponding body's part.
67 | Mat probMap(H, W, CV_32F, output.ptr(0,n));
68 | resize(probMap, probMap, Size(frameWidth, frameHeight));
69 |
70 | Point maxLoc;
71 | double prob;
72 | minMaxLoc(probMap, 0, &prob, 0, &maxLoc);
73 | if (prob > thresh)
74 | {
75 | circle(frameCopy, cv::Point((int)maxLoc.x, (int)maxLoc.y), 8, Scalar(0,255,255), -1);
76 | cv::putText(frameCopy, cv::format("%d", n), cv::Point((int)maxLoc.x, (int)maxLoc.y), cv::FONT_HERSHEY_COMPLEX, 1, cv::Scalar(0, 0, 255), 2);
77 |
78 | }
79 | points[n] = maxLoc;
80 | }
81 |
82 | int nPairs = sizeof(POSE_PAIRS)/sizeof(POSE_PAIRS[0]);
83 |
84 | for (int n = 0; n < nPairs; n++)
85 | {
86 | // lookup 2 connected body/hand parts
87 | Point2f partA = points[POSE_PAIRS[n][0]];
88 | Point2f partB = points[POSE_PAIRS[n][1]];
89 |
90 | if (partA.x<=0 || partA.y<=0 || partB.x<=0 || partB.y<=0)
91 | continue;
92 |
93 | line(frame, partA, partB, Scalar(0,255,255), 8);
94 | circle(frame, partA, 8, Scalar(0,0,255), -1);
95 | circle(frame, partB, 8, Scalar(0,0,255), -1);
96 | }
97 |
98 | t = ((double)cv::getTickCount() - t)/cv::getTickFrequency();
99 | cout << "Time Taken = " << t << endl;
100 | imshow("Output-Keypoints", frameCopy);
101 | imshow("Output-Skeleton", frame);
102 | imwrite("Output-Skeleton.jpg", frame);
103 |
104 | waitKey();
105 |
106 | return 0;
107 | }
108 |
--------------------------------------------------------------------------------
/other/handPoseImage.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 | import cv2
3 | import time
4 | import numpy as np
5 |
6 | protoFile = "hand/pose_deploy.prototxt"
7 | weightsFile = "hand/pose_iter_102000.caffemodel"
8 | nPoints = 22
9 | POSE_PAIRS = [ [0,1],[1,2],[2,3],[3,4],[0,5],[5,6],[6,7],[7,8],[0,9],[9,10],[10,11],[11,12],[0,13],[13,14],[14,15],[15,16],[0,17],[17,18],[18,19],[19,20] ]
10 | net = cv2.dnn.readNetFromCaffe(protoFile, weightsFile)
11 |
12 | frame = cv2.imread("right-frontal.jpg")
13 | frameCopy = np.copy(frame)
14 | frameWidth = frame.shape[1]
15 | frameHeight = frame.shape[0]
16 | aspect_ratio = frameWidth/frameHeight
17 |
18 | threshold = 0.1
19 |
20 | t = time.time()
21 | # input image dimensions for the network
22 | inHeight = 368
23 | inWidth = int(((aspect_ratio*inHeight)*8)//8)
24 | inpBlob = cv2.dnn.blobFromImage(frame, 1.0 / 255, (inWidth, inHeight), (0, 0, 0), swapRB=False, crop=False)
25 |
26 | net.setInput(inpBlob)
27 |
28 | output = net.forward()
29 | print("time taken by network : {:.3f}".format(time.time() - t))
30 |
31 | # Empty list to store the detected keypoints
32 | points = []
33 |
34 | for i in range(nPoints):
35 | # confidence map of corresponding body's part.
36 | probMap = output[0, i, :, :]
37 | probMap = cv2.resize(probMap, (frameWidth, frameHeight))
38 |
39 | # Find global maxima of the probMap.
40 | minVal, prob, minLoc, point = cv2.minMaxLoc(probMap)
41 |
42 | if prob > threshold :
43 | cv2.circle(frameCopy, (int(point[0]), int(point[1])), 8, (0, 255, 255), thickness=-1, lineType=cv2.FILLED)
44 | cv2.putText(frameCopy, "{}".format(i), (int(point[0]), int(point[1])), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2, lineType=cv2.LINE_AA)
45 |
46 | # Add the point to the list if the probability is greater than the threshold
47 | points.append((int(point[0]), int(point[1])))
48 | else :
49 | points.append(None)
50 |
51 | # Draw Skeleton
52 | for pair in POSE_PAIRS:
53 | partA = pair[0]
54 | partB = pair[1]
55 |
56 | if points[partA] and points[partB]:
57 | cv2.line(frame, points[partA], points[partB], (0, 255, 255), 2)
58 | cv2.circle(frame, points[partA], 8, (0, 0, 255), thickness=-1, lineType=cv2.FILLED)
59 | cv2.circle(frame, points[partB], 8, (0, 0, 255), thickness=-1, lineType=cv2.FILLED)
60 |
61 |
62 | cv2.imshow('Output-Keypoints', frameCopy)
63 | cv2.imshow('Output-Skeleton', frame)
64 |
65 |
66 | cv2.imwrite('Output-Keypoints.jpg', frameCopy)
67 | cv2.imwrite('Output-Skeleton.jpg', frame)
68 |
69 | print("Total time taken : {:.3f}".format(time.time() - t))
70 |
71 | cv2.waitKey(0)
72 |
--------------------------------------------------------------------------------
/other/handPoseVideo.cpp:
--------------------------------------------------------------------------------
1 | #include <opencv2/dnn.hpp>
2 | #include <opencv2/imgproc.hpp>
3 | #include <opencv2/highgui.hpp>
4 | #include <iostream>
5 |
6 | using namespace std;
7 | using namespace cv;
8 | using namespace cv::dnn;
9 |
10 | const int POSE_PAIRS[20][2] =
11 | {
12 | {0,1}, {1,2}, {2,3}, {3,4}, // thumb
13 | {0,5}, {5,6}, {6,7}, {7,8}, // index
14 | {0,9}, {9,10}, {10,11}, {11,12}, // middle
15 | {0,13}, {13,14}, {14,15}, {15,16}, // ring
16 | {0,17}, {17,18}, {18,19}, {19,20} // small
17 | };
18 |
19 | string protoFile = "hand/pose_deploy.prototxt";
20 | string weightsFile = "hand/pose_iter_102000.caffemodel";
21 |
22 | int nPoints = 22;
23 |
24 | int main(int argc, char **argv)
25 | {
26 | float thresh = 0.01;
27 |
28 | cv::VideoCapture cap("asl.mp4");
29 |
30 | if (!cap.isOpened())
31 | {
32 | cerr << "Unable to connect to camera" << endl;
33 | return 1;
34 | }
35 |
36 | Mat frame, frameCopy;
37 | int frameWidth = cap.get(CAP_PROP_FRAME_WIDTH);
38 | int frameHeight = cap.get(CAP_PROP_FRAME_HEIGHT);
39 | float aspect_ratio = frameWidth/(float)frameHeight;
40 | int inHeight = 368;
41 | int inWidth = (int(aspect_ratio*inHeight) * 8) / 8;
42 |
43 | cout << "inWidth = " << inWidth << " ; inHeight = " << inHeight << endl;
44 |
45 | VideoWriter video("Output-Skeleton.avi",VideoWriter::fourcc('M','J','P','G'), 10, Size(frameWidth,frameHeight));
46 |
47 | Net net = readNetFromCaffe(protoFile, weightsFile);
48 |
49 | double t=0;
50 | while(1)
51 | {
52 | double t = (double) cv::getTickCount();
53 |
54 | cap >> frame;
55 |         if (frame.empty()) break;  frameCopy = frame.clone();  // stop when the video ends
56 | Mat inpBlob = blobFromImage(frame, 1.0 / 255, Size(inWidth, inHeight), Scalar(0, 0, 0), false, false);
57 |
58 | net.setInput(inpBlob);
59 |
60 | Mat output = net.forward();
61 |
62 | int H = output.size[2];
63 | int W = output.size[3];
64 |
65 | // find the position of the body parts
66 |         vector<Point> points(nPoints);
67 | for (int n=0; n < nPoints; n++)
68 | {
69 | // Probability map of corresponding body's part.
70 | Mat probMap(H, W, CV_32F, output.ptr(0,n));
71 | resize(probMap, probMap, Size(frameWidth, frameHeight));
72 |
73 | Point maxLoc;
74 | double prob;
75 | minMaxLoc(probMap, 0, &prob, 0, &maxLoc);
76 | if (prob > thresh)
77 | {
78 | circle(frameCopy, cv::Point((int)maxLoc.x, (int)maxLoc.y), 8, Scalar(0,255,255), -1);
79 | cv::putText(frameCopy, cv::format("%d", n), cv::Point((int)maxLoc.x, (int)maxLoc.y), cv::FONT_HERSHEY_COMPLEX, 1, cv::Scalar(0, 0, 255), 2);
80 |
81 | }
82 | points[n] = maxLoc;
83 | }
84 |
85 | int nPairs = sizeof(POSE_PAIRS)/sizeof(POSE_PAIRS[0]);
86 |
87 | for (int n = 0; n < nPairs; n++)
88 | {
89 | // lookup 2 connected body/hand parts
90 | Point2f partA = points[POSE_PAIRS[n][0]];
91 | Point2f partB = points[POSE_PAIRS[n][1]];
92 |
93 | if (partA.x<=0 || partA.y<=0 || partB.x<=0 || partB.y<=0)
94 | continue;
95 |
96 | line(frame, partA, partB, Scalar(0,255,255), 8);
97 | circle(frame, partA, 8, Scalar(0,0,255), -1);
98 | circle(frame, partB, 8, Scalar(0,0,255), -1);
99 | }
100 |
101 | t = ((double)cv::getTickCount() - t)/cv::getTickFrequency();
102 | cout << "Time Taken for frame = " << t << endl;
103 | cv::putText(frame, cv::format("time taken = %.2f sec", t), cv::Point(50, 50), cv::FONT_HERSHEY_COMPLEX, .8, cv::Scalar(255, 50, 0), 2);
104 | // imshow("Output-Keypoints", frameCopy);
105 | imshow("Output-Skeleton", frame);
106 | video.write(frame);
107 | char key = waitKey(1);
108 | if (key==27)
109 | break;
110 | }
111 | // When everything done, release the video capture and write object
112 | cap.release();
113 | video.release();
114 |
115 | return 0;
116 | }
117 |
--------------------------------------------------------------------------------
/other/handPoseVideo.py:
--------------------------------------------------------------------------------
1 | import cv2
2 | import time
3 | import numpy as np
4 |
5 |
6 | protoFile = "hand/pose_deploy.prototxt"
7 | weightsFile = "hand/pose_iter_102000.caffemodel"
8 | nPoints = 22
9 | POSE_PAIRS = [ [0,1],[1,2],[2,3],[3,4],[0,5],[5,6],[6,7],[7,8],[0,9],[9,10],[10,11],[11,12],[0,13],[13,14],[14,15],[15,16],[0,17],[17,18],[18,19],[19,20] ]
10 |
11 | threshold = 0.2
12 |
13 |
14 | input_source = "asl.mp4"
15 | cap = cv2.VideoCapture(input_source)
16 | hasFrame, frame = cap.read()
17 |
18 | frameWidth = frame.shape[1]
19 | frameHeight = frame.shape[0]
20 |
21 | aspect_ratio = frameWidth/frameHeight
22 |
23 | inHeight = 368
24 | inWidth = int(((aspect_ratio*inHeight)*8)//8)
25 |
26 | vid_writer = cv2.VideoWriter('output.avi',cv2.VideoWriter_fourcc('M','J','P','G'), 15, (frame.shape[1],frame.shape[0]))
27 |
28 | net = cv2.dnn.readNetFromCaffe(protoFile, weightsFile)
29 | k = 0
30 | while 1:
31 | k+=1
32 | t = time.time()
33 | hasFrame, frame = cap.read()
34 |     if not hasFrame:
35 |         cv2.waitKey()
36 |         break
37 |     frameCopy = np.copy(frame)  # copy only valid frames for the keypoint overlay
38 |
39 | inpBlob = cv2.dnn.blobFromImage(frame, 1.0 / 255, (inWidth, inHeight),
40 | (0, 0, 0), swapRB=False, crop=False)
41 |
42 | net.setInput(inpBlob)
43 |
44 | output = net.forward()
45 |
46 | print("forward = {}".format(time.time() - t))
47 |
48 | # Empty list to store the detected keypoints
49 | points = []
50 |
51 | for i in range(nPoints):
52 | # confidence map of corresponding body's part.
53 | probMap = output[0, i, :, :]
54 | probMap = cv2.resize(probMap, (frameWidth, frameHeight))
55 |
56 | # Find global maxima of the probMap.
57 | minVal, prob, minLoc, point = cv2.minMaxLoc(probMap)
58 |
59 | if prob > threshold :
60 | cv2.circle(frameCopy, (int(point[0]), int(point[1])), 6, (0, 255, 255), thickness=-1, lineType=cv2.FILLED)
61 | cv2.putText(frameCopy, "{}".format(i), (int(point[0]), int(point[1])), cv2.FONT_HERSHEY_SIMPLEX, .8, (0, 0, 255), 2, lineType=cv2.LINE_AA)
62 |
63 | # Add the point to the list if the probability is greater than the threshold
64 | points.append((int(point[0]), int(point[1])))
65 | else :
66 | points.append(None)
67 |
68 | # Draw Skeleton
69 | for pair in POSE_PAIRS:
70 | partA = pair[0]
71 | partB = pair[1]
72 |
73 | if points[partA] and points[partB]:
74 | cv2.line(frame, points[partA], points[partB], (0, 255, 255), 2, lineType=cv2.LINE_AA)
75 | cv2.circle(frame, points[partA], 5, (0, 0, 255), thickness=-1, lineType=cv2.FILLED)
76 | cv2.circle(frame, points[partB], 5, (0, 0, 255), thickness=-1, lineType=cv2.FILLED)
77 |
78 | print("Time Taken for frame = {}".format(time.time() - t))
79 |
80 | # cv2.putText(frame, "time taken = {:.2f} sec".format(time.time() - t), (50, 50), cv2.FONT_HERSHEY_COMPLEX, .8, (255, 50, 0), 2, lineType=cv2.LINE_AA)
81 | # cv2.putText(frame, "Hand Pose using OpenCV", (50, 50), cv2.FONT_HERSHEY_COMPLEX, 1, (255, 50, 0), 2, lineType=cv2.LINE_AA)
82 | cv2.imshow('Output-Skeleton', frame)
83 | # cv2.imwrite("video_output/{:03d}.jpg".format(k), frame)
84 | key = cv2.waitKey(1)
85 | if key == 27:
86 | break
87 |
88 | print("total = {}".format(time.time() - t))
89 |
90 | vid_writer.write(frame)
91 |
92 | vid_writer.release()
93 |
--------------------------------------------------------------------------------