├── .gitignore ├── README.md ├── demo ├── Openpose.py ├── SqueezeNet.ipynb └── img_keypoint_show.py ├── image ├── Loss.png ├── com.png ├── front-back.jpg ├── hand.jpeg ├── hand.jpg ├── hand.png ├── right-frontal.jpg └── unnamed.png ├── main ├── data │ └── dataset.py ├── demo.py └── train.py └── other ├── CMakeLists.txt ├── Hand_Caffe ├── 1_F_deploy.prototxt ├── 1_F_solver.prototxt ├── 1_F_train.prototxt ├── create_txt.py ├── getBox.py ├── hand.jpeg ├── level1.py ├── read_im_json.py └── utils.py ├── Hand_Detection ├── README.md ├── data │ ├── create_annoset.py │ ├── create_data.sh │ ├── create_txt.py │ ├── egohands │ │ ├── _screenshot_17.04.2018.png │ │ ├── egohands_data.txt │ │ ├── generate_egohands.py │ │ └── getInfo.m │ ├── gth │ │ └── .gitkeep │ ├── labelmap_voc.prototxt │ └── stanfordhands │ │ └── generate_stanfordhands.py ├── model │ ├── deploy.prototxt │ ├── generate_model.py │ ├── snapshot │ │ └── .gitkeep │ ├── solver.prototxt │ └── train.prototxt ├── old_README.md ├── pic │ ├── demo.jpg │ ├── example_image.jpg │ └── width_and_height.png ├── ssd_camera.py ├── ssd_detection.py └── utils │ ├── __init__.py │ ├── mAP.py │ ├── output.py │ ├── plot_loss.py │ ├── score.py │ ├── ssd_net.py │ └── utils.py ├── Openpose-Keras ├── .gitignore ├── README.md ├── StolenOpenPoseHandTracking.ipynb └── images │ └── test_image.png ├── asl.mp4 ├── front-back.jpg ├── getModels.sh ├── hand.jpg ├── hand.png ├── handPose-Notebook.ipynb ├── handPoseImage.cpp ├── handPoseImage.py ├── handPoseVideo.cpp └── handPoseVideo.py /.gitignore: -------------------------------------------------------------------------------- 1 | # TOTALLY IGNORE THE MODEL FILES 2 | .ipynb_checkpoints 3 | *.h5 4 | *.h5py 5 | *.npy 6 | *.zip 7 | /.idea 8 | Push.sh 9 | /data 10 | /文献 11 | 12 | # Byte-compiled / optimized / DLL files 13 | __pycache__/ 14 | *.py[cod] 15 | *$py.class 16 | 17 | # C extensions 18 | *.so 19 | 20 | # Distribution / packaging 21 | .Python 22 | build/ 23 | develop-eggs/ 24 | dist/ 25 | downloads/ 26 | eggs/ 27 | .eggs/ 28 | lib/ 29 | lib64/ 30 | parts/ 31 | sdist/ 32 | var/ 33 | wheels/ 34 | *.egg-info/ 35 | .installed.cfg 36 | *.egg 37 | MANIFEST 38 | 39 | # PyInstaller 40 | # Usually these files are written by a python script from a template 41 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
42 | *.manifest 43 | *.spec 44 | 45 | # Installer logs 46 | pip-log.txt 47 | pip-delete-this-directory.txt 48 | 49 | # Unit test / coverage reports 50 | htmlcov/ 51 | .tox/ 52 | .coverage 53 | .coverage.* 54 | .cache 55 | nosetests.xml 56 | coverage.xml 57 | *.cover 58 | .hypothesis/ 59 | .pytest_cache/ 60 | 61 | # Translations 62 | *.mo 63 | *.pot 64 | 65 | # Django stuff: 66 | *.log 67 | local_settings.py 68 | db.sqlite3 69 | 70 | # Flask stuff: 71 | instance/ 72 | .webassets-cache 73 | 74 | # Scrapy stuff: 75 | .scrapy 76 | 77 | # Sphinx documentation 78 | docs/_build/ 79 | 80 | # PyBuilder 81 | target/ 82 | 83 | # Jupyter Notebook 84 | .ipynb_checkpoints 85 | 86 | # pyenv 87 | .python-version 88 | 89 | # celery beat schedule file 90 | celerybeat-schedule 91 | 92 | # SageMath parsed files 93 | *.sage.py 94 | 95 | # Environments 96 | .env 97 | .venv 98 | env/ 99 | venv/ 100 | ENV/ 101 | env.bak/ 102 | venv.bak/ 103 | 104 | # Spyder project settings 105 | .spyderproject 106 | .spyproject 107 | 108 | # Rope project settings 109 | .ropeproject 110 | 111 | # mkdocs documentation 112 | /site 113 | 114 | # mypy 115 | .mypy_cache/ 116 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Hand-Keypoint-Estimation 2 | 3 | ## Introduction 4 | 5 |
unnamed
6 | 7 | 21-keypoint hand pose estimation 8 | 9 |
unnamed
10 | 11 | ## TODO 12 | 13 | - [x] ResNet34+Finetune 14 | - [x] SqueezeNet+Finetune 15 | - [ ] Hourglass 16 | - [ ] Openpose+Design Loss 17 | - [ ] Better visualization of results 18 | - [ ] Robustness to occlusion 19 | 20 | ## Dataset 21 | 22 | [CMU hand dataset](http://domedb.perception.cs.cmu.edu/handdb.html) (the occlusions are quite extreme) 23 | 24 | ``` 25 | Hands from Synthetic Data (6546 + 3243 + 2348 + 2124 = 14261 annotations) 26 | └─hand_labels_synth 27 | ├─output_viz_synth 28 | ├─synth1 (the JSON files are missing the 5 fingertip keypoints) 29 | ├─synth2 30 | ├─synth3 31 | └─synth4 32 | ``` 33 | 34 | ## Benchmarking 35 | 36 | **SqueezeNet+Finetune** 37 | 38 | ``` 39 | Finetune = nn.Sequential( 40 | Flatten(), 41 | nn.ReLU(), 42 | nn.Dropout(0.5), 43 | nn.Linear(247808, 256), 44 | # ReLU must come after BatchNorm here; placing it before BN skews the variance BN computes 45 | nn.BatchNorm1d(256), 46 | nn.ReLU(), 47 | nn.Dropout(0.5), 48 | nn.Linear(256, 42), 49 | Reshape(-1,21,2), 50 | nn.Tanh() 51 | ) 52 | Total params: 64,172,906 53 | Total trainable params: 64,172,906 54 | Total non-trainable params: 0 55 | Loss function : MSELoss 56 | Epoch : 200 57 | LR : 0.01->0.0001 58 | Train Loss end : 0.010500 59 | Valid Loss end : 0.012454 60 | ``` 61 |
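The block above lists only the regression head. Below is a minimal sketch of how such a head could sit on top of a torchvision SqueezeNet backbone; the `squeezenet1_1` backbone and the 368x368 input size are assumptions made for illustration (512 x 22 x 22 = 247808, which matches the first `nn.Linear` above), not the repository's exact training code.

```
import torch
import torch.nn as nn
from torchvision import models


class Flatten(nn.Module):
    def forward(self, x):
        return x.view(x.size(0), -1)


class Reshape(nn.Module):
    def __init__(self, *shape):
        super().__init__()
        self.shape = shape

    def forward(self, x):
        return x.view(self.shape)


# Assumed backbone: squeezenet1_1 features give a 512 x 22 x 22 map for a 368 x 368 input.
backbone = models.squeezenet1_1(pretrained=True).features
head = nn.Sequential(
    Flatten(),
    nn.ReLU(),
    nn.Dropout(0.5),
    nn.Linear(247808, 256),
    nn.BatchNorm1d(256),
    nn.ReLU(),
    nn.Dropout(0.5),
    nn.Linear(256, 42),
    Reshape(-1, 21, 2),
    nn.Tanh(),
)
model = nn.Sequential(backbone, head)

x = torch.randn(2, 3, 368, 368)  # dummy batch
print(model(x).shape)            # torch.Size([2, 21, 2])
```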
unnamed
63 | 64 | CPU上0.0234s一张图片 65 | 66 | GPU-2070Ti上0.00727s一张图片 67 | 68 | -------------------------------------------------------------------------------- /demo/Openpose.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | from collections import OrderedDict 4 | from torch.autograd import Variable 5 | 6 | import torch 7 | import torch.nn as nn 8 | from fastai.vision import * 9 | from fastai import * 10 | import os 11 | os.environ['CUDA_VISIBLE_DEVICES'] = '3' 12 | 13 | class Reshape(nn.Module): 14 | def __init__(self, *args): 15 | super(Reshape, self).__init__() 16 | self.shape = args 17 | 18 | def forward(self, x): 19 | return x.view(self.shape) 20 | 21 | def make_layers(block, no_relu_layers): 22 | layers = [] 23 | for layer_name, v in block.items(): 24 | if 'pool' in layer_name: 25 | layer = nn.MaxPool2d(kernel_size=v[0], stride=v[1], 26 | padding=v[2]) 27 | layers.append((layer_name, layer)) 28 | else: 29 | conv2d = nn.Conv2d(in_channels=v[0], out_channels=v[1], 30 | kernel_size=v[2], stride=v[3], 31 | padding=v[4]) 32 | layers.append((layer_name, conv2d)) 33 | if layer_name not in no_relu_layers: 34 | layers.append(('relu_' + layer_name, nn.ReLU(inplace=True))) 35 | 36 | return nn.Sequential(OrderedDict(layers)) 37 | 38 | 39 | class handpose_model(nn.Module): 40 | def __init__(self): 41 | super().__init__() 42 | 43 | # these layers have no relu layer 44 | no_relu_layers = ['conv6_2_CPM', 'Mconv7_stage2', 'Mconv7_stage3', \ 45 | 'Mconv7_stage4', 'Mconv7_stage5', 'Mconv7_stage6'] 46 | # stage 1 47 | block1_0 = OrderedDict({ 48 | 'conv1_1': [3, 64, 3, 1, 1], 49 | 'conv1_2': [64, 64, 3, 1, 1], 50 | 'pool1_stage1': [2, 2, 0], 51 | 'conv2_1': [64, 128, 3, 1, 1], 52 | 'conv2_2': [128, 128, 3, 1, 1], 53 | 'pool2_stage1': [2, 2, 0], 54 | 'conv3_1': [128, 256, 3, 1, 1], 55 | 'conv3_2': [256, 256, 3, 1, 1], 56 | 'conv3_3': [256, 256, 3, 1, 1], 57 | 'conv3_4': [256, 256, 3, 1, 1], 58 | 'pool3_stage1': [2, 2, 0], 59 | 'conv4_1': [256, 512, 3, 1, 1], 60 | 'conv4_2': [512, 512, 3, 1, 1], 61 | 'conv4_3': [512, 512, 3, 1, 1], 62 | 'conv4_4': [512, 512, 3, 1, 1], 63 | 'conv5_1': [512, 512, 3, 1, 1], 64 | 'conv5_2': [512, 512, 3, 1, 1], 65 | 'conv5_3_CPM': [512, 128, 3, 1, 1]}) 66 | 67 | block1_1 = OrderedDict({ 68 | 'conv6_1_CPM': [128, 512, 1, 1, 0], 69 | 'conv6_2_CPM': [512, 22, 1, 1, 0] 70 | }) 71 | 72 | blocks = {} 73 | blocks['block1_0'] = block1_0 74 | blocks['block1_1'] = block1_1 75 | 76 | # stage 2-6 77 | for i in range(2, 7): 78 | blocks['block%d' % i] = OrderedDict({ 79 | 'Mconv1_stage%d' % i: [150, 128, 7, 1, 3], 80 | 'Mconv2_stage%d' % i: [128, 128, 7, 1, 3], 81 | 'Mconv3_stage%d' % i: [128, 128, 7, 1, 3], 82 | 'Mconv4_stage%d' % i: [128, 128, 7, 1, 3], 83 | 'Mconv5_stage%d' % i: [128, 128, 7, 1, 3], 84 | 'Mconv6_stage%d' % i: [128, 128, 1, 1, 0], 85 | 'Mconv7_stage%d' % i: [128, 22, 1, 1, 0]}) 86 | 87 | for k in blocks.keys(): 88 | blocks[k] = make_layers(blocks[k], no_relu_layers) 89 | 90 | self.model1_0 = blocks['block1_0'] 91 | self.model1_1 = blocks['block1_1'] 92 | self.model2 = blocks['block2'] 93 | self.model3 = blocks['block3'] 94 | self.model4 = blocks['block4'] 95 | self.model5 = blocks['block5'] 96 | self.model6 = blocks['block6'] 97 | self.head_reg = nn.Sequential( 98 | Flatten(), 99 | nn.ReLU(), 100 | nn.Dropout(0.5), 101 | nn.Linear(22*46*46, 256), 102 | nn.ReLU(), 103 | nn.Dropout(0.5), 104 | nn.Linear(256, 42), 105 | Reshape(-1, 21, 2), 106 | nn.Tanh()) 107 | self._initialize_weights() 108 | 109 | 
def forward(self, x): 110 | out1_0 = self.model1_0(x) 111 | out1_1 = self.model1_1(out1_0) 112 | concat_stage2 = torch.cat([out1_1, out1_0], 1) 113 | out_stage2 = self.model2(concat_stage2) 114 | concat_stage3 = torch.cat([out_stage2, out1_0], 1) 115 | out_stage3 = self.model3(concat_stage3) 116 | concat_stage4 = torch.cat([out_stage3, out1_0], 1) 117 | out_stage4 = self.model4(concat_stage4) 118 | concat_stage5 = torch.cat([out_stage4, out1_0], 1) 119 | out_stage5 = self.model5(concat_stage5) 120 | concat_stage6 = torch.cat([out_stage5, out1_0], 1) 121 | out_stage6 = self.model6(concat_stage6) 122 | x = self.head_reg(out_stage6) 123 | return x 124 | 125 | def _initialize_weights(self): 126 | for m in self.modules(): 127 | if isinstance(m, nn.Conv2d): 128 | nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') 129 | if m.bias is not None: 130 | nn.init.constant_(m.bias, 0) 131 | elif isinstance(m, nn.BatchNorm2d): 132 | nn.init.constant_(m.weight, 1) 133 | nn.init.constant_(m.bias, 0) 134 | elif isinstance(m, nn.Linear): 135 | nn.init.normal_(m.weight, 0, 0.01) 136 | nn.init.constant_(m.bias, 0) 137 | 138 | 139 | image_path = '/home/hanwei-1/data/hand_labels_synth/synth2_3' 140 | 141 | 142 | transforms = get_transforms(do_flip=False, max_zoom=1.05, max_warp=0.01,max_rotate=3, p_lighting=1) 143 | 144 | def get_y_func(x): 145 | pre, ext = os.path.splitext(x) 146 | hand_data_out = [] 147 | # pre = pre.replace('synth2', 'synth2_json') 148 | hand_data = json.load(open(pre + '.json')) 149 | for i in range(21): 150 | hand_tem_xy = hand_data['hand_pts'][i][:2] 151 | hand_tem_xy.reverse() 152 | hand_data_out.append(hand_tem_xy) 153 | return Tensor(hand_data_out) 154 | 155 | 156 | data = (PointsItemList.from_folder(path=image_path, extensions=['.jpg'], presort=True) 157 | .split_by_rand_pct() 158 | .label_from_func(get_y_func) 159 | .transform(transforms, size=368, tfm_y=True, remove_out=False, 160 | padding_mode='border', resize_method=ResizeMethod.PAD) 161 | .databunch(bs=32) 162 | .normalize(imagenet_stats)) 163 | 164 | 165 | class MSELossFlat(nn.MSELoss): 166 | def forward(self, input:Tensor, target:Tensor): 167 | return super().forward(input.view(-1), target.view(-1)) 168 | 169 | 170 | mse_loss_flat = MSELossFlat() 171 | 172 | 173 | class L2Loss(torch.nn.Module): 174 | def __init__(self, batch_size): 175 | super(L2Loss, self).__init__() 176 | self.batch_size = batch_size 177 | 178 | def forward(self, x: Variable, y: Variable, weights: Variable = None): 179 | if weights is not None: 180 | val = (x-y) * weights[:x.data.shape[0], :, :, :] # Slice by shape[n,..] 
for batch size (last batch < batch_size) 181 | else: 182 | val = x-y 183 | l = torch.sum(val ** 2) / self.batch_size / 2 184 | return l 185 | 186 | 187 | l2loss = L2Loss(batch_size=8) 188 | 189 | net = handpose_model() 190 | 191 | 192 | learn = Learner(data, net, loss_func=mse_loss_flat) 193 | learn.fit_one_cycle(cyc_len=200, max_lr=0.0001) 194 | learn.recorder.plot() 195 | plt.show() 196 | learn.lr_find() 197 | learn.recorder.plot() 198 | plt.show() 199 | -------------------------------------------------------------------------------- /demo/img_keypoint_show.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import matplotlib.pyplot as plt 4 | import cv2 5 | 6 | im_dir = 'Path' 7 | json_dir = 'Path' 8 | hand_data_out = {} 9 | 10 | with open(json_dir, 'r') as f: 11 | hand_data = json.load(f) 12 | 13 | for i in range(21): 14 | hand_data_out[i] = hand_data['hand_pts'][i][:2] 15 | 16 | for j in range(21): 17 | for i in range(2): 18 | hand_data_out[j][i] = int(hand_data_out[j][i]) 19 | 20 | 21 | def get_json_point(json_path): 22 | hand_data_out = {} 23 | hand_return = {} 24 | str_point = '' 25 | with open(json_dir, 'r') as f: 26 | hand_data = json.load(f) 27 | 28 | for i in range(21): 29 | hand_data_out[i] = hand_data['hand_pts'][i][:2] 30 | 31 | for j in range(21): 32 | for i in range(2): 33 | hand_data_out[j][i] = int(hand_data_out[j][i]) 34 | 35 | hand_return[0] = hand_data_out[1] 36 | hand_return[1] = hand_data_out[7] 37 | hand_return[2] = hand_data_out[11] 38 | hand_return[3] = hand_data_out[15] 39 | hand_return[4] = hand_data_out[19] 40 | for key, value in hand_return.items(): 41 | for i in range(2): 42 | str_point += str(value[i]) 43 | str_point += ' ' 44 | 45 | return hand_data_out 46 | 47 | 48 | data = get_json_point(json_dir) 49 | 50 | output = cv2.imread(im_dir) 51 | for i in range(21): 52 | cv2.circle(output, tuple(data[i]), 2, (0, 0, 255), 1) 53 | plt.imshow(output) 54 | plt.show() 55 | 56 | -------------------------------------------------------------------------------- /image/Loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/image/Loss.png -------------------------------------------------------------------------------- /image/com.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/image/com.png -------------------------------------------------------------------------------- /image/front-back.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/image/front-back.jpg -------------------------------------------------------------------------------- /image/hand.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/image/hand.jpeg -------------------------------------------------------------------------------- /image/hand.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/image/hand.jpg 
-------------------------------------------------------------------------------- /image/hand.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/image/hand.png -------------------------------------------------------------------------------- /image/right-frontal.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/image/right-frontal.jpg -------------------------------------------------------------------------------- /image/unnamed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/image/unnamed.png -------------------------------------------------------------------------------- /main/data/dataset.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @File : dataset.py 4 | @Time : 2019/9/13 16:34 5 | @Author : KeyForce 6 | @Email : july.master@outlook.com 7 | """ 8 | import os 9 | import torch 10 | import pandas as pd 11 | from skimage import io, transform 12 | import numpy as np 13 | import matplotlib.pyplot as plt 14 | from torch.utils.data import Dataset, DataLoader 15 | from torchvision import transforms, utils 16 | import json 17 | 18 | 19 | class ReadJsonPoint: 20 | """读取CMU手部21点关键点数据""" 21 | def __init__(self, json_path): 22 | self.json_path = json_path 23 | self.hand_point = [] 24 | 25 | def read(self): 26 | with open(self.json_path, 'r') as f: 27 | hand_data = json.load(f) 28 | 29 | for i in range(21): 30 | # 这边要注意不要xy坐标搞混 31 | hand_tem_xy = hand_data['hand_pts'][i][:2] 32 | hand_tem_xy = list(map(int, hand_tem_xy)) 33 | self.hand_point.append(hand_tem_xy) 34 | 35 | # hand_point = list(map(int, hand_point)) 36 | 37 | return np.array(self.hand_point) 38 | 39 | 40 | class CMUHandPointDataset(Dataset): 41 | """读取CMU手部关键点数据""" 42 | 43 | def __init__(self, root_dir, transform=None): 44 | self.root_dir = root_dir 45 | self.transform = transform 46 | self.image_name = [] 47 | 48 | # 分离目录下的jpg和json 49 | file_list = os.listdir(root_dir) 50 | for i in file_list: 51 | if os.path.splitext(i)[1] == '.jpg': 52 | self.image_name.append(i) 53 | 54 | def __getitem__(self, item): 55 | if torch.is_tensor(item): 56 | item = item.tolist() 57 | 58 | img_path = os.path.join(self.root_dir, 59 | self.image_name[item]) 60 | image = io.imread(img_path) 61 | json_path = os.path.join(img_path.replace('.jpg', '.json')) 62 | # 调用read方法读取数据 63 | landmarks = ReadJsonPoint(json_path).read() 64 | sample = {'image': image, 'landmarks': landmarks} 65 | 66 | if self.transform: 67 | sample = self.transform(sample) 68 | 69 | return sample 70 | 71 | def __len__(self): 72 | return len(self.image_name) 73 | 74 | 75 | class Rescale(object): 76 | """Rescale the image in a sample to a given size. 77 | 78 | Args: 79 | output_size (tuple or int): Desired output size. If tuple, output is 80 | matched to output_size. If int, smaller of image edges is matched 81 | to output_size keeping aspect ratio the same. 
82 | """ 83 | 84 | def __init__(self, output_size): 85 | assert isinstance(output_size, (int, tuple)) 86 | self.output_size = output_size 87 | 88 | def __call__(self, sample): 89 | image, landmarks = sample['image'], sample['landmarks'] 90 | 91 | h, w = image.shape[:2] 92 | if isinstance(self.output_size, int): 93 | if h > w: 94 | new_h, new_w = self.output_size * h / w, self.output_size 95 | else: 96 | new_h, new_w = self.output_size, self.output_size * w / h 97 | else: 98 | new_h, new_w = self.output_size 99 | 100 | new_h, new_w = int(new_h), int(new_w) 101 | 102 | img = transform.resize(image, (new_h, new_w)) 103 | 104 | # h and w are swapped for landmarks because for images, 105 | # x and y axes are axis 1 and 0 respectively 106 | landmarks = landmarks * [new_w / w, new_h / h] 107 | 108 | 109 | return {'image': img, 'landmarks': landmarks} 110 | 111 | 112 | class RandomCrop(object): 113 | """Crop randomly the image in a sample. 114 | 115 | Args: 116 | output_size (tuple or int): Desired output size. If int, square crop 117 | is made. 118 | """ 119 | 120 | def __init__(self, output_size): 121 | assert isinstance(output_size, (int, tuple)) 122 | if isinstance(output_size, int): 123 | self.output_size = (output_size, output_size) 124 | else: 125 | assert len(output_size) == 2 126 | self.output_size = output_size 127 | 128 | def __call__(self, sample): 129 | image, landmarks = sample['image'], sample['landmarks'] 130 | 131 | h, w = image.shape[:2] 132 | new_h, new_w = self.output_size 133 | 134 | top = np.random.randint(0, h - new_h) 135 | left = np.random.randint(0, w - new_w) 136 | 137 | image = image[top: top + new_h, 138 | left: left + new_w] 139 | 140 | landmarks = landmarks - [left, top] 141 | 142 | return {'image': image, 'landmarks': landmarks} 143 | 144 | 145 | class ToTensor(object): 146 | """Convert ndarrays in sample to Tensors.""" 147 | 148 | def __call__(self, sample): 149 | image, landmarks = sample['image'], sample['landmarks'] 150 | 151 | # swap color axis because 152 | # numpy image: H x W x C 153 | # torch image: C X H X W 154 | image = image.transpose((2, 0, 1)) 155 | return {'image': torch.from_numpy(image), 156 | 'landmarks': torch.from_numpy(landmarks)} 157 | 158 | 159 | def show_landmarks(image, landmarks): 160 | """显示landmark,以方便检查数据""" 161 | plt.imshow(image) 162 | x = [] 163 | y = [] 164 | for i in range(21): 165 | x.append(landmarks[i][0]) 166 | y.append(landmarks[i][1]) 167 | plt.scatter(x, y, s=10, marker='.', c='r') 168 | 169 | 170 | if __name__ == '__main__': 171 | root_dir = '/home/wild/Hand-Keypoint-Estimation/data/Hands from Synthetic Data (6546 + 3243 + 2348 ' \ 172 | '+ 2124 = 14261 annotations)/hand_labels_synth/synth2' 173 | 174 | composed = transforms.Compose([Rescale(368), 175 | ToTensor()]) 176 | 177 | Data = CMUHandPointDataset(root_dir, composed) 178 | 179 | for i in range(8): 180 | sample = Data[i] 181 | 182 | print(i, sample['image'].shape) 183 | print('First 4 Landmarks: {}'.format(sample['landmarks'][:4])) 184 | ax = plt.subplot(2, 4, i + 1) 185 | plt.imshow(sample['image'].permute(1, 2, 0)) 186 | x = [] 187 | y = [] 188 | for i in range(21): 189 | x.append(np.array(sample['landmarks'][i][0])) 190 | y.append(np.array(sample['landmarks'][i][1])) 191 | plt.scatter(x, y, s=10, marker='.', c='r') 192 | 193 | plt.show() 194 | -------------------------------------------------------------------------------- /main/demo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @File : 
demo.py 4 | @Time : 2019/9/14 11:41 5 | @Author : KeyForce 6 | @Email : july.master@outlook.com 7 | """ 8 | -------------------------------------------------------------------------------- /main/train.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @File : train.py 4 | @Time : 2019/9/13 16:33 5 | @Author : KeyForce 6 | @Email : july.master@outlook.com 7 | """ 8 | import matplotlib.pyplot as plt 9 | import torch 10 | import torch.optim as optim 11 | import numpy as np 12 | import torch.nn as nn 13 | 14 | 15 | def Train(model, train_loader, criterion, optimizer, device, metrics=None, lr_scheduler=None, epoch=30): 16 | """ 17 | Train the model 18 | :param model: the model to train 19 | :param train_loader: training data loader 20 | :param criterion: loss function 21 | :param optimizer: optimizer 22 | :param device: GPU or CPU 23 | :param metrics: evaluation metrics 24 | :param lr_scheduler: learning-rate scheduler 25 | :param epoch: current epoch (used for logging) 26 | :return: 27 | """ 28 | model.train() 29 | for batch_idx, (image, label) in enumerate(train_loader): 30 | image, label = image.to(device), label.to(device) 31 | optimizer.zero_grad() 32 | output = model(image) 33 | label = label.long() 34 | loss = criterion(output, label) 35 | loss.backward() 36 | optimizer.step() 37 | # Log 38 | if batch_idx % 10 == 0: 39 | print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'. 40 | format(epoch, 41 | batch_idx * len(image), 42 | len(train_loader.dataset), 43 | 100. * batch_idx / len(train_loader), 44 | loss.item()) 45 | ) 46 | 47 | 48 | def Test(model, test_loader, criterion, device, epoch): 49 | """ 50 | Evaluate the model 51 | :param model: the model to evaluate 52 | :param test_loader: test data loader 53 | :param criterion: loss function 54 | :param device: GPU or CPU 55 | :param epoch: 56 | :return: 57 | """ 58 | model.eval() 59 | test_loss = 0 60 | correct = 0 61 | confusion_matrix = np.zeros((21, 21)) 62 | flag = 0 63 | with torch.no_grad(): 64 | for image, label in test_loader: 65 | image, label = image.to(device), label.to(device) 66 | output = model(image) 67 | label = label.long() 68 | loss = criterion(output, label) 69 | test_loss += loss.item() 70 | pred = output.argmax(dim=1, keepdim=True) 71 | # PA: pixel accuracy 72 | num_class = 21 73 | pre_image = pred.squeeze(1).cpu().numpy() 74 | 75 | gt_image = label.cpu().numpy() 76 | 77 | confusion_matrix = fast_hist(gt_image, pre_image, num_class) 78 | # plt.close() 79 | PA = np.diag(confusion_matrix).sum() / confusion_matrix.sum() 80 | test_loss /= len(test_loader.dataset) 81 | 82 | print('\nTest set: Average loss: {:.4f}, PA: {}\n'.
83 | format(loss, 84 | PA, 85 | ) 86 | ) 87 | 88 | 89 | def fast_hist(a, b, n): 90 | k = (a >= 0) & (a < n) 91 | return np.bincount(n * a[k].astype(int) + b[k], minlength=n ** 2).reshape(n, n) 92 | 93 | 94 | def main(): 95 | # 加载数据 96 | 97 | 98 | # 使用drop_last让Batch能够整除 99 | train_loader = torch.utils.data.DataLoader(train_data, batch_size=16, drop_last=True) 100 | test_loader = torch.utils.data.DataLoader(test_data, batch_size=16, drop_last=True) 101 | 102 | # 设置GPU 103 | torch.cuda.set_device(0) 104 | device = torch.device("cuda") 105 | # 初始化模型,损失,优化器 106 | model = 107 | loss = nn.CrossEntropyLoss(ignore_index=255, reduction='mean').to(device) 108 | optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4) 109 | # optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.8, weight_decay=5e-4) 110 | # 开始训练 111 | for epoch in range(40): 112 | Train(model, train_loader=train_loader, 113 | criterion=loss, optimizer=optimizer, 114 | device=device, epoch=epoch) 115 | Test(model, test_loader, loss, device, epoch) 116 | 117 | 118 | if __name__ == '__main__': 119 | main() -------------------------------------------------------------------------------- /other/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8.12) 2 | 3 | PROJECT(handPose) 4 | 5 | find_package( OpenCV REQUIRED ) 6 | 7 | include_directories( ${OpenCV_INCLUDE_DIRS}) 8 | 9 | MACRO(add_example name) 10 | ADD_EXECUTABLE(${name} ${name}.cpp) 11 | TARGET_LINK_LIBRARIES(${name} ${OpenCV_LIBS}) 12 | ENDMACRO() 13 | 14 | 15 | add_example(handPoseImage) 16 | add_example(handPoseVideo) 17 | -------------------------------------------------------------------------------- /other/Hand_Caffe/1_F_deploy.prototxt: -------------------------------------------------------------------------------- 1 | # This file gives the CNN model to predict all landmark in LEVEL-1 2 | name: "landmark_1_F" 3 | layer { 4 | name: "data" 5 | type: "MemoryData" 6 | top: "data" 7 | top: "landmark" 8 | 9 | memory_data_param { 10 | batch_size: 1 11 | channels: 1 12 | height: 39 13 | width: 39 14 | } 15 | transform_param { 16 | scale: 0.00390625 17 | } 18 | } 19 | layer { 20 | name: "conv1" 21 | type: "Convolution" 22 | bottom: "data" 23 | top: "conv1" 24 | param { 25 | lr_mult: 1 26 | } 27 | param { 28 | lr_mult: 2 29 | } 30 | convolution_param { 31 | num_output: 20 32 | kernel_size: 4 33 | weight_filler { 34 | type: "xavier" 35 | } 36 | bias_filler { 37 | type: "constant" 38 | } 39 | } 40 | } 41 | layer { 42 | name: "relu1" 43 | type: "ReLU" 44 | bottom: "conv1" 45 | top: "conv1" 46 | } 47 | layer { 48 | name: "pool1" 49 | type: "Pooling" 50 | bottom: "conv1" 51 | top: "pool1" 52 | pooling_param { 53 | pool: MAX 54 | kernel_size: 2 55 | stride: 2 56 | } 57 | } 58 | layer { 59 | name: "conv2" 60 | type: "Convolution" 61 | bottom: "pool1" 62 | top: "conv2" 63 | param { 64 | lr_mult: 1 65 | } 66 | param { 67 | lr_mult: 2 68 | } 69 | convolution_param { 70 | num_output: 40 71 | kernel_size: 3 72 | weight_filler { 73 | type: "xavier" 74 | } 75 | bias_filler { 76 | type: "constant" 77 | } 78 | } 79 | } 80 | layer { 81 | name: "relu2" 82 | type: "ReLU" 83 | bottom: "conv2" 84 | top: "conv2" 85 | } 86 | layer { 87 | name: "pool2" 88 | type: "Pooling" 89 | bottom: "conv2" 90 | top: "pool2" 91 | pooling_param { 92 | pool: MAX 93 | kernel_size: 2 94 | stride: 2 95 | } 96 | } 97 | layer { 98 | name: "conv3" 99 | type: "Convolution" 100 | bottom: "pool2" 101 | top: "conv3" 102 | param 
{ 103 | lr_mult: 1 104 | } 105 | param { 106 | lr_mult: 2 107 | } 108 | convolution_param { 109 | num_output: 60 110 | kernel_size: 3 111 | weight_filler { 112 | type: "xavier" 113 | } 114 | bias_filler { 115 | type: "constant" 116 | } 117 | } 118 | } 119 | layer { 120 | name: "relu3" 121 | type: "ReLU" 122 | bottom: "conv3" 123 | top: "conv3" 124 | } 125 | layer { 126 | name: "pool3" 127 | type: "Pooling" 128 | bottom: "conv3" 129 | top: "pool3" 130 | pooling_param { 131 | pool: MAX 132 | kernel_size: 2 133 | stride: 2 134 | } 135 | } 136 | layer { 137 | name: "conv4" 138 | type: "Convolution" 139 | bottom: "pool3" 140 | top: "conv4" 141 | param { 142 | lr_mult: 1 143 | } 144 | param { 145 | lr_mult: 2 146 | } 147 | convolution_param { 148 | num_output: 80 149 | kernel_size: 2 150 | weight_filler { 151 | type: "xavier" 152 | } 153 | bias_filler { 154 | type: "constant" 155 | } 156 | } 157 | } 158 | layer { 159 | name: "relu4" 160 | type: "ReLU" 161 | bottom: "conv4" 162 | top: "conv4" 163 | } 164 | layer { 165 | name: "pool3_flat" 166 | type: "Flatten" 167 | bottom: "pool3" 168 | top: "pool3_flat" 169 | } 170 | layer { 171 | name: "conv4_flat" 172 | type: "Flatten" 173 | bottom: "conv4" 174 | top: "conv4_flat" 175 | } 176 | layer { 177 | name: "concat" 178 | type: "Concat" 179 | bottom: "pool3_flat" 180 | bottom: "conv4_flat" 181 | top: "faker" 182 | concat_param { 183 | concat_dim: 1 184 | } 185 | } 186 | layer { 187 | name: "fc1" 188 | type: "InnerProduct" 189 | bottom: "faker" 190 | top: "fc1" 191 | param { 192 | lr_mult: 1 193 | } 194 | param { 195 | lr_mult: 2 196 | } 197 | inner_product_param { 198 | num_output: 120 199 | weight_filler { 200 | type: "xavier" 201 | } 202 | bias_filler { 203 | type: "constant" 204 | } 205 | } 206 | } 207 | layer { 208 | name: "relu_fc1" 209 | type: "ReLU" 210 | bottom: "fc1" 211 | top: "fc1" 212 | } 213 | layer { 214 | name: "fc2" 215 | type: "InnerProduct" 216 | bottom: "fc1" 217 | top: "fc2" 218 | param { 219 | lr_mult: 1 220 | } 221 | param { 222 | lr_mult: 2 223 | } 224 | inner_product_param { 225 | num_output: 10 226 | weight_filler { 227 | type: "xavier" 228 | } 229 | bias_filler { 230 | type: "constant" 231 | } 232 | } 233 | } 234 | layer { 235 | name: "relu_fc2" 236 | type: "ReLU" 237 | bottom: "fc2" 238 | top: "fc2" 239 | } 240 | -------------------------------------------------------------------------------- /other/Hand_Caffe/1_F_solver.prototxt: -------------------------------------------------------------------------------- 1 | net: "/home/wild/Face_Landmark/Hand_Test/1_F_train.prototxt" 2 | 3 | test_iter: 25 4 | test_interval: 1000 5 | 6 | base_lr: 0.001 7 | momentum: 0.9 8 | weight_decay: 0.0005 9 | 10 | lr_policy: "inv" 11 | gamma: 0.0001 12 | power: 0.75 13 | 14 | #lr_policy: "step" 15 | #gamma: 0.1 16 | #stepsize: 50000 17 | 18 | display: 200 19 | 20 | max_iter: 50000 21 | 22 | snapshot: 50000 23 | snapshot_prefix: "/home/wild/Face_Landmark/Hand_Test/" 24 | 25 | test_compute_loss: true 26 | 27 | solver_mode: GPU -------------------------------------------------------------------------------- /other/Hand_Caffe/1_F_train.prototxt: -------------------------------------------------------------------------------- 1 | # This file gives the CNN model to predict all landmark in LEVEL-1 2 | name: "landmark_1_F" 3 | layer { 4 | name: "hdf5_train_data" 5 | type: "HDF5Data" 6 | top: "data" 7 | top: "landmark" 8 | include { 9 | phase: TRAIN 10 | } 11 | hdf5_data_param { 12 | source: "/home/wild/Face_Landmark/Hand_Test/Mytrain/1_F/train.txt" 13 | 
batch_size: 64 14 | } 15 | } 16 | layer { 17 | name: "hdf5_test_data" 18 | type: "HDF5Data" 19 | top: "data" 20 | top: "landmark" 21 | include { 22 | phase: TEST 23 | } 24 | hdf5_data_param { 25 | source: "/home/wild/Face_Landmark/Hand_Test/Mytrain/1_F/test.txt" 26 | batch_size: 64 27 | } 28 | } 29 | layer { 30 | name: "conv1" 31 | type: "Convolution" 32 | bottom: "data" 33 | top: "conv1" 34 | param { 35 | lr_mult: 1 36 | } 37 | param { 38 | lr_mult: 2 39 | } 40 | convolution_param { 41 | num_output: 20 42 | kernel_size: 4 43 | weight_filler { 44 | type: "xavier" 45 | } 46 | bias_filler { 47 | type: "constant" 48 | } 49 | } 50 | } 51 | layer { 52 | name: "relu1" 53 | type: "ReLU" 54 | bottom: "conv1" 55 | top: "conv1" 56 | } 57 | layer { 58 | name: "pool1" 59 | type: "Pooling" 60 | bottom: "conv1" 61 | top: "pool1" 62 | pooling_param { 63 | pool: MAX 64 | kernel_size: 2 65 | stride: 2 66 | } 67 | } 68 | layer { 69 | name: "conv2" 70 | type: "Convolution" 71 | bottom: "pool1" 72 | top: "conv2" 73 | param { 74 | lr_mult: 1 75 | } 76 | param { 77 | lr_mult: 2 78 | } 79 | convolution_param { 80 | num_output: 40 81 | kernel_size: 3 82 | weight_filler { 83 | type: "xavier" 84 | } 85 | bias_filler { 86 | type: "constant" 87 | } 88 | } 89 | } 90 | layer { 91 | name: "relu2" 92 | type: "ReLU" 93 | bottom: "conv2" 94 | top: "conv2" 95 | } 96 | layer { 97 | name: "pool2" 98 | type: "Pooling" 99 | bottom: "conv2" 100 | top: "pool2" 101 | pooling_param { 102 | pool: MAX 103 | kernel_size: 2 104 | stride: 2 105 | } 106 | } 107 | layer { 108 | name: "conv3" 109 | type: "Convolution" 110 | bottom: "pool2" 111 | top: "conv3" 112 | param { 113 | lr_mult: 1 114 | } 115 | param { 116 | lr_mult: 2 117 | } 118 | convolution_param { 119 | num_output: 60 120 | kernel_size: 3 121 | weight_filler { 122 | type: "xavier" 123 | } 124 | bias_filler { 125 | type: "constant" 126 | } 127 | } 128 | } 129 | layer { 130 | name: "relu3" 131 | type: "ReLU" 132 | bottom: "conv3" 133 | top: "conv3" 134 | } 135 | layer { 136 | name: "pool3" 137 | type: "Pooling" 138 | bottom: "conv3" 139 | top: "pool3" 140 | pooling_param { 141 | pool: MAX 142 | kernel_size: 2 143 | stride: 2 144 | } 145 | } 146 | layer { 147 | name: "conv4" 148 | type: "Convolution" 149 | bottom: "pool3" 150 | top: "conv4" 151 | param { 152 | lr_mult: 1 153 | } 154 | param { 155 | lr_mult: 2 156 | } 157 | convolution_param { 158 | num_output: 80 159 | kernel_size: 2 160 | weight_filler { 161 | type: "xavier" 162 | } 163 | bias_filler { 164 | type: "constant" 165 | } 166 | } 167 | } 168 | layer { 169 | name: "relu4" 170 | type: "ReLU" 171 | bottom: "conv4" 172 | top: "conv4" 173 | } 174 | layer { 175 | name: "pool3_flat" 176 | type: "Flatten" 177 | bottom: "pool3" 178 | top: "pool3_flat" 179 | } 180 | layer { 181 | name: "conv4_flat" 182 | type: "Flatten" 183 | bottom: "conv4" 184 | top: "conv4_flat" 185 | } 186 | layer { 187 | name: "concat" 188 | type: "Concat" 189 | bottom: "pool3_flat" 190 | bottom: "conv4_flat" 191 | top: "faker" 192 | concat_param { 193 | concat_dim: 1 194 | } 195 | } 196 | layer { 197 | name: "fc1" 198 | type: "InnerProduct" 199 | bottom: "faker" 200 | top: "fc1" 201 | param { 202 | lr_mult: 1 203 | } 204 | param { 205 | lr_mult: 2 206 | } 207 | inner_product_param { 208 | num_output: 120 209 | weight_filler { 210 | type: "xavier" 211 | } 212 | bias_filler { 213 | type: "constant" 214 | } 215 | } 216 | } 217 | layer { 218 | name: "relu_fc1" 219 | type: "ReLU" 220 | bottom: "fc1" 221 | top: "fc1" 222 | } 223 | layer { 224 | name: "fc2" 225 
| type: "InnerProduct" 226 | bottom: "fc1" 227 | top: "fc2" 228 | param { 229 | lr_mult: 1 230 | } 231 | param { 232 | lr_mult: 2 233 | } 234 | inner_product_param { 235 | num_output: 10 236 | weight_filler { 237 | type: "xavier" 238 | } 239 | bias_filler { 240 | type: "constant" 241 | } 242 | } 243 | } 244 | layer { 245 | name: "relu_fc2" 246 | type: "ReLU" 247 | bottom: "fc2" 248 | top: "fc2" 249 | } 250 | layer { 251 | name: "error" 252 | type: "EuclideanLoss" 253 | bottom: "fc2" 254 | bottom: "landmark" 255 | top: "error" 256 | include { 257 | phase: TEST 258 | } 259 | } 260 | layer { 261 | name: "loss" 262 | type: "EuclideanLoss" 263 | bottom: "fc2" 264 | bottom: "landmark" 265 | top: "loss" 266 | include { 267 | phase: TRAIN 268 | } 269 | } 270 | -------------------------------------------------------------------------------- /other/Hand_Caffe/create_txt.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import cv2 4 | import numpy 5 | 6 | def make_bbox_bigger(data, xR, yR, wR, hR): 7 | 8 | xDelta = data[0] * xR 9 | yDelta = data[1] * yR 10 | wDelta = data[2] * wR 11 | hDelta = data[3] * hR 12 | 13 | x = data[0] + xDelta 14 | y = data[1] + yDelta 15 | w = data[2] + wDelta 16 | h = data[3] + hDelta 17 | return [int(x), int(y), int(w), int(h)] 18 | 19 | def get_json_point(json_path): 20 | hand_data_out = {} 21 | hand_return = {} 22 | str_point = '' 23 | cnt = numpy.zeros((21, 2), dtype=int) 24 | with open(json_path, 'r') as f: 25 | hand_data = json.load(f) 26 | 27 | for i in range(21): 28 | hand_data_out[i] = hand_data['hand_pts'][i][:2] 29 | 30 | for j in range(21): 31 | for i in range(2): 32 | hand_data_out[j][i] = int(hand_data_out[j][i]) 33 | 34 | for i in range(21): 35 | cnt[i] = numpy.array(hand_data_out[i]) 36 | 37 | index = [4, 8, 12, 16, 20] 38 | new_a = numpy.delete(cnt, index, axis=0) 39 | x, y, w, h = cv2.boundingRect(new_a) 40 | x, y, w, h = make_bbox_bigger([x, y, w, h], -0.08, -0.08, 0.8, 0.8) 41 | 42 | hand_return[0] = hand_data_out[1] 43 | hand_return[1] = hand_data_out[7] 44 | hand_return[2] = hand_data_out[11] 45 | hand_return[3] = hand_data_out[15] 46 | hand_return[4] = hand_data_out[19] 47 | 48 | # box 49 | hand_return[5] = [x, y] 50 | hand_return[6] = [w, h] 51 | for key, value in hand_return.items(): 52 | for i in range(2): 53 | str_point += str(value[i]) 54 | str_point += ' ' 55 | 56 | return str_point 57 | 58 | 59 | if __name__ == '__main__': 60 | data_sources = ['synth1', 'synth2', 'synth3', 'synth4'] 61 | root_dir = '/home/wild/Hand-Keypoint-Estimation/Hands from Synthetic Data (6546 + 3243 + 2348 + 2124 = 14261 annotations)/hand_labels_synth' 62 | 63 | data = [] 64 | 65 | for data_source in data_sources: 66 | im_dir = os.path.join(root_dir, data_source) 67 | for im_file in os.listdir(im_dir): 68 | if '.jpg' in im_file: 69 | name = im_file.rstrip('.jpg') 70 | json_file_path = os.path.join(root_dir, data_source, name + '.json') 71 | im_file_path = os.path.join(data_source, name + '.jpg') 72 | point = get_json_point(json_file_path) 73 | data.append(" ".join([im_file_path, point])) 74 | 75 | with open('{}/data.txt'.format(root_dir), 'w') as f: 76 | for image_point in data: 77 | f.write('{}\r\n'.format(image_point)) 78 | 79 | train = data[:int(len(data) * 0.7)] 80 | test = data[int(len(data) * 0.7):] 81 | 82 | with open('{}/train.txt'.format(root_dir), 'w') as f: 83 | for image_point in data: 84 | f.write('{}\r\n'.format(image_point)) 85 | 86 | with open('{}/test.txt'.format(root_dir), 'w') as f: 
87 | for image_point in data: 88 | f.write('{}\r\n'.format(image_point)) 89 | 90 | 91 | # random.shuffle(test_data) 92 | # random.shuffle(test_data) 93 | # random.shuffle(train_data) 94 | # random.shuffle(train_data) 95 | # 96 | # with open('test.txt', 'w') as f: 97 | # f.write('\n'.join(test_data)) 98 | # with open('trainval.txt', 'w') as f: 99 | # f.write('\n'.join(train_data)) 100 | -------------------------------------------------------------------------------- /other/Hand_Caffe/getBox.py: -------------------------------------------------------------------------------- 1 | import json 2 | import cv2 3 | import numpy 4 | import matplotlib.pyplot as plt 5 | 6 | im_dir = '/home/wild/Hand-Keypoint-Estimation/Hands from Synthetic Data (6546 + 3243 + 2348 + 2124 = 14261 annotations)/hand_labels_synth/synth1/0001.jpg' 7 | json_dir = '/home/wild/Hand-Keypoint-Estimation/Hands from Synthetic Data (6546 + 3243 + 2348 + 2124 = 14261 annotations)/hand_labels_synth/synth1/0001.json' 8 | hand_data_out = {} 9 | 10 | hand_data_out = {} 11 | cnt = numpy.zeros((21, 2), dtype=int) 12 | with open(json_dir, 'r') as f: 13 | hand_data = json.load(f) 14 | 15 | for i in range(21): 16 | hand_data_out[i] = hand_data['hand_pts'][i][:2] 17 | 18 | for j in range(21): 19 | for i in range(2): 20 | hand_data_out[j][i] = int(hand_data_out[j][i]) 21 | 22 | for i in range(21): 23 | cnt[i] = numpy.array(hand_data_out[i]) 24 | 25 | index = [4, 8, 12, 16, 20] 26 | new_a = numpy.delete(cnt, index, axis=0) 27 | img = cv2.imread(im_dir) 28 | x, y, w, h = cv2.boundingRect(new_a) 29 | 30 | 31 | def make_bbox_bigger(data, xR, yR, wR, hR): 32 | 33 | xDelta = data[0] * xR 34 | yDelta = data[1] * yR 35 | wDelta = data[2] * wR 36 | hDelta = data[3] * hR 37 | 38 | x = data[0] + xDelta 39 | y = data[1] + yDelta 40 | w = data[2] + wDelta 41 | h = data[3] + hDelta 42 | return [int(x), int(y), int(w), int(h)] 43 | 44 | x, y, w, h = make_bbox_bigger([x, y, w, h], -0.08, -0.08, 0.08, 0.08) 45 | 46 | cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 1) 47 | 48 | cv2.imwrite('hand.jpeg', img) 49 | plt.imshow(img) 50 | plt.show() 51 | 52 | 53 | -------------------------------------------------------------------------------- /other/Hand_Caffe/hand.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/other/Hand_Caffe/hand.jpeg -------------------------------------------------------------------------------- /other/Hand_Caffe/level1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2.7 2 | # coding: utf-8 3 | 4 | 5 | import os 6 | from os.path import join, exists 7 | 8 | import cv2 9 | import h5py 10 | import numpy as np 11 | import matplotlib.pyplot as plt 12 | from utils import getDataFromTxt 13 | from utils import shuffle_in_unison_scary, logger, createDir, processImage 14 | 15 | TRAIN = '/home/wild/Hand-Keypoint-Estimation/Hands from Synthetic Data (6546 + 3243 + 2348 + 2124 = 14261 annotations)/hand_labels_synth' 16 | OUTPUT = '/home/wild/Face_Landmark/Hand_Test/Mytrain' 17 | if not exists(OUTPUT): 18 | os.mkdir(OUTPUT) 19 | assert(exists(TRAIN) and exists(OUTPUT)) 20 | 21 | 22 | def generate_hdf5(ftxt, output, fname, argument=False): 23 | 24 | data = getDataFromTxt(ftxt) 25 | F_imgs = [] 26 | F_landmarks = [] 27 | 28 | for (imgPath, landmarkGt, bbox) in data: 29 | img = cv2.imread(imgPath, cv2.IMREAD_GRAYSCALE) 30 | assert(img is not 
None) 31 | logger("process %s" % imgPath) 32 | # plt.imshow(img) 33 | # plt.show() 34 | 35 | f_face = img[int(bbox[0]):int(bbox[2]), int(bbox[1]):int(bbox[3])] 36 | plt.imshow(f_face) 37 | plt.show() 38 | 39 | f_face = cv2.resize(f_face, (39, 39)) 40 | 41 | f_face = f_face.reshape((1, 39, 39)) 42 | 43 | f_landmark = landmarkGt.reshape((10)) 44 | F_imgs.append(f_face) 45 | F_landmarks.append(f_landmark) 46 | 47 | 48 | 49 | F_imgs, F_landmarks = np.asarray(F_imgs), np.asarray(F_landmarks) 50 | 51 | 52 | F_imgs = processImage(F_imgs) 53 | shuffle_in_unison_scary(F_imgs, F_landmarks) 54 | 55 | 56 | # full face 57 | base = join(OUTPUT, '1_F') 58 | createDir(base) 59 | output = join(base, fname) 60 | logger("generate %s" % output) 61 | 62 | 63 | with h5py.File(output, 'w') as h5: 64 | h5['data'] = F_imgs.astype(np.float32) 65 | h5['landmark'] = F_landmarks.astype(np.float32) 66 | 67 | 68 | 69 | if __name__ == '__main__': 70 | 71 | h5_path = '/home/wild/Face_Landmark/Hand_Test/Mytrain' 72 | # 训练集 73 | train_txt = join(TRAIN, 'train.txt') 74 | generate_hdf5(train_txt, OUTPUT, 'train.h5', argument=True) 75 | # 测试集 76 | test_txt = join(TRAIN, 'test.txt') 77 | generate_hdf5(test_txt, OUTPUT, 'test.h5') 78 | 79 | with open(join(OUTPUT, '1_F/train.txt'), 'w') as fd: 80 | fd.write(h5_path+'/1_F/train.h5') 81 | 82 | with open(join(OUTPUT, '1_F/test.txt'), 'w') as fd: 83 | fd.write(h5_path+'/1_F/test.h5') 84 | 85 | print 'ok' -------------------------------------------------------------------------------- /other/Hand_Caffe/read_im_json.py: -------------------------------------------------------------------------------- 1 | import json 2 | import cv2 3 | im_dir = '/home/wild/Hand-Keypoint-Estimation/Hands from Synthetic Data (6546 + 3243 + 2348 + 2124 = 14261 annotations)/hand_labels_synth/synth1/0001.jpg' 4 | json_dir = '/home/wild/Hand-Keypoint-Estimation/Hands from Synthetic Data (6546 + 3243 + 2348 + 2124 = 14261 annotations)/hand_labels_synth/synth1/0001.json' 5 | hand_data_out = {} 6 | with open(json_dir, 'r') as f: 7 | hand_data = json.load(f) 8 | 9 | for i in range(21): 10 | hand_data_out[i] = hand_data['hand_pts'][i][:2] 11 | 12 | for j in range(21): 13 | for i in range(2): 14 | hand_data_out[j][i] = int(hand_data_out[j][i]) 15 | 16 | def get_json_point(json_path): 17 | hand_data_out = {} 18 | hand_return = {} 19 | str_point = '' 20 | with open(json_dir, 'r') as f: 21 | hand_data = json.load(f) 22 | 23 | for i in range(21): 24 | hand_data_out[i] = hand_data['hand_pts'][i][:2] 25 | 26 | for j in range(21): 27 | for i in range(2): 28 | hand_data_out[j][i] = int(hand_data_out[j][i]) 29 | 30 | hand_return[0] = hand_data_out[1] 31 | hand_return[1] = hand_data_out[7] 32 | hand_return[2] = hand_data_out[11] 33 | hand_return[3] = hand_data_out[15] 34 | hand_return[4] = hand_data_out[19] 35 | for key, value in hand_return.items(): 36 | for i in range(2): 37 | str_point += str(value[i]) 38 | str_point += ' ' 39 | 40 | return str_point 41 | 42 | data = get_json_point(json_dir) 43 | 44 | # output = cv2.imread(im_dir) 45 | # for i in range(5): 46 | # cv2.circle(output, tuple(data[i]), 2, (0, 0, 255), 1) 47 | # cv2.imshow("capture", output) 48 | # while True: 49 | # if cv2.waitKey(1) == 27: 50 | # break # esc to quit -------------------------------------------------------------------------------- /other/Hand_Caffe/utils.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import os 4 | import time 5 | 6 | import cv2 7 | import numpy as np 8 | 9 | 10 | 
def logger(msg): 11 | """ 12 | log message 13 | """ 14 | now = time.ctime() 15 | print("[%s] %s" % (now, msg)) 16 | 17 | 18 | def createDir(p): 19 | if not os.path.exists(p): 20 | os.mkdir(p) 21 | 22 | 23 | def shuffle_in_unison_scary(a, b): 24 | rng_state = np.random.get_state() 25 | np.random.shuffle(a) 26 | np.random.set_state(rng_state) 27 | np.random.shuffle(b) 28 | 29 | 30 | def drawLandmark(img, bbox, landmark): 31 | cv2.rectangle(img, (bbox.left, bbox.top), (bbox.right, bbox.bottom), (0, 0, 255), 2) 32 | for x, y in landmark: 33 | cv2.circle(img, (int(x), int(y)), 2, (0, 255, 0), -1) 34 | return img 35 | 36 | 37 | def getDataFromTxt(txt, with_landmark=True): 38 | """ 39 | Generate data from txt file 40 | return [(img_path, landmark)] 41 | landmark: [(x1, y1), (x2, y2), ...] 42 | """ 43 | dirname = os.path.dirname(txt) 44 | with open(txt, 'r') as fd: 45 | lines = fd.readlines() 46 | 47 | result = [] 48 | for line in lines: 49 | line = line.strip() 50 | components = line.split(' ') 51 | img_path = os.path.join(dirname, components[0].replace('\\', '/')) # file path 52 | # bounding box, (x, y, w, h) 53 | bbox = (components[11], components[12], int(components[11]) + int(components[13]), int(components[12]) + int(components[14])) 54 | # 将字符串转换成整型 55 | bbox = [int(_) for _ in bbox] 56 | # landmark 57 | if not with_landmark: 58 | result.append((img_path, BBox(bbox))) 59 | continue 60 | landmark = np.zeros((5, 2)) 61 | # 读取关键点坐标 62 | for index in range(0, 5): 63 | rv = (float(components[1 + 2 * index]), float(components[1 + 2 * index + 1])) 64 | landmark[index] = rv 65 | for index, one in enumerate(landmark): 66 | rv = ((one[0] - bbox[0]) / (bbox[1] - bbox[0]), (one[1] - bbox[2]) / (bbox[3] - bbox[2])) 67 | landmark[index] = rv 68 | result.append((img_path, landmark, bbox)) 69 | return result 70 | 71 | 72 | def getPatch(img, bbox, point, padding): 73 | """ 74 | Get a patch iamge around the given point in bbox with padding 75 | point: relative_point in [0, 1] in bbox 76 | """ 77 | point_x = bbox.x + point[0] * bbox.w 78 | point_y = bbox.y + point[1] * bbox.h 79 | patch_left = point_x - bbox.w * padding 80 | patch_right = point_x + bbox.w * padding 81 | patch_top = point_y - bbox.h * padding 82 | patch_bottom = point_y + bbox.h * padding 83 | patch = img[patch_top: patch_bottom + 1, patch_left: patch_right + 1] 84 | patch_bbox = BBox([patch_left, patch_right, patch_top, patch_bottom]) 85 | return patch, patch_bbox 86 | 87 | 88 | def processImage(imgs): 89 | """ 90 | process images before feeding to CNNs 91 | imgs: N x 1 x W x H 92 | """ 93 | imgs = imgs.astype(np.float32) 94 | for i, img in enumerate(imgs): 95 | m = img.mean() 96 | s = img.std() 97 | imgs[i] = (img - m) / s 98 | return imgs 99 | 100 | 101 | def dataArgument(data): 102 | """ 103 | dataArguments 104 | data: 105 | imgs: N x 1 x W x H 106 | bbox: N x BBox 107 | landmarks: N x 10 108 | """ 109 | pass 110 | 111 | 112 | class BBox(object): 113 | """ 114 | Bounding Box of face 115 | """ 116 | 117 | def __init__(self, bbox): 118 | self.left = bbox[0] 119 | self.right = bbox[1] 120 | self.top = bbox[2] 121 | self.bottom = bbox[3] 122 | self.x = bbox[0] 123 | self.y = bbox[1] 124 | self.w = bbox[2] - bbox[0] 125 | self.h = bbox[3] - bbox[1] 126 | 127 | def expand(self, scale=0.05): 128 | bbox = [self.left, self.right, self.top, self.bottom] 129 | bbox[0] -= int(self.w * scale) 130 | bbox[1] += int(self.w * scale) 131 | bbox[2] -= int(self.h * scale) 132 | bbox[3] += int(self.h * scale) 133 | return BBox(bbox) 134 | 135 | def 
project(self, point): 136 | x = (point[0] - self.x) / self.w 137 | y = (point[1] - self.y) / self.h 138 | return np.asarray([x, y]) 139 | 140 | def reproject(self, point): 141 | x = self.x + self.w * point[0] 142 | y = self.y + self.h * point[1] 143 | return np.asarray([x, y]) 144 | 145 | def reprojectLandmark(self, landmark): 146 | p = np.zeros((len(landmark), 2)) 147 | for i in range(len(landmark)): 148 | p[i] = self.reproject(landmark[i]) 149 | return p 150 | 151 | def projectLandmark(self, landmark): 152 | p = np.zeros((len(landmark), 2)) 153 | for i in range(len(landmark)): 154 | p[i] = self.project(landmark[i]) 155 | return p 156 | 157 | # Expand the bounding box 158 | def subBBox(self, leftR, rightR, topR, bottomR): 159 | leftDelta = self.w * leftR 160 | rightDelta = self.w * rightR 161 | topDelta = self.h * topR 162 | bottomDelta = self.h * bottomR 163 | left = self.left + leftDelta 164 | right = self.left + rightDelta 165 | top = self.top + topDelta 166 | bottom = self.top + bottomDelta 167 | return BBox([left, right, top, bottom]) 168 | -------------------------------------------------------------------------------- /other/Hand_Detection/README.md: -------------------------------------------------------------------------------- 1 | ### [SSD-Hand-Detection](https://github.com/weiliu89/caffe/tree/ssd) 2 | #### Dataset 3 | * [egohands](http://vision.soic.indiana.edu/projects/egohands/) 4 | * [stanfordhands](http://www.robots.ox.ac.uk/~vgg/data/hands/) 5 | 6 | #### Preprocess 7 | * Keep only hands with `min(hand width, hand height) > threshold`: for egohands, `threshold=40`; for stanfordhands, `threshold=20` (see the sketch after the Train section below). 8 | * The cleaned dataset can be downloaded from [onedrive](). 9 | * Run `create_txt.py` to generate `test.txt` and `trainval.txt`. 10 | * Finally, run `./create_data.sh` to generate the lmdb files in the `data/lmdb` folder. 11 | 12 | #### Train 13 | * A pretrained model is provided by the SSD author, trained on [PASCAL VOC 2007 and 2012](http://host.robots.ox.ac.uk/pascal/VOC/). [Download Link]().
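For clarity, the size filter described under Preprocess above amounts to the following (a hypothetical helper for illustration, not code from this repository):

```
# Keep a hand box only if its shorter side exceeds the per-dataset threshold:
# 40 px for egohands, 20 px for stanfordhands. Names are illustrative.
def keep_hand_box(width, height, dataset):
    threshold = 40 if dataset == 'egohands' else 20  # 'stanfordhands'
    return min(width, height) > threshold
```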
14 | 15 | #### demo 16 | ![](pic/demo.jpg) -------------------------------------------------------------------------------- /other/Hand_Detection/data/create_annoset.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import shutil 4 | import subprocess 5 | import sys 6 | # get caffe root directory 7 | caffe_root = '../caffe' 8 | sys.path.insert(0, os.path.join(caffe_root, 'python')) 9 | from caffe.proto import caffe_pb2 10 | from google.protobuf import text_format 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser(description="Create AnnotatedDatum database") 14 | parser.add_argument("root", 15 | help="The root directory which contains the images and annotations.") 16 | parser.add_argument("listfile", 17 | help="The file which contains image paths and annotation info.") 18 | parser.add_argument("outdir", 19 | help="The output directory which stores the database file.") 20 | parser.add_argument("exampledir", 21 | help="The directory to store the link of the database files.") 22 | parser.add_argument("--redo", default = False, action = "store_true", 23 | help="Recreate the database.") 24 | parser.add_argument("--anno-type", default = "classification", 25 | help="The type of annotation {classification, detection}.") 26 | parser.add_argument("--label-type", default = "xml", 27 | help="The type of label file format for detection {xml, json, txt}.") 28 | parser.add_argument("--backend", default = "lmdb", 29 | help="The backend {lmdb, leveldb} for storing the result") 30 | parser.add_argument("--check-size", default = False, action = "store_true", 31 | help="Check that all the datum have the same size.") 32 | parser.add_argument("--encode-type", default = "", 33 | help="What type should we encode the image as ('png','jpg',...).") 34 | parser.add_argument("--encoded", default = False, action = "store_true", 35 | help="The encoded image will be save in datum.") 36 | parser.add_argument("--gray", default = False, action = "store_true", 37 | help="Treat images as grayscale ones.") 38 | parser.add_argument("--label-map-file", default = "", 39 | help="A file with LabelMap protobuf message.") 40 | parser.add_argument("--min-dim", default = 0, type = int, 41 | help="Minimum dimension images are resized to.") 42 | parser.add_argument("--max-dim", default = 0, type = int, 43 | help="Maximum dimension images are resized to.") 44 | parser.add_argument("--resize-height", default = 0, type = int, 45 | help="Height images are resized to.") 46 | parser.add_argument("--resize-width", default = 0, type = int, 47 | help="Width images are resized to.") 48 | parser.add_argument("--shuffle", default = False, action = "store_true", 49 | help="Randomly shuffle the order of images and their labels.") 50 | parser.add_argument("--check-label", default = False, action = "store_true", 51 | help="Check that there is no duplicated name/label.") 52 | 53 | args = parser.parse_args() 54 | root_dir = args.root 55 | list_file = args.listfile 56 | out_dir = args.outdir 57 | example_dir = args.exampledir 58 | 59 | redo = args.redo 60 | anno_type = args.anno_type 61 | label_type = args.label_type 62 | backend = args.backend 63 | check_size = args.check_size 64 | encode_type = args.encode_type 65 | encoded = args.encoded 66 | gray = args.gray 67 | label_map_file = args.label_map_file 68 | min_dim = args.min_dim 69 | max_dim = args.max_dim 70 | resize_height = args.resize_height 71 | resize_width = args.resize_width 72 | shuffle = 
args.shuffle 73 | check_label = args.check_label 74 | 75 | # check if root directory exists 76 | if not os.path.exists(root_dir): 77 | print("root directory: {} does not exist".format(root_dir)) 78 | sys.exit() 79 | # add "/" to root directory if needed 80 | if root_dir[-1] != "/": 81 | root_dir += "/" 82 | # check if list file exists 83 | if not os.path.exists(list_file): 84 | print("list file: {} does not exist".format(list_file)) 85 | sys.exit() 86 | # check list file format is correct 87 | with open(list_file, "r") as lf: 88 | for line in lf.readlines(): 89 | img_file, anno = line.strip("\n").split(" ") 90 | if not os.path.exists(root_dir + img_file): 91 | print("image file: {} does not exist".format(root_dir + img_file)) 92 | if anno_type == "classification": 93 | if not anno.isdigit(): 94 | print("annotation: {} is not an integer".format(anno)) 95 | elif anno_type == "detection": 96 | if not os.path.exists(root_dir + anno): 97 | print("annofation file: {} does not exist".format(root_dir + anno)) 98 | sys.exit() 99 | break 100 | # check if label map file exist 101 | if anno_type == "detection": 102 | if not os.path.exists(label_map_file): 103 | print("label map file: {} does not exist".format(label_map_file)) 104 | sys.exit() 105 | label_map = caffe_pb2.LabelMap() 106 | lmf = open(label_map_file, "r") 107 | try: 108 | text_format.Merge(str(lmf.read()), label_map) 109 | except: 110 | print("Cannot parse label map file: {}".format(label_map_file)) 111 | sys.exit() 112 | out_parent_dir = os.path.dirname(out_dir) 113 | if not os.path.exists(out_parent_dir): 114 | os.makedirs(out_parent_dir) 115 | if os.path.exists(out_dir) and not redo: 116 | print("{} already exists and I do not hear redo".format(out_dir)) 117 | sys.exit() 118 | if os.path.exists(out_dir): 119 | shutil.rmtree(out_dir) 120 | 121 | 122 | if anno_type == "detection": 123 | cmd = "{}/build/tools/convert_annoset" \ 124 | " --anno_type={}" \ 125 | " --label_type={}" \ 126 | " --label_map_file={}" \ 127 | " --check_label={}" \ 128 | " --min_dim={}" \ 129 | " --max_dim={}" \ 130 | " --resize_height={}" \ 131 | " --resize_width={}" \ 132 | " --backend={}" \ 133 | " --shuffle={}" \ 134 | " --check_size={}" \ 135 | " --encode_type={}" \ 136 | " --encoded={}" \ 137 | " --gray={}" \ 138 | " {} {} {}" \ 139 | .format(caffe_root, anno_type, label_type, label_map_file, check_label, 140 | min_dim, max_dim, resize_height, resize_width, backend, shuffle, 141 | check_size, encode_type, encoded, gray, root_dir, list_file, out_dir) 142 | elif anno_type == "classification": 143 | cmd = "{}/build/tools/convert_annoset" \ 144 | " --anno_type={}" \ 145 | " --min_dim={}" \ 146 | " --max_dim={}" \ 147 | " --resize_height={}" \ 148 | " --resize_width={}" \ 149 | " --backend={}" \ 150 | " --shuffle={}" \ 151 | " --check_size={}" \ 152 | " --encode_type={}" \ 153 | " --encoded={}" \ 154 | " --gray={}" \ 155 | " {} {} {}" \ 156 | .format(caffe_root, anno_type, min_dim, max_dim, resize_height, 157 | resize_width, backend, shuffle, check_size, encode_type, encoded, 158 | gray, root_dir, list_file, out_dir) 159 | print(cmd) 160 | process = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE) 161 | output = process.communicate()[0] 162 | 163 | if not os.path.exists(example_dir): 164 | os.makedirs(example_dir) 165 | # link_dir = os.path.join(example_dir, os.path.basename(out_dir)) 166 | # if os.path.exists(link_dir): 167 | # os.unlink(link_dir) 168 | # os.symlink(out_dir, link_dir) 169 | 
-------------------------------------------------------------------------------- /other/Hand_Detection/data/create_data.sh: -------------------------------------------------------------------------------- 1 | redo=1 2 | data_root_dir="." 3 | mapfile="labelmap_voc.prototxt" 4 | anno_type="detection" 5 | db="lmdb" 6 | min_dim=0 7 | max_dim=0 8 | width=0 9 | height=0 10 | 11 | extra_cmd="--encode-type=jpg --encoded" 12 | if [ $redo ] 13 | then 14 | extra_cmd="$extra_cmd --redo" 15 | fi 16 | for subset in test trainval 17 | do 18 | python create_annoset.py --anno-type=$anno_type --label-map-file=$mapfile --min-dim=$min_dim --max-dim=$max_dim --resize-width=$width --resize-height=$height --check-label $extra_cmd $data_root_dir $subset.txt $data_root_dir/$db/$subset"_"$db '.' 19 | done 20 | -------------------------------------------------------------------------------- /other/Hand_Detection/data/create_txt.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | data_sources = ['egohands', 'stanfordhands'] 4 | root_dir = os.path.dirname(os.path.abspath(__file__)) 5 | test_data = [] 6 | train_data = [] 7 | 8 | for data_source in data_sources: 9 | test_im_dir = os.path.join(root_dir, data_source, 'test', 'JPEGImages') 10 | train_im_dir = os.path.join(root_dir, data_source, 'trainval', 'JPEGImages') 11 | for im_file in os.listdir(test_im_dir): 12 | name = im_file.rstrip('.jpg') 13 | xml_file_path = os.path.join(data_source, 'test', 'Annotations', name+'.xml') 14 | im_file_path = os.path.join(data_source, 'test', 'JPEGImages', name+'.jpg') 15 | test_data.append(" ".join([im_file_path, xml_file_path])) 16 | for im_file in os.listdir(train_im_dir): 17 | name = im_file.rstrip('.jpg') 18 | xml_file_path = os.path.join(data_source, 'trainval', 'Annotations', name+'.xml') 19 | im_file_path = os.path.join(data_source, 'trainval', 'JPEGImages', name+'.jpg') 20 | train_data.append(" ".join([im_file_path, xml_file_path])) 21 | 22 | 23 | 24 | random.shuffle(test_data) 25 | random.shuffle(test_data) 26 | random.shuffle(train_data) 27 | random.shuffle(train_data) 28 | 29 | with open('test.txt', 'w') as f: 30 | f.write('\n'.join(test_data)) 31 | with open('trainval.txt', 'w') as f: 32 | f.write('\n'.join(train_data)) 33 | -------------------------------------------------------------------------------- /other/Hand_Detection/data/egohands/_screenshot_17.04.2018.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/other/Hand_Detection/data/egohands/_screenshot_17.04.2018.png -------------------------------------------------------------------------------- /other/Hand_Detection/data/egohands/generate_egohands.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from lxml.etree import Element, SubElement, tostring 4 | import random 5 | import cv2 6 | import shutil 7 | import tqdm 8 | 9 | data_root = '/Users/hzzone/Downloads/egohands_data/_LABELLED_SAMPLES' 10 | with open("egohands_data.txt") as f: 11 | data = f.readlines() 12 | 13 | 14 | random.shuffle(data) 15 | random.shuffle(data) 16 | 17 | test_data = random.sample(data, int(len(data)*0.2)) 18 | train_data = list(set(data) - set(test_data)) 19 | 20 | def trans(data, set_name): 21 | 22 | curr_dir = os.path.dirname(os.path.abspath(__file__)) 23 | os.mkdir(os.path.join(curr_dir, set_name)) 24 
| Annotations_dir = os.path.join(curr_dir, set_name, 'Annotations') 25 | JPEGImages_dir = os.path.join(curr_dir, set_name, 'JPEGImages') 26 | os.mkdir(Annotations_dir) 27 | os.mkdir(JPEGImages_dir) 28 | 29 | for each_pic_data in tqdm.tqdm(data): 30 | # for each_pic_data in data: 31 | data_list = each_pic_data.strip().split() 32 | video_id = data_list[0] 33 | frame_num = str(data_list[1]).zfill(4) 34 | new_img_name = '{}_{}'.format(video_id, frame_num) 35 | frame_num = 'frame_{}.jpg'.format(frame_num) 36 | 37 | 38 | im_path = os.path.join(data_root, video_id, frame_num) 39 | 40 | boxes = np.reshape(np.array(map(int, data_list[2:])), (-1, 4)) 41 | node_root = Element('annotation') 42 | 43 | node_folder = SubElement(node_root, 'folder') 44 | node_folder.text = 'egohands' 45 | 46 | node_filename = SubElement(node_root, 'filename') 47 | node_filename.text = new_img_name 48 | # 49 | node_size = SubElement(node_root, 'size') 50 | node_segmented = SubElement(node_root, 'segmented') 51 | node_segmented.text = '0' 52 | node_width = SubElement(node_size, 'width') 53 | im_height, im_width, channel = cv2.imread(im_path).shape 54 | node_width.text = str(im_width) 55 | # 56 | node_height = SubElement(node_size, 'height') 57 | node_height.text = str(im_height) 58 | # 59 | node_depth = SubElement(node_size, 'depth') 60 | node_depth.text = str(channel) 61 | # 62 | # im = cv2.imread(im_path) 63 | # for index in range(boxes.shape[0]): 64 | # minx, miny, w, h = boxes[index] 65 | # cv2.namedWindow("", 0) 66 | # cv2.resizeWindow('', 300, 300) 67 | # cv2.rectangle(im, (minx, miny), (minx+w-1, miny+h-1), (0, 255, 0), thickness=2) 68 | # print(w, h) 69 | # cv2.imshow('', im) 70 | # cv2.waitKey(0) 71 | 72 | effective_hands = 0 73 | for index in range(boxes.shape[0]): 74 | minx, miny, w, h = boxes[index] 75 | maxx = minx+w-1 76 | maxy = miny+h-1 77 | maxx = im_width if maxx > im_width else maxx 78 | maxy = im_height if maxy > im_height else maxy 79 | minx = 0 if minx < 0 else minx 80 | miny = 0 if miny < 0 else miny 81 | w = maxx-minx+1 82 | h = maxy-miny+1 83 | if min(w, h) < 40: 84 | continue 85 | if maxx <= minx or maxy <= miny: 86 | print(minx, miny) 87 | 88 | effective_hands = effective_hands + 1 89 | node_object = SubElement(node_root, 'object') 90 | node_name = SubElement(node_object, 'name') 91 | node_name.text = 'hand' 92 | node_difficult = SubElement(node_object, 'difficult') 93 | node_difficult.text = '0' 94 | node_bndbox = SubElement(node_object, 'bndbox') 95 | node_xmin = SubElement(node_bndbox, 'xmin') 96 | node_xmin.text = str(minx) 97 | node_ymin = SubElement(node_bndbox, 'ymin') 98 | node_ymin.text = str(miny) 99 | node_xmax = SubElement(node_bndbox, 'xmax') 100 | node_xmax.text = str(maxx) 101 | node_ymax = SubElement(node_bndbox, 'ymax') 102 | node_ymax.text = str(maxy) 103 | 104 | xml = tostring(node_root, pretty_print=True) 105 | # if effective_hands == 0: 106 | # print(im_path) 107 | if effective_hands != 0: 108 | # print(im_path) 109 | with open(Annotations_dir + "/" + new_img_name+'.xml', 'w') as f: 110 | f.write(xml) 111 | shutil.copy(im_path, JPEGImages_dir + '/' + new_img_name + '.jpg') 112 | 113 | trans(train_data, 'trainval') 114 | trans(test_data, 'test') 115 | -------------------------------------------------------------------------------- /other/Hand_Detection/data/egohands/getInfo.m: -------------------------------------------------------------------------------- 1 | video = getMetaBy(); 2 | fid = fopen('egohands_data.txt','w'); 3 | for i=1:1:48 4 | video_id = video(i).video_id; 5 | 
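% Needs the EgoHands "labelled samples" toolkit (getMetaBy / getBoundingBoxes)
% on the MATLAB path. For each of the 48 videos and its 100 labelled frames,
% one line is written to egohands_data.txt:
%   <video_id> <frame_num> <x> <y> <w> <h> ...  (one quadruple per hand, up to 4)
% generate_egohands.py then converts these lines into VOC-style XML annotations.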
for j=1:1:100 6 | fprintf(fid,'%s ', video_id); 7 | frame_num = video(i).labelled_frames(j).frame_num; 8 | fprintf(fid,'%s ', num2str(frame_num)); 9 | boxes = getBoundingBoxes(video(i), j); 10 | for x=1:4 11 | if sum(boxes(x, :)) ~=0 12 | box = boxes(x, :); 13 | fprintf(fid,'%d %d %d %d ', box(1), box(2), box(3), box(4)); 14 | end 15 | end 16 | fprintf(fid,'\n'); 17 | end 18 | end 19 | fclose(fid); 20 | 21 | -------------------------------------------------------------------------------- /other/Hand_Detection/data/gth/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/other/Hand_Detection/data/gth/.gitkeep -------------------------------------------------------------------------------- /other/Hand_Detection/data/labelmap_voc.prototxt: -------------------------------------------------------------------------------- 1 | item { 2 | name: "none_of_the_above" 3 | label: 0 4 | display_name: "background" 5 | } 6 | item { 7 | name: "hand" 8 | label: 1 9 | display_name: "hand" 10 | } -------------------------------------------------------------------------------- /other/Hand_Detection/data/stanfordhands/generate_stanfordhands.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | import scipy.io as sio 3 | import os 4 | import numpy as np 5 | import cv2 6 | from lxml.etree import Element, SubElement, tostring 7 | import shutil 8 | 9 | test_data = ['/Users/hzzone/Downloads/hand_dataset/test_dataset/test_data'] 10 | trainval_data = ['/Users/hzzone/Downloads/hand_dataset/training_dataset/training_data', '/Users/hzzone/Downloads/hand_dataset/validation_dataset/validation_data'] 11 | def trans(data_sources, set_name): 12 | curr_dir = os.path.dirname(os.path.abspath(__file__)) 13 | os.mkdir(os.path.join(curr_dir, set_name)) 14 | Annotations_dir = os.path.join(curr_dir, set_name, 'Annotations') 15 | JPEGImages_dir = os.path.join(curr_dir, set_name, 'JPEGImages') 16 | os.mkdir(Annotations_dir) 17 | os.mkdir(JPEGImages_dir) 18 | # cv2.namedWindow("", 0) 19 | # cv2.resizeWindow('', 300, 300) 20 | for each_source in data_sources: 21 | annotations_source = osp.join(each_source, 'annotations') 22 | img_source = osp.join(each_source, 'images') 23 | for mat_file in os.listdir(annotations_source): 24 | mat_file_path = osp.join(annotations_source, mat_file) 25 | # print(mat_file_path) 26 | img_file_path = osp.join(img_source, mat_file.rstrip('.mat'))+'.jpg' 27 | img = cv2.imread(img_file_path) 28 | boxes_data = sio.loadmat(mat_file_path)["boxes"].flatten() 29 | 30 | 31 | node_root = Element('annotation') 32 | 33 | node_folder = SubElement(node_root, 'folder') 34 | node_folder.text = 'egohands' 35 | 36 | node_filename = SubElement(node_root, 'filename') 37 | node_filename.text = mat_file.strip('.mat')+'.jpg' 38 | # 39 | node_size = SubElement(node_root, 'size') 40 | node_segmented = SubElement(node_root, 'segmented') 41 | node_segmented.text = '0' 42 | node_width = SubElement(node_size, 'width') 43 | im_height, im_width, channel = img.shape 44 | node_width.text = str(im_width) 45 | # 46 | node_height = SubElement(node_size, 'height') 47 | node_height.text = str(im_height) 48 | # 49 | node_depth = SubElement(node_size, 'depth') 50 | node_depth.text = str(channel) 51 | 52 | effective_hands = 0 53 | for box in boxes_data: 54 | tmp = np.reshape(box[0, 0].tolist()[:4], (-1, 2)) 55 | y1 = int(round(min(tmp[:, 0]), 
0)) 56 | y2 = int(round(max(tmp[:, 0]), 0)) 57 | x1 = int(round(min(tmp[:, 1]), 0)) 58 | x2 = int(round(max(tmp[:, 1]), 0)) 59 | # cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), thickness=2) 60 | x2 = im_width if x2 > im_width else x2 61 | y2 = im_height if y2 > im_height else y2 62 | x1 = 0 if x1 < 0 else x1 63 | y1 = 0 if y1 < 0 else y1 64 | 65 | width = x2-x1+1 66 | height = y2-y1+1 67 | 68 | if(min(width, height)<20): 69 | continue 70 | 71 | # if x2>im_width or x1<0 or y2>im_height or y1<0: 72 | # print(x1, x2, y1, y2, width, height, im_height, im_width) 73 | # cv2.imshow("", img) 74 | # cv2.waitKey(0) 75 | if x2<=x1 or y2<=y1: 76 | print(x1, y1) 77 | 78 | 79 | effective_hands = effective_hands + 1 80 | node_object = SubElement(node_root, 'object') 81 | node_name = SubElement(node_object, 'name') 82 | node_name.text = 'hand' 83 | node_difficult = SubElement(node_object, 'difficult') 84 | node_difficult.text = '0' 85 | node_bndbox = SubElement(node_object, 'bndbox') 86 | node_xmin = SubElement(node_bndbox, 'xmin') 87 | node_xmin.text = str(x1) 88 | node_ymin = SubElement(node_bndbox, 'ymin') 89 | node_ymin.text = str(y1) 90 | node_xmax = SubElement(node_bndbox, 'xmax') 91 | node_xmax.text = str(x2) 92 | node_ymax = SubElement(node_bndbox, 'ymax') 93 | node_ymax.text = str(y2) 94 | xml = tostring(node_root, pretty_print=True) 95 | if effective_hands != 0: 96 | with open(Annotations_dir + "/" + mat_file.rstrip('.mat') +'.xml', 'w') as f: 97 | f.write(xml) 98 | shutil.copy(img_file_path, JPEGImages_dir + '/' + mat_file.rstrip('.mat') + '.jpg') 99 | 100 | 101 | trans(trainval_data, 'trainval') 102 | trans(test_data, 'test') -------------------------------------------------------------------------------- /other/Hand_Detection/model/deploy.prototxt: -------------------------------------------------------------------------------- 1 | name: "VGG_HAND_SSD_300x300_deploy" 2 | input: "data" 3 | input_shape { 4 | dim: 1 5 | dim: 3 6 | dim: 300 7 | dim: 300 8 | } 9 | layer { 10 | name: "conv1_1" 11 | type: "Convolution" 12 | bottom: "data" 13 | top: "conv1_1" 14 | param { 15 | lr_mult: 1.0 16 | decay_mult: 1.0 17 | } 18 | param { 19 | lr_mult: 2.0 20 | decay_mult: 0.0 21 | } 22 | convolution_param { 23 | num_output: 64 24 | pad: 1 25 | kernel_size: 3 26 | weight_filler { 27 | type: "xavier" 28 | } 29 | bias_filler { 30 | type: "constant" 31 | value: 0.0 32 | } 33 | } 34 | } 35 | layer { 36 | name: "relu1_1" 37 | type: "ReLU" 38 | bottom: "conv1_1" 39 | top: "conv1_1" 40 | } 41 | layer { 42 | name: "conv1_2" 43 | type: "Convolution" 44 | bottom: "conv1_1" 45 | top: "conv1_2" 46 | param { 47 | lr_mult: 1.0 48 | decay_mult: 1.0 49 | } 50 | param { 51 | lr_mult: 2.0 52 | decay_mult: 0.0 53 | } 54 | convolution_param { 55 | num_output: 64 56 | pad: 1 57 | kernel_size: 3 58 | weight_filler { 59 | type: "xavier" 60 | } 61 | bias_filler { 62 | type: "constant" 63 | value: 0.0 64 | } 65 | } 66 | } 67 | layer { 68 | name: "relu1_2" 69 | type: "ReLU" 70 | bottom: "conv1_2" 71 | top: "conv1_2" 72 | } 73 | layer { 74 | name: "pool1" 75 | type: "Pooling" 76 | bottom: "conv1_2" 77 | top: "pool1" 78 | pooling_param { 79 | pool: MAX 80 | kernel_size: 2 81 | stride: 2 82 | } 83 | } 84 | layer { 85 | name: "conv2_1" 86 | type: "Convolution" 87 | bottom: "pool1" 88 | top: "conv2_1" 89 | param { 90 | lr_mult: 1.0 91 | decay_mult: 1.0 92 | } 93 | param { 94 | lr_mult: 2.0 95 | decay_mult: 0.0 96 | } 97 | convolution_param { 98 | num_output: 128 99 | pad: 1 100 | kernel_size: 3 101 | weight_filler { 102 | type: 
"xavier" 103 | } 104 | bias_filler { 105 | type: "constant" 106 | value: 0.0 107 | } 108 | } 109 | } 110 | layer { 111 | name: "relu2_1" 112 | type: "ReLU" 113 | bottom: "conv2_1" 114 | top: "conv2_1" 115 | } 116 | layer { 117 | name: "conv2_2" 118 | type: "Convolution" 119 | bottom: "conv2_1" 120 | top: "conv2_2" 121 | param { 122 | lr_mult: 1.0 123 | decay_mult: 1.0 124 | } 125 | param { 126 | lr_mult: 2.0 127 | decay_mult: 0.0 128 | } 129 | convolution_param { 130 | num_output: 128 131 | pad: 1 132 | kernel_size: 3 133 | weight_filler { 134 | type: "xavier" 135 | } 136 | bias_filler { 137 | type: "constant" 138 | value: 0.0 139 | } 140 | } 141 | } 142 | layer { 143 | name: "relu2_2" 144 | type: "ReLU" 145 | bottom: "conv2_2" 146 | top: "conv2_2" 147 | } 148 | layer { 149 | name: "pool2" 150 | type: "Pooling" 151 | bottom: "conv2_2" 152 | top: "pool2" 153 | pooling_param { 154 | pool: MAX 155 | kernel_size: 2 156 | stride: 2 157 | } 158 | } 159 | layer { 160 | name: "conv3_1" 161 | type: "Convolution" 162 | bottom: "pool2" 163 | top: "conv3_1" 164 | param { 165 | lr_mult: 1.0 166 | decay_mult: 1.0 167 | } 168 | param { 169 | lr_mult: 2.0 170 | decay_mult: 0.0 171 | } 172 | convolution_param { 173 | num_output: 256 174 | pad: 1 175 | kernel_size: 3 176 | weight_filler { 177 | type: "xavier" 178 | } 179 | bias_filler { 180 | type: "constant" 181 | value: 0.0 182 | } 183 | } 184 | } 185 | layer { 186 | name: "relu3_1" 187 | type: "ReLU" 188 | bottom: "conv3_1" 189 | top: "conv3_1" 190 | } 191 | layer { 192 | name: "conv3_2" 193 | type: "Convolution" 194 | bottom: "conv3_1" 195 | top: "conv3_2" 196 | param { 197 | lr_mult: 1.0 198 | decay_mult: 1.0 199 | } 200 | param { 201 | lr_mult: 2.0 202 | decay_mult: 0.0 203 | } 204 | convolution_param { 205 | num_output: 256 206 | pad: 1 207 | kernel_size: 3 208 | weight_filler { 209 | type: "xavier" 210 | } 211 | bias_filler { 212 | type: "constant" 213 | value: 0.0 214 | } 215 | } 216 | } 217 | layer { 218 | name: "relu3_2" 219 | type: "ReLU" 220 | bottom: "conv3_2" 221 | top: "conv3_2" 222 | } 223 | layer { 224 | name: "conv3_3" 225 | type: "Convolution" 226 | bottom: "conv3_2" 227 | top: "conv3_3" 228 | param { 229 | lr_mult: 1.0 230 | decay_mult: 1.0 231 | } 232 | param { 233 | lr_mult: 2.0 234 | decay_mult: 0.0 235 | } 236 | convolution_param { 237 | num_output: 256 238 | pad: 1 239 | kernel_size: 3 240 | weight_filler { 241 | type: "xavier" 242 | } 243 | bias_filler { 244 | type: "constant" 245 | value: 0.0 246 | } 247 | } 248 | } 249 | layer { 250 | name: "relu3_3" 251 | type: "ReLU" 252 | bottom: "conv3_3" 253 | top: "conv3_3" 254 | } 255 | layer { 256 | name: "pool3" 257 | type: "Pooling" 258 | bottom: "conv3_3" 259 | top: "pool3" 260 | pooling_param { 261 | pool: MAX 262 | kernel_size: 2 263 | stride: 2 264 | } 265 | } 266 | layer { 267 | name: "conv4_1" 268 | type: "Convolution" 269 | bottom: "pool3" 270 | top: "conv4_1" 271 | param { 272 | lr_mult: 1.0 273 | decay_mult: 1.0 274 | } 275 | param { 276 | lr_mult: 2.0 277 | decay_mult: 0.0 278 | } 279 | convolution_param { 280 | num_output: 512 281 | pad: 1 282 | kernel_size: 3 283 | weight_filler { 284 | type: "xavier" 285 | } 286 | bias_filler { 287 | type: "constant" 288 | value: 0.0 289 | } 290 | } 291 | } 292 | layer { 293 | name: "relu4_1" 294 | type: "ReLU" 295 | bottom: "conv4_1" 296 | top: "conv4_1" 297 | } 298 | layer { 299 | name: "conv4_2" 300 | type: "Convolution" 301 | bottom: "conv4_1" 302 | top: "conv4_2" 303 | param { 304 | lr_mult: 1.0 305 | decay_mult: 1.0 306 | } 307 | 
param { 308 | lr_mult: 2.0 309 | decay_mult: 0.0 310 | } 311 | convolution_param { 312 | num_output: 512 313 | pad: 1 314 | kernel_size: 3 315 | weight_filler { 316 | type: "xavier" 317 | } 318 | bias_filler { 319 | type: "constant" 320 | value: 0.0 321 | } 322 | } 323 | } 324 | layer { 325 | name: "relu4_2" 326 | type: "ReLU" 327 | bottom: "conv4_2" 328 | top: "conv4_2" 329 | } 330 | layer { 331 | name: "conv4_3" 332 | type: "Convolution" 333 | bottom: "conv4_2" 334 | top: "conv4_3" 335 | param { 336 | lr_mult: 1.0 337 | decay_mult: 1.0 338 | } 339 | param { 340 | lr_mult: 2.0 341 | decay_mult: 0.0 342 | } 343 | convolution_param { 344 | num_output: 512 345 | pad: 1 346 | kernel_size: 3 347 | weight_filler { 348 | type: "xavier" 349 | } 350 | bias_filler { 351 | type: "constant" 352 | value: 0.0 353 | } 354 | } 355 | } 356 | layer { 357 | name: "relu4_3" 358 | type: "ReLU" 359 | bottom: "conv4_3" 360 | top: "conv4_3" 361 | } 362 | layer { 363 | name: "pool4" 364 | type: "Pooling" 365 | bottom: "conv4_3" 366 | top: "pool4" 367 | pooling_param { 368 | pool: MAX 369 | kernel_size: 2 370 | stride: 2 371 | } 372 | } 373 | layer { 374 | name: "conv5_1" 375 | type: "Convolution" 376 | bottom: "pool4" 377 | top: "conv5_1" 378 | param { 379 | lr_mult: 1.0 380 | decay_mult: 1.0 381 | } 382 | param { 383 | lr_mult: 2.0 384 | decay_mult: 0.0 385 | } 386 | convolution_param { 387 | num_output: 512 388 | pad: 1 389 | kernel_size: 3 390 | weight_filler { 391 | type: "xavier" 392 | } 393 | bias_filler { 394 | type: "constant" 395 | value: 0.0 396 | } 397 | dilation: 1 398 | } 399 | } 400 | layer { 401 | name: "relu5_1" 402 | type: "ReLU" 403 | bottom: "conv5_1" 404 | top: "conv5_1" 405 | } 406 | layer { 407 | name: "conv5_2" 408 | type: "Convolution" 409 | bottom: "conv5_1" 410 | top: "conv5_2" 411 | param { 412 | lr_mult: 1.0 413 | decay_mult: 1.0 414 | } 415 | param { 416 | lr_mult: 2.0 417 | decay_mult: 0.0 418 | } 419 | convolution_param { 420 | num_output: 512 421 | pad: 1 422 | kernel_size: 3 423 | weight_filler { 424 | type: "xavier" 425 | } 426 | bias_filler { 427 | type: "constant" 428 | value: 0.0 429 | } 430 | dilation: 1 431 | } 432 | } 433 | layer { 434 | name: "relu5_2" 435 | type: "ReLU" 436 | bottom: "conv5_2" 437 | top: "conv5_2" 438 | } 439 | layer { 440 | name: "conv5_3" 441 | type: "Convolution" 442 | bottom: "conv5_2" 443 | top: "conv5_3" 444 | param { 445 | lr_mult: 1.0 446 | decay_mult: 1.0 447 | } 448 | param { 449 | lr_mult: 2.0 450 | decay_mult: 0.0 451 | } 452 | convolution_param { 453 | num_output: 512 454 | pad: 1 455 | kernel_size: 3 456 | weight_filler { 457 | type: "xavier" 458 | } 459 | bias_filler { 460 | type: "constant" 461 | value: 0.0 462 | } 463 | dilation: 1 464 | } 465 | } 466 | layer { 467 | name: "relu5_3" 468 | type: "ReLU" 469 | bottom: "conv5_3" 470 | top: "conv5_3" 471 | } 472 | layer { 473 | name: "pool5" 474 | type: "Pooling" 475 | bottom: "conv5_3" 476 | top: "pool5" 477 | pooling_param { 478 | pool: MAX 479 | kernel_size: 3 480 | stride: 1 481 | pad: 1 482 | } 483 | } 484 | layer { 485 | name: "fc6" 486 | type: "Convolution" 487 | bottom: "pool5" 488 | top: "fc6" 489 | param { 490 | lr_mult: 1.0 491 | decay_mult: 1.0 492 | } 493 | param { 494 | lr_mult: 2.0 495 | decay_mult: 0.0 496 | } 497 | convolution_param { 498 | num_output: 1024 499 | pad: 6 500 | kernel_size: 3 501 | weight_filler { 502 | type: "xavier" 503 | } 504 | bias_filler { 505 | type: "constant" 506 | value: 0.0 507 | } 508 | dilation: 6 509 | } 510 | } 511 | layer { 512 | name: "relu6" 
513 | type: "ReLU" 514 | bottom: "fc6" 515 | top: "fc6" 516 | } 517 | layer { 518 | name: "fc7" 519 | type: "Convolution" 520 | bottom: "fc6" 521 | top: "fc7" 522 | param { 523 | lr_mult: 1.0 524 | decay_mult: 1.0 525 | } 526 | param { 527 | lr_mult: 2.0 528 | decay_mult: 0.0 529 | } 530 | convolution_param { 531 | num_output: 1024 532 | kernel_size: 1 533 | weight_filler { 534 | type: "xavier" 535 | } 536 | bias_filler { 537 | type: "constant" 538 | value: 0.0 539 | } 540 | } 541 | } 542 | layer { 543 | name: "relu7" 544 | type: "ReLU" 545 | bottom: "fc7" 546 | top: "fc7" 547 | } 548 | layer { 549 | name: "conv6_1" 550 | type: "Convolution" 551 | bottom: "fc7" 552 | top: "conv6_1" 553 | param { 554 | lr_mult: 1.0 555 | decay_mult: 1.0 556 | } 557 | param { 558 | lr_mult: 2.0 559 | decay_mult: 0.0 560 | } 561 | convolution_param { 562 | num_output: 256 563 | pad: 0 564 | kernel_size: 1 565 | stride: 1 566 | weight_filler { 567 | type: "xavier" 568 | } 569 | bias_filler { 570 | type: "constant" 571 | value: 0.0 572 | } 573 | } 574 | } 575 | layer { 576 | name: "conv6_1_relu" 577 | type: "ReLU" 578 | bottom: "conv6_1" 579 | top: "conv6_1" 580 | } 581 | layer { 582 | name: "conv6_2" 583 | type: "Convolution" 584 | bottom: "conv6_1" 585 | top: "conv6_2" 586 | param { 587 | lr_mult: 1.0 588 | decay_mult: 1.0 589 | } 590 | param { 591 | lr_mult: 2.0 592 | decay_mult: 0.0 593 | } 594 | convolution_param { 595 | num_output: 512 596 | pad: 1 597 | kernel_size: 3 598 | stride: 2 599 | weight_filler { 600 | type: "xavier" 601 | } 602 | bias_filler { 603 | type: "constant" 604 | value: 0.0 605 | } 606 | } 607 | } 608 | layer { 609 | name: "conv6_2_relu" 610 | type: "ReLU" 611 | bottom: "conv6_2" 612 | top: "conv6_2" 613 | } 614 | layer { 615 | name: "conv7_1" 616 | type: "Convolution" 617 | bottom: "conv6_2" 618 | top: "conv7_1" 619 | param { 620 | lr_mult: 1.0 621 | decay_mult: 1.0 622 | } 623 | param { 624 | lr_mult: 2.0 625 | decay_mult: 0.0 626 | } 627 | convolution_param { 628 | num_output: 128 629 | pad: 0 630 | kernel_size: 1 631 | stride: 1 632 | weight_filler { 633 | type: "xavier" 634 | } 635 | bias_filler { 636 | type: "constant" 637 | value: 0.0 638 | } 639 | } 640 | } 641 | layer { 642 | name: "conv7_1_relu" 643 | type: "ReLU" 644 | bottom: "conv7_1" 645 | top: "conv7_1" 646 | } 647 | layer { 648 | name: "conv7_2" 649 | type: "Convolution" 650 | bottom: "conv7_1" 651 | top: "conv7_2" 652 | param { 653 | lr_mult: 1.0 654 | decay_mult: 1.0 655 | } 656 | param { 657 | lr_mult: 2.0 658 | decay_mult: 0.0 659 | } 660 | convolution_param { 661 | num_output: 256 662 | pad: 1 663 | kernel_size: 3 664 | stride: 2 665 | weight_filler { 666 | type: "xavier" 667 | } 668 | bias_filler { 669 | type: "constant" 670 | value: 0.0 671 | } 672 | } 673 | } 674 | layer { 675 | name: "conv7_2_relu" 676 | type: "ReLU" 677 | bottom: "conv7_2" 678 | top: "conv7_2" 679 | } 680 | layer { 681 | name: "conv8_1" 682 | type: "Convolution" 683 | bottom: "conv7_2" 684 | top: "conv8_1" 685 | param { 686 | lr_mult: 1.0 687 | decay_mult: 1.0 688 | } 689 | param { 690 | lr_mult: 2.0 691 | decay_mult: 0.0 692 | } 693 | convolution_param { 694 | num_output: 128 695 | pad: 0 696 | kernel_size: 1 697 | stride: 1 698 | weight_filler { 699 | type: "xavier" 700 | } 701 | bias_filler { 702 | type: "constant" 703 | value: 0.0 704 | } 705 | } 706 | } 707 | layer { 708 | name: "conv8_1_relu" 709 | type: "ReLU" 710 | bottom: "conv8_1" 711 | top: "conv8_1" 712 | } 713 | layer { 714 | name: "conv8_2" 715 | type: "Convolution" 716 | bottom: 
"conv8_1" 717 | top: "conv8_2" 718 | param { 719 | lr_mult: 1.0 720 | decay_mult: 1.0 721 | } 722 | param { 723 | lr_mult: 2.0 724 | decay_mult: 0.0 725 | } 726 | convolution_param { 727 | num_output: 256 728 | pad: 0 729 | kernel_size: 3 730 | stride: 1 731 | weight_filler { 732 | type: "xavier" 733 | } 734 | bias_filler { 735 | type: "constant" 736 | value: 0.0 737 | } 738 | } 739 | } 740 | layer { 741 | name: "conv8_2_relu" 742 | type: "ReLU" 743 | bottom: "conv8_2" 744 | top: "conv8_2" 745 | } 746 | layer { 747 | name: "conv9_1" 748 | type: "Convolution" 749 | bottom: "conv8_2" 750 | top: "conv9_1" 751 | param { 752 | lr_mult: 1.0 753 | decay_mult: 1.0 754 | } 755 | param { 756 | lr_mult: 2.0 757 | decay_mult: 0.0 758 | } 759 | convolution_param { 760 | num_output: 128 761 | pad: 0 762 | kernel_size: 1 763 | stride: 1 764 | weight_filler { 765 | type: "xavier" 766 | } 767 | bias_filler { 768 | type: "constant" 769 | value: 0.0 770 | } 771 | } 772 | } 773 | layer { 774 | name: "conv9_1_relu" 775 | type: "ReLU" 776 | bottom: "conv9_1" 777 | top: "conv9_1" 778 | } 779 | layer { 780 | name: "conv9_2" 781 | type: "Convolution" 782 | bottom: "conv9_1" 783 | top: "conv9_2" 784 | param { 785 | lr_mult: 1.0 786 | decay_mult: 1.0 787 | } 788 | param { 789 | lr_mult: 2.0 790 | decay_mult: 0.0 791 | } 792 | convolution_param { 793 | num_output: 256 794 | pad: 0 795 | kernel_size: 3 796 | stride: 1 797 | weight_filler { 798 | type: "xavier" 799 | } 800 | bias_filler { 801 | type: "constant" 802 | value: 0.0 803 | } 804 | } 805 | } 806 | layer { 807 | name: "conv9_2_relu" 808 | type: "ReLU" 809 | bottom: "conv9_2" 810 | top: "conv9_2" 811 | } 812 | layer { 813 | name: "conv4_3_norm" 814 | type: "Normalize" 815 | bottom: "conv4_3" 816 | top: "conv4_3_norm" 817 | norm_param { 818 | across_spatial: false 819 | scale_filler { 820 | type: "constant" 821 | value: 20.0 822 | } 823 | channel_shared: false 824 | } 825 | } 826 | layer { 827 | name: "conv4_3_norm_mbox_loc" 828 | type: "Convolution" 829 | bottom: "conv4_3_norm" 830 | top: "conv4_3_norm_mbox_loc" 831 | param { 832 | lr_mult: 1.0 833 | decay_mult: 1.0 834 | } 835 | param { 836 | lr_mult: 2.0 837 | decay_mult: 0.0 838 | } 839 | convolution_param { 840 | num_output: 16 841 | pad: 1 842 | kernel_size: 3 843 | stride: 1 844 | weight_filler { 845 | type: "xavier" 846 | } 847 | bias_filler { 848 | type: "constant" 849 | value: 0.0 850 | } 851 | } 852 | } 853 | layer { 854 | name: "conv4_3_norm_mbox_loc_perm" 855 | type: "Permute" 856 | bottom: "conv4_3_norm_mbox_loc" 857 | top: "conv4_3_norm_mbox_loc_perm" 858 | permute_param { 859 | order: 0 860 | order: 2 861 | order: 3 862 | order: 1 863 | } 864 | } 865 | layer { 866 | name: "conv4_3_norm_mbox_loc_flat" 867 | type: "Flatten" 868 | bottom: "conv4_3_norm_mbox_loc_perm" 869 | top: "conv4_3_norm_mbox_loc_flat" 870 | flatten_param { 871 | axis: 1 872 | } 873 | } 874 | layer { 875 | name: "conv4_3_norm_mbox_conf_hand_detection" 876 | type: "Convolution" 877 | bottom: "conv4_3_norm" 878 | top: "conv4_3_norm_mbox_conf_hand_detection" 879 | param { 880 | lr_mult: 1.0 881 | decay_mult: 1.0 882 | } 883 | param { 884 | lr_mult: 2.0 885 | decay_mult: 0.0 886 | } 887 | convolution_param { 888 | num_output: 8 889 | pad: 1 890 | kernel_size: 3 891 | stride: 1 892 | weight_filler { 893 | type: "xavier" 894 | } 895 | bias_filler { 896 | type: "constant" 897 | value: 0.0 898 | } 899 | } 900 | } 901 | layer { 902 | name: "conv4_3_norm_mbox_conf_hand_detection_perm" 903 | type: "Permute" 904 | bottom: 
"conv4_3_norm_mbox_conf_hand_detection" 905 | top: "conv4_3_norm_mbox_conf_hand_detection_perm" 906 | permute_param { 907 | order: 0 908 | order: 2 909 | order: 3 910 | order: 1 911 | } 912 | } 913 | layer { 914 | name: "conv4_3_norm_mbox_conf_hand_detection_flat" 915 | type: "Flatten" 916 | bottom: "conv4_3_norm_mbox_conf_hand_detection_perm" 917 | top: "conv4_3_norm_mbox_conf_hand_detection_flat" 918 | flatten_param { 919 | axis: 1 920 | } 921 | } 922 | layer { 923 | name: "conv4_3_norm_mbox_priorbox" 924 | type: "PriorBox" 925 | bottom: "conv4_3_norm" 926 | bottom: "data" 927 | top: "conv4_3_norm_mbox_priorbox" 928 | prior_box_param { 929 | min_size: 30.0 930 | max_size: 60.0 931 | aspect_ratio: 2.0 932 | flip: true 933 | clip: false 934 | variance: 0.10000000149 935 | variance: 0.10000000149 936 | variance: 0.20000000298 937 | variance: 0.20000000298 938 | step: 8.0 939 | offset: 0.5 940 | } 941 | } 942 | layer { 943 | name: "fc7_mbox_loc" 944 | type: "Convolution" 945 | bottom: "fc7" 946 | top: "fc7_mbox_loc" 947 | param { 948 | lr_mult: 1.0 949 | decay_mult: 1.0 950 | } 951 | param { 952 | lr_mult: 2.0 953 | decay_mult: 0.0 954 | } 955 | convolution_param { 956 | num_output: 24 957 | pad: 1 958 | kernel_size: 3 959 | stride: 1 960 | weight_filler { 961 | type: "xavier" 962 | } 963 | bias_filler { 964 | type: "constant" 965 | value: 0.0 966 | } 967 | } 968 | } 969 | layer { 970 | name: "fc7_mbox_loc_perm" 971 | type: "Permute" 972 | bottom: "fc7_mbox_loc" 973 | top: "fc7_mbox_loc_perm" 974 | permute_param { 975 | order: 0 976 | order: 2 977 | order: 3 978 | order: 1 979 | } 980 | } 981 | layer { 982 | name: "fc7_mbox_loc_flat" 983 | type: "Flatten" 984 | bottom: "fc7_mbox_loc_perm" 985 | top: "fc7_mbox_loc_flat" 986 | flatten_param { 987 | axis: 1 988 | } 989 | } 990 | layer { 991 | name: "fc7_mbox_conf_hand_detection" 992 | type: "Convolution" 993 | bottom: "fc7" 994 | top: "fc7_mbox_conf_hand_detection" 995 | param { 996 | lr_mult: 1.0 997 | decay_mult: 1.0 998 | } 999 | param { 1000 | lr_mult: 2.0 1001 | decay_mult: 0.0 1002 | } 1003 | convolution_param { 1004 | num_output: 12 1005 | pad: 1 1006 | kernel_size: 3 1007 | stride: 1 1008 | weight_filler { 1009 | type: "xavier" 1010 | } 1011 | bias_filler { 1012 | type: "constant" 1013 | value: 0.0 1014 | } 1015 | } 1016 | } 1017 | layer { 1018 | name: "fc7_mbox_conf_hand_detection_perm" 1019 | type: "Permute" 1020 | bottom: "fc7_mbox_conf_hand_detection" 1021 | top: "fc7_mbox_conf_hand_detection_perm" 1022 | permute_param { 1023 | order: 0 1024 | order: 2 1025 | order: 3 1026 | order: 1 1027 | } 1028 | } 1029 | layer { 1030 | name: "fc7_mbox_conf_hand_detection_flat" 1031 | type: "Flatten" 1032 | bottom: "fc7_mbox_conf_hand_detection_perm" 1033 | top: "fc7_mbox_conf_hand_detection_flat" 1034 | flatten_param { 1035 | axis: 1 1036 | } 1037 | } 1038 | layer { 1039 | name: "fc7_mbox_priorbox" 1040 | type: "PriorBox" 1041 | bottom: "fc7" 1042 | bottom: "data" 1043 | top: "fc7_mbox_priorbox" 1044 | prior_box_param { 1045 | min_size: 60.0 1046 | max_size: 111.0 1047 | aspect_ratio: 2.0 1048 | aspect_ratio: 3.0 1049 | flip: true 1050 | clip: false 1051 | variance: 0.10000000149 1052 | variance: 0.10000000149 1053 | variance: 0.20000000298 1054 | variance: 0.20000000298 1055 | step: 16.0 1056 | offset: 0.5 1057 | } 1058 | } 1059 | layer { 1060 | name: "conv6_2_mbox_loc" 1061 | type: "Convolution" 1062 | bottom: "conv6_2" 1063 | top: "conv6_2_mbox_loc" 1064 | param { 1065 | lr_mult: 1.0 1066 | decay_mult: 1.0 1067 | } 1068 | param { 1069 | 
lr_mult: 2.0 1070 | decay_mult: 0.0 1071 | } 1072 | convolution_param { 1073 | num_output: 24 1074 | pad: 1 1075 | kernel_size: 3 1076 | stride: 1 1077 | weight_filler { 1078 | type: "xavier" 1079 | } 1080 | bias_filler { 1081 | type: "constant" 1082 | value: 0.0 1083 | } 1084 | } 1085 | } 1086 | layer { 1087 | name: "conv6_2_mbox_loc_perm" 1088 | type: "Permute" 1089 | bottom: "conv6_2_mbox_loc" 1090 | top: "conv6_2_mbox_loc_perm" 1091 | permute_param { 1092 | order: 0 1093 | order: 2 1094 | order: 3 1095 | order: 1 1096 | } 1097 | } 1098 | layer { 1099 | name: "conv6_2_mbox_loc_flat" 1100 | type: "Flatten" 1101 | bottom: "conv6_2_mbox_loc_perm" 1102 | top: "conv6_2_mbox_loc_flat" 1103 | flatten_param { 1104 | axis: 1 1105 | } 1106 | } 1107 | layer { 1108 | name: "conv6_2_mbox_conf_hand_detection" 1109 | type: "Convolution" 1110 | bottom: "conv6_2" 1111 | top: "conv6_2_mbox_conf_hand_detection" 1112 | param { 1113 | lr_mult: 1.0 1114 | decay_mult: 1.0 1115 | } 1116 | param { 1117 | lr_mult: 2.0 1118 | decay_mult: 0.0 1119 | } 1120 | convolution_param { 1121 | num_output: 12 1122 | pad: 1 1123 | kernel_size: 3 1124 | stride: 1 1125 | weight_filler { 1126 | type: "xavier" 1127 | } 1128 | bias_filler { 1129 | type: "constant" 1130 | value: 0.0 1131 | } 1132 | } 1133 | } 1134 | layer { 1135 | name: "conv6_2_mbox_conf_hand_detection_perm" 1136 | type: "Permute" 1137 | bottom: "conv6_2_mbox_conf_hand_detection" 1138 | top: "conv6_2_mbox_conf_hand_detection_perm" 1139 | permute_param { 1140 | order: 0 1141 | order: 2 1142 | order: 3 1143 | order: 1 1144 | } 1145 | } 1146 | layer { 1147 | name: "conv6_2_mbox_conf_hand_detection_flat" 1148 | type: "Flatten" 1149 | bottom: "conv6_2_mbox_conf_hand_detection_perm" 1150 | top: "conv6_2_mbox_conf_hand_detection_flat" 1151 | flatten_param { 1152 | axis: 1 1153 | } 1154 | } 1155 | layer { 1156 | name: "conv6_2_mbox_priorbox" 1157 | type: "PriorBox" 1158 | bottom: "conv6_2" 1159 | bottom: "data" 1160 | top: "conv6_2_mbox_priorbox" 1161 | prior_box_param { 1162 | min_size: 111.0 1163 | max_size: 162.0 1164 | aspect_ratio: 2.0 1165 | aspect_ratio: 3.0 1166 | flip: true 1167 | clip: false 1168 | variance: 0.10000000149 1169 | variance: 0.10000000149 1170 | variance: 0.20000000298 1171 | variance: 0.20000000298 1172 | step: 32.0 1173 | offset: 0.5 1174 | } 1175 | } 1176 | layer { 1177 | name: "conv7_2_mbox_loc" 1178 | type: "Convolution" 1179 | bottom: "conv7_2" 1180 | top: "conv7_2_mbox_loc" 1181 | param { 1182 | lr_mult: 1.0 1183 | decay_mult: 1.0 1184 | } 1185 | param { 1186 | lr_mult: 2.0 1187 | decay_mult: 0.0 1188 | } 1189 | convolution_param { 1190 | num_output: 24 1191 | pad: 1 1192 | kernel_size: 3 1193 | stride: 1 1194 | weight_filler { 1195 | type: "xavier" 1196 | } 1197 | bias_filler { 1198 | type: "constant" 1199 | value: 0.0 1200 | } 1201 | } 1202 | } 1203 | layer { 1204 | name: "conv7_2_mbox_loc_perm" 1205 | type: "Permute" 1206 | bottom: "conv7_2_mbox_loc" 1207 | top: "conv7_2_mbox_loc_perm" 1208 | permute_param { 1209 | order: 0 1210 | order: 2 1211 | order: 3 1212 | order: 1 1213 | } 1214 | } 1215 | layer { 1216 | name: "conv7_2_mbox_loc_flat" 1217 | type: "Flatten" 1218 | bottom: "conv7_2_mbox_loc_perm" 1219 | top: "conv7_2_mbox_loc_flat" 1220 | flatten_param { 1221 | axis: 1 1222 | } 1223 | } 1224 | layer { 1225 | name: "conv7_2_mbox_conf_hand_detection" 1226 | type: "Convolution" 1227 | bottom: "conv7_2" 1228 | top: "conv7_2_mbox_conf_hand_detection" 1229 | param { 1230 | lr_mult: 1.0 1231 | decay_mult: 1.0 1232 | } 1233 | param { 1234 | 
lr_mult: 2.0 1235 | decay_mult: 0.0 1236 | } 1237 | convolution_param { 1238 | num_output: 12 1239 | pad: 1 1240 | kernel_size: 3 1241 | stride: 1 1242 | weight_filler { 1243 | type: "xavier" 1244 | } 1245 | bias_filler { 1246 | type: "constant" 1247 | value: 0.0 1248 | } 1249 | } 1250 | } 1251 | layer { 1252 | name: "conv7_2_mbox_conf_hand_detection_perm" 1253 | type: "Permute" 1254 | bottom: "conv7_2_mbox_conf_hand_detection" 1255 | top: "conv7_2_mbox_conf_hand_detection_perm" 1256 | permute_param { 1257 | order: 0 1258 | order: 2 1259 | order: 3 1260 | order: 1 1261 | } 1262 | } 1263 | layer { 1264 | name: "conv7_2_mbox_conf_hand_detection_flat" 1265 | type: "Flatten" 1266 | bottom: "conv7_2_mbox_conf_hand_detection_perm" 1267 | top: "conv7_2_mbox_conf_hand_detection_flat" 1268 | flatten_param { 1269 | axis: 1 1270 | } 1271 | } 1272 | layer { 1273 | name: "conv7_2_mbox_priorbox" 1274 | type: "PriorBox" 1275 | bottom: "conv7_2" 1276 | bottom: "data" 1277 | top: "conv7_2_mbox_priorbox" 1278 | prior_box_param { 1279 | min_size: 162.0 1280 | max_size: 213.0 1281 | aspect_ratio: 2.0 1282 | aspect_ratio: 3.0 1283 | flip: true 1284 | clip: false 1285 | variance: 0.10000000149 1286 | variance: 0.10000000149 1287 | variance: 0.20000000298 1288 | variance: 0.20000000298 1289 | step: 64.0 1290 | offset: 0.5 1291 | } 1292 | } 1293 | layer { 1294 | name: "conv8_2_mbox_loc" 1295 | type: "Convolution" 1296 | bottom: "conv8_2" 1297 | top: "conv8_2_mbox_loc" 1298 | param { 1299 | lr_mult: 1.0 1300 | decay_mult: 1.0 1301 | } 1302 | param { 1303 | lr_mult: 2.0 1304 | decay_mult: 0.0 1305 | } 1306 | convolution_param { 1307 | num_output: 16 1308 | pad: 1 1309 | kernel_size: 3 1310 | stride: 1 1311 | weight_filler { 1312 | type: "xavier" 1313 | } 1314 | bias_filler { 1315 | type: "constant" 1316 | value: 0.0 1317 | } 1318 | } 1319 | } 1320 | layer { 1321 | name: "conv8_2_mbox_loc_perm" 1322 | type: "Permute" 1323 | bottom: "conv8_2_mbox_loc" 1324 | top: "conv8_2_mbox_loc_perm" 1325 | permute_param { 1326 | order: 0 1327 | order: 2 1328 | order: 3 1329 | order: 1 1330 | } 1331 | } 1332 | layer { 1333 | name: "conv8_2_mbox_loc_flat" 1334 | type: "Flatten" 1335 | bottom: "conv8_2_mbox_loc_perm" 1336 | top: "conv8_2_mbox_loc_flat" 1337 | flatten_param { 1338 | axis: 1 1339 | } 1340 | } 1341 | layer { 1342 | name: "conv8_2_mbox_conf_hand_detection" 1343 | type: "Convolution" 1344 | bottom: "conv8_2" 1345 | top: "conv8_2_mbox_conf_hand_detection" 1346 | param { 1347 | lr_mult: 1.0 1348 | decay_mult: 1.0 1349 | } 1350 | param { 1351 | lr_mult: 2.0 1352 | decay_mult: 0.0 1353 | } 1354 | convolution_param { 1355 | num_output: 8 1356 | pad: 1 1357 | kernel_size: 3 1358 | stride: 1 1359 | weight_filler { 1360 | type: "xavier" 1361 | } 1362 | bias_filler { 1363 | type: "constant" 1364 | value: 0.0 1365 | } 1366 | } 1367 | } 1368 | layer { 1369 | name: "conv8_2_mbox_conf_hand_detection_perm" 1370 | type: "Permute" 1371 | bottom: "conv8_2_mbox_conf_hand_detection" 1372 | top: "conv8_2_mbox_conf_hand_detection_perm" 1373 | permute_param { 1374 | order: 0 1375 | order: 2 1376 | order: 3 1377 | order: 1 1378 | } 1379 | } 1380 | layer { 1381 | name: "conv8_2_mbox_conf_hand_detection_flat" 1382 | type: "Flatten" 1383 | bottom: "conv8_2_mbox_conf_hand_detection_perm" 1384 | top: "conv8_2_mbox_conf_hand_detection_flat" 1385 | flatten_param { 1386 | axis: 1 1387 | } 1388 | } 1389 | layer { 1390 | name: "conv8_2_mbox_priorbox" 1391 | type: "PriorBox" 1392 | bottom: "conv8_2" 1393 | bottom: "data" 1394 | top: 
"conv8_2_mbox_priorbox" 1395 | prior_box_param { 1396 | min_size: 213.0 1397 | max_size: 264.0 1398 | aspect_ratio: 2.0 1399 | flip: true 1400 | clip: false 1401 | variance: 0.10000000149 1402 | variance: 0.10000000149 1403 | variance: 0.20000000298 1404 | variance: 0.20000000298 1405 | step: 100.0 1406 | offset: 0.5 1407 | } 1408 | } 1409 | layer { 1410 | name: "conv9_2_mbox_loc" 1411 | type: "Convolution" 1412 | bottom: "conv9_2" 1413 | top: "conv9_2_mbox_loc" 1414 | param { 1415 | lr_mult: 1.0 1416 | decay_mult: 1.0 1417 | } 1418 | param { 1419 | lr_mult: 2.0 1420 | decay_mult: 0.0 1421 | } 1422 | convolution_param { 1423 | num_output: 16 1424 | pad: 1 1425 | kernel_size: 3 1426 | stride: 1 1427 | weight_filler { 1428 | type: "xavier" 1429 | } 1430 | bias_filler { 1431 | type: "constant" 1432 | value: 0.0 1433 | } 1434 | } 1435 | } 1436 | layer { 1437 | name: "conv9_2_mbox_loc_perm" 1438 | type: "Permute" 1439 | bottom: "conv9_2_mbox_loc" 1440 | top: "conv9_2_mbox_loc_perm" 1441 | permute_param { 1442 | order: 0 1443 | order: 2 1444 | order: 3 1445 | order: 1 1446 | } 1447 | } 1448 | layer { 1449 | name: "conv9_2_mbox_loc_flat" 1450 | type: "Flatten" 1451 | bottom: "conv9_2_mbox_loc_perm" 1452 | top: "conv9_2_mbox_loc_flat" 1453 | flatten_param { 1454 | axis: 1 1455 | } 1456 | } 1457 | layer { 1458 | name: "conv9_2_mbox_conf_hand_detection" 1459 | type: "Convolution" 1460 | bottom: "conv9_2" 1461 | top: "conv9_2_mbox_conf_hand_detection" 1462 | param { 1463 | lr_mult: 1.0 1464 | decay_mult: 1.0 1465 | } 1466 | param { 1467 | lr_mult: 2.0 1468 | decay_mult: 0.0 1469 | } 1470 | convolution_param { 1471 | num_output: 8 1472 | pad: 1 1473 | kernel_size: 3 1474 | stride: 1 1475 | weight_filler { 1476 | type: "xavier" 1477 | } 1478 | bias_filler { 1479 | type: "constant" 1480 | value: 0.0 1481 | } 1482 | } 1483 | } 1484 | layer { 1485 | name: "conv9_2_mbox_conf_hand_detection_perm" 1486 | type: "Permute" 1487 | bottom: "conv9_2_mbox_conf_hand_detection" 1488 | top: "conv9_2_mbox_conf_hand_detection_perm" 1489 | permute_param { 1490 | order: 0 1491 | order: 2 1492 | order: 3 1493 | order: 1 1494 | } 1495 | } 1496 | layer { 1497 | name: "conv9_2_mbox_conf_hand_detection_flat" 1498 | type: "Flatten" 1499 | bottom: "conv9_2_mbox_conf_hand_detection_perm" 1500 | top: "conv9_2_mbox_conf_hand_detection_flat" 1501 | flatten_param { 1502 | axis: 1 1503 | } 1504 | } 1505 | layer { 1506 | name: "conv9_2_mbox_priorbox" 1507 | type: "PriorBox" 1508 | bottom: "conv9_2" 1509 | bottom: "data" 1510 | top: "conv9_2_mbox_priorbox" 1511 | prior_box_param { 1512 | min_size: 264.0 1513 | max_size: 315.0 1514 | aspect_ratio: 2.0 1515 | flip: true 1516 | clip: false 1517 | variance: 0.10000000149 1518 | variance: 0.10000000149 1519 | variance: 0.20000000298 1520 | variance: 0.20000000298 1521 | step: 300.0 1522 | offset: 0.5 1523 | } 1524 | } 1525 | layer { 1526 | name: "mbox_loc" 1527 | type: "Concat" 1528 | bottom: "conv4_3_norm_mbox_loc_flat" 1529 | bottom: "fc7_mbox_loc_flat" 1530 | bottom: "conv6_2_mbox_loc_flat" 1531 | bottom: "conv7_2_mbox_loc_flat" 1532 | bottom: "conv8_2_mbox_loc_flat" 1533 | bottom: "conv9_2_mbox_loc_flat" 1534 | top: "mbox_loc" 1535 | concat_param { 1536 | axis: 1 1537 | } 1538 | } 1539 | layer { 1540 | name: "mbox_conf" 1541 | type: "Concat" 1542 | bottom: "conv4_3_norm_mbox_conf_hand_detection_flat" 1543 | bottom: "fc7_mbox_conf_hand_detection_flat" 1544 | bottom: "conv6_2_mbox_conf_hand_detection_flat" 1545 | bottom: "conv7_2_mbox_conf_hand_detection_flat" 1546 | bottom: 
"conv8_2_mbox_conf_hand_detection_flat" 1547 | bottom: "conv9_2_mbox_conf_hand_detection_flat" 1548 | top: "mbox_conf" 1549 | concat_param { 1550 | axis: 1 1551 | } 1552 | } 1553 | layer { 1554 | name: "mbox_priorbox" 1555 | type: "Concat" 1556 | bottom: "conv4_3_norm_mbox_priorbox" 1557 | bottom: "fc7_mbox_priorbox" 1558 | bottom: "conv6_2_mbox_priorbox" 1559 | bottom: "conv7_2_mbox_priorbox" 1560 | bottom: "conv8_2_mbox_priorbox" 1561 | bottom: "conv9_2_mbox_priorbox" 1562 | top: "mbox_priorbox" 1563 | concat_param { 1564 | axis: 2 1565 | } 1566 | } 1567 | layer { 1568 | name: "mbox_conf_reshape" 1569 | type: "Reshape" 1570 | bottom: "mbox_conf" 1571 | top: "mbox_conf_reshape" 1572 | reshape_param { 1573 | shape { 1574 | dim: 0 1575 | dim: -1 1576 | dim: 2 1577 | } 1578 | } 1579 | } 1580 | layer { 1581 | name: "mbox_conf_softmax" 1582 | type: "Softmax" 1583 | bottom: "mbox_conf_reshape" 1584 | top: "mbox_conf_softmax" 1585 | softmax_param { 1586 | axis: 2 1587 | } 1588 | } 1589 | layer { 1590 | name: "mbox_conf_flatten" 1591 | type: "Flatten" 1592 | bottom: "mbox_conf_softmax" 1593 | top: "mbox_conf_flatten" 1594 | flatten_param { 1595 | axis: 1 1596 | } 1597 | } 1598 | layer { 1599 | name: "detection_out" 1600 | type: "DetectionOutput" 1601 | bottom: "mbox_loc" 1602 | bottom: "mbox_conf_flatten" 1603 | bottom: "mbox_priorbox" 1604 | top: "detection_out" 1605 | include { 1606 | phase: TEST 1607 | } 1608 | detection_output_param { 1609 | num_classes: 2 1610 | share_location: true 1611 | background_label_id: 0 1612 | nms_param { 1613 | nms_threshold: 0.449999988079 1614 | top_k: 400 1615 | } 1616 | code_type: CENTER_SIZE 1617 | keep_top_k: 200 1618 | confidence_threshold: 0.00999999977648 1619 | } 1620 | } 1621 | 1622 | -------------------------------------------------------------------------------- /other/Hand_Detection/model/generate_model.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import sys 3 | sys.path.insert(0, '../caffe/python') 4 | import caffe 5 | from caffe.model_libs import * 6 | from google.protobuf import text_format 7 | 8 | import math 9 | import os 10 | import shutil 11 | import stat 12 | import subprocess 13 | 14 | # Add extra layers on top of a "base" network (e.g. VGGNet or Inception). 15 | def AddExtraLayers(net, use_batchnorm=True, lr_mult=1): 16 | use_relu = True 17 | 18 | # Add additional convolutional layers. 19 | # 19 x 19 20 | from_layer = net.keys()[-1] 21 | 22 | # TODO(weiliu89): Construct the name using the last layer to avoid duplication. 
23 | # 10 x 10 24 | out_layer = "conv6_1" 25 | ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 256, 1, 0, 1, 26 | lr_mult=lr_mult) 27 | 28 | from_layer = out_layer 29 | out_layer = "conv6_2" 30 | ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 512, 3, 1, 2, 31 | lr_mult=lr_mult) 32 | 33 | # 5 x 5 34 | from_layer = out_layer 35 | out_layer = "conv7_1" 36 | ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 128, 1, 0, 1, 37 | lr_mult=lr_mult) 38 | 39 | from_layer = out_layer 40 | out_layer = "conv7_2" 41 | ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 256, 3, 1, 2, 42 | lr_mult=lr_mult) 43 | 44 | # 3 x 3 45 | from_layer = out_layer 46 | out_layer = "conv8_1" 47 | ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 128, 1, 0, 1, 48 | lr_mult=lr_mult) 49 | 50 | from_layer = out_layer 51 | out_layer = "conv8_2" 52 | ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 256, 3, 0, 1, 53 | lr_mult=lr_mult) 54 | 55 | # 1 x 1 56 | from_layer = out_layer 57 | out_layer = "conv9_1" 58 | ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 128, 1, 0, 1, 59 | lr_mult=lr_mult) 60 | 61 | from_layer = out_layer 62 | out_layer = "conv9_2" 63 | ConvBNLayer(net, from_layer, out_layer, use_batchnorm, use_relu, 256, 3, 0, 1, 64 | lr_mult=lr_mult) 65 | 66 | return net 67 | 68 | 69 | 70 | 71 | 72 | 73 | # The database file for training data. Created by data/VOC0712/create_data.sh 74 | train_data = "../data/lmdb/trainval_lmdb" 75 | model_name = "VGG_HAND_SSD_300x300" 76 | # The database file for testing data. Created by data/VOC0712/create_data.sh 77 | # Specify the batch sampler. 78 | resize_width = 300 79 | resize_height = 300 80 | resize = "{}x{}".format(resize_width, resize_height) 81 | batch_sampler = [ 82 | { 83 | 'sampler': { 84 | }, 85 | 'max_trials': 1, 86 | 'max_sample': 1, 87 | }, 88 | { 89 | 'sampler': { 90 | 'min_scale': 0.3, 91 | 'max_scale': 1.0, 92 | 'min_aspect_ratio': 0.5, 93 | 'max_aspect_ratio': 2.0, 94 | }, 95 | 'sample_constraint': { 96 | 'min_jaccard_overlap': 0.1, 97 | }, 98 | 'max_trials': 50, 99 | 'max_sample': 1, 100 | }, 101 | { 102 | 'sampler': { 103 | 'min_scale': 0.3, 104 | 'max_scale': 1.0, 105 | 'min_aspect_ratio': 0.5, 106 | 'max_aspect_ratio': 2.0, 107 | }, 108 | 'sample_constraint': { 109 | 'min_jaccard_overlap': 0.3, 110 | }, 111 | 'max_trials': 50, 112 | 'max_sample': 1, 113 | }, 114 | { 115 | 'sampler': { 116 | 'min_scale': 0.3, 117 | 'max_scale': 1.0, 118 | 'min_aspect_ratio': 0.5, 119 | 'max_aspect_ratio': 2.0, 120 | }, 121 | 'sample_constraint': { 122 | 'min_jaccard_overlap': 0.5, 123 | }, 124 | 'max_trials': 50, 125 | 'max_sample': 1, 126 | }, 127 | { 128 | 'sampler': { 129 | 'min_scale': 0.3, 130 | 'max_scale': 1.0, 131 | 'min_aspect_ratio': 0.5, 132 | 'max_aspect_ratio': 2.0, 133 | }, 134 | 'sample_constraint': { 135 | 'min_jaccard_overlap': 0.7, 136 | }, 137 | 'max_trials': 50, 138 | 'max_sample': 1, 139 | }, 140 | { 141 | 'sampler': { 142 | 'min_scale': 0.3, 143 | 'max_scale': 1.0, 144 | 'min_aspect_ratio': 0.5, 145 | 'max_aspect_ratio': 2.0, 146 | }, 147 | 'sample_constraint': { 148 | 'min_jaccard_overlap': 0.9, 149 | }, 150 | 'max_trials': 50, 151 | 'max_sample': 1, 152 | }, 153 | { 154 | 'sampler': { 155 | 'min_scale': 0.3, 156 | 'max_scale': 1.0, 157 | 'min_aspect_ratio': 0.5, 158 | 'max_aspect_ratio': 2.0, 159 | }, 160 | 'sample_constraint': { 161 | 'max_jaccard_overlap': 1.0, 162 | }, 163 | 'max_trials': 50, 164 | 'max_sample': 1, 165 | }, 166 | ] 167 | 
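# Data augmentation: the samplers above either keep the full image or draw a
# random crop (scale 0.3-1.0, aspect ratio 0.5-2.0) under Jaccard-overlap
# constraints with the ground-truth boxes (minimum overlaps 0.1-0.9, plus one
# sampler capped at 1.0). train_transform_param below adds mirroring,
# photometric distortion and up-to-4x canvas expansion before warping every
# sample to 300x300.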
train_transform_param = { 168 | 'mirror': True, 169 | 'mean_value': [127.0, 127.0, 127.0], 170 | 'resize_param': { 171 | 'prob': 1, 172 | 'resize_mode': P.Resize.WARP, 173 | 'height': resize_height, 174 | 'width': resize_width, 175 | 'interp_mode': [ 176 | P.Resize.LINEAR, 177 | P.Resize.AREA, 178 | P.Resize.NEAREST, 179 | P.Resize.CUBIC, 180 | P.Resize.LANCZOS4, 181 | ], 182 | }, 183 | 'distort_param': { 184 | 'brightness_prob': 0.5, 185 | 'brightness_delta': 32, 186 | 'contrast_prob': 0.5, 187 | 'contrast_lower': 0.5, 188 | 'contrast_upper': 1.5, 189 | 'hue_prob': 0.5, 190 | 'hue_delta': 18, 191 | 'saturation_prob': 0.5, 192 | 'saturation_lower': 0.5, 193 | 'saturation_upper': 1.5, 194 | 'random_order_prob': 0.0, 195 | }, 196 | 'expand_param': { 197 | 'prob': 0.5, 198 | 'max_expand_ratio': 4.0, 199 | }, 200 | 'emit_constraint': { 201 | 'emit_type': caffe_pb2.EmitConstraint.CENTER, 202 | } 203 | } 204 | 205 | # If true, use batch norm for all newly added layers. 206 | # Currently only the non batch norm version has been tested. 207 | use_batchnorm = False 208 | lr_mult = 1 209 | 210 | # model definition files. 211 | train_net_file = "train.prototxt" 212 | deploy_net_file = "deploy.prototxt" 213 | solver_file = "solver.prototxt" 214 | # snapshot prefix. 215 | snapshot_prefix = "snapshot/VGG_HAND_SSD_300x300_iter_" 216 | # Stores LabelMapItem. 217 | label_map_file = "../data/labelmap_voc.prototxt" 218 | 219 | # MultiBoxLoss parameters. 220 | num_classes = 2 221 | share_location = True 222 | background_label_id=0 223 | train_on_diff_gt = True 224 | normalization_mode = P.Loss.VALID 225 | code_type = P.PriorBox.CENTER_SIZE 226 | ignore_cross_boundary_bbox = False 227 | mining_type = P.MultiBoxLoss.MAX_NEGATIVE 228 | neg_pos_ratio = 3. 229 | loc_weight = (neg_pos_ratio + 1.) / 4. 230 | multibox_loss_param = { 231 | 'loc_loss_type': P.MultiBoxLoss.SMOOTH_L1, 232 | 'conf_loss_type': P.MultiBoxLoss.SOFTMAX, 233 | 'loc_weight': loc_weight, 234 | 'num_classes': num_classes, 235 | 'share_location': share_location, 236 | 'match_type': P.MultiBoxLoss.PER_PREDICTION, 237 | 'overlap_threshold': 0.5, 238 | 'use_prior_for_matching': True, 239 | 'background_label_id': background_label_id, 240 | 'use_difficult_gt': train_on_diff_gt, 241 | 'mining_type': mining_type, 242 | 'neg_pos_ratio': neg_pos_ratio, 243 | 'neg_overlap': 0.5, 244 | 'code_type': code_type, 245 | 'ignore_cross_boundary_bbox': ignore_cross_boundary_bbox, 246 | } 247 | loss_param = { 248 | 'normalization': normalization_mode, 249 | } 250 | 251 | # parameters for generating priors. 252 | # minimum dimension of input image 253 | min_dim = 300 254 | # conv4_3 ==> 38 x 38 255 | # fc7 ==> 19 x 19 256 | # conv6_2 ==> 10 x 10 257 | # conv7_2 ==> 5 x 5 258 | # conv8_2 ==> 3 x 3 259 | # conv9_2 ==> 1 x 1 260 | mbox_source_layers = ['conv4_3', 'fc7', 'conv6_2', 'conv7_2', 'conv8_2', 'conv9_2'] 261 | # in percent % 262 | min_ratio = 20 263 | max_ratio = 90 264 | step = int(math.floor((max_ratio - min_ratio) / (len(mbox_source_layers) - 2))) 265 | min_sizes = [] 266 | max_sizes = [] 267 | for ratio in xrange(min_ratio, max_ratio + 1, step): 268 | min_sizes.append(min_dim * ratio / 100.) 269 | max_sizes.append(min_dim * (ratio + step) / 100.) 270 | min_sizes = [min_dim * 10 / 100.] + min_sizes 271 | max_sizes = [min_dim * 20 / 100.] + max_sizes 272 | steps = [8, 16, 32, 64, 100, 300] 273 | aspect_ratios = [[2], [2, 3], [2, 3], [2, 3], [2], [2]] 274 | # L2 normalize conv4_3. 
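# With min_dim=300, min_ratio=20, max_ratio=90 and step=17, the loop above gives
# min_sizes [30, 60, 111, 162, 213, 264] and max_sizes [60, 111, 162, 213, 264, 315]
# pixels, matching the values hard-coded in deploy.prototxt. In the list below,
# 20 is the initial Normalize scale applied to conv4_3 and -1 disables
# normalization for the other source layers.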
275 | normalizations = [20, -1, -1, -1, -1, -1] 276 | # variance used to encode/decode prior bboxes. 277 | if code_type == P.PriorBox.CENTER_SIZE: 278 | prior_variance = [0.1, 0.1, 0.2, 0.2] 279 | else: 280 | prior_variance = [0.1] 281 | flip = True 282 | clip = False 283 | 284 | 285 | # Divide the mini-batch to different GPUs. 286 | batch_size = 2 287 | num_gpus = 1 288 | iter_size = 200000 289 | device_id = 0 290 | batch_size_per_device = batch_size 291 | 292 | batch_size_per_device = int(math.ceil(float(batch_size) / num_gpus)) 293 | solver_mode = P.Solver.GPU 294 | 295 | base_lr = 0.0001 296 | 297 | solver_param = { 298 | # Train parameters 299 | 'base_lr': base_lr, 300 | 'weight_decay': 0.0005, 301 | 'lr_policy': "multistep", 302 | 'stepvalue': [80000, 100000, 120000], 303 | 'gamma': 0.1, 304 | 'momentum': 0.9, 305 | 'iter_size': iter_size, 306 | 'max_iter': 120000, 307 | 'snapshot': 80000, 308 | 'display': 10, 309 | 'average_loss': 10, 310 | 'type': "SGD", 311 | 'solver_mode': solver_mode, 312 | 'device_id': device_id, 313 | 'debug_info': False, 314 | 'snapshot_after_train': True 315 | } 316 | 317 | # Create train net. 318 | net = caffe.NetSpec() 319 | net.data, net.label = CreateAnnotatedDataLayer(train_data, batch_size=batch_size_per_device, 320 | train=True, output_label=True, label_map_file=label_map_file, 321 | transform_param=train_transform_param, batch_sampler=batch_sampler) 322 | 323 | VGGNetBody(net, from_layer='data', fully_conv=True, reduced=True, dilated=True, 324 | dropout=False) 325 | 326 | AddExtraLayers(net, use_batchnorm, lr_mult=lr_mult) 327 | 328 | # New 329 | mbox_layers = CreateMultiBoxHead(net, data_layer='data', from_layers=mbox_source_layers, 330 | use_batchnorm=use_batchnorm, min_sizes=min_sizes, max_sizes=max_sizes, 331 | aspect_ratios=aspect_ratios, steps=steps, normalizations=normalizations, 332 | num_classes=num_classes, share_location=share_location, flip=flip, clip=clip, 333 | prior_variance=prior_variance, kernel_size=3, pad=1, lr_mult=lr_mult, conf_postfix='_hand_detection') 334 | 335 | 336 | # ### initial 337 | # mbox_layers = CreateMultiBoxHead(net, data_layer='data', from_layers=mbox_source_layers, 338 | # use_batchnorm=use_batchnorm, min_sizes=min_sizes, max_sizes=max_sizes, 339 | # aspect_ratios=aspect_ratios, steps=steps, normalizations=normalizations, 340 | # num_classes=num_classes, share_location=share_location, flip=flip, clip=clip, 341 | # prior_variance=prior_variance, kernel_size=3, pad=1, lr_mult=lr_mult) 342 | 343 | # Create the MultiBoxLossLayer. 344 | name = "mbox_loss" 345 | mbox_layers.append(net.label) 346 | net[name] = L.MultiBoxLoss(*mbox_layers, multibox_loss_param=multibox_loss_param, 347 | loss_param=loss_param, include=dict(phase=caffe_pb2.Phase.Value('TRAIN')), 348 | propagate_down=[True, True, False, False]) 349 | 350 | with open(train_net_file, 'w') as f: 351 | print('name: "{}_train"'.format(model_name), file=f) 352 | print(net.to_proto(), file=f) 353 | 354 | # Create deploy net. 355 | # Remove the first and last layer from test net. 356 | ######### 357 | 358 | 359 | # parameters for generating detection output. 
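# These settings build the DetectionOutput layer of the deploy net: NMS at
# IoU 0.45 keeping the top 400 candidates, at most 200 detections per image
# after merging, and a 0.01 confidence threshold, matching the detection_out
# layer in deploy.prototxt.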
360 | det_out_param = { 361 | 'num_classes': num_classes, 362 | 'share_location': share_location, 363 | 'background_label_id': background_label_id, 364 | 'nms_param': {'nms_threshold': 0.45, 'top_k': 400}, 365 | 'keep_top_k': 200, 366 | 'confidence_threshold': 0.01, 367 | 'code_type': code_type, 368 | } 369 | 370 | 371 | conf_name = "mbox_conf" 372 | if multibox_loss_param["conf_loss_type"] == P.MultiBoxLoss.SOFTMAX: 373 | reshape_name = "{}_reshape".format(conf_name) 374 | net[reshape_name] = L.Reshape(net[conf_name], shape=dict(dim=[0, -1, num_classes])) 375 | softmax_name = "{}_softmax".format(conf_name) 376 | net[softmax_name] = L.Softmax(net[reshape_name], axis=2) 377 | flatten_name = "{}_flatten".format(conf_name) 378 | net[flatten_name] = L.Flatten(net[softmax_name], axis=1) 379 | mbox_layers[1] = net[flatten_name] 380 | elif multibox_loss_param["conf_loss_type"] == P.MultiBoxLoss.LOGISTIC: 381 | sigmoid_name = "{}_sigmoid".format(conf_name) 382 | net[sigmoid_name] = L.Sigmoid(net[conf_name]) 383 | mbox_layers[1] = net[sigmoid_name] 384 | 385 | net.detection_out = L.DetectionOutput(*mbox_layers, 386 | detection_output_param=det_out_param, 387 | include=dict(phase=caffe_pb2.Phase.Value('TEST'))) 388 | 389 | 390 | 391 | deploy_net = net 392 | with open(deploy_net_file, 'w') as f: 393 | net_param = deploy_net.to_proto() 394 | # Remove the first (AnnotatedData) and last (DetectionEvaluate) layer from test net. 395 | del net_param.layer[0] 396 | del net_param.layer[-5] 397 | del net_param.layer[-1].bottom[-1] 398 | net_param.name = '{}_deploy'.format(model_name) 399 | net_param.input.extend(['data']) 400 | net_param.input_shape.extend([ 401 | caffe_pb2.BlobShape(dim=[1, 3, resize_height, resize_width])]) 402 | print(net_param, file=f) 403 | 404 | # Create solver. 
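# Serialize the solver: SGD with base_lr 1e-4, momentum 0.9, weight decay 5e-4,
# multistep decay (gamma 0.1) at iterations 80k/100k/120k, max_iter 120k and a
# snapshot every 80k iterations, bundled with train_net="train.prototxt" and the
# snapshot prefix above, then written to solver.prototxt.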
405 | solver = caffe_pb2.SolverParameter( 406 | train_net=train_net_file, 407 | snapshot_prefix=snapshot_prefix, 408 | **solver_param) 409 | 410 | with open(solver_file, 'w') as f: 411 | print(solver, file=f) 412 | -------------------------------------------------------------------------------- /other/Hand_Detection/model/snapshot/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/other/Hand_Detection/model/snapshot/.gitkeep -------------------------------------------------------------------------------- /other/Hand_Detection/model/solver.prototxt: -------------------------------------------------------------------------------- 1 | train_net: "train.prototxt" 2 | base_lr: 0.0001 3 | display: 20 4 | max_iter: 200000 5 | lr_policy: "multistep" 6 | gamma: 0.10000000149 7 | momentum: 0.899999976158 8 | weight_decay: 0.000500000023749 9 | snapshot: 80000 10 | snapshot_prefix: "snapshot/VGG_HAND_SSD_300x300_iter_" 11 | solver_mode: GPU 12 | device_id: 0 13 | debug_info: false 14 | snapshot_after_train: true 15 | average_loss: 10 16 | stepvalue: 60000 17 | stepvalue: 100000 18 | stepvalue: 140000 19 | type: "SGD" 20 | 21 | -------------------------------------------------------------------------------- /other/Hand_Detection/model/train.prototxt: -------------------------------------------------------------------------------- 1 | name: "VGG_HAND_SSD_300x300_train" 2 | layer { 3 | name: "data" 4 | type: "AnnotatedData" 5 | top: "data" 6 | top: "label" 7 | include { 8 | phase: TRAIN 9 | } 10 | transform_param { 11 | mirror: true 12 | mean_value: 127.0 13 | mean_value: 127.0 14 | mean_value: 127.0 15 | resize_param { 16 | prob: 1.0 17 | resize_mode: WARP 18 | height: 300 19 | width: 300 20 | interp_mode: LINEAR 21 | interp_mode: AREA 22 | interp_mode: NEAREST 23 | interp_mode: CUBIC 24 | interp_mode: LANCZOS4 25 | } 26 | emit_constraint { 27 | emit_type: CENTER 28 | } 29 | distort_param { 30 | brightness_prob: 0.5 31 | brightness_delta: 32.0 32 | contrast_prob: 0.5 33 | contrast_lower: 0.5 34 | contrast_upper: 1.5 35 | hue_prob: 0.5 36 | hue_delta: 18.0 37 | saturation_prob: 0.5 38 | saturation_lower: 0.5 39 | saturation_upper: 1.5 40 | random_order_prob: 0.0 41 | } 42 | expand_param { 43 | prob: 0.5 44 | max_expand_ratio: 4.0 45 | } 46 | } 47 | data_param { 48 | source: "../data/lmdb/trainval_lmdb" 49 | batch_size: 2 50 | backend: LMDB 51 | } 52 | annotated_data_param { 53 | batch_sampler { 54 | max_sample: 1 55 | max_trials: 1 56 | } 57 | batch_sampler { 58 | sampler { 59 | min_scale: 0.300000011921 60 | max_scale: 1.0 61 | min_aspect_ratio: 0.5 62 | max_aspect_ratio: 2.0 63 | } 64 | sample_constraint { 65 | min_jaccard_overlap: 0.10000000149 66 | } 67 | max_sample: 1 68 | max_trials: 50 69 | } 70 | batch_sampler { 71 | sampler { 72 | min_scale: 0.300000011921 73 | max_scale: 1.0 74 | min_aspect_ratio: 0.5 75 | max_aspect_ratio: 2.0 76 | } 77 | sample_constraint { 78 | min_jaccard_overlap: 0.300000011921 79 | } 80 | max_sample: 1 81 | max_trials: 50 82 | } 83 | batch_sampler { 84 | sampler { 85 | min_scale: 0.300000011921 86 | max_scale: 1.0 87 | min_aspect_ratio: 0.5 88 | max_aspect_ratio: 2.0 89 | } 90 | sample_constraint { 91 | min_jaccard_overlap: 0.5 92 | } 93 | max_sample: 1 94 | max_trials: 50 95 | } 96 | batch_sampler { 97 | sampler { 98 | min_scale: 0.300000011921 99 | max_scale: 1.0 100 | min_aspect_ratio: 0.5 101 | max_aspect_ratio: 2.0 102 
| } 103 | sample_constraint { 104 | min_jaccard_overlap: 0.699999988079 105 | } 106 | max_sample: 1 107 | max_trials: 50 108 | } 109 | batch_sampler { 110 | sampler { 111 | min_scale: 0.300000011921 112 | max_scale: 1.0 113 | min_aspect_ratio: 0.5 114 | max_aspect_ratio: 2.0 115 | } 116 | sample_constraint { 117 | min_jaccard_overlap: 0.899999976158 118 | } 119 | max_sample: 1 120 | max_trials: 50 121 | } 122 | batch_sampler { 123 | sampler { 124 | min_scale: 0.300000011921 125 | max_scale: 1.0 126 | min_aspect_ratio: 0.5 127 | max_aspect_ratio: 2.0 128 | } 129 | sample_constraint { 130 | max_jaccard_overlap: 1.0 131 | } 132 | max_sample: 1 133 | max_trials: 50 134 | } 135 | label_map_file: "../data/labelmap_voc.prototxt" 136 | } 137 | } 138 | layer { 139 | name: "conv1_1" 140 | type: "Convolution" 141 | bottom: "data" 142 | top: "conv1_1" 143 | param { 144 | lr_mult: 1.0 145 | decay_mult: 1.0 146 | } 147 | param { 148 | lr_mult: 2.0 149 | decay_mult: 0.0 150 | } 151 | convolution_param { 152 | num_output: 64 153 | pad: 1 154 | kernel_size: 3 155 | weight_filler { 156 | type: "xavier" 157 | } 158 | bias_filler { 159 | type: "constant" 160 | value: 0.0 161 | } 162 | } 163 | } 164 | layer { 165 | name: "relu1_1" 166 | type: "ReLU" 167 | bottom: "conv1_1" 168 | top: "conv1_1" 169 | } 170 | layer { 171 | name: "conv1_2" 172 | type: "Convolution" 173 | bottom: "conv1_1" 174 | top: "conv1_2" 175 | param { 176 | lr_mult: 1.0 177 | decay_mult: 1.0 178 | } 179 | param { 180 | lr_mult: 2.0 181 | decay_mult: 0.0 182 | } 183 | convolution_param { 184 | num_output: 64 185 | pad: 1 186 | kernel_size: 3 187 | weight_filler { 188 | type: "xavier" 189 | } 190 | bias_filler { 191 | type: "constant" 192 | value: 0.0 193 | } 194 | } 195 | } 196 | layer { 197 | name: "relu1_2" 198 | type: "ReLU" 199 | bottom: "conv1_2" 200 | top: "conv1_2" 201 | } 202 | layer { 203 | name: "pool1" 204 | type: "Pooling" 205 | bottom: "conv1_2" 206 | top: "pool1" 207 | pooling_param { 208 | pool: MAX 209 | kernel_size: 2 210 | stride: 2 211 | } 212 | } 213 | layer { 214 | name: "conv2_1" 215 | type: "Convolution" 216 | bottom: "pool1" 217 | top: "conv2_1" 218 | param { 219 | lr_mult: 1.0 220 | decay_mult: 1.0 221 | } 222 | param { 223 | lr_mult: 2.0 224 | decay_mult: 0.0 225 | } 226 | convolution_param { 227 | num_output: 128 228 | pad: 1 229 | kernel_size: 3 230 | weight_filler { 231 | type: "xavier" 232 | } 233 | bias_filler { 234 | type: "constant" 235 | value: 0.0 236 | } 237 | } 238 | } 239 | layer { 240 | name: "relu2_1" 241 | type: "ReLU" 242 | bottom: "conv2_1" 243 | top: "conv2_1" 244 | } 245 | layer { 246 | name: "conv2_2" 247 | type: "Convolution" 248 | bottom: "conv2_1" 249 | top: "conv2_2" 250 | param { 251 | lr_mult: 1.0 252 | decay_mult: 1.0 253 | } 254 | param { 255 | lr_mult: 2.0 256 | decay_mult: 0.0 257 | } 258 | convolution_param { 259 | num_output: 128 260 | pad: 1 261 | kernel_size: 3 262 | weight_filler { 263 | type: "xavier" 264 | } 265 | bias_filler { 266 | type: "constant" 267 | value: 0.0 268 | } 269 | } 270 | } 271 | layer { 272 | name: "relu2_2" 273 | type: "ReLU" 274 | bottom: "conv2_2" 275 | top: "conv2_2" 276 | } 277 | layer { 278 | name: "pool2" 279 | type: "Pooling" 280 | bottom: "conv2_2" 281 | top: "pool2" 282 | pooling_param { 283 | pool: MAX 284 | kernel_size: 2 285 | stride: 2 286 | } 287 | } 288 | layer { 289 | name: "conv3_1" 290 | type: "Convolution" 291 | bottom: "pool2" 292 | top: "conv3_1" 293 | param { 294 | lr_mult: 1.0 295 | decay_mult: 1.0 296 | } 297 | param { 298 | lr_mult: 2.0 
299 | decay_mult: 0.0 300 | } 301 | convolution_param { 302 | num_output: 256 303 | pad: 1 304 | kernel_size: 3 305 | weight_filler { 306 | type: "xavier" 307 | } 308 | bias_filler { 309 | type: "constant" 310 | value: 0.0 311 | } 312 | } 313 | } 314 | layer { 315 | name: "relu3_1" 316 | type: "ReLU" 317 | bottom: "conv3_1" 318 | top: "conv3_1" 319 | } 320 | layer { 321 | name: "conv3_2" 322 | type: "Convolution" 323 | bottom: "conv3_1" 324 | top: "conv3_2" 325 | param { 326 | lr_mult: 1.0 327 | decay_mult: 1.0 328 | } 329 | param { 330 | lr_mult: 2.0 331 | decay_mult: 0.0 332 | } 333 | convolution_param { 334 | num_output: 256 335 | pad: 1 336 | kernel_size: 3 337 | weight_filler { 338 | type: "xavier" 339 | } 340 | bias_filler { 341 | type: "constant" 342 | value: 0.0 343 | } 344 | } 345 | } 346 | layer { 347 | name: "relu3_2" 348 | type: "ReLU" 349 | bottom: "conv3_2" 350 | top: "conv3_2" 351 | } 352 | layer { 353 | name: "conv3_3" 354 | type: "Convolution" 355 | bottom: "conv3_2" 356 | top: "conv3_3" 357 | param { 358 | lr_mult: 1.0 359 | decay_mult: 1.0 360 | } 361 | param { 362 | lr_mult: 2.0 363 | decay_mult: 0.0 364 | } 365 | convolution_param { 366 | num_output: 256 367 | pad: 1 368 | kernel_size: 3 369 | weight_filler { 370 | type: "xavier" 371 | } 372 | bias_filler { 373 | type: "constant" 374 | value: 0.0 375 | } 376 | } 377 | } 378 | layer { 379 | name: "relu3_3" 380 | type: "ReLU" 381 | bottom: "conv3_3" 382 | top: "conv3_3" 383 | } 384 | layer { 385 | name: "pool3" 386 | type: "Pooling" 387 | bottom: "conv3_3" 388 | top: "pool3" 389 | pooling_param { 390 | pool: MAX 391 | kernel_size: 2 392 | stride: 2 393 | } 394 | } 395 | layer { 396 | name: "conv4_1" 397 | type: "Convolution" 398 | bottom: "pool3" 399 | top: "conv4_1" 400 | param { 401 | lr_mult: 1.0 402 | decay_mult: 1.0 403 | } 404 | param { 405 | lr_mult: 2.0 406 | decay_mult: 0.0 407 | } 408 | convolution_param { 409 | num_output: 512 410 | pad: 1 411 | kernel_size: 3 412 | weight_filler { 413 | type: "xavier" 414 | } 415 | bias_filler { 416 | type: "constant" 417 | value: 0.0 418 | } 419 | } 420 | } 421 | layer { 422 | name: "relu4_1" 423 | type: "ReLU" 424 | bottom: "conv4_1" 425 | top: "conv4_1" 426 | } 427 | layer { 428 | name: "conv4_2" 429 | type: "Convolution" 430 | bottom: "conv4_1" 431 | top: "conv4_2" 432 | param { 433 | lr_mult: 1.0 434 | decay_mult: 1.0 435 | } 436 | param { 437 | lr_mult: 2.0 438 | decay_mult: 0.0 439 | } 440 | convolution_param { 441 | num_output: 512 442 | pad: 1 443 | kernel_size: 3 444 | weight_filler { 445 | type: "xavier" 446 | } 447 | bias_filler { 448 | type: "constant" 449 | value: 0.0 450 | } 451 | } 452 | } 453 | layer { 454 | name: "relu4_2" 455 | type: "ReLU" 456 | bottom: "conv4_2" 457 | top: "conv4_2" 458 | } 459 | layer { 460 | name: "conv4_3" 461 | type: "Convolution" 462 | bottom: "conv4_2" 463 | top: "conv4_3" 464 | param { 465 | lr_mult: 1.0 466 | decay_mult: 1.0 467 | } 468 | param { 469 | lr_mult: 2.0 470 | decay_mult: 0.0 471 | } 472 | convolution_param { 473 | num_output: 512 474 | pad: 1 475 | kernel_size: 3 476 | weight_filler { 477 | type: "xavier" 478 | } 479 | bias_filler { 480 | type: "constant" 481 | value: 0.0 482 | } 483 | } 484 | } 485 | layer { 486 | name: "relu4_3" 487 | type: "ReLU" 488 | bottom: "conv4_3" 489 | top: "conv4_3" 490 | } 491 | layer { 492 | name: "pool4" 493 | type: "Pooling" 494 | bottom: "conv4_3" 495 | top: "pool4" 496 | pooling_param { 497 | pool: MAX 498 | kernel_size: 2 499 | stride: 2 500 | } 501 | } 502 | layer { 503 | name: 
"conv5_1" 504 | type: "Convolution" 505 | bottom: "pool4" 506 | top: "conv5_1" 507 | param { 508 | lr_mult: 1.0 509 | decay_mult: 1.0 510 | } 511 | param { 512 | lr_mult: 2.0 513 | decay_mult: 0.0 514 | } 515 | convolution_param { 516 | num_output: 512 517 | pad: 1 518 | kernel_size: 3 519 | weight_filler { 520 | type: "xavier" 521 | } 522 | bias_filler { 523 | type: "constant" 524 | value: 0.0 525 | } 526 | dilation: 1 527 | } 528 | } 529 | layer { 530 | name: "relu5_1" 531 | type: "ReLU" 532 | bottom: "conv5_1" 533 | top: "conv5_1" 534 | } 535 | layer { 536 | name: "conv5_2" 537 | type: "Convolution" 538 | bottom: "conv5_1" 539 | top: "conv5_2" 540 | param { 541 | lr_mult: 1.0 542 | decay_mult: 1.0 543 | } 544 | param { 545 | lr_mult: 2.0 546 | decay_mult: 0.0 547 | } 548 | convolution_param { 549 | num_output: 512 550 | pad: 1 551 | kernel_size: 3 552 | weight_filler { 553 | type: "xavier" 554 | } 555 | bias_filler { 556 | type: "constant" 557 | value: 0.0 558 | } 559 | dilation: 1 560 | } 561 | } 562 | layer { 563 | name: "relu5_2" 564 | type: "ReLU" 565 | bottom: "conv5_2" 566 | top: "conv5_2" 567 | } 568 | layer { 569 | name: "conv5_3" 570 | type: "Convolution" 571 | bottom: "conv5_2" 572 | top: "conv5_3" 573 | param { 574 | lr_mult: 1.0 575 | decay_mult: 1.0 576 | } 577 | param { 578 | lr_mult: 2.0 579 | decay_mult: 0.0 580 | } 581 | convolution_param { 582 | num_output: 512 583 | pad: 1 584 | kernel_size: 3 585 | weight_filler { 586 | type: "xavier" 587 | } 588 | bias_filler { 589 | type: "constant" 590 | value: 0.0 591 | } 592 | dilation: 1 593 | } 594 | } 595 | layer { 596 | name: "relu5_3" 597 | type: "ReLU" 598 | bottom: "conv5_3" 599 | top: "conv5_3" 600 | } 601 | layer { 602 | name: "pool5" 603 | type: "Pooling" 604 | bottom: "conv5_3" 605 | top: "pool5" 606 | pooling_param { 607 | pool: MAX 608 | kernel_size: 3 609 | stride: 1 610 | pad: 1 611 | } 612 | } 613 | layer { 614 | name: "fc6" 615 | type: "Convolution" 616 | bottom: "pool5" 617 | top: "fc6" 618 | param { 619 | lr_mult: 1.0 620 | decay_mult: 1.0 621 | } 622 | param { 623 | lr_mult: 2.0 624 | decay_mult: 0.0 625 | } 626 | convolution_param { 627 | num_output: 1024 628 | pad: 6 629 | kernel_size: 3 630 | weight_filler { 631 | type: "xavier" 632 | } 633 | bias_filler { 634 | type: "constant" 635 | value: 0.0 636 | } 637 | dilation: 6 638 | } 639 | } 640 | layer { 641 | name: "relu6" 642 | type: "ReLU" 643 | bottom: "fc6" 644 | top: "fc6" 645 | } 646 | layer { 647 | name: "fc7" 648 | type: "Convolution" 649 | bottom: "fc6" 650 | top: "fc7" 651 | param { 652 | lr_mult: 1.0 653 | decay_mult: 1.0 654 | } 655 | param { 656 | lr_mult: 2.0 657 | decay_mult: 0.0 658 | } 659 | convolution_param { 660 | num_output: 1024 661 | kernel_size: 1 662 | weight_filler { 663 | type: "xavier" 664 | } 665 | bias_filler { 666 | type: "constant" 667 | value: 0.0 668 | } 669 | } 670 | } 671 | layer { 672 | name: "relu7" 673 | type: "ReLU" 674 | bottom: "fc7" 675 | top: "fc7" 676 | } 677 | layer { 678 | name: "conv6_1" 679 | type: "Convolution" 680 | bottom: "fc7" 681 | top: "conv6_1" 682 | param { 683 | lr_mult: 1.0 684 | decay_mult: 1.0 685 | } 686 | param { 687 | lr_mult: 2.0 688 | decay_mult: 0.0 689 | } 690 | convolution_param { 691 | num_output: 256 692 | pad: 0 693 | kernel_size: 1 694 | stride: 1 695 | weight_filler { 696 | type: "xavier" 697 | } 698 | bias_filler { 699 | type: "constant" 700 | value: 0.0 701 | } 702 | } 703 | } 704 | layer { 705 | name: "conv6_1_relu" 706 | type: "ReLU" 707 | bottom: "conv6_1" 708 | top: "conv6_1" 709 
| } 710 | layer { 711 | name: "conv6_2" 712 | type: "Convolution" 713 | bottom: "conv6_1" 714 | top: "conv6_2" 715 | param { 716 | lr_mult: 1.0 717 | decay_mult: 1.0 718 | } 719 | param { 720 | lr_mult: 2.0 721 | decay_mult: 0.0 722 | } 723 | convolution_param { 724 | num_output: 512 725 | pad: 1 726 | kernel_size: 3 727 | stride: 2 728 | weight_filler { 729 | type: "xavier" 730 | } 731 | bias_filler { 732 | type: "constant" 733 | value: 0.0 734 | } 735 | } 736 | } 737 | layer { 738 | name: "conv6_2_relu" 739 | type: "ReLU" 740 | bottom: "conv6_2" 741 | top: "conv6_2" 742 | } 743 | layer { 744 | name: "conv7_1" 745 | type: "Convolution" 746 | bottom: "conv6_2" 747 | top: "conv7_1" 748 | param { 749 | lr_mult: 1.0 750 | decay_mult: 1.0 751 | } 752 | param { 753 | lr_mult: 2.0 754 | decay_mult: 0.0 755 | } 756 | convolution_param { 757 | num_output: 128 758 | pad: 0 759 | kernel_size: 1 760 | stride: 1 761 | weight_filler { 762 | type: "xavier" 763 | } 764 | bias_filler { 765 | type: "constant" 766 | value: 0.0 767 | } 768 | } 769 | } 770 | layer { 771 | name: "conv7_1_relu" 772 | type: "ReLU" 773 | bottom: "conv7_1" 774 | top: "conv7_1" 775 | } 776 | layer { 777 | name: "conv7_2" 778 | type: "Convolution" 779 | bottom: "conv7_1" 780 | top: "conv7_2" 781 | param { 782 | lr_mult: 1.0 783 | decay_mult: 1.0 784 | } 785 | param { 786 | lr_mult: 2.0 787 | decay_mult: 0.0 788 | } 789 | convolution_param { 790 | num_output: 256 791 | pad: 1 792 | kernel_size: 3 793 | stride: 2 794 | weight_filler { 795 | type: "xavier" 796 | } 797 | bias_filler { 798 | type: "constant" 799 | value: 0.0 800 | } 801 | } 802 | } 803 | layer { 804 | name: "conv7_2_relu" 805 | type: "ReLU" 806 | bottom: "conv7_2" 807 | top: "conv7_2" 808 | } 809 | layer { 810 | name: "conv8_1" 811 | type: "Convolution" 812 | bottom: "conv7_2" 813 | top: "conv8_1" 814 | param { 815 | lr_mult: 1.0 816 | decay_mult: 1.0 817 | } 818 | param { 819 | lr_mult: 2.0 820 | decay_mult: 0.0 821 | } 822 | convolution_param { 823 | num_output: 128 824 | pad: 0 825 | kernel_size: 1 826 | stride: 1 827 | weight_filler { 828 | type: "xavier" 829 | } 830 | bias_filler { 831 | type: "constant" 832 | value: 0.0 833 | } 834 | } 835 | } 836 | layer { 837 | name: "conv8_1_relu" 838 | type: "ReLU" 839 | bottom: "conv8_1" 840 | top: "conv8_1" 841 | } 842 | layer { 843 | name: "conv8_2" 844 | type: "Convolution" 845 | bottom: "conv8_1" 846 | top: "conv8_2" 847 | param { 848 | lr_mult: 1.0 849 | decay_mult: 1.0 850 | } 851 | param { 852 | lr_mult: 2.0 853 | decay_mult: 0.0 854 | } 855 | convolution_param { 856 | num_output: 256 857 | pad: 0 858 | kernel_size: 3 859 | stride: 1 860 | weight_filler { 861 | type: "xavier" 862 | } 863 | bias_filler { 864 | type: "constant" 865 | value: 0.0 866 | } 867 | } 868 | } 869 | layer { 870 | name: "conv8_2_relu" 871 | type: "ReLU" 872 | bottom: "conv8_2" 873 | top: "conv8_2" 874 | } 875 | layer { 876 | name: "conv9_1" 877 | type: "Convolution" 878 | bottom: "conv8_2" 879 | top: "conv9_1" 880 | param { 881 | lr_mult: 1.0 882 | decay_mult: 1.0 883 | } 884 | param { 885 | lr_mult: 2.0 886 | decay_mult: 0.0 887 | } 888 | convolution_param { 889 | num_output: 128 890 | pad: 0 891 | kernel_size: 1 892 | stride: 1 893 | weight_filler { 894 | type: "xavier" 895 | } 896 | bias_filler { 897 | type: "constant" 898 | value: 0.0 899 | } 900 | } 901 | } 902 | layer { 903 | name: "conv9_1_relu" 904 | type: "ReLU" 905 | bottom: "conv9_1" 906 | top: "conv9_1" 907 | } 908 | layer { 909 | name: "conv9_2" 910 | type: "Convolution" 911 | bottom: 
"conv9_1" 912 | top: "conv9_2" 913 | param { 914 | lr_mult: 1.0 915 | decay_mult: 1.0 916 | } 917 | param { 918 | lr_mult: 2.0 919 | decay_mult: 0.0 920 | } 921 | convolution_param { 922 | num_output: 256 923 | pad: 0 924 | kernel_size: 3 925 | stride: 1 926 | weight_filler { 927 | type: "xavier" 928 | } 929 | bias_filler { 930 | type: "constant" 931 | value: 0.0 932 | } 933 | } 934 | } 935 | layer { 936 | name: "conv9_2_relu" 937 | type: "ReLU" 938 | bottom: "conv9_2" 939 | top: "conv9_2" 940 | } 941 | layer { 942 | name: "conv4_3_norm" 943 | type: "Normalize" 944 | bottom: "conv4_3" 945 | top: "conv4_3_norm" 946 | norm_param { 947 | across_spatial: false 948 | scale_filler { 949 | type: "constant" 950 | value: 20.0 951 | } 952 | channel_shared: false 953 | } 954 | } 955 | layer { 956 | name: "conv4_3_norm_mbox_loc" 957 | type: "Convolution" 958 | bottom: "conv4_3_norm" 959 | top: "conv4_3_norm_mbox_loc" 960 | param { 961 | lr_mult: 1.0 962 | decay_mult: 1.0 963 | } 964 | param { 965 | lr_mult: 2.0 966 | decay_mult: 0.0 967 | } 968 | convolution_param { 969 | num_output: 16 970 | pad: 1 971 | kernel_size: 3 972 | stride: 1 973 | weight_filler { 974 | type: "xavier" 975 | } 976 | bias_filler { 977 | type: "constant" 978 | value: 0.0 979 | } 980 | } 981 | } 982 | layer { 983 | name: "conv4_3_norm_mbox_loc_perm" 984 | type: "Permute" 985 | bottom: "conv4_3_norm_mbox_loc" 986 | top: "conv4_3_norm_mbox_loc_perm" 987 | permute_param { 988 | order: 0 989 | order: 2 990 | order: 3 991 | order: 1 992 | } 993 | } 994 | layer { 995 | name: "conv4_3_norm_mbox_loc_flat" 996 | type: "Flatten" 997 | bottom: "conv4_3_norm_mbox_loc_perm" 998 | top: "conv4_3_norm_mbox_loc_flat" 999 | flatten_param { 1000 | axis: 1 1001 | } 1002 | } 1003 | layer { 1004 | name: "conv4_3_norm_mbox_conf_hand_detection" 1005 | type: "Convolution" 1006 | bottom: "conv4_3_norm" 1007 | top: "conv4_3_norm_mbox_conf_hand_detection" 1008 | param { 1009 | lr_mult: 1.0 1010 | decay_mult: 1.0 1011 | } 1012 | param { 1013 | lr_mult: 2.0 1014 | decay_mult: 0.0 1015 | } 1016 | convolution_param { 1017 | num_output: 8 1018 | pad: 1 1019 | kernel_size: 3 1020 | stride: 1 1021 | weight_filler { 1022 | type: "xavier" 1023 | } 1024 | bias_filler { 1025 | type: "constant" 1026 | value: 0.0 1027 | } 1028 | } 1029 | } 1030 | layer { 1031 | name: "conv4_3_norm_mbox_conf_hand_detection_perm" 1032 | type: "Permute" 1033 | bottom: "conv4_3_norm_mbox_conf_hand_detection" 1034 | top: "conv4_3_norm_mbox_conf_hand_detection_perm" 1035 | permute_param { 1036 | order: 0 1037 | order: 2 1038 | order: 3 1039 | order: 1 1040 | } 1041 | } 1042 | layer { 1043 | name: "conv4_3_norm_mbox_conf_hand_detection_flat" 1044 | type: "Flatten" 1045 | bottom: "conv4_3_norm_mbox_conf_hand_detection_perm" 1046 | top: "conv4_3_norm_mbox_conf_hand_detection_flat" 1047 | flatten_param { 1048 | axis: 1 1049 | } 1050 | } 1051 | layer { 1052 | name: "conv4_3_norm_mbox_priorbox" 1053 | type: "PriorBox" 1054 | bottom: "conv4_3_norm" 1055 | bottom: "data" 1056 | top: "conv4_3_norm_mbox_priorbox" 1057 | prior_box_param { 1058 | min_size: 30.0 1059 | max_size: 60.0 1060 | aspect_ratio: 2.0 1061 | flip: true 1062 | clip: false 1063 | variance: 0.10000000149 1064 | variance: 0.10000000149 1065 | variance: 0.20000000298 1066 | variance: 0.20000000298 1067 | step: 8.0 1068 | offset: 0.5 1069 | } 1070 | } 1071 | layer { 1072 | name: "fc7_mbox_loc" 1073 | type: "Convolution" 1074 | bottom: "fc7" 1075 | top: "fc7_mbox_loc" 1076 | param { 1077 | lr_mult: 1.0 1078 | decay_mult: 1.0 1079 | } 
1080 | param { 1081 | lr_mult: 2.0 1082 | decay_mult: 0.0 1083 | } 1084 | convolution_param { 1085 | num_output: 24 1086 | pad: 1 1087 | kernel_size: 3 1088 | stride: 1 1089 | weight_filler { 1090 | type: "xavier" 1091 | } 1092 | bias_filler { 1093 | type: "constant" 1094 | value: 0.0 1095 | } 1096 | } 1097 | } 1098 | layer { 1099 | name: "fc7_mbox_loc_perm" 1100 | type: "Permute" 1101 | bottom: "fc7_mbox_loc" 1102 | top: "fc7_mbox_loc_perm" 1103 | permute_param { 1104 | order: 0 1105 | order: 2 1106 | order: 3 1107 | order: 1 1108 | } 1109 | } 1110 | layer { 1111 | name: "fc7_mbox_loc_flat" 1112 | type: "Flatten" 1113 | bottom: "fc7_mbox_loc_perm" 1114 | top: "fc7_mbox_loc_flat" 1115 | flatten_param { 1116 | axis: 1 1117 | } 1118 | } 1119 | layer { 1120 | name: "fc7_mbox_conf_hand_detection" 1121 | type: "Convolution" 1122 | bottom: "fc7" 1123 | top: "fc7_mbox_conf_hand_detection" 1124 | param { 1125 | lr_mult: 1.0 1126 | decay_mult: 1.0 1127 | } 1128 | param { 1129 | lr_mult: 2.0 1130 | decay_mult: 0.0 1131 | } 1132 | convolution_param { 1133 | num_output: 12 1134 | pad: 1 1135 | kernel_size: 3 1136 | stride: 1 1137 | weight_filler { 1138 | type: "xavier" 1139 | } 1140 | bias_filler { 1141 | type: "constant" 1142 | value: 0.0 1143 | } 1144 | } 1145 | } 1146 | layer { 1147 | name: "fc7_mbox_conf_hand_detection_perm" 1148 | type: "Permute" 1149 | bottom: "fc7_mbox_conf_hand_detection" 1150 | top: "fc7_mbox_conf_hand_detection_perm" 1151 | permute_param { 1152 | order: 0 1153 | order: 2 1154 | order: 3 1155 | order: 1 1156 | } 1157 | } 1158 | layer { 1159 | name: "fc7_mbox_conf_hand_detection_flat" 1160 | type: "Flatten" 1161 | bottom: "fc7_mbox_conf_hand_detection_perm" 1162 | top: "fc7_mbox_conf_hand_detection_flat" 1163 | flatten_param { 1164 | axis: 1 1165 | } 1166 | } 1167 | layer { 1168 | name: "fc7_mbox_priorbox" 1169 | type: "PriorBox" 1170 | bottom: "fc7" 1171 | bottom: "data" 1172 | top: "fc7_mbox_priorbox" 1173 | prior_box_param { 1174 | min_size: 60.0 1175 | max_size: 111.0 1176 | aspect_ratio: 2.0 1177 | aspect_ratio: 3.0 1178 | flip: true 1179 | clip: false 1180 | variance: 0.10000000149 1181 | variance: 0.10000000149 1182 | variance: 0.20000000298 1183 | variance: 0.20000000298 1184 | step: 16.0 1185 | offset: 0.5 1186 | } 1187 | } 1188 | layer { 1189 | name: "conv6_2_mbox_loc" 1190 | type: "Convolution" 1191 | bottom: "conv6_2" 1192 | top: "conv6_2_mbox_loc" 1193 | param { 1194 | lr_mult: 1.0 1195 | decay_mult: 1.0 1196 | } 1197 | param { 1198 | lr_mult: 2.0 1199 | decay_mult: 0.0 1200 | } 1201 | convolution_param { 1202 | num_output: 24 1203 | pad: 1 1204 | kernel_size: 3 1205 | stride: 1 1206 | weight_filler { 1207 | type: "xavier" 1208 | } 1209 | bias_filler { 1210 | type: "constant" 1211 | value: 0.0 1212 | } 1213 | } 1214 | } 1215 | layer { 1216 | name: "conv6_2_mbox_loc_perm" 1217 | type: "Permute" 1218 | bottom: "conv6_2_mbox_loc" 1219 | top: "conv6_2_mbox_loc_perm" 1220 | permute_param { 1221 | order: 0 1222 | order: 2 1223 | order: 3 1224 | order: 1 1225 | } 1226 | } 1227 | layer { 1228 | name: "conv6_2_mbox_loc_flat" 1229 | type: "Flatten" 1230 | bottom: "conv6_2_mbox_loc_perm" 1231 | top: "conv6_2_mbox_loc_flat" 1232 | flatten_param { 1233 | axis: 1 1234 | } 1235 | } 1236 | layer { 1237 | name: "conv6_2_mbox_conf_hand_detection" 1238 | type: "Convolution" 1239 | bottom: "conv6_2" 1240 | top: "conv6_2_mbox_conf_hand_detection" 1241 | param { 1242 | lr_mult: 1.0 1243 | decay_mult: 1.0 1244 | } 1245 | param { 1246 | lr_mult: 2.0 1247 | decay_mult: 0.0 1248 | } 1249 | 
convolution_param { 1250 | num_output: 12 1251 | pad: 1 1252 | kernel_size: 3 1253 | stride: 1 1254 | weight_filler { 1255 | type: "xavier" 1256 | } 1257 | bias_filler { 1258 | type: "constant" 1259 | value: 0.0 1260 | } 1261 | } 1262 | } 1263 | layer { 1264 | name: "conv6_2_mbox_conf_hand_detection_perm" 1265 | type: "Permute" 1266 | bottom: "conv6_2_mbox_conf_hand_detection" 1267 | top: "conv6_2_mbox_conf_hand_detection_perm" 1268 | permute_param { 1269 | order: 0 1270 | order: 2 1271 | order: 3 1272 | order: 1 1273 | } 1274 | } 1275 | layer { 1276 | name: "conv6_2_mbox_conf_hand_detection_flat" 1277 | type: "Flatten" 1278 | bottom: "conv6_2_mbox_conf_hand_detection_perm" 1279 | top: "conv6_2_mbox_conf_hand_detection_flat" 1280 | flatten_param { 1281 | axis: 1 1282 | } 1283 | } 1284 | layer { 1285 | name: "conv6_2_mbox_priorbox" 1286 | type: "PriorBox" 1287 | bottom: "conv6_2" 1288 | bottom: "data" 1289 | top: "conv6_2_mbox_priorbox" 1290 | prior_box_param { 1291 | min_size: 111.0 1292 | max_size: 162.0 1293 | aspect_ratio: 2.0 1294 | aspect_ratio: 3.0 1295 | flip: true 1296 | clip: false 1297 | variance: 0.10000000149 1298 | variance: 0.10000000149 1299 | variance: 0.20000000298 1300 | variance: 0.20000000298 1301 | step: 32.0 1302 | offset: 0.5 1303 | } 1304 | } 1305 | layer { 1306 | name: "conv7_2_mbox_loc" 1307 | type: "Convolution" 1308 | bottom: "conv7_2" 1309 | top: "conv7_2_mbox_loc" 1310 | param { 1311 | lr_mult: 1.0 1312 | decay_mult: 1.0 1313 | } 1314 | param { 1315 | lr_mult: 2.0 1316 | decay_mult: 0.0 1317 | } 1318 | convolution_param { 1319 | num_output: 24 1320 | pad: 1 1321 | kernel_size: 3 1322 | stride: 1 1323 | weight_filler { 1324 | type: "xavier" 1325 | } 1326 | bias_filler { 1327 | type: "constant" 1328 | value: 0.0 1329 | } 1330 | } 1331 | } 1332 | layer { 1333 | name: "conv7_2_mbox_loc_perm" 1334 | type: "Permute" 1335 | bottom: "conv7_2_mbox_loc" 1336 | top: "conv7_2_mbox_loc_perm" 1337 | permute_param { 1338 | order: 0 1339 | order: 2 1340 | order: 3 1341 | order: 1 1342 | } 1343 | } 1344 | layer { 1345 | name: "conv7_2_mbox_loc_flat" 1346 | type: "Flatten" 1347 | bottom: "conv7_2_mbox_loc_perm" 1348 | top: "conv7_2_mbox_loc_flat" 1349 | flatten_param { 1350 | axis: 1 1351 | } 1352 | } 1353 | layer { 1354 | name: "conv7_2_mbox_conf_hand_detection" 1355 | type: "Convolution" 1356 | bottom: "conv7_2" 1357 | top: "conv7_2_mbox_conf_hand_detection" 1358 | param { 1359 | lr_mult: 1.0 1360 | decay_mult: 1.0 1361 | } 1362 | param { 1363 | lr_mult: 2.0 1364 | decay_mult: 0.0 1365 | } 1366 | convolution_param { 1367 | num_output: 12 1368 | pad: 1 1369 | kernel_size: 3 1370 | stride: 1 1371 | weight_filler { 1372 | type: "xavier" 1373 | } 1374 | bias_filler { 1375 | type: "constant" 1376 | value: 0.0 1377 | } 1378 | } 1379 | } 1380 | layer { 1381 | name: "conv7_2_mbox_conf_hand_detection_perm" 1382 | type: "Permute" 1383 | bottom: "conv7_2_mbox_conf_hand_detection" 1384 | top: "conv7_2_mbox_conf_hand_detection_perm" 1385 | permute_param { 1386 | order: 0 1387 | order: 2 1388 | order: 3 1389 | order: 1 1390 | } 1391 | } 1392 | layer { 1393 | name: "conv7_2_mbox_conf_hand_detection_flat" 1394 | type: "Flatten" 1395 | bottom: "conv7_2_mbox_conf_hand_detection_perm" 1396 | top: "conv7_2_mbox_conf_hand_detection_flat" 1397 | flatten_param { 1398 | axis: 1 1399 | } 1400 | } 1401 | layer { 1402 | name: "conv7_2_mbox_priorbox" 1403 | type: "PriorBox" 1404 | bottom: "conv7_2" 1405 | bottom: "data" 1406 | top: "conv7_2_mbox_priorbox" 1407 | prior_box_param { 1408 | min_size: 162.0 
1409 | max_size: 213.0 1410 | aspect_ratio: 2.0 1411 | aspect_ratio: 3.0 1412 | flip: true 1413 | clip: false 1414 | variance: 0.10000000149 1415 | variance: 0.10000000149 1416 | variance: 0.20000000298 1417 | variance: 0.20000000298 1418 | step: 64.0 1419 | offset: 0.5 1420 | } 1421 | } 1422 | layer { 1423 | name: "conv8_2_mbox_loc" 1424 | type: "Convolution" 1425 | bottom: "conv8_2" 1426 | top: "conv8_2_mbox_loc" 1427 | param { 1428 | lr_mult: 1.0 1429 | decay_mult: 1.0 1430 | } 1431 | param { 1432 | lr_mult: 2.0 1433 | decay_mult: 0.0 1434 | } 1435 | convolution_param { 1436 | num_output: 16 1437 | pad: 1 1438 | kernel_size: 3 1439 | stride: 1 1440 | weight_filler { 1441 | type: "xavier" 1442 | } 1443 | bias_filler { 1444 | type: "constant" 1445 | value: 0.0 1446 | } 1447 | } 1448 | } 1449 | layer { 1450 | name: "conv8_2_mbox_loc_perm" 1451 | type: "Permute" 1452 | bottom: "conv8_2_mbox_loc" 1453 | top: "conv8_2_mbox_loc_perm" 1454 | permute_param { 1455 | order: 0 1456 | order: 2 1457 | order: 3 1458 | order: 1 1459 | } 1460 | } 1461 | layer { 1462 | name: "conv8_2_mbox_loc_flat" 1463 | type: "Flatten" 1464 | bottom: "conv8_2_mbox_loc_perm" 1465 | top: "conv8_2_mbox_loc_flat" 1466 | flatten_param { 1467 | axis: 1 1468 | } 1469 | } 1470 | layer { 1471 | name: "conv8_2_mbox_conf_hand_detection" 1472 | type: "Convolution" 1473 | bottom: "conv8_2" 1474 | top: "conv8_2_mbox_conf_hand_detection" 1475 | param { 1476 | lr_mult: 1.0 1477 | decay_mult: 1.0 1478 | } 1479 | param { 1480 | lr_mult: 2.0 1481 | decay_mult: 0.0 1482 | } 1483 | convolution_param { 1484 | num_output: 8 1485 | pad: 1 1486 | kernel_size: 3 1487 | stride: 1 1488 | weight_filler { 1489 | type: "xavier" 1490 | } 1491 | bias_filler { 1492 | type: "constant" 1493 | value: 0.0 1494 | } 1495 | } 1496 | } 1497 | layer { 1498 | name: "conv8_2_mbox_conf_hand_detection_perm" 1499 | type: "Permute" 1500 | bottom: "conv8_2_mbox_conf_hand_detection" 1501 | top: "conv8_2_mbox_conf_hand_detection_perm" 1502 | permute_param { 1503 | order: 0 1504 | order: 2 1505 | order: 3 1506 | order: 1 1507 | } 1508 | } 1509 | layer { 1510 | name: "conv8_2_mbox_conf_hand_detection_flat" 1511 | type: "Flatten" 1512 | bottom: "conv8_2_mbox_conf_hand_detection_perm" 1513 | top: "conv8_2_mbox_conf_hand_detection_flat" 1514 | flatten_param { 1515 | axis: 1 1516 | } 1517 | } 1518 | layer { 1519 | name: "conv8_2_mbox_priorbox" 1520 | type: "PriorBox" 1521 | bottom: "conv8_2" 1522 | bottom: "data" 1523 | top: "conv8_2_mbox_priorbox" 1524 | prior_box_param { 1525 | min_size: 213.0 1526 | max_size: 264.0 1527 | aspect_ratio: 2.0 1528 | flip: true 1529 | clip: false 1530 | variance: 0.10000000149 1531 | variance: 0.10000000149 1532 | variance: 0.20000000298 1533 | variance: 0.20000000298 1534 | step: 100.0 1535 | offset: 0.5 1536 | } 1537 | } 1538 | layer { 1539 | name: "conv9_2_mbox_loc" 1540 | type: "Convolution" 1541 | bottom: "conv9_2" 1542 | top: "conv9_2_mbox_loc" 1543 | param { 1544 | lr_mult: 1.0 1545 | decay_mult: 1.0 1546 | } 1547 | param { 1548 | lr_mult: 2.0 1549 | decay_mult: 0.0 1550 | } 1551 | convolution_param { 1552 | num_output: 16 1553 | pad: 1 1554 | kernel_size: 3 1555 | stride: 1 1556 | weight_filler { 1557 | type: "xavier" 1558 | } 1559 | bias_filler { 1560 | type: "constant" 1561 | value: 0.0 1562 | } 1563 | } 1564 | } 1565 | layer { 1566 | name: "conv9_2_mbox_loc_perm" 1567 | type: "Permute" 1568 | bottom: "conv9_2_mbox_loc" 1569 | top: "conv9_2_mbox_loc_perm" 1570 | permute_param { 1571 | order: 0 1572 | order: 2 1573 | order: 3 1574 | 
order: 1 1575 | } 1576 | } 1577 | layer { 1578 | name: "conv9_2_mbox_loc_flat" 1579 | type: "Flatten" 1580 | bottom: "conv9_2_mbox_loc_perm" 1581 | top: "conv9_2_mbox_loc_flat" 1582 | flatten_param { 1583 | axis: 1 1584 | } 1585 | } 1586 | layer { 1587 | name: "conv9_2_mbox_conf_hand_detection" 1588 | type: "Convolution" 1589 | bottom: "conv9_2" 1590 | top: "conv9_2_mbox_conf_hand_detection" 1591 | param { 1592 | lr_mult: 1.0 1593 | decay_mult: 1.0 1594 | } 1595 | param { 1596 | lr_mult: 2.0 1597 | decay_mult: 0.0 1598 | } 1599 | convolution_param { 1600 | num_output: 8 1601 | pad: 1 1602 | kernel_size: 3 1603 | stride: 1 1604 | weight_filler { 1605 | type: "xavier" 1606 | } 1607 | bias_filler { 1608 | type: "constant" 1609 | value: 0.0 1610 | } 1611 | } 1612 | } 1613 | layer { 1614 | name: "conv9_2_mbox_conf_hand_detection_perm" 1615 | type: "Permute" 1616 | bottom: "conv9_2_mbox_conf_hand_detection" 1617 | top: "conv9_2_mbox_conf_hand_detection_perm" 1618 | permute_param { 1619 | order: 0 1620 | order: 2 1621 | order: 3 1622 | order: 1 1623 | } 1624 | } 1625 | layer { 1626 | name: "conv9_2_mbox_conf_hand_detection_flat" 1627 | type: "Flatten" 1628 | bottom: "conv9_2_mbox_conf_hand_detection_perm" 1629 | top: "conv9_2_mbox_conf_hand_detection_flat" 1630 | flatten_param { 1631 | axis: 1 1632 | } 1633 | } 1634 | layer { 1635 | name: "conv9_2_mbox_priorbox" 1636 | type: "PriorBox" 1637 | bottom: "conv9_2" 1638 | bottom: "data" 1639 | top: "conv9_2_mbox_priorbox" 1640 | prior_box_param { 1641 | min_size: 264.0 1642 | max_size: 315.0 1643 | aspect_ratio: 2.0 1644 | flip: true 1645 | clip: false 1646 | variance: 0.10000000149 1647 | variance: 0.10000000149 1648 | variance: 0.20000000298 1649 | variance: 0.20000000298 1650 | step: 300.0 1651 | offset: 0.5 1652 | } 1653 | } 1654 | layer { 1655 | name: "mbox_loc" 1656 | type: "Concat" 1657 | bottom: "conv4_3_norm_mbox_loc_flat" 1658 | bottom: "fc7_mbox_loc_flat" 1659 | bottom: "conv6_2_mbox_loc_flat" 1660 | bottom: "conv7_2_mbox_loc_flat" 1661 | bottom: "conv8_2_mbox_loc_flat" 1662 | bottom: "conv9_2_mbox_loc_flat" 1663 | top: "mbox_loc" 1664 | concat_param { 1665 | axis: 1 1666 | } 1667 | } 1668 | layer { 1669 | name: "mbox_conf" 1670 | type: "Concat" 1671 | bottom: "conv4_3_norm_mbox_conf_hand_detection_flat" 1672 | bottom: "fc7_mbox_conf_hand_detection_flat" 1673 | bottom: "conv6_2_mbox_conf_hand_detection_flat" 1674 | bottom: "conv7_2_mbox_conf_hand_detection_flat" 1675 | bottom: "conv8_2_mbox_conf_hand_detection_flat" 1676 | bottom: "conv9_2_mbox_conf_hand_detection_flat" 1677 | top: "mbox_conf" 1678 | concat_param { 1679 | axis: 1 1680 | } 1681 | } 1682 | layer { 1683 | name: "mbox_priorbox" 1684 | type: "Concat" 1685 | bottom: "conv4_3_norm_mbox_priorbox" 1686 | bottom: "fc7_mbox_priorbox" 1687 | bottom: "conv6_2_mbox_priorbox" 1688 | bottom: "conv7_2_mbox_priorbox" 1689 | bottom: "conv8_2_mbox_priorbox" 1690 | bottom: "conv9_2_mbox_priorbox" 1691 | top: "mbox_priorbox" 1692 | concat_param { 1693 | axis: 2 1694 | } 1695 | } 1696 | layer { 1697 | name: "mbox_loss" 1698 | type: "MultiBoxLoss" 1699 | bottom: "mbox_loc" 1700 | bottom: "mbox_conf" 1701 | bottom: "mbox_priorbox" 1702 | bottom: "label" 1703 | top: "mbox_loss" 1704 | include { 1705 | phase: TRAIN 1706 | } 1707 | propagate_down: true 1708 | propagate_down: true 1709 | propagate_down: false 1710 | propagate_down: false 1711 | loss_param { 1712 | normalization: VALID 1713 | } 1714 | multibox_loss_param { 1715 | loc_loss_type: SMOOTH_L1 1716 | conf_loss_type: SOFTMAX 1717 | loc_weight: 
1.0 1718 | num_classes: 2 1719 | share_location: true 1720 | match_type: PER_PREDICTION 1721 | overlap_threshold: 0.5 1722 | use_prior_for_matching: true 1723 | background_label_id: 0 1724 | use_difficult_gt: true 1725 | neg_pos_ratio: 3.0 1726 | neg_overlap: 0.5 1727 | code_type: CENTER_SIZE 1728 | ignore_cross_boundary_bbox: false 1729 | mining_type: MAX_NEGATIVE 1730 | } 1731 | } 1732 | 1733 | -------------------------------------------------------------------------------- /other/Hand_Detection/old_README.md: -------------------------------------------------------------------------------- 1 | ### Roadmap of hand keypoint detection 2 | * Step 1 3 | The pipeline is split into three networks with input sizes proposal-net: 12, refine-net: 24, and output-net: 48. proposal-net and refine-net are trained on the hand detection datasets, while output-net is trained on the hand keypoint dataset. 4 | The outputs of the 12 and 24 networks are mined for hard examples, which are added to the original data as the input of the next network, as follows: 5 | 12-net (generates the 12-sized dataset) ---> 24-net (hard examples from 12-net + the generated 24-sized dataset) ---> 48-net 6 | * Data generation 7 | 8 | When generating data, any ground-truth box whose height or width is smaller than 40 is treated as not being a hand, i.e. a wrong label. NMS (non-maximum suppression) is applied to the outputs of 12-net and 24-net to remove duplicate boxes, which also reduces computation. 9 | The [mtcnn-caffe](https://github.com/CongWeilin/mtcnn-caffe) re-implementation customizes the data layer; I prefer not to do that and instead generate HDF5 files, which is more flexible and lets both training and test phases be added. 10 | 11 | * Labels 12 | 13 | During data preparation, every image is annotated with 15 label values: 14 | 15 | 1. Column 1: the sample flag, with 1 for positive, 0 for negative, -1 for partial samples, and 3 for keypoint samples 16 | 17 | 2. Columns 2-5: bounding-box offsets (float); all set to -1 for samples without box information 18 | 19 | 3. The 42 keypoint-offset columns (float); all set to -1 for samples without the corresponding annotation 20 | 21 | 22 | > Modify softmax_loss_layer.cpp with an extra check so that the loss is computed only for labels 1 and 0 23 | Modify euclidean_loss_layer.cpp with an extra check so that entries set to -1 are excluded from the loss 24 | 25 | In other words, the HDF5 file has four blocks: besides data, there are label (positive/negative/partial flag), points (21 keypoints, hence 42 columns), and roi (box information: two corner points, four columns). 26 | 27 | * Custom layers 28 | 1. A custom fc layer that only forwards samples whose label is not -1, so there is no need to distinguish which of the networks is being trained, nor to write several scripts to generate the datasets. This is equivalent to modifying softmax_loss_layer 29 | 2. A custom euclidean_loss_layer that, in the same way, skips entries set to -1 30 | 31 | ### Datasets 32 | Gesture recognition involves two problems, hand detection and hand keypoint detection: the hand is first extracted from the original image and the keypoints are then regressed, so two types of datasets are required. 33 | 1. Hand detection datasets, which mark the position of each hand in the image with a rectangle and annotate the rectangle's coordinates in the original image. 34 | 2. Hand keypoint detection datasets, which annotate the coordinates of keypoints such as the palm, joints, and fingertips. 35 | 36 | The datasets adopted are therefore the following: 37 | #### Hand detection datasets 38 | * [Hand Dataset by Arpit Mittal, Andrew Zisserman and Phil Torr](http://www.robots.ox.ac.uk/~vgg/data/hands/) 39 | A comprehensive dataset of hand images collected from a variety of public image dataset sources. A total of 13,050 instances are annotated. Hand instances larger than a fixed bounding area (1500 square pixels) are considered "big enough" for detection and are used for evaluation, giving around 4,170 high-quality hand instances. No restriction was imposed on the pose or visibility of people, nor on the environment, when collecting the data. In each image, all hands that a human can clearly perceive are annotated. Each annotation consists of a bounding rectangle, which does not have to be axis-aligned but is oriented with respect to the wrist. 40 |
**Training Dataset**

| Source | #Instances | #Big Instances |
| --- | --- | --- |
| Buffy Stickman | 887 | 438 |
| INRIA pedestrian | 1343 | 137 |
| Poselet (H3D) | 1355 | 580 |
| Skin Dataset [2] | 703 | 139 |
| PASCAL VOC 2007 train and val set | 1867 | 507 |
| PASCAL VOC 2010 train and val set (except human layout set) | 3008 | 1060 |
| Total | 9163 | 2861 |

**Validation Dataset**

| Source | #Instances | #Big Instances |
| --- | --- | --- |
| Movie Dataset* | 1856 | 649 |
| Total | 1856 | 649 |

**Test Dataset**

| Source | #Instances | #Big Instances |
| --- | --- | --- |
| PASCAL VOC 2007 test set | 1626 | 562 |
| PASCAL VOC 2010 human layout val set | 405 | 98 |
| Total | 2031 | 660 |
159 | Dataset preview: 160 |
161 | 162 |
163 | 164 | * [VIVA Hand Detection Dataset](http://cvrr.ucsd.edu/vivachallenge/index.php/hands/hand-detection/) 165 | This dataset annotates the hands of drivers and passengers with 2D bounding boxes. It consists of 54 videos collected during naturalistic driving, covering illumination changes, large hand movements, and frequent occlusion. Some of the data was captured from the recording platform and some was contributed from YouTube. 166 | Dataset preview: 167 |
168 | 169 |
170 | #### Hand keypoint detection datasets 171 | * [CMU Hand Database](http://domedb.perception.cs.cmu.edu/handdb.html) 172 | This dataset was collected by CMU from several public datasets, with the hand keypoints annotated manually. Additional hand-detection data is generated from enlarged rectangles that enclose the keypoints. 173 | Dataset preview: 174 |
175 | 176 |
177 | ### Preprocessing 178 | We mainly use the Hand detection datasets and the Hand keypoint detection dataset. The Hand detection datasets contain hand bounding-box annotations and are used mainly to train the detection task; the Hand keypoint detection dataset contains both bounding-box annotations and keypoint information and is used mainly to train the keypoints. The training set is divided into four kinds of samples: negative, positive, partial, and keypoint samples, generated at a ratio of $3:1:1:2$. 179 | #### Extracting positive, negative, and partial samples 180 | 1. Randomly crop boxes from the Hand detection dataset and compute the $IoU$ against the annotated boxes: above $0.65$ is a positive sample, between $0.4$ and $0.65$ a partial sample, and below $0.4$ a negative sample. 181 | $IoU$: in short, the overlap ratio between the window produced by the model and the originally annotated window, i.e. the intersection of the detection result and the ground truth divided by their union: 182 | $$ IoU = \frac{\text{DetectionResult} \cap \text{GroundTruth}}{\text{DetectionResult} \cup \text{GroundTruth}} $$ 183 | 2. Compute the bounding-box offsets. For a ground-truth box, $(x1,y1)$ is the top-left corner and $(x2,y2)$ the bottom-right corner; the newly cropped box is described by 184 | $(xn1,yn1)$, $(xn2,yn2)$, $width$, $height$. Then 185 | $$ \text{offset}_{x1} = (x1 - xn1)/width $$ 186 | and the offsets of the other three coordinates are computed in the same way. 187 | 188 | 3. Positive and partial samples both carry bounding-box information; negative samples do not need it. 189 | 190 | #### Extracting keypoint samples 191 | Keypoint samples are extracted from the Hand keypoint detection dataset: based on the annotated bounding box, crops satisfying the positive-sample criterion are taken at random and the keypoint coordinates are adjusted to the crop. 192 | 193 | ### Roadmap 194 | #### Loss modifications 195 | Training has to compute 3 losses at the same time, but each task needs a different loss, so during data preparation every image is annotated with 15 label values: 196 | 1. Column 1: the sample flag, with 1 for positive, 0 for negative, 2 for partial, and 3 for keypoint samples 197 | 2. Columns 2-5: bounding-box offsets (float); all set to -1 for samples without box information 198 | 3. Columns 6-15: keypoint offsets (float); all set to -1 for samples without the corresponding annotation 199 | 200 | Once the labels are ready, the following measures are taken during training: 201 | 1. A custom softmax_loss with an extra check that computes the loss only for labels 1 and 0. 202 | 2. A custom euclidean_loss with an extra check that skips entries set to -1. 203 | 3. Hard example selection: the classification task uses online hard example mining, i.e. during training the computed loss values are sorted and only the hardest 70% of samples (those with the largest losses) are back-propagated. 204 | 205 | #### Network description 206 | The pipeline is divided into three stages covering classification, bounding-box regression, and landmark detection. 207 | 1. stage1: on top of an image pyramid, a fully convolutional network performs detection, while bounding-box regression and non-maximum suppression (NMS) merge highly overlapping candidates. This stage yields candidate windows for hand regions together with bounding-box regression vectors, and the regression is used to calibrate the candidate windows. 208 | 2. stage2: all windows that pass stage1 are fed in for further classification, and bounding-box regression and NMS again remove false-positive regions. 209 | 3. stage3: similar in purpose to stage2, but stage3 applies more supervision and a stronger constraint to the hand region, namely the hand keypoints, so stage3 additionally outputs the hand keypoints. -------------------------------------------------------------------------------- /other/Hand_Detection/pic/demo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/other/Hand_Detection/pic/demo.jpg -------------------------------------------------------------------------------- /other/Hand_Detection/pic/example_image.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/other/Hand_Detection/pic/example_image.jpg -------------------------------------------------------------------------------- /other/Hand_Detection/pic/width_and_height.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/other/Hand_Detection/pic/width_and_height.png -------------------------------------------------------------------------------- /other/Hand_Detection/ssd_camera.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import sys 5 | sys.path.insert(0, 'caffe/python') 6 | import caffe 7 | from utils.ssd_net import * 8 | import time 9 | import urllib 10 | 11 | 12 | ## Use local camera 13 | # cap = cv2.VideoCapture(0) 14 | # # width = 720 15 | # # height = 480 16 | width = 640 17 | height = 480 18 | # cap.set(cv2.CAP_PROP_FRAME_WIDTH, width) 19 | # cap.set(cv2.CAP_PROP_FRAME_HEIGHT, height) 20 | 21 | ## Use ipcam 22 |
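# Note: two IP-camera options are shown below. The commented-out cv2.VideoCapture(url) would read
# the MJPEG /videofeed stream directly, while the active code polls the IP Webcam app's shot.jpg
# endpoint with urllib and decodes each JPEG frame via cv2.imdecode. urllib.urlopen is Python 2
# syntax; under Python 3 it would be urllib.request.urlopen.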
# url = r"http://192.168.1.190:8080/videofeed" 23 | # capture = cv2.VideoCapture(url) 24 | 25 | # Replace the URL with your own IPwebcam shot.jpg IP:port 26 | url = 'http://192.168.1.190:8080/shot.jpg' 27 | 28 | 29 | model_def = 'model/deploy.prototxt' 30 | model_weights = 'model/snapshot/VGG_HAND_SSD_300x300_iter_50000.caffemodel' 31 | 32 | ssd_net = SSD_NET(model_weights, model_def, GPU_MODE=True, threshold=0.7) 33 | 34 | while True: 35 | # get a frame 36 | # start_time = time.time() 37 | # ret, frame = capture.read() 38 | 39 | # Use urllib to get the image from the IP camera 40 | imgResp = urllib.urlopen(url) 41 | 42 | # Numpy to convert into a array 43 | imgNp = np.array(bytearray(imgResp.read()),dtype=np.uint8) 44 | 45 | # Finally decode the array to OpenCV usable format ;) 46 | frame = cv2.imdecode(imgNp,-1) 47 | 48 | start_time = time.time() 49 | 50 | # show a frame 51 | try: 52 | image_np = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) 53 | except: 54 | print("Error converting to RGB") 55 | 56 | top_label_indices, top_conf, top_xmin, top_ymin, top_xmax, top_ymax = ssd_net.detect(image_np/255.0) 57 | print(image_np.shape) 58 | 59 | print(top_conf) 60 | print(top_label_indices) 61 | for i in range(len(top_conf)): 62 | xmin = int(round(top_xmin[i] * width)) 63 | ymin = int(round(top_ymin[i] * height)) 64 | xmax = int(round(top_xmax[i] * width)) 65 | ymax = int(round(top_ymax[i] * height)) 66 | print(xmin, ymin, xmax, ymax, top_conf[i]) 67 | # if np.sum(top_xmin[i]<0) > 0 or np.sum(top_xmax[i]<0) > 0 or np.sum(top_ymin[i]<0) > 0 or np.sum(top_ymax[i]<0) > 0: 68 | # continue 69 | cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), (0, 255, 0), 2) 70 | # # time.sleep(0.1) 71 | fps = 1/(time.time() - start_time) 72 | cv2.putText(frame, 'FPS: %d' % fps, (20, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2) 73 | 74 | cv2.imshow("capture", frame) 75 | 76 | if cv2.waitKey(1) == 27: 77 | break # esc to quit 78 | 79 | # capture.release() 80 | cv2.destroyAllWindows() -------------------------------------------------------------------------------- /other/Hand_Detection/ssd_detection.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import sys 4 | sys.path.insert(0, 'caffe/python') 5 | import caffe 6 | from utils.ssd_net import * 7 | 8 | plt.rcParams['figure.figsize'] = (10, 10) 9 | plt.rcParams['image.interpolation'] = 'nearest' 10 | plt.rcParams['image.cmap'] = 'gray' 11 | 12 | model_def = 'model/deploy.prototxt' 13 | model_weights = 'model/snapshot/VGG_HAND_SSD_300x300_iter_50000.caffemodel' 14 | 15 | ssd_net = SSD_NET(model_weights, model_def, GPU_MODE=True, threshold=0.5) 16 | 17 | # image = caffe.io.load_image('/Users/hzzone/Desktop/CARDS_COURTYARD_B_T_0324.jpg') 18 | image = caffe.io.load_image('/home/hzzone/Desktop/2.jpg') 19 | 20 | top_label_indices, top_conf, top_xmin, top_ymin, top_xmax, top_ymax = ssd_net.detect(image) 21 | 22 | # print(top_label_indices, top_conf, top_xmin, top_ymin, top_xmax, top_ymax) 23 | 24 | colors = plt.cm.hsv(np.linspace(0, 1, 21)).tolist() 25 | 26 | plt.imshow(image) 27 | currentAxis = plt.gca() 28 | 29 | for i in xrange(top_conf.shape[0]): 30 | xmin = int(round(top_xmin[i] * image.shape[1])) 31 | ymin = int(round(top_ymin[i] * image.shape[0])) 32 | xmax = int(round(top_xmax[i] * image.shape[1])) 33 | ymax = int(round(top_ymax[i] * image.shape[0])) 34 | score = top_conf[i] 35 | label = int(top_label_indices[i]) 36 | # label_name = top_labels[i] 37 | label_name = label 38 | 
display_txt = '%s: %.2f' % ('hand', score) 39 | coords = (xmin, ymin), xmax-xmin+1, ymax-ymin+1 40 | color = colors[label] 41 | currentAxis.add_patch(plt.Rectangle(*coords, fill=False, edgecolor=color, linewidth=2)) 42 | currentAxis.text(xmin, ymin, display_txt, bbox={'facecolor':color, 'alpha':0.5}) 43 | 44 | plt.show() 45 | 46 | -------------------------------------------------------------------------------- /other/Hand_Detection/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/other/Hand_Detection/utils/__init__.py -------------------------------------------------------------------------------- /other/Hand_Detection/utils/mAP.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | def eval_mAP(predict_file, ground_truth_file): 4 | predict_dict = dict() 5 | ground_truth_dict = dict() 6 | def get_info(info_file, info_dict): 7 | bbox_num = 0 8 | first_line = True 9 | with open(info_file) as fr: 10 | for line in fr: 11 | if first_line: 12 | first_line = False 13 | continue 14 | if len(line.strip().split(',')) == 6: 15 | line_data = line.strip().split(',') 16 | im_id = line_data[0] 17 | xmin,ymin,xmax,ymax,score = map(float, line_data[1:]) 18 | else: 19 | im_id,xmin,ymin,xmax,ymax = map(float, line.strip().split(',')) 20 | score = 1. 21 | if not im_id in info_dict: 22 | info_dict[im_id] = list() 23 | info_dict[im_id].append([xmin,ymin,xmax,ymax,score]) 24 | bbox_num+=1 25 | return bbox_num 26 | 27 | predict_bbox_num = get_info(predict_file, predict_dict) 28 | ground_truth_bbox_num = get_info(ground_truth_file, ground_truth_dict) 29 | score_list = list() 30 | match_list = list() 31 | 32 | def iou(predict_bbox, ground_truth_bbox): 33 | predict_area = (predict_bbox[2] - predict_bbox[0])*(predict_bbox[3] - predict_bbox[1]) 34 | ground_truth_area = (ground_truth_bbox[2] - ground_truth_bbox[0])*(ground_truth_bbox[3] - ground_truth_bbox[1]) 35 | inter_x = min(predict_bbox[2],ground_truth_bbox[2]) - max(predict_bbox[0],ground_truth_bbox[0]) 36 | inter_y = min(predict_bbox[3],ground_truth_bbox[3]) - max(predict_bbox[1],ground_truth_bbox[1]) 37 | if inter_x<=0 or inter_y<=0: 38 | return 0 39 | inter_area = inter_x*inter_y 40 | return inter_area / (predict_area+ground_truth_area-inter_area) 41 | 42 | def compare(predict_list, ground_truth_list, score_list, match_list): 43 | ground_truth_unuse = [True for i in range(len(ground_truth_list))] 44 | for predict_bbox in predict_list: 45 | match = False 46 | for i in range(len(ground_truth_list)): 47 | if ground_truth_unuse[i]: 48 | if iou(predict_bbox, ground_truth_list[i])>0.5: 49 | match = True 50 | ground_truth_unuse[i] = False 51 | break 52 | score_list.append(predict_bbox[-1]) 53 | match_list.append(int(match)) 54 | 55 | for key in predict_dict.keys(): 56 | compare(predict_dict[key], ground_truth_dict[key], score_list, match_list) 57 | 58 | p = list() 59 | r = list() 60 | predict_num = 0 61 | truth_num = 0 62 | score_match_list = list(zip(score_list, match_list)) 63 | score_match_list.sort(key=lambda x:x[0], reverse = True) 64 | for item in score_match_list: 65 | predict_num+=1 66 | truth_num+=item[1] 67 | p.append(float(truth_num)/ground_truth_bbox_num) 68 | r.append(float(truth_num)/predict_num) 69 | mAP = 0 70 | for i in range(1,len(p)): 71 | mAP += (r[i-1]+r[i])/2*(p[i]-p[i-1]) 72 | return p, r, mAP 73 | 74 | if __name__ == "__main__": 75 | p, r, mAP = 
eval_mAP("/Users/hzzone/Downloads/object_detection_mAP-master/example/val_pred.csv", 76 | "/Users/hzzone/Downloads/object_detection_mAP-master/example/val_gt.csv") 77 | print(mAP) 78 | -------------------------------------------------------------------------------- /other/Hand_Detection/utils/output.py: -------------------------------------------------------------------------------- 1 | import os 2 | from ssd_net import * 3 | import sys 4 | sys.path.insert(0, '../caffe/python') 5 | import xml.dom.minidom 6 | import csv 7 | import re 8 | import time 9 | 10 | data_dir = '../data' 11 | 12 | def read_xmlfile(file_path): 13 | DomTree = xml.dom.minidom.parse(file_path) 14 | annotation = DomTree.documentElement 15 | objectlist = annotation.getElementsByTagName('object') 16 | label = file_path.split(os.sep)[-1].strip('.xml') 17 | boxes = [] 18 | for objects in objectlist: 19 | bndbox = objects.getElementsByTagName('bndbox')[0] 20 | xmin = int(bndbox.getElementsByTagName('xmin')[0].childNodes[0].data) 21 | ymin = int(bndbox.getElementsByTagName('ymin')[0].childNodes[0].data) 22 | xmax = int(bndbox.getElementsByTagName('xmax')[0].childNodes[0].data) 23 | ymax = int(bndbox.getElementsByTagName('ymax')[0].childNodes[0].data) 24 | print(xmin, ymin, xmax, ymax) 25 | boxes.append([label, xmin, ymin, xmax, ymax, 1]) 26 | # print(bndbox) 27 | return boxes 28 | 29 | 30 | def output_gt_label(datatset_name): 31 | anno_path = os.path.join(data_dir, datatset_name, 'test', 'Annotations') 32 | # img_dir = os.path.join(data_dir, datatset_name, 'test', 'JPEGImages') 33 | all_boxes = [['id', 'x1', 'y1', 'x2', 'y2', 'score'], ] 34 | for root, dirs, files in os.walk(anno_path): 35 | for xml_file in files: 36 | xml_file_path = os.path.join(root, xml_file) 37 | all_boxes.extend(read_xmlfile(xml_file_path)) 38 | with open('../data/gth/{}.csv'.format(datatset_name), 'wb') as csvfile: 39 | csvwriter = csv.writer(csvfile, delimiter=',') 40 | for box in all_boxes: 41 | csvwriter.writerow(box) 42 | 43 | def output(model_def, model_weights, datatset_name): 44 | 45 | img_dir = os.path.join(data_dir, datatset_name, 'test', 'JPEGImages') 46 | ssd_net = SSD_NET(model_weights, model_def, GPU_MODE=True, threshold=0.2) 47 | 48 | output_boxes = [['id', 'x1', 'y1', 'x2', 'y2', 'score'], ] 49 | 50 | 51 | total_time = 0.0 52 | 53 | for img_name in os.listdir(img_dir): 54 | img_path = os.path.join(img_dir, img_name) 55 | img_name = img_name.strip('.jpg') 56 | 57 | image = caffe.io.load_image(img_path) 58 | 59 | start = time.time() 60 | 61 | top_label_indices, top_conf, top_xmin, top_ymin, top_xmax, top_ymax = ssd_net.detect(image) 62 | 63 | total_time = total_time + time.time() - start 64 | 65 | print(img_path) 66 | 67 | for i in xrange(top_conf.shape[0]): 68 | xmin = int(round(top_xmin[i] * image.shape[1])) 69 | ymin = int(round(top_ymin[i] * image.shape[0])) 70 | xmax = int(round(top_xmax[i] * image.shape[1])) 71 | ymax = int(round(top_ymax[i] * image.shape[0])) 72 | score = top_conf[i] 73 | label_indice = top_label_indices[i] 74 | 75 | output_boxes.append([img_name, xmin, ymin, xmax, ymax, score]) 76 | 77 | assert label_indice == 1.0 78 | 79 | 80 | iter_times = re.findall('VGG_HAND_SSD_300x300_(.*?).caffemodel', model_weights.split(os.sep)[-1])[0] 81 | print(iter_times) 82 | output_dir = '../output/{}'.format(iter_times) 83 | if not os.path.exists(output_dir): 84 | os.makedirs(output_dir) 85 | output_file = os.path.join(output_dir, '{}.csv'.format(datatset_name)) 86 | with open(output_file, 'wb') as csvfile: 87 | csvwriter = 
csv.writer(csvfile, delimiter=',') 88 | for box in output_boxes: 89 | csvwriter.writerow(box) 90 | return total_time/len(os.listdir(img_dir)) 91 | 92 | 93 | 94 | model_def = '../model/deploy.prototxt' 95 | model_weights = '../model/snapshot/VGG_HAND_SSD_300x300_iter_50000.caffemodel' 96 | # model_path = '../model/snapshot' 97 | # total_time = [] 98 | # for model_weights in os.listdir(model_path): 99 | # if model_weights.endswith('.caffemodel'): 100 | # total_time.append(output(model_def, os.path.join(model_path, model_weights), 'stanfordhands')) 101 | # total_time.append(output(model_def, os.path.join(model_path, model_weights), 'egohands')) 102 | 103 | print(output(model_def, model_weights, 'stanfordhands')) 104 | print(output(model_def, model_weights, 'egohands')) 105 | 106 | # print(total_time) 107 | # output_gt_label('egohands') 108 | # output_gt_label('stanfordhands') 109 | # read_xmlfile('/Users/hzzone/Desktop/Hand-Keypoint-Detection/data/stanfordhands/test/Annotations/VOC2007_1.xml') 110 | 111 | -------------------------------------------------------------------------------- /other/Hand_Detection/utils/plot_loss.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import re 3 | 4 | with open('../model/train.log') as f: 5 | data = f.read() 6 | 7 | pattern = re.compile(r''' 8 | I0(.*?)solver.cpp:243] Iteration (.*?), loss = (.*?) 9 | I0(.*?)solver.cpp:259] Train net output #0: mbox_loss = (.*?) \(\* 1 = (.*?) loss\) 10 | I0(.*?)sgd_solver.cpp:138] Iteration (.*?), lr = (.*?) 11 | ''') 12 | results = re.findall(pattern, data) 13 | iter_num = [] 14 | total_loss = [] 15 | mbox_loss = [] 16 | learning_rate = [] 17 | print(results) 18 | 19 | for result in results: 20 | iter_num.append(int(result[1])) 21 | total_loss.append(float(result[2])) 22 | mbox_loss.append(float(result[4])) 23 | learning_rate.append(float(result[-1])) 24 | 25 | plt.subplot(311) 26 | plt.plot(iter_num, total_loss) 27 | plt.subplot(312) 28 | plt.plot(iter_num, mbox_loss) 29 | plt.subplot(313) 30 | plt.plot(iter_num, learning_rate) 31 | 32 | plt.show() 33 | 34 | -------------------------------------------------------------------------------- /other/Hand_Detection/utils/score.py: -------------------------------------------------------------------------------- 1 | import mAP 2 | import os 3 | 4 | p, r, AP = mAP.eval_mAP('/home/hzzone/Hand-Keypoint-Detection/output/iter_50000/egohands.csv', '/home/hzzone/Hand-Keypoint-Detection/data/gth/egohands.csv') 5 | print(AP) 6 | p, r, AP = mAP.eval_mAP('/home/hzzone/Hand-Keypoint-Detection/output/iter_50000/stanfordhands.csv', '/home/hzzone/Hand-Keypoint-Detection/data/gth/stanfordhands.csv') 7 | print(AP) 8 | # for test_data in ['egohands', 'stanfordhands']: 9 | # gth_path = '../data/gth/{}.csv'.format(test_data) 10 | # output_path = [os.path.join('../output', iter_num) for iter_num in os.listdir('../output')] 11 | # for iter_num_output in output_path: 12 | # p, r, AP = mAP.eval_mAP('{}/{}.csv'.format(iter_num_output, test_data), gth_path) 13 | # print(iter_num_output, AP) 14 | -------------------------------------------------------------------------------- /other/Hand_Detection/utils/ssd_net.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.insert(0, '../caffe/python') 3 | import caffe 4 | import numpy as np 5 | from google.protobuf import text_format 6 | from caffe.proto import caffe_pb2 7 | 8 | def get_labelname(labelmap, labels): 9 | num_labels 
= len(labelmap.item) 10 | print(labelmap.item[0]) 11 | print(num_labels) 12 | labelnames = [] 13 | if type(labels) is not list: 14 | labels = [labels] 15 | for label in labels: 16 | found = False 17 | for i in xrange(0, num_labels): 18 | if label == labelmap.item[i].label: 19 | found = True 20 | labelnames.append(labelmap.item[i].display_name) 21 | break 22 | assert found == True 23 | return labelnames 24 | 25 | class SSD_NET(object): 26 | 27 | def __init__(self, model_weights, model_def, threshold=0.5, GPU_MODE=False): 28 | if GPU_MODE: 29 | caffe.set_device(0) 30 | caffe.set_mode_gpu() 31 | else: 32 | caffe.set_mode_cpu() 33 | self.net = caffe.Net(model_def, # defines the structure of the model 34 | model_weights, # contains the trained weights 35 | caffe.TEST) # use test mode (e.g., don't perform dropout) 36 | self.threshold = threshold 37 | self.transformer = caffe.io.Transformer({'data': self.net.blobs['data'].data.shape}) 38 | self.transformer.set_transpose('data', (2, 0, 1)) 39 | self.transformer.set_mean('data', np.array([127.0, 127.0, 127.0])) # mean pixel 40 | self.transformer.set_raw_scale('data', 41 | 255) # the reference model operates on images in [0,255] range instead of [0,1] 42 | self.transformer.set_channel_swap('data', (2, 1, 0)) # the reference model has channels in BGR order instead of RGB 43 | image_resize = 300 44 | self.net.blobs['data'].reshape(1, 3, image_resize, image_resize) 45 | 46 | 47 | def detect(self, img): 48 | transformed_image = self.transformer.preprocess('data', img) 49 | self.net.blobs['data'].data[...] = transformed_image 50 | detections = self.net.forward()['detection_out'] 51 | # Parse the outputs. 52 | det_label = detections[0, 0, :, 1] 53 | det_conf = detections[0, 0, :, 2] 54 | det_xmin = detections[0, 0, :, 3] 55 | det_ymin = detections[0, 0, :, 4] 56 | det_xmax = detections[0, 0, :, 5] 57 | det_ymax = detections[0, 0, :, 6] 58 | # Get detections with confidence higher than 0.6. 
59 | # print(det_conf) 60 | top_indices = [i for i, conf in enumerate(det_conf) if conf >= self.threshold] 61 | 62 | top_conf = det_conf[top_indices] 63 | top_label_indices = det_label[top_indices].tolist() 64 | top_xmin = det_xmin[top_indices] 65 | top_ymin = det_ymin[top_indices] 66 | top_xmax = det_xmax[top_indices] 67 | top_ymax = det_ymax[top_indices] 68 | 69 | return top_label_indices, top_conf, top_xmin, top_ymin, top_xmax, top_ymax 70 | 71 | 72 | 73 | -------------------------------------------------------------------------------- /other/Hand_Detection/utils/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def IoU(box, boxes): 4 | """Compute IoU between detect box and gt boxes 5 | 6 | Parameters: 7 | ---------- 8 | box: numpy array , shape (5, ): x1, y1, x2, y2, score 9 | input box 10 | boxes: numpy array, shape (n, 4): x1, y1, x2, y2 11 | input ground truth boxes 12 | 13 | Returns: 14 | ------- 15 | ovr: numpy.array, shape (n, ) 16 | IoU 17 | """ 18 | box_area = (box[2] - box[0] + 1) * (box[3] - box[1] + 1) 19 | area = (boxes[:, 2] - boxes[:, 0] + 1) * (boxes[:, 3] - boxes[:, 1] + 1) 20 | xx1 = np.maximum(box[0], boxes[:, 0]) 21 | yy1 = np.maximum(box[1], boxes[:, 1]) 22 | xx2 = np.minimum(box[2], boxes[:, 2]) 23 | yy2 = np.minimum(box[3], boxes[:, 3]) 24 | 25 | # compute the width and height of the bounding box 26 | w = np.maximum(0, xx2 - xx1 + 1) 27 | h = np.maximum(0, yy2 - yy1 + 1) 28 | 29 | inter = w * h 30 | ovr = inter / (box_area + area - inter) 31 | return ovr 32 | 33 | 34 | def convert_to_square(bbox): 35 | """Convert bbox to square 36 | 37 | Parameters: 38 | ---------- 39 | bbox: numpy array , shape n x 5 40 | input bbox 41 | 42 | Returns: 43 | ------- 44 | square bbox 45 | """ 46 | square_bbox = bbox.copy() 47 | 48 | h = bbox[:, 3] - bbox[:, 1] + 1 49 | w = bbox[:, 2] - bbox[:, 0] + 1 50 | max_size = np.maximum(h, w) 51 | square_bbox[:, 0] = bbox[:, 0] + w*0.5 - max_size*0.5 52 | square_bbox[:, 1] = bbox[:, 1] + h*0.5 - max_size*0.5 53 | square_bbox[:, 2] = square_bbox[:, 0] + max_size - 1 54 | square_bbox[:, 3] = square_bbox[:, 1] + max_size - 1 55 | return square_bbox 56 | -------------------------------------------------------------------------------- /other/Openpose-Keras/.gitignore: -------------------------------------------------------------------------------- 1 | # TOTALLY IGNORE THE MODEL FILES 2 | .ipynb_checkpoints 3 | *.h5 4 | *.h5py 5 | *.npy -------------------------------------------------------------------------------- /other/Openpose-Keras/README.md: -------------------------------------------------------------------------------- 1 | # OpenPose-Keras 2 | A little bit of play with OpenPose without using their API but allowing to build / prototype pre and post-processing steps in Keras. Please keep in mind that this is more of a toy project and not anything even close to any production applications. 
If you are looking for something more *useful*, please invest some time and get the actual OpenPose up and running :) 3 | 4 | Table of contents: 5 | - Body keypoint estimation network (coming soon) 6 | - Face keypoint estimation network (coming soon) 7 | - [Hand keypoint estimation network](#handKeypointEstimationNetwork) 8 | * [Model description](#handKeypointEstimationNetwork_modelDescription) 9 | * [Input format and pre-processing](#handKeypointEstimationNetwork_inputFormatAndPreProcessing) 10 | * [Post-processing](#handKeypointEstimationNetwork_postProcessing) 11 | * [Discovered issues](#handKeypointEstimationNetwork_issues) 12 | - [External resources](#externalResources) 13 | 14 | 15 | ## Hand keypoint estimation network 16 | [![IMAGE ALT TEXT HERE](https://img.youtube.com/vi/FnoI8ufwhbs/0.jpg)](https://www.youtube.com/watch?v=FnoI8ufwhbs) 17 | Please check out the demo on YouTube: https://www.youtube.com/watch?v=FnoI8ufwhbs 18 | 19 | 20 | ### Model description 21 | The original model can be found on [OpenPose's github](https://github.com/CMU-Perceptual-Computing-Lab/openpose). Model weights converted from the Caffe model definition are available for download: https://drive.google.com/file/d/1yPQFrCrDltqzYAnWBl__O7oZxGL0sQlu/view 22 | The readme on the main page describes the "hand keypoint detection" as 2x21 keypoint estimation. The network itself outputs 22 channels (21 keypoints + background). The final layer feeds from the 128-channel convolutional layer (Mconv6_stage6) and is defined as follows (in [models/hand/pose_deploy.prototxt](https://github.com/CMU-Perceptual-Computing-Lab/openpose/blob/master/models/hand/pose_deploy.prototxt)): 23 | ``` 24 | layer { 25 | name: "Mconv7_stage6" 26 | type: "Convolution" 27 | bottom: "Mconv6_stage6" 28 | top: "net_output" 29 | param { 30 | lr_mult: 4.0 31 | decay_mult: 1 32 | } 33 | param { 34 | lr_mult: 8.0 35 | decay_mult: 0 36 | } 37 | convolution_param { 38 | num_output: 22 39 | pad: 0 40 | kernel_size: 1 41 | weight_filler { 42 | type: "gaussian" 43 | std: 0.01 44 | } 45 | bias_filler { 46 | type: "constant" 47 | } 48 | dilation: 1 49 | } 50 | } 51 | ``` 52 | Keep in mind that this particular network does NOT produce any part affinity fields, just finger keypoints. OpenPose's documentation contains the following picture describing the keypoint channel ids: 53 | 54 | 55 | ### Input format and pre-processing 56 | I believe that the natural resolution of the input images (i.e. the standard network input size) is 368 pixels on the width, with the height following from the aspect ratio. From the papers presenting this method one can figure out that the authors use multi-scale inputs: basically, they run the image through scales from 0.5 to 1.5 and average the resulting heatmaps. The network accepts 3-channel RGB images with 32-bit floating-point values scaled to the range -0.5 <= x <= 0.5 (a minimal pre-processing sketch is included below). 57 | 58 | 59 | ### Post-processing 60 | I haven't studied the code of the OpenPose library very well (yet!), but I noticed that the returned heatmaps seem to have bi-modal distributions, e.g. some detection values are strongly negative and some are strongly positive. I understood that this may be their way of distinguishing the left hand from the right one. I still need to investigate that. A simple peak-extraction sketch is included below as well. 61 | 62 | 63 | ### Discovered issues 64 | - It seems like the model does not capture hand keypoints when exposed to images of people wearing gloves. I haven't figured out how exactly the network was trained, but I can imagine that there was no emphasis on glove-wearing targets.
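
### Pre-processing sketch

A minimal sketch of the pre-processing described above, not taken from this repository: it assumes the converted Keras model takes a channel-last batch, fixes the width at 368 while deriving the height from the aspect ratio, and scales pixels to [-0.5, 0.5]. The `preprocess` name and the `hand_keypoints.h5` filename in the usage comment are placeholders.

```python
import cv2
import numpy as np

def preprocess(image_bgr, target_width=368):
    """Resize a BGR image for the hand keypoint network and scale it to [-0.5, 0.5]."""
    h, w = image_bgr.shape[:2]
    # Fix the width at the working resolution and let the height follow the aspect ratio.
    target_height = int(round(h * target_width / w))
    resized = cv2.resize(image_bgr, (target_width, target_height))
    # OpenCV loads images as BGR; the network expects RGB.
    rgb = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)
    # 32-bit floats scaled from [0, 255] down to [-0.5, 0.5].
    scaled = rgb.astype(np.float32) / 255.0 - 0.5
    # Add a batch dimension: (1, height, width, 3), channel-last as Keras expects by default.
    return scaled[np.newaxis, ...]

# Example usage (hypothetical weights file; test_image.png ships with this repo):
# model = keras.models.load_model("hand_keypoints.h5")
# blob = preprocess(cv2.imread("images/test_image.png"))
# heatmaps = model.predict(blob)[0]   # expected shape: (H', W', 22)
```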
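
### Peak-extraction sketch

The OpenCV demos elsewhere in this repository (`handPoseImage.py`, `handPoseVideo.py`) reduce each heatmap channel to its global maximum; the sketch below applies the same idea to the Keras output, assuming channel-last heatmaps of shape (H', W', 22). The `extract_keypoints` helper and the 0.1 threshold are illustrative choices, not part of the original code.

```python
import cv2
import numpy as np

def extract_keypoints(heatmaps, frame_width, frame_height, threshold=0.1):
    """Reduce each of the first 21 heatmap channels to its global maximum.

    `heatmaps` is assumed to be channel-last with shape (H', W', 22); the
    22nd (background) channel is ignored. Returns one (x, y) tuple or None
    per keypoint, in original-image coordinates.
    """
    points = []
    for i in range(21):
        # Upsample the coarse heatmap back to the original image resolution.
        prob_map = cv2.resize(np.ascontiguousarray(heatmaps[:, :, i]),
                              (frame_width, frame_height))
        # Keep the location of the strongest response if it is confident enough.
        _, prob, _, point = cv2.minMaxLoc(prob_map)
        points.append((int(point[0]), int(point[1])) if prob > threshold else None)
    return points
```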
65 | 66 | 67 | 68 | # External resources 69 | 1. OpenPose GitHub repo: https://github.com/CMU-Perceptual-Computing-Lab/openpose 70 | 2. Origin of OpenPose: https://github.com/ZheC/Realtime_Multi-Person_Pose_Estimation 71 | 3. Paper describing the method: https://arxiv.org/abs/1611.08050 72 | 4. Keras implementation of the Realtime Multi-Person Pose Estimation (my major inspiration): https://github.com/michalfaber/keras_Realtime_Multi-Person_Pose_Estimation 73 | -------------------------------------------------------------------------------- /other/Openpose-Keras/images/test_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/other/Openpose-Keras/images/test_image.png -------------------------------------------------------------------------------- /other/asl.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/other/asl.mp4 -------------------------------------------------------------------------------- /other/front-back.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/other/front-back.jpg -------------------------------------------------------------------------------- /other/getModels.sh: -------------------------------------------------------------------------------- 1 | # ------------------------- BODY, FACE AND HAND MODELS ------------------------- 2 | # Downloading body pose (COCO and MPI), face and hand models 3 | OPENPOSE_URL="http://posefs1.perception.cs.cmu.edu/OpenPose/models/" 4 | HAND_FOLDER="hand/" 5 | 6 | # "------------------------- HAND MODELS -------------------------" 7 | # Hand 8 | HAND_MODEL=$HAND_FOLDER"pose_iter_102000.caffemodel" 9 | wget -c ${OPENPOSE_URL}${HAND_MODEL} -P ${HAND_FOLDER} 10 | -------------------------------------------------------------------------------- /other/hand.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/other/hand.jpg -------------------------------------------------------------------------------- /other/hand.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KeyForce/Hand-Keypoint-Estimation/e0107c08a2ccc8c390376895d5167f39b3fd1481/other/hand.png -------------------------------------------------------------------------------- /other/handPoseImage.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | using namespace std; 7 | using namespace cv; 8 | using namespace cv::dnn; 9 | 10 | 11 | const int POSE_PAIRS[20][2] = 12 | { 13 | {0,1}, {1,2}, {2,3}, {3,4}, // thumb 14 | {0,5}, {5,6}, {6,7}, {7,8}, // index 15 | {0,9}, {9,10}, {10,11}, {11,12}, // middle 16 | {0,13}, {13,14}, {14,15}, {15,16}, // ring 17 | {0,17}, {17,18}, {18,19}, {19,20} // small 18 | }; 19 | 20 | string protoFile = "hand/pose_deploy.prototxt"; 21 | string weightsFile = "hand/pose_iter_102000.caffemodel"; 22 | 23 | int nPoints = 22; 24 | 25 | int main(int argc, char **argv) 26 | { 27 | 28 | cout << "USAGE : ./handPoseImage " << endl; 29 | 30 | 
string imageFile = "right-frontal.jpg"; 31 | // Take arguments from commmand line 32 | if (argc == 2) 33 | { 34 | imageFile = argv[1]; 35 | } 36 | 37 | float thresh = 0.01; 38 | 39 | Mat frame = imread(imageFile); 40 | Mat frameCopy = frame.clone(); 41 | int frameWidth = frame.cols; 42 | int frameHeight = frame.rows; 43 | 44 | float aspect_ratio = frameWidth/(float)frameHeight; 45 | int inHeight = 368; 46 | int inWidth = (int(aspect_ratio*inHeight) * 8) / 8; 47 | 48 | cout << "inWidth = " << inWidth << " ; inHeight = " << inHeight << endl; 49 | 50 | double t = (double) cv::getTickCount(); 51 | Net net = readNetFromCaffe(protoFile, weightsFile); 52 | 53 | Mat inpBlob = blobFromImage(frame, 1.0 / 255, Size(inWidth, inHeight), Scalar(0, 0, 0), false, false); 54 | 55 | net.setInput(inpBlob); 56 | 57 | Mat output = net.forward(); 58 | 59 | int H = output.size[2]; 60 | int W = output.size[3]; 61 | 62 | // find the position of the body parts 63 | vector points(nPoints); 64 | for (int n=0; n < nPoints; n++) 65 | { 66 | // Probability map of corresponding body's part. 67 | Mat probMap(H, W, CV_32F, output.ptr(0,n)); 68 | resize(probMap, probMap, Size(frameWidth, frameHeight)); 69 | 70 | Point maxLoc; 71 | double prob; 72 | minMaxLoc(probMap, 0, &prob, 0, &maxLoc); 73 | if (prob > thresh) 74 | { 75 | circle(frameCopy, cv::Point((int)maxLoc.x, (int)maxLoc.y), 8, Scalar(0,255,255), -1); 76 | cv::putText(frameCopy, cv::format("%d", n), cv::Point((int)maxLoc.x, (int)maxLoc.y), cv::FONT_HERSHEY_COMPLEX, 1, cv::Scalar(0, 0, 255), 2); 77 | 78 | } 79 | points[n] = maxLoc; 80 | } 81 | 82 | int nPairs = sizeof(POSE_PAIRS)/sizeof(POSE_PAIRS[0]); 83 | 84 | for (int n = 0; n < nPairs; n++) 85 | { 86 | // lookup 2 connected body/hand parts 87 | Point2f partA = points[POSE_PAIRS[n][0]]; 88 | Point2f partB = points[POSE_PAIRS[n][1]]; 89 | 90 | if (partA.x<=0 || partA.y<=0 || partB.x<=0 || partB.y<=0) 91 | continue; 92 | 93 | line(frame, partA, partB, Scalar(0,255,255), 8); 94 | circle(frame, partA, 8, Scalar(0,0,255), -1); 95 | circle(frame, partB, 8, Scalar(0,0,255), -1); 96 | } 97 | 98 | t = ((double)cv::getTickCount() - t)/cv::getTickFrequency(); 99 | cout << "Time Taken = " << t << endl; 100 | imshow("Output-Keypoints", frameCopy); 101 | imshow("Output-Skeleton", frame); 102 | imwrite("Output-Skeleton.jpg", frame); 103 | 104 | waitKey(); 105 | 106 | return 0; 107 | } 108 | -------------------------------------------------------------------------------- /other/handPoseImage.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import cv2 3 | import time 4 | import numpy as np 5 | 6 | protoFile = "hand/pose_deploy.prototxt" 7 | weightsFile = "hand/pose_iter_102000.caffemodel" 8 | nPoints = 22 9 | POSE_PAIRS = [ [0,1],[1,2],[2,3],[3,4],[0,5],[5,6],[6,7],[7,8],[0,9],[9,10],[10,11],[11,12],[0,13],[13,14],[14,15],[15,16],[0,17],[17,18],[18,19],[19,20] ] 10 | net = cv2.dnn.readNetFromCaffe(protoFile, weightsFile) 11 | 12 | frame = cv2.imread("right-frontal.jpg") 13 | frameCopy = np.copy(frame) 14 | frameWidth = frame.shape[1] 15 | frameHeight = frame.shape[0] 16 | aspect_ratio = frameWidth/frameHeight 17 | 18 | threshold = 0.1 19 | 20 | t = time.time() 21 | # input image dimensions for the network 22 | inHeight = 368 23 | inWidth = int(((aspect_ratio*inHeight)*8)//8) 24 | inpBlob = cv2.dnn.blobFromImage(frame, 1.0 / 255, (inWidth, inHeight), (0, 0, 0), swapRB=False, crop=False) 25 | 26 | net.setInput(inpBlob) 27 | 28 | output = net.forward() 29 | 
print("time taken by network : {:.3f}".format(time.time() - t)) 30 | 31 | # Empty list to store the detected keypoints 32 | points = [] 33 | 34 | for i in range(nPoints): 35 | # confidence map of corresponding body's part. 36 | probMap = output[0, i, :, :] 37 | probMap = cv2.resize(probMap, (frameWidth, frameHeight)) 38 | 39 | # Find global maxima of the probMap. 40 | minVal, prob, minLoc, point = cv2.minMaxLoc(probMap) 41 | 42 | if prob > threshold : 43 | cv2.circle(frameCopy, (int(point[0]), int(point[1])), 8, (0, 255, 255), thickness=-1, lineType=cv2.FILLED) 44 | cv2.putText(frameCopy, "{}".format(i), (int(point[0]), int(point[1])), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2, lineType=cv2.LINE_AA) 45 | 46 | # Add the point to the list if the probability is greater than the threshold 47 | points.append((int(point[0]), int(point[1]))) 48 | else : 49 | points.append(None) 50 | 51 | # Draw Skeleton 52 | for pair in POSE_PAIRS: 53 | partA = pair[0] 54 | partB = pair[1] 55 | 56 | if points[partA] and points[partB]: 57 | cv2.line(frame, points[partA], points[partB], (0, 255, 255), 2) 58 | cv2.circle(frame, points[partA], 8, (0, 0, 255), thickness=-1, lineType=cv2.FILLED) 59 | cv2.circle(frame, points[partB], 8, (0, 0, 255), thickness=-1, lineType=cv2.FILLED) 60 | 61 | 62 | cv2.imshow('Output-Keypoints', frameCopy) 63 | cv2.imshow('Output-Skeleton', frame) 64 | 65 | 66 | cv2.imwrite('Output-Keypoints.jpg', frameCopy) 67 | cv2.imwrite('Output-Skeleton.jpg', frame) 68 | 69 | print("Total time taken : {:.3f}".format(time.time() - t)) 70 | 71 | cv2.waitKey(0) 72 | -------------------------------------------------------------------------------- /other/handPoseVideo.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | using namespace std; 7 | using namespace cv; 8 | using namespace cv::dnn; 9 | 10 | const int POSE_PAIRS[20][2] = 11 | { 12 | {0,1}, {1,2}, {2,3}, {3,4}, // thumb 13 | {0,5}, {5,6}, {6,7}, {7,8}, // index 14 | {0,9}, {9,10}, {10,11}, {11,12}, // middle 15 | {0,13}, {13,14}, {14,15}, {15,16}, // ring 16 | {0,17}, {17,18}, {18,19}, {19,20} // small 17 | }; 18 | 19 | string protoFile = "hand/pose_deploy.prototxt"; 20 | string weightsFile = "hand/pose_iter_102000.caffemodel"; 21 | 22 | int nPoints = 22; 23 | 24 | int main(int argc, char **argv) 25 | { 26 | float thresh = 0.01; 27 | 28 | cv::VideoCapture cap("asl.mp4"); 29 | 30 | if (!cap.isOpened()) 31 | { 32 | cerr << "Unable to connect to camera" << endl; 33 | return 1; 34 | } 35 | 36 | Mat frame, frameCopy; 37 | int frameWidth = cap.get(CAP_PROP_FRAME_WIDTH); 38 | int frameHeight = cap.get(CAP_PROP_FRAME_HEIGHT); 39 | float aspect_ratio = frameWidth/(float)frameHeight; 40 | int inHeight = 368; 41 | int inWidth = (int(aspect_ratio*inHeight) * 8) / 8; 42 | 43 | cout << "inWidth = " << inWidth << " ; inHeight = " << inHeight << endl; 44 | 45 | VideoWriter video("Output-Skeleton.avi",VideoWriter::fourcc('M','J','P','G'), 10, Size(frameWidth,frameHeight)); 46 | 47 | Net net = readNetFromCaffe(protoFile, weightsFile); 48 | 49 | double t=0; 50 | while(1) 51 | { 52 | double t = (double) cv::getTickCount(); 53 | 54 | cap >> frame; 55 | frameCopy = frame.clone(); 56 | Mat inpBlob = blobFromImage(frame, 1.0 / 255, Size(inWidth, inHeight), Scalar(0, 0, 0), false, false); 57 | 58 | net.setInput(inpBlob); 59 | 60 | Mat output = net.forward(); 61 | 62 | int H = output.size[2]; 63 | int W = output.size[3]; 64 | 65 | // find the position of the body parts 66 | 
vector points(nPoints); 67 | for (int n=0; n < nPoints; n++) 68 | { 69 | // Probability map of corresponding body's part. 70 | Mat probMap(H, W, CV_32F, output.ptr(0,n)); 71 | resize(probMap, probMap, Size(frameWidth, frameHeight)); 72 | 73 | Point maxLoc; 74 | double prob; 75 | minMaxLoc(probMap, 0, &prob, 0, &maxLoc); 76 | if (prob > thresh) 77 | { 78 | circle(frameCopy, cv::Point((int)maxLoc.x, (int)maxLoc.y), 8, Scalar(0,255,255), -1); 79 | cv::putText(frameCopy, cv::format("%d", n), cv::Point((int)maxLoc.x, (int)maxLoc.y), cv::FONT_HERSHEY_COMPLEX, 1, cv::Scalar(0, 0, 255), 2); 80 | 81 | } 82 | points[n] = maxLoc; 83 | } 84 | 85 | int nPairs = sizeof(POSE_PAIRS)/sizeof(POSE_PAIRS[0]); 86 | 87 | for (int n = 0; n < nPairs; n++) 88 | { 89 | // lookup 2 connected body/hand parts 90 | Point2f partA = points[POSE_PAIRS[n][0]]; 91 | Point2f partB = points[POSE_PAIRS[n][1]]; 92 | 93 | if (partA.x<=0 || partA.y<=0 || partB.x<=0 || partB.y<=0) 94 | continue; 95 | 96 | line(frame, partA, partB, Scalar(0,255,255), 8); 97 | circle(frame, partA, 8, Scalar(0,0,255), -1); 98 | circle(frame, partB, 8, Scalar(0,0,255), -1); 99 | } 100 | 101 | t = ((double)cv::getTickCount() - t)/cv::getTickFrequency(); 102 | cout << "Time Taken for frame = " << t << endl; 103 | cv::putText(frame, cv::format("time taken = %.2f sec", t), cv::Point(50, 50), cv::FONT_HERSHEY_COMPLEX, .8, cv::Scalar(255, 50, 0), 2); 104 | // imshow("Output-Keypoints", frameCopy); 105 | imshow("Output-Skeleton", frame); 106 | video.write(frame); 107 | char key = waitKey(1); 108 | if (key==27) 109 | break; 110 | } 111 | // When everything done, release the video capture and write object 112 | cap.release(); 113 | video.release(); 114 | 115 | return 0; 116 | } 117 | -------------------------------------------------------------------------------- /other/handPoseVideo.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import time 3 | import numpy as np 4 | 5 | 6 | protoFile = "hand/pose_deploy.prototxt" 7 | weightsFile = "hand/pose_iter_102000.caffemodel" 8 | nPoints = 22 9 | POSE_PAIRS = [ [0,1],[1,2],[2,3],[3,4],[0,5],[5,6],[6,7],[7,8],[0,9],[9,10],[10,11],[11,12],[0,13],[13,14],[14,15],[15,16],[0,17],[17,18],[18,19],[19,20] ] 10 | 11 | threshold = 0.2 12 | 13 | 14 | input_source = "asl.mp4" 15 | cap = cv2.VideoCapture(input_source) 16 | hasFrame, frame = cap.read() 17 | 18 | frameWidth = frame.shape[1] 19 | frameHeight = frame.shape[0] 20 | 21 | aspect_ratio = frameWidth/frameHeight 22 | 23 | inHeight = 368 24 | inWidth = int(((aspect_ratio*inHeight)*8)//8) 25 | 26 | vid_writer = cv2.VideoWriter('output.avi',cv2.VideoWriter_fourcc('M','J','P','G'), 15, (frame.shape[1],frame.shape[0])) 27 | 28 | net = cv2.dnn.readNetFromCaffe(protoFile, weightsFile) 29 | k = 0 30 | while 1: 31 | k+=1 32 | t = time.time() 33 | hasFrame, frame = cap.read() 34 | frameCopy = np.copy(frame) 35 | if not hasFrame: 36 | cv2.waitKey() 37 | break 38 | 39 | inpBlob = cv2.dnn.blobFromImage(frame, 1.0 / 255, (inWidth, inHeight), 40 | (0, 0, 0), swapRB=False, crop=False) 41 | 42 | net.setInput(inpBlob) 43 | 44 | output = net.forward() 45 | 46 | print("forward = {}".format(time.time() - t)) 47 | 48 | # Empty list to store the detected keypoints 49 | points = [] 50 | 51 | for i in range(nPoints): 52 | # confidence map of corresponding body's part. 53 | probMap = output[0, i, :, :] 54 | probMap = cv2.resize(probMap, (frameWidth, frameHeight)) 55 | 56 | # Find global maxima of the probMap. 
57 | minVal, prob, minLoc, point = cv2.minMaxLoc(probMap) 58 | 59 | if prob > threshold : 60 | cv2.circle(frameCopy, (int(point[0]), int(point[1])), 6, (0, 255, 255), thickness=-1, lineType=cv2.FILLED) 61 | cv2.putText(frameCopy, "{}".format(i), (int(point[0]), int(point[1])), cv2.FONT_HERSHEY_SIMPLEX, .8, (0, 0, 255), 2, lineType=cv2.LINE_AA) 62 | 63 | # Add the point to the list if the probability is greater than the threshold 64 | points.append((int(point[0]), int(point[1]))) 65 | else : 66 | points.append(None) 67 | 68 | # Draw Skeleton 69 | for pair in POSE_PAIRS: 70 | partA = pair[0] 71 | partB = pair[1] 72 | 73 | if points[partA] and points[partB]: 74 | cv2.line(frame, points[partA], points[partB], (0, 255, 255), 2, lineType=cv2.LINE_AA) 75 | cv2.circle(frame, points[partA], 5, (0, 0, 255), thickness=-1, lineType=cv2.FILLED) 76 | cv2.circle(frame, points[partB], 5, (0, 0, 255), thickness=-1, lineType=cv2.FILLED) 77 | 78 | print("Time Taken for frame = {}".format(time.time() - t)) 79 | 80 | # cv2.putText(frame, "time taken = {:.2f} sec".format(time.time() - t), (50, 50), cv2.FONT_HERSHEY_COMPLEX, .8, (255, 50, 0), 2, lineType=cv2.LINE_AA) 81 | # cv2.putText(frame, "Hand Pose using OpenCV", (50, 50), cv2.FONT_HERSHEY_COMPLEX, 1, (255, 50, 0), 2, lineType=cv2.LINE_AA) 82 | cv2.imshow('Output-Skeleton', frame) 83 | # cv2.imwrite("video_output/{:03d}.jpg".format(k), frame) 84 | key = cv2.waitKey(1) 85 | if key == 27: 86 | break 87 | 88 | print("total = {}".format(time.time() - t)) 89 | 90 | vid_writer.write(frame) 91 | 92 | vid_writer.release() 93 | --------------------------------------------------------------------------------