├── LICENSE.MIT ├── README.md ├── convert_to_onnx.py ├── curve ├── 1.jpg ├── FDDB.png ├── Widerface.jpg └── test.jpg ├── data ├── FDDB │ └── img_list.txt ├── __init__.py ├── config.py ├── data_augment.py ├── input │ └── huge.jpg ├── output │ ├── bbox_pred_acc.png │ └── test_5.jpg └── wider_face.py ├── detect.py ├── detect_merge.py ├── layers ├── __init__.py ├── functions │ └── prior_box.py └── modules │ ├── __init__.py │ └── multibox_loss.py ├── models ├── __init__.py ├── net.py └── retinaface.py ├── pose ├── __init__.py ├── datasets.py ├── detect_image.py ├── hopenet.py ├── test_alexnet.py ├── test_hopenet.py ├── test_on_video.py ├── test_on_video_dlib.py ├── test_on_video_dockerface.py ├── test_resnet50_regression.py ├── train_alexnet.py ├── train_hopenet.py ├── train_resnet50_regression.py └── utils.py ├── test_fddb.py ├── test_widerface.py ├── train.py ├── utils ├── __init__.py ├── box_utils.py ├── nms │ ├── __init__.py │ └── py_cpu_nms.py └── timer.py └── widerface_evaluate ├── README.md ├── box_overlaps.pyx ├── evaluation.py ├── ground_truth ├── wider_easy_val.mat ├── wider_face_val.mat ├── wider_hard_val.mat └── wider_medium_val.mat └── setup.py /LICENSE.MIT: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RetinaFace in PyTorch 2 | 3 | ``` 4 | author is leilei 5 | dataset is widerface 6 | face detection, face key point detection, and face pose estimation 7 | ``` 8 | 9 | ### Note 10 | + This repository is forked from [biubug6/Pytorch_Retinaface](https://github.com/biubug6/Pytorch_Retinaface) 11 | + Based on this project, we merge the three tasks of **face detection, face key point detection, and face pose estimation** into one task. 12 | + Now, we only release network's code and demo's code! More codes will be released in the future. 
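+ As described above, the test-phase network returns detection, landmark and pose outputs together. A minimal sketch of how they can be consumed (illustrative only; the full post-processing, including prior-box decoding and NMS, lives in `detect_merge.py`):

```
# single forward pass of the merged model (names follow detect_merge.py)
loc, conf, landms, yaw, pitch, roll = net(img)  # box offsets, face scores, 5-point landmarks, 66-bin pose logits

# each pose output is a 66-way classification over 3-degree bins spanning [-99, +99] degrees
idx = torch.arange(66, dtype=torch.float32, device=img.device)
yaw_deg = (torch.softmax(yaw, dim=-1) * idx).sum(-1) * 3 - 99  # pitch and roll are converted the same way
```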
13 | 14 | ### Inference 15 | ``` 16 | python detect_merge.py ${-m} ${--network} ${--image_path} ${--output_path} 17 | ``` 18 | 19 | ### Demo 20 | |![face-detect](./data/output/test_5.jpg)| 21 | |----| 22 | 23 | ### Weight 24 | + [GoogleDriver](https://drive.google.com/file/d/1YbMLrUdgmY1vNTQ8Y6OhR0pKifZeCGWa/view?usp=sharing) 25 | > | resnet50-retinaface | Easy | Medium | Hard | 26 | > | :----: | :----: | :----: | :----: | 27 | > | AP | 94.3486% | 93.3151% | 88.6972% | 28 | + ![face-detect-ap](./data/output/bbox_pred_acc.png) 29 | 30 | ### Train 31 | + TODO release code 32 | 33 | ### References 34 | + [biubug6/Pytorch_Retinaface](https://github.com/biubug6/Pytorch_Retinaface) 35 | + [HopeNet-pytorch](https://github.com/natanielruiz/deep-head-pose) 36 | + [insightface-RetinaFaceAntiCov](https://github.com/deepinsight/insightface/tree/master/detection/RetinaFaceAntiCov) -------------------------------------------------------------------------------- /convert_to_onnx.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | import argparse 4 | import torch 5 | import torch.backends.cudnn as cudnn 6 | import numpy as np 7 | from data import cfg_mnet, cfg_re50 8 | from layers.functions.prior_box import PriorBox 9 | from utils.nms.py_cpu_nms import py_cpu_nms 10 | import cv2 11 | from models.retinaface import RetinaFace 12 | from utils.box_utils import decode, decode_landm 13 | from utils.timer import Timer 14 | 15 | 16 | parser = argparse.ArgumentParser(description='Test') 17 | parser.add_argument('-m', '--trained_model', default='./weights/mobilenet0.25_Final.pth', 18 | type=str, help='Trained state_dict file path to open') 19 | parser.add_argument('--network', default='mobile0.25', help='Backbone network mobile0.25 or resnet50') 20 | parser.add_argument('--long_side', default=640, help='when origin_size is false, long_side is scaled size(320 or 640 for long side)') 21 | parser.add_argument('--cpu', action="store_true", default=True, help='Use cpu inference') 22 | 23 | args = parser.parse_args() 24 | 25 | 26 | def check_keys(model, pretrained_state_dict): 27 | ckpt_keys = set(pretrained_state_dict.keys()) 28 | model_keys = set(model.state_dict().keys()) 29 | used_pretrained_keys = model_keys & ckpt_keys 30 | unused_pretrained_keys = ckpt_keys - model_keys 31 | missing_keys = model_keys - ckpt_keys 32 | print('Missing keys:{}'.format(len(missing_keys))) 33 | print('Unused checkpoint keys:{}'.format(len(unused_pretrained_keys))) 34 | print('Used keys:{}'.format(len(used_pretrained_keys))) 35 | assert len(used_pretrained_keys) > 0, 'load NONE from pretrained checkpoint' 36 | return True 37 | 38 | 39 | def remove_prefix(state_dict, prefix): 40 | ''' Old style model is stored with all names of parameters sharing common prefix 'module.' 
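(e.g. checkpoints saved through nn.DataParallel)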
''' 41 | print('remove prefix \'{}\''.format(prefix)) 42 | f = lambda x: x.split(prefix, 1)[-1] if x.startswith(prefix) else x 43 | return {f(key): value for key, value in state_dict.items()} 44 | 45 | 46 | def load_model(model, pretrained_path, load_to_cpu): 47 | print('Loading pretrained model from {}'.format(pretrained_path)) 48 | if load_to_cpu: 49 | pretrained_dict = torch.load(pretrained_path, map_location=lambda storage, loc: storage) 50 | else: 51 | device = torch.cuda.current_device() 52 | pretrained_dict = torch.load(pretrained_path, map_location=lambda storage, loc: storage.cuda(device)) 53 | if "state_dict" in pretrained_dict.keys(): 54 | pretrained_dict = remove_prefix(pretrained_dict['state_dict'], 'module.') 55 | else: 56 | pretrained_dict = remove_prefix(pretrained_dict, 'module.') 57 | check_keys(model, pretrained_dict) 58 | model.load_state_dict(pretrained_dict, strict=False) 59 | return model 60 | 61 | 62 | if __name__ == '__main__': 63 | torch.set_grad_enabled(False) 64 | cfg = None 65 | if args.network == "mobile0.25": 66 | cfg = cfg_mnet 67 | elif args.network == "resnet50": 68 | cfg = cfg_re50 69 | # net and model 70 | net = RetinaFace(cfg=cfg, phase = 'test') 71 | net = load_model(net, args.trained_model, args.cpu) 72 | net.eval() 73 | print('Finished loading model!') 74 | print(net) 75 | device = torch.device("cpu" if args.cpu else "cuda") 76 | net = net.to(device) 77 | 78 | # ------------------------ export ----------------------------- 79 | output_onnx = 'FaceDetector.onnx' 80 | print("==> Exporting model to ONNX format at '{}'".format(output_onnx)) 81 | input_names = ["input0"] 82 | output_names = ["output0"] 83 | inputs = torch.randn(1, 3, args.long_side, args.long_side).to(device) 84 | 85 | torch_out = torch.onnx._export(net, inputs, output_onnx, export_params=True, verbose=False, 86 | input_names=input_names, output_names=output_names) 87 | 88 | 89 | -------------------------------------------------------------------------------- /curve/1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gengyanlei/Pytorch_Retinaface/b863d9ed7c405f3733d038f249ebf3581e3c7ba6/curve/1.jpg -------------------------------------------------------------------------------- /curve/FDDB.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gengyanlei/Pytorch_Retinaface/b863d9ed7c405f3733d038f249ebf3581e3c7ba6/curve/FDDB.png -------------------------------------------------------------------------------- /curve/Widerface.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gengyanlei/Pytorch_Retinaface/b863d9ed7c405f3733d038f249ebf3581e3c7ba6/curve/Widerface.jpg -------------------------------------------------------------------------------- /curve/test.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gengyanlei/Pytorch_Retinaface/b863d9ed7c405f3733d038f249ebf3581e3c7ba6/curve/test.jpg -------------------------------------------------------------------------------- /data/__init__.py: -------------------------------------------------------------------------------- 1 | from .wider_face import WiderFaceDetection, detection_collate 2 | from .data_augment import * 3 | from .config import * 4 | -------------------------------------------------------------------------------- /data/config.py: 
-------------------------------------------------------------------------------- 1 | # config.py 2 | 3 | cfg_mnet = { 4 | 'name': 'mobilenet0.25', 5 | 'min_sizes': [[16, 32], [64, 128], [256, 512]], 6 | 'steps': [8, 16, 32], 7 | 'variance': [0.1, 0.2], 8 | 'clip': False, 9 | 'loc_weight': 2.0, 10 | 'gpu_train': True, 11 | 'batch_size': 32, 12 | 'ngpu': 1, 13 | 'epoch': 250, 14 | 'decay1': 190, 15 | 'decay2': 220, 16 | 'image_size': 640, 17 | 'pretrain': True, 18 | 'return_layers': {'stage1': 1, 'stage2': 2, 'stage3': 3}, 19 | 'in_channel': 32, 20 | 'out_channel': 64 21 | } 22 | 23 | cfg_re50 = { 24 | 'name': 'Resnet50', 25 | 'min_sizes': [[16, 32], [64, 128], [256, 512]], 26 | 'steps': [8, 16, 32], 27 | 'variance': [0.1, 0.2], 28 | 'clip': False, 29 | 'loc_weight': 2.0, 30 | 'gpu_train': True, 31 | 'batch_size': 24, 32 | 'ngpu': 4, 33 | 'epoch': 100, 34 | 'decay1': 70, 35 | 'decay2': 90, 36 | 'image_size': 840, 37 | 'pretrain': True, 38 | 'return_layers': {'layer2': 1, 'layer3': 2, 'layer4': 3}, 39 | 'in_channel': 256, 40 | 'out_channel': 256 41 | } 42 | 43 | -------------------------------------------------------------------------------- /data/data_augment.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import random 4 | from utils.box_utils import matrix_iof 5 | 6 | 7 | def _crop(image, boxes, labels, landm, img_dim): 8 | height, width, _ = image.shape 9 | pad_image_flag = True 10 | 11 | for _ in range(250): 12 | """ 13 | if random.uniform(0, 1) <= 0.2: 14 | scale = 1.0 15 | else: 16 | scale = random.uniform(0.3, 1.0) 17 | """ 18 | PRE_SCALES = [0.3, 0.45, 0.6, 0.8, 1.0] 19 | scale = random.choice(PRE_SCALES) 20 | short_side = min(width, height) 21 | w = int(scale * short_side) 22 | h = w 23 | 24 | if width == w: 25 | l = 0 26 | else: 27 | l = random.randrange(width - w) 28 | if height == h: 29 | t = 0 30 | else: 31 | t = random.randrange(height - h) 32 | roi = np.array((l, t, l + w, t + h)) 33 | 34 | value = matrix_iof(boxes, roi[np.newaxis]) 35 | flag = (value >= 1) 36 | if not flag.any(): 37 | continue 38 | 39 | centers = (boxes[:, :2] + boxes[:, 2:]) / 2 40 | mask_a = np.logical_and(roi[:2] < centers, centers < roi[2:]).all(axis=1) 41 | boxes_t = boxes[mask_a].copy() 42 | labels_t = labels[mask_a].copy() 43 | landms_t = landm[mask_a].copy() 44 | landms_t = landms_t.reshape([-1, 5, 2]) 45 | 46 | if boxes_t.shape[0] == 0: 47 | continue 48 | 49 | image_t = image[roi[1]:roi[3], roi[0]:roi[2]] 50 | 51 | boxes_t[:, :2] = np.maximum(boxes_t[:, :2], roi[:2]) 52 | boxes_t[:, :2] -= roi[:2] 53 | boxes_t[:, 2:] = np.minimum(boxes_t[:, 2:], roi[2:]) 54 | boxes_t[:, 2:] -= roi[:2] 55 | 56 | # landm 57 | landms_t[:, :, :2] = landms_t[:, :, :2] - roi[:2] 58 | landms_t[:, :, :2] = np.maximum(landms_t[:, :, :2], np.array([0, 0])) 59 | landms_t[:, :, :2] = np.minimum(landms_t[:, :, :2], roi[2:] - roi[:2]) 60 | landms_t = landms_t.reshape([-1, 10]) 61 | 62 | 63 | # make sure that the cropped image contains at least one face > 16 pixel at training image scale 64 | b_w_t = (boxes_t[:, 2] - boxes_t[:, 0] + 1) / w * img_dim 65 | b_h_t = (boxes_t[:, 3] - boxes_t[:, 1] + 1) / h * img_dim 66 | mask_b = np.minimum(b_w_t, b_h_t) > 0.0 67 | boxes_t = boxes_t[mask_b] 68 | labels_t = labels_t[mask_b] 69 | landms_t = landms_t[mask_b] 70 | 71 | if boxes_t.shape[0] == 0: 72 | continue 73 | 74 | pad_image_flag = False 75 | 76 | return image_t, boxes_t, labels_t, landms_t, pad_image_flag 77 | return image, boxes, labels, landm, 
pad_image_flag 78 | 79 | 80 | def _distort(image): 81 | 82 | def _convert(image, alpha=1, beta=0): 83 | tmp = image.astype(float) * alpha + beta 84 | tmp[tmp < 0] = 0 85 | tmp[tmp > 255] = 255 86 | image[:] = tmp 87 | 88 | image = image.copy() 89 | 90 | if random.randrange(2): 91 | 92 | #brightness distortion 93 | if random.randrange(2): 94 | _convert(image, beta=random.uniform(-32, 32)) 95 | 96 | #contrast distortion 97 | if random.randrange(2): 98 | _convert(image, alpha=random.uniform(0.5, 1.5)) 99 | 100 | image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV) 101 | 102 | #saturation distortion 103 | if random.randrange(2): 104 | _convert(image[:, :, 1], alpha=random.uniform(0.5, 1.5)) 105 | 106 | #hue distortion 107 | if random.randrange(2): 108 | tmp = image[:, :, 0].astype(int) + random.randint(-18, 18) 109 | tmp %= 180 110 | image[:, :, 0] = tmp 111 | 112 | image = cv2.cvtColor(image, cv2.COLOR_HSV2BGR) 113 | 114 | else: 115 | 116 | #brightness distortion 117 | if random.randrange(2): 118 | _convert(image, beta=random.uniform(-32, 32)) 119 | 120 | image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV) 121 | 122 | #saturation distortion 123 | if random.randrange(2): 124 | _convert(image[:, :, 1], alpha=random.uniform(0.5, 1.5)) 125 | 126 | #hue distortion 127 | if random.randrange(2): 128 | tmp = image[:, :, 0].astype(int) + random.randint(-18, 18) 129 | tmp %= 180 130 | image[:, :, 0] = tmp 131 | 132 | image = cv2.cvtColor(image, cv2.COLOR_HSV2BGR) 133 | 134 | #contrast distortion 135 | if random.randrange(2): 136 | _convert(image, alpha=random.uniform(0.5, 1.5)) 137 | 138 | return image 139 | 140 | 141 | def _expand(image, boxes, fill, p): 142 | if random.randrange(2): 143 | return image, boxes 144 | 145 | height, width, depth = image.shape 146 | 147 | scale = random.uniform(1, p) 148 | w = int(scale * width) 149 | h = int(scale * height) 150 | 151 | left = random.randint(0, w - width) 152 | top = random.randint(0, h - height) 153 | 154 | boxes_t = boxes.copy() 155 | boxes_t[:, :2] += (left, top) 156 | boxes_t[:, 2:] += (left, top) 157 | expand_image = np.empty( 158 | (h, w, depth), 159 | dtype=image.dtype) 160 | expand_image[:, :] = fill 161 | expand_image[top:top + height, left:left + width] = image 162 | image = expand_image 163 | 164 | return image, boxes_t 165 | 166 | 167 | def _mirror(image, boxes, landms): 168 | _, width, _ = image.shape 169 | if random.randrange(2): 170 | image = image[:, ::-1] 171 | boxes = boxes.copy() 172 | boxes[:, 0::2] = width - boxes[:, 2::-2] 173 | 174 | # landm 175 | landms = landms.copy() 176 | landms = landms.reshape([-1, 5, 2]) 177 | landms[:, :, 0] = width - landms[:, :, 0] 178 | tmp = landms[:, 1, :].copy() 179 | landms[:, 1, :] = landms[:, 0, :] 180 | landms[:, 0, :] = tmp 181 | tmp1 = landms[:, 4, :].copy() 182 | landms[:, 4, :] = landms[:, 3, :] 183 | landms[:, 3, :] = tmp1 184 | landms = landms.reshape([-1, 10]) 185 | 186 | return image, boxes, landms 187 | 188 | 189 | def _pad_to_square(image, rgb_mean, pad_image_flag): 190 | if not pad_image_flag: 191 | return image 192 | height, width, _ = image.shape 193 | long_side = max(width, height) 194 | image_t = np.empty((long_side, long_side, 3), dtype=image.dtype) 195 | image_t[:, :] = rgb_mean 196 | image_t[0:0 + height, 0:0 + width] = image 197 | return image_t 198 | 199 | 200 | def _resize_subtract_mean(image, insize, rgb_mean): 201 | interp_methods = [cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_AREA, cv2.INTER_NEAREST, cv2.INTER_LANCZOS4] 202 | interp_method = interp_methods[random.randrange(5)] 203 | 
image = cv2.resize(image, (insize, insize), interpolation=interp_method) 204 | image = image.astype(np.float32) 205 | image -= rgb_mean 206 | return image.transpose(2, 0, 1) 207 | 208 | 209 | class preproc(object): 210 | 211 | def __init__(self, img_dim, rgb_means): 212 | self.img_dim = img_dim 213 | self.rgb_means = rgb_means 214 | 215 | def __call__(self, image, targets): 216 | assert targets.shape[0] > 0, "this image does not have gt" 217 | 218 | boxes = targets[:, :4].copy() 219 | labels = targets[:, -1].copy() 220 | landm = targets[:, 4:-1].copy() 221 | 222 | image_t, boxes_t, labels_t, landm_t, pad_image_flag = _crop(image, boxes, labels, landm, self.img_dim) 223 | image_t = _distort(image_t) 224 | image_t = _pad_to_square(image_t,self.rgb_means, pad_image_flag) 225 | image_t, boxes_t, landm_t = _mirror(image_t, boxes_t, landm_t) 226 | height, width, _ = image_t.shape 227 | image_t = _resize_subtract_mean(image_t, self.img_dim, self.rgb_means) 228 | boxes_t[:, 0::2] /= width 229 | boxes_t[:, 1::2] /= height 230 | 231 | landm_t[:, 0::2] /= width 232 | landm_t[:, 1::2] /= height 233 | 234 | labels_t = np.expand_dims(labels_t, 1) 235 | targets_t = np.hstack((boxes_t, landm_t, labels_t)) 236 | 237 | return image_t, targets_t 238 | -------------------------------------------------------------------------------- /data/input/huge.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gengyanlei/Pytorch_Retinaface/b863d9ed7c405f3733d038f249ebf3581e3c7ba6/data/input/huge.jpg -------------------------------------------------------------------------------- /data/output/bbox_pred_acc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gengyanlei/Pytorch_Retinaface/b863d9ed7c405f3733d038f249ebf3581e3c7ba6/data/output/bbox_pred_acc.png -------------------------------------------------------------------------------- /data/output/test_5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gengyanlei/Pytorch_Retinaface/b863d9ed7c405f3733d038f249ebf3581e3c7ba6/data/output/test_5.jpg -------------------------------------------------------------------------------- /data/wider_face.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path 3 | import sys 4 | import torch 5 | import torch.utils.data as data 6 | import cv2 7 | import numpy as np 8 | 9 | class WiderFaceDetection(data.Dataset): 10 | def __init__(self, txt_path, preproc=None): 11 | self.preproc = preproc 12 | self.imgs_path = [] 13 | self.words = [] 14 | f = open(txt_path,'r') 15 | lines = f.readlines() 16 | isFirst = True 17 | labels = [] 18 | for line in lines: 19 | line = line.rstrip() 20 | if line.startswith('#'): 21 | if isFirst is True: 22 | isFirst = False 23 | else: 24 | labels_copy = labels.copy() 25 | self.words.append(labels_copy) 26 | labels.clear() 27 | path = line[2:] 28 | path = txt_path.replace('label.txt','images/') + path 29 | self.imgs_path.append(path) 30 | else: 31 | line = line.split(' ') 32 | label = [float(x) for x in line] 33 | labels.append(label) 34 | 35 | self.words.append(labels) 36 | 37 | def __len__(self): 38 | return len(self.imgs_path) 39 | 40 | def __getitem__(self, index): 41 | img = cv2.imread(self.imgs_path[index]) 42 | height, width, _ = img.shape 43 | 44 | labels = self.words[index] 45 | annotations = np.zeros((0, 15)) 46 | if len(labels) == 0: 47 | 
return annotations 48 | for idx, label in enumerate(labels): 49 | annotation = np.zeros((1, 15)) 50 | # bbox 51 | annotation[0, 0] = label[0] # x1 52 | annotation[0, 1] = label[1] # y1 53 | annotation[0, 2] = label[0] + label[2] # x2 54 | annotation[0, 3] = label[1] + label[3] # y2 55 | 56 | # landmarks 57 | annotation[0, 4] = label[4] # l0_x 58 | annotation[0, 5] = label[5] # l0_y 59 | annotation[0, 6] = label[7] # l1_x 60 | annotation[0, 7] = label[8] # l1_y 61 | annotation[0, 8] = label[10] # l2_x 62 | annotation[0, 9] = label[11] # l2_y 63 | annotation[0, 10] = label[13] # l3_x 64 | annotation[0, 11] = label[14] # l3_y 65 | annotation[0, 12] = label[16] # l4_x 66 | annotation[0, 13] = label[17] # l4_y 67 | if (annotation[0, 4]<0): 68 | annotation[0, 14] = -1 69 | else: 70 | annotation[0, 14] = 1 71 | 72 | annotations = np.append(annotations, annotation, axis=0) 73 | target = np.array(annotations) 74 | if self.preproc is not None: 75 | img, target = self.preproc(img, target) 76 | 77 | return torch.from_numpy(img), target 78 | 79 | def detection_collate(batch): 80 | """Custom collate fn for dealing with batches of images that have a different 81 | number of associated object annotations (bounding boxes). 82 | 83 | Arguments: 84 | batch: (tuple) A tuple of tensor images and lists of annotations 85 | 86 | Return: 87 | A tuple containing: 88 | 1) (tensor) batch of images stacked on their 0 dim 89 | 2) (list of tensors) annotations for a given image are stacked on 0 dim 90 | """ 91 | targets = [] 92 | imgs = [] 93 | for _, sample in enumerate(batch): 94 | for _, tup in enumerate(sample): 95 | if torch.is_tensor(tup): 96 | imgs.append(tup) 97 | elif isinstance(tup, type(np.empty(0))): 98 | annos = torch.from_numpy(tup).float() 99 | targets.append(annos) 100 | 101 | return (torch.stack(imgs, 0), targets) 102 | -------------------------------------------------------------------------------- /detect.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | import argparse 4 | import torch 5 | import torch.backends.cudnn as cudnn 6 | import numpy as np 7 | from data import cfg_mnet, cfg_re50 8 | from layers.functions.prior_box import PriorBox 9 | from utils.nms.py_cpu_nms import py_cpu_nms 10 | import cv2 11 | from models.retinaface import RetinaFace 12 | from utils.box_utils import decode, decode_landm 13 | import time 14 | 15 | parser = argparse.ArgumentParser(description='Retinaface') 16 | 17 | parser.add_argument('-m', '--trained_model', default='./weights/Resnet50_Final.pth', 18 | type=str, help='Trained state_dict file path to open') 19 | parser.add_argument('--network', default='resnet50', help='Backbone network mobile0.25 or resnet50') 20 | parser.add_argument('--cpu', action="store_true", default=False, help='Use cpu inference') 21 | parser.add_argument('--confidence_threshold', default=0.02, type=float, help='confidence_threshold') 22 | parser.add_argument('--top_k', default=5000, type=int, help='top_k') 23 | parser.add_argument('--nms_threshold', default=0.4, type=float, help='nms_threshold') 24 | parser.add_argument('--keep_top_k', default=750, type=int, help='keep_top_k') 25 | parser.add_argument('-s', '--save_image', action="store_true", default=True, help='show detection results') 26 | parser.add_argument('--vis_thres', default=0.6, type=float, help='visualization_threshold') 27 | args = parser.parse_args() 28 | 29 | 30 | def check_keys(model, pretrained_state_dict): 31 | ckpt_keys = 
set(pretrained_state_dict.keys()) 32 | model_keys = set(model.state_dict().keys()) 33 | used_pretrained_keys = model_keys & ckpt_keys 34 | unused_pretrained_keys = ckpt_keys - model_keys 35 | missing_keys = model_keys - ckpt_keys 36 | print('Missing keys:{}'.format(len(missing_keys))) 37 | print('Unused checkpoint keys:{}'.format(len(unused_pretrained_keys))) 38 | print('Used keys:{}'.format(len(used_pretrained_keys))) 39 | assert len(used_pretrained_keys) > 0, 'load NONE from pretrained checkpoint' 40 | return True 41 | 42 | 43 | def remove_prefix(state_dict, prefix): 44 | ''' Old style model is stored with all names of parameters sharing common prefix 'module.' ''' 45 | print('remove prefix \'{}\''.format(prefix)) 46 | f = lambda x: x.split(prefix, 1)[-1] if x.startswith(prefix) else x 47 | return {f(key): value for key, value in state_dict.items()} 48 | 49 | 50 | def load_model(model, pretrained_path, load_to_cpu): 51 | print('Loading pretrained model from {}'.format(pretrained_path)) 52 | if load_to_cpu: 53 | pretrained_dict = torch.load(pretrained_path, map_location=lambda storage, loc: storage) 54 | else: 55 | device = torch.cuda.current_device() 56 | pretrained_dict = torch.load(pretrained_path, map_location=lambda storage, loc: storage.cuda(device)) 57 | if "state_dict" in pretrained_dict.keys(): 58 | pretrained_dict = remove_prefix(pretrained_dict['state_dict'], 'module.') 59 | else: 60 | pretrained_dict = remove_prefix(pretrained_dict, 'module.') 61 | check_keys(model, pretrained_dict) 62 | model.load_state_dict(pretrained_dict, strict=False) 63 | return model 64 | 65 | 66 | if __name__ == '__main__': 67 | torch.set_grad_enabled(False) 68 | cfg = None 69 | if args.network == "mobile0.25": 70 | cfg = cfg_mnet 71 | elif args.network == "resnet50": 72 | cfg = cfg_re50 73 | # net and model 74 | net = RetinaFace(cfg=cfg, phase = 'test') 75 | net = load_model(net, args.trained_model, args.cpu) 76 | net.eval() 77 | print('Finished loading model!') 78 | print(net) 79 | cudnn.benchmark = True 80 | device = torch.device("cpu" if args.cpu else "cuda") 81 | net = net.to(device) 82 | 83 | resize = 1 84 | 85 | # testing begin 86 | for i in range(100): 87 | image_path = "./curve/test.jpg" 88 | img_raw = cv2.imread(image_path, cv2.IMREAD_COLOR) 89 | 90 | img = np.float32(img_raw) 91 | 92 | im_height, im_width, _ = img.shape 93 | scale = torch.Tensor([img.shape[1], img.shape[0], img.shape[1], img.shape[0]]) 94 | img -= (104, 117, 123) 95 | img = img.transpose(2, 0, 1) 96 | img = torch.from_numpy(img).unsqueeze(0) 97 | img = img.to(device) 98 | scale = scale.to(device) 99 | 100 | tic = time.time() 101 | loc, conf, landms = net(img) # forward pass 102 | print('net forward time: {:.4f}'.format(time.time() - tic)) 103 | 104 | priorbox = PriorBox(cfg, image_size=(im_height, im_width)) 105 | priors = priorbox.forward() 106 | priors = priors.to(device) 107 | prior_data = priors.data 108 | boxes = decode(loc.data.squeeze(0), prior_data, cfg['variance']) 109 | boxes = boxes * scale / resize 110 | boxes = boxes.cpu().numpy() 111 | scores = conf.squeeze(0).data.cpu().numpy()[:, 1] 112 | landms = decode_landm(landms.data.squeeze(0), prior_data, cfg['variance']) 113 | scale1 = torch.Tensor([img.shape[3], img.shape[2], img.shape[3], img.shape[2], 114 | img.shape[3], img.shape[2], img.shape[3], img.shape[2], 115 | img.shape[3], img.shape[2]]) 116 | scale1 = scale1.to(device) 117 | landms = landms * scale1 / resize 118 | landms = landms.cpu().numpy() 119 | 120 | # ignore low scores 121 | inds = np.where(scores > 
args.confidence_threshold)[0] 122 | boxes = boxes[inds] 123 | landms = landms[inds] 124 | scores = scores[inds] 125 | 126 | # keep top-K before NMS 127 | order = scores.argsort()[::-1][:args.top_k] 128 | boxes = boxes[order] 129 | landms = landms[order] 130 | scores = scores[order] 131 | 132 | # do NMS 133 | dets = np.hstack((boxes, scores[:, np.newaxis])).astype(np.float32, copy=False) 134 | keep = py_cpu_nms(dets, args.nms_threshold) 135 | # keep = nms(dets, args.nms_threshold,force_cpu=args.cpu) 136 | dets = dets[keep, :] 137 | landms = landms[keep] 138 | 139 | # keep top-K faster NMS 140 | dets = dets[:args.keep_top_k, :] 141 | landms = landms[:args.keep_top_k, :] 142 | 143 | dets = np.concatenate((dets, landms), axis=1) 144 | 145 | # show image 146 | if args.save_image: 147 | for b in dets: 148 | if b[4] < args.vis_thres: 149 | continue 150 | text = "{:.4f}".format(b[4]) 151 | b = list(map(int, b)) 152 | cv2.rectangle(img_raw, (b[0], b[1]), (b[2], b[3]), (0, 0, 255), 2) 153 | cx = b[0] 154 | cy = b[1] + 12 155 | cv2.putText(img_raw, text, (cx, cy), 156 | cv2.FONT_HERSHEY_DUPLEX, 0.5, (255, 255, 255)) 157 | 158 | # landms 159 | cv2.circle(img_raw, (b[5], b[6]), 1, (0, 0, 255), 4) 160 | cv2.circle(img_raw, (b[7], b[8]), 1, (0, 255, 255), 4) 161 | cv2.circle(img_raw, (b[9], b[10]), 1, (255, 0, 255), 4) 162 | cv2.circle(img_raw, (b[11], b[12]), 1, (0, 255, 0), 4) 163 | cv2.circle(img_raw, (b[13], b[14]), 1, (255, 0, 0), 4) 164 | # save image 165 | 166 | name = "test.jpg" 167 | cv2.imwrite(name, img_raw) 168 | 169 | -------------------------------------------------------------------------------- /detect_merge.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | import argparse 4 | import torch 5 | import torch.backends.cudnn as cudnn 6 | import numpy as np 7 | from data import cfg_mnet, cfg_re50 8 | from layers.functions.prior_box import PriorBox 9 | from utils.nms.py_cpu_nms import py_cpu_nms 10 | import cv2 11 | from models.retinaface import RetinaFace 12 | from utils.box_utils import decode, decode_landm 13 | import time 14 | import torch.nn.functional as F 15 | from pose import utils 16 | 17 | os.environ['CUDA_VISIBLE_DEVICES'] = '0' 18 | 19 | parser = argparse.ArgumentParser(description='Retinaface') 20 | 21 | parser.add_argument('-m', '--trained_model', default='./weights_merge/Resnet50_Final5_best.pth', 22 | type=str, help='Trained state_dict file path to open') 23 | parser.add_argument('--network', default='resnet50', help='Backbone network mobile0.25 or resnet50') 24 | parser.add_argument('--cpu', action="store_true", default=False, help='Use cpu inference') 25 | parser.add_argument('--confidence_threshold', default=0.02, type=float, help='confidence_threshold') 26 | parser.add_argument('--top_k', default=5000, type=int, help='top_k') 27 | parser.add_argument('--nms_threshold', default=0.4, type=float, help='nms_threshold') 28 | parser.add_argument('--keep_top_k', default=750, type=int, help='keep_top_k') 29 | parser.add_argument('-s', '--save_image', default=True, type=bool, help='show detection results') 30 | parser.add_argument('--vis_thres', default=0.5, type=float, help='visualization_threshold') 31 | parser.add_argument('--image_path', default="/home/gengyanlei/Datasets/East_door_face/huge.jpg", type=str, help="image's path") 32 | parser.add_argument('--output_path', default="test_5.jpg", type=str, help='predict-visual') 33 | args = parser.parse_args() 34 | 35 | 36 | def 
check_keys(model, pretrained_state_dict): 37 | ckpt_keys = set(pretrained_state_dict.keys()) 38 | model_keys = set(model.state_dict().keys()) 39 | used_pretrained_keys = model_keys & ckpt_keys 40 | unused_pretrained_keys = ckpt_keys - model_keys 41 | missing_keys = model_keys - ckpt_keys 42 | print('Missing keys:{}'.format(len(missing_keys))) 43 | print('Unused checkpoint keys:{}'.format(len(unused_pretrained_keys))) 44 | print('Used keys:{}'.format(len(used_pretrained_keys))) 45 | assert len(used_pretrained_keys) > 0, 'load NONE from pretrained checkpoint' 46 | return True 47 | 48 | 49 | def remove_prefix(state_dict, prefix): 50 | ''' Old style model is stored with all names of parameters sharing common prefix 'module.' ''' 51 | print('remove prefix \'{}\''.format(prefix)) 52 | f = lambda x: x.split(prefix, 1)[-1] if x.startswith(prefix) else x 53 | return {f(key): value for key, value in state_dict.items()} 54 | 55 | 56 | def load_model(model, pretrained_path, load_to_cpu): 57 | print('Loading pretrained model from {}'.format(pretrained_path)) 58 | if load_to_cpu: 59 | pretrained_dict = torch.load(pretrained_path, map_location=lambda storage, loc: storage) 60 | else: 61 | device = torch.cuda.current_device() 62 | pretrained_dict = torch.load(pretrained_path, map_location=lambda storage, loc: storage.cuda(device)) 63 | if "state_dict" in pretrained_dict.keys(): 64 | pretrained_dict = remove_prefix(pretrained_dict['state_dict'], 'module.') 65 | else: 66 | pretrained_dict = remove_prefix(pretrained_dict, 'module.') 67 | check_keys(model, pretrained_dict) 68 | model.load_state_dict(pretrained_dict, strict=False) 69 | return model 70 | 71 | if __name__ == '__main__': 72 | torch.set_grad_enabled(False) 73 | cfg = None 74 | if args.network == "mobile0.25": 75 | cfg = cfg_mnet 76 | elif args.network == "resnet50": 77 | cfg = cfg_re50 78 | # net and model 79 | net = RetinaFace(cfg=cfg, phase='test') 80 | net = load_model(net, args.trained_model, args.cpu) 81 | net.eval() 82 | print('Finished loading model!') 83 | # print(net) 84 | cudnn.benchmark = True 85 | device = torch.device("cpu" if args.cpu else "cuda") 86 | net = net.to(device) 87 | 88 | idx_tensor = [idx for idx in range(66)] 89 | idx_tensor = torch.FloatTensor(idx_tensor).cuda() 90 | 91 | resize = 1 92 | 93 | # testing begin 94 | for i in range(1): 95 | image_path = args.image_path 96 | img_raw = cv2.imread(image_path, cv2.IMREAD_COLOR) 97 | 98 | img = np.float32(img_raw) 99 | # inference runs at the original image size, not the 640*640 training size 100 | im_height, im_width, _ = img.shape 101 | scale = torch.Tensor([img.shape[1], img.shape[0], img.shape[1], img.shape[0]]) #whwh 102 | img -= (104, 117, 123) 103 | # extra normalization by the per-channel std; the input stays BGR, consistent with training 104 | img /= (57, 57, 58) 105 | 106 | img = img.transpose(2, 0, 1) # chw 107 | img = torch.from_numpy(img).unsqueeze(0) 108 | img = img.to(device) 109 | scale = scale.to(device) 110 | 111 | tic = time.time() 112 | loc, conf, landms, yaw, pitch, roll = net(img) # forward pass 113 | print('net forward time: {:.4f}'.format(time.time() - tic)) 114 | 115 | priorbox = PriorBox(cfg, image_size=(im_height, im_width)) 116 | priors = priorbox.forward() 117 | priors = priors.to(device) 118 | prior_data = priors.data 119 | # decode applies the predicted offsets to the anchors, i.e. it turns anchor + regression output into absolute boxes 120 | boxes = decode(loc.data.squeeze(0), prior_data, cfg['variance']) 121 | boxes = boxes * scale / resize 122 | 123 | boxes = boxes.cpu().numpy() 124 | scores = conf.squeeze(0).data.cpu().numpy()[:, 1] 125 | landms = decode_landm(landms.data.squeeze(0), prior_data, cfg['variance']) 126 | # (w, h) repeated per landmark point, to map normalized coords back to pixels 127 |
scale1 = torch.Tensor([img.shape[3], img.shape[2], img.shape[3], img.shape[2], 128 | img.shape[3], img.shape[2], img.shape[3], img.shape[2], 129 | img.shape[3], img.shape[2]]) 130 | scale1 = scale1.to(device) 131 | landms = landms * scale1 / resize 132 | landms = landms.cpu().numpy() 133 | 134 | # ignore low scores 135 | inds = np.where(scores > args.confidence_threshold)[0] 136 | boxes = boxes[inds] 137 | landms = landms[inds] 138 | scores = scores[inds] 139 | 140 | yaw = yaw.squeeze(0)[inds] 141 | pitch = pitch.squeeze(0)[inds] 142 | roll = roll.squeeze(0)[inds] 143 | 144 | # keep top-K before NMS 需要进行排序,获取每个预测框的score 按照从大到小排序,应该是每一类! 145 | order = scores.argsort()[::-1][:args.top_k] 146 | boxes = boxes[order] 147 | landms = landms[order] 148 | scores = scores[order] 149 | 150 | yaw = yaw[order.tolist()] 151 | pitch = pitch[order.tolist()] 152 | roll = roll[order.tolist()] 153 | 154 | # do NMS 155 | dets = np.hstack((boxes, scores[:, np.newaxis])).astype(np.float32, copy=False) 156 | keep = py_cpu_nms(dets, args.nms_threshold) 157 | # keep = nms(dets, args.nms_threshold,force_cpu=args.cpu) 158 | dets = dets[keep, :] 159 | landms = landms[keep] 160 | 161 | yaw = yaw[keep] 162 | pitch = pitch[keep] 163 | roll = roll[keep] 164 | 165 | # keep top-K faster NMS 166 | dets = dets[:args.keep_top_k, :] 167 | landms = landms[:args.keep_top_k, :] 168 | 169 | yaw = yaw[:args.keep_top_k] 170 | pitch = pitch[:args.keep_top_k] 171 | roll = roll[:args.keep_top_k] 172 | 173 | yaw = F.softmax(yaw, dim=-1) 174 | pitch = F.softmax(pitch, dim=-1) 175 | roll = F.softmax(roll, dim=-1) 176 | yaw = torch.sum(yaw * idx_tensor, -1) * 3 - 99 177 | pitch = torch.sum(pitch * idx_tensor, -1) * 3 - 99 178 | roll = torch.sum(roll * idx_tensor, -1) * 3 - 99 179 | 180 | yaw = yaw.unsqueeze(-1).cpu().numpy() 181 | pitch = pitch.unsqueeze(-1).cpu().numpy() 182 | roll = roll.unsqueeze(-1).cpu().numpy() 183 | 184 | dets = np.concatenate((dets, landms, yaw, pitch, roll), axis=1) 185 | 186 | # show image 187 | if args.save_image: 188 | for b in dets: 189 | if b[4] < args.vis_thres: 190 | continue 191 | text = "{:.4f}".format(b[4]) 192 | b = list(map(int, b)) 193 | cv2.rectangle(img_raw, (b[0], b[1]), (b[2], b[3]), (0, 0, 255), 2) 194 | cx = b[0] 195 | cy = b[1] + 12 196 | cv2.putText(img_raw, text, (cx, cy), 197 | cv2.FONT_HERSHEY_DUPLEX, 0.5, (255, 255, 255)) 198 | 199 | # landms 200 | cv2.circle(img_raw, (b[5], b[6]), 1, (0, 0, 255), 3) 201 | cv2.circle(img_raw, (b[7], b[8]), 1, (0, 255, 255), 3) 202 | cv2.circle(img_raw, (b[9], b[10]), 1, (255, 0, 255), 3) 203 | cv2.circle(img_raw, (b[11], b[12]), 1, (0, 255, 0), 3) 204 | cv2.circle(img_raw, (b[13], b[14]), 1, (255, 0, 0), 3) 205 | 206 | # pose 207 | utils.draw_axis(img_raw, b[15], b[16], b[17], tdx=(b[0] + b[2]) / 2, tdy=(b[1] + b[3]) / 2, size=abs(b[3]-b[1]) / 2) 208 | 209 | # save image 210 | 211 | name = args.output_path 212 | cv2.imwrite(name, img_raw) 213 | 214 | -------------------------------------------------------------------------------- /layers/__init__.py: -------------------------------------------------------------------------------- 1 | from .functions import * 2 | from .modules import * 3 | -------------------------------------------------------------------------------- /layers/functions/prior_box.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from itertools import product as product 3 | import numpy as np 4 | from math import ceil 5 | 6 | 7 | class PriorBox(object): 8 | def __init__(self, cfg, 
image_size=None, phase='train'): 9 | super(PriorBox, self).__init__() 10 | self.min_sizes = cfg['min_sizes'] 11 | self.steps = cfg['steps'] 12 | self.clip = cfg['clip'] 13 | self.image_size = image_size 14 | self.feature_maps = [[ceil(self.image_size[0]/step), ceil(self.image_size[1]/step)] for step in self.steps] 15 | self.name = "s" 16 | 17 | def forward(self): 18 | anchors = [] 19 | for k, f in enumerate(self.feature_maps): 20 | min_sizes = self.min_sizes[k] 21 | for i, j in product(range(f[0]), range(f[1])): 22 | for min_size in min_sizes: 23 | s_kx = min_size / self.image_size[1] 24 | s_ky = min_size / self.image_size[0] 25 | dense_cx = [x * self.steps[k] / self.image_size[1] for x in [j + 0.5]] 26 | dense_cy = [y * self.steps[k] / self.image_size[0] for y in [i + 0.5]] 27 | for cy, cx in product(dense_cy, dense_cx): 28 | anchors += [cx, cy, s_kx, s_ky] 29 | 30 | # back to torch land 31 | output = torch.Tensor(anchors).view(-1, 4) 32 | if self.clip: 33 | output.clamp_(max=1, min=0) 34 | return output 35 | -------------------------------------------------------------------------------- /layers/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .multibox_loss import MultiBoxLoss 2 | 3 | __all__ = ['MultiBoxLoss'] 4 | -------------------------------------------------------------------------------- /layers/modules/multibox_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | from utils.box_utils import match, log_sum_exp 6 | from data import cfg_mnet 7 | GPU = cfg_mnet['gpu_train'] 8 | 9 | class MultiBoxLoss(nn.Module): 10 | """SSD Weighted Loss Function 11 | Compute Targets: 12 | 1) Produce Confidence Target Indices by matching ground truth boxes 13 | with (default) 'priorboxes' that have jaccard index > threshold parameter 14 | (default threshold: 0.5). 15 | 2) Produce localization target by 'encoding' variance into offsets of ground 16 | truth boxes and their matched 'priorboxes'. 17 | 3) Hard negative mining to filter the excessive number of negative examples 18 | that comes with using a large number of default bounding boxes. 19 | (default negative:positive ratio 3:1) 20 | Objective Loss: 21 | L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N 22 | Where, Lconf is the CrossEntropy Loss and Lloc is the SmoothL1 Loss 23 | weighted by α which is set to 1 by cross val. 24 | Args: 25 | c: class confidences, 26 | l: predicted boxes, 27 | g: ground truth boxes 28 | N: number of matched default boxes 29 | See: https://arxiv.org/pdf/1512.02325.pdf for more details. 30 | """ 31 | 32 | def __init__(self, num_classes, overlap_thresh, prior_for_matching, bkg_label, neg_mining, neg_pos, neg_overlap, encode_target): 33 | super(MultiBoxLoss, self).__init__() 34 | self.num_classes = num_classes 35 | self.threshold = overlap_thresh 36 | self.background_label = bkg_label 37 | self.encode_target = encode_target 38 | self.use_prior_for_matching = prior_for_matching 39 | self.do_neg_mining = neg_mining 40 | self.negpos_ratio = neg_pos 41 | self.neg_overlap = neg_overlap 42 | self.variance = [0.1, 0.2] 43 | 44 | def forward(self, predictions, priors, targets): 45 | """Multibox Loss 46 | Args: 47 | predictions (tuple): A tuple containing loc preds, conf preds, 48 | and prior boxes from SSD net. 
49 | conf shape: torch.size(batch_size,num_priors,num_classes) 50 | loc shape: torch.size(batch_size,num_priors,4) 51 | priors shape: torch.size(num_priors,4) 52 | 53 | ground_truth (tensor): Ground truth boxes and labels for a batch, 54 | shape: [batch_size,num_objs,5] (last idx is the label). 55 | """ 56 | 57 | loc_data, conf_data, landm_data = predictions 58 | priors = priors 59 | num = loc_data.size(0) 60 | num_priors = (priors.size(0)) 61 | 62 | # match priors (default boxes) and ground truth boxes 63 | loc_t = torch.Tensor(num, num_priors, 4) 64 | landm_t = torch.Tensor(num, num_priors, 10) 65 | conf_t = torch.LongTensor(num, num_priors) 66 | for idx in range(num): 67 | truths = targets[idx][:, :4].data 68 | labels = targets[idx][:, -1].data 69 | landms = targets[idx][:, 4:14].data 70 | defaults = priors.data 71 | match(self.threshold, truths, defaults, self.variance, labels, landms, loc_t, conf_t, landm_t, idx) 72 | if GPU: 73 | loc_t = loc_t.cuda() 74 | conf_t = conf_t.cuda() 75 | landm_t = landm_t.cuda() 76 | 77 | zeros = torch.tensor(0).cuda() 78 | # landm Loss (Smooth L1) 79 | # Shape: [batch,num_priors,10] 80 | pos1 = conf_t > zeros 81 | num_pos_landm = pos1.long().sum(1, keepdim=True) 82 | N1 = max(num_pos_landm.data.sum().float(), 1) 83 | pos_idx1 = pos1.unsqueeze(pos1.dim()).expand_as(landm_data) 84 | landm_p = landm_data[pos_idx1].view(-1, 10) 85 | landm_t = landm_t[pos_idx1].view(-1, 10) 86 | loss_landm = F.smooth_l1_loss(landm_p, landm_t, reduction='sum') 87 | 88 | 89 | pos = conf_t != zeros 90 | conf_t[pos] = 1 91 | 92 | # Localization Loss (Smooth L1) 93 | # Shape: [batch,num_priors,4] 94 | pos_idx = pos.unsqueeze(pos.dim()).expand_as(loc_data) 95 | loc_p = loc_data[pos_idx].view(-1, 4) 96 | loc_t = loc_t[pos_idx].view(-1, 4) 97 | loss_l = F.smooth_l1_loss(loc_p, loc_t, reduction='sum') 98 | 99 | # Compute max conf across batch for hard negative mining 100 | batch_conf = conf_data.view(-1, self.num_classes) 101 | loss_c = log_sum_exp(batch_conf) - batch_conf.gather(1, conf_t.view(-1, 1)) 102 | 103 | # Hard Negative Mining 104 | loss_c[pos.view(-1, 1)] = 0 # filter out pos boxes for now 105 | loss_c = loss_c.view(num, -1) 106 | _, loss_idx = loss_c.sort(1, descending=True) 107 | _, idx_rank = loss_idx.sort(1) 108 | num_pos = pos.long().sum(1, keepdim=True) 109 | num_neg = torch.clamp(self.negpos_ratio*num_pos, max=pos.size(1)-1) 110 | neg = idx_rank < num_neg.expand_as(idx_rank) 111 | 112 | # Confidence Loss Including Positive and Negative Examples 113 | pos_idx = pos.unsqueeze(2).expand_as(conf_data) 114 | neg_idx = neg.unsqueeze(2).expand_as(conf_data) 115 | conf_p = conf_data[(pos_idx+neg_idx).gt(0)].view(-1,self.num_classes) 116 | targets_weighted = conf_t[(pos+neg).gt(0)] 117 | loss_c = F.cross_entropy(conf_p, targets_weighted, reduction='sum') 118 | 119 | # Sum of losses: L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N 120 | N = max(num_pos.data.sum().float(), 1) 121 | loss_l /= N 122 | loss_c /= N 123 | loss_landm /= N1 124 | 125 | return loss_l, loss_c, loss_landm 126 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gengyanlei/Pytorch_Retinaface/b863d9ed7c405f3733d038f249ebf3581e3c7ba6/models/__init__.py -------------------------------------------------------------------------------- /models/net.py: -------------------------------------------------------------------------------- 1 | import time 2 | 
import torch 3 | import torch.nn as nn 4 | import torchvision.models._utils as _utils 5 | import torchvision.models as models 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | 9 | def conv_bn(inp, oup, stride = 1, leaky = 0): 10 | return nn.Sequential( 11 | nn.Conv2d(inp, oup, 3, stride, 1, bias=False), 12 | nn.BatchNorm2d(oup), 13 | nn.LeakyReLU(negative_slope=leaky, inplace=True) 14 | ) 15 | 16 | def conv_bn_no_relu(inp, oup, stride): 17 | return nn.Sequential( 18 | nn.Conv2d(inp, oup, 3, stride, 1, bias=False), 19 | nn.BatchNorm2d(oup), 20 | ) 21 | 22 | def conv_bn1X1(inp, oup, stride, leaky=0): 23 | return nn.Sequential( 24 | nn.Conv2d(inp, oup, 1, stride, padding=0, bias=False), 25 | nn.BatchNorm2d(oup), 26 | nn.LeakyReLU(negative_slope=leaky, inplace=True) 27 | ) 28 | 29 | def conv_dw(inp, oup, stride, leaky=0.1): 30 | return nn.Sequential( 31 | nn.Conv2d(inp, inp, 3, stride, 1, groups=inp, bias=False), 32 | nn.BatchNorm2d(inp), 33 | nn.LeakyReLU(negative_slope= leaky,inplace=True), 34 | 35 | nn.Conv2d(inp, oup, 1, 1, 0, bias=False), 36 | nn.BatchNorm2d(oup), 37 | nn.LeakyReLU(negative_slope= leaky,inplace=True), 38 | ) 39 | 40 | class SSH(nn.Module): 41 | def __init__(self, in_channel, out_channel): 42 | super(SSH, self).__init__() 43 | assert out_channel % 4 == 0 44 | leaky = 0 45 | if (out_channel <= 64): 46 | leaky = 0.1 47 | self.conv3X3 = conv_bn_no_relu(in_channel, out_channel//2, stride=1) 48 | 49 | self.conv5X5_1 = conv_bn(in_channel, out_channel//4, stride=1, leaky = leaky) 50 | self.conv5X5_2 = conv_bn_no_relu(out_channel//4, out_channel//4, stride=1) 51 | 52 | self.conv7X7_2 = conv_bn(out_channel//4, out_channel//4, stride=1, leaky = leaky) 53 | self.conv7x7_3 = conv_bn_no_relu(out_channel//4, out_channel//4, stride=1) 54 | 55 | def forward(self, input): 56 | conv3X3 = self.conv3X3(input) 57 | 58 | conv5X5_1 = self.conv5X5_1(input) 59 | conv5X5 = self.conv5X5_2(conv5X5_1) 60 | 61 | conv7X7_2 = self.conv7X7_2(conv5X5_1) 62 | conv7X7 = self.conv7x7_3(conv7X7_2) 63 | 64 | out = torch.cat([conv3X3, conv5X5, conv7X7], dim=1) 65 | out = F.relu(out) 66 | return out 67 | 68 | class FPN(nn.Module): 69 | def __init__(self,in_channels_list,out_channels): 70 | super(FPN,self).__init__() 71 | leaky = 0 72 | if (out_channels <= 64): 73 | leaky = 0.1 74 | self.output1 = conv_bn1X1(in_channels_list[0], out_channels, stride = 1, leaky = leaky) 75 | self.output2 = conv_bn1X1(in_channels_list[1], out_channels, stride = 1, leaky = leaky) 76 | self.output3 = conv_bn1X1(in_channels_list[2], out_channels, stride = 1, leaky = leaky) 77 | 78 | self.merge1 = conv_bn(out_channels, out_channels, leaky = leaky) 79 | self.merge2 = conv_bn(out_channels, out_channels, leaky = leaky) 80 | 81 | def forward(self, input): 82 | # names = list(input.keys()) 83 | input = list(input.values()) 84 | 85 | output1 = self.output1(input[0]) 86 | output2 = self.output2(input[1]) 87 | output3 = self.output3(input[2]) 88 | 89 | up3 = F.interpolate(output3, size=[output2.size(2), output2.size(3)], mode="nearest") 90 | output2 = output2 + up3 91 | output2 = self.merge2(output2) 92 | 93 | up2 = F.interpolate(output2, size=[output1.size(2), output1.size(3)], mode="nearest") 94 | output1 = output1 + up2 95 | output1 = self.merge1(output1) 96 | 97 | out = [output1, output2, output3] 98 | return out 99 | 100 | 101 | 102 | class MobileNetV1(nn.Module): 103 | def __init__(self): 104 | super(MobileNetV1, self).__init__() 105 | self.stage1 = nn.Sequential( 106 | conv_bn(3, 8, 2, leaky = 0.1), # 3 107 
| conv_dw(8, 16, 1), # 7 108 | conv_dw(16, 32, 2), # 11 109 | conv_dw(32, 32, 1), # 19 110 | conv_dw(32, 64, 2), # 27 111 | conv_dw(64, 64, 1), # 43 112 | ) 113 | self.stage2 = nn.Sequential( 114 | conv_dw(64, 128, 2), # 43 + 16 = 59 115 | conv_dw(128, 128, 1), # 59 + 32 = 91 116 | conv_dw(128, 128, 1), # 91 + 32 = 123 117 | conv_dw(128, 128, 1), # 123 + 32 = 155 118 | conv_dw(128, 128, 1), # 155 + 32 = 187 119 | conv_dw(128, 128, 1), # 187 + 32 = 219 120 | ) 121 | self.stage3 = nn.Sequential( 122 | conv_dw(128, 256, 2), # 219 +3 2 = 241 123 | conv_dw(256, 256, 1), # 241 + 64 = 301 124 | ) 125 | self.avg = nn.AdaptiveAvgPool2d((1,1)) 126 | self.fc = nn.Linear(256, 1000) 127 | 128 | def forward(self, x): 129 | x = self.stage1(x) 130 | x = self.stage2(x) 131 | x = self.stage3(x) 132 | x = self.avg(x) 133 | # x = self.model(x) 134 | x = x.view(-1, 256) 135 | x = self.fc(x) 136 | return x 137 | 138 | -------------------------------------------------------------------------------- /models/retinaface.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torchvision.models.detection.backbone_utils as backbone_utils 4 | import torchvision.models._utils as _utils 5 | import torch.nn.functional as F 6 | from models.net import MobileNetV1 as MobileNetV1 7 | from models.net import FPN as FPN 8 | from models.net import SSH as SSH 9 | 10 | class ClassHead(nn.Module): 11 | def __init__(self,inchannels=512,num_anchors=3): 12 | super(ClassHead,self).__init__() 13 | self.num_anchors = num_anchors 14 | # plain 1x1 conv head: NCHW -> NHWC -> N-H*W-C, no BN/ReLU; anchor_num=2 means 2 preset anchors per feature-map cell, so per anchor the class head outputs 2 values, the bbox head 4 and the landmark head 10, consistent with the anchor setup of the paper's reference code; 15 | # but the SSH outputs are not cascaded here; the reference code upsamples the SSH output and adds it ("+"), i.e. a cascade 16 | self.conv1x1 = nn.Conv2d(inchannels,self.num_anchors*2,kernel_size=(1,1),stride=1,padding=0) 17 | 18 | def forward(self,x): 19 | out = self.conv1x1(x) 20 | out = out.permute(0,2,3,1).contiguous() 21 | 22 | return out.view(out.shape[0], -1, 2) 23 | 24 | class BboxHead(nn.Module): 25 | def __init__(self,inchannels=512,num_anchors=3): 26 | super(BboxHead,self).__init__() 27 | self.conv1x1 = nn.Conv2d(inchannels,num_anchors*4,kernel_size=(1,1),stride=1,padding=0) 28 | 29 | def forward(self,x): 30 | out = self.conv1x1(x) 31 | out = out.permute(0,2,3,1).contiguous() 32 | 33 | return out.view(out.shape[0], -1, 4) 34 | 35 | class LandmarkHead(nn.Module): 36 | def __init__(self,inchannels=512,num_anchors=3): 37 | super(LandmarkHead,self).__init__() 38 | self.conv1x1 = nn.Conv2d(inchannels,num_anchors*10,kernel_size=(1,1),stride=1,padding=0) 39 | 40 | def forward(self,x): 41 | out = self.conv1x1(x) 42 | out = out.permute(0,2,3,1).contiguous() 43 | 44 | return out.view(out.shape[0], -1, 10) 45 | 46 | # shared head class for face pose estimation (yaw / pitch / roll), each angle treated as a 66-way classification 47 | class PoseHead(nn.Module): 48 | def __init__(self, inchannels=512, num_anchors=3): 49 | super().__init__() 50 | self.conv1x1 = nn.Conv2d(inchannels,num_anchors*66,kernel_size=(1,1),stride=1,padding=0) 51 | 52 | def forward(self, x): 53 | out = self.conv1x1(x) 54 | out = out.permute(0, 2, 3, 1).contiguous() # call .contiguous() before .view() so the underlying memory is contiguous; a bare reshape here is less safe 55 | 56 | return out.view(out.shape[0], -1, 66) 57 | 58 | class RetinaFace(nn.Module): 59 | def __init__(self, cfg=None, phase='train'): 60 | """ 61 | :param cfg: Network related settings. 62 | :param phase: train or test.
63 | """ 64 | super(RetinaFace,self).__init__() 65 | self.phase = phase 66 | backbone = None 67 | if cfg['name'] == 'mobilenet0.25': 68 | backbone = MobileNetV1() 69 | if cfg['pretrain']: 70 | checkpoint = torch.load("./weights/mobilenetV1X0.25_pretrain.tar", map_location=torch.device('cpu')) 71 | from collections import OrderedDict 72 | new_state_dict = OrderedDict() 73 | for k, v in checkpoint['state_dict'].items(): 74 | name = k[7:] # remove module. 75 | new_state_dict[name] = v 76 | # load params 77 | backbone.load_state_dict(new_state_dict) 78 | elif cfg['name'] == 'Resnet50': 79 | import torchvision.models as models 80 | backbone = models.resnet50(pretrained=cfg['pretrain']) 81 | # 获取中间层输出 82 | self.body = _utils.IntermediateLayerGetter(backbone, cfg['return_layers']) 83 | in_channels_stage2 = cfg['in_channel'] 84 | in_channels_list = [ 85 | in_channels_stage2 * 2, 86 | in_channels_stage2 * 4, 87 | in_channels_stage2 * 8, 88 | ] 89 | out_channels = cfg['out_channel'] 90 | self.fpn = FPN(in_channels_list,out_channels) 91 | 92 | self.ssh1 = SSH(out_channels, out_channels) 93 | self.ssh2 = SSH(out_channels, out_channels) 94 | self.ssh3 = SSH(out_channels, out_channels) 95 | # 参考口罩分支,ssh单独新建 96 | self.ssh1_pose = SSH(out_channels, out_channels) 97 | self.ssh2_pose = SSH(out_channels, out_channels) 98 | self.ssh3_pose = SSH(out_channels, out_channels) 99 | # 对应ssh 的 框-关键点-2分类 100 | self.ClassHead = self._make_class_head(fpn_num=3, inchannels=cfg['out_channel']) 101 | self.BboxHead = self._make_bbox_head(fpn_num=3, inchannels=cfg['out_channel']) 102 | self.LandmarkHead = self._make_landmark_head(fpn_num=3, inchannels=cfg['out_channel']) 103 | # 对应ssh_pose 的 yaw-pitch-roll 的 3个66分类 104 | self.Pose_yaw_Head = self._make_pose_yaw_pitch_roll_head(fpn_num=3, inchannels=cfg['out_channel']) 105 | self.Pose_pitch_Head = self._make_pose_yaw_pitch_roll_head(fpn_num=3, inchannels=cfg['out_channel']) 106 | self.Pose_roll_Head = self._make_pose_yaw_pitch_roll_head(fpn_num=3, inchannels=cfg['out_channel']) 107 | 108 | def _make_class_head(self,fpn_num=3,inchannels=64,anchor_num=2): 109 | classhead = nn.ModuleList() 110 | for i in range(fpn_num): 111 | classhead.append(ClassHead(inchannels,anchor_num)) 112 | return classhead 113 | 114 | def _make_bbox_head(self,fpn_num=3,inchannels=64,anchor_num=2): 115 | bboxhead = nn.ModuleList() 116 | for i in range(fpn_num): 117 | bboxhead.append(BboxHead(inchannels,anchor_num)) 118 | return bboxhead 119 | 120 | def _make_landmark_head(self,fpn_num=3,inchannels=64,anchor_num=2): 121 | landmarkhead = nn.ModuleList() 122 | for i in range(fpn_num): 123 | landmarkhead.append(LandmarkHead(inchannels,anchor_num)) 124 | return landmarkhead 125 | 126 | def _make_pose_yaw_pitch_roll_head(self, fpn_num=3, inchannels=64, anchor_num=2): 127 | pose_head = nn.ModuleList() 128 | for i in range(fpn_num): 129 | pose_head.append(PoseHead(inchannels, anchor_num)) # append添加Module 130 | return pose_head 131 | 132 | 133 | def forward(self,inputs): 134 | out = self.body(inputs) 135 | 136 | # FPN 137 | fpn = self.fpn(out) 138 | 139 | # SSH 140 | feature1 = self.ssh1(fpn[0]) 141 | feature2 = self.ssh2(fpn[1]) 142 | feature3 = self.ssh3(fpn[2]) 143 | features = [feature1, feature2, feature3] 144 | 145 | # SSH Pose 146 | feature1_pose = self.ssh1_pose(fpn[0]) 147 | feature2_pose = self.ssh2_pose(fpn[1]) 148 | feature3_pose = self.ssh3_pose(fpn[2]) 149 | features_pose = [feature1_pose, feature2_pose, feature3_pose] 150 | # 
Detail differences from RetinaFace: no extra pyramid level is newly built on top of P5, only the last 3 backbone levels are used; the 3 SSH outputs are independent, there is no cascade that sums them level by level ("+"), and the FPN in_channels differ; the original retinaface does not use a cascaded SSH either 151 | 152 | bbox_regressions = torch.cat([self.BboxHead[i](feature) for i, feature in enumerate(features)], dim=1) 153 | classifications = torch.cat([self.ClassHead[i](feature) for i, feature in enumerate(features)], dim=1) 154 | ldm_regressions = torch.cat([self.LandmarkHead[i](feature) for i, feature in enumerate(features)], dim=1) 155 | 156 | yaw_preds = torch.cat([self.Pose_yaw_Head[i](feature) for i, feature in enumerate(features_pose)], dim=1) 157 | pitch_preds = torch.cat([self.Pose_pitch_Head[i](feature) for i, feature in enumerate(features_pose)], dim=1) 158 | roll_preds = torch.cat([self.Pose_roll_Head[i](feature) for i, feature in enumerate(features_pose)], dim=1) 159 | 160 | if self.phase == 'train': 161 | # suggestion: return a dict here (and from the data loading code as well) instead of a bare tuple 162 | output = (bbox_regressions, classifications, ldm_regressions, yaw_preds, pitch_preds, roll_preds) 163 | else: 164 | output = (bbox_regressions, F.softmax(classifications, dim=-1), ldm_regressions, yaw_preds, pitch_preds, roll_preds) # at test time yaw/pitch/roll still need a softmax (applied downstream in the demo code) 165 | return output 166 | 167 | -------------------------------------------------------------------------------- /pose/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gengyanlei/Pytorch_Retinaface/b863d9ed7c405f3733d038f249ebf3581e3c7ba6/pose/__init__.py -------------------------------------------------------------------------------- /pose/datasets.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import cv2 4 | import pandas as pd 5 | 6 | import torch 7 | from torch.utils.data.dataset import Dataset 8 | from torchvision import transforms 9 | 10 | from PIL import Image, ImageFilter 11 | 12 | import utils 13 | 14 | def get_list_from_filenames(file_path): 15 | # input: relative path to .txt file with file names 16 | # output: list of relative path names 17 | with open(file_path) as f: 18 | lines = f.read().splitlines() 19 | return lines 20 | 21 | class Synhead(Dataset): 22 | def __init__(self, data_dir, csv_path, transform, test=False): 23 | column_names = ['path', 'bbox_x_min', 'bbox_y_min', 'bbox_x_max', 'bbox_y_max', 'yaw', 'pitch', 'roll'] 24 | tmp_df = pd.read_csv(csv_path, sep=',', names=column_names, index_col=False, encoding="utf-8-sig") 25 | self.data_dir = data_dir 26 | self.transform = transform 27 | self.X_train = tmp_df['path'] 28 | self.y_train = tmp_df[['bbox_x_min', 'bbox_y_min', 'bbox_x_max', 'bbox_y_max', 'yaw', 'pitch', 'roll']] 29 | self.length = len(tmp_df) 30 | self.test = test 31 | 32 | def __getitem__(self, index): 33 | path = os.path.join(self.data_dir, self.X_train.iloc[index]).strip('.jpg') + '.png' 34 | img = Image.open(path) 35 | img = img.convert('RGB') 36 | 37 | x_min, y_min, x_max, y_max, yaw, pitch, roll = self.y_train.iloc[index] 38 | x_min = float(x_min); x_max = float(x_max) 39 | y_min = float(y_min); y_max = float(y_max) 40 | yaw = -float(yaw); pitch = float(pitch); roll = float(roll) 41 | 42 | # k = 0.2 to 0.40 43 | k = np.random.random_sample() * 0.2 + 0.2 44 | x_min -= 0.6 * k * abs(x_max - x_min) 45 | y_min -= 2 * k * abs(y_max - y_min) 46 | x_max += 0.6 * k * abs(x_max - x_min) 47 | y_max += 0.6 * k * abs(y_max - y_min) 48 | 49 | width, height = img.size 50 | # Crop the face 51 | img = img.crop((int(x_min), int(y_min), int(x_max), int(y_max))) 52 | 53 | # Flip?
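(a horizontal flip mirrors the head, so yaw and roll change sign while pitch stays the same)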
54 | rnd = np.random.random_sample() 55 | if rnd < 0.5: 56 | yaw = -yaw 57 | roll = -roll 58 | img = img.transpose(Image.FLIP_LEFT_RIGHT) 59 | 60 | # Blur? 61 | rnd = np.random.random_sample() 62 | if rnd < 0.05: 63 | img = img.filter(ImageFilter.BLUR) 64 | 65 | # Bin values 66 | bins = np.array(range(-99, 102, 3)) 67 | binned_pose = np.digitize([yaw, pitch, roll], bins) - 1 68 | 69 | labels = torch.LongTensor(binned_pose) 70 | cont_labels = torch.FloatTensor([yaw, pitch, roll]) 71 | 72 | if self.transform is not None: 73 | img = self.transform(img) 74 | 75 | return img, labels, cont_labels, self.X_train[index] 76 | 77 | def __len__(self): 78 | return self.length 79 | 80 | class Pose_300W_LP(Dataset): 81 | # Head pose from 300W-LP dataset 82 | def __init__(self, data_dir, filename_path, transform, img_ext='.jpg', annot_ext='.mat', image_mode='RGB'): 83 | self.data_dir = data_dir 84 | self.transform = transform 85 | self.img_ext = img_ext 86 | self.annot_ext = annot_ext 87 | 88 | filename_list = get_list_from_filenames(filename_path) 89 | 90 | self.X_train = filename_list 91 | self.y_train = filename_list 92 | self.image_mode = image_mode 93 | self.length = len(filename_list) 94 | 95 | def __getitem__(self, index): 96 | img = Image.open(os.path.join(self.data_dir, self.X_train[index] + self.img_ext)) 97 | img = img.convert(self.image_mode) 98 | mat_path = os.path.join(self.data_dir, self.y_train[index] + self.annot_ext) 99 | 100 | # Crop the face loosely 101 | pt2d = utils.get_pt2d_from_mat(mat_path) 102 | x_min = min(pt2d[0,:]) 103 | y_min = min(pt2d[1,:]) 104 | x_max = max(pt2d[0,:]) 105 | y_max = max(pt2d[1,:]) 106 | 107 | # k = 0.2 to 0.40 108 | k = np.random.random_sample() * 0.2 + 0.2 109 | x_min -= 0.6 * k * abs(x_max - x_min) 110 | y_min -= 2 * k * abs(y_max - y_min) 111 | x_max += 0.6 * k * abs(x_max - x_min) 112 | y_max += 0.6 * k * abs(y_max - y_min) 113 | img = img.crop((int(x_min), int(y_min), int(x_max), int(y_max))) 114 | 115 | # We get the pose in radians 116 | pose = utils.get_ypr_from_mat(mat_path) 117 | # And convert to degrees. 118 | pitch = pose[0] * 180 / np.pi 119 | yaw = pose[1] * 180 / np.pi 120 | roll = pose[2] * 180 / np.pi 121 | 122 | # Flip? 123 | rnd = np.random.random_sample() 124 | if rnd < 0.5: 125 | yaw = -yaw 126 | roll = -roll 127 | img = img.transpose(Image.FLIP_LEFT_RIGHT) 128 | 129 | # Blur? 
130 | rnd = np.random.random_sample() 131 | if rnd < 0.05: 132 | img = img.filter(ImageFilter.BLUR) 133 | 134 | # Bin values 135 | bins = np.array(range(-99, 102, 3)) # 102左闭右开,所以只到99 136 | binned_pose = np.digitize([yaw, pitch, roll], bins) - 1 # 3度为1类,从-99到+99;只计算0-65共66类 137 | 138 | # Get target tensors 139 | labels = binned_pose 140 | cont_labels = torch.FloatTensor([yaw, pitch, roll]) # 回归 141 | 142 | if self.transform is not None: 143 | img = self.transform(img) 144 | 145 | return img, labels, cont_labels, self.X_train[index] 146 | 147 | def __len__(self): 148 | # 122,450 149 | return self.length 150 | 151 | class Pose_300W_LP_random_ds(Dataset): 152 | # 300W-LP dataset with random downsampling 153 | def __init__(self, data_dir, filename_path, transform, img_ext='.jpg', annot_ext='.mat', image_mode='RGB'): 154 | self.data_dir = data_dir 155 | self.transform = transform 156 | self.img_ext = img_ext 157 | self.annot_ext = annot_ext 158 | 159 | filename_list = get_list_from_filenames(filename_path) 160 | 161 | self.X_train = filename_list 162 | self.y_train = filename_list 163 | self.image_mode = image_mode 164 | self.length = len(filename_list) 165 | 166 | def __getitem__(self, index): 167 | img = Image.open(os.path.join(self.data_dir, self.X_train[index] + self.img_ext)) 168 | img = img.convert(self.image_mode) 169 | mat_path = os.path.join(self.data_dir, self.y_train[index] + self.annot_ext) 170 | 171 | # Crop the face loosely 172 | pt2d = utils.get_pt2d_from_mat(mat_path) 173 | x_min = min(pt2d[0,:]) 174 | y_min = min(pt2d[1,:]) 175 | x_max = max(pt2d[0,:]) 176 | y_max = max(pt2d[1,:]) 177 | 178 | # k = 0.2 to 0.40 179 | k = np.random.random_sample() * 0.2 + 0.2 180 | x_min -= 0.6 * k * abs(x_max - x_min) 181 | y_min -= 2 * k * abs(y_max - y_min) 182 | x_max += 0.6 * k * abs(x_max - x_min) 183 | y_max += 0.6 * k * abs(y_max - y_min) 184 | img = img.crop((int(x_min), int(y_min), int(x_max), int(y_max))) 185 | 186 | # We get the pose in radians 187 | pose = utils.get_ypr_from_mat(mat_path) 188 | pitch = pose[0] * 180 / np.pi 189 | yaw = pose[1] * 180 / np.pi 190 | roll = pose[2] * 180 / np.pi 191 | 192 | ds = 1 + np.random.randint(0,4) * 5 193 | original_size = img.size 194 | img = img.resize((img.size[0] / ds, img.size[1] / ds), resample=Image.NEAREST) 195 | img = img.resize((original_size[0], original_size[1]), resample=Image.NEAREST) 196 | 197 | # Flip? 198 | rnd = np.random.random_sample() 199 | if rnd < 0.5: 200 | yaw = -yaw 201 | roll = -roll 202 | img = img.transpose(Image.FLIP_LEFT_RIGHT) 203 | 204 | # Blur? 
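# Note on the down/up-sampling augmentation a few lines above: the crop is shrunk by a random
# factor ds in {1, 6, 11, 16} with nearest-neighbour resampling and scaled back up, simulating
# low-resolution faces. Under Python 3, img.size[0] / ds is a float, and recent Pillow releases
# reject non-integer sizes, so integer division would likely be needed there, e.g.
# img.resize((img.size[0] // ds, img.size[1] // ds), resample=Image.NEAREST).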
205 | rnd = np.random.random_sample() 206 | if rnd < 0.05: 207 | img = img.filter(ImageFilter.BLUR) 208 | 209 | # Bin values 210 | bins = np.array(range(-99, 102, 3)) 211 | binned_pose = np.digitize([yaw, pitch, roll], bins) - 1 212 | 213 | # Get target tensors 214 | labels = binned_pose 215 | cont_labels = torch.FloatTensor([yaw, pitch, roll]) 216 | 217 | if self.transform is not None: 218 | img = self.transform(img) 219 | 220 | return img, labels, cont_labels, self.X_train[index] 221 | 222 | def __len__(self): 223 | # 122,450 224 | return self.length 225 | 226 | class AFLW2000(Dataset): 227 | def __init__(self, data_dir, filename_path, transform, img_ext='.jpg', annot_ext='.mat', image_mode='RGB'): 228 | self.data_dir = data_dir 229 | self.transform = transform 230 | self.img_ext = img_ext 231 | self.annot_ext = annot_ext 232 | 233 | filename_list = get_list_from_filenames(filename_path) 234 | 235 | self.X_train = filename_list 236 | self.y_train = filename_list 237 | self.image_mode = image_mode 238 | self.length = len(filename_list) 239 | 240 | def __getitem__(self, index): 241 | img = Image.open(os.path.join(self.data_dir, self.X_train[index] + self.img_ext)) 242 | img = img.convert(self.image_mode) 243 | mat_path = os.path.join(self.data_dir, self.y_train[index] + self.annot_ext) 244 | 245 | # Crop the face loosely 246 | pt2d = utils.get_pt2d_from_mat(mat_path) 247 | 248 | x_min = min(pt2d[0,:]) 249 | y_min = min(pt2d[1,:]) 250 | x_max = max(pt2d[0,:]) 251 | y_max = max(pt2d[1,:]) 252 | 253 | k = 0.20 254 | x_min -= 2 * k * abs(x_max - x_min) 255 | y_min -= 2 * k * abs(y_max - y_min) 256 | x_max += 2 * k * abs(x_max - x_min) 257 | y_max += 0.6 * k * abs(y_max - y_min) 258 | img = img.crop((int(x_min), int(y_min), int(x_max), int(y_max))) 259 | 260 | # We get the pose in radians 261 | pose = utils.get_ypr_from_mat(mat_path) 262 | # And convert to degrees. 
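# (Same recipe as the other datasets in this file.) The pose is converted from radians to
# degrees and then quantized into 66 classes of 3 degrees covering [-99, 99):
# np.digitize(angle, bins) - 1 with bins = -99, -96, ..., 99 is equivalent to
# floor((angle + 99) / 3). Worked example: yaw = 10.5 degrees falls into bin 36, because
# floor((10.5 + 99) / 3) = 36, i.e. the interval [9, 12); mapping the bin index back gives
# 36 * 3 - 99 = 9 degrees (the bin's left edge). The test scripts instead recover a continuous
# angle from the bin probabilities via a softmax expectation.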
263 | pitch = pose[0] * 180 / np.pi 264 | yaw = pose[1] * 180 / np.pi 265 | roll = pose[2] * 180 / np.pi 266 | # Bin values 267 | bins = np.array(range(-99, 102, 3)) 268 | labels = torch.LongTensor(np.digitize([yaw, pitch, roll], bins) - 1) 269 | cont_labels = torch.FloatTensor([yaw, pitch, roll]) 270 | 271 | if self.transform is not None: 272 | img = self.transform(img) 273 | 274 | return img, labels, cont_labels, self.X_train[index] 275 | 276 | def __len__(self): 277 | # 2,000 278 | return self.length 279 | 280 | class AFLW2000_ds(Dataset): 281 | # AFLW2000 dataset with fixed downsampling 282 | def __init__(self, data_dir, filename_path, transform, img_ext='.jpg', annot_ext='.mat', image_mode='RGB'): 283 | self.data_dir = data_dir 284 | self.transform = transform 285 | self.img_ext = img_ext 286 | self.annot_ext = annot_ext 287 | 288 | filename_list = get_list_from_filenames(filename_path) 289 | 290 | self.X_train = filename_list 291 | self.y_train = filename_list 292 | self.image_mode = image_mode 293 | self.length = len(filename_list) 294 | 295 | def __getitem__(self, index): 296 | img = Image.open(os.path.join(self.data_dir, self.X_train[index] + self.img_ext)) 297 | img = img.convert(self.image_mode) 298 | mat_path = os.path.join(self.data_dir, self.y_train[index] + self.annot_ext) 299 | 300 | # Crop the face loosely 301 | pt2d = utils.get_pt2d_from_mat(mat_path) 302 | x_min = min(pt2d[0,:]) 303 | y_min = min(pt2d[1,:]) 304 | x_max = max(pt2d[0,:]) 305 | y_max = max(pt2d[1,:]) 306 | 307 | k = 0.20 308 | x_min -= 2 * k * abs(x_max - x_min) 309 | y_min -= 2 * k * abs(y_max - y_min) 310 | x_max += 2 * k * abs(x_max - x_min) 311 | y_max += 0.6 * k * abs(y_max - y_min) 312 | img = img.crop((int(x_min), int(y_min), int(x_max), int(y_max))) 313 | 314 | ds = 3 # downsampling factor 315 | original_size = img.size 316 | img = img.resize((img.size[0] / ds, img.size[1] / ds), resample=Image.NEAREST) 317 | img = img.resize((original_size[0], original_size[1]), resample=Image.NEAREST) 318 | 319 | # We get the pose in radians 320 | pose = utils.get_ypr_from_mat(mat_path) 321 | # And convert to degrees. 
322 | pitch = pose[0] * 180 / np.pi 323 | yaw = pose[1] * 180 / np.pi 324 | roll = pose[2] * 180 / np.pi 325 | # Bin values 326 | bins = np.array(range(-99, 102, 3)) 327 | labels = torch.LongTensor(np.digitize([yaw, pitch, roll], bins) - 1) 328 | cont_labels = torch.FloatTensor([yaw, pitch, roll]) 329 | 330 | if self.transform is not None: 331 | img = self.transform(img) 332 | 333 | return img, labels, cont_labels, self.X_train[index] 334 | 335 | def __len__(self): 336 | # 2,000 337 | return self.length 338 | 339 | class AFLW_aug(Dataset): 340 | # AFLW dataset with flipping 341 | def __init__(self, data_dir, filename_path, transform, img_ext='.jpg', annot_ext='.txt', image_mode='RGB'): 342 | self.data_dir = data_dir 343 | self.transform = transform 344 | self.img_ext = img_ext 345 | self.annot_ext = annot_ext 346 | 347 | filename_list = get_list_from_filenames(filename_path) 348 | 349 | self.X_train = filename_list 350 | self.y_train = filename_list 351 | self.image_mode = image_mode 352 | self.length = len(filename_list) 353 | 354 | def __getitem__(self, index): 355 | img = Image.open(os.path.join(self.data_dir, self.X_train[index] + self.img_ext)) 356 | img = img.convert(self.image_mode) 357 | txt_path = os.path.join(self.data_dir, self.y_train[index] + self.annot_ext) 358 | 359 | # We get the pose in radians 360 | annot = open(txt_path, 'r') 361 | line = annot.readline().split(' ') 362 | pose = [float(line[1]), float(line[2]), float(line[3])] 363 | # And convert to degrees. 364 | yaw = pose[0] * 180 / np.pi 365 | pitch = pose[1] * 180 / np.pi 366 | roll = pose[2] * 180 / np.pi 367 | # Fix the roll in AFLW 368 | roll *= -1 369 | 370 | # Augment 371 | # Flip? 372 | rnd = np.random.random_sample() 373 | if rnd < 0.5: 374 | yaw = -yaw 375 | roll = -roll 376 | img = img.transpose(Image.FLIP_LEFT_RIGHT) 377 | 378 | # Bin values 379 | bins = np.array(range(-99, 102, 3)) 380 | labels = torch.LongTensor(np.digitize([yaw, pitch, roll], bins) - 1) 381 | cont_labels = torch.FloatTensor([yaw, pitch, roll]) 382 | 383 | if self.transform is not None: 384 | img = self.transform(img) 385 | 386 | return img, labels, cont_labels, self.X_train[index] 387 | 388 | def __len__(self): 389 | # train: 18,863 390 | # test: 1,966 391 | return self.length 392 | 393 | class AFLW(Dataset): 394 | def __init__(self, data_dir, filename_path, transform, img_ext='.jpg', annot_ext='.txt', image_mode='RGB'): 395 | self.data_dir = data_dir 396 | self.transform = transform 397 | self.img_ext = img_ext 398 | self.annot_ext = annot_ext 399 | 400 | filename_list = get_list_from_filenames(filename_path) 401 | 402 | self.X_train = filename_list 403 | self.y_train = filename_list 404 | self.image_mode = image_mode 405 | self.length = len(filename_list) 406 | 407 | def __getitem__(self, index): 408 | img = Image.open(os.path.join(self.data_dir, self.X_train[index] + self.img_ext)) 409 | img = img.convert(self.image_mode) 410 | txt_path = os.path.join(self.data_dir, self.y_train[index] + self.annot_ext) 411 | 412 | # We get the pose in radians 413 | annot = open(txt_path, 'r') 414 | line = annot.readline().split(' ') 415 | pose = [float(line[1]), float(line[2]), float(line[3])] 416 | # And convert to degrees. 
417 | yaw = pose[0] * 180 / np.pi 418 | pitch = pose[1] * 180 / np.pi 419 | roll = pose[2] * 180 / np.pi 420 | # Fix the roll in AFLW 421 | roll *= -1 422 | # Bin values 423 | bins = np.array(range(-99, 102, 3)) 424 | labels = torch.LongTensor(np.digitize([yaw, pitch, roll], bins) - 1) 425 | cont_labels = torch.FloatTensor([yaw, pitch, roll]) 426 | 427 | if self.transform is not None: 428 | img = self.transform(img) 429 | 430 | return img, labels, cont_labels, self.X_train[index] 431 | 432 | def __len__(self): 433 | # train: 18,863 434 | # test: 1,966 435 | return self.length 436 | 437 | class AFW(Dataset): 438 | def __init__(self, data_dir, filename_path, transform, img_ext='.jpg', annot_ext='.txt', image_mode='RGB'): 439 | self.data_dir = data_dir 440 | self.transform = transform 441 | self.img_ext = img_ext 442 | self.annot_ext = annot_ext 443 | 444 | filename_list = get_list_from_filenames(filename_path) 445 | 446 | self.X_train = filename_list 447 | self.y_train = filename_list 448 | self.image_mode = image_mode 449 | self.length = len(filename_list) 450 | 451 | def __getitem__(self, index): 452 | txt_path = os.path.join(self.data_dir, self.y_train[index] + self.annot_ext) 453 | img_name = self.X_train[index].split('_')[0] 454 | 455 | img = Image.open(os.path.join(self.data_dir, img_name + self.img_ext)) 456 | img = img.convert(self.image_mode) 457 | txt_path = os.path.join(self.data_dir, self.y_train[index] + self.annot_ext) 458 | 459 | # We get the pose in degrees 460 | annot = open(txt_path, 'r') 461 | line = annot.readline().split(' ') 462 | yaw, pitch, roll = [float(line[1]), float(line[2]), float(line[3])] 463 | 464 | # Crop the face loosely 465 | k = 0.32 466 | x1 = float(line[4]) 467 | y1 = float(line[5]) 468 | x2 = float(line[6]) 469 | y2 = float(line[7]) 470 | x1 -= 0.8 * k * abs(x2 - x1) 471 | y1 -= 2 * k * abs(y2 - y1) 472 | x2 += 0.8 * k * abs(x2 - x1) 473 | y2 += 1 * k * abs(y2 - y1) 474 | 475 | img = img.crop((int(x1), int(y1), int(x2), int(y2))) 476 | 477 | # Bin values 478 | bins = np.array(range(-99, 102, 3)) 479 | labels = torch.LongTensor(np.digitize([yaw, pitch, roll], bins) - 1) 480 | cont_labels = torch.FloatTensor([yaw, pitch, roll]) 481 | 482 | if self.transform is not None: 483 | img = self.transform(img) 484 | 485 | return img, labels, cont_labels, self.X_train[index] 486 | 487 | def __len__(self): 488 | # Around 200 489 | return self.length 490 | 491 | class BIWI(Dataset): 492 | def __init__(self, data_dir, filename_path, transform, img_ext='.png', annot_ext='.txt', image_mode='RGB'): 493 | self.data_dir = data_dir 494 | self.transform = transform 495 | self.img_ext = img_ext 496 | self.annot_ext = annot_ext 497 | 498 | filename_list = get_list_from_filenames(filename_path) 499 | 500 | self.X_train = filename_list 501 | self.y_train = filename_list 502 | self.image_mode = image_mode 503 | self.length = len(filename_list) 504 | 505 | def __getitem__(self, index): 506 | img = Image.open(os.path.join(self.data_dir, self.X_train[index] + '_rgb' + self.img_ext)) 507 | img = img.convert(self.image_mode) 508 | pose_path = os.path.join(self.data_dir, self.y_train[index] + '_pose' + self.annot_ext) 509 | 510 | y_train_list = self.y_train[index].split('/') 511 | bbox_path = os.path.join(self.data_dir, y_train_list[0] + '/dockerface-' + y_train_list[-1] + '_rgb' + self.annot_ext) 512 | 513 | # Load bounding box 514 | bbox = open(bbox_path, 'r') 515 | line = bbox.readline().split(' ') 516 | if len(line) < 4: 517 | x_min, y_min, x_max, y_max = 0, 0, img.size[0], 
img.size[1] 518 | else: 519 | x_min, y_min, x_max, y_max = [float(line[1]), float(line[2]), float(line[3]), float(line[4])] 520 | bbox.close() 521 | 522 | # Load pose in degrees 523 | pose_annot = open(pose_path, 'r') 524 | R = [] 525 | for line in pose_annot: 526 | line = line.strip('\n').split(' ') 527 | l = [] 528 | if line[0] != '': 529 | for nb in line: 530 | if nb == '': 531 | continue 532 | l.append(float(nb)) 533 | R.append(l) 534 | 535 | R = np.array(R) 536 | T = R[3,:] 537 | R = R[:3,:] 538 | pose_annot.close() 539 | 540 | R = np.transpose(R) 541 | 542 | roll = -np.arctan2(R[1][0], R[0][0]) * 180 / np.pi 543 | yaw = -np.arctan2(-R[2][0], np.sqrt(R[2][1] ** 2 + R[2][2] ** 2)) * 180 / np.pi 544 | pitch = np.arctan2(R[2][1], R[2][2]) * 180 / np.pi 545 | 546 | # Loosely crop face 547 | k = 0.35 548 | x_min -= 0.6 * k * abs(x_max - x_min) 549 | y_min -= k * abs(y_max - y_min) 550 | x_max += 0.6 * k * abs(x_max - x_min) 551 | y_max += 0.6 * k * abs(y_max - y_min) 552 | img = img.crop((int(x_min), int(y_min), int(x_max), int(y_max))) 553 | 554 | # Bin values 555 | bins = np.array(range(-99, 102, 3)) 556 | binned_pose = np.digitize([yaw, pitch, roll], bins) - 1 557 | 558 | labels = torch.LongTensor(binned_pose) 559 | cont_labels = torch.FloatTensor([yaw, pitch, roll]) 560 | 561 | if self.transform is not None: 562 | img = self.transform(img) 563 | 564 | return img, labels, cont_labels, self.X_train[index] 565 | 566 | def __len__(self): 567 | # 15,667 568 | return self.length 569 | -------------------------------------------------------------------------------- /pose/detect_image.py: -------------------------------------------------------------------------------- 1 | import sys, os, argparse 2 | 3 | import numpy as np 4 | import cv2 5 | import matplotlib.pyplot as plt 6 | 7 | import torch 8 | import torch.nn as nn 9 | from torch.autograd import Variable 10 | from torch.utils.data import DataLoader 11 | from torchvision import transforms 12 | import torch.backends.cudnn as cudnn 13 | import torchvision 14 | import torch.nn.functional as F 15 | from PIL import Image 16 | 17 | import datasets, hopenet, utils 18 | 19 | from skimage import io 20 | import dlib 21 | 22 | def parse_args(): 23 | """Parse input arguments.""" 24 | parser = argparse.ArgumentParser(description='Head pose estimation using the Hopenet network.') 25 | parser.add_argument('--snapshot', dest='snapshot', help='Path of model snapshot.', 26 | default='hopenet_robust_alpha1.pkl', type=str) 27 | parser.add_argument('--face_model', dest='face_model', help='Path of DLIB face detection model.', 28 | default='mmod_human_face_detector.dat', type=str) 29 | parser.add_argument('--image', dest='image_path', help='Path of image') 30 | # parser.add_argument('--output_string', dest='output_string', help='String appended to output file') 31 | args = parser.parse_args() 32 | return args 33 | 34 | if __name__ == '__main__': 35 | args = parse_args() 36 | 37 | # ResNet50 structure 38 | model = hopenet.Hopenet(torchvision.models.resnet.Bottleneck, [3, 4, 6, 3], 66) 39 | 40 | # Pretrained model 41 | saved_state_dict = torch.load(args.snapshot) 42 | model.load_state_dict(saved_state_dict) 43 | model = model.cuda() 44 | 45 | print('hopenet create success') 46 | 47 | # Dlib face detection model 48 | cnn_face_detector = dlib.cnn_face_detection_model_v1(args.face_model) 49 | 50 | print('dlib face detector create success') 51 | 52 | transformations = transforms.Compose([transforms.Scale(224), 53 | transforms.CenterCrop(224), transforms.ToTensor(), 54 
| transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]) 55 | 56 | model.eval() 57 | 58 | idx_tensor = [idx for idx in range(66)] 59 | idx_tensor = torch.FloatTensor(idx_tensor).cuda() 60 | 61 | image = cv2.imread(args.image_path) 62 | #image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) 63 | 64 | # Detect faces 65 | dets = cnn_face_detector(image, 1) 66 | 67 | for idx, det in enumerate(dets): 68 | # Get x_min, y_min, x_max, y_max, conf 69 | x_min = det.rect.left() 70 | y_min = det.rect.top() 71 | x_max = det.rect.right() 72 | y_max = det.rect.bottom() 73 | conf = det.confidence 74 | 75 | if conf > 1.0: 76 | bbox_width = abs(x_max - x_min) 77 | bbox_height = abs(y_max - y_min) 78 | x_min -= 2 * bbox_width / 4 79 | x_max += 2 * bbox_width / 4 80 | y_min -= 3 * bbox_height / 4 81 | y_max += bbox_height / 4 82 | x_min = max(x_min, 0); y_min = max(y_min, 0) 83 | x_max = min(image.shape[1], x_max) 84 | y_max = min(image.shape[0], y_max) 85 | 86 | # To int 87 | x_min, x_max, y_min, y_max = int(x_min), int(x_max), int(y_min), int(y_max) 88 | 89 | # Crop image 90 | img = image[y_min:y_max,x_min:x_max] 91 | img = Image.fromarray(img) 92 | 93 | # Transform 94 | img = transformations(img) 95 | img_shape = img.size() 96 | img = img.view(1, img_shape[0], img_shape[1], img_shape[2]) 97 | img = img.cuda() 98 | 99 | yaw, pitch, roll = model(img) 100 | 101 | yaw_predicted = F.softmax(yaw) 102 | pitch_predicted = F.softmax(pitch) 103 | roll_predicted = F.softmax(roll) 104 | # Get continuous predictions in degrees. 105 | yaw_predicted = torch.sum(yaw_predicted.data[0] * idx_tensor) * 3 - 99 106 | pitch_predicted = torch.sum(pitch_predicted.data[0] * idx_tensor) * 3 - 99 107 | roll_predicted = torch.sum(roll_predicted.data[0] * idx_tensor) * 3 - 99 108 | 109 | print('roll:', roll_predicted.item()) 110 | print('yaw:', yaw_predicted.item()) 111 | print('pitch:', pitch_predicted.item()) 112 | 113 | utils.draw_axis(image, yaw_predicted, pitch_predicted, roll_predicted, tdx = (x_min + x_max) / 2, tdy= (y_min + y_max) / 2, size = bbox_height/2) 114 | # Plot expanded bounding box 115 | # cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0,255,0), 1) 116 | 117 | cv2.imshow('res',image) 118 | cv2.waitKey() 119 | 120 | 121 | -------------------------------------------------------------------------------- /pose/hopenet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.autograd import Variable 4 | import math 5 | import torch.nn.functional as F 6 | 7 | class Hopenet(nn.Module): 8 | # Hopenet with 3 output layers for yaw, pitch and roll 9 | # Predicts Euler angles by binning and regression with the expected value 10 | def __init__(self, block, layers, num_bins): 11 | self.inplanes = 64 12 | super(Hopenet, self).__init__() 13 | self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, 14 | bias=False) 15 | self.bn1 = nn.BatchNorm2d(64) 16 | self.relu = nn.ReLU(inplace=True) 17 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 18 | self.layer1 = self._make_layer(block, 64, layers[0]) 19 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2) 20 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2) 21 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2) 22 | self.avgpool = nn.AvgPool2d(7) 23 | self.fc_yaw = nn.Linear(512 * block.expansion, num_bins) 24 | self.fc_pitch = nn.Linear(512 * block.expansion, num_bins) 25 | self.fc_roll = nn.Linear(512 * 
block.expansion, num_bins) 26 | 27 | # Vestigial layer from previous experiments 28 | self.fc_finetune = nn.Linear(512 * block.expansion + 3, 3) 29 | 30 | for m in self.modules(): 31 | if isinstance(m, nn.Conv2d): 32 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 33 | m.weight.data.normal_(0, math.sqrt(2. / n)) 34 | elif isinstance(m, nn.BatchNorm2d): 35 | m.weight.data.fill_(1) 36 | m.bias.data.zero_() 37 | 38 | def _make_layer(self, block, planes, blocks, stride=1): 39 | downsample = None 40 | if stride != 1 or self.inplanes != planes * block.expansion: 41 | downsample = nn.Sequential( 42 | nn.Conv2d(self.inplanes, planes * block.expansion, 43 | kernel_size=1, stride=stride, bias=False), 44 | nn.BatchNorm2d(planes * block.expansion), 45 | ) 46 | 47 | layers = [] 48 | layers.append(block(self.inplanes, planes, stride, downsample)) 49 | self.inplanes = planes * block.expansion 50 | for i in range(1, blocks): 51 | layers.append(block(self.inplanes, planes)) 52 | 53 | return nn.Sequential(*layers) 54 | 55 | def forward(self, x): 56 | x = self.conv1(x) 57 | x = self.bn1(x) 58 | x = self.relu(x) 59 | x = self.maxpool(x) 60 | 61 | x = self.layer1(x) 62 | x = self.layer2(x) 63 | x = self.layer3(x) 64 | x = self.layer4(x) 65 | 66 | x = self.avgpool(x) 67 | x = x.view(x.size(0), -1) 68 | pre_yaw = self.fc_yaw(x) 69 | pre_pitch = self.fc_pitch(x) 70 | pre_roll = self.fc_roll(x) 71 | 72 | return pre_yaw, pre_pitch, pre_roll 73 | 74 | class ResNet(nn.Module): 75 | # ResNet for regression of 3 Euler angles. 76 | def __init__(self, block, layers, num_classes=1000): 77 | self.inplanes = 64 78 | super(ResNet, self).__init__() 79 | self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, 80 | bias=False) 81 | self.bn1 = nn.BatchNorm2d(64) 82 | self.relu = nn.ReLU(inplace=True) 83 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 84 | self.layer1 = self._make_layer(block, 64, layers[0]) 85 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2) 86 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2) 87 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2) 88 | self.avgpool = nn.AvgPool2d(7) 89 | self.fc_angles = nn.Linear(512 * block.expansion, num_classes) 90 | 91 | for m in self.modules(): 92 | if isinstance(m, nn.Conv2d): 93 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 94 | m.weight.data.normal_(0, math.sqrt(2. 
/ n)) 95 | elif isinstance(m, nn.BatchNorm2d): 96 | m.weight.data.fill_(1) 97 | m.bias.data.zero_() 98 | 99 | def _make_layer(self, block, planes, blocks, stride=1): 100 | downsample = None 101 | if stride != 1 or self.inplanes != planes * block.expansion: 102 | downsample = nn.Sequential( 103 | nn.Conv2d(self.inplanes, planes * block.expansion, 104 | kernel_size=1, stride=stride, bias=False), 105 | nn.BatchNorm2d(planes * block.expansion), 106 | ) 107 | 108 | layers = [] 109 | layers.append(block(self.inplanes, planes, stride, downsample)) 110 | self.inplanes = planes * block.expansion 111 | for i in range(1, blocks): 112 | layers.append(block(self.inplanes, planes)) 113 | 114 | return nn.Sequential(*layers) 115 | 116 | def forward(self, x): 117 | x = self.conv1(x) 118 | x = self.bn1(x) 119 | x = self.relu(x) 120 | x = self.maxpool(x) 121 | 122 | x = self.layer1(x) 123 | x = self.layer2(x) 124 | x = self.layer3(x) 125 | x = self.layer4(x) 126 | 127 | x = self.avgpool(x) 128 | x = x.view(x.size(0), -1) 129 | x = self.fc_angles(x) 130 | return x 131 | 132 | class AlexNet(nn.Module): 133 | # AlexNet laid out as a Hopenet - classify Euler angles in bins and 134 | # regress the expected value. 135 | def __init__(self, num_bins): 136 | super(AlexNet, self).__init__() 137 | self.features = nn.Sequential( 138 | nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2), 139 | nn.ReLU(inplace=True), 140 | nn.MaxPool2d(kernel_size=3, stride=2), 141 | nn.Conv2d(64, 192, kernel_size=5, padding=2), 142 | nn.ReLU(inplace=True), 143 | nn.MaxPool2d(kernel_size=3, stride=2), 144 | nn.Conv2d(192, 384, kernel_size=3, padding=1), 145 | nn.ReLU(inplace=True), 146 | nn.Conv2d(384, 256, kernel_size=3, padding=1), 147 | nn.ReLU(inplace=True), 148 | nn.Conv2d(256, 256, kernel_size=3, padding=1), 149 | nn.ReLU(inplace=True), 150 | nn.MaxPool2d(kernel_size=3, stride=2), 151 | ) 152 | self.classifier = nn.Sequential( 153 | nn.Dropout(), 154 | nn.Linear(256 * 6 * 6, 4096), 155 | nn.ReLU(inplace=True), 156 | nn.Dropout(), 157 | nn.Linear(4096, 4096), 158 | nn.ReLU(inplace=True), 159 | ) 160 | self.fc_yaw = nn.Linear(4096, num_bins) 161 | self.fc_pitch = nn.Linear(4096, num_bins) 162 | self.fc_roll = nn.Linear(4096, num_bins) 163 | 164 | def forward(self, x): 165 | x = self.features(x) 166 | x = x.view(x.size(0), 256 * 6 * 6) 167 | x = self.classifier(x) 168 | yaw = self.fc_yaw(x) 169 | pitch = self.fc_pitch(x) 170 | roll = self.fc_roll(x) 171 | return yaw, pitch, roll 172 | -------------------------------------------------------------------------------- /pose/test_alexnet.py: -------------------------------------------------------------------------------- 1 | import sys, os, argparse 2 | 3 | import numpy as np 4 | import cv2 5 | import matplotlib.pyplot as plt 6 | 7 | import torch 8 | import torch.nn as nn 9 | from torch.autograd import Variable 10 | from torch.utils.data import DataLoader 11 | from torchvision import transforms 12 | import torch.backends.cudnn as cudnn 13 | import torchvision 14 | import torch.nn.functional as F 15 | 16 | import datasets, hopenet, utils 17 | 18 | def parse_args(): 19 | """Parse input arguments.""" 20 | parser = argparse.ArgumentParser(description='Head pose estimation using the Hopenet network.') 21 | parser.add_argument('--gpu', dest='gpu_id', help='GPU device id to use [0]', 22 | default=0, type=int) 23 | parser.add_argument('--data_dir', dest='data_dir', help='Directory path for data.', 24 | default='', type=str) 25 | parser.add_argument('--filename_list', dest='filename_list', 
help='Path to text file containing relative paths for every example.', 26 | default='', type=str) 27 | parser.add_argument('--snapshot', dest='snapshot', help='Name of model snapshot.', 28 | default='', type=str) 29 | parser.add_argument('--batch_size', dest='batch_size', help='Batch size.', 30 | default=1, type=int) 31 | parser.add_argument('--save_viz', dest='save_viz', help='Save images with pose cube.', 32 | default=False, type=bool) 33 | parser.add_argument('--dataset', dest='dataset', help='Dataset type.', default='AFLW2000', type=str) 34 | 35 | args = parser.parse_args() 36 | 37 | return args 38 | 39 | if __name__ == '__main__': 40 | args = parse_args() 41 | 42 | cudnn.enabled = True 43 | gpu = args.gpu_id 44 | snapshot_path = args.snapshot 45 | 46 | model = hopenet.AlexNet(66) 47 | 48 | print 'Loading snapshot.' 49 | # Load snapshot 50 | saved_state_dict = torch.load(snapshot_path) 51 | model.load_state_dict(saved_state_dict) 52 | 53 | print 'Loading data.' 54 | 55 | transformations = transforms.Compose([transforms.Scale(224), 56 | transforms.CenterCrop(224), transforms.ToTensor(), 57 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]) 58 | 59 | if args.dataset == 'Pose_300W_LP': 60 | pose_dataset = datasets.Pose_300W_LP(args.data_dir, args.filename_list, transformations) 61 | elif args.dataset == 'Pose_300W_LP_random_ds': 62 | pose_dataset = datasets.Pose_300W_LP_random_ds(args.data_dir, args.filename_list, transformations) 63 | elif args.dataset == 'AFLW2000': 64 | pose_dataset = datasets.AFLW2000(args.data_dir, args.filename_list, transformations) 65 | elif args.dataset == 'AFLW2000_ds': 66 | pose_dataset = datasets.AFLW2000_ds(args.data_dir, args.filename_list, transformations) 67 | elif args.dataset == 'BIWI': 68 | pose_dataset = datasets.BIWI(args.data_dir, args.filename_list, transformations) 69 | elif args.dataset == 'AFLW': 70 | pose_dataset = datasets.AFLW(args.data_dir, args.filename_list, transformations) 71 | elif args.dataset == 'AFLW_aug': 72 | pose_dataset = datasets.AFLW_aug(args.data_dir, args.filename_list, transformations) 73 | elif args.dataset == 'AFW': 74 | pose_dataset = datasets.AFW(args.data_dir, args.filename_list, transformations) 75 | else: 76 | print 'Error: not a valid dataset name' 77 | sys.exit() 78 | test_loader = torch.utils.data.DataLoader(dataset=pose_dataset, 79 | batch_size=args.batch_size, 80 | num_workers=2) 81 | 82 | model.cuda(gpu) 83 | 84 | print 'Ready to test network.' 85 | 86 | # Test the Model 87 | model.eval() # Change model to 'eval' mode (BN uses moving mean/var). 
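# The evaluation below converts the 66-way classification into a continuous angle by taking the
# expectation over bin indices: degrees = sum_i p_i * i * 3 - 99, where p_i is the softmax
# probability of bin i. A small self-contained sketch of that computation (names are
# illustrative, not part of this file):
#
#   import torch
#   import torch.nn.functional as F
#
#   def logits_to_degrees(logits):                      # logits: (batch, 66)
#       probs = F.softmax(logits, dim=1)
#       idx = torch.arange(66, dtype=torch.float32, device=logits.device)
#       return torch.sum(probs * idx, dim=1) * 3 - 99   # (batch,) angles in degrees
#
# For example, probability 0.5 on bin 33 and 0.5 on bin 34 decodes to 33.5 * 3 - 99 = 1.5 degrees.
# (Note: this script is written in Python 2 style -- bare print statements and xrange below.)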
88 | total = 0 89 | 90 | idx_tensor = [idx for idx in xrange(66)] 91 | idx_tensor = torch.FloatTensor(idx_tensor).cuda(gpu) 92 | 93 | yaw_error = .0 94 | pitch_error = .0 95 | roll_error = .0 96 | 97 | l1loss = torch.nn.L1Loss(size_average=False) 98 | 99 | for i, (images, labels, cont_labels, name) in enumerate(test_loader): 100 | images = Variable(images).cuda(gpu) 101 | total += cont_labels.size(0) 102 | label_yaw = cont_labels[:,0].float() 103 | label_pitch = cont_labels[:,1].float() 104 | label_roll = cont_labels[:,2].float() 105 | 106 | yaw, pitch, roll = model(images) 107 | 108 | # Binned predictions 109 | _, yaw_bpred = torch.max(yaw.data, 1) 110 | _, pitch_bpred = torch.max(pitch.data, 1) 111 | _, roll_bpred = torch.max(roll.data, 1) 112 | 113 | # Continuous predictions 114 | yaw_predicted = utils.softmax_temperature(yaw.data, 1) 115 | pitch_predicted = utils.softmax_temperature(pitch.data, 1) 116 | roll_predicted = utils.softmax_temperature(roll.data, 1) 117 | 118 | yaw_predicted = torch.sum(yaw_predicted * idx_tensor, 1).cpu() * 3 - 99 119 | pitch_predicted = torch.sum(pitch_predicted * idx_tensor, 1).cpu() * 3 - 99 120 | roll_predicted = torch.sum(roll_predicted * idx_tensor, 1).cpu() * 3 - 99 121 | 122 | # Mean absolute error 123 | yaw_error += torch.sum(torch.abs(yaw_predicted - label_yaw)) 124 | pitch_error += torch.sum(torch.abs(pitch_predicted - label_pitch)) 125 | roll_error += torch.sum(torch.abs(roll_predicted - label_roll)) 126 | 127 | # Save first image in batch with pose cube or axis. 128 | if args.save_viz: 129 | name = name[0] 130 | if args.dataset == 'BIWI': 131 | cv2_img = cv2.imread(os.path.join(args.data_dir, name + '_rgb.png')) 132 | else: 133 | cv2_img = cv2.imread(os.path.join(args.data_dir, name + '.jpg')) 134 | if args.batch_size == 1: 135 | error_string = 'y %.2f, p %.2f, r %.2f' % (torch.sum(torch.abs(yaw_predicted - label_yaw)), torch.sum(torch.abs(pitch_predicted - label_pitch)), torch.sum(torch.abs(roll_predicted - label_roll))) 136 | cv2.putText(cv2_img, error_string, (30, cv2_img.shape[0]- 30), fontFace=1, fontScale=1, color=(0,0,255), thickness=1) 137 | # utils.plot_pose_cube(cv2_img, yaw_predicted[0], pitch_predicted[0], roll_predicted[0], size=100) 138 | utils.draw_axis(cv2_img, yaw_predicted[0], pitch_predicted[0], roll_predicted[0], tdx = 200, tdy= 200, size=100) 139 | cv2.imwrite(os.path.join('output/images', name + '.jpg'), cv2_img) 140 | 141 | print('Test error in degrees of the model on the ' + str(total) + 142 | ' test images. 
Yaw: %.4f, Pitch: %.4f, Roll: %.4f' % (yaw_error / total, 143 | pitch_error / total, roll_error / total)) 144 | -------------------------------------------------------------------------------- /pose/test_hopenet.py: -------------------------------------------------------------------------------- 1 | import sys, os, argparse 2 | 3 | import numpy as np 4 | import cv2 5 | import matplotlib.pyplot as plt 6 | 7 | import torch 8 | import torch.nn as nn 9 | from torch.autograd import Variable 10 | from torch.utils.data import DataLoader 11 | from torchvision import transforms 12 | import torch.backends.cudnn as cudnn 13 | import torchvision 14 | import torch.nn.functional as F 15 | 16 | import datasets, hopenet, utils 17 | 18 | def parse_args(): 19 | """Parse input arguments.""" 20 | parser = argparse.ArgumentParser(description='Head pose estimation using the Hopenet network.') 21 | parser.add_argument('--gpu', dest='gpu_id', help='GPU device id to use [0]', 22 | default=0, type=int) 23 | parser.add_argument('--data_dir', dest='data_dir', help='Directory path for data.', 24 | default='', type=str) 25 | parser.add_argument('--filename_list', dest='filename_list', help='Path to text file containing relative paths for every example.', 26 | default='', type=str) 27 | parser.add_argument('--snapshot', dest='snapshot', help='Name of model snapshot.', 28 | default='', type=str) 29 | parser.add_argument('--batch_size', dest='batch_size', help='Batch size.', 30 | default=1, type=int) 31 | parser.add_argument('--save_viz', dest='save_viz', help='Save images with pose cube.', 32 | default=False, type=bool) 33 | parser.add_argument('--dataset', dest='dataset', help='Dataset type.', default='AFLW2000', type=str) 34 | 35 | args = parser.parse_args() 36 | 37 | return args 38 | 39 | if __name__ == '__main__': 40 | args = parse_args() 41 | 42 | cudnn.enabled = True 43 | gpu = args.gpu_id 44 | snapshot_path = args.snapshot 45 | 46 | # ResNet50 structure 47 | model = hopenet.Hopenet(torchvision.models.resnet.Bottleneck, [3, 4, 6, 3], 66) 48 | 49 | print('Loading snapshot.') 50 | # Load snapshot 51 | saved_state_dict = torch.load(snapshot_path) 52 | model.load_state_dict(saved_state_dict) 53 | 54 | print('Loading data.') 55 | 56 | transformations = transforms.Compose([transforms.Scale(224), 57 | transforms.CenterCrop(224), transforms.ToTensor(), 58 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]) # rgb模式 59 | 60 | if args.dataset == 'Pose_300W_LP': 61 | pose_dataset = datasets.Pose_300W_LP(args.data_dir, args.filename_list, transformations) 62 | elif args.dataset == 'Pose_300W_LP_random_ds': 63 | pose_dataset = datasets.Pose_300W_LP_random_ds(args.data_dir, args.filename_list, transformations) 64 | elif args.dataset == 'AFLW2000': 65 | pose_dataset = datasets.AFLW2000(args.data_dir, args.filename_list, transformations) 66 | elif args.dataset == 'AFLW2000_ds': 67 | pose_dataset = datasets.AFLW2000_ds(args.data_dir, args.filename_list, transformations) 68 | elif args.dataset == 'BIWI': 69 | pose_dataset = datasets.BIWI(args.data_dir, args.filename_list, transformations) 70 | elif args.dataset == 'AFLW': 71 | pose_dataset = datasets.AFLW(args.data_dir, args.filename_list, transformations) 72 | elif args.dataset == 'AFLW_aug': 73 | pose_dataset = datasets.AFLW_aug(args.data_dir, args.filename_list, transformations) 74 | elif args.dataset == 'AFW': 75 | pose_dataset = datasets.AFW(args.data_dir, args.filename_list, transformations) 76 | else: 77 | print('Error: not a valid dataset name') 78 
| sys.exit() 79 | test_loader = torch.utils.data.DataLoader(dataset=pose_dataset, 80 | batch_size=args.batch_size, 81 | num_workers=2) 82 | 83 | model.cuda(gpu) 84 | 85 | print('Ready to test network.') 86 | 87 | # Test the Model 88 | model.eval() # Change model to 'eval' mode (BN uses moving mean/var). 89 | total = 0 90 | 91 | idx_tensor = [idx for idx in range(66)] 92 | idx_tensor = torch.FloatTensor(idx_tensor).cuda(gpu) 93 | 94 | yaw_error = .0 95 | pitch_error = .0 96 | roll_error = .0 97 | 98 | l1loss = torch.nn.L1Loss(size_average=False) 99 | 100 | for i, (images, labels, cont_labels, name) in enumerate(test_loader): 101 | images = Variable(images).cuda(gpu) 102 | total += cont_labels.size(0) 103 | 104 | label_yaw = cont_labels[:,0].float() 105 | label_pitch = cont_labels[:,1].float() 106 | label_roll = cont_labels[:,2].float() 107 | 108 | yaw, pitch, roll = model(images) 109 | 110 | # Binned predictions 111 | _, yaw_bpred = torch.max(yaw.data, 1) 112 | _, pitch_bpred = torch.max(pitch.data, 1) 113 | _, roll_bpred = torch.max(roll.data, 1) 114 | 115 | # Continuous predictions 116 | yaw_predicted = utils.softmax_temperature(yaw.data, 1) 117 | pitch_predicted = utils.softmax_temperature(pitch.data, 1) 118 | roll_predicted = utils.softmax_temperature(roll.data, 1) 119 | 120 | yaw_predicted = torch.sum(yaw_predicted * idx_tensor, 1).cpu() * 3 - 99 121 | pitch_predicted = torch.sum(pitch_predicted * idx_tensor, 1).cpu() * 3 - 99 122 | roll_predicted = torch.sum(roll_predicted * idx_tensor, 1).cpu() * 3 - 99 123 | 124 | # Mean absolute error 125 | yaw_error += torch.sum(torch.abs(yaw_predicted - label_yaw)) 126 | pitch_error += torch.sum(torch.abs(pitch_predicted - label_pitch)) 127 | roll_error += torch.sum(torch.abs(roll_predicted - label_roll)) 128 | 129 | # Save first image in batch with pose cube or axis. 130 | if args.save_viz: 131 | name = name[0] 132 | if args.dataset == 'BIWI': 133 | cv2_img = cv2.imread(os.path.join(args.data_dir, name + '_rgb.png')) 134 | else: 135 | cv2_img = cv2.imread(os.path.join(args.data_dir, name + '.jpg')) 136 | if args.batch_size == 1: 137 | error_string = 'y %.2f, p %.2f, r %.2f' % (torch.sum(torch.abs(yaw_predicted - label_yaw)), torch.sum(torch.abs(pitch_predicted - label_pitch)), torch.sum(torch.abs(roll_predicted - label_roll))) 138 | cv2.putText(cv2_img, error_string, (30, cv2_img.shape[0]- 30), fontFace=1, fontScale=1, color=(0,0,255), thickness=2) 139 | # utils.plot_pose_cube(cv2_img, yaw_predicted[0], pitch_predicted[0], roll_predicted[0], size=100) 140 | utils.draw_axis(cv2_img, yaw_predicted[0], pitch_predicted[0], roll_predicted[0], tdx = 200, tdy= 200, size=100) 141 | cv2.imwrite(os.path.join('output/images', name + '.jpg'), cv2_img) 142 | 143 | print('Test error in degrees of the model on the ' + str(total) + 144 | ' test images. 
Yaw: %.4f, Pitch: %.4f, Roll: %.4f' % (yaw_error / total, 145 | pitch_error / total, roll_error / total)) 146 | -------------------------------------------------------------------------------- /pose/test_on_video.py: -------------------------------------------------------------------------------- 1 | import sys, os, argparse 2 | 3 | import numpy as np 4 | import cv2 5 | import matplotlib.pyplot as plt 6 | 7 | import torch 8 | import torch.nn as nn 9 | from torch.autograd import Variable 10 | from torch.utils.data import DataLoader 11 | from torchvision import transforms 12 | import torch.backends.cudnn as cudnn 13 | import torchvision 14 | import torch.nn.functional as F 15 | from PIL import Image 16 | 17 | import datasets, hopenet, utils 18 | 19 | def parse_args(): 20 | """Parse input arguments.""" 21 | parser = argparse.ArgumentParser(description='Head pose estimation using the Hopenet network.') 22 | parser.add_argument('--gpu', dest='gpu_id', help='GPU device id to use [0]', 23 | default=0, type=int) 24 | parser.add_argument('--snapshot', dest='snapshot', help='Path of model snapshot.', 25 | default='', type=str) 26 | parser.add_argument('--video', dest='video_path', help='Path of video') 27 | parser.add_argument('--bboxes', dest='bboxes', help='Bounding box annotations of frames') 28 | parser.add_argument('--output_string', dest='output_string', help='String appended to output file') 29 | parser.add_argument('--n_frames', dest='n_frames', help='Number of frames', type=int) 30 | parser.add_argument('--fps', dest='fps', help='Frames per second of source video', type=float, default=30.) 31 | args = parser.parse_args() 32 | return args 33 | 34 | if __name__ == '__main__': 35 | args = parse_args() 36 | 37 | cudnn.enabled = True 38 | 39 | batch_size = 1 40 | gpu = args.gpu_id 41 | snapshot_path = args.snapshot 42 | out_dir = 'output/video' 43 | video_path = args.video_path 44 | 45 | if not os.path.exists(out_dir): 46 | os.makedirs(out_dir) 47 | 48 | if not os.path.exists(args.video_path): 49 | sys.exit('Video does not exist') 50 | 51 | # ResNet50 structure 52 | model = hopenet.Hopenet(torchvision.models.resnet.Bottleneck, [3, 4, 6, 3], 66) 53 | 54 | # print 'Loading snapshot.' 55 | # Load snapshot 56 | saved_state_dict = torch.load(snapshot_path) 57 | model.load_state_dict(saved_state_dict) 58 | 59 | # print 'Loading data.' 60 | 61 | transformations = transforms.Compose([transforms.Scale(224), 62 | transforms.CenterCrop(224), transforms.ToTensor(), 63 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]) 64 | 65 | model.cuda(gpu) 66 | 67 | # print 'Ready to test network.' 68 | 69 | # Test the Model 70 | model.eval() # Change model to 'eval' mode (BN uses moving mean/var). 
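# The --bboxes file consumed below appears to be a plain-text file with one detection per line,
# "frame_number x_min y_min x_max y_max ...", sorted by frame number and possibly with several
# lines for the same frame; this is inferred from the parsing loop further down, the tool that
# produces the file is not shown here.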
71 | total = 0 72 | 73 | idx_tensor = [idx for idx in range(66)] 74 | idx_tensor = torch.FloatTensor(idx_tensor).cuda(gpu) 75 | 76 | video = cv2.VideoCapture(video_path) 77 | 78 | # New cv2 79 | width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH)) # float 80 | height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT)) # float 81 | 82 | # Define the codec and create VideoWriter object 83 | fourcc = cv2.VideoWriter_fourcc(*'MJPG') 84 | out = cv2.VideoWriter('output/video/output-%s.avi' % args.output_string, fourcc, args.fps, (width, height)) 85 | 86 | # # Old cv2 87 | # width = int(video.get(cv2.cv.CV_CAP_PROP_FRAME_WIDTH)) # float 88 | # height = int(video.get(cv2.cv.CV_CAP_PROP_FRAME_HEIGHT)) # float 89 | # 90 | # # Define the codec and create VideoWriter object 91 | # fourcc = cv2.cv.CV_FOURCC(*'MJPG') 92 | # out = cv2.VideoWriter('output/video/output-%s.avi' % args.output_string, fourcc, 30.0, (width, height)) 93 | 94 | txt_out = open('output/video/output-%s.txt' % args.output_string, 'w') 95 | 96 | frame_num = 1 97 | 98 | with open(args.bboxes, 'r') as f: 99 | bbox_line_list = f.read().splitlines() 100 | 101 | idx = 0 102 | while idx < len(bbox_line_list): 103 | line = bbox_line_list[idx] 104 | line = line.strip('\n') 105 | line = line.split(' ') 106 | det_frame_num = int(line[0]) 107 | 108 | # print frame_num 109 | 110 | # Stop at a certain frame number 111 | if frame_num > args.n_frames: 112 | break 113 | 114 | # Save all frames as they are if they don't have bbox annotation. 115 | while frame_num < det_frame_num: 116 | ret, frame = video.read() 117 | if ret == False: 118 | out.release() 119 | video.release() 120 | txt_out.close() 121 | sys.exit(0) 122 | # out.write(frame) 123 | frame_num += 1 124 | 125 | # Start processing frame with bounding box 126 | ret,frame = video.read() 127 | if ret == False: 128 | break 129 | cv2_frame = cv2.cvtColor(frame,cv2.COLOR_BGR2RGB) 130 | 131 | while True: 132 | x_min, y_min, x_max, y_max = int(float(line[1])), int(float(line[2])), int(float(line[3])), int(float(line[4])) 133 | 134 | bbox_width = abs(x_max - x_min) 135 | bbox_height = abs(y_max - y_min) 136 | # x_min -= 3 * bbox_width / 4 137 | # x_max += 3 * bbox_width / 4 138 | # y_min -= 3 * bbox_height / 4 139 | # y_max += bbox_height / 4 140 | x_min -= 50 141 | x_max += 50 142 | y_min -= 50 143 | y_max += 30 144 | x_min = max(x_min, 0) 145 | y_min = max(y_min, 0) 146 | x_max = min(frame.shape[1], x_max) 147 | y_max = min(frame.shape[0], y_max) 148 | # Crop face loosely 149 | img = cv2_frame[y_min:y_max,x_min:x_max] 150 | img = Image.fromarray(img) 151 | 152 | # Transform 153 | img = transformations(img) 154 | img_shape = img.size() 155 | img = img.view(1, img_shape[0], img_shape[1], img_shape[2]) 156 | img = Variable(img).cuda(gpu) 157 | 158 | yaw, pitch, roll = model(img) 159 | 160 | yaw_predicted = F.softmax(yaw) 161 | pitch_predicted = F.softmax(pitch) 162 | roll_predicted = F.softmax(roll) 163 | # Get continuous predictions in degrees. 
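# (Same expectation trick as in the other test scripts: sum_i p_i * i * 3 - 99 over the 66 bins.
# For instance, a softmax concentrated on bin 40 decodes to roughly 40 * 3 - 99 = 21 degrees.)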
164 | yaw_predicted = torch.sum(yaw_predicted.data[0] * idx_tensor) * 3 - 99 165 | pitch_predicted = torch.sum(pitch_predicted.data[0] * idx_tensor) * 3 - 99 166 | roll_predicted = torch.sum(roll_predicted.data[0] * idx_tensor) * 3 - 99 167 | 168 | # Print new frame with cube and axis 169 | txt_out.write(str(frame_num) + ' %f %f %f\n' % (yaw_predicted, pitch_predicted, roll_predicted)) 170 | # utils.plot_pose_cube(frame, yaw_predicted, pitch_predicted, roll_predicted, (x_min + x_max) / 2, (y_min + y_max) / 2, size = bbox_width) 171 | utils.draw_axis(frame, yaw_predicted, pitch_predicted, roll_predicted, tdx = (x_min + x_max) / 2, tdy= (y_min + y_max) / 2, size = bbox_height/2) 172 | # Plot expanded bounding box 173 | # cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0,255,0), 1) 174 | 175 | # Peek next frame detection 176 | next_frame_num = int(bbox_line_list[idx+1].strip('\n').split(' ')[0]) 177 | # print 'next_frame_num ', next_frame_num 178 | if next_frame_num == det_frame_num: 179 | idx += 1 180 | line = bbox_line_list[idx].strip('\n').split(' ') 181 | det_frame_num = int(line[0]) 182 | else: 183 | break 184 | 185 | idx += 1 186 | out.write(frame) 187 | frame_num += 1 188 | 189 | out.release() 190 | video.release() 191 | txt_out.close() 192 | -------------------------------------------------------------------------------- /pose/test_on_video_dlib.py: -------------------------------------------------------------------------------- 1 | import sys, os, argparse 2 | 3 | import numpy as np 4 | import cv2 5 | import matplotlib.pyplot as plt 6 | 7 | import torch 8 | import torch.nn as nn 9 | from torch.autograd import Variable 10 | from torch.utils.data import DataLoader 11 | from torchvision import transforms 12 | import torch.backends.cudnn as cudnn 13 | import torchvision 14 | import torch.nn.functional as F 15 | from PIL import Image 16 | 17 | import datasets, hopenet, utils 18 | 19 | from skimage import io 20 | import dlib 21 | 22 | def parse_args(): 23 | """Parse input arguments.""" 24 | parser = argparse.ArgumentParser(description='Head pose estimation using the Hopenet network.') 25 | parser.add_argument('--gpu', dest='gpu_id', help='GPU device id to use [0]', 26 | default=0, type=int) 27 | parser.add_argument('--snapshot', dest='snapshot', help='Path of model snapshot.', 28 | default='', type=str) 29 | parser.add_argument('--face_model', dest='face_model', help='Path of DLIB face detection model.', 30 | default='', type=str) 31 | parser.add_argument('--video', dest='video_path', help='Path of video') 32 | parser.add_argument('--output_string', dest='output_string', help='String appended to output file') 33 | parser.add_argument('--n_frames', dest='n_frames', help='Number of frames', type=int) 34 | parser.add_argument('--fps', dest='fps', help='Frames per second of source video', type=float, default=30.) 
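# A hypothetical invocation (the paths below are placeholders, not files shipped with this repo):
#   python test_on_video_dlib.py --snapshot hopenet_robust_alpha1.pkl \
#       --face_model mmod_human_face_detector.dat --video input.mp4 \
#       --output_string demo --n_frames 300 --fps 30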
35 | args = parser.parse_args() 36 | return args 37 | 38 | if __name__ == '__main__': 39 | args = parse_args() 40 | 41 | cudnn.enabled = True 42 | 43 | batch_size = 1 44 | gpu = args.gpu_id 45 | snapshot_path = args.snapshot 46 | out_dir = 'output/video' 47 | video_path = args.video_path 48 | 49 | if not os.path.exists(out_dir): 50 | os.makedirs(out_dir) 51 | 52 | if not os.path.exists(args.video_path): 53 | sys.exit('Video does not exist') 54 | 55 | # ResNet50 structure 56 | model = hopenet.Hopenet(torchvision.models.resnet.Bottleneck, [3, 4, 6, 3], 66) 57 | 58 | # Dlib face detection model 59 | cnn_face_detector = dlib.cnn_face_detection_model_v1(args.face_model) 60 | 61 | print 'Loading snapshot.' 62 | # Load snapshot 63 | saved_state_dict = torch.load(snapshot_path) 64 | model.load_state_dict(saved_state_dict) 65 | 66 | print 'Loading data.' 67 | 68 | transformations = transforms.Compose([transforms.Scale(224), 69 | transforms.CenterCrop(224), transforms.ToTensor(), 70 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]) 71 | 72 | model.cuda(gpu) 73 | 74 | print 'Ready to test network.' 75 | 76 | # Test the Model 77 | model.eval() # Change model to 'eval' mode (BN uses moving mean/var). 78 | total = 0 79 | 80 | idx_tensor = [idx for idx in xrange(66)] 81 | idx_tensor = torch.FloatTensor(idx_tensor).cuda(gpu) 82 | 83 | video = cv2.VideoCapture(video_path) 84 | 85 | # New cv2 86 | width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH)) # float 87 | height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT)) # float 88 | 89 | # Define the codec and create VideoWriter object 90 | fourcc = cv2.VideoWriter_fourcc(*'MJPG') 91 | out = cv2.VideoWriter('output/video/output-%s.avi' % args.output_string, fourcc, args.fps, (width, height)) 92 | 93 | # # Old cv2 94 | # width = int(video.get(cv2.cv.CV_CAP_PROP_FRAME_WIDTH)) # float 95 | # height = int(video.get(cv2.cv.CV_CAP_PROP_FRAME_HEIGHT)) # float 96 | # 97 | # # Define the codec and create VideoWriter object 98 | # fourcc = cv2.cv.CV_FOURCC(*'MJPG') 99 | # out = cv2.VideoWriter('output/video/output-%s.avi' % args.output_string, fourcc, 30.0, (width, height)) 100 | 101 | txt_out = open('output/video/output-%s.txt' % args.output_string, 'w') 102 | 103 | frame_num = 1 104 | 105 | while frame_num <= args.n_frames: 106 | print frame_num 107 | 108 | ret,frame = video.read() 109 | if ret == False: 110 | break 111 | 112 | cv2_frame = cv2.cvtColor(frame,cv2.COLOR_BGR2RGB) 113 | 114 | # Dlib detect 115 | dets = cnn_face_detector(cv2_frame, 1) 116 | 117 | for idx, det in enumerate(dets): 118 | # Get x_min, y_min, x_max, y_max, conf 119 | x_min = det.rect.left() 120 | y_min = det.rect.top() 121 | x_max = det.rect.right() 122 | y_max = det.rect.bottom() 123 | conf = det.confidence 124 | 125 | if conf > 1.0: 126 | bbox_width = abs(x_max - x_min) 127 | bbox_height = abs(y_max - y_min) 128 | x_min -= 2 * bbox_width / 4 129 | x_max += 2 * bbox_width / 4 130 | y_min -= 3 * bbox_height / 4 131 | y_max += bbox_height / 4 132 | x_min = max(x_min, 0); y_min = max(y_min, 0) 133 | x_max = min(frame.shape[1], x_max); y_max = min(frame.shape[0], y_max) 134 | # Crop image 135 | img = cv2_frame[y_min:y_max,x_min:x_max] 136 | img = Image.fromarray(img) 137 | 138 | # Transform 139 | img = transformations(img) 140 | img_shape = img.size() 141 | img = img.view(1, img_shape[0], img_shape[1], img_shape[2]) 142 | img = Variable(img).cuda(gpu) 143 | 144 | yaw, pitch, roll = model(img) 145 | 146 | yaw_predicted = F.softmax(yaw) 147 | pitch_predicted = F.softmax(pitch) 
148 | roll_predicted = F.softmax(roll) 149 | # Get continuous predictions in degrees. 150 | yaw_predicted = torch.sum(yaw_predicted.data[0] * idx_tensor) * 3 - 99 151 | pitch_predicted = torch.sum(pitch_predicted.data[0] * idx_tensor) * 3 - 99 152 | roll_predicted = torch.sum(roll_predicted.data[0] * idx_tensor) * 3 - 99 153 | 154 | # Print new frame with cube and axis 155 | txt_out.write(str(frame_num) + ' %f %f %f\n' % (yaw_predicted, pitch_predicted, roll_predicted)) 156 | # utils.plot_pose_cube(frame, yaw_predicted, pitch_predicted, roll_predicted, (x_min + x_max) / 2, (y_min + y_max) / 2, size = bbox_width) 157 | utils.draw_axis(frame, yaw_predicted, pitch_predicted, roll_predicted, tdx = (x_min + x_max) / 2, tdy= (y_min + y_max) / 2, size = bbox_height/2) 158 | # Plot expanded bounding box 159 | # cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0,255,0), 1) 160 | 161 | out.write(frame) 162 | frame_num += 1 163 | 164 | out.release() 165 | video.release() 166 | -------------------------------------------------------------------------------- /pose/test_on_video_dockerface.py: -------------------------------------------------------------------------------- 1 | import sys, os, argparse 2 | 3 | import numpy as np 4 | import cv2 5 | import matplotlib.pyplot as plt 6 | 7 | import torch 8 | import torch.nn as nn 9 | from torch.autograd import Variable 10 | from torch.utils.data import DataLoader 11 | from torchvision import transforms 12 | import torch.backends.cudnn as cudnn 13 | import torchvision 14 | import torch.nn.functional as F 15 | from PIL import Image 16 | 17 | import datasets, hopenet, utils 18 | 19 | def parse_args(): 20 | """Parse input arguments.""" 21 | parser = argparse.ArgumentParser(description='Head pose estimation using the Hopenet network.') 22 | parser.add_argument('--gpu', dest='gpu_id', help='GPU device id to use [0]', 23 | default=0, type=int) 24 | parser.add_argument('--snapshot', dest='snapshot', help='Path of model snapshot.', 25 | default='', type=str) 26 | parser.add_argument('--video', dest='video_path', help='Path of video') 27 | parser.add_argument('--bboxes', dest='bboxes', help='Bounding box annotations of frames') 28 | parser.add_argument('--output_string', dest='output_string', help='String appended to output file') 29 | parser.add_argument('--n_frames', dest='n_frames', help='Number of frames', type=int) 30 | parser.add_argument('--fps', dest='fps', help='Frames per second of source video', type=float, default=30.) 31 | args = parser.parse_args() 32 | return args 33 | 34 | if __name__ == '__main__': 35 | args = parse_args() 36 | 37 | cudnn.enabled = True 38 | 39 | batch_size = 1 40 | gpu = args.gpu_id 41 | snapshot_path = args.snapshot 42 | out_dir = 'output/video' 43 | video_path = args.video_path 44 | 45 | if not os.path.exists(out_dir): 46 | os.makedirs(out_dir) 47 | 48 | if not os.path.exists(args.video_path): 49 | sys.exit('Video does not exist') 50 | 51 | # ResNet50 structure 52 | model = hopenet.Hopenet(torchvision.models.resnet.Bottleneck, [3, 4, 6, 3], 66) 53 | 54 | print 'Loading snapshot.' 55 | # Load snapshot 56 | saved_state_dict = torch.load(snapshot_path) 57 | model.load_state_dict(saved_state_dict) 58 | 59 | print 'Loading data.' 60 | 61 | transformations = transforms.Compose([transforms.Scale(224), 62 | transforms.CenterCrop(224), transforms.ToTensor(), 63 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]) 64 | 65 | model.cuda(gpu) 66 | 67 | print 'Ready to test network.' 
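# This variant expects dockerface-style annotations: besides the frame number and box
# coordinates, each line carries a detection confidence (line[5] in the loop further down),
# and only detections with confidence above 0.98 are cropped and passed through the head-pose
# network. Like test_on_video_dlib.py, the script is written in Python 2 style.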
68 | 69 | # Test the Model 70 | model.eval() # Change model to 'eval' mode (BN uses moving mean/var). 71 | total = 0 72 | 73 | idx_tensor = [idx for idx in xrange(66)] 74 | idx_tensor = torch.FloatTensor(idx_tensor).cuda(gpu) 75 | 76 | video = cv2.VideoCapture(video_path) 77 | 78 | # New cv2 79 | width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH)) # float 80 | height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT)) # float 81 | 82 | # Define the codec and create VideoWriter object 83 | fourcc = cv2.VideoWriter_fourcc(*'MJPG') 84 | out = cv2.VideoWriter('output/video/output-%s.avi' % args.output_string, fourcc, args.fps, (width, height)) 85 | 86 | # # Old cv2 87 | # width = int(video.get(cv2.cv.CV_CAP_PROP_FRAME_WIDTH)) # float 88 | # height = int(video.get(cv2.cv.CV_CAP_PROP_FRAME_HEIGHT)) # float 89 | # 90 | # # Define the codec and create VideoWriter object 91 | # fourcc = cv2.cv.CV_FOURCC(*'MJPG') 92 | # out = cv2.VideoWriter('output/video/output-%s.avi' % args.output_string, fourcc, 30.0, (width, height)) 93 | 94 | txt_out = open('output/video/output-%s.txt' % args.output_string, 'w') 95 | 96 | frame_num = 1 97 | 98 | with open(args.bboxes, 'r') as f: 99 | bbox_line_list = f.read().splitlines() 100 | 101 | idx = 0 102 | while idx < len(bbox_line_list): 103 | line = bbox_line_list[idx] 104 | line = line.strip('\n') 105 | line = line.split(' ') 106 | det_frame_num = int(line[0]) 107 | 108 | print frame_num 109 | 110 | # Stop at a certain frame number 111 | if frame_num > args.n_frames: 112 | break 113 | 114 | # Save all frames as they are if they don't have bbox annotation. 115 | while frame_num < det_frame_num: 116 | ret, frame = video.read() 117 | if ret == False: 118 | out.release() 119 | video.release() 120 | txt_out.close() 121 | sys.exit(0) 122 | out.write(frame) 123 | frame_num += 1 124 | 125 | # Start processing frame with bounding box 126 | ret,frame = video.read() 127 | if ret == False: 128 | break 129 | cv2_frame = cv2.cvtColor(frame,cv2.COLOR_BGR2RGB) 130 | 131 | while True: 132 | x_min, y_min, x_max, y_max, conf = int(float(line[1])), int(float(line[2])), int(float(line[3])), int(float(line[4])), float(line[5]) 133 | 134 | if conf > 0.98: 135 | bbox_width = abs(x_max - x_min) 136 | bbox_height = abs(y_max - y_min) 137 | # x_min -= 3 * bbox_width / 4 138 | # x_max += 3 * bbox_width / 4 139 | # y_min -= 3 * bbox_height / 4 140 | # y_max += bbox_height / 4 141 | x_min -= 50 142 | x_max += 50 143 | y_min -= 50 144 | y_max += 30 145 | x_min = max(x_min, 0) 146 | y_min = max(y_min, 0) 147 | x_max = min(frame.shape[1], x_max) 148 | y_max = min(frame.shape[0], y_max) 149 | # Crop image 150 | img = cv2_frame[y_min:y_max,x_min:x_max] 151 | img = Image.fromarray(img) 152 | 153 | # Transform 154 | img = transformations(img) 155 | img_shape = img.size() 156 | img = img.view(1, img_shape[0], img_shape[1], img_shape[2]) 157 | img = Variable(img).cuda(gpu) 158 | 159 | yaw, pitch, roll = model(img) 160 | 161 | yaw_predicted = F.softmax(yaw) 162 | pitch_predicted = F.softmax(pitch) 163 | roll_predicted = F.softmax(roll) 164 | # Get continuous predictions in degrees. 
165 | yaw_predicted = torch.sum(yaw_predicted.data[0] * idx_tensor) * 3 - 99 166 | pitch_predicted = torch.sum(pitch_predicted.data[0] * idx_tensor) * 3 - 99 167 | roll_predicted = torch.sum(roll_predicted.data[0] * idx_tensor) * 3 - 99 168 | 169 | # Print new frame with cube and axis 170 | txt_out.write(str(frame_num) + ' %f %f %f\n' % (yaw_predicted, pitch_predicted, roll_predicted)) 171 | # utils.plot_pose_cube(frame, yaw_predicted, pitch_predicted, roll_predicted, (x_min + x_max) / 2, (y_min + y_max) / 2, size = bbox_width) 172 | utils.draw_axis(frame, yaw_predicted, pitch_predicted, roll_predicted, tdx = (x_min + x_max) / 2, tdy= (y_min + y_max) / 2, size = bbox_height/2) 173 | # Plot expanded bounding box 174 | # cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0,255,0), 1) 175 | 176 | # Peek next frame detection 177 | next_frame_num = int(bbox_line_list[idx+1].strip('\n').split(' ')[0]) 178 | # print 'next_frame_num ', next_frame_num 179 | if next_frame_num == det_frame_num: 180 | idx += 1 181 | line = bbox_line_list[idx].strip('\n').split(' ') 182 | det_frame_num = int(line[0]) 183 | else: 184 | break 185 | 186 | idx += 1 187 | out.write(frame) 188 | frame_num += 1 189 | 190 | out.release() 191 | video.release() 192 | txt_out.close() 193 | -------------------------------------------------------------------------------- /pose/test_resnet50_regression.py: -------------------------------------------------------------------------------- 1 | import sys, os, argparse 2 | 3 | import numpy as np 4 | import cv2 5 | import matplotlib.pyplot as plt 6 | 7 | import torch 8 | import torch.nn as nn 9 | from torch.autograd import Variable 10 | from torch.utils.data import DataLoader 11 | from torchvision import transforms 12 | import torch.backends.cudnn as cudnn 13 | import torchvision 14 | import torch.nn.functional as F 15 | 16 | import datasets, hopenet, utils 17 | 18 | def parse_args(): 19 | """Parse input arguments.""" 20 | parser = argparse.ArgumentParser(description='Head pose estimation using the Hopenet network.') 21 | parser.add_argument('--gpu', dest='gpu_id', help='GPU device id to use [0]', 22 | default=0, type=int) 23 | parser.add_argument('--data_dir', dest='data_dir', help='Directory path for data.', 24 | default='', type=str) 25 | parser.add_argument('--filename_list', dest='filename_list', help='Path to text file containing relative paths for every example.', 26 | default='', type=str) 27 | parser.add_argument('--snapshot', dest='snapshot', help='Name of model snapshot.', 28 | default='', type=str) 29 | parser.add_argument('--batch_size', dest='batch_size', help='Batch size.', 30 | default=1, type=int) 31 | parser.add_argument('--save_viz', dest='save_viz', help='Save images with pose cube.', 32 | default=False, type=bool) 33 | parser.add_argument('--dataset', dest='dataset', help='Dataset type.', default='AFLW2000', type=str) 34 | 35 | args = parser.parse_args() 36 | 37 | return args 38 | 39 | if __name__ == '__main__': 40 | args = parse_args() 41 | 42 | cudnn.enabled = True 43 | gpu = args.gpu_id 44 | snapshot_path = args.snapshot 45 | 46 | model = hopenet.ResNet(torchvision.models.resnet.Bottleneck, [3, 4, 6, 3], 3) 47 | 48 | print ('Loading snapshot.') 49 | # Load snapshot 50 | saved_state_dict = torch.load(snapshot_path) 51 | model.load_state_dict(saved_state_dict) 52 | 53 | print ('Loading data.') 54 | 55 | transformations = transforms.Compose([transforms.Scale(224), 56 | transforms.CenterCrop(224), transforms.ToTensor(), 57 | transforms.Normalize(mean=[0.485, 0.456, 
0.406], std=[0.229, 0.224, 0.225])]) 58 | 59 | if args.dataset == 'Pose_300W_LP': 60 | pose_dataset = datasets.Pose_300W_LP(args.data_dir, args.filename_list, transformations) 61 | elif args.dataset == 'Pose_300W_LP_random_ds': 62 | pose_dataset = datasets.Pose_300W_LP_random_ds(args.data_dir, args.filename_list, transformations) 63 | elif args.dataset == 'AFLW2000': 64 | pose_dataset = datasets.AFLW2000(args.data_dir, args.filename_list, transformations) 65 | elif args.dataset == 'AFLW2000_ds': 66 | pose_dataset = datasets.AFLW2000_ds(args.data_dir, args.filename_list, transformations) 67 | elif args.dataset == 'BIWI': 68 | pose_dataset = datasets.BIWI(args.data_dir, args.filename_list, transformations) 69 | elif args.dataset == 'AFLW': 70 | pose_dataset = datasets.AFLW(args.data_dir, args.filename_list, transformations) 71 | elif args.dataset == 'AFLW_aug': 72 | pose_dataset = datasets.AFLW_aug(args.data_dir, args.filename_list, transformations) 73 | elif args.dataset == 'AFW': 74 | pose_dataset = datasets.AFW(args.data_dir, args.filename_list, transformations) 75 | else: 76 | print ('Error: not a valid dataset name') 77 | sys.exit() 78 | test_loader = torch.utils.data.DataLoader(dataset=pose_dataset, 79 | batch_size=args.batch_size, 80 | num_workers=2) 81 | 82 | model.cuda(gpu) 83 | 84 | print ('Ready to test network.') 85 | 86 | # Test the Model 87 | model.eval() # Change model to 'eval' mode (BN uses moving mean/var). 88 | total = 0 89 | 90 | yaw_error = .0 91 | pitch_error = .0 92 | roll_error = .0 93 | 94 | l1loss = torch.nn.L1Loss(size_average=False) 95 | 96 | for i, (images, labels, cont_labels, name) in enumerate(test_loader): 97 | images = Variable(images).cuda(gpu) 98 | total += cont_labels.size(0) 99 | label_yaw = cont_labels[:,0].float() 100 | label_pitch = cont_labels[:,1].float() 101 | label_roll = cont_labels[:,2].float() 102 | 103 | angles = model(images) 104 | yaw_predicted = angles[:,0].data.cpu() 105 | pitch_predicted = angles[:,1].data.cpu() 106 | roll_predicted = angles[:,2].data.cpu() 107 | 108 | # Mean absolute error 109 | yaw_error += torch.sum(torch.abs(yaw_predicted - label_yaw)) 110 | pitch_error += torch.sum(torch.abs(pitch_predicted - label_pitch)) 111 | roll_error += torch.sum(torch.abs(roll_predicted - label_roll)) 112 | 113 | # Save first image in batch with pose cube or axis. 114 | if args.save_viz: 115 | name = name[0] 116 | if args.dataset == 'BIWI': 117 | cv2_img = cv2.imread(os.path.join(args.data_dir, name + '_rgb.png')) 118 | else: 119 | cv2_img = cv2.imread(os.path.join(args.data_dir, name + '.jpg')) 120 | if args.batch_size == 1: 121 | error_string = 'y %.2f, p %.2f, r %.2f' % (torch.sum(torch.abs(yaw_predicted - label_yaw)), torch.sum(torch.abs(pitch_predicted - label_pitch)), torch.sum(torch.abs(roll_predicted - label_roll))) 122 | cv2.putText(cv2_img, error_string, (30, cv2_img.shape[0]- 30), fontFace=1, fontScale=1, color=(0,0,255), thickness=1) 123 | # utils.plot_pose_cube(cv2_img, yaw_predicted[0], pitch_predicted[0], roll_predicted[0], size=100) 124 | utils.draw_axis(cv2_img, yaw_predicted[0], pitch_predicted[0], roll_predicted[0], tdx = 200, tdy= 200, size=100) 125 | cv2.imwrite(os.path.join('output/images', name + '.jpg'), cv2_img) 126 | 127 | print('Test error in degrees of the model on the ' + str(total) + 128 | ' test images. 
Yaw: %.4f, Pitch: %.4f, Roll: %.4f' % (yaw_error / total, 129 | pitch_error / total, roll_error / total)) 130 | -------------------------------------------------------------------------------- /pose/train_alexnet.py: -------------------------------------------------------------------------------- 1 | import sys, os, argparse, time 2 | 3 | import numpy as np 4 | import cv2 5 | import matplotlib.pyplot as plt 6 | 7 | import torch 8 | import torch.nn as nn 9 | from torch.autograd import Variable 10 | from torch.utils.data import DataLoader 11 | from torchvision import transforms 12 | import torchvision 13 | import torch.backends.cudnn as cudnn 14 | import torch.nn.functional as F 15 | 16 | import datasets, hopenet 17 | import torch.utils.model_zoo as model_zoo 18 | 19 | model_urls = { 20 | 'alexnet': 'https://download.pytorch.org/models/alexnet-owt-4df8aa71.pth', 21 | } 22 | 23 | def parse_args(): 24 | """Parse input arguments.""" 25 | parser = argparse.ArgumentParser(description='Head pose estimation using the Hopenet network.') 26 | parser.add_argument('--gpu', dest='gpu_id', help='GPU device id to use [0]', 27 | default=0, type=int) 28 | parser.add_argument('--num_epochs', dest='num_epochs', help='Maximum number of training epochs.', 29 | default=5, type=int) 30 | parser.add_argument('--batch_size', dest='batch_size', help='Batch size.', 31 | default=16, type=int) 32 | parser.add_argument('--lr', dest='lr', help='Base learning rate.', 33 | default=0.001, type=float) 34 | parser.add_argument('--data_dir', dest='data_dir', help='Directory path for data.', 35 | default='', type=str) 36 | parser.add_argument('--filename_list', dest='filename_list', help='Path to text file containing relative paths for every example.', 37 | default='', type=str) 38 | parser.add_argument('--output_string', dest='output_string', help='String appended to output snapshots.', default = '', type=str) 39 | parser.add_argument('--alpha', dest='alpha', help='Regression loss coefficient.', 40 | default=0.001, type=float) 41 | parser.add_argument('--dataset', dest='dataset', help='Dataset type.', default='Pose_300W_LP', type=str) 42 | args = parser.parse_args() 43 | return args 44 | 45 | def get_ignored_params(model): 46 | # Generator function that yields ignored params. 47 | b = [model.features[0], model.features[1], model.features[2]] 48 | for i in range(len(b)): 49 | for module_name, module in b[i].named_modules(): 50 | if 'bn' in module_name: 51 | module.eval() 52 | for name, param in module.named_parameters(): 53 | yield param 54 | 55 | def get_non_ignored_params(model): 56 | # Generator function that yields params that will be optimized. 
57 | b = [] 58 | for idx in xrange(3, len(model.features)): 59 | b.append(model.features[idx]) 60 | for layer in model.classifier: 61 | b.append(layer) 62 | for i in range(len(b)): 63 | for module_name, module in b[i].named_modules(): 64 | if 'bn' in module_name: 65 | module.eval() 66 | for name, param in module.named_parameters(): 67 | yield param 68 | 69 | def get_fc_params(model): 70 | b = [model.fc_yaw, model.fc_pitch, model.fc_roll] 71 | for i in range(len(b)): 72 | for module_name, module in b[i].named_modules(): 73 | for name, param in module.named_parameters(): 74 | yield param 75 | 76 | def load_filtered_state_dict(model, snapshot): 77 | # By user apaszke from discuss.pytorch.org 78 | model_dict = model.state_dict() 79 | snapshot = {k: v for k, v in snapshot.items() if k in model_dict} 80 | model_dict.update(snapshot) 81 | model.load_state_dict(model_dict) 82 | 83 | if __name__ == '__main__': 84 | args = parse_args() 85 | 86 | cudnn.enabled = True 87 | num_epochs = args.num_epochs 88 | batch_size = args.batch_size 89 | gpu = args.gpu_id 90 | 91 | if not os.path.exists('output/snapshots'): 92 | os.makedirs('output/snapshots') 93 | 94 | model = hopenet.AlexNet(66) 95 | load_filtered_state_dict(model, model_zoo.load_url(model_urls['alexnet'])) 96 | 97 | print 'Loading data.' 98 | 99 | transformations = transforms.Compose([transforms.Scale(240), 100 | transforms.RandomCrop(224), transforms.ToTensor(), 101 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]) 102 | 103 | if args.dataset == 'Pose_300W_LP': 104 | pose_dataset = datasets.Pose_300W_LP(args.data_dir, args.filename_list, transformations) 105 | elif args.dataset == 'Pose_300W_LP_random_ds': 106 | pose_dataset = datasets.Pose_300W_LP_random_ds(args.data_dir, args.filename_list, transformations) 107 | elif args.dataset == 'AFLW2000': 108 | pose_dataset = datasets.AFLW2000(args.data_dir, args.filename_list, transformations) 109 | elif args.dataset == 'BIWI': 110 | pose_dataset = datasets.BIWI(args.data_dir, args.filename_list, transformations) 111 | elif args.dataset == 'AFLW': 112 | pose_dataset = datasets.AFLW(args.data_dir, args.filename_list, transformations) 113 | elif args.dataset == 'AFLW_aug': 114 | pose_dataset = datasets.AFLW_aug(args.data_dir, args.filename_list, transformations) 115 | elif args.dataset == 'AFW': 116 | pose_dataset = datasets.AFW(args.data_dir, args.filename_list, transformations) 117 | else: 118 | print 'Error: not a valid dataset name' 119 | sys.exit() 120 | train_loader = torch.utils.data.DataLoader(dataset=pose_dataset, 121 | batch_size=batch_size, 122 | shuffle=True, 123 | num_workers=2) 124 | 125 | model.cuda(gpu) 126 | softmax = nn.Softmax().cuda(gpu) 127 | criterion = nn.CrossEntropyLoss().cuda(gpu) 128 | reg_criterion = nn.MSELoss().cuda(gpu) 129 | # Regression loss coefficient 130 | alpha = args.alpha 131 | 132 | idx_tensor = [idx for idx in xrange(66)] 133 | idx_tensor = Variable(torch.FloatTensor(idx_tensor)).cuda(gpu) 134 | 135 | optimizer = torch.optim.Adam([{'params': get_ignored_params(model), 'lr': 0}, 136 | {'params': get_non_ignored_params(model), 'lr': args.lr}, 137 | {'params': get_fc_params(model), 'lr': args.lr * 5}], 138 | lr = args.lr) 139 | 140 | print 'Ready to train network.' 
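The loop that follows trains each angle with two coupled terms: a cross-entropy loss on the 66-bin (3° wide) label, plus `alpha` times an MSE loss between the softmax-expected angle in degrees and the continuous label. A compact per-angle sketch of that combination (hypothetical helper; a functional-API variant of what the loop does with `criterion`, `reg_criterion` and `softmax`):

```python
import torch
import torch.nn.functional as F

def binned_plus_regression_loss(logits, bin_label, cont_label, idx_tensor, alpha=0.001):
    """Cross-entropy on the binned angle plus alpha * MSE on the decoded continuous angle."""
    ce = F.cross_entropy(logits, bin_label)                       # 66-way classification term
    expected_deg = torch.sum(F.softmax(logits, dim=1) * idx_tensor, dim=1) * 3 - 99
    mse = F.mse_loss(expected_deg, cont_label)                    # regression term in degrees
    return ce + alpha * mse

# e.g. loss_yaw = binned_plus_regression_loss(pre_yaw, label_yaw, label_yaw_cont, idx_tensor, args.alpha)
```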
141 | for epoch in range(num_epochs):
142 | for i, (images, labels, cont_labels, name) in enumerate(train_loader):
143 | images = Variable(images).cuda(gpu)
144 |
145 | # Binned labels
146 | label_yaw = Variable(labels[:,0]).cuda(gpu)
147 | label_pitch = Variable(labels[:,1]).cuda(gpu)
148 | label_roll = Variable(labels[:,2]).cuda(gpu)
149 |
150 | # Continuous labels
151 | label_yaw_cont = Variable(cont_labels[:,0]).cuda(gpu)
152 | label_pitch_cont = Variable(cont_labels[:,1]).cuda(gpu)
153 | label_roll_cont = Variable(cont_labels[:,2]).cuda(gpu)
154 |
155 | # Forward pass
156 | pre_yaw, pre_pitch, pre_roll = model(images)
157 |
158 | # Cross entropy loss
159 | loss_yaw = criterion(pre_yaw, label_yaw)
160 | loss_pitch = criterion(pre_pitch, label_pitch)
161 | loss_roll = criterion(pre_roll, label_roll)
162 |
163 | # MSE loss
164 | yaw_predicted = softmax(pre_yaw)
165 | pitch_predicted = softmax(pre_pitch)
166 | roll_predicted = softmax(pre_roll)
167 |
168 | yaw_predicted = torch.sum(yaw_predicted * idx_tensor, 1) * 3 - 99
169 | pitch_predicted = torch.sum(pitch_predicted * idx_tensor, 1) * 3 - 99
170 | roll_predicted = torch.sum(roll_predicted * idx_tensor, 1) * 3 - 99
171 |
172 | loss_reg_yaw = reg_criterion(yaw_predicted, label_yaw_cont)
173 | loss_reg_pitch = reg_criterion(pitch_predicted, label_pitch_cont)
174 | loss_reg_roll = reg_criterion(roll_predicted, label_roll_cont)
175 |
176 | # Total loss
177 | loss_yaw += alpha * loss_reg_yaw
178 | loss_pitch += alpha * loss_reg_pitch
179 | loss_roll += alpha * loss_reg_roll
180 |
181 | loss_seq = [loss_yaw, loss_pitch, loss_roll]
182 | grad_seq = [torch.ones(1).cuda(gpu) for _ in range(len(loss_seq))]
183 | optimizer.zero_grad(); torch.autograd.backward(loss_seq, grad_seq)  # clear stale gradients before backpropagating
184 | optimizer.step()
185 |
186 | if (i+1) % 100 == 0:
187 | print ('Epoch [%d/%d], Iter [%d/%d] Losses: Yaw %.4f, Pitch %.4f, Roll %.4f'
188 | %(epoch+1, num_epochs, i+1, len(pose_dataset)//batch_size, loss_yaw.data[0], loss_pitch.data[0], loss_roll.data[0]))
189 |
190 | # Save models at numbered epochs.
191 | if epoch % 1 == 0 and epoch < num_epochs:
192 | print('Taking snapshot...')
193 | torch.save(model.state_dict(), 194 | 'output/snapshots/' + args.output_string + '_epoch_'+ str(epoch+1) + '.pkl') 195 | -------------------------------------------------------------------------------- /pose/train_hopenet.py: -------------------------------------------------------------------------------- 1 | import sys, os, argparse, time 2 | 3 | import numpy as np 4 | import cv2 5 | import matplotlib.pyplot as plt 6 | 7 | import torch 8 | import torch.nn as nn 9 | from torch.autograd import Variable 10 | from torch.utils.data import DataLoader 11 | from torchvision import transforms 12 | import torchvision 13 | import torch.backends.cudnn as cudnn 14 | import torch.nn.functional as F 15 | 16 | import datasets, hopenet 17 | import torch.utils.model_zoo as model_zoo 18 | 19 | def parse_args(): 20 | """Parse input arguments.""" 21 | parser = argparse.ArgumentParser(description='Head pose estimation using the Hopenet network.') 22 | parser.add_argument('--gpu', dest='gpu_id', help='GPU device id to use [0]', 23 | default=0, type=int) 24 | parser.add_argument('--num_epochs', dest='num_epochs', help='Maximum number of training epochs.', 25 | default=5, type=int) 26 | parser.add_argument('--batch_size', dest='batch_size', help='Batch size.', 27 | default=16, type=int) 28 | parser.add_argument('--lr', dest='lr', help='Base learning rate.', 29 | default=0.001, type=float) 30 | parser.add_argument('--dataset', dest='dataset', help='Dataset type.', default='Pose_300W_LP', type=str) 31 | parser.add_argument('--data_dir', dest='data_dir', help='Directory path for data.', 32 | default='', type=str) 33 | parser.add_argument('--filename_list', dest='filename_list', help='Path to text file containing relative paths for every example.', 34 | default='', type=str) 35 | parser.add_argument('--output_string', dest='output_string', help='String appended to output snapshots.', default = '', type=str) 36 | parser.add_argument('--alpha', dest='alpha', help='Regression loss coefficient.', 37 | default=0.001, type=float) 38 | parser.add_argument('--snapshot', dest='snapshot', help='Path of model snapshot.', 39 | default='', type=str) 40 | 41 | args = parser.parse_args() 42 | return args 43 | 44 | def get_ignored_params(model): 45 | # Generator function that yields ignored params. 46 | b = [model.conv1, model.bn1, model.fc_finetune] 47 | for i in range(len(b)): 48 | for module_name, module in b[i].named_modules(): 49 | if 'bn' in module_name: 50 | module.eval() 51 | for name, param in module.named_parameters(): 52 | yield param 53 | 54 | def get_non_ignored_params(model): 55 | # Generator function that yields params that will be optimized. 56 | b = [model.layer1, model.layer2, model.layer3, model.layer4] 57 | for i in range(len(b)): 58 | for module_name, module in b[i].named_modules(): 59 | if 'bn' in module_name: 60 | module.eval() 61 | for name, param in module.named_parameters(): 62 | yield param 63 | 64 | def get_fc_params(model): 65 | # Generator function that yields fc layer params. 
66 | b = [model.fc_yaw, model.fc_pitch, model.fc_roll] 67 | for i in range(len(b)): 68 | for module_name, module in b[i].named_modules(): 69 | for name, param in module.named_parameters(): 70 | yield param 71 | 72 | def load_filtered_state_dict(model, snapshot): 73 | # By user apaszke from discuss.pytorch.org 74 | model_dict = model.state_dict() 75 | snapshot = {k: v for k, v in snapshot.items() if k in model_dict} 76 | model_dict.update(snapshot) 77 | model.load_state_dict(model_dict) 78 | 79 | if __name__ == '__main__': 80 | args = parse_args() 81 | 82 | cudnn.enabled = True 83 | num_epochs = args.num_epochs 84 | batch_size = args.batch_size 85 | gpu = args.gpu_id 86 | 87 | if not os.path.exists('output/snapshots'): 88 | os.makedirs('output/snapshots') 89 | 90 | # ResNet50 structure 91 | model = hopenet.Hopenet(torchvision.models.resnet.Bottleneck, [3, 4, 6, 3], 66) 92 | 93 | if args.snapshot == '': 94 | load_filtered_state_dict(model, model_zoo.load_url('https://download.pytorch.org/models/resnet50-19c8e357.pth')) 95 | else: 96 | saved_state_dict = torch.load(args.snapshot) 97 | model.load_state_dict(saved_state_dict) 98 | 99 | # print 'Loading data.' 100 | 101 | transformations = transforms.Compose([transforms.Scale(240), 102 | transforms.RandomCrop(224), transforms.ToTensor(), 103 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]) 104 | 105 | if args.dataset == 'Pose_300W_LP': 106 | pose_dataset = datasets.Pose_300W_LP(args.data_dir, args.filename_list, transformations) 107 | elif args.dataset == 'Pose_300W_LP_random_ds': 108 | pose_dataset = datasets.Pose_300W_LP_random_ds(args.data_dir, args.filename_list, transformations) 109 | elif args.dataset == 'Synhead': 110 | pose_dataset = datasets.Synhead(args.data_dir, args.filename_list, transformations) 111 | elif args.dataset == 'AFLW2000': 112 | pose_dataset = datasets.AFLW2000(args.data_dir, args.filename_list, transformations) 113 | elif args.dataset == 'BIWI': 114 | pose_dataset = datasets.BIWI(args.data_dir, args.filename_list, transformations) 115 | elif args.dataset == 'AFLW': 116 | pose_dataset = datasets.AFLW(args.data_dir, args.filename_list, transformations) 117 | elif args.dataset == 'AFLW_aug': 118 | pose_dataset = datasets.AFLW_aug(args.data_dir, args.filename_list, transformations) 119 | elif args.dataset == 'AFW': 120 | pose_dataset = datasets.AFW(args.data_dir, args.filename_list, transformations) 121 | else: 122 | # print 'Error: not a valid dataset name' 123 | sys.exit() 124 | 125 | train_loader = torch.utils.data.DataLoader(dataset=pose_dataset, 126 | batch_size=batch_size, 127 | shuffle=True, 128 | num_workers=2) 129 | 130 | model.cuda(gpu) 131 | criterion = nn.CrossEntropyLoss().cuda(gpu) 132 | reg_criterion = nn.MSELoss().cuda(gpu) 133 | # Regression loss coefficient 134 | alpha = args.alpha 135 | 136 | softmax = nn.Softmax().cuda(gpu) 137 | idx_tensor = [idx for idx in range(66)] 138 | idx_tensor = Variable(torch.FloatTensor(idx_tensor)).cuda(gpu) 139 | 140 | optimizer = torch.optim.Adam([{'params': get_ignored_params(model), 'lr': 0}, 141 | {'params': get_non_ignored_params(model), 'lr': args.lr}, 142 | {'params': get_fc_params(model), 'lr': args.lr * 5}], 143 | lr = args.lr) 144 | 145 | # print 'Ready to train network.' 
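The Adam optimizer built just above splits the network into three parameter groups: the stem (`conv1`, `bn1`, `fc_finetune`) is frozen with `lr = 0`, the ResNet stages train at the base rate, and the freshly initialised yaw/pitch/roll heads train at five times the base rate. A small sketch for checking what each group ended up with (illustrative; assumes the `optimizer` defined above):

```python
# Print the effective learning rate and parameter count of each optimizer group.
for gi, group in enumerate(optimizer.param_groups):
    n_params = sum(p.numel() for p in group['params'])
    print('group %d: lr=%g, %d tensors, %d parameters'
          % (gi, group['lr'], len(group['params']), n_params))
```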
146 | for epoch in range(num_epochs): 147 | for i, (images, labels, cont_labels, name) in enumerate(train_loader): 148 | images = Variable(images).cuda(gpu) 149 | 150 | # Binned labels 151 | label_yaw = Variable(labels[:,0]).cuda(gpu) 152 | label_pitch = Variable(labels[:,1]).cuda(gpu) 153 | label_roll = Variable(labels[:,2]).cuda(gpu) 154 | 155 | # Continuous labels 156 | label_yaw_cont = Variable(cont_labels[:,0]).cuda(gpu) 157 | label_pitch_cont = Variable(cont_labels[:,1]).cuda(gpu) 158 | label_roll_cont = Variable(cont_labels[:,2]).cuda(gpu) 159 | 160 | # Forward pass 161 | yaw, pitch, roll = model(images) 162 | 163 | # Cross entropy loss 164 | loss_yaw = criterion(yaw, label_yaw) 165 | loss_pitch = criterion(pitch, label_pitch) 166 | loss_roll = criterion(roll, label_roll) 167 | 168 | # MSE loss 169 | yaw_predicted = softmax(yaw) 170 | pitch_predicted = softmax(pitch) 171 | roll_predicted = softmax(roll) 172 | 173 | yaw_predicted = torch.sum(yaw_predicted * idx_tensor, 1) * 3 - 99 174 | pitch_predicted = torch.sum(pitch_predicted * idx_tensor, 1) * 3 - 99 175 | roll_predicted = torch.sum(roll_predicted * idx_tensor, 1) * 3 - 99 176 | 177 | loss_reg_yaw = reg_criterion(yaw_predicted, label_yaw_cont) 178 | loss_reg_pitch = reg_criterion(pitch_predicted, label_pitch_cont) 179 | loss_reg_roll = reg_criterion(roll_predicted, label_roll_cont) 180 | 181 | # Total loss 182 | loss_yaw += alpha * loss_reg_yaw 183 | loss_pitch += alpha * loss_reg_pitch 184 | loss_roll += alpha * loss_reg_roll 185 | 186 | loss_seq = [loss_yaw, loss_pitch, loss_roll] 187 | grad_seq = [torch.ones(1).cuda(gpu) for _ in range(len(loss_seq))] 188 | optimizer.zero_grad() 189 | torch.autograd.backward(loss_seq, grad_seq) 190 | optimizer.step() 191 | 192 | if (i+1) % 100 == 0: 193 | print ('Epoch [%d/%d], Iter [%d/%d] Losses: Yaw %.4f, Pitch %.4f, Roll %.4f' 194 | %(epoch+1, num_epochs, i+1, len(pose_dataset)//batch_size, loss_yaw.data[0], loss_pitch.data[0], loss_roll.data[0])) 195 | 196 | # Save models at numbered epochs. 197 | if epoch % 1 == 0 and epoch < num_epochs: 198 | # print 'Taking snapshot...' 
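`torch.autograd.backward(loss_seq, grad_seq)` with unit gradients, as used in the loop above, accumulates exactly the same gradients as backpropagating the sum of the three scalar losses. On a recent PyTorch this is commonly written as a single backward call; a sketch of that equivalent form, assuming the loss variables from the loop:

```python
total_loss = loss_yaw + loss_pitch + loss_roll   # same gradients as backward(loss_seq, grad_seq)
optimizer.zero_grad()
total_loss.backward()
optimizer.step()
```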
199 | torch.save(model.state_dict(), 200 | 'output/snapshots/' + args.output_string + '_epoch_'+ str(epoch+1) + '.pkl') 201 | -------------------------------------------------------------------------------- /pose/train_resnet50_regression.py: -------------------------------------------------------------------------------- 1 | import sys, os, argparse, time 2 | 3 | import numpy as np 4 | import cv2 5 | import matplotlib.pyplot as plt 6 | 7 | import torch 8 | import torch.nn as nn 9 | from torch.autograd import Variable 10 | from torch.utils.data import DataLoader 11 | from torchvision import transforms 12 | import torchvision 13 | import torch.backends.cudnn as cudnn 14 | import torch.nn.functional as F 15 | 16 | import datasets, hopenet 17 | import torch.utils.model_zoo as model_zoo 18 | 19 | def parse_args(): 20 | """Parse input arguments.""" 21 | parser = argparse.ArgumentParser(description='Head pose estimation using the Hopenet network.') 22 | parser.add_argument('--gpu', dest='gpu_id', help='GPU device id to use [0]', 23 | default=0, type=int) 24 | parser.add_argument('--num_epochs', dest='num_epochs', help='Maximum number of training epochs.', 25 | default=5, type=int) 26 | parser.add_argument('--batch_size', dest='batch_size', help='Batch size.', 27 | default=16, type=int) 28 | parser.add_argument('--lr', dest='lr', help='Base learning rate.', 29 | default=0.001, type=float) 30 | parser.add_argument('--data_dir', dest='data_dir', help='Directory path for data.', 31 | default='', type=str) 32 | parser.add_argument('--filename_list', dest='filename_list', help='Path to text file containing relative paths for every example.', 33 | default='', type=str) 34 | parser.add_argument('--output_string', dest='output_string', help='String appended to output snapshots.', default = '', type=str) 35 | parser.add_argument('--dataset', dest='dataset', help='Dataset type.', default='Pose_300W_LP', type=str) 36 | 37 | args = parser.parse_args() 38 | return args 39 | 40 | def get_ignored_params(model): 41 | # Generator function that yields ignored params. 42 | b = [model.conv1, model.bn1] 43 | for i in range(len(b)): 44 | for module_name, module in b[i].named_modules(): 45 | if 'bn' in module_name: 46 | module.eval() 47 | for name, param in module.named_parameters(): 48 | yield param 49 | 50 | def get_non_ignored_params(model): 51 | # Generator function that yields params that will be optimized. 52 | b = [model.layer1, model.layer2, model.layer3, model.layer4] 53 | for i in range(len(b)): 54 | for module_name, module in b[i].named_modules(): 55 | if 'bn' in module_name: 56 | module.eval() 57 | for name, param in module.named_parameters(): 58 | yield param 59 | 60 | def get_fc_params(model): 61 | # Generator function that yields fc layer params. 
62 | b = [model.fc_angles] 63 | for i in range(len(b)): 64 | for module_name, module in b[i].named_modules(): 65 | for name, param in module.named_parameters(): 66 | yield param 67 | 68 | def load_filtered_state_dict(model, snapshot): 69 | # By user apaszke from discuss.pytorch.org 70 | model_dict = model.state_dict() 71 | snapshot = {k: v for k, v in snapshot.items() if k in model_dict} 72 | model_dict.update(snapshot) 73 | model.load_state_dict(model_dict) 74 | 75 | if __name__ == '__main__': 76 | args = parse_args() 77 | 78 | cudnn.enabled = True 79 | num_epochs = args.num_epochs 80 | batch_size = args.batch_size 81 | gpu = args.gpu_id 82 | 83 | if not os.path.exists('output/snapshots'): 84 | os.makedirs('output/snapshots') 85 | 86 | # ResNet50 87 | model = hopenet.ResNet(torchvision.models.resnet.Bottleneck, [3, 4, 6, 3], 3) 88 | load_filtered_state_dict(model, model_zoo.load_url('https://download.pytorch.org/models/resnet50-19c8e357.pth')) 89 | 90 | print 'Loading data.' 91 | 92 | transformations = transforms.Compose([transforms.Scale(240), 93 | transforms.RandomCrop(224), transforms.ToTensor(), 94 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]) 95 | 96 | if args.dataset == 'Pose_300W_LP': 97 | pose_dataset = datasets.Pose_300W_LP(args.data_dir, args.filename_list, transformations) 98 | elif args.dataset == 'Pose_300W_LP_random_ds': 99 | pose_dataset = datasets.Pose_300W_LP_random_ds(args.data_dir, args.filename_list, transformations) 100 | elif args.dataset == 'AFLW2000': 101 | pose_dataset = datasets.AFLW2000(args.data_dir, args.filename_list, transformations) 102 | elif args.dataset == 'BIWI': 103 | pose_dataset = datasets.BIWI(args.data_dir, args.filename_list, transformations) 104 | elif args.dataset == 'AFLW': 105 | pose_dataset = datasets.AFLW(args.data_dir, args.filename_list, transformations) 106 | elif args.dataset == 'AFLW_aug': 107 | pose_dataset = datasets.AFLW_aug(args.data_dir, args.filename_list, transformations) 108 | elif args.dataset == 'AFW': 109 | pose_dataset = datasets.AFW(args.data_dir, args.filename_list, transformations) 110 | else: 111 | print 'Error: not a valid dataset name' 112 | sys.exit() 113 | train_loader = torch.utils.data.DataLoader(dataset=pose_dataset, 114 | batch_size=batch_size, 115 | shuffle=True, 116 | num_workers=2) 117 | 118 | model.cuda(gpu) 119 | criterion = nn.MSELoss().cuda(gpu) 120 | 121 | optimizer = torch.optim.Adam([{'params': get_ignored_params(model), 'lr': 0}, 122 | {'params': get_non_ignored_params(model), 'lr': args.lr}, 123 | {'params': get_fc_params(model), 'lr': args.lr * 5}], 124 | lr = args.lr) 125 | 126 | print 'Ready to train network.' 127 | print 'First phase of training.' 128 | for epoch in range(num_epochs): 129 | for i, (images, labels, cont_labels, name) in enumerate(train_loader): 130 | images = Variable(images).cuda(gpu) 131 | 132 | label_angles = Variable(cont_labels[:,:3]).cuda(gpu) 133 | angles = model(images) 134 | 135 | loss = criterion(angles, label_angles) 136 | optimizer.zero_grad() 137 | loss.backward() 138 | optimizer.step() 139 | 140 | if (i+1) % 100 == 0: 141 | print ('Epoch [%d/%d], Iter [%d/%d] Loss: %.4f' 142 | %(epoch+1, num_epochs, i+1, len(pose_dataset)//batch_size, loss.data[0])) 143 | 144 | # Save models at numbered epochs. 145 | if epoch % 1 == 0 and epoch < num_epochs: 146 | print 'Taking snapshot...' 
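The `.pkl` files written below are ordinary `state_dict` snapshots, so they can be reloaded exactly the way `test_resnet50_regression.py` does: rebuild the 3-output regression ResNet and call `load_state_dict`. A minimal sketch with an illustrative snapshot path:

```python
import torch
import torchvision
import hopenet  # pose/hopenet.py

model = hopenet.ResNet(torchvision.models.resnet.Bottleneck, [3, 4, 6, 3], 3)
state = torch.load('output/snapshots/my_run_epoch_5.pkl', map_location='cpu')  # illustrative path
model.load_state_dict(state)
model.eval()
```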
147 | torch.save(model.state_dict(), 148 | 'output/snapshots/' + args.output_string + '_epoch_'+ str(epoch+1) + '.pkl') 149 | -------------------------------------------------------------------------------- /pose/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | # from torch.utils.serialization import load_lua 4 | import torchfile 5 | import os 6 | import scipy.io as sio 7 | import cv2 8 | import math 9 | from math import cos, sin 10 | 11 | # 这里直接进行softmax操作即可 12 | def softmax_temperature(tensor, temperature): 13 | result = torch.exp(tensor / temperature) 14 | result = torch.div(result, torch.sum(result, 1).unsqueeze(1).expand_as(result)) 15 | return result 16 | 17 | def get_pose_params_from_mat(mat_path): 18 | # This functions gets the pose parameters from the .mat 19 | # Annotations that come with the Pose_300W_LP dataset. 20 | mat = sio.loadmat(mat_path) 21 | # [pitch yaw roll tdx tdy tdz scale_factor] 22 | pre_pose_params = mat['Pose_Para'][0] 23 | # Get [pitch, yaw, roll, tdx, tdy] 24 | pose_params = pre_pose_params[:5] 25 | return pose_params 26 | 27 | def get_ypr_from_mat(mat_path): 28 | # Get yaw, pitch, roll from .mat annotation. 29 | # They are in radians 30 | mat = sio.loadmat(mat_path) 31 | # [pitch yaw roll tdx tdy tdz scale_factor] 32 | pre_pose_params = mat['Pose_Para'][0] 33 | # Get [pitch, yaw, roll] 34 | pose_params = pre_pose_params[:3] 35 | return pose_params 36 | 37 | def get_pt2d_from_mat(mat_path): 38 | # Get 2D landmarks 39 | mat = sio.loadmat(mat_path) 40 | pt2d = mat['pt2d'] 41 | return pt2d 42 | 43 | def mse_loss(input, target): 44 | return torch.sum(torch.abs(input.data - target.data) ** 2) 45 | 46 | def plot_pose_cube(img, yaw, pitch, roll, tdx=None, tdy=None, size=150.): 47 | # Input is a cv2 image 48 | # pose_params: (pitch, yaw, roll, tdx, tdy) 49 | # Where (tdx, tdy) is the translation of the face. 
50 | # For pose we have [pitch yaw roll tdx tdy tdz scale_factor] 51 | 52 | p = pitch * np.pi / 180 53 | y = -(yaw * np.pi / 180) 54 | r = roll * np.pi / 180 55 | if tdx != None and tdy != None: 56 | face_x = tdx - 0.50 * size 57 | face_y = tdy - 0.50 * size 58 | else: 59 | height, width = img.shape[:2] 60 | face_x = width / 2 - 0.5 * size 61 | face_y = height / 2 - 0.5 * size 62 | 63 | x1 = size * (cos(y) * cos(r)) + face_x 64 | y1 = size * (cos(p) * sin(r) + cos(r) * sin(p) * sin(y)) + face_y 65 | x2 = size * (-cos(y) * sin(r)) + face_x 66 | y2 = size * (cos(p) * cos(r) - sin(p) * sin(y) * sin(r)) + face_y 67 | x3 = size * (sin(y)) + face_x 68 | y3 = size * (-cos(y) * sin(p)) + face_y 69 | 70 | # Draw base in red 71 | cv2.line(img, (int(face_x), int(face_y)), (int(x1),int(y1)),(0,0,255),3) 72 | cv2.line(img, (int(face_x), int(face_y)), (int(x2),int(y2)),(0,0,255),3) 73 | cv2.line(img, (int(x2), int(y2)), (int(x2+x1-face_x),int(y2+y1-face_y)),(0,0,255),3) 74 | cv2.line(img, (int(x1), int(y1)), (int(x1+x2-face_x),int(y1+y2-face_y)),(0,0,255),3) 75 | # Draw pillars in blue 76 | cv2.line(img, (int(face_x), int(face_y)), (int(x3),int(y3)),(255,0,0),2) 77 | cv2.line(img, (int(x1), int(y1)), (int(x1+x3-face_x),int(y1+y3-face_y)),(255,0,0),2) 78 | cv2.line(img, (int(x2), int(y2)), (int(x2+x3-face_x),int(y2+y3-face_y)),(255,0,0),2) 79 | cv2.line(img, (int(x2+x1-face_x),int(y2+y1-face_y)), (int(x3+x1+x2-2*face_x),int(y3+y2+y1-2*face_y)),(255,0,0),2) 80 | # Draw top in green 81 | cv2.line(img, (int(x3+x1-face_x),int(y3+y1-face_y)), (int(x3+x1+x2-2*face_x),int(y3+y2+y1-2*face_y)),(0,255,0),2) 82 | cv2.line(img, (int(x2+x3-face_x),int(y2+y3-face_y)), (int(x3+x1+x2-2*face_x),int(y3+y2+y1-2*face_y)),(0,255,0),2) 83 | cv2.line(img, (int(x3), int(y3)), (int(x3+x1-face_x),int(y3+y1-face_y)),(0,255,0),2) 84 | cv2.line(img, (int(x3), int(y3)), (int(x3+x2-face_x),int(y3+y2-face_y)),(0,255,0),2) 85 | 86 | return img 87 | 88 | def draw_axis(img, yaw, pitch, roll, tdx=None, tdy=None, size = 100): 89 | 90 | pitch = pitch * np.pi / 180 91 | yaw = -(yaw * np.pi / 180) 92 | roll = roll * np.pi / 180 93 | 94 | if tdx != None and tdy != None: 95 | tdx = tdx 96 | tdy = tdy 97 | else: 98 | height, width = img.shape[:2] 99 | tdx = width / 2 100 | tdy = height / 2 101 | 102 | # X-Axis pointing to right. 
drawn in red [1,0,0] 103 | x1 = size * (cos(yaw) * cos(roll)) + tdx 104 | y1 = size * (cos(pitch) * sin(roll) + cos(roll) * sin(pitch) * sin(yaw)) + tdy 105 | # z1省略 106 | 107 | # Y-Axis | drawn in green [0,1,0] 108 | # v 109 | x2 = size * (-cos(yaw) * sin(roll)) + tdx 110 | y2 = size * (cos(pitch) * cos(roll) - sin(pitch) * sin(yaw) * sin(roll)) + tdy 111 | # z2省略 112 | 113 | # Z-Axis (out of the screen) drawn in blue [0,0,1] 114 | x3 = size * (sin(yaw)) + tdx 115 | y3 = size * (-cos(yaw) * sin(pitch)) + tdy 116 | # z3省略 117 | 118 | cv2.line(img, (int(tdx), int(tdy)), (int(x1), int(y1)), (0,0,255), 2) # 红色 x轴 脸右侧 119 | cv2.line(img, (int(tdx), int(tdy)), (int(x2), int(y2)), (0,255,0), 2) # 绿色 y轴 脸下方 120 | cv2.line(img, (int(tdx), int(tdy)), (int(x3), int(y3)), (255,0,0), 2) # 蓝色 z轴 脸前方 121 | 122 | return img 123 | -------------------------------------------------------------------------------- /test_fddb.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | import argparse 4 | import torch 5 | import torch.backends.cudnn as cudnn 6 | import numpy as np 7 | from data import cfg_mnet, cfg_re50 8 | from layers.functions.prior_box import PriorBox 9 | from utils.nms.py_cpu_nms import py_cpu_nms 10 | import cv2 11 | from models.retinaface import RetinaFace 12 | from utils.box_utils import decode, decode_landm 13 | from utils.timer import Timer 14 | 15 | parser = argparse.ArgumentParser(description='Retinaface') 16 | 17 | parser.add_argument('-m', '--trained_model', default='./weights/mobilenet0.25_Final.pth', 18 | type=str, help='Trained state_dict file path to open') 19 | parser.add_argument('--network', default='mobile0.25', help='Backbone network mobile0.25 or resnet50') 20 | parser.add_argument('--save_folder', default='eval/', type=str, help='Dir to save results') 21 | parser.add_argument('--cpu', action="store_true", default=False, help='Use cpu inference') 22 | parser.add_argument('--dataset', default='FDDB', type=str, choices=['FDDB'], help='dataset') 23 | parser.add_argument('--confidence_threshold', default=0.02, type=float, help='confidence_threshold') 24 | parser.add_argument('--top_k', default=5000, type=int, help='top_k') 25 | parser.add_argument('--nms_threshold', default=0.4, type=float, help='nms_threshold') 26 | parser.add_argument('--keep_top_k', default=750, type=int, help='keep_top_k') 27 | parser.add_argument('-s', '--save_image', action="store_true", default=False, help='show detection results') 28 | parser.add_argument('--vis_thres', default=0.5, type=float, help='visualization_threshold') 29 | args = parser.parse_args() 30 | 31 | 32 | def check_keys(model, pretrained_state_dict): 33 | ckpt_keys = set(pretrained_state_dict.keys()) 34 | model_keys = set(model.state_dict().keys()) 35 | used_pretrained_keys = model_keys & ckpt_keys 36 | unused_pretrained_keys = ckpt_keys - model_keys 37 | missing_keys = model_keys - ckpt_keys 38 | print('Missing keys:{}'.format(len(missing_keys))) 39 | print('Unused checkpoint keys:{}'.format(len(unused_pretrained_keys))) 40 | print('Used keys:{}'.format(len(used_pretrained_keys))) 41 | assert len(used_pretrained_keys) > 0, 'load NONE from pretrained checkpoint' 42 | return True 43 | 44 | 45 | def remove_prefix(state_dict, prefix): 46 | ''' Old style model is stored with all names of parameters sharing common prefix 'module.' 
''' 47 | print('remove prefix \'{}\''.format(prefix)) 48 | f = lambda x: x.split(prefix, 1)[-1] if x.startswith(prefix) else x 49 | return {f(key): value for key, value in state_dict.items()} 50 | 51 | 52 | def load_model(model, pretrained_path, load_to_cpu): 53 | print('Loading pretrained model from {}'.format(pretrained_path)) 54 | if load_to_cpu: 55 | pretrained_dict = torch.load(pretrained_path, map_location=lambda storage, loc: storage) 56 | else: 57 | device = torch.cuda.current_device() 58 | pretrained_dict = torch.load(pretrained_path, map_location=lambda storage, loc: storage.cuda(device)) 59 | if "state_dict" in pretrained_dict.keys(): 60 | pretrained_dict = remove_prefix(pretrained_dict['state_dict'], 'module.') 61 | else: 62 | pretrained_dict = remove_prefix(pretrained_dict, 'module.') 63 | check_keys(model, pretrained_dict) 64 | model.load_state_dict(pretrained_dict, strict=False) 65 | return model 66 | 67 | 68 | if __name__ == '__main__': 69 | torch.set_grad_enabled(False) 70 | cfg = None 71 | if args.network == "mobile0.25": 72 | cfg = cfg_mnet 73 | elif args.network == "resnet50": 74 | cfg = cfg_re50 75 | # net and model 76 | net = RetinaFace(cfg=cfg, phase = 'test') 77 | net = load_model(net, args.trained_model, args.cpu) 78 | net.eval() 79 | print('Finished loading model!') 80 | print(net) 81 | cudnn.benchmark = True 82 | device = torch.device("cpu" if args.cpu else "cuda") 83 | net = net.to(device) 84 | 85 | 86 | # save file 87 | if not os.path.exists(args.save_folder): 88 | os.makedirs(args.save_folder) 89 | fw = open(os.path.join(args.save_folder, args.dataset + '_dets.txt'), 'w') 90 | 91 | # testing dataset 92 | testset_folder = os.path.join('data', args.dataset, 'images/') 93 | testset_list = os.path.join('data', args.dataset, 'img_list.txt') 94 | with open(testset_list, 'r') as fr: 95 | test_dataset = fr.read().split() 96 | num_images = len(test_dataset) 97 | 98 | # testing scale 99 | resize = 1 100 | 101 | _t = {'forward_pass': Timer(), 'misc': Timer()} 102 | 103 | # testing begin 104 | for i, img_name in enumerate(test_dataset): 105 | image_path = testset_folder + img_name + '.jpg' 106 | img_raw = cv2.imread(image_path, cv2.IMREAD_COLOR) 107 | 108 | img = np.float32(img_raw) 109 | if resize != 1: 110 | img = cv2.resize(img, None, None, fx=resize, fy=resize, interpolation=cv2.INTER_LINEAR) 111 | im_height, im_width, _ = img.shape 112 | scale = torch.Tensor([img.shape[1], img.shape[0], img.shape[1], img.shape[0]]) 113 | img -= (104, 117, 123) 114 | img = img.transpose(2, 0, 1) 115 | img = torch.from_numpy(img).unsqueeze(0) 116 | img = img.to(device) 117 | scale = scale.to(device) 118 | 119 | _t['forward_pass'].tic() 120 | loc, conf, landms = net(img) # forward pass 121 | _t['forward_pass'].toc() 122 | _t['misc'].tic() 123 | priorbox = PriorBox(cfg, image_size=(im_height, im_width)) 124 | priors = priorbox.forward() 125 | priors = priors.to(device) 126 | prior_data = priors.data 127 | boxes = decode(loc.data.squeeze(0), prior_data, cfg['variance']) 128 | boxes = boxes * scale / resize 129 | boxes = boxes.cpu().numpy() 130 | scores = conf.squeeze(0).data.cpu().numpy()[:, 1] 131 | landms = decode_landm(landms.data.squeeze(0), prior_data, cfg['variance']) 132 | scale1 = torch.Tensor([img.shape[3], img.shape[2], img.shape[3], img.shape[2], 133 | img.shape[3], img.shape[2], img.shape[3], img.shape[2], 134 | img.shape[3], img.shape[2]]) 135 | scale1 = scale1.to(device) 136 | landms = landms * scale1 / resize 137 | landms = landms.cpu().numpy() 138 | 139 | # ignore low 
scores 140 | inds = np.where(scores > args.confidence_threshold)[0] 141 | boxes = boxes[inds] 142 | landms = landms[inds] 143 | scores = scores[inds] 144 | 145 | # keep top-K before NMS 146 | # order = scores.argsort()[::-1][:args.top_k] 147 | order = scores.argsort()[::-1] 148 | boxes = boxes[order] 149 | landms = landms[order] 150 | scores = scores[order] 151 | 152 | # do NMS 153 | dets = np.hstack((boxes, scores[:, np.newaxis])).astype(np.float32, copy=False) 154 | keep = py_cpu_nms(dets, args.nms_threshold) 155 | 156 | dets = dets[keep, :] 157 | landms = landms[keep] 158 | 159 | # keep top-K faster NMS 160 | # dets = dets[:args.keep_top_k, :] 161 | # landms = landms[:args.keep_top_k, :] 162 | 163 | dets = np.concatenate((dets, landms), axis=1) 164 | _t['misc'].toc() 165 | 166 | # save dets 167 | if args.dataset == "FDDB": 168 | fw.write('{:s}\n'.format(img_name)) 169 | fw.write('{:.1f}\n'.format(dets.shape[0])) 170 | for k in range(dets.shape[0]): 171 | xmin = dets[k, 0] 172 | ymin = dets[k, 1] 173 | xmax = dets[k, 2] 174 | ymax = dets[k, 3] 175 | score = dets[k, 4] 176 | w = xmax - xmin + 1 177 | h = ymax - ymin + 1 178 | # fw.write('{:.3f} {:.3f} {:.3f} {:.3f} {:.10f}\n'.format(xmin, ymin, w, h, score)) 179 | fw.write('{:d} {:d} {:d} {:d} {:.10f}\n'.format(int(xmin), int(ymin), int(w), int(h), score)) 180 | print('im_detect: {:d}/{:d} forward_pass_time: {:.4f}s misc: {:.4f}s'.format(i + 1, num_images, _t['forward_pass'].average_time, _t['misc'].average_time)) 181 | 182 | # show image 183 | if args.save_image: 184 | for b in dets: 185 | if b[4] < args.vis_thres: 186 | continue 187 | text = "{:.4f}".format(b[4]) 188 | b = list(map(int, b)) 189 | cv2.rectangle(img_raw, (b[0], b[1]), (b[2], b[3]), (0, 0, 255), 2) 190 | cx = b[0] 191 | cy = b[1] + 12 192 | cv2.putText(img_raw, text, (cx, cy), 193 | cv2.FONT_HERSHEY_DUPLEX, 0.5, (255, 255, 255)) 194 | 195 | # landms 196 | cv2.circle(img_raw, (b[5], b[6]), 1, (0, 0, 255), 4) 197 | cv2.circle(img_raw, (b[7], b[8]), 1, (0, 255, 255), 4) 198 | cv2.circle(img_raw, (b[9], b[10]), 1, (255, 0, 255), 4) 199 | cv2.circle(img_raw, (b[11], b[12]), 1, (0, 255, 0), 4) 200 | cv2.circle(img_raw, (b[13], b[14]), 1, (255, 0, 0), 4) 201 | # save image 202 | if not os.path.exists("./results/"): 203 | os.makedirs("./results/") 204 | name = "./results/" + str(i) + ".jpg" 205 | cv2.imwrite(name, img_raw) 206 | 207 | fw.close() 208 | -------------------------------------------------------------------------------- /test_widerface.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | import argparse 4 | import torch 5 | import torch.backends.cudnn as cudnn 6 | import numpy as np 7 | from data import cfg_mnet, cfg_re50 8 | from layers.functions.prior_box import PriorBox 9 | from utils.nms.py_cpu_nms import py_cpu_nms 10 | import cv2 11 | from models.retinaface import RetinaFace 12 | from utils.box_utils import decode, decode_landm 13 | from utils.timer import Timer 14 | 15 | 16 | parser = argparse.ArgumentParser(description='Retinaface') 17 | parser.add_argument('-m', '--trained_model', default='./weights/Resnet50_Final.pth', 18 | type=str, help='Trained state_dict file path to open') 19 | parser.add_argument('--network', default='resnet50', help='Backbone network mobile0.25 or resnet50') 20 | parser.add_argument('--origin_size', default=True, type=str, help='Whether use origin image size to evaluate') 21 | parser.add_argument('--save_folder', 
default='./widerface_evaluate/widerface_txt/', type=str, help='Dir to save txt results') 22 | parser.add_argument('--cpu', action="store_true", default=False, help='Use cpu inference') 23 | parser.add_argument('--dataset_folder', default='./data/widerface/val/images/', type=str, help='dataset path') 24 | parser.add_argument('--confidence_threshold', default=0.02, type=float, help='confidence_threshold') 25 | parser.add_argument('--top_k', default=5000, type=int, help='top_k') 26 | parser.add_argument('--nms_threshold', default=0.4, type=float, help='nms_threshold') 27 | parser.add_argument('--keep_top_k', default=750, type=int, help='keep_top_k') 28 | parser.add_argument('-s', '--save_image', action="store_true", default=False, help='show detection results') 29 | parser.add_argument('--vis_thres', default=0.5, type=float, help='visualization_threshold') 30 | args = parser.parse_args() 31 | 32 | 33 | def check_keys(model, pretrained_state_dict): 34 | ckpt_keys = set(pretrained_state_dict.keys()) 35 | model_keys = set(model.state_dict().keys()) 36 | used_pretrained_keys = model_keys & ckpt_keys 37 | unused_pretrained_keys = ckpt_keys - model_keys 38 | missing_keys = model_keys - ckpt_keys 39 | print('Missing keys:{}'.format(len(missing_keys))) 40 | print('Unused checkpoint keys:{}'.format(len(unused_pretrained_keys))) 41 | print('Used keys:{}'.format(len(used_pretrained_keys))) 42 | assert len(used_pretrained_keys) > 0, 'load NONE from pretrained checkpoint' 43 | return True 44 | 45 | 46 | def remove_prefix(state_dict, prefix): 47 | ''' Old style model is stored with all names of parameters sharing common prefix 'module.' ''' 48 | print('remove prefix \'{}\''.format(prefix)) 49 | f = lambda x: x.split(prefix, 1)[-1] if x.startswith(prefix) else x 50 | return {f(key): value for key, value in state_dict.items()} 51 | 52 | 53 | def load_model(model, pretrained_path, load_to_cpu): 54 | print('Loading pretrained model from {}'.format(pretrained_path)) 55 | if load_to_cpu: 56 | pretrained_dict = torch.load(pretrained_path, map_location=lambda storage, loc: storage) 57 | else: 58 | device = torch.cuda.current_device() 59 | pretrained_dict = torch.load(pretrained_path, map_location=lambda storage, loc: storage.cuda(device)) 60 | if "state_dict" in pretrained_dict.keys(): 61 | pretrained_dict = remove_prefix(pretrained_dict['state_dict'], 'module.') 62 | else: 63 | pretrained_dict = remove_prefix(pretrained_dict, 'module.') 64 | check_keys(model, pretrained_dict) 65 | model.load_state_dict(pretrained_dict, strict=False) 66 | return model 67 | 68 | 69 | if __name__ == '__main__': 70 | torch.set_grad_enabled(False) 71 | 72 | cfg = None 73 | if args.network == "mobile0.25": 74 | cfg = cfg_mnet 75 | elif args.network == "resnet50": 76 | cfg = cfg_re50 77 | # net and model 78 | net = RetinaFace(cfg=cfg, phase = 'test') 79 | net = load_model(net, args.trained_model, args.cpu) 80 | net.eval() 81 | print('Finished loading model!') 82 | print(net) 83 | cudnn.benchmark = True 84 | device = torch.device("cpu" if args.cpu else "cuda") 85 | net = net.to(device) 86 | 87 | # testing dataset 88 | testset_folder = args.dataset_folder 89 | testset_list = args.dataset_folder[:-7] + "wider_val.txt" 90 | 91 | with open(testset_list, 'r') as fr: 92 | test_dataset = fr.read().split() 93 | num_images = len(test_dataset) 94 | 95 | _t = {'forward_pass': Timer(), 'misc': Timer()} 96 | 97 | # testing begin 98 | for i, img_name in enumerate(test_dataset): 99 | image_path = testset_folder + img_name 100 | img_raw = 
cv2.imread(image_path, cv2.IMREAD_COLOR) 101 | img = np.float32(img_raw) 102 | 103 | # testing scale 104 | target_size = 1600 105 | max_size = 2150 106 | im_shape = img.shape 107 | im_size_min = np.min(im_shape[0:2]) 108 | im_size_max = np.max(im_shape[0:2]) 109 | resize = float(target_size) / float(im_size_min) 110 | # prevent bigger axis from being more than max_size: 111 | if np.round(resize * im_size_max) > max_size: 112 | resize = float(max_size) / float(im_size_max) 113 | if args.origin_size: 114 | resize = 1 115 | 116 | if resize != 1: 117 | img = cv2.resize(img, None, None, fx=resize, fy=resize, interpolation=cv2.INTER_LINEAR) 118 | im_height, im_width, _ = img.shape 119 | scale = torch.Tensor([img.shape[1], img.shape[0], img.shape[1], img.shape[0]]) 120 | img -= (104, 117, 123) 121 | img = img.transpose(2, 0, 1) 122 | img = torch.from_numpy(img).unsqueeze(0) 123 | img = img.to(device) 124 | scale = scale.to(device) 125 | 126 | _t['forward_pass'].tic() 127 | loc, conf, landms = net(img) # forward pass 128 | _t['forward_pass'].toc() 129 | _t['misc'].tic() 130 | priorbox = PriorBox(cfg, image_size=(im_height, im_width)) 131 | priors = priorbox.forward() 132 | priors = priors.to(device) 133 | prior_data = priors.data 134 | boxes = decode(loc.data.squeeze(0), prior_data, cfg['variance']) 135 | boxes = boxes * scale / resize 136 | boxes = boxes.cpu().numpy() 137 | scores = conf.squeeze(0).data.cpu().numpy()[:, 1] 138 | landms = decode_landm(landms.data.squeeze(0), prior_data, cfg['variance']) 139 | scale1 = torch.Tensor([img.shape[3], img.shape[2], img.shape[3], img.shape[2], 140 | img.shape[3], img.shape[2], img.shape[3], img.shape[2], 141 | img.shape[3], img.shape[2]]) 142 | scale1 = scale1.to(device) 143 | landms = landms * scale1 / resize 144 | landms = landms.cpu().numpy() 145 | 146 | # ignore low scores 147 | inds = np.where(scores > args.confidence_threshold)[0] 148 | boxes = boxes[inds] 149 | landms = landms[inds] 150 | scores = scores[inds] 151 | 152 | # keep top-K before NMS 153 | order = scores.argsort()[::-1] 154 | # order = scores.argsort()[::-1][:args.top_k] 155 | boxes = boxes[order] 156 | landms = landms[order] 157 | scores = scores[order] 158 | 159 | # do NMS 160 | dets = np.hstack((boxes, scores[:, np.newaxis])).astype(np.float32, copy=False) 161 | keep = py_cpu_nms(dets, args.nms_threshold) 162 | # keep = nms(dets, args.nms_threshold,force_cpu=args.cpu) 163 | dets = dets[keep, :] 164 | landms = landms[keep] 165 | 166 | # keep top-K faster NMS 167 | # dets = dets[:args.keep_top_k, :] 168 | # landms = landms[:args.keep_top_k, :] 169 | 170 | dets = np.concatenate((dets, landms), axis=1) 171 | _t['misc'].toc() 172 | 173 | # -------------------------------------------------------------------- 174 | save_name = args.save_folder + img_name[:-4] + ".txt" 175 | dirname = os.path.dirname(save_name) 176 | if not os.path.isdir(dirname): 177 | os.makedirs(dirname) 178 | with open(save_name, "w") as fd: 179 | bboxs = dets 180 | file_name = os.path.basename(save_name)[:-4] + "\n" 181 | bboxs_num = str(len(bboxs)) + "\n" 182 | fd.write(file_name) 183 | fd.write(bboxs_num) 184 | for box in bboxs: 185 | x = int(box[0]) 186 | y = int(box[1]) 187 | w = int(box[2]) - int(box[0]) 188 | h = int(box[3]) - int(box[1]) 189 | confidence = str(box[4]) 190 | line = str(x) + " " + str(y) + " " + str(w) + " " + str(h) + " " + confidence + " \n" 191 | fd.write(line) 192 | 193 | print('im_detect: {:d}/{:d} forward_pass_time: {:.4f}s misc: {:.4f}s'.format(i + 1, num_images, 
_t['forward_pass'].average_time, _t['misc'].average_time)) 194 | 195 | # save image 196 | if args.save_image: 197 | for b in dets: 198 | if b[4] < args.vis_thres: 199 | continue 200 | text = "{:.4f}".format(b[4]) 201 | b = list(map(int, b)) 202 | cv2.rectangle(img_raw, (b[0], b[1]), (b[2], b[3]), (0, 0, 255), 2) 203 | cx = b[0] 204 | cy = b[1] + 12 205 | cv2.putText(img_raw, text, (cx, cy), 206 | cv2.FONT_HERSHEY_DUPLEX, 0.5, (255, 255, 255)) 207 | 208 | # landms 209 | cv2.circle(img_raw, (b[5], b[6]), 1, (0, 0, 255), 4) 210 | cv2.circle(img_raw, (b[7], b[8]), 1, (0, 255, 255), 4) 211 | cv2.circle(img_raw, (b[9], b[10]), 1, (255, 0, 255), 4) 212 | cv2.circle(img_raw, (b[11], b[12]), 1, (0, 255, 0), 4) 213 | cv2.circle(img_raw, (b[13], b[14]), 1, (255, 0, 0), 4) 214 | # save image 215 | if not os.path.exists("./results/"): 216 | os.makedirs("./results/") 217 | name = "./results/" + str(i) + ".jpg" 218 | cv2.imwrite(name, img_raw) 219 | 220 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | import torch 4 | import torch.optim as optim 5 | import torch.backends.cudnn as cudnn 6 | import argparse 7 | import torch.utils.data as data 8 | from data import WiderFaceDetection, detection_collate, preproc, cfg_mnet, cfg_re50 9 | from layers.modules import MultiBoxLoss 10 | from layers.functions.prior_box import PriorBox 11 | import time 12 | import datetime 13 | import math 14 | from models.retinaface import RetinaFace 15 | 16 | parser = argparse.ArgumentParser(description='Retinaface Training') 17 | parser.add_argument('--training_dataset', default='./data/widerface/train/label.txt', help='Training dataset directory') 18 | parser.add_argument('--network', default='mobile0.25', help='Backbone network mobile0.25 or resnet50') 19 | parser.add_argument('--num_workers', default=4, type=int, help='Number of workers used in dataloading') 20 | parser.add_argument('--lr', '--learning-rate', default=1e-3, type=float, help='initial learning rate') 21 | parser.add_argument('--momentum', default=0.9, type=float, help='momentum') 22 | parser.add_argument('--resume_net', default=None, help='resume net for retraining') 23 | parser.add_argument('--resume_epoch', default=0, type=int, help='resume iter for retraining') 24 | parser.add_argument('--weight_decay', default=5e-4, type=float, help='Weight decay for SGD') 25 | parser.add_argument('--gamma', default=0.1, type=float, help='Gamma update for SGD') 26 | parser.add_argument('--save_folder', default='./weights/', help='Location to save checkpoint models') 27 | 28 | args = parser.parse_args() 29 | 30 | if not os.path.exists(args.save_folder): 31 | os.mkdir(args.save_folder) 32 | cfg = None 33 | if args.network == "mobile0.25": 34 | cfg = cfg_mnet 35 | elif args.network == "resnet50": 36 | cfg = cfg_re50 37 | 38 | rgb_mean = (104, 117, 123) # bgr order 39 | num_classes = 2 40 | img_dim = cfg['image_size'] 41 | num_gpu = cfg['ngpu'] 42 | batch_size = cfg['batch_size'] 43 | max_epoch = cfg['epoch'] 44 | gpu_train = cfg['gpu_train'] 45 | 46 | num_workers = args.num_workers 47 | momentum = args.momentum 48 | weight_decay = args.weight_decay 49 | initial_lr = args.lr 50 | gamma = args.gamma 51 | training_dataset = args.training_dataset 52 | save_folder = args.save_folder 53 | 54 | net = RetinaFace(cfg=cfg) 55 | print("Printing net...") 56 | print(net) 57 | 58 | if args.resume_net is not 
None: 59 | print('Loading resume network...') 60 | state_dict = torch.load(args.resume_net) 61 | # create new OrderedDict that does not contain `module.` 62 | from collections import OrderedDict 63 | new_state_dict = OrderedDict() 64 | for k, v in state_dict.items(): 65 | head = k[:7] 66 | if head == 'module.': 67 | name = k[7:] # remove `module.` 68 | else: 69 | name = k 70 | new_state_dict[name] = v 71 | net.load_state_dict(new_state_dict) 72 | 73 | if num_gpu > 1 and gpu_train: 74 | net = torch.nn.DataParallel(net).cuda() 75 | else: 76 | net = net.cuda() 77 | 78 | cudnn.benchmark = True 79 | 80 | 81 | optimizer = optim.SGD(net.parameters(), lr=initial_lr, momentum=momentum, weight_decay=weight_decay) 82 | criterion = MultiBoxLoss(num_classes, 0.35, True, 0, True, 7, 0.35, False) 83 | 84 | priorbox = PriorBox(cfg, image_size=(img_dim, img_dim)) 85 | with torch.no_grad(): 86 | priors = priorbox.forward() 87 | priors = priors.cuda() 88 | 89 | def train(): 90 | net.train() 91 | epoch = 0 + args.resume_epoch 92 | print('Loading Dataset...') 93 | 94 | dataset = WiderFaceDetection( training_dataset,preproc(img_dim, rgb_mean)) 95 | 96 | epoch_size = math.ceil(len(dataset) / batch_size) 97 | max_iter = max_epoch * epoch_size 98 | 99 | stepvalues = (cfg['decay1'] * epoch_size, cfg['decay2'] * epoch_size) 100 | step_index = 0 101 | 102 | if args.resume_epoch > 0: 103 | start_iter = args.resume_epoch * epoch_size 104 | else: 105 | start_iter = 0 106 | 107 | for iteration in range(start_iter, max_iter): 108 | if iteration % epoch_size == 0: 109 | # create batch iterator 110 | batch_iterator = iter(data.DataLoader(dataset, batch_size, shuffle=True, num_workers=num_workers, collate_fn=detection_collate)) 111 | if (epoch % 10 == 0 and epoch > 0) or (epoch % 5 == 0 and epoch > cfg['decay1']): 112 | torch.save(net.state_dict(), save_folder + cfg['name']+ '_epoch_' + str(epoch) + '.pth') 113 | epoch += 1 114 | 115 | load_t0 = time.time() 116 | if iteration in stepvalues: 117 | step_index += 1 118 | lr = adjust_learning_rate(optimizer, gamma, epoch, step_index, iteration, epoch_size) 119 | 120 | # load train data 121 | images, targets = next(batch_iterator) 122 | images = images.cuda() 123 | targets = [anno.cuda() for anno in targets] 124 | 125 | # forward 126 | out = net(images) 127 | 128 | # backprop 129 | optimizer.zero_grad() 130 | loss_l, loss_c, loss_landm = criterion(out, priors, targets) 131 | loss = cfg['loc_weight'] * loss_l + loss_c + loss_landm 132 | loss.backward() 133 | optimizer.step() 134 | load_t1 = time.time() 135 | batch_time = load_t1 - load_t0 136 | eta = int(batch_time * (max_iter - iteration)) 137 | print('Epoch:{}/{} || Epochiter: {}/{} || Iter: {}/{} || Loc: {:.4f} Cla: {:.4f} Landm: {:.4f} || LR: {:.8f} || Batchtime: {:.4f} s || ETA: {}' 138 | .format(epoch, max_epoch, (iteration % epoch_size) + 1, 139 | epoch_size, iteration + 1, max_iter, loss_l.item(), loss_c.item(), loss_landm.item(), lr, batch_time, str(datetime.timedelta(seconds=eta)))) 140 | 141 | torch.save(net.state_dict(), save_folder + cfg['name'] + '_Final.pth') 142 | # torch.save(net.state_dict(), save_folder + 'Final_Retinaface.pth') 143 | 144 | 145 | def adjust_learning_rate(optimizer, gamma, epoch, step_index, iteration, epoch_size): 146 | """Sets the learning rate 147 | # Adapted from PyTorch Imagenet example: 148 | # https://github.com/pytorch/examples/blob/master/imagenet/main.py 149 | """ 150 | warmup_epoch = -1 151 | if epoch <= warmup_epoch: 152 | lr = 1e-6 + (initial_lr-1e-6) * iteration / (epoch_size * 
warmup_epoch) 153 | else: 154 | lr = initial_lr * (gamma ** (step_index)) 155 | for param_group in optimizer.param_groups: 156 | param_group['lr'] = lr 157 | return lr 158 | 159 | if __name__ == '__main__': 160 | train() 161 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gengyanlei/Pytorch_Retinaface/b863d9ed7c405f3733d038f249ebf3581e3c7ba6/utils/__init__.py -------------------------------------------------------------------------------- /utils/box_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | 5 | def point_form(boxes): 6 | """ Convert prior_boxes to (xmin, ymin, xmax, ymax) 7 | representation for comparison to point form ground truth data. 8 | Args: 9 | boxes: (tensor) center-size default boxes from priorbox layers. 10 | Return: 11 | boxes: (tensor) Converted xmin, ymin, xmax, ymax form of boxes. 12 | """ 13 | return torch.cat((boxes[:, :2] - boxes[:, 2:]/2, # xmin, ymin 14 | boxes[:, :2] + boxes[:, 2:]/2), 1) # xmax, ymax 15 | 16 | 17 | def center_size(boxes): 18 | """ Convert prior_boxes to (cx, cy, w, h) 19 | representation for comparison to center-size form ground truth data. 20 | Args: 21 | boxes: (tensor) point_form boxes 22 | Return: 23 | boxes: (tensor) Converted xmin, ymin, xmax, ymax form of boxes. 24 | """ 25 | return torch.cat((boxes[:, 2:] + boxes[:, :2])/2, # cx, cy 26 | boxes[:, 2:] - boxes[:, :2], 1) # w, h 27 | 28 | 29 | def intersect(box_a, box_b): 30 | """ We resize both tensors to [A,B,2] without new malloc: 31 | [A,2] -> [A,1,2] -> [A,B,2] 32 | [B,2] -> [1,B,2] -> [A,B,2] 33 | Then we compute the area of intersect between box_a and box_b. 34 | Args: 35 | box_a: (tensor) bounding boxes, Shape: [A,4]. 36 | box_b: (tensor) bounding boxes, Shape: [B,4]. 37 | Return: 38 | (tensor) intersection area, Shape: [A,B]. 39 | """ 40 | A = box_a.size(0) 41 | B = box_b.size(0) 42 | max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2), 43 | box_b[:, 2:].unsqueeze(0).expand(A, B, 2)) 44 | min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2), 45 | box_b[:, :2].unsqueeze(0).expand(A, B, 2)) 46 | inter = torch.clamp((max_xy - min_xy), min=0) 47 | return inter[:, :, 0] * inter[:, :, 1] 48 | 49 | 50 | def jaccard(box_a, box_b): 51 | """Compute the jaccard overlap of two sets of boxes. The jaccard overlap 52 | is simply the intersection over union of two boxes. Here we operate on 53 | ground truth boxes and default boxes. 
54 | E.g.: 55 | A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B) 56 | Args: 57 | box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4] 58 | box_b: (tensor) Prior boxes from priorbox layers, Shape: [num_priors,4] 59 | Return: 60 | jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)] 61 | """ 62 | inter = intersect(box_a, box_b) 63 | area_a = ((box_a[:, 2]-box_a[:, 0]) * 64 | (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B] 65 | area_b = ((box_b[:, 2]-box_b[:, 0]) * 66 | (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B] 67 | union = area_a + area_b - inter 68 | return inter / union # [A,B] 69 | 70 | 71 | def matrix_iou(a, b): 72 | """ 73 | return iou of a and b, numpy version for data augenmentation 74 | """ 75 | lt = np.maximum(a[:, np.newaxis, :2], b[:, :2]) 76 | rb = np.minimum(a[:, np.newaxis, 2:], b[:, 2:]) 77 | 78 | area_i = np.prod(rb - lt, axis=2) * (lt < rb).all(axis=2) 79 | area_a = np.prod(a[:, 2:] - a[:, :2], axis=1) 80 | area_b = np.prod(b[:, 2:] - b[:, :2], axis=1) 81 | return area_i / (area_a[:, np.newaxis] + area_b - area_i) 82 | 83 | 84 | def matrix_iof(a, b): 85 | """ 86 | return iof of a and b, numpy version for data augenmentation 87 | """ 88 | lt = np.maximum(a[:, np.newaxis, :2], b[:, :2]) 89 | rb = np.minimum(a[:, np.newaxis, 2:], b[:, 2:]) 90 | 91 | area_i = np.prod(rb - lt, axis=2) * (lt < rb).all(axis=2) 92 | area_a = np.prod(a[:, 2:] - a[:, :2], axis=1) 93 | return area_i / np.maximum(area_a[:, np.newaxis], 1) 94 | 95 | 96 | def match(threshold, truths, priors, variances, labels, landms, loc_t, conf_t, landm_t, idx): 97 | """Match each prior box with the ground truth box of the highest jaccard 98 | overlap, encode the bounding boxes, then return the matched indices 99 | corresponding to both confidence and location preds. 100 | Args: 101 | threshold: (float) The overlap threshold used when mathing boxes. 102 | truths: (tensor) Ground truth boxes, Shape: [num_obj, 4]. 103 | priors: (tensor) Prior boxes from priorbox layers, Shape: [n_priors,4]. 104 | variances: (tensor) Variances corresponding to each prior coord, 105 | Shape: [num_priors, 4]. 106 | labels: (tensor) All the class labels for the image, Shape: [num_obj]. 107 | landms: (tensor) Ground truth landms, Shape [num_obj, 10]. 108 | loc_t: (tensor) Tensor to be filled w/ endcoded location targets. 109 | conf_t: (tensor) Tensor to be filled w/ matched indices for conf preds. 110 | landm_t: (tensor) Tensor to be filled w/ endcoded landm targets. 111 | idx: (int) current batch index 112 | Return: 113 | The matched indices corresponding to 1)location 2)confidence 3)landm preds. 
114 | """ 115 | # jaccard index 116 | overlaps = jaccard( 117 | truths, 118 | point_form(priors) 119 | ) 120 | # (Bipartite Matching) 121 | # [1,num_objects] best prior for each ground truth 122 | best_prior_overlap, best_prior_idx = overlaps.max(1, keepdim=True) 123 | 124 | # ignore hard gt 125 | valid_gt_idx = best_prior_overlap[:, 0] >= 0.2 126 | best_prior_idx_filter = best_prior_idx[valid_gt_idx, :] 127 | if best_prior_idx_filter.shape[0] <= 0: 128 | loc_t[idx] = 0 129 | conf_t[idx] = 0 130 | return 131 | 132 | # [1,num_priors] best ground truth for each prior 133 | best_truth_overlap, best_truth_idx = overlaps.max(0, keepdim=True) 134 | best_truth_idx.squeeze_(0) 135 | best_truth_overlap.squeeze_(0) 136 | best_prior_idx.squeeze_(1) 137 | best_prior_idx_filter.squeeze_(1) 138 | best_prior_overlap.squeeze_(1) 139 | best_truth_overlap.index_fill_(0, best_prior_idx_filter, 2) # ensure best prior 140 | # TODO refactor: index best_prior_idx with long tensor 141 | # ensure every gt matches with its prior of max overlap 142 | for j in range(best_prior_idx.size(0)): # decide which ground-truth box this anchor is responsible for predicting 143 | best_truth_idx[best_prior_idx[j]] = j 144 | matches = truths[best_truth_idx] # Shape: [num_priors,4] gather the matched gt bbox for each anchor 145 | conf = labels[best_truth_idx] # Shape: [num_priors] gather the matched label for each anchor 146 | conf[best_truth_overlap < threshold] = 0 # label as background: anchors with overlap < 0.35 are treated as negatives 147 | loc = encode(matches, priors, variances) 148 | 149 | matches_landm = landms[best_truth_idx] 150 | landm = encode_landm(matches_landm, priors, variances) 151 | loc_t[idx] = loc # [num_priors,4] encoded offsets to learn 152 | conf_t[idx] = conf # [num_priors] top class label for each prior 153 | landm_t[idx] = landm 154 | 155 | 156 | def encode(matched, priors, variances): 157 | """Encode the variances from the priorbox layers into the ground truth boxes 158 | we have matched (based on jaccard overlap) with the prior boxes. 159 | Args: 160 | matched: (tensor) Coords of ground truth for each prior in point-form 161 | Shape: [num_priors, 4]. 162 | priors: (tensor) Prior boxes in center-offset form 163 | Shape: [num_priors,4]. 164 | variances: (list[float]) Variances of priorboxes 165 | Return: 166 | encoded boxes (tensor), Shape: [num_priors, 4] 167 | """ 168 | 169 | # dist b/t match center and prior's center 170 | g_cxcy = (matched[:, :2] + matched[:, 2:])/2 - priors[:, :2] 171 | # encode variance 172 | g_cxcy /= (variances[0] * priors[:, 2:]) 173 | # match wh / prior wh 174 | g_wh = (matched[:, 2:] - matched[:, :2]) / priors[:, 2:] 175 | g_wh = torch.log(g_wh) / variances[1] 176 | # return target for smooth_l1_loss 177 | return torch.cat([g_cxcy, g_wh], 1) # [num_priors,4] 178 | 179 | def encode_landm(matched, priors, variances): 180 | """Encode the variances from the priorbox layers into the ground truth boxes 181 | we have matched (based on jaccard overlap) with the prior boxes. 182 | Args: 183 | matched: (tensor) Coords of ground truth for each prior in point-form 184 | Shape: [num_priors, 10]. 185 | priors: (tensor) Prior boxes in center-offset form 186 | Shape: [num_priors,4].
187 | variances: (list[float]) Variances of priorboxes 188 | Return: 189 | encoded landm (tensor), Shape: [num_priors, 10] 190 | """ 191 | 192 | # dist b/t match center and prior's center 193 | matched = torch.reshape(matched, (matched.size(0), 5, 2)) 194 | priors_cx = priors[:, 0].unsqueeze(1).expand(matched.size(0), 5).unsqueeze(2) 195 | priors_cy = priors[:, 1].unsqueeze(1).expand(matched.size(0), 5).unsqueeze(2) 196 | priors_w = priors[:, 2].unsqueeze(1).expand(matched.size(0), 5).unsqueeze(2) 197 | priors_h = priors[:, 3].unsqueeze(1).expand(matched.size(0), 5).unsqueeze(2) 198 | priors = torch.cat([priors_cx, priors_cy, priors_w, priors_h], dim=2) 199 | g_cxcy = matched[:, :, :2] - priors[:, :, :2] 200 | # encode variance 201 | g_cxcy /= (variances[0] * priors[:, :, 2:]) 202 | # g_cxcy /= priors[:, :, 2:] 203 | g_cxcy = g_cxcy.reshape(g_cxcy.size(0), -1) 204 | # return target for smooth_l1_loss 205 | return g_cxcy 206 | 207 | 208 | # Adapted from https://github.com/Hakuyume/chainer-ssd 209 | def decode(loc, priors, variances): 210 | """Decode locations from predictions using priors to undo 211 | the encoding we did for offset regression at train time. 212 | Args: 213 | loc (tensor): location predictions for loc layers, 214 | Shape: [num_priors,4] 215 | priors (tensor): Prior boxes in center-offset form. 216 | Shape: [num_priors,4]. 217 | variances: (list[float]) Variances of priorboxes 218 | Return: 219 | decoded bounding box predictions 220 | """ 221 | 222 | boxes = torch.cat(( 223 | priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:], 224 | priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1) 225 | boxes[:, :2] -= boxes[:, 2:] / 2 226 | boxes[:, 2:] += boxes[:, :2] 227 | return boxes 228 | 229 | def decode_landm(pre, priors, variances): 230 | """Decode landm from predictions using priors to undo 231 | the encoding we did for offset regression at train time. 232 | Args: 233 | pre (tensor): landm predictions for loc layers, 234 | Shape: [num_priors,10] 235 | priors (tensor): Prior boxes in center-offset form. 236 | Shape: [num_priors,4]. 237 | variances: (list[float]) Variances of priorboxes 238 | Return: 239 | decoded landm predictions 240 | """ 241 | landms = torch.cat((priors[:, :2] + pre[:, :2] * variances[0] * priors[:, 2:], 242 | priors[:, :2] + pre[:, 2:4] * variances[0] * priors[:, 2:], 243 | priors[:, :2] + pre[:, 4:6] * variances[0] * priors[:, 2:], 244 | priors[:, :2] + pre[:, 6:8] * variances[0] * priors[:, 2:], 245 | priors[:, :2] + pre[:, 8:10] * variances[0] * priors[:, 2:], 246 | ), dim=1) 247 | return landms 248 | 249 | 250 | def log_sum_exp(x): 251 | """Utility function for computing log_sum_exp while determining 252 | This will be used to determine unaveraged confidence loss across 253 | all examples in a batch. 254 | Args: 255 | x (Variable(tensor)): conf_preds from conf layers 256 | """ 257 | x_max = x.data.max() 258 | return torch.log(torch.sum(torch.exp(x-x_max), 1, keepdim=True)) + x_max 259 | 260 | 261 | # Original author: Francisco Massa: 262 | # https://github.com/fmassa/object-detection.torch 263 | # Ported to PyTorch by Max deGroot (02/01/2017) 264 | def nms(boxes, scores, overlap=0.5, top_k=200): 265 | """Apply non-maximum suppression at test time to avoid detecting too many 266 | overlapping bounding boxes for a given object. 267 | Args: 268 | boxes: (tensor) The location preds for the img, Shape: [num_priors,4]. 269 | scores: (tensor) The class predscores for the img, Shape:[num_priors]. 
270 | overlap: (float) The overlap thresh for suppressing unnecessary boxes. 271 | top_k: (int) The Maximum number of box preds to consider. 272 | Return: 273 | The indices of the kept boxes with respect to num_priors. 274 | """ 275 | 276 | keep = torch.Tensor(scores.size(0)).fill_(0).long() 277 | if boxes.numel() == 0: 278 | return keep 279 | x1 = boxes[:, 0] 280 | y1 = boxes[:, 1] 281 | x2 = boxes[:, 2] 282 | y2 = boxes[:, 3] 283 | area = torch.mul(x2 - x1, y2 - y1) 284 | v, idx = scores.sort(0) # sort in ascending order 285 | # I = I[v >= 0.01] 286 | idx = idx[-top_k:] # indices of the top-k largest vals 287 | xx1 = boxes.new() 288 | yy1 = boxes.new() 289 | xx2 = boxes.new() 290 | yy2 = boxes.new() 291 | w = boxes.new() 292 | h = boxes.new() 293 | 294 | # keep = torch.Tensor() 295 | count = 0 296 | while idx.numel() > 0: 297 | i = idx[-1] # index of current largest val 298 | # keep.append(i) 299 | keep[count] = i 300 | count += 1 301 | if idx.size(0) == 1: 302 | break 303 | idx = idx[:-1] # remove kept element from view 304 | # load bboxes of next highest vals 305 | torch.index_select(x1, 0, idx, out=xx1) 306 | torch.index_select(y1, 0, idx, out=yy1) 307 | torch.index_select(x2, 0, idx, out=xx2) 308 | torch.index_select(y2, 0, idx, out=yy2) 309 | # store element-wise max with next highest score 310 | xx1 = torch.clamp(xx1, min=x1[i]) 311 | yy1 = torch.clamp(yy1, min=y1[i]) 312 | xx2 = torch.clamp(xx2, max=x2[i]) 313 | yy2 = torch.clamp(yy2, max=y2[i]) 314 | w.resize_as_(xx2) 315 | h.resize_as_(yy2) 316 | w = xx2 - xx1 317 | h = yy2 - yy1 318 | # check sizes of xx1 and xx2.. after each iteration 319 | w = torch.clamp(w, min=0.0) 320 | h = torch.clamp(h, min=0.0) 321 | inter = w*h 322 | # IoU = i / (area(a) + area(b) - i) 323 | rem_areas = torch.index_select(area, 0, idx) # load remaining areas) 324 | union = (rem_areas - inter) + area[i] 325 | IoU = inter/union # store result in iou 326 | # keep only elements with an IoU <= overlap 327 | idx = idx[IoU.le(overlap)] 328 | return keep, count 329 | 330 | 331 | -------------------------------------------------------------------------------- /utils/nms/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gengyanlei/Pytorch_Retinaface/b863d9ed7c405f3733d038f249ebf3581e3c7ba6/utils/nms/__init__.py -------------------------------------------------------------------------------- /utils/nms/py_cpu_nms.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | 10 | def py_cpu_nms(dets, thresh): 11 | """Pure Python NMS baseline.""" 12 | x1 = dets[:, 0] 13 | y1 = dets[:, 1] 14 | x2 = dets[:, 2] 15 | y2 = dets[:, 3] 16 | scores = dets[:, 4] 17 | 18 | areas = (x2 - x1 + 1) * (y2 - y1 + 1) 19 | order = scores.argsort()[::-1] 20 | 21 | keep = [] 22 | while order.size > 0: 23 | i = order[0] 24 | keep.append(i) 25 | xx1 = np.maximum(x1[i], x1[order[1:]]) 26 | yy1 = np.maximum(y1[i], y1[order[1:]]) 27 | xx2 = np.minimum(x2[i], x2[order[1:]]) 28 | yy2 = np.minimum(y2[i], y2[order[1:]]) 29 | 30 | w = np.maximum(0.0, xx2 - xx1 + 1) 31 | h = np.maximum(0.0, yy2 - yy1 + 1) 32 | inter = w * h 33 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 34 | 35 | inds = 
np.where(ovr <= thresh)[0] 36 | order = order[inds + 1] 37 | 38 | return keep 39 | -------------------------------------------------------------------------------- /utils/timer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import time 9 | 10 | 11 | class Timer(object): 12 | """A simple timer.""" 13 | def __init__(self): 14 | self.total_time = 0. 15 | self.calls = 0 16 | self.start_time = 0. 17 | self.diff = 0. 18 | self.average_time = 0. 19 | 20 | def tic(self): 21 | # using time.time instead of time.clock because time time.clock 22 | # does not normalize for multithreading 23 | self.start_time = time.time() 24 | 25 | def toc(self, average=True): 26 | self.diff = time.time() - self.start_time 27 | self.total_time += self.diff 28 | self.calls += 1 29 | self.average_time = self.total_time / self.calls 30 | if average: 31 | return self.average_time 32 | else: 33 | return self.diff 34 | 35 | def clear(self): 36 | self.total_time = 0. 37 | self.calls = 0 38 | self.start_time = 0. 39 | self.diff = 0. 40 | self.average_time = 0. 41 | -------------------------------------------------------------------------------- /widerface_evaluate/README.md: -------------------------------------------------------------------------------- 1 | # WiderFace-Evaluation 2 | Python Evaluation Code for [Wider Face Dataset](http://mmlab.ie.cuhk.edu.hk/projects/WIDERFace/) 3 | 4 | 5 | ## Usage 6 | 7 | 8 | ##### before evaluating .... 9 | 10 | ```` 11 | python3 setup.py build_ext --inplace 12 | ```` 13 | 14 | ##### evaluating 15 | 16 | **GroungTruth:** `wider_face_val.mat`, `wider_easy_val.mat`, `wider_medium_val.mat`,`wider_hard_val.mat` 17 | 18 | ```` 19 | python3 evaluation.py -p -g 20 | ```` 21 | 22 | ## Bugs & Problems 23 | please issue 24 | 25 | ## Acknowledgements 26 | 27 | some code borrowed from Sergey Karayev 28 | -------------------------------------------------------------------------------- /widerface_evaluate/box_overlaps.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Sergey Karayev 6 | # -------------------------------------------------------- 7 | 8 | cimport cython 9 | import numpy as np 10 | cimport numpy as np 11 | 12 | DTYPE = np.float 13 | ctypedef np.float_t DTYPE_t 14 | 15 | def bbox_overlaps( 16 | np.ndarray[DTYPE_t, ndim=2] boxes, 17 | np.ndarray[DTYPE_t, ndim=2] query_boxes): 18 | """ 19 | Parameters 20 | ---------- 21 | boxes: (N, 4) ndarray of float 22 | query_boxes: (K, 4) ndarray of float 23 | Returns 24 | ------- 25 | overlaps: (N, K) ndarray of overlap between boxes and query_boxes 26 | """ 27 | cdef unsigned int N = boxes.shape[0] 28 | cdef unsigned int K = query_boxes.shape[0] 29 | cdef np.ndarray[DTYPE_t, ndim=2] overlaps = np.zeros((N, K), dtype=DTYPE) 30 | cdef DTYPE_t iw, ih, box_area 31 | cdef DTYPE_t ua 32 | cdef unsigned int k, n 33 | for k in range(K): 34 | box_area = ( 35 | (query_boxes[k, 2] - query_boxes[k, 0] + 1) * 36 | (query_boxes[k, 3] - query_boxes[k, 1] + 1) 37 | ) 38 | for n in range(N): 39 | iw = ( 40 | min(boxes[n, 2], 
query_boxes[k, 2]) - 41 | max(boxes[n, 0], query_boxes[k, 0]) + 1 42 | ) 43 | if iw > 0: 44 | ih = ( 45 | min(boxes[n, 3], query_boxes[k, 3]) - 46 | max(boxes[n, 1], query_boxes[k, 1]) + 1 47 | ) 48 | if ih > 0: 49 | ua = float( 50 | (boxes[n, 2] - boxes[n, 0] + 1) * 51 | (boxes[n, 3] - boxes[n, 1] + 1) + 52 | box_area - iw * ih 53 | ) 54 | overlaps[n, k] = iw * ih / ua 55 | return overlaps -------------------------------------------------------------------------------- /widerface_evaluate/evaluation.py: -------------------------------------------------------------------------------- 1 | """ 2 | WiderFace evaluation code 3 | author: wondervictor 4 | mail: tianhengcheng@gmail.com 5 | copyright@wondervictor 6 | """ 7 | 8 | import os 9 | import tqdm 10 | import pickle 11 | import argparse 12 | import numpy as np 13 | from scipy.io import loadmat 14 | from bbox import bbox_overlaps 15 | from IPython import embed 16 | 17 | 18 | def get_gt_boxes(gt_dir): 19 | """ gt dir: (wider_face_val.mat, wider_easy_val.mat, wider_medium_val.mat, wider_hard_val.mat)""" 20 | 21 | gt_mat = loadmat(os.path.join(gt_dir, 'wider_face_val.mat')) 22 | hard_mat = loadmat(os.path.join(gt_dir, 'wider_hard_val.mat')) 23 | medium_mat = loadmat(os.path.join(gt_dir, 'wider_medium_val.mat')) 24 | easy_mat = loadmat(os.path.join(gt_dir, 'wider_easy_val.mat')) 25 | 26 | facebox_list = gt_mat['face_bbx_list'] 27 | event_list = gt_mat['event_list'] 28 | file_list = gt_mat['file_list'] 29 | 30 | hard_gt_list = hard_mat['gt_list'] 31 | medium_gt_list = medium_mat['gt_list'] 32 | easy_gt_list = easy_mat['gt_list'] 33 | 34 | return facebox_list, event_list, file_list, hard_gt_list, medium_gt_list, easy_gt_list 35 | 36 | 37 | def get_gt_boxes_from_txt(gt_path, cache_dir): 38 | 39 | cache_file = os.path.join(cache_dir, 'gt_cache.pkl') 40 | if os.path.exists(cache_file): 41 | f = open(cache_file, 'rb') 42 | boxes = pickle.load(f) 43 | f.close() 44 | return boxes 45 | 46 | f = open(gt_path, 'r') 47 | state = 0 48 | lines = f.readlines() 49 | lines = list(map(lambda x: x.rstrip('\r\n'), lines)) 50 | boxes = {} 51 | print(len(lines)) 52 | f.close() 53 | current_boxes = [] 54 | current_name = None 55 | for line in lines: 56 | if state == 0 and '--' in line: 57 | state = 1 58 | current_name = line 59 | continue 60 | if state == 1: 61 | state = 2 62 | continue 63 | 64 | if state == 2 and '--' in line: 65 | state = 1 66 | boxes[current_name] = np.array(current_boxes).astype('float32') 67 | current_name = line 68 | current_boxes = [] 69 | continue 70 | 71 | if state == 2: 72 | box = [float(x) for x in line.split(' ')[:4]] 73 | current_boxes.append(box) 74 | continue 75 | 76 | f = open(cache_file, 'wb') 77 | pickle.dump(boxes, f) 78 | f.close() 79 | return boxes 80 | 81 | 82 | def read_pred_file(filepath): 83 | 84 | with open(filepath, 'r') as f: 85 | lines = f.readlines() 86 | img_file = lines[0].rstrip('\n\r') 87 | lines = lines[2:] 88 | 89 | # b = lines[0].rstrip('\r\n').split(' ')[:-1] 90 | # c = float(b) 91 | # a = map(lambda x: [[float(a[0]), float(a[1]), float(a[2]), float(a[3]), float(a[4])] for a in x.rstrip('\r\n').split(' ')], lines) 92 | boxes = [] 93 | for line in lines: 94 | line = line.rstrip('\r\n').split(' ') 95 | if line[0] is '': 96 | continue 97 | # a = float(line[4]) 98 | boxes.append([float(line[0]), float(line[1]), float(line[2]), float(line[3]), float(line[4])]) 99 | boxes = np.array(boxes) 100 | # boxes = np.array(list(map(lambda x: [float(a) for a in x.rstrip('\r\n').split(' ')], lines))).astype('float') 101 | return 
img_file.split('/')[-1], boxes 102 | 103 | 104 | def get_preds(pred_dir): 105 | events = os.listdir(pred_dir) 106 | boxes = dict() 107 | pbar = tqdm.tqdm(events) 108 | 109 | for event in pbar: 110 | pbar.set_description('Reading Predictions ') 111 | event_dir = os.path.join(pred_dir, event) 112 | event_images = os.listdir(event_dir) 113 | current_event = dict() 114 | for imgtxt in event_images: 115 | imgname, _boxes = read_pred_file(os.path.join(event_dir, imgtxt)) 116 | current_event[imgname.rstrip('.jpg')] = _boxes 117 | boxes[event] = current_event 118 | return boxes 119 | 120 | 121 | def norm_score(pred): 122 | """ norm score 123 | pred {key: [[x1,y1,x2,y2,s]]} 124 | """ 125 | 126 | max_score = 0 127 | min_score = 1 128 | 129 | for _, k in pred.items(): 130 | for _, v in k.items(): 131 | if len(v) == 0: 132 | continue 133 | _min = np.min(v[:, -1]) 134 | _max = np.max(v[:, -1]) 135 | max_score = max(_max, max_score) 136 | min_score = min(_min, min_score) 137 | 138 | diff = max_score - min_score 139 | for _, k in pred.items(): 140 | for _, v in k.items(): 141 | if len(v) == 0: 142 | continue 143 | v[:, -1] = (v[:, -1] - min_score)/diff 144 | 145 | 146 | def image_eval(pred, gt, ignore, iou_thresh): 147 | """ single image evaluation 148 | pred: Nx5 149 | gt: Nx4 150 | ignore: 151 | """ 152 | 153 | _pred = pred.copy() 154 | _gt = gt.copy() 155 | pred_recall = np.zeros(_pred.shape[0]) 156 | recall_list = np.zeros(_gt.shape[0]) 157 | proposal_list = np.ones(_pred.shape[0]) 158 | 159 | _pred[:, 2] = _pred[:, 2] + _pred[:, 0] 160 | _pred[:, 3] = _pred[:, 3] + _pred[:, 1] 161 | _gt[:, 2] = _gt[:, 2] + _gt[:, 0] 162 | _gt[:, 3] = _gt[:, 3] + _gt[:, 1] 163 | 164 | overlaps = bbox_overlaps(_pred[:, :4], _gt) 165 | 166 | for h in range(_pred.shape[0]): 167 | 168 | gt_overlap = overlaps[h] 169 | max_overlap, max_idx = gt_overlap.max(), gt_overlap.argmax() 170 | if max_overlap >= iou_thresh: 171 | if ignore[max_idx] == 0: 172 | recall_list[max_idx] = -1 173 | proposal_list[h] = -1 174 | elif recall_list[max_idx] == 0: 175 | recall_list[max_idx] = 1 176 | 177 | r_keep_index = np.where(recall_list == 1)[0] 178 | pred_recall[h] = len(r_keep_index) 179 | return pred_recall, proposal_list 180 | 181 | 182 | def img_pr_info(thresh_num, pred_info, proposal_list, pred_recall): 183 | pr_info = np.zeros((thresh_num, 2)).astype('float') 184 | for t in range(thresh_num): 185 | 186 | thresh = 1 - (t+1)/thresh_num 187 | r_index = np.where(pred_info[:, 4] >= thresh)[0] 188 | if len(r_index) == 0: 189 | pr_info[t, 0] = 0 190 | pr_info[t, 1] = 0 191 | else: 192 | r_index = r_index[-1] 193 | p_index = np.where(proposal_list[:r_index+1] == 1)[0] 194 | pr_info[t, 0] = len(p_index) 195 | pr_info[t, 1] = pred_recall[r_index] 196 | return pr_info 197 | 198 | 199 | def dataset_pr_info(thresh_num, pr_curve, count_face): 200 | _pr_curve = np.zeros((thresh_num, 2)) 201 | for i in range(thresh_num): 202 | _pr_curve[i, 0] = pr_curve[i, 1] / pr_curve[i, 0] 203 | _pr_curve[i, 1] = pr_curve[i, 1] / count_face 204 | return _pr_curve 205 | 206 | 207 | def voc_ap(rec, prec): 208 | 209 | # correct AP calculation 210 | # first append sentinel values at the end 211 | mrec = np.concatenate(([0.], rec, [1.])) 212 | mpre = np.concatenate(([0.], prec, [0.])) 213 | 214 | # compute the precision envelope 215 | for i in range(mpre.size - 1, 0, -1): 216 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 217 | 218 | # to calculate area under PR curve, look for points 219 | # where X axis (recall) changes value 220 | i = np.where(mrec[1:] != 
mrec[:-1])[0] 221 | 222 | # and sum (\Delta recall) * prec 223 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 224 | return ap 225 | 226 | 227 | def evaluation(pred, gt_path, iou_thresh=0.5): 228 | pred = get_preds(pred) 229 | norm_score(pred) 230 | facebox_list, event_list, file_list, hard_gt_list, medium_gt_list, easy_gt_list = get_gt_boxes(gt_path) 231 | event_num = len(event_list) 232 | thresh_num = 1000 233 | settings = ['easy', 'medium', 'hard'] 234 | setting_gts = [easy_gt_list, medium_gt_list, hard_gt_list] 235 | aps = [] 236 | for setting_id in range(3): 237 | # different setting 238 | gt_list = setting_gts[setting_id] 239 | count_face = 0 240 | pr_curve = np.zeros((thresh_num, 2)).astype('float') 241 | # [hard, medium, easy] 242 | pbar = tqdm.tqdm(range(event_num)) 243 | for i in pbar: 244 | pbar.set_description('Processing {}'.format(settings[setting_id])) 245 | event_name = str(event_list[i][0][0]) 246 | img_list = file_list[i][0] 247 | pred_list = pred[event_name] 248 | sub_gt_list = gt_list[i][0] 249 | # img_pr_info_list = np.zeros((len(img_list), thresh_num, 2)) 250 | gt_bbx_list = facebox_list[i][0] 251 | 252 | for j in range(len(img_list)): 253 | pred_info = pred_list[str(img_list[j][0][0])] 254 | 255 | gt_boxes = gt_bbx_list[j][0].astype('float') 256 | keep_index = sub_gt_list[j][0] 257 | count_face += len(keep_index) 258 | 259 | if len(gt_boxes) == 0 or len(pred_info) == 0: 260 | continue 261 | ignore = np.zeros(gt_boxes.shape[0]) 262 | if len(keep_index) != 0: 263 | ignore[keep_index-1] = 1 264 | pred_recall, proposal_list = image_eval(pred_info, gt_boxes, ignore, iou_thresh) 265 | 266 | _img_pr_info = img_pr_info(thresh_num, pred_info, proposal_list, pred_recall) 267 | 268 | pr_curve += _img_pr_info 269 | pr_curve = dataset_pr_info(thresh_num, pr_curve, count_face) 270 | 271 | propose = pr_curve[:, 0] 272 | recall = pr_curve[:, 1] 273 | 274 | ap = voc_ap(recall, propose) 275 | aps.append(ap) 276 | 277 | print("==================== Results ====================") 278 | print("Easy Val AP: {}".format(aps[0])) 279 | print("Medium Val AP: {}".format(aps[1])) 280 | print("Hard Val AP: {}".format(aps[2])) 281 | print("=================================================") 282 | 283 | 284 | if __name__ == '__main__': 285 | 286 | parser = argparse.ArgumentParser() 287 | parser.add_argument('-p', '--pred', default="./widerface_txt/") 288 | parser.add_argument('-g', '--gt', default='./ground_truth/') 289 | 290 | args = parser.parse_args() 291 | evaluation(args.pred, args.gt) 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 302 | 303 | 304 | -------------------------------------------------------------------------------- /widerface_evaluate/ground_truth/wider_easy_val.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gengyanlei/Pytorch_Retinaface/b863d9ed7c405f3733d038f249ebf3581e3c7ba6/widerface_evaluate/ground_truth/wider_easy_val.mat -------------------------------------------------------------------------------- /widerface_evaluate/ground_truth/wider_face_val.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gengyanlei/Pytorch_Retinaface/b863d9ed7c405f3733d038f249ebf3581e3c7ba6/widerface_evaluate/ground_truth/wider_face_val.mat -------------------------------------------------------------------------------- /widerface_evaluate/ground_truth/wider_hard_val.mat: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/gengyanlei/Pytorch_Retinaface/b863d9ed7c405f3733d038f249ebf3581e3c7ba6/widerface_evaluate/ground_truth/wider_hard_val.mat -------------------------------------------------------------------------------- /widerface_evaluate/ground_truth/wider_medium_val.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gengyanlei/Pytorch_Retinaface/b863d9ed7c405f3733d038f249ebf3581e3c7ba6/widerface_evaluate/ground_truth/wider_medium_val.mat -------------------------------------------------------------------------------- /widerface_evaluate/setup.py: -------------------------------------------------------------------------------- 1 | """ 2 | WiderFace evaluation code 3 | author: wondervictor 4 | mail: tianhengcheng@gmail.com 5 | copyright@wondervictor 6 | """ 7 | 8 | from distutils.core import setup, Extension 9 | from Cython.Build import cythonize 10 | import numpy 11 | 12 | package = Extension('bbox', ['box_overlaps.pyx'], include_dirs=[numpy.get_include()]) 13 | setup(ext_modules=cythonize([package])) 14 | --------------------------------------------------------------------------------
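
A minimal usage sketch for the evaluation helpers above: after building the Cython extension with `python3 setup.py build_ext --inplace` (see `widerface_evaluate/README.md`), the compiled `bbox` module exposes `bbox_overlaps`, which `evaluation.py` uses to score predictions against WIDER FACE ground truth. The box coordinates below are made-up illustrative values, not data from the repository; note also that with recent NumPy releases the `DTYPE = np.float` line in `box_overlaps.pyx` may need to be changed to `np.float64` before it compiles.

```python
# Illustrative sanity check of the compiled bbox extension (assumed built in-place).
import numpy as np
from bbox import bbox_overlaps  # extension built from box_overlaps.pyx by setup.py

# (N, 4) predicted boxes and (K, 4) ground-truth boxes in (x1, y1, x2, y2) form,
# float64 to match the extension's DTYPE.
boxes = np.array([[10., 10., 50., 50.],
                  [30., 30., 80., 90.]], dtype=np.float64)
query_boxes = np.array([[12., 12., 48., 52.]], dtype=np.float64)

overlaps = bbox_overlaps(boxes, query_boxes)  # (N, K) IoU matrix
print(overlaps)
```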