├── LICENSE.MIT ├── README.md ├── convert_to_onnx.py ├── curve ├── 1.jpg ├── FDDB.png ├── Widerface.jpg └── test.jpg ├── data ├── FDDB │ └── img_list.txt ├── __init__.py ├── config.py ├── data_augment.py ├── input │ └── huge.jpg ├── output │ ├── bbox_pred_acc.png │ └── test_5.jpg └── wider_face.py ├── detect.py ├── detect_merge.py ├── layers ├── __init__.py ├── functions │ └── prior_box.py └── modules │ ├── __init__.py │ └── multibox_loss.py ├── models ├── __init__.py ├── net.py └── retinaface.py ├── pose ├── __init__.py ├── datasets.py ├── detect_image.py ├── hopenet.py ├── test_alexnet.py ├── test_hopenet.py ├── test_on_video.py ├── test_on_video_dlib.py ├── test_on_video_dockerface.py ├── test_resnet50_regression.py ├── train_alexnet.py ├── train_hopenet.py ├── train_resnet50_regression.py └── utils.py ├── test_fddb.py ├── test_widerface.py ├── train.py ├── utils ├── __init__.py ├── box_utils.py ├── nms │ ├── __init__.py │ └── py_cpu_nms.py └── timer.py └── widerface_evaluate ├── README.md ├── box_overlaps.pyx ├── evaluation.py ├── ground_truth ├── wider_easy_val.mat ├── wider_face_val.mat ├── wider_hard_val.mat └── wider_medium_val.mat └── setup.py /LICENSE.MIT: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RetinaFace in PyTorch 2 | 3 | ``` 4 | author is leilei 5 | dataset is widerface 6 | face detection, face key point detection, and face pose estimation 7 | ``` 8 | 9 | ### Note 10 | + This repository is forked from [biubug6/Pytorch_Retinaface](https://github.com/biubug6/Pytorch_Retinaface) 11 | + Based on this project, we merge the three tasks of **face detection, face key point detection, and face pose estimation** into one task. 12 | + Now, we only release network's code and demo's code! More codes will be released in the future. 
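+ As described above, the test-phase network returns detection, landmark and pose outputs together. A minimal sketch of how they can be consumed (illustrative only; the full post-processing, including prior-box decoding and NMS, lives in `detect_merge.py`):

```
# single forward pass of the merged model (names follow detect_merge.py)
loc, conf, landms, yaw, pitch, roll = net(img)  # box offsets, face scores, 5-point landmarks, 66-bin pose logits

# each pose output is a 66-way classification over 3-degree bins spanning [-99, +99] degrees
idx = torch.arange(66, dtype=torch.float32, device=img.device)
yaw_deg = (torch.softmax(yaw, dim=-1) * idx).sum(-1) * 3 - 99  # pitch and roll are converted the same way
```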
13 | 14 | ### Inference 15 | ``` 16 | python detect_merge.py ${-m} ${--network} ${--image_path} ${--output_path} 17 | ``` 18 | 19 | ### Demo 20 | |![face-detect](./data/output/test_5.jpg)| 21 | |----| 22 | 23 | ### Weight 24 | + [GoogleDriver](https://drive.google.com/file/d/1YbMLrUdgmY1vNTQ8Y6OhR0pKifZeCGWa/view?usp=sharing) 25 | > | resnet50-retinaface | Easy | Medium | Hard | 26 | > | :----: | :----: | :----: | :----: | 27 | > | AP | 94.3486% | 93.3151% | 88.6972% | 28 | + ![face-detect-ap](./data/output/bbox_pred_acc.png) 29 | 30 | ### Train 31 | + TODO release code 32 | 33 | ### References 34 | + [biubug6/Pytorch_Retinaface](https://github.com/biubug6/Pytorch_Retinaface) 35 | + [HopeNet-pytorch](https://github.com/natanielruiz/deep-head-pose) 36 | + [insightface-RetinaFaceAntiCov](https://github.com/deepinsight/insightface/tree/master/detection/RetinaFaceAntiCov) -------------------------------------------------------------------------------- /convert_to_onnx.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | import argparse 4 | import torch 5 | import torch.backends.cudnn as cudnn 6 | import numpy as np 7 | from data import cfg_mnet, cfg_re50 8 | from layers.functions.prior_box import PriorBox 9 | from utils.nms.py_cpu_nms import py_cpu_nms 10 | import cv2 11 | from models.retinaface import RetinaFace 12 | from utils.box_utils import decode, decode_landm 13 | from utils.timer import Timer 14 | 15 | 16 | parser = argparse.ArgumentParser(description='Test') 17 | parser.add_argument('-m', '--trained_model', default='./weights/mobilenet0.25_Final.pth', 18 | type=str, help='Trained state_dict file path to open') 19 | parser.add_argument('--network', default='mobile0.25', help='Backbone network mobile0.25 or resnet50') 20 | parser.add_argument('--long_side', default=640, help='when origin_size is false, long_side is scaled size(320 or 640 for long side)') 21 | parser.add_argument('--cpu', action="store_true", default=True, help='Use cpu inference') 22 | 23 | args = parser.parse_args() 24 | 25 | 26 | def check_keys(model, pretrained_state_dict): 27 | ckpt_keys = set(pretrained_state_dict.keys()) 28 | model_keys = set(model.state_dict().keys()) 29 | used_pretrained_keys = model_keys & ckpt_keys 30 | unused_pretrained_keys = ckpt_keys - model_keys 31 | missing_keys = model_keys - ckpt_keys 32 | print('Missing keys:{}'.format(len(missing_keys))) 33 | print('Unused checkpoint keys:{}'.format(len(unused_pretrained_keys))) 34 | print('Used keys:{}'.format(len(used_pretrained_keys))) 35 | assert len(used_pretrained_keys) > 0, 'load NONE from pretrained checkpoint' 36 | return True 37 | 38 | 39 | def remove_prefix(state_dict, prefix): 40 | ''' Old style model is stored with all names of parameters sharing common prefix 'module.' 
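(e.g. checkpoints saved through nn.DataParallel)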
''' 41 | print('remove prefix \'{}\''.format(prefix)) 42 | f = lambda x: x.split(prefix, 1)[-1] if x.startswith(prefix) else x 43 | return {f(key): value for key, value in state_dict.items()} 44 | 45 | 46 | def load_model(model, pretrained_path, load_to_cpu): 47 | print('Loading pretrained model from {}'.format(pretrained_path)) 48 | if load_to_cpu: 49 | pretrained_dict = torch.load(pretrained_path, map_location=lambda storage, loc: storage) 50 | else: 51 | device = torch.cuda.current_device() 52 | pretrained_dict = torch.load(pretrained_path, map_location=lambda storage, loc: storage.cuda(device)) 53 | if "state_dict" in pretrained_dict.keys(): 54 | pretrained_dict = remove_prefix(pretrained_dict['state_dict'], 'module.') 55 | else: 56 | pretrained_dict = remove_prefix(pretrained_dict, 'module.') 57 | check_keys(model, pretrained_dict) 58 | model.load_state_dict(pretrained_dict, strict=False) 59 | return model 60 | 61 | 62 | if __name__ == '__main__': 63 | torch.set_grad_enabled(False) 64 | cfg = None 65 | if args.network == "mobile0.25": 66 | cfg = cfg_mnet 67 | elif args.network == "resnet50": 68 | cfg = cfg_re50 69 | # net and model 70 | net = RetinaFace(cfg=cfg, phase = 'test') 71 | net = load_model(net, args.trained_model, args.cpu) 72 | net.eval() 73 | print('Finished loading model!') 74 | print(net) 75 | device = torch.device("cpu" if args.cpu else "cuda") 76 | net = net.to(device) 77 | 78 | # ------------------------ export ----------------------------- 79 | output_onnx = 'FaceDetector.onnx' 80 | print("==> Exporting model to ONNX format at '{}'".format(output_onnx)) 81 | input_names = ["input0"] 82 | output_names = ["output0"] 83 | inputs = torch.randn(1, 3, args.long_side, args.long_side).to(device) 84 | 85 | torch_out = torch.onnx._export(net, inputs, output_onnx, export_params=True, verbose=False, 86 | input_names=input_names, output_names=output_names) 87 | 88 | 89 | -------------------------------------------------------------------------------- /curve/1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gengyanlei/Pytorch_Retinaface/b863d9ed7c405f3733d038f249ebf3581e3c7ba6/curve/1.jpg -------------------------------------------------------------------------------- /curve/FDDB.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gengyanlei/Pytorch_Retinaface/b863d9ed7c405f3733d038f249ebf3581e3c7ba6/curve/FDDB.png -------------------------------------------------------------------------------- /curve/Widerface.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gengyanlei/Pytorch_Retinaface/b863d9ed7c405f3733d038f249ebf3581e3c7ba6/curve/Widerface.jpg -------------------------------------------------------------------------------- /curve/test.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gengyanlei/Pytorch_Retinaface/b863d9ed7c405f3733d038f249ebf3581e3c7ba6/curve/test.jpg -------------------------------------------------------------------------------- /data/__init__.py: -------------------------------------------------------------------------------- 1 | from .wider_face import WiderFaceDetection, detection_collate 2 | from .data_augment import * 3 | from .config import * 4 | -------------------------------------------------------------------------------- /data/config.py: 
-------------------------------------------------------------------------------- 1 | # config.py 2 | 3 | cfg_mnet = { 4 | 'name': 'mobilenet0.25', 5 | 'min_sizes': [[16, 32], [64, 128], [256, 512]], 6 | 'steps': [8, 16, 32], 7 | 'variance': [0.1, 0.2], 8 | 'clip': False, 9 | 'loc_weight': 2.0, 10 | 'gpu_train': True, 11 | 'batch_size': 32, 12 | 'ngpu': 1, 13 | 'epoch': 250, 14 | 'decay1': 190, 15 | 'decay2': 220, 16 | 'image_size': 640, 17 | 'pretrain': True, 18 | 'return_layers': {'stage1': 1, 'stage2': 2, 'stage3': 3}, 19 | 'in_channel': 32, 20 | 'out_channel': 64 21 | } 22 | 23 | cfg_re50 = { 24 | 'name': 'Resnet50', 25 | 'min_sizes': [[16, 32], [64, 128], [256, 512]], 26 | 'steps': [8, 16, 32], 27 | 'variance': [0.1, 0.2], 28 | 'clip': False, 29 | 'loc_weight': 2.0, 30 | 'gpu_train': True, 31 | 'batch_size': 24, 32 | 'ngpu': 4, 33 | 'epoch': 100, 34 | 'decay1': 70, 35 | 'decay2': 90, 36 | 'image_size': 840, 37 | 'pretrain': True, 38 | 'return_layers': {'layer2': 1, 'layer3': 2, 'layer4': 3}, 39 | 'in_channel': 256, 40 | 'out_channel': 256 41 | } 42 | 43 | -------------------------------------------------------------------------------- /data/data_augment.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import random 4 | from utils.box_utils import matrix_iof 5 | 6 | 7 | def _crop(image, boxes, labels, landm, img_dim): 8 | height, width, _ = image.shape 9 | pad_image_flag = True 10 | 11 | for _ in range(250): 12 | """ 13 | if random.uniform(0, 1) <= 0.2: 14 | scale = 1.0 15 | else: 16 | scale = random.uniform(0.3, 1.0) 17 | """ 18 | PRE_SCALES = [0.3, 0.45, 0.6, 0.8, 1.0] 19 | scale = random.choice(PRE_SCALES) 20 | short_side = min(width, height) 21 | w = int(scale * short_side) 22 | h = w 23 | 24 | if width == w: 25 | l = 0 26 | else: 27 | l = random.randrange(width - w) 28 | if height == h: 29 | t = 0 30 | else: 31 | t = random.randrange(height - h) 32 | roi = np.array((l, t, l + w, t + h)) 33 | 34 | value = matrix_iof(boxes, roi[np.newaxis]) 35 | flag = (value >= 1) 36 | if not flag.any(): 37 | continue 38 | 39 | centers = (boxes[:, :2] + boxes[:, 2:]) / 2 40 | mask_a = np.logical_and(roi[:2] < centers, centers < roi[2:]).all(axis=1) 41 | boxes_t = boxes[mask_a].copy() 42 | labels_t = labels[mask_a].copy() 43 | landms_t = landm[mask_a].copy() 44 | landms_t = landms_t.reshape([-1, 5, 2]) 45 | 46 | if boxes_t.shape[0] == 0: 47 | continue 48 | 49 | image_t = image[roi[1]:roi[3], roi[0]:roi[2]] 50 | 51 | boxes_t[:, :2] = np.maximum(boxes_t[:, :2], roi[:2]) 52 | boxes_t[:, :2] -= roi[:2] 53 | boxes_t[:, 2:] = np.minimum(boxes_t[:, 2:], roi[2:]) 54 | boxes_t[:, 2:] -= roi[:2] 55 | 56 | # landm 57 | landms_t[:, :, :2] = landms_t[:, :, :2] - roi[:2] 58 | landms_t[:, :, :2] = np.maximum(landms_t[:, :, :2], np.array([0, 0])) 59 | landms_t[:, :, :2] = np.minimum(landms_t[:, :, :2], roi[2:] - roi[:2]) 60 | landms_t = landms_t.reshape([-1, 10]) 61 | 62 | 63 | # make sure that the cropped image contains at least one face > 16 pixel at training image scale 64 | b_w_t = (boxes_t[:, 2] - boxes_t[:, 0] + 1) / w * img_dim 65 | b_h_t = (boxes_t[:, 3] - boxes_t[:, 1] + 1) / h * img_dim 66 | mask_b = np.minimum(b_w_t, b_h_t) > 0.0 67 | boxes_t = boxes_t[mask_b] 68 | labels_t = labels_t[mask_b] 69 | landms_t = landms_t[mask_b] 70 | 71 | if boxes_t.shape[0] == 0: 72 | continue 73 | 74 | pad_image_flag = False 75 | 76 | return image_t, boxes_t, labels_t, landms_t, pad_image_flag 77 | return image, boxes, labels, landm, 
pad_image_flag 78 | 79 | 80 | def _distort(image): 81 | 82 | def _convert(image, alpha=1, beta=0): 83 | tmp = image.astype(float) * alpha + beta 84 | tmp[tmp < 0] = 0 85 | tmp[tmp > 255] = 255 86 | image[:] = tmp 87 | 88 | image = image.copy() 89 | 90 | if random.randrange(2): 91 | 92 | #brightness distortion 93 | if random.randrange(2): 94 | _convert(image, beta=random.uniform(-32, 32)) 95 | 96 | #contrast distortion 97 | if random.randrange(2): 98 | _convert(image, alpha=random.uniform(0.5, 1.5)) 99 | 100 | image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV) 101 | 102 | #saturation distortion 103 | if random.randrange(2): 104 | _convert(image[:, :, 1], alpha=random.uniform(0.5, 1.5)) 105 | 106 | #hue distortion 107 | if random.randrange(2): 108 | tmp = image[:, :, 0].astype(int) + random.randint(-18, 18) 109 | tmp %= 180 110 | image[:, :, 0] = tmp 111 | 112 | image = cv2.cvtColor(image, cv2.COLOR_HSV2BGR) 113 | 114 | else: 115 | 116 | #brightness distortion 117 | if random.randrange(2): 118 | _convert(image, beta=random.uniform(-32, 32)) 119 | 120 | image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV) 121 | 122 | #saturation distortion 123 | if random.randrange(2): 124 | _convert(image[:, :, 1], alpha=random.uniform(0.5, 1.5)) 125 | 126 | #hue distortion 127 | if random.randrange(2): 128 | tmp = image[:, :, 0].astype(int) + random.randint(-18, 18) 129 | tmp %= 180 130 | image[:, :, 0] = tmp 131 | 132 | image = cv2.cvtColor(image, cv2.COLOR_HSV2BGR) 133 | 134 | #contrast distortion 135 | if random.randrange(2): 136 | _convert(image, alpha=random.uniform(0.5, 1.5)) 137 | 138 | return image 139 | 140 | 141 | def _expand(image, boxes, fill, p): 142 | if random.randrange(2): 143 | return image, boxes 144 | 145 | height, width, depth = image.shape 146 | 147 | scale = random.uniform(1, p) 148 | w = int(scale * width) 149 | h = int(scale * height) 150 | 151 | left = random.randint(0, w - width) 152 | top = random.randint(0, h - height) 153 | 154 | boxes_t = boxes.copy() 155 | boxes_t[:, :2] += (left, top) 156 | boxes_t[:, 2:] += (left, top) 157 | expand_image = np.empty( 158 | (h, w, depth), 159 | dtype=image.dtype) 160 | expand_image[:, :] = fill 161 | expand_image[top:top + height, left:left + width] = image 162 | image = expand_image 163 | 164 | return image, boxes_t 165 | 166 | 167 | def _mirror(image, boxes, landms): 168 | _, width, _ = image.shape 169 | if random.randrange(2): 170 | image = image[:, ::-1] 171 | boxes = boxes.copy() 172 | boxes[:, 0::2] = width - boxes[:, 2::-2] 173 | 174 | # landm 175 | landms = landms.copy() 176 | landms = landms.reshape([-1, 5, 2]) 177 | landms[:, :, 0] = width - landms[:, :, 0] 178 | tmp = landms[:, 1, :].copy() 179 | landms[:, 1, :] = landms[:, 0, :] 180 | landms[:, 0, :] = tmp 181 | tmp1 = landms[:, 4, :].copy() 182 | landms[:, 4, :] = landms[:, 3, :] 183 | landms[:, 3, :] = tmp1 184 | landms = landms.reshape([-1, 10]) 185 | 186 | return image, boxes, landms 187 | 188 | 189 | def _pad_to_square(image, rgb_mean, pad_image_flag): 190 | if not pad_image_flag: 191 | return image 192 | height, width, _ = image.shape 193 | long_side = max(width, height) 194 | image_t = np.empty((long_side, long_side, 3), dtype=image.dtype) 195 | image_t[:, :] = rgb_mean 196 | image_t[0:0 + height, 0:0 + width] = image 197 | return image_t 198 | 199 | 200 | def _resize_subtract_mean(image, insize, rgb_mean): 201 | interp_methods = [cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_AREA, cv2.INTER_NEAREST, cv2.INTER_LANCZOS4] 202 | interp_method = interp_methods[random.randrange(5)] 203 | 
image = cv2.resize(image, (insize, insize), interpolation=interp_method) 204 | image = image.astype(np.float32) 205 | image -= rgb_mean 206 | return image.transpose(2, 0, 1) 207 | 208 | 209 | class preproc(object): 210 | 211 | def __init__(self, img_dim, rgb_means): 212 | self.img_dim = img_dim 213 | self.rgb_means = rgb_means 214 | 215 | def __call__(self, image, targets): 216 | assert targets.shape[0] > 0, "this image does not have gt" 217 | 218 | boxes = targets[:, :4].copy() 219 | labels = targets[:, -1].copy() 220 | landm = targets[:, 4:-1].copy() 221 | 222 | image_t, boxes_t, labels_t, landm_t, pad_image_flag = _crop(image, boxes, labels, landm, self.img_dim) 223 | image_t = _distort(image_t) 224 | image_t = _pad_to_square(image_t,self.rgb_means, pad_image_flag) 225 | image_t, boxes_t, landm_t = _mirror(image_t, boxes_t, landm_t) 226 | height, width, _ = image_t.shape 227 | image_t = _resize_subtract_mean(image_t, self.img_dim, self.rgb_means) 228 | boxes_t[:, 0::2] /= width 229 | boxes_t[:, 1::2] /= height 230 | 231 | landm_t[:, 0::2] /= width 232 | landm_t[:, 1::2] /= height 233 | 234 | labels_t = np.expand_dims(labels_t, 1) 235 | targets_t = np.hstack((boxes_t, landm_t, labels_t)) 236 | 237 | return image_t, targets_t 238 | -------------------------------------------------------------------------------- /data/input/huge.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gengyanlei/Pytorch_Retinaface/b863d9ed7c405f3733d038f249ebf3581e3c7ba6/data/input/huge.jpg -------------------------------------------------------------------------------- /data/output/bbox_pred_acc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gengyanlei/Pytorch_Retinaface/b863d9ed7c405f3733d038f249ebf3581e3c7ba6/data/output/bbox_pred_acc.png -------------------------------------------------------------------------------- /data/output/test_5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gengyanlei/Pytorch_Retinaface/b863d9ed7c405f3733d038f249ebf3581e3c7ba6/data/output/test_5.jpg -------------------------------------------------------------------------------- /data/wider_face.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path 3 | import sys 4 | import torch 5 | import torch.utils.data as data 6 | import cv2 7 | import numpy as np 8 | 9 | class WiderFaceDetection(data.Dataset): 10 | def __init__(self, txt_path, preproc=None): 11 | self.preproc = preproc 12 | self.imgs_path = [] 13 | self.words = [] 14 | f = open(txt_path,'r') 15 | lines = f.readlines() 16 | isFirst = True 17 | labels = [] 18 | for line in lines: 19 | line = line.rstrip() 20 | if line.startswith('#'): 21 | if isFirst is True: 22 | isFirst = False 23 | else: 24 | labels_copy = labels.copy() 25 | self.words.append(labels_copy) 26 | labels.clear() 27 | path = line[2:] 28 | path = txt_path.replace('label.txt','images/') + path 29 | self.imgs_path.append(path) 30 | else: 31 | line = line.split(' ') 32 | label = [float(x) for x in line] 33 | labels.append(label) 34 | 35 | self.words.append(labels) 36 | 37 | def __len__(self): 38 | return len(self.imgs_path) 39 | 40 | def __getitem__(self, index): 41 | img = cv2.imread(self.imgs_path[index]) 42 | height, width, _ = img.shape 43 | 44 | labels = self.words[index] 45 | annotations = np.zeros((0, 15)) 46 | if len(labels) == 0: 47 | 
return annotations 48 | for idx, label in enumerate(labels): 49 | annotation = np.zeros((1, 15)) 50 | # bbox 51 | annotation[0, 0] = label[0] # x1 52 | annotation[0, 1] = label[1] # y1 53 | annotation[0, 2] = label[0] + label[2] # x2 54 | annotation[0, 3] = label[1] + label[3] # y2 55 | 56 | # landmarks 57 | annotation[0, 4] = label[4] # l0_x 58 | annotation[0, 5] = label[5] # l0_y 59 | annotation[0, 6] = label[7] # l1_x 60 | annotation[0, 7] = label[8] # l1_y 61 | annotation[0, 8] = label[10] # l2_x 62 | annotation[0, 9] = label[11] # l2_y 63 | annotation[0, 10] = label[13] # l3_x 64 | annotation[0, 11] = label[14] # l3_y 65 | annotation[0, 12] = label[16] # l4_x 66 | annotation[0, 13] = label[17] # l4_y 67 | if (annotation[0, 4]<0): 68 | annotation[0, 14] = -1 69 | else: 70 | annotation[0, 14] = 1 71 | 72 | annotations = np.append(annotations, annotation, axis=0) 73 | target = np.array(annotations) 74 | if self.preproc is not None: 75 | img, target = self.preproc(img, target) 76 | 77 | return torch.from_numpy(img), target 78 | 79 | def detection_collate(batch): 80 | """Custom collate fn for dealing with batches of images that have a different 81 | number of associated object annotations (bounding boxes). 82 | 83 | Arguments: 84 | batch: (tuple) A tuple of tensor images and lists of annotations 85 | 86 | Return: 87 | A tuple containing: 88 | 1) (tensor) batch of images stacked on their 0 dim 89 | 2) (list of tensors) annotations for a given image are stacked on 0 dim 90 | """ 91 | targets = [] 92 | imgs = [] 93 | for _, sample in enumerate(batch): 94 | for _, tup in enumerate(sample): 95 | if torch.is_tensor(tup): 96 | imgs.append(tup) 97 | elif isinstance(tup, type(np.empty(0))): 98 | annos = torch.from_numpy(tup).float() 99 | targets.append(annos) 100 | 101 | return (torch.stack(imgs, 0), targets) 102 | -------------------------------------------------------------------------------- /detect.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | import argparse 4 | import torch 5 | import torch.backends.cudnn as cudnn 6 | import numpy as np 7 | from data import cfg_mnet, cfg_re50 8 | from layers.functions.prior_box import PriorBox 9 | from utils.nms.py_cpu_nms import py_cpu_nms 10 | import cv2 11 | from models.retinaface import RetinaFace 12 | from utils.box_utils import decode, decode_landm 13 | import time 14 | 15 | parser = argparse.ArgumentParser(description='Retinaface') 16 | 17 | parser.add_argument('-m', '--trained_model', default='./weights/Resnet50_Final.pth', 18 | type=str, help='Trained state_dict file path to open') 19 | parser.add_argument('--network', default='resnet50', help='Backbone network mobile0.25 or resnet50') 20 | parser.add_argument('--cpu', action="store_true", default=False, help='Use cpu inference') 21 | parser.add_argument('--confidence_threshold', default=0.02, type=float, help='confidence_threshold') 22 | parser.add_argument('--top_k', default=5000, type=int, help='top_k') 23 | parser.add_argument('--nms_threshold', default=0.4, type=float, help='nms_threshold') 24 | parser.add_argument('--keep_top_k', default=750, type=int, help='keep_top_k') 25 | parser.add_argument('-s', '--save_image', action="store_true", default=True, help='show detection results') 26 | parser.add_argument('--vis_thres', default=0.6, type=float, help='visualization_threshold') 27 | args = parser.parse_args() 28 | 29 | 30 | def check_keys(model, pretrained_state_dict): 31 | ckpt_keys = 
set(pretrained_state_dict.keys()) 32 | model_keys = set(model.state_dict().keys()) 33 | used_pretrained_keys = model_keys & ckpt_keys 34 | unused_pretrained_keys = ckpt_keys - model_keys 35 | missing_keys = model_keys - ckpt_keys 36 | print('Missing keys:{}'.format(len(missing_keys))) 37 | print('Unused checkpoint keys:{}'.format(len(unused_pretrained_keys))) 38 | print('Used keys:{}'.format(len(used_pretrained_keys))) 39 | assert len(used_pretrained_keys) > 0, 'load NONE from pretrained checkpoint' 40 | return True 41 | 42 | 43 | def remove_prefix(state_dict, prefix): 44 | ''' Old style model is stored with all names of parameters sharing common prefix 'module.' ''' 45 | print('remove prefix \'{}\''.format(prefix)) 46 | f = lambda x: x.split(prefix, 1)[-1] if x.startswith(prefix) else x 47 | return {f(key): value for key, value in state_dict.items()} 48 | 49 | 50 | def load_model(model, pretrained_path, load_to_cpu): 51 | print('Loading pretrained model from {}'.format(pretrained_path)) 52 | if load_to_cpu: 53 | pretrained_dict = torch.load(pretrained_path, map_location=lambda storage, loc: storage) 54 | else: 55 | device = torch.cuda.current_device() 56 | pretrained_dict = torch.load(pretrained_path, map_location=lambda storage, loc: storage.cuda(device)) 57 | if "state_dict" in pretrained_dict.keys(): 58 | pretrained_dict = remove_prefix(pretrained_dict['state_dict'], 'module.') 59 | else: 60 | pretrained_dict = remove_prefix(pretrained_dict, 'module.') 61 | check_keys(model, pretrained_dict) 62 | model.load_state_dict(pretrained_dict, strict=False) 63 | return model 64 | 65 | 66 | if __name__ == '__main__': 67 | torch.set_grad_enabled(False) 68 | cfg = None 69 | if args.network == "mobile0.25": 70 | cfg = cfg_mnet 71 | elif args.network == "resnet50": 72 | cfg = cfg_re50 73 | # net and model 74 | net = RetinaFace(cfg=cfg, phase = 'test') 75 | net = load_model(net, args.trained_model, args.cpu) 76 | net.eval() 77 | print('Finished loading model!') 78 | print(net) 79 | cudnn.benchmark = True 80 | device = torch.device("cpu" if args.cpu else "cuda") 81 | net = net.to(device) 82 | 83 | resize = 1 84 | 85 | # testing begin 86 | for i in range(100): 87 | image_path = "./curve/test.jpg" 88 | img_raw = cv2.imread(image_path, cv2.IMREAD_COLOR) 89 | 90 | img = np.float32(img_raw) 91 | 92 | im_height, im_width, _ = img.shape 93 | scale = torch.Tensor([img.shape[1], img.shape[0], img.shape[1], img.shape[0]]) 94 | img -= (104, 117, 123) 95 | img = img.transpose(2, 0, 1) 96 | img = torch.from_numpy(img).unsqueeze(0) 97 | img = img.to(device) 98 | scale = scale.to(device) 99 | 100 | tic = time.time() 101 | loc, conf, landms = net(img) # forward pass 102 | print('net forward time: {:.4f}'.format(time.time() - tic)) 103 | 104 | priorbox = PriorBox(cfg, image_size=(im_height, im_width)) 105 | priors = priorbox.forward() 106 | priors = priors.to(device) 107 | prior_data = priors.data 108 | boxes = decode(loc.data.squeeze(0), prior_data, cfg['variance']) 109 | boxes = boxes * scale / resize 110 | boxes = boxes.cpu().numpy() 111 | scores = conf.squeeze(0).data.cpu().numpy()[:, 1] 112 | landms = decode_landm(landms.data.squeeze(0), prior_data, cfg['variance']) 113 | scale1 = torch.Tensor([img.shape[3], img.shape[2], img.shape[3], img.shape[2], 114 | img.shape[3], img.shape[2], img.shape[3], img.shape[2], 115 | img.shape[3], img.shape[2]]) 116 | scale1 = scale1.to(device) 117 | landms = landms * scale1 / resize 118 | landms = landms.cpu().numpy() 119 | 120 | # ignore low scores 121 | inds = np.where(scores > 
args.confidence_threshold)[0] 122 | boxes = boxes[inds] 123 | landms = landms[inds] 124 | scores = scores[inds] 125 | 126 | # keep top-K before NMS 127 | order = scores.argsort()[::-1][:args.top_k] 128 | boxes = boxes[order] 129 | landms = landms[order] 130 | scores = scores[order] 131 | 132 | # do NMS 133 | dets = np.hstack((boxes, scores[:, np.newaxis])).astype(np.float32, copy=False) 134 | keep = py_cpu_nms(dets, args.nms_threshold) 135 | # keep = nms(dets, args.nms_threshold,force_cpu=args.cpu) 136 | dets = dets[keep, :] 137 | landms = landms[keep] 138 | 139 | # keep top-K faster NMS 140 | dets = dets[:args.keep_top_k, :] 141 | landms = landms[:args.keep_top_k, :] 142 | 143 | dets = np.concatenate((dets, landms), axis=1) 144 | 145 | # show image 146 | if args.save_image: 147 | for b in dets: 148 | if b[4] < args.vis_thres: 149 | continue 150 | text = "{:.4f}".format(b[4]) 151 | b = list(map(int, b)) 152 | cv2.rectangle(img_raw, (b[0], b[1]), (b[2], b[3]), (0, 0, 255), 2) 153 | cx = b[0] 154 | cy = b[1] + 12 155 | cv2.putText(img_raw, text, (cx, cy), 156 | cv2.FONT_HERSHEY_DUPLEX, 0.5, (255, 255, 255)) 157 | 158 | # landms 159 | cv2.circle(img_raw, (b[5], b[6]), 1, (0, 0, 255), 4) 160 | cv2.circle(img_raw, (b[7], b[8]), 1, (0, 255, 255), 4) 161 | cv2.circle(img_raw, (b[9], b[10]), 1, (255, 0, 255), 4) 162 | cv2.circle(img_raw, (b[11], b[12]), 1, (0, 255, 0), 4) 163 | cv2.circle(img_raw, (b[13], b[14]), 1, (255, 0, 0), 4) 164 | # save image 165 | 166 | name = "test.jpg" 167 | cv2.imwrite(name, img_raw) 168 | 169 | -------------------------------------------------------------------------------- /detect_merge.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | import argparse 4 | import torch 5 | import torch.backends.cudnn as cudnn 6 | import numpy as np 7 | from data import cfg_mnet, cfg_re50 8 | from layers.functions.prior_box import PriorBox 9 | from utils.nms.py_cpu_nms import py_cpu_nms 10 | import cv2 11 | from models.retinaface import RetinaFace 12 | from utils.box_utils import decode, decode_landm 13 | import time 14 | import torch.nn.functional as F 15 | from pose import utils 16 | 17 | os.environ['CUDA_VISIBLE_DEVICES'] = '0' 18 | 19 | parser = argparse.ArgumentParser(description='Retinaface') 20 | 21 | parser.add_argument('-m', '--trained_model', default='./weights_merge/Resnet50_Final5_best.pth', 22 | type=str, help='Trained state_dict file path to open') 23 | parser.add_argument('--network', default='resnet50', help='Backbone network mobile0.25 or resnet50') 24 | parser.add_argument('--cpu', action="store_true", default=False, help='Use cpu inference') 25 | parser.add_argument('--confidence_threshold', default=0.02, type=float, help='confidence_threshold') 26 | parser.add_argument('--top_k', default=5000, type=int, help='top_k') 27 | parser.add_argument('--nms_threshold', default=0.4, type=float, help='nms_threshold') 28 | parser.add_argument('--keep_top_k', default=750, type=int, help='keep_top_k') 29 | parser.add_argument('-s', '--save_image', default=True, type=bool, help='show detection results') 30 | parser.add_argument('--vis_thres', default=0.5, type=float, help='visualization_threshold') 31 | parser.add_argument('--image_path', default="/home/gengyanlei/Datasets/East_door_face/huge.jpg", type=str, help="image's path") 32 | parser.add_argument('--output_path', default="test_5.jpg", type=str, help='predict-visual') 33 | args = parser.parse_args() 34 | 35 | 36 | def 
check_keys(model, pretrained_state_dict): 37 | ckpt_keys = set(pretrained_state_dict.keys()) 38 | model_keys = set(model.state_dict().keys()) 39 | used_pretrained_keys = model_keys & ckpt_keys 40 | unused_pretrained_keys = ckpt_keys - model_keys 41 | missing_keys = model_keys - ckpt_keys 42 | print('Missing keys:{}'.format(len(missing_keys))) 43 | print('Unused checkpoint keys:{}'.format(len(unused_pretrained_keys))) 44 | print('Used keys:{}'.format(len(used_pretrained_keys))) 45 | assert len(used_pretrained_keys) > 0, 'load NONE from pretrained checkpoint' 46 | return True 47 | 48 | 49 | def remove_prefix(state_dict, prefix): 50 | ''' Old style model is stored with all names of parameters sharing common prefix 'module.' ''' 51 | print('remove prefix \'{}\''.format(prefix)) 52 | f = lambda x: x.split(prefix, 1)[-1] if x.startswith(prefix) else x 53 | return {f(key): value for key, value in state_dict.items()} 54 | 55 | 56 | def load_model(model, pretrained_path, load_to_cpu): 57 | print('Loading pretrained model from {}'.format(pretrained_path)) 58 | if load_to_cpu: 59 | pretrained_dict = torch.load(pretrained_path, map_location=lambda storage, loc: storage) 60 | else: 61 | device = torch.cuda.current_device() 62 | pretrained_dict = torch.load(pretrained_path, map_location=lambda storage, loc: storage.cuda(device)) 63 | if "state_dict" in pretrained_dict.keys(): 64 | pretrained_dict = remove_prefix(pretrained_dict['state_dict'], 'module.') 65 | else: 66 | pretrained_dict = remove_prefix(pretrained_dict, 'module.') 67 | check_keys(model, pretrained_dict) 68 | model.load_state_dict(pretrained_dict, strict=False) 69 | return model 70 | 71 | if __name__ == '__main__': 72 | torch.set_grad_enabled(False) 73 | cfg = None 74 | if args.network == "mobile0.25": 75 | cfg = cfg_mnet 76 | elif args.network == "resnet50": 77 | cfg = cfg_re50 78 | # net and model 79 | net = RetinaFace(cfg=cfg, phase='test') 80 | net = load_model(net, args.trained_model, args.cpu) 81 | net.eval() 82 | print('Finished loading model!') 83 | # print(net) 84 | cudnn.benchmark = True 85 | device = torch.device("cpu" if args.cpu else "cuda") 86 | net = net.to(device) 87 | 88 | idx_tensor = [idx for idx in range(66)] 89 | idx_tensor = torch.FloatTensor(idx_tensor).cuda() 90 | 91 | resize = 1 92 | 93 | # testing begin 94 | for i in range(1): 95 | image_path = args.image_path 96 | img_raw = cv2.imread(image_path, cv2.IMREAD_COLOR) 97 | 98 | img = np.float32(img_raw) 99 | # inference runs at the original image size, not the 640*640 training size 100 | im_height, im_width, _ = img.shape 101 | scale = torch.Tensor([img.shape[1], img.shape[0], img.shape[1], img.shape[0]]) #whwh 102 | img -= (104, 117, 123) 103 | # extra normalization by the per-channel std; the input stays BGR, consistent with training 104 | img /= (57, 57, 58) 105 | 106 | img = img.transpose(2, 0, 1) # chw 107 | img = torch.from_numpy(img).unsqueeze(0) 108 | img = img.to(device) 109 | scale = scale.to(device) 110 | 111 | tic = time.time() 112 | loc, conf, landms, yaw, pitch, roll = net(img) # forward pass 113 | print('net forward time: {:.4f}'.format(time.time() - tic)) 114 | 115 | priorbox = PriorBox(cfg, image_size=(im_height, im_width)) 116 | priors = priorbox.forward() 117 | priors = priors.to(device) 118 | prior_data = priors.data 119 | # decode applies the predicted offsets to the anchors, i.e. it turns anchor + regression output into absolute boxes 120 | boxes = decode(loc.data.squeeze(0), prior_data, cfg['variance']) 121 | boxes = boxes * scale / resize 122 | 123 | boxes = boxes.cpu().numpy() 124 | scores = conf.squeeze(0).data.cpu().numpy()[:, 1] 125 | landms = decode_landm(landms.data.squeeze(0), prior_data, cfg['variance']) 126 | # (w, h) repeated per landmark point, to map normalized coords back to pixels 127 |
scale1 = torch.Tensor([img.shape[3], img.shape[2], img.shape[3], img.shape[2], 128 | img.shape[3], img.shape[2], img.shape[3], img.shape[2], 129 | img.shape[3], img.shape[2]]) 130 | scale1 = scale1.to(device) 131 | landms = landms * scale1 / resize 132 | landms = landms.cpu().numpy() 133 | 134 | # ignore low scores 135 | inds = np.where(scores > args.confidence_threshold)[0] 136 | boxes = boxes[inds] 137 | landms = landms[inds] 138 | scores = scores[inds] 139 | 140 | yaw = yaw.squeeze(0)[inds] 141 | pitch = pitch.squeeze(0)[inds] 142 | roll = roll.squeeze(0)[inds] 143 | 144 | # keep top-K before NMS 需要进行排序,获取每个预测框的score 按照从大到小排序,应该是每一类! 145 | order = scores.argsort()[::-1][:args.top_k] 146 | boxes = boxes[order] 147 | landms = landms[order] 148 | scores = scores[order] 149 | 150 | yaw = yaw[order.tolist()] 151 | pitch = pitch[order.tolist()] 152 | roll = roll[order.tolist()] 153 | 154 | # do NMS 155 | dets = np.hstack((boxes, scores[:, np.newaxis])).astype(np.float32, copy=False) 156 | keep = py_cpu_nms(dets, args.nms_threshold) 157 | # keep = nms(dets, args.nms_threshold,force_cpu=args.cpu) 158 | dets = dets[keep, :] 159 | landms = landms[keep] 160 | 161 | yaw = yaw[keep] 162 | pitch = pitch[keep] 163 | roll = roll[keep] 164 | 165 | # keep top-K faster NMS 166 | dets = dets[:args.keep_top_k, :] 167 | landms = landms[:args.keep_top_k, :] 168 | 169 | yaw = yaw[:args.keep_top_k] 170 | pitch = pitch[:args.keep_top_k] 171 | roll = roll[:args.keep_top_k] 172 | 173 | yaw = F.softmax(yaw, dim=-1) 174 | pitch = F.softmax(pitch, dim=-1) 175 | roll = F.softmax(roll, dim=-1) 176 | yaw = torch.sum(yaw * idx_tensor, -1) * 3 - 99 177 | pitch = torch.sum(pitch * idx_tensor, -1) * 3 - 99 178 | roll = torch.sum(roll * idx_tensor, -1) * 3 - 99 179 | 180 | yaw = yaw.unsqueeze(-1).cpu().numpy() 181 | pitch = pitch.unsqueeze(-1).cpu().numpy() 182 | roll = roll.unsqueeze(-1).cpu().numpy() 183 | 184 | dets = np.concatenate((dets, landms, yaw, pitch, roll), axis=1) 185 | 186 | # show image 187 | if args.save_image: 188 | for b in dets: 189 | if b[4] < args.vis_thres: 190 | continue 191 | text = "{:.4f}".format(b[4]) 192 | b = list(map(int, b)) 193 | cv2.rectangle(img_raw, (b[0], b[1]), (b[2], b[3]), (0, 0, 255), 2) 194 | cx = b[0] 195 | cy = b[1] + 12 196 | cv2.putText(img_raw, text, (cx, cy), 197 | cv2.FONT_HERSHEY_DUPLEX, 0.5, (255, 255, 255)) 198 | 199 | # landms 200 | cv2.circle(img_raw, (b[5], b[6]), 1, (0, 0, 255), 3) 201 | cv2.circle(img_raw, (b[7], b[8]), 1, (0, 255, 255), 3) 202 | cv2.circle(img_raw, (b[9], b[10]), 1, (255, 0, 255), 3) 203 | cv2.circle(img_raw, (b[11], b[12]), 1, (0, 255, 0), 3) 204 | cv2.circle(img_raw, (b[13], b[14]), 1, (255, 0, 0), 3) 205 | 206 | # pose 207 | utils.draw_axis(img_raw, b[15], b[16], b[17], tdx=(b[0] + b[2]) / 2, tdy=(b[1] + b[3]) / 2, size=abs(b[3]-b[1]) / 2) 208 | 209 | # save image 210 | 211 | name = args.output_path 212 | cv2.imwrite(name, img_raw) 213 | 214 | -------------------------------------------------------------------------------- /layers/__init__.py: -------------------------------------------------------------------------------- 1 | from .functions import * 2 | from .modules import * 3 | -------------------------------------------------------------------------------- /layers/functions/prior_box.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from itertools import product as product 3 | import numpy as np 4 | from math import ceil 5 | 6 | 7 | class PriorBox(object): 8 | def __init__(self, cfg, 
image_size=None, phase='train'): 9 | super(PriorBox, self).__init__() 10 | self.min_sizes = cfg['min_sizes'] 11 | self.steps = cfg['steps'] 12 | self.clip = cfg['clip'] 13 | self.image_size = image_size 14 | self.feature_maps = [[ceil(self.image_size[0]/step), ceil(self.image_size[1]/step)] for step in self.steps] 15 | self.name = "s" 16 | 17 | def forward(self): 18 | anchors = [] 19 | for k, f in enumerate(self.feature_maps): 20 | min_sizes = self.min_sizes[k] 21 | for i, j in product(range(f[0]), range(f[1])): 22 | for min_size in min_sizes: 23 | s_kx = min_size / self.image_size[1] 24 | s_ky = min_size / self.image_size[0] 25 | dense_cx = [x * self.steps[k] / self.image_size[1] for x in [j + 0.5]] 26 | dense_cy = [y * self.steps[k] / self.image_size[0] for y in [i + 0.5]] 27 | for cy, cx in product(dense_cy, dense_cx): 28 | anchors += [cx, cy, s_kx, s_ky] 29 | 30 | # back to torch land 31 | output = torch.Tensor(anchors).view(-1, 4) 32 | if self.clip: 33 | output.clamp_(max=1, min=0) 34 | return output 35 | -------------------------------------------------------------------------------- /layers/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .multibox_loss import MultiBoxLoss 2 | 3 | __all__ = ['MultiBoxLoss'] 4 | -------------------------------------------------------------------------------- /layers/modules/multibox_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | from utils.box_utils import match, log_sum_exp 6 | from data import cfg_mnet 7 | GPU = cfg_mnet['gpu_train'] 8 | 9 | class MultiBoxLoss(nn.Module): 10 | """SSD Weighted Loss Function 11 | Compute Targets: 12 | 1) Produce Confidence Target Indices by matching ground truth boxes 13 | with (default) 'priorboxes' that have jaccard index > threshold parameter 14 | (default threshold: 0.5). 15 | 2) Produce localization target by 'encoding' variance into offsets of ground 16 | truth boxes and their matched 'priorboxes'. 17 | 3) Hard negative mining to filter the excessive number of negative examples 18 | that comes with using a large number of default bounding boxes. 19 | (default negative:positive ratio 3:1) 20 | Objective Loss: 21 | L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N 22 | Where, Lconf is the CrossEntropy Loss and Lloc is the SmoothL1 Loss 23 | weighted by α which is set to 1 by cross val. 24 | Args: 25 | c: class confidences, 26 | l: predicted boxes, 27 | g: ground truth boxes 28 | N: number of matched default boxes 29 | See: https://arxiv.org/pdf/1512.02325.pdf for more details. 30 | """ 31 | 32 | def __init__(self, num_classes, overlap_thresh, prior_for_matching, bkg_label, neg_mining, neg_pos, neg_overlap, encode_target): 33 | super(MultiBoxLoss, self).__init__() 34 | self.num_classes = num_classes 35 | self.threshold = overlap_thresh 36 | self.background_label = bkg_label 37 | self.encode_target = encode_target 38 | self.use_prior_for_matching = prior_for_matching 39 | self.do_neg_mining = neg_mining 40 | self.negpos_ratio = neg_pos 41 | self.neg_overlap = neg_overlap 42 | self.variance = [0.1, 0.2] 43 | 44 | def forward(self, predictions, priors, targets): 45 | """Multibox Loss 46 | Args: 47 | predictions (tuple): A tuple containing loc preds, conf preds, 48 | and prior boxes from SSD net. 
49 | conf shape: torch.size(batch_size,num_priors,num_classes) 50 | loc shape: torch.size(batch_size,num_priors,4) 51 | priors shape: torch.size(num_priors,4) 52 | 53 | ground_truth (tensor): Ground truth boxes and labels for a batch, 54 | shape: [batch_size,num_objs,5] (last idx is the label). 55 | """ 56 | 57 | loc_data, conf_data, landm_data = predictions 58 | priors = priors 59 | num = loc_data.size(0) 60 | num_priors = (priors.size(0)) 61 | 62 | # match priors (default boxes) and ground truth boxes 63 | loc_t = torch.Tensor(num, num_priors, 4) 64 | landm_t = torch.Tensor(num, num_priors, 10) 65 | conf_t = torch.LongTensor(num, num_priors) 66 | for idx in range(num): 67 | truths = targets[idx][:, :4].data 68 | labels = targets[idx][:, -1].data 69 | landms = targets[idx][:, 4:14].data 70 | defaults = priors.data 71 | match(self.threshold, truths, defaults, self.variance, labels, landms, loc_t, conf_t, landm_t, idx) 72 | if GPU: 73 | loc_t = loc_t.cuda() 74 | conf_t = conf_t.cuda() 75 | landm_t = landm_t.cuda() 76 | 77 | zeros = torch.tensor(0).cuda() 78 | # landm Loss (Smooth L1) 79 | # Shape: [batch,num_priors,10] 80 | pos1 = conf_t > zeros 81 | num_pos_landm = pos1.long().sum(1, keepdim=True) 82 | N1 = max(num_pos_landm.data.sum().float(), 1) 83 | pos_idx1 = pos1.unsqueeze(pos1.dim()).expand_as(landm_data) 84 | landm_p = landm_data[pos_idx1].view(-1, 10) 85 | landm_t = landm_t[pos_idx1].view(-1, 10) 86 | loss_landm = F.smooth_l1_loss(landm_p, landm_t, reduction='sum') 87 | 88 | 89 | pos = conf_t != zeros 90 | conf_t[pos] = 1 91 | 92 | # Localization Loss (Smooth L1) 93 | # Shape: [batch,num_priors,4] 94 | pos_idx = pos.unsqueeze(pos.dim()).expand_as(loc_data) 95 | loc_p = loc_data[pos_idx].view(-1, 4) 96 | loc_t = loc_t[pos_idx].view(-1, 4) 97 | loss_l = F.smooth_l1_loss(loc_p, loc_t, reduction='sum') 98 | 99 | # Compute max conf across batch for hard negative mining 100 | batch_conf = conf_data.view(-1, self.num_classes) 101 | loss_c = log_sum_exp(batch_conf) - batch_conf.gather(1, conf_t.view(-1, 1)) 102 | 103 | # Hard Negative Mining 104 | loss_c[pos.view(-1, 1)] = 0 # filter out pos boxes for now 105 | loss_c = loss_c.view(num, -1) 106 | _, loss_idx = loss_c.sort(1, descending=True) 107 | _, idx_rank = loss_idx.sort(1) 108 | num_pos = pos.long().sum(1, keepdim=True) 109 | num_neg = torch.clamp(self.negpos_ratio*num_pos, max=pos.size(1)-1) 110 | neg = idx_rank < num_neg.expand_as(idx_rank) 111 | 112 | # Confidence Loss Including Positive and Negative Examples 113 | pos_idx = pos.unsqueeze(2).expand_as(conf_data) 114 | neg_idx = neg.unsqueeze(2).expand_as(conf_data) 115 | conf_p = conf_data[(pos_idx+neg_idx).gt(0)].view(-1,self.num_classes) 116 | targets_weighted = conf_t[(pos+neg).gt(0)] 117 | loss_c = F.cross_entropy(conf_p, targets_weighted, reduction='sum') 118 | 119 | # Sum of losses: L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N 120 | N = max(num_pos.data.sum().float(), 1) 121 | loss_l /= N 122 | loss_c /= N 123 | loss_landm /= N1 124 | 125 | return loss_l, loss_c, loss_landm 126 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gengyanlei/Pytorch_Retinaface/b863d9ed7c405f3733d038f249ebf3581e3c7ba6/models/__init__.py -------------------------------------------------------------------------------- /models/net.py: -------------------------------------------------------------------------------- 1 | import time 2 | 
import torch 3 | import torch.nn as nn 4 | import torchvision.models._utils as _utils 5 | import torchvision.models as models 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | 9 | def conv_bn(inp, oup, stride = 1, leaky = 0): 10 | return nn.Sequential( 11 | nn.Conv2d(inp, oup, 3, stride, 1, bias=False), 12 | nn.BatchNorm2d(oup), 13 | nn.LeakyReLU(negative_slope=leaky, inplace=True) 14 | ) 15 | 16 | def conv_bn_no_relu(inp, oup, stride): 17 | return nn.Sequential( 18 | nn.Conv2d(inp, oup, 3, stride, 1, bias=False), 19 | nn.BatchNorm2d(oup), 20 | ) 21 | 22 | def conv_bn1X1(inp, oup, stride, leaky=0): 23 | return nn.Sequential( 24 | nn.Conv2d(inp, oup, 1, stride, padding=0, bias=False), 25 | nn.BatchNorm2d(oup), 26 | nn.LeakyReLU(negative_slope=leaky, inplace=True) 27 | ) 28 | 29 | def conv_dw(inp, oup, stride, leaky=0.1): 30 | return nn.Sequential( 31 | nn.Conv2d(inp, inp, 3, stride, 1, groups=inp, bias=False), 32 | nn.BatchNorm2d(inp), 33 | nn.LeakyReLU(negative_slope= leaky,inplace=True), 34 | 35 | nn.Conv2d(inp, oup, 1, 1, 0, bias=False), 36 | nn.BatchNorm2d(oup), 37 | nn.LeakyReLU(negative_slope= leaky,inplace=True), 38 | ) 39 | 40 | class SSH(nn.Module): 41 | def __init__(self, in_channel, out_channel): 42 | super(SSH, self).__init__() 43 | assert out_channel % 4 == 0 44 | leaky = 0 45 | if (out_channel <= 64): 46 | leaky = 0.1 47 | self.conv3X3 = conv_bn_no_relu(in_channel, out_channel//2, stride=1) 48 | 49 | self.conv5X5_1 = conv_bn(in_channel, out_channel//4, stride=1, leaky = leaky) 50 | self.conv5X5_2 = conv_bn_no_relu(out_channel//4, out_channel//4, stride=1) 51 | 52 | self.conv7X7_2 = conv_bn(out_channel//4, out_channel//4, stride=1, leaky = leaky) 53 | self.conv7x7_3 = conv_bn_no_relu(out_channel//4, out_channel//4, stride=1) 54 | 55 | def forward(self, input): 56 | conv3X3 = self.conv3X3(input) 57 | 58 | conv5X5_1 = self.conv5X5_1(input) 59 | conv5X5 = self.conv5X5_2(conv5X5_1) 60 | 61 | conv7X7_2 = self.conv7X7_2(conv5X5_1) 62 | conv7X7 = self.conv7x7_3(conv7X7_2) 63 | 64 | out = torch.cat([conv3X3, conv5X5, conv7X7], dim=1) 65 | out = F.relu(out) 66 | return out 67 | 68 | class FPN(nn.Module): 69 | def __init__(self,in_channels_list,out_channels): 70 | super(FPN,self).__init__() 71 | leaky = 0 72 | if (out_channels <= 64): 73 | leaky = 0.1 74 | self.output1 = conv_bn1X1(in_channels_list[0], out_channels, stride = 1, leaky = leaky) 75 | self.output2 = conv_bn1X1(in_channels_list[1], out_channels, stride = 1, leaky = leaky) 76 | self.output3 = conv_bn1X1(in_channels_list[2], out_channels, stride = 1, leaky = leaky) 77 | 78 | self.merge1 = conv_bn(out_channels, out_channels, leaky = leaky) 79 | self.merge2 = conv_bn(out_channels, out_channels, leaky = leaky) 80 | 81 | def forward(self, input): 82 | # names = list(input.keys()) 83 | input = list(input.values()) 84 | 85 | output1 = self.output1(input[0]) 86 | output2 = self.output2(input[1]) 87 | output3 = self.output3(input[2]) 88 | 89 | up3 = F.interpolate(output3, size=[output2.size(2), output2.size(3)], mode="nearest") 90 | output2 = output2 + up3 91 | output2 = self.merge2(output2) 92 | 93 | up2 = F.interpolate(output2, size=[output1.size(2), output1.size(3)], mode="nearest") 94 | output1 = output1 + up2 95 | output1 = self.merge1(output1) 96 | 97 | out = [output1, output2, output3] 98 | return out 99 | 100 | 101 | 102 | class MobileNetV1(nn.Module): 103 | def __init__(self): 104 | super(MobileNetV1, self).__init__() 105 | self.stage1 = nn.Sequential( 106 | conv_bn(3, 8, 2, leaky = 0.1), # 3 107 
| conv_dw(8, 16, 1), # 7 108 | conv_dw(16, 32, 2), # 11 109 | conv_dw(32, 32, 1), # 19 110 | conv_dw(32, 64, 2), # 27 111 | conv_dw(64, 64, 1), # 43 112 | ) 113 | self.stage2 = nn.Sequential( 114 | conv_dw(64, 128, 2), # 43 + 16 = 59 115 | conv_dw(128, 128, 1), # 59 + 32 = 91 116 | conv_dw(128, 128, 1), # 91 + 32 = 123 117 | conv_dw(128, 128, 1), # 123 + 32 = 155 118 | conv_dw(128, 128, 1), # 155 + 32 = 187 119 | conv_dw(128, 128, 1), # 187 + 32 = 219 120 | ) 121 | self.stage3 = nn.Sequential( 122 | conv_dw(128, 256, 2), # 219 +3 2 = 241 123 | conv_dw(256, 256, 1), # 241 + 64 = 301 124 | ) 125 | self.avg = nn.AdaptiveAvgPool2d((1,1)) 126 | self.fc = nn.Linear(256, 1000) 127 | 128 | def forward(self, x): 129 | x = self.stage1(x) 130 | x = self.stage2(x) 131 | x = self.stage3(x) 132 | x = self.avg(x) 133 | # x = self.model(x) 134 | x = x.view(-1, 256) 135 | x = self.fc(x) 136 | return x 137 | 138 | -------------------------------------------------------------------------------- /models/retinaface.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torchvision.models.detection.backbone_utils as backbone_utils 4 | import torchvision.models._utils as _utils 5 | import torch.nn.functional as F 6 | from models.net import MobileNetV1 as MobileNetV1 7 | from models.net import FPN as FPN 8 | from models.net import SSH as SSH 9 | 10 | class ClassHead(nn.Module): 11 | def __init__(self,inchannels=512,num_anchors=3): 12 | super(ClassHead,self).__init__() 13 | self.num_anchors = num_anchors 14 | # plain 1x1 conv head: NCHW -> NHWC -> N-H*W-C, no BN/ReLU; anchor_num=2 means 2 preset anchors per feature-map cell, so per anchor the class head outputs 2 values, the bbox head 4 and the landmark head 10, consistent with the anchor setup of the paper's reference code; 15 | # but the SSH outputs are not cascaded here; the reference code upsamples the SSH output and adds it ("+"), i.e. a cascade 16 | self.conv1x1 = nn.Conv2d(inchannels,self.num_anchors*2,kernel_size=(1,1),stride=1,padding=0) 17 | 18 | def forward(self,x): 19 | out = self.conv1x1(x) 20 | out = out.permute(0,2,3,1).contiguous() 21 | 22 | return out.view(out.shape[0], -1, 2) 23 | 24 | class BboxHead(nn.Module): 25 | def __init__(self,inchannels=512,num_anchors=3): 26 | super(BboxHead,self).__init__() 27 | self.conv1x1 = nn.Conv2d(inchannels,num_anchors*4,kernel_size=(1,1),stride=1,padding=0) 28 | 29 | def forward(self,x): 30 | out = self.conv1x1(x) 31 | out = out.permute(0,2,3,1).contiguous() 32 | 33 | return out.view(out.shape[0], -1, 4) 34 | 35 | class LandmarkHead(nn.Module): 36 | def __init__(self,inchannels=512,num_anchors=3): 37 | super(LandmarkHead,self).__init__() 38 | self.conv1x1 = nn.Conv2d(inchannels,num_anchors*10,kernel_size=(1,1),stride=1,padding=0) 39 | 40 | def forward(self,x): 41 | out = self.conv1x1(x) 42 | out = out.permute(0,2,3,1).contiguous() 43 | 44 | return out.view(out.shape[0], -1, 10) 45 | 46 | # shared head class for face pose estimation (yaw / pitch / roll), each angle treated as a 66-way classification 47 | class PoseHead(nn.Module): 48 | def __init__(self, inchannels=512, num_anchors=3): 49 | super().__init__() 50 | self.conv1x1 = nn.Conv2d(inchannels,num_anchors*66,kernel_size=(1,1),stride=1,padding=0) 51 | 52 | def forward(self, x): 53 | out = self.conv1x1(x) 54 | out = out.permute(0, 2, 3, 1).contiguous() # call .contiguous() before .view() so the underlying memory is contiguous; a bare reshape here is less safe 55 | 56 | return out.view(out.shape[0], -1, 66) 57 | 58 | class RetinaFace(nn.Module): 59 | def __init__(self, cfg=None, phase='train'): 60 | """ 61 | :param cfg: Network related settings. 62 | :param phase: train or test.
63 | """ 64 | super(RetinaFace,self).__init__() 65 | self.phase = phase 66 | backbone = None 67 | if cfg['name'] == 'mobilenet0.25': 68 | backbone = MobileNetV1() 69 | if cfg['pretrain']: 70 | checkpoint = torch.load("./weights/mobilenetV1X0.25_pretrain.tar", map_location=torch.device('cpu')) 71 | from collections import OrderedDict 72 | new_state_dict = OrderedDict() 73 | for k, v in checkpoint['state_dict'].items(): 74 | name = k[7:] # remove module. 75 | new_state_dict[name] = v 76 | # load params 77 | backbone.load_state_dict(new_state_dict) 78 | elif cfg['name'] == 'Resnet50': 79 | import torchvision.models as models 80 | backbone = models.resnet50(pretrained=cfg['pretrain']) 81 | # 获取中间层输出 82 | self.body = _utils.IntermediateLayerGetter(backbone, cfg['return_layers']) 83 | in_channels_stage2 = cfg['in_channel'] 84 | in_channels_list = [ 85 | in_channels_stage2 * 2, 86 | in_channels_stage2 * 4, 87 | in_channels_stage2 * 8, 88 | ] 89 | out_channels = cfg['out_channel'] 90 | self.fpn = FPN(in_channels_list,out_channels) 91 | 92 | self.ssh1 = SSH(out_channels, out_channels) 93 | self.ssh2 = SSH(out_channels, out_channels) 94 | self.ssh3 = SSH(out_channels, out_channels) 95 | # 参考口罩分支,ssh单独新建 96 | self.ssh1_pose = SSH(out_channels, out_channels) 97 | self.ssh2_pose = SSH(out_channels, out_channels) 98 | self.ssh3_pose = SSH(out_channels, out_channels) 99 | # 对应ssh 的 框-关键点-2分类 100 | self.ClassHead = self._make_class_head(fpn_num=3, inchannels=cfg['out_channel']) 101 | self.BboxHead = self._make_bbox_head(fpn_num=3, inchannels=cfg['out_channel']) 102 | self.LandmarkHead = self._make_landmark_head(fpn_num=3, inchannels=cfg['out_channel']) 103 | # 对应ssh_pose 的 yaw-pitch-roll 的 3个66分类 104 | self.Pose_yaw_Head = self._make_pose_yaw_pitch_roll_head(fpn_num=3, inchannels=cfg['out_channel']) 105 | self.Pose_pitch_Head = self._make_pose_yaw_pitch_roll_head(fpn_num=3, inchannels=cfg['out_channel']) 106 | self.Pose_roll_Head = self._make_pose_yaw_pitch_roll_head(fpn_num=3, inchannels=cfg['out_channel']) 107 | 108 | def _make_class_head(self,fpn_num=3,inchannels=64,anchor_num=2): 109 | classhead = nn.ModuleList() 110 | for i in range(fpn_num): 111 | classhead.append(ClassHead(inchannels,anchor_num)) 112 | return classhead 113 | 114 | def _make_bbox_head(self,fpn_num=3,inchannels=64,anchor_num=2): 115 | bboxhead = nn.ModuleList() 116 | for i in range(fpn_num): 117 | bboxhead.append(BboxHead(inchannels,anchor_num)) 118 | return bboxhead 119 | 120 | def _make_landmark_head(self,fpn_num=3,inchannels=64,anchor_num=2): 121 | landmarkhead = nn.ModuleList() 122 | for i in range(fpn_num): 123 | landmarkhead.append(LandmarkHead(inchannels,anchor_num)) 124 | return landmarkhead 125 | 126 | def _make_pose_yaw_pitch_roll_head(self, fpn_num=3, inchannels=64, anchor_num=2): 127 | pose_head = nn.ModuleList() 128 | for i in range(fpn_num): 129 | pose_head.append(PoseHead(inchannels, anchor_num)) # append添加Module 130 | return pose_head 131 | 132 | 133 | def forward(self,inputs): 134 | out = self.body(inputs) 135 | 136 | # FPN 137 | fpn = self.fpn(out) 138 | 139 | # SSH 140 | feature1 = self.ssh1(fpn[0]) 141 | feature2 = self.ssh2(fpn[1]) 142 | feature3 = self.ssh3(fpn[2]) 143 | features = [feature1, feature2, feature3] 144 | 145 | # SSH Pose 146 | feature1_pose = self.ssh1_pose(fpn[0]) 147 | feature2_pose = self.ssh2_pose(fpn[1]) 148 | feature3_pose = self.ssh3_pose(fpn[2]) 149 | features_pose = [feature1_pose, feature2_pose, feature3_pose] 150 | # 
Detail differences from RetinaFace: no extra pyramid level is newly built on top of P5, only the last 3 backbone levels are used; the 3 SSH outputs are independent, there is no cascade that sums them level by level ("+"), and the FPN in_channels differ; the original retinaface does not use a cascaded SSH either 151 | 152 | bbox_regressions = torch.cat([self.BboxHead[i](feature) for i, feature in enumerate(features)], dim=1) 153 | classifications = torch.cat([self.ClassHead[i](feature) for i, feature in enumerate(features)], dim=1) 154 | ldm_regressions = torch.cat([self.LandmarkHead[i](feature) for i, feature in enumerate(features)], dim=1) 155 | 156 | yaw_preds = torch.cat([self.Pose_yaw_Head[i](feature) for i, feature in enumerate(features_pose)], dim=1) 157 | pitch_preds = torch.cat([self.Pose_pitch_Head[i](feature) for i, feature in enumerate(features_pose)], dim=1) 158 | roll_preds = torch.cat([self.Pose_roll_Head[i](feature) for i, feature in enumerate(features_pose)], dim=1) 159 | 160 | if self.phase == 'train': 161 | # suggestion: return a dict here (and from the data loading code as well) instead of a bare tuple 162 | output = (bbox_regressions, classifications, ldm_regressions, yaw_preds, pitch_preds, roll_preds) 163 | else: 164 | output = (bbox_regressions, F.softmax(classifications, dim=-1), ldm_regressions, yaw_preds, pitch_preds, roll_preds) # at test time yaw/pitch/roll still need a softmax (applied downstream in the demo code) 165 | return output 166 | 167 | -------------------------------------------------------------------------------- /pose/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gengyanlei/Pytorch_Retinaface/b863d9ed7c405f3733d038f249ebf3581e3c7ba6/pose/__init__.py -------------------------------------------------------------------------------- /pose/datasets.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import cv2 4 | import pandas as pd 5 | 6 | import torch 7 | from torch.utils.data.dataset import Dataset 8 | from torchvision import transforms 9 | 10 | from PIL import Image, ImageFilter 11 | 12 | import utils 13 | 14 | def get_list_from_filenames(file_path): 15 | # input: relative path to .txt file with file names 16 | # output: list of relative path names 17 | with open(file_path) as f: 18 | lines = f.read().splitlines() 19 | return lines 20 | 21 | class Synhead(Dataset): 22 | def __init__(self, data_dir, csv_path, transform, test=False): 23 | column_names = ['path', 'bbox_x_min', 'bbox_y_min', 'bbox_x_max', 'bbox_y_max', 'yaw', 'pitch', 'roll'] 24 | tmp_df = pd.read_csv(csv_path, sep=',', names=column_names, index_col=False, encoding="utf-8-sig") 25 | self.data_dir = data_dir 26 | self.transform = transform 27 | self.X_train = tmp_df['path'] 28 | self.y_train = tmp_df[['bbox_x_min', 'bbox_y_min', 'bbox_x_max', 'bbox_y_max', 'yaw', 'pitch', 'roll']] 29 | self.length = len(tmp_df) 30 | self.test = test 31 | 32 | def __getitem__(self, index): 33 | path = os.path.join(self.data_dir, self.X_train.iloc[index]).strip('.jpg') + '.png' 34 | img = Image.open(path) 35 | img = img.convert('RGB') 36 | 37 | x_min, y_min, x_max, y_max, yaw, pitch, roll = self.y_train.iloc[index] 38 | x_min = float(x_min); x_max = float(x_max) 39 | y_min = float(y_min); y_max = float(y_max) 40 | yaw = -float(yaw); pitch = float(pitch); roll = float(roll) 41 | 42 | # k = 0.2 to 0.40 43 | k = np.random.random_sample() * 0.2 + 0.2 44 | x_min -= 0.6 * k * abs(x_max - x_min) 45 | y_min -= 2 * k * abs(y_max - y_min) 46 | x_max += 0.6 * k * abs(x_max - x_min) 47 | y_max += 0.6 * k * abs(y_max - y_min) 48 | 49 | width, height = img.size 50 | # Crop the face 51 | img = img.crop((int(x_min), int(y_min), int(x_max), int(y_max))) 52 | 53 | # Flip?
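(a horizontal flip mirrors the head, so yaw and roll change sign while pitch stays the same)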
54 | rnd = np.random.random_sample() 55 | if rnd < 0.5: 56 | yaw = -yaw 57 | roll = -roll 58 | img = img.transpose(Image.FLIP_LEFT_RIGHT) 59 | 60 | # Blur? 61 | rnd = np.random.random_sample() 62 | if rnd < 0.05: 63 | img = img.filter(ImageFilter.BLUR) 64 | 65 | # Bin values 66 | bins = np.array(range(-99, 102, 3)) 67 | binned_pose = np.digitize([yaw, pitch, roll], bins) - 1 68 | 69 | labels = torch.LongTensor(binned_pose) 70 | cont_labels = torch.FloatTensor([yaw, pitch, roll]) 71 | 72 | if self.transform is not None: 73 | img = self.transform(img) 74 | 75 | return img, labels, cont_labels, self.X_train[index] 76 | 77 | def __len__(self): 78 | return self.length 79 | 80 | class Pose_300W_LP(Dataset): 81 | # Head pose from 300W-LP dataset 82 | def __init__(self, data_dir, filename_path, transform, img_ext='.jpg', annot_ext='.mat', image_mode='RGB'): 83 | self.data_dir = data_dir 84 | self.transform = transform 85 | self.img_ext = img_ext 86 | self.annot_ext = annot_ext 87 | 88 | filename_list = get_list_from_filenames(filename_path) 89 | 90 | self.X_train = filename_list 91 | self.y_train = filename_list 92 | self.image_mode = image_mode 93 | self.length = len(filename_list) 94 | 95 | def __getitem__(self, index): 96 | img = Image.open(os.path.join(self.data_dir, self.X_train[index] + self.img_ext)) 97 | img = img.convert(self.image_mode) 98 | mat_path = os.path.join(self.data_dir, self.y_train[index] + self.annot_ext) 99 | 100 | # Crop the face loosely 101 | pt2d = utils.get_pt2d_from_mat(mat_path) 102 | x_min = min(pt2d[0,:]) 103 | y_min = min(pt2d[1,:]) 104 | x_max = max(pt2d[0,:]) 105 | y_max = max(pt2d[1,:]) 106 | 107 | # k = 0.2 to 0.40 108 | k = np.random.random_sample() * 0.2 + 0.2 109 | x_min -= 0.6 * k * abs(x_max - x_min) 110 | y_min -= 2 * k * abs(y_max - y_min) 111 | x_max += 0.6 * k * abs(x_max - x_min) 112 | y_max += 0.6 * k * abs(y_max - y_min) 113 | img = img.crop((int(x_min), int(y_min), int(x_max), int(y_max))) 114 | 115 | # We get the pose in radians 116 | pose = utils.get_ypr_from_mat(mat_path) 117 | # And convert to degrees. 118 | pitch = pose[0] * 180 / np.pi 119 | yaw = pose[1] * 180 / np.pi 120 | roll = pose[2] * 180 / np.pi 121 | 122 | # Flip? 123 | rnd = np.random.random_sample() 124 | if rnd < 0.5: 125 | yaw = -yaw 126 | roll = -roll 127 | img = img.transpose(Image.FLIP_LEFT_RIGHT) 128 | 129 | # Blur? 
130 | rnd = np.random.random_sample() 131 | if rnd < 0.05: 132 | img = img.filter(ImageFilter.BLUR) 133 | 134 | # Bin values 135 | bins = np.array(range(-99, 102, 3)) # 102左闭右开,所以只到99 136 | binned_pose = np.digitize([yaw, pitch, roll], bins) - 1 # 3度为1类,从-99到+99;只计算0-65共66类 137 | 138 | # Get target tensors 139 | labels = binned_pose 140 | cont_labels = torch.FloatTensor([yaw, pitch, roll]) # 回归 141 | 142 | if self.transform is not None: 143 | img = self.transform(img) 144 | 145 | return img, labels, cont_labels, self.X_train[index] 146 | 147 | def __len__(self): 148 | # 122,450 149 | return self.length 150 | 151 | class Pose_300W_LP_random_ds(Dataset): 152 | # 300W-LP dataset with random downsampling 153 | def __init__(self, data_dir, filename_path, transform, img_ext='.jpg', annot_ext='.mat', image_mode='RGB'): 154 | self.data_dir = data_dir 155 | self.transform = transform 156 | self.img_ext = img_ext 157 | self.annot_ext = annot_ext 158 | 159 | filename_list = get_list_from_filenames(filename_path) 160 | 161 | self.X_train = filename_list 162 | self.y_train = filename_list 163 | self.image_mode = image_mode 164 | self.length = len(filename_list) 165 | 166 | def __getitem__(self, index): 167 | img = Image.open(os.path.join(self.data_dir, self.X_train[index] + self.img_ext)) 168 | img = img.convert(self.image_mode) 169 | mat_path = os.path.join(self.data_dir, self.y_train[index] + self.annot_ext) 170 | 171 | # Crop the face loosely 172 | pt2d = utils.get_pt2d_from_mat(mat_path) 173 | x_min = min(pt2d[0,:]) 174 | y_min = min(pt2d[1,:]) 175 | x_max = max(pt2d[0,:]) 176 | y_max = max(pt2d[1,:]) 177 | 178 | # k = 0.2 to 0.40 179 | k = np.random.random_sample() * 0.2 + 0.2 180 | x_min -= 0.6 * k * abs(x_max - x_min) 181 | y_min -= 2 * k * abs(y_max - y_min) 182 | x_max += 0.6 * k * abs(x_max - x_min) 183 | y_max += 0.6 * k * abs(y_max - y_min) 184 | img = img.crop((int(x_min), int(y_min), int(x_max), int(y_max))) 185 | 186 | # We get the pose in radians 187 | pose = utils.get_ypr_from_mat(mat_path) 188 | pitch = pose[0] * 180 / np.pi 189 | yaw = pose[1] * 180 / np.pi 190 | roll = pose[2] * 180 / np.pi 191 | 192 | ds = 1 + np.random.randint(0,4) * 5 193 | original_size = img.size 194 | img = img.resize((img.size[0] / ds, img.size[1] / ds), resample=Image.NEAREST) 195 | img = img.resize((original_size[0], original_size[1]), resample=Image.NEAREST) 196 | 197 | # Flip? 198 | rnd = np.random.random_sample() 199 | if rnd < 0.5: 200 | yaw = -yaw 201 | roll = -roll 202 | img = img.transpose(Image.FLIP_LEFT_RIGHT) 203 | 204 | # Blur? 
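# Note on the down/up-sampling augmentation a few lines above: the crop is shrunk by a random
# factor ds in {1, 6, 11, 16} with nearest-neighbour resampling and scaled back up, simulating
# low-resolution faces. Under Python 3, img.size[0] / ds is a float, and recent Pillow releases
# reject non-integer sizes, so integer division would likely be needed there, e.g.
# img.resize((img.size[0] // ds, img.size[1] // ds), resample=Image.NEAREST).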
205 | rnd = np.random.random_sample() 206 | if rnd < 0.05: 207 | img = img.filter(ImageFilter.BLUR) 208 | 209 | # Bin values 210 | bins = np.array(range(-99, 102, 3)) 211 | binned_pose = np.digitize([yaw, pitch, roll], bins) - 1 212 | 213 | # Get target tensors 214 | labels = binned_pose 215 | cont_labels = torch.FloatTensor([yaw, pitch, roll]) 216 | 217 | if self.transform is not None: 218 | img = self.transform(img) 219 | 220 | return img, labels, cont_labels, self.X_train[index] 221 | 222 | def __len__(self): 223 | # 122,450 224 | return self.length 225 | 226 | class AFLW2000(Dataset): 227 | def __init__(self, data_dir, filename_path, transform, img_ext='.jpg', annot_ext='.mat', image_mode='RGB'): 228 | self.data_dir = data_dir 229 | self.transform = transform 230 | self.img_ext = img_ext 231 | self.annot_ext = annot_ext 232 | 233 | filename_list = get_list_from_filenames(filename_path) 234 | 235 | self.X_train = filename_list 236 | self.y_train = filename_list 237 | self.image_mode = image_mode 238 | self.length = len(filename_list) 239 | 240 | def __getitem__(self, index): 241 | img = Image.open(os.path.join(self.data_dir, self.X_train[index] + self.img_ext)) 242 | img = img.convert(self.image_mode) 243 | mat_path = os.path.join(self.data_dir, self.y_train[index] + self.annot_ext) 244 | 245 | # Crop the face loosely 246 | pt2d = utils.get_pt2d_from_mat(mat_path) 247 | 248 | x_min = min(pt2d[0,:]) 249 | y_min = min(pt2d[1,:]) 250 | x_max = max(pt2d[0,:]) 251 | y_max = max(pt2d[1,:]) 252 | 253 | k = 0.20 254 | x_min -= 2 * k * abs(x_max - x_min) 255 | y_min -= 2 * k * abs(y_max - y_min) 256 | x_max += 2 * k * abs(x_max - x_min) 257 | y_max += 0.6 * k * abs(y_max - y_min) 258 | img = img.crop((int(x_min), int(y_min), int(x_max), int(y_max))) 259 | 260 | # We get the pose in radians 261 | pose = utils.get_ypr_from_mat(mat_path) 262 | # And convert to degrees. 
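# (Same recipe as the other datasets in this file.) The pose is converted from radians to
# degrees and then quantized into 66 classes of 3 degrees covering [-99, 99):
# np.digitize(angle, bins) - 1 with bins = -99, -96, ..., 99 is equivalent to
# floor((angle + 99) / 3). Worked example: yaw = 10.5 degrees falls into bin 36, because
# floor((10.5 + 99) / 3) = 36, i.e. the interval [9, 12); mapping the bin index back gives
# 36 * 3 - 99 = 9 degrees (the bin's left edge). The test scripts instead recover a continuous
# angle from the bin probabilities via a softmax expectation.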
263 | pitch = pose[0] * 180 / np.pi 264 | yaw = pose[1] * 180 / np.pi 265 | roll = pose[2] * 180 / np.pi 266 | # Bin values 267 | bins = np.array(range(-99, 102, 3)) 268 | labels = torch.LongTensor(np.digitize([yaw, pitch, roll], bins) - 1) 269 | cont_labels = torch.FloatTensor([yaw, pitch, roll]) 270 | 271 | if self.transform is not None: 272 | img = self.transform(img) 273 | 274 | return img, labels, cont_labels, self.X_train[index] 275 | 276 | def __len__(self): 277 | # 2,000 278 | return self.length 279 | 280 | class AFLW2000_ds(Dataset): 281 | # AFLW2000 dataset with fixed downsampling 282 | def __init__(self, data_dir, filename_path, transform, img_ext='.jpg', annot_ext='.mat', image_mode='RGB'): 283 | self.data_dir = data_dir 284 | self.transform = transform 285 | self.img_ext = img_ext 286 | self.annot_ext = annot_ext 287 | 288 | filename_list = get_list_from_filenames(filename_path) 289 | 290 | self.X_train = filename_list 291 | self.y_train = filename_list 292 | self.image_mode = image_mode 293 | self.length = len(filename_list) 294 | 295 | def __getitem__(self, index): 296 | img = Image.open(os.path.join(self.data_dir, self.X_train[index] + self.img_ext)) 297 | img = img.convert(self.image_mode) 298 | mat_path = os.path.join(self.data_dir, self.y_train[index] + self.annot_ext) 299 | 300 | # Crop the face loosely 301 | pt2d = utils.get_pt2d_from_mat(mat_path) 302 | x_min = min(pt2d[0,:]) 303 | y_min = min(pt2d[1,:]) 304 | x_max = max(pt2d[0,:]) 305 | y_max = max(pt2d[1,:]) 306 | 307 | k = 0.20 308 | x_min -= 2 * k * abs(x_max - x_min) 309 | y_min -= 2 * k * abs(y_max - y_min) 310 | x_max += 2 * k * abs(x_max - x_min) 311 | y_max += 0.6 * k * abs(y_max - y_min) 312 | img = img.crop((int(x_min), int(y_min), int(x_max), int(y_max))) 313 | 314 | ds = 3 # downsampling factor 315 | original_size = img.size 316 | img = img.resize((img.size[0] / ds, img.size[1] / ds), resample=Image.NEAREST) 317 | img = img.resize((original_size[0], original_size[1]), resample=Image.NEAREST) 318 | 319 | # We get the pose in radians 320 | pose = utils.get_ypr_from_mat(mat_path) 321 | # And convert to degrees. 
322 | pitch = pose[0] * 180 / np.pi 323 | yaw = pose[1] * 180 / np.pi 324 | roll = pose[2] * 180 / np.pi 325 | # Bin values 326 | bins = np.array(range(-99, 102, 3)) 327 | labels = torch.LongTensor(np.digitize([yaw, pitch, roll], bins) - 1) 328 | cont_labels = torch.FloatTensor([yaw, pitch, roll]) 329 | 330 | if self.transform is not None: 331 | img = self.transform(img) 332 | 333 | return img, labels, cont_labels, self.X_train[index] 334 | 335 | def __len__(self): 336 | # 2,000 337 | return self.length 338 | 339 | class AFLW_aug(Dataset): 340 | # AFLW dataset with flipping 341 | def __init__(self, data_dir, filename_path, transform, img_ext='.jpg', annot_ext='.txt', image_mode='RGB'): 342 | self.data_dir = data_dir 343 | self.transform = transform 344 | self.img_ext = img_ext 345 | self.annot_ext = annot_ext 346 | 347 | filename_list = get_list_from_filenames(filename_path) 348 | 349 | self.X_train = filename_list 350 | self.y_train = filename_list 351 | self.image_mode = image_mode 352 | self.length = len(filename_list) 353 | 354 | def __getitem__(self, index): 355 | img = Image.open(os.path.join(self.data_dir, self.X_train[index] + self.img_ext)) 356 | img = img.convert(self.image_mode) 357 | txt_path = os.path.join(self.data_dir, self.y_train[index] + self.annot_ext) 358 | 359 | # We get the pose in radians 360 | annot = open(txt_path, 'r') 361 | line = annot.readline().split(' ') 362 | pose = [float(line[1]), float(line[2]), float(line[3])] 363 | # And convert to degrees. 364 | yaw = pose[0] * 180 / np.pi 365 | pitch = pose[1] * 180 / np.pi 366 | roll = pose[2] * 180 / np.pi 367 | # Fix the roll in AFLW 368 | roll *= -1 369 | 370 | # Augment 371 | # Flip? 372 | rnd = np.random.random_sample() 373 | if rnd < 0.5: 374 | yaw = -yaw 375 | roll = -roll 376 | img = img.transpose(Image.FLIP_LEFT_RIGHT) 377 | 378 | # Bin values 379 | bins = np.array(range(-99, 102, 3)) 380 | labels = torch.LongTensor(np.digitize([yaw, pitch, roll], bins) - 1) 381 | cont_labels = torch.FloatTensor([yaw, pitch, roll]) 382 | 383 | if self.transform is not None: 384 | img = self.transform(img) 385 | 386 | return img, labels, cont_labels, self.X_train[index] 387 | 388 | def __len__(self): 389 | # train: 18,863 390 | # test: 1,966 391 | return self.length 392 | 393 | class AFLW(Dataset): 394 | def __init__(self, data_dir, filename_path, transform, img_ext='.jpg', annot_ext='.txt', image_mode='RGB'): 395 | self.data_dir = data_dir 396 | self.transform = transform 397 | self.img_ext = img_ext 398 | self.annot_ext = annot_ext 399 | 400 | filename_list = get_list_from_filenames(filename_path) 401 | 402 | self.X_train = filename_list 403 | self.y_train = filename_list 404 | self.image_mode = image_mode 405 | self.length = len(filename_list) 406 | 407 | def __getitem__(self, index): 408 | img = Image.open(os.path.join(self.data_dir, self.X_train[index] + self.img_ext)) 409 | img = img.convert(self.image_mode) 410 | txt_path = os.path.join(self.data_dir, self.y_train[index] + self.annot_ext) 411 | 412 | # We get the pose in radians 413 | annot = open(txt_path, 'r') 414 | line = annot.readline().split(' ') 415 | pose = [float(line[1]), float(line[2]), float(line[3])] 416 | # And convert to degrees. 
417 | yaw = pose[0] * 180 / np.pi 418 | pitch = pose[1] * 180 / np.pi 419 | roll = pose[2] * 180 / np.pi 420 | # Fix the roll in AFLW 421 | roll *= -1 422 | # Bin values 423 | bins = np.array(range(-99, 102, 3)) 424 | labels = torch.LongTensor(np.digitize([yaw, pitch, roll], bins) - 1) 425 | cont_labels = torch.FloatTensor([yaw, pitch, roll]) 426 | 427 | if self.transform is not None: 428 | img = self.transform(img) 429 | 430 | return img, labels, cont_labels, self.X_train[index] 431 | 432 | def __len__(self): 433 | # train: 18,863 434 | # test: 1,966 435 | return self.length 436 | 437 | class AFW(Dataset): 438 | def __init__(self, data_dir, filename_path, transform, img_ext='.jpg', annot_ext='.txt', image_mode='RGB'): 439 | self.data_dir = data_dir 440 | self.transform = transform 441 | self.img_ext = img_ext 442 | self.annot_ext = annot_ext 443 | 444 | filename_list = get_list_from_filenames(filename_path) 445 | 446 | self.X_train = filename_list 447 | self.y_train = filename_list 448 | self.image_mode = image_mode 449 | self.length = len(filename_list) 450 | 451 | def __getitem__(self, index): 452 | txt_path = os.path.join(self.data_dir, self.y_train[index] + self.annot_ext) 453 | img_name = self.X_train[index].split('_')[0] 454 | 455 | img = Image.open(os.path.join(self.data_dir, img_name + self.img_ext)) 456 | img = img.convert(self.image_mode) 457 | txt_path = os.path.join(self.data_dir, self.y_train[index] + self.annot_ext) 458 | 459 | # We get the pose in degrees 460 | annot = open(txt_path, 'r') 461 | line = annot.readline().split(' ') 462 | yaw, pitch, roll = [float(line[1]), float(line[2]), float(line[3])] 463 | 464 | # Crop the face loosely 465 | k = 0.32 466 | x1 = float(line[4]) 467 | y1 = float(line[5]) 468 | x2 = float(line[6]) 469 | y2 = float(line[7]) 470 | x1 -= 0.8 * k * abs(x2 - x1) 471 | y1 -= 2 * k * abs(y2 - y1) 472 | x2 += 0.8 * k * abs(x2 - x1) 473 | y2 += 1 * k * abs(y2 - y1) 474 | 475 | img = img.crop((int(x1), int(y1), int(x2), int(y2))) 476 | 477 | # Bin values 478 | bins = np.array(range(-99, 102, 3)) 479 | labels = torch.LongTensor(np.digitize([yaw, pitch, roll], bins) - 1) 480 | cont_labels = torch.FloatTensor([yaw, pitch, roll]) 481 | 482 | if self.transform is not None: 483 | img = self.transform(img) 484 | 485 | return img, labels, cont_labels, self.X_train[index] 486 | 487 | def __len__(self): 488 | # Around 200 489 | return self.length 490 | 491 | class BIWI(Dataset): 492 | def __init__(self, data_dir, filename_path, transform, img_ext='.png', annot_ext='.txt', image_mode='RGB'): 493 | self.data_dir = data_dir 494 | self.transform = transform 495 | self.img_ext = img_ext 496 | self.annot_ext = annot_ext 497 | 498 | filename_list = get_list_from_filenames(filename_path) 499 | 500 | self.X_train = filename_list 501 | self.y_train = filename_list 502 | self.image_mode = image_mode 503 | self.length = len(filename_list) 504 | 505 | def __getitem__(self, index): 506 | img = Image.open(os.path.join(self.data_dir, self.X_train[index] + '_rgb' + self.img_ext)) 507 | img = img.convert(self.image_mode) 508 | pose_path = os.path.join(self.data_dir, self.y_train[index] + '_pose' + self.annot_ext) 509 | 510 | y_train_list = self.y_train[index].split('/') 511 | bbox_path = os.path.join(self.data_dir, y_train_list[0] + '/dockerface-' + y_train_list[-1] + '_rgb' + self.annot_ext) 512 | 513 | # Load bounding box 514 | bbox = open(bbox_path, 'r') 515 | line = bbox.readline().split(' ') 516 | if len(line) < 4: 517 | x_min, y_min, x_max, y_max = 0, 0, img.size[0], 
img.size[1] 518 | else: 519 | x_min, y_min, x_max, y_max = [float(line[1]), float(line[2]), float(line[3]), float(line[4])] 520 | bbox.close() 521 | 522 | # Load pose in degrees 523 | pose_annot = open(pose_path, 'r') 524 | R = [] 525 | for line in pose_annot: 526 | line = line.strip('\n').split(' ') 527 | l = [] 528 | if line[0] != '': 529 | for nb in line: 530 | if nb == '': 531 | continue 532 | l.append(float(nb)) 533 | R.append(l) 534 | 535 | R = np.array(R) 536 | T = R[3,:] 537 | R = R[:3,:] 538 | pose_annot.close() 539 | 540 | R = np.transpose(R) 541 | 542 | roll = -np.arctan2(R[1][0], R[0][0]) * 180 / np.pi 543 | yaw = -np.arctan2(-R[2][0], np.sqrt(R[2][1] ** 2 + R[2][2] ** 2)) * 180 / np.pi 544 | pitch = np.arctan2(R[2][1], R[2][2]) * 180 / np.pi 545 | 546 | # Loosely crop face 547 | k = 0.35 548 | x_min -= 0.6 * k * abs(x_max - x_min) 549 | y_min -= k * abs(y_max - y_min) 550 | x_max += 0.6 * k * abs(x_max - x_min) 551 | y_max += 0.6 * k * abs(y_max - y_min) 552 | img = img.crop((int(x_min), int(y_min), int(x_max), int(y_max))) 553 | 554 | # Bin values 555 | bins = np.array(range(-99, 102, 3)) 556 | binned_pose = np.digitize([yaw, pitch, roll], bins) - 1 557 | 558 | labels = torch.LongTensor(binned_pose) 559 | cont_labels = torch.FloatTensor([yaw, pitch, roll]) 560 | 561 | if self.transform is not None: 562 | img = self.transform(img) 563 | 564 | return img, labels, cont_labels, self.X_train[index] 565 | 566 | def __len__(self): 567 | # 15,667 568 | return self.length 569 | -------------------------------------------------------------------------------- /pose/detect_image.py: -------------------------------------------------------------------------------- 1 | import sys, os, argparse 2 | 3 | import numpy as np 4 | import cv2 5 | import matplotlib.pyplot as plt 6 | 7 | import torch 8 | import torch.nn as nn 9 | from torch.autograd import Variable 10 | from torch.utils.data import DataLoader 11 | from torchvision import transforms 12 | import torch.backends.cudnn as cudnn 13 | import torchvision 14 | import torch.nn.functional as F 15 | from PIL import Image 16 | 17 | import datasets, hopenet, utils 18 | 19 | from skimage import io 20 | import dlib 21 | 22 | def parse_args(): 23 | """Parse input arguments.""" 24 | parser = argparse.ArgumentParser(description='Head pose estimation using the Hopenet network.') 25 | parser.add_argument('--snapshot', dest='snapshot', help='Path of model snapshot.', 26 | default='hopenet_robust_alpha1.pkl', type=str) 27 | parser.add_argument('--face_model', dest='face_model', help='Path of DLIB face detection model.', 28 | default='mmod_human_face_detector.dat', type=str) 29 | parser.add_argument('--image', dest='image_path', help='Path of image') 30 | # parser.add_argument('--output_string', dest='output_string', help='String appended to output file') 31 | args = parser.parse_args() 32 | return args 33 | 34 | if __name__ == '__main__': 35 | args = parse_args() 36 | 37 | # ResNet50 structure 38 | model = hopenet.Hopenet(torchvision.models.resnet.Bottleneck, [3, 4, 6, 3], 66) 39 | 40 | # Pretrained model 41 | saved_state_dict = torch.load(args.snapshot) 42 | model.load_state_dict(saved_state_dict) 43 | model = model.cuda() 44 | 45 | print('hopenet create success') 46 | 47 | # Dlib face detection model 48 | cnn_face_detector = dlib.cnn_face_detection_model_v1(args.face_model) 49 | 50 | print('dlib face detector create success') 51 | 52 | transformations = transforms.Compose([transforms.Scale(224), 53 | transforms.CenterCrop(224), transforms.ToTensor(), 54 
| transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]) 55 | 56 | model.eval() 57 | 58 | idx_tensor = [idx for idx in range(66)] 59 | idx_tensor = torch.FloatTensor(idx_tensor).cuda() 60 | 61 | image = cv2.imread(args.image_path) 62 | #image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) 63 | 64 | # Detect faces 65 | dets = cnn_face_detector(image, 1) 66 | 67 | for idx, det in enumerate(dets): 68 | # Get x_min, y_min, x_max, y_max, conf 69 | x_min = det.rect.left() 70 | y_min = det.rect.top() 71 | x_max = det.rect.right() 72 | y_max = det.rect.bottom() 73 | conf = det.confidence 74 | 75 | if conf > 1.0: 76 | bbox_width = abs(x_max - x_min) 77 | bbox_height = abs(y_max - y_min) 78 | x_min -= 2 * bbox_width / 4 79 | x_max += 2 * bbox_width / 4 80 | y_min -= 3 * bbox_height / 4 81 | y_max += bbox_height / 4 82 | x_min = max(x_min, 0); y_min = max(y_min, 0) 83 | x_max = min(image.shape[1], x_max) 84 | y_max = min(image.shape[0], y_max) 85 | 86 | # To int 87 | x_min, x_max, y_min, y_max = int(x_min), int(x_max), int(y_min), int(y_max) 88 | 89 | # Crop image 90 | img = image[y_min:y_max,x_min:x_max] 91 | img = Image.fromarray(img) 92 | 93 | # Transform 94 | img = transformations(img) 95 | img_shape = img.size() 96 | img = img.view(1, img_shape[0], img_shape[1], img_shape[2]) 97 | img = img.cuda() 98 | 99 | yaw, pitch, roll = model(img) 100 | 101 | yaw_predicted = F.softmax(yaw) 102 | pitch_predicted = F.softmax(pitch) 103 | roll_predicted = F.softmax(roll) 104 | # Get continuous predictions in degrees. 105 | yaw_predicted = torch.sum(yaw_predicted.data[0] * idx_tensor) * 3 - 99 106 | pitch_predicted = torch.sum(pitch_predicted.data[0] * idx_tensor) * 3 - 99 107 | roll_predicted = torch.sum(roll_predicted.data[0] * idx_tensor) * 3 - 99 108 | 109 | print('roll:', roll_predicted.item()) 110 | print('yaw:', yaw_predicted.item()) 111 | print('pitch:', pitch_predicted.item()) 112 | 113 | utils.draw_axis(image, yaw_predicted, pitch_predicted, roll_predicted, tdx = (x_min + x_max) / 2, tdy= (y_min + y_max) / 2, size = bbox_height/2) 114 | # Plot expanded bounding box 115 | # cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0,255,0), 1) 116 | 117 | cv2.imshow('res',image) 118 | cv2.waitKey() 119 | 120 | 121 | -------------------------------------------------------------------------------- /pose/hopenet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from torch.autograd import Variable 4 | import math 5 | import torch.nn.functional as F 6 | 7 | class Hopenet(nn.Module): 8 | # Hopenet with 3 output layers for yaw, pitch and roll 9 | # Predicts Euler angles by binning and regression with the expected value 10 | def __init__(self, block, layers, num_bins): 11 | self.inplanes = 64 12 | super(Hopenet, self).__init__() 13 | self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, 14 | bias=False) 15 | self.bn1 = nn.BatchNorm2d(64) 16 | self.relu = nn.ReLU(inplace=True) 17 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 18 | self.layer1 = self._make_layer(block, 64, layers[0]) 19 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2) 20 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2) 21 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2) 22 | self.avgpool = nn.AvgPool2d(7) 23 | self.fc_yaw = nn.Linear(512 * block.expansion, num_bins) 24 | self.fc_pitch = nn.Linear(512 * block.expansion, num_bins) 25 | self.fc_roll = nn.Linear(512 * 
block.expansion, num_bins) 26 | 27 | # Vestigial layer from previous experiments 28 | self.fc_finetune = nn.Linear(512 * block.expansion + 3, 3) 29 | 30 | for m in self.modules(): 31 | if isinstance(m, nn.Conv2d): 32 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 33 | m.weight.data.normal_(0, math.sqrt(2. / n)) 34 | elif isinstance(m, nn.BatchNorm2d): 35 | m.weight.data.fill_(1) 36 | m.bias.data.zero_() 37 | 38 | def _make_layer(self, block, planes, blocks, stride=1): 39 | downsample = None 40 | if stride != 1 or self.inplanes != planes * block.expansion: 41 | downsample = nn.Sequential( 42 | nn.Conv2d(self.inplanes, planes * block.expansion, 43 | kernel_size=1, stride=stride, bias=False), 44 | nn.BatchNorm2d(planes * block.expansion), 45 | ) 46 | 47 | layers = [] 48 | layers.append(block(self.inplanes, planes, stride, downsample)) 49 | self.inplanes = planes * block.expansion 50 | for i in range(1, blocks): 51 | layers.append(block(self.inplanes, planes)) 52 | 53 | return nn.Sequential(*layers) 54 | 55 | def forward(self, x): 56 | x = self.conv1(x) 57 | x = self.bn1(x) 58 | x = self.relu(x) 59 | x = self.maxpool(x) 60 | 61 | x = self.layer1(x) 62 | x = self.layer2(x) 63 | x = self.layer3(x) 64 | x = self.layer4(x) 65 | 66 | x = self.avgpool(x) 67 | x = x.view(x.size(0), -1) 68 | pre_yaw = self.fc_yaw(x) 69 | pre_pitch = self.fc_pitch(x) 70 | pre_roll = self.fc_roll(x) 71 | 72 | return pre_yaw, pre_pitch, pre_roll 73 | 74 | class ResNet(nn.Module): 75 | # ResNet for regression of 3 Euler angles. 76 | def __init__(self, block, layers, num_classes=1000): 77 | self.inplanes = 64 78 | super(ResNet, self).__init__() 79 | self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, 80 | bias=False) 81 | self.bn1 = nn.BatchNorm2d(64) 82 | self.relu = nn.ReLU(inplace=True) 83 | self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 84 | self.layer1 = self._make_layer(block, 64, layers[0]) 85 | self.layer2 = self._make_layer(block, 128, layers[1], stride=2) 86 | self.layer3 = self._make_layer(block, 256, layers[2], stride=2) 87 | self.layer4 = self._make_layer(block, 512, layers[3], stride=2) 88 | self.avgpool = nn.AvgPool2d(7) 89 | self.fc_angles = nn.Linear(512 * block.expansion, num_classes) 90 | 91 | for m in self.modules(): 92 | if isinstance(m, nn.Conv2d): 93 | n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels 94 | m.weight.data.normal_(0, math.sqrt(2. 
/ n)) 95 | elif isinstance(m, nn.BatchNorm2d): 96 | m.weight.data.fill_(1) 97 | m.bias.data.zero_() 98 | 99 | def _make_layer(self, block, planes, blocks, stride=1): 100 | downsample = None 101 | if stride != 1 or self.inplanes != planes * block.expansion: 102 | downsample = nn.Sequential( 103 | nn.Conv2d(self.inplanes, planes * block.expansion, 104 | kernel_size=1, stride=stride, bias=False), 105 | nn.BatchNorm2d(planes * block.expansion), 106 | ) 107 | 108 | layers = [] 109 | layers.append(block(self.inplanes, planes, stride, downsample)) 110 | self.inplanes = planes * block.expansion 111 | for i in range(1, blocks): 112 | layers.append(block(self.inplanes, planes)) 113 | 114 | return nn.Sequential(*layers) 115 | 116 | def forward(self, x): 117 | x = self.conv1(x) 118 | x = self.bn1(x) 119 | x = self.relu(x) 120 | x = self.maxpool(x) 121 | 122 | x = self.layer1(x) 123 | x = self.layer2(x) 124 | x = self.layer3(x) 125 | x = self.layer4(x) 126 | 127 | x = self.avgpool(x) 128 | x = x.view(x.size(0), -1) 129 | x = self.fc_angles(x) 130 | return x 131 | 132 | class AlexNet(nn.Module): 133 | # AlexNet laid out as a Hopenet - classify Euler angles in bins and 134 | # regress the expected value. 135 | def __init__(self, num_bins): 136 | super(AlexNet, self).__init__() 137 | self.features = nn.Sequential( 138 | nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2), 139 | nn.ReLU(inplace=True), 140 | nn.MaxPool2d(kernel_size=3, stride=2), 141 | nn.Conv2d(64, 192, kernel_size=5, padding=2), 142 | nn.ReLU(inplace=True), 143 | nn.MaxPool2d(kernel_size=3, stride=2), 144 | nn.Conv2d(192, 384, kernel_size=3, padding=1), 145 | nn.ReLU(inplace=True), 146 | nn.Conv2d(384, 256, kernel_size=3, padding=1), 147 | nn.ReLU(inplace=True), 148 | nn.Conv2d(256, 256, kernel_size=3, padding=1), 149 | nn.ReLU(inplace=True), 150 | nn.MaxPool2d(kernel_size=3, stride=2), 151 | ) 152 | self.classifier = nn.Sequential( 153 | nn.Dropout(), 154 | nn.Linear(256 * 6 * 6, 4096), 155 | nn.ReLU(inplace=True), 156 | nn.Dropout(), 157 | nn.Linear(4096, 4096), 158 | nn.ReLU(inplace=True), 159 | ) 160 | self.fc_yaw = nn.Linear(4096, num_bins) 161 | self.fc_pitch = nn.Linear(4096, num_bins) 162 | self.fc_roll = nn.Linear(4096, num_bins) 163 | 164 | def forward(self, x): 165 | x = self.features(x) 166 | x = x.view(x.size(0), 256 * 6 * 6) 167 | x = self.classifier(x) 168 | yaw = self.fc_yaw(x) 169 | pitch = self.fc_pitch(x) 170 | roll = self.fc_roll(x) 171 | return yaw, pitch, roll 172 | -------------------------------------------------------------------------------- /pose/test_alexnet.py: -------------------------------------------------------------------------------- 1 | import sys, os, argparse 2 | 3 | import numpy as np 4 | import cv2 5 | import matplotlib.pyplot as plt 6 | 7 | import torch 8 | import torch.nn as nn 9 | from torch.autograd import Variable 10 | from torch.utils.data import DataLoader 11 | from torchvision import transforms 12 | import torch.backends.cudnn as cudnn 13 | import torchvision 14 | import torch.nn.functional as F 15 | 16 | import datasets, hopenet, utils 17 | 18 | def parse_args(): 19 | """Parse input arguments.""" 20 | parser = argparse.ArgumentParser(description='Head pose estimation using the Hopenet network.') 21 | parser.add_argument('--gpu', dest='gpu_id', help='GPU device id to use [0]', 22 | default=0, type=int) 23 | parser.add_argument('--data_dir', dest='data_dir', help='Directory path for data.', 24 | default='', type=str) 25 | parser.add_argument('--filename_list', dest='filename_list', 
help='Path to text file containing relative paths for every example.', 26 | default='', type=str) 27 | parser.add_argument('--snapshot', dest='snapshot', help='Name of model snapshot.', 28 | default='', type=str) 29 | parser.add_argument('--batch_size', dest='batch_size', help='Batch size.', 30 | default=1, type=int) 31 | parser.add_argument('--save_viz', dest='save_viz', help='Save images with pose cube.', 32 | default=False, type=bool) 33 | parser.add_argument('--dataset', dest='dataset', help='Dataset type.', default='AFLW2000', type=str) 34 | 35 | args = parser.parse_args() 36 | 37 | return args 38 | 39 | if __name__ == '__main__': 40 | args = parse_args() 41 | 42 | cudnn.enabled = True 43 | gpu = args.gpu_id 44 | snapshot_path = args.snapshot 45 | 46 | model = hopenet.AlexNet(66) 47 | 48 | print 'Loading snapshot.' 49 | # Load snapshot 50 | saved_state_dict = torch.load(snapshot_path) 51 | model.load_state_dict(saved_state_dict) 52 | 53 | print 'Loading data.' 54 | 55 | transformations = transforms.Compose([transforms.Scale(224), 56 | transforms.CenterCrop(224), transforms.ToTensor(), 57 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]) 58 | 59 | if args.dataset == 'Pose_300W_LP': 60 | pose_dataset = datasets.Pose_300W_LP(args.data_dir, args.filename_list, transformations) 61 | elif args.dataset == 'Pose_300W_LP_random_ds': 62 | pose_dataset = datasets.Pose_300W_LP_random_ds(args.data_dir, args.filename_list, transformations) 63 | elif args.dataset == 'AFLW2000': 64 | pose_dataset = datasets.AFLW2000(args.data_dir, args.filename_list, transformations) 65 | elif args.dataset == 'AFLW2000_ds': 66 | pose_dataset = datasets.AFLW2000_ds(args.data_dir, args.filename_list, transformations) 67 | elif args.dataset == 'BIWI': 68 | pose_dataset = datasets.BIWI(args.data_dir, args.filename_list, transformations) 69 | elif args.dataset == 'AFLW': 70 | pose_dataset = datasets.AFLW(args.data_dir, args.filename_list, transformations) 71 | elif args.dataset == 'AFLW_aug': 72 | pose_dataset = datasets.AFLW_aug(args.data_dir, args.filename_list, transformations) 73 | elif args.dataset == 'AFW': 74 | pose_dataset = datasets.AFW(args.data_dir, args.filename_list, transformations) 75 | else: 76 | print 'Error: not a valid dataset name' 77 | sys.exit() 78 | test_loader = torch.utils.data.DataLoader(dataset=pose_dataset, 79 | batch_size=args.batch_size, 80 | num_workers=2) 81 | 82 | model.cuda(gpu) 83 | 84 | print 'Ready to test network.' 85 | 86 | # Test the Model 87 | model.eval() # Change model to 'eval' mode (BN uses moving mean/var). 
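# The evaluation below converts the 66-way classification into a continuous angle by taking the
# expectation over bin indices: degrees = sum_i p_i * i * 3 - 99, where p_i is the softmax
# probability of bin i. A small self-contained sketch of that computation (names are
# illustrative, not part of this file):
#
#   import torch
#   import torch.nn.functional as F
#
#   def logits_to_degrees(logits):                      # logits: (batch, 66)
#       probs = F.softmax(logits, dim=1)
#       idx = torch.arange(66, dtype=torch.float32, device=logits.device)
#       return torch.sum(probs * idx, dim=1) * 3 - 99   # (batch,) angles in degrees
#
# For example, probability 0.5 on bin 33 and 0.5 on bin 34 decodes to 33.5 * 3 - 99 = 1.5 degrees.
# (Note: this script is written in Python 2 style -- bare print statements and xrange below.)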
88 | total = 0 89 | 90 | idx_tensor = [idx for idx in xrange(66)] 91 | idx_tensor = torch.FloatTensor(idx_tensor).cuda(gpu) 92 | 93 | yaw_error = .0 94 | pitch_error = .0 95 | roll_error = .0 96 | 97 | l1loss = torch.nn.L1Loss(size_average=False) 98 | 99 | for i, (images, labels, cont_labels, name) in enumerate(test_loader): 100 | images = Variable(images).cuda(gpu) 101 | total += cont_labels.size(0) 102 | label_yaw = cont_labels[:,0].float() 103 | label_pitch = cont_labels[:,1].float() 104 | label_roll = cont_labels[:,2].float() 105 | 106 | yaw, pitch, roll = model(images) 107 | 108 | # Binned predictions 109 | _, yaw_bpred = torch.max(yaw.data, 1) 110 | _, pitch_bpred = torch.max(pitch.data, 1) 111 | _, roll_bpred = torch.max(roll.data, 1) 112 | 113 | # Continuous predictions 114 | yaw_predicted = utils.softmax_temperature(yaw.data, 1) 115 | pitch_predicted = utils.softmax_temperature(pitch.data, 1) 116 | roll_predicted = utils.softmax_temperature(roll.data, 1) 117 | 118 | yaw_predicted = torch.sum(yaw_predicted * idx_tensor, 1).cpu() * 3 - 99 119 | pitch_predicted = torch.sum(pitch_predicted * idx_tensor, 1).cpu() * 3 - 99 120 | roll_predicted = torch.sum(roll_predicted * idx_tensor, 1).cpu() * 3 - 99 121 | 122 | # Mean absolute error 123 | yaw_error += torch.sum(torch.abs(yaw_predicted - label_yaw)) 124 | pitch_error += torch.sum(torch.abs(pitch_predicted - label_pitch)) 125 | roll_error += torch.sum(torch.abs(roll_predicted - label_roll)) 126 | 127 | # Save first image in batch with pose cube or axis. 128 | if args.save_viz: 129 | name = name[0] 130 | if args.dataset == 'BIWI': 131 | cv2_img = cv2.imread(os.path.join(args.data_dir, name + '_rgb.png')) 132 | else: 133 | cv2_img = cv2.imread(os.path.join(args.data_dir, name + '.jpg')) 134 | if args.batch_size == 1: 135 | error_string = 'y %.2f, p %.2f, r %.2f' % (torch.sum(torch.abs(yaw_predicted - label_yaw)), torch.sum(torch.abs(pitch_predicted - label_pitch)), torch.sum(torch.abs(roll_predicted - label_roll))) 136 | cv2.putText(cv2_img, error_string, (30, cv2_img.shape[0]- 30), fontFace=1, fontScale=1, color=(0,0,255), thickness=1) 137 | # utils.plot_pose_cube(cv2_img, yaw_predicted[0], pitch_predicted[0], roll_predicted[0], size=100) 138 | utils.draw_axis(cv2_img, yaw_predicted[0], pitch_predicted[0], roll_predicted[0], tdx = 200, tdy= 200, size=100) 139 | cv2.imwrite(os.path.join('output/images', name + '.jpg'), cv2_img) 140 | 141 | print('Test error in degrees of the model on the ' + str(total) + 142 | ' test images. 
Yaw: %.4f, Pitch: %.4f, Roll: %.4f' % (yaw_error / total, 143 | pitch_error / total, roll_error / total)) 144 | -------------------------------------------------------------------------------- /pose/test_hopenet.py: -------------------------------------------------------------------------------- 1 | import sys, os, argparse 2 | 3 | import numpy as np 4 | import cv2 5 | import matplotlib.pyplot as plt 6 | 7 | import torch 8 | import torch.nn as nn 9 | from torch.autograd import Variable 10 | from torch.utils.data import DataLoader 11 | from torchvision import transforms 12 | import torch.backends.cudnn as cudnn 13 | import torchvision 14 | import torch.nn.functional as F 15 | 16 | import datasets, hopenet, utils 17 | 18 | def parse_args(): 19 | """Parse input arguments.""" 20 | parser = argparse.ArgumentParser(description='Head pose estimation using the Hopenet network.') 21 | parser.add_argument('--gpu', dest='gpu_id', help='GPU device id to use [0]', 22 | default=0, type=int) 23 | parser.add_argument('--data_dir', dest='data_dir', help='Directory path for data.', 24 | default='', type=str) 25 | parser.add_argument('--filename_list', dest='filename_list', help='Path to text file containing relative paths for every example.', 26 | default='', type=str) 27 | parser.add_argument('--snapshot', dest='snapshot', help='Name of model snapshot.', 28 | default='', type=str) 29 | parser.add_argument('--batch_size', dest='batch_size', help='Batch size.', 30 | default=1, type=int) 31 | parser.add_argument('--save_viz', dest='save_viz', help='Save images with pose cube.', 32 | default=False, type=bool) 33 | parser.add_argument('--dataset', dest='dataset', help='Dataset type.', default='AFLW2000', type=str) 34 | 35 | args = parser.parse_args() 36 | 37 | return args 38 | 39 | if __name__ == '__main__': 40 | args = parse_args() 41 | 42 | cudnn.enabled = True 43 | gpu = args.gpu_id 44 | snapshot_path = args.snapshot 45 | 46 | # ResNet50 structure 47 | model = hopenet.Hopenet(torchvision.models.resnet.Bottleneck, [3, 4, 6, 3], 66) 48 | 49 | print('Loading snapshot.') 50 | # Load snapshot 51 | saved_state_dict = torch.load(snapshot_path) 52 | model.load_state_dict(saved_state_dict) 53 | 54 | print('Loading data.') 55 | 56 | transformations = transforms.Compose([transforms.Scale(224), 57 | transforms.CenterCrop(224), transforms.ToTensor(), 58 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]) # rgb模式 59 | 60 | if args.dataset == 'Pose_300W_LP': 61 | pose_dataset = datasets.Pose_300W_LP(args.data_dir, args.filename_list, transformations) 62 | elif args.dataset == 'Pose_300W_LP_random_ds': 63 | pose_dataset = datasets.Pose_300W_LP_random_ds(args.data_dir, args.filename_list, transformations) 64 | elif args.dataset == 'AFLW2000': 65 | pose_dataset = datasets.AFLW2000(args.data_dir, args.filename_list, transformations) 66 | elif args.dataset == 'AFLW2000_ds': 67 | pose_dataset = datasets.AFLW2000_ds(args.data_dir, args.filename_list, transformations) 68 | elif args.dataset == 'BIWI': 69 | pose_dataset = datasets.BIWI(args.data_dir, args.filename_list, transformations) 70 | elif args.dataset == 'AFLW': 71 | pose_dataset = datasets.AFLW(args.data_dir, args.filename_list, transformations) 72 | elif args.dataset == 'AFLW_aug': 73 | pose_dataset = datasets.AFLW_aug(args.data_dir, args.filename_list, transformations) 74 | elif args.dataset == 'AFW': 75 | pose_dataset = datasets.AFW(args.data_dir, args.filename_list, transformations) 76 | else: 77 | print('Error: not a valid dataset name') 78 
| sys.exit() 79 | test_loader = torch.utils.data.DataLoader(dataset=pose_dataset, 80 | batch_size=args.batch_size, 81 | num_workers=2) 82 | 83 | model.cuda(gpu) 84 | 85 | print('Ready to test network.') 86 | 87 | # Test the Model 88 | model.eval() # Change model to 'eval' mode (BN uses moving mean/var). 89 | total = 0 90 | 91 | idx_tensor = [idx for idx in range(66)] 92 | idx_tensor = torch.FloatTensor(idx_tensor).cuda(gpu) 93 | 94 | yaw_error = .0 95 | pitch_error = .0 96 | roll_error = .0 97 | 98 | l1loss = torch.nn.L1Loss(size_average=False) 99 | 100 | for i, (images, labels, cont_labels, name) in enumerate(test_loader): 101 | images = Variable(images).cuda(gpu) 102 | total += cont_labels.size(0) 103 | 104 | label_yaw = cont_labels[:,0].float() 105 | label_pitch = cont_labels[:,1].float() 106 | label_roll = cont_labels[:,2].float() 107 | 108 | yaw, pitch, roll = model(images) 109 | 110 | # Binned predictions 111 | _, yaw_bpred = torch.max(yaw.data, 1) 112 | _, pitch_bpred = torch.max(pitch.data, 1) 113 | _, roll_bpred = torch.max(roll.data, 1) 114 | 115 | # Continuous predictions 116 | yaw_predicted = utils.softmax_temperature(yaw.data, 1) 117 | pitch_predicted = utils.softmax_temperature(pitch.data, 1) 118 | roll_predicted = utils.softmax_temperature(roll.data, 1) 119 | 120 | yaw_predicted = torch.sum(yaw_predicted * idx_tensor, 1).cpu() * 3 - 99 121 | pitch_predicted = torch.sum(pitch_predicted * idx_tensor, 1).cpu() * 3 - 99 122 | roll_predicted = torch.sum(roll_predicted * idx_tensor, 1).cpu() * 3 - 99 123 | 124 | # Mean absolute error 125 | yaw_error += torch.sum(torch.abs(yaw_predicted - label_yaw)) 126 | pitch_error += torch.sum(torch.abs(pitch_predicted - label_pitch)) 127 | roll_error += torch.sum(torch.abs(roll_predicted - label_roll)) 128 | 129 | # Save first image in batch with pose cube or axis. 130 | if args.save_viz: 131 | name = name[0] 132 | if args.dataset == 'BIWI': 133 | cv2_img = cv2.imread(os.path.join(args.data_dir, name + '_rgb.png')) 134 | else: 135 | cv2_img = cv2.imread(os.path.join(args.data_dir, name + '.jpg')) 136 | if args.batch_size == 1: 137 | error_string = 'y %.2f, p %.2f, r %.2f' % (torch.sum(torch.abs(yaw_predicted - label_yaw)), torch.sum(torch.abs(pitch_predicted - label_pitch)), torch.sum(torch.abs(roll_predicted - label_roll))) 138 | cv2.putText(cv2_img, error_string, (30, cv2_img.shape[0]- 30), fontFace=1, fontScale=1, color=(0,0,255), thickness=2) 139 | # utils.plot_pose_cube(cv2_img, yaw_predicted[0], pitch_predicted[0], roll_predicted[0], size=100) 140 | utils.draw_axis(cv2_img, yaw_predicted[0], pitch_predicted[0], roll_predicted[0], tdx = 200, tdy= 200, size=100) 141 | cv2.imwrite(os.path.join('output/images', name + '.jpg'), cv2_img) 142 | 143 | print('Test error in degrees of the model on the ' + str(total) + 144 | ' test images. 
Yaw: %.4f, Pitch: %.4f, Roll: %.4f' % (yaw_error / total, 145 | pitch_error / total, roll_error / total)) 146 | -------------------------------------------------------------------------------- /pose/test_on_video.py: -------------------------------------------------------------------------------- 1 | import sys, os, argparse 2 | 3 | import numpy as np 4 | import cv2 5 | import matplotlib.pyplot as plt 6 | 7 | import torch 8 | import torch.nn as nn 9 | from torch.autograd import Variable 10 | from torch.utils.data import DataLoader 11 | from torchvision import transforms 12 | import torch.backends.cudnn as cudnn 13 | import torchvision 14 | import torch.nn.functional as F 15 | from PIL import Image 16 | 17 | import datasets, hopenet, utils 18 | 19 | def parse_args(): 20 | """Parse input arguments.""" 21 | parser = argparse.ArgumentParser(description='Head pose estimation using the Hopenet network.') 22 | parser.add_argument('--gpu', dest='gpu_id', help='GPU device id to use [0]', 23 | default=0, type=int) 24 | parser.add_argument('--snapshot', dest='snapshot', help='Path of model snapshot.', 25 | default='', type=str) 26 | parser.add_argument('--video', dest='video_path', help='Path of video') 27 | parser.add_argument('--bboxes', dest='bboxes', help='Bounding box annotations of frames') 28 | parser.add_argument('--output_string', dest='output_string', help='String appended to output file') 29 | parser.add_argument('--n_frames', dest='n_frames', help='Number of frames', type=int) 30 | parser.add_argument('--fps', dest='fps', help='Frames per second of source video', type=float, default=30.) 31 | args = parser.parse_args() 32 | return args 33 | 34 | if __name__ == '__main__': 35 | args = parse_args() 36 | 37 | cudnn.enabled = True 38 | 39 | batch_size = 1 40 | gpu = args.gpu_id 41 | snapshot_path = args.snapshot 42 | out_dir = 'output/video' 43 | video_path = args.video_path 44 | 45 | if not os.path.exists(out_dir): 46 | os.makedirs(out_dir) 47 | 48 | if not os.path.exists(args.video_path): 49 | sys.exit('Video does not exist') 50 | 51 | # ResNet50 structure 52 | model = hopenet.Hopenet(torchvision.models.resnet.Bottleneck, [3, 4, 6, 3], 66) 53 | 54 | # print 'Loading snapshot.' 55 | # Load snapshot 56 | saved_state_dict = torch.load(snapshot_path) 57 | model.load_state_dict(saved_state_dict) 58 | 59 | # print 'Loading data.' 60 | 61 | transformations = transforms.Compose([transforms.Scale(224), 62 | transforms.CenterCrop(224), transforms.ToTensor(), 63 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]) 64 | 65 | model.cuda(gpu) 66 | 67 | # print 'Ready to test network.' 68 | 69 | # Test the Model 70 | model.eval() # Change model to 'eval' mode (BN uses moving mean/var). 
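# The --bboxes file consumed below appears to be a plain-text file with one detection per line,
# "frame_number x_min y_min x_max y_max ...", sorted by frame number and possibly with several
# lines for the same frame; this is inferred from the parsing loop further down, the tool that
# produces the file is not shown here.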
71 | total = 0 72 | 73 | idx_tensor = [idx for idx in range(66)] 74 | idx_tensor = torch.FloatTensor(idx_tensor).cuda(gpu) 75 | 76 | video = cv2.VideoCapture(video_path) 77 | 78 | # New cv2 79 | width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH)) # float 80 | height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT)) # float 81 | 82 | # Define the codec and create VideoWriter object 83 | fourcc = cv2.VideoWriter_fourcc(*'MJPG') 84 | out = cv2.VideoWriter('output/video/output-%s.avi' % args.output_string, fourcc, args.fps, (width, height)) 85 | 86 | # # Old cv2 87 | # width = int(video.get(cv2.cv.CV_CAP_PROP_FRAME_WIDTH)) # float 88 | # height = int(video.get(cv2.cv.CV_CAP_PROP_FRAME_HEIGHT)) # float 89 | # 90 | # # Define the codec and create VideoWriter object 91 | # fourcc = cv2.cv.CV_FOURCC(*'MJPG') 92 | # out = cv2.VideoWriter('output/video/output-%s.avi' % args.output_string, fourcc, 30.0, (width, height)) 93 | 94 | txt_out = open('output/video/output-%s.txt' % args.output_string, 'w') 95 | 96 | frame_num = 1 97 | 98 | with open(args.bboxes, 'r') as f: 99 | bbox_line_list = f.read().splitlines() 100 | 101 | idx = 0 102 | while idx < len(bbox_line_list): 103 | line = bbox_line_list[idx] 104 | line = line.strip('\n') 105 | line = line.split(' ') 106 | det_frame_num = int(line[0]) 107 | 108 | # print frame_num 109 | 110 | # Stop at a certain frame number 111 | if frame_num > args.n_frames: 112 | break 113 | 114 | # Save all frames as they are if they don't have bbox annotation. 115 | while frame_num < det_frame_num: 116 | ret, frame = video.read() 117 | if ret == False: 118 | out.release() 119 | video.release() 120 | txt_out.close() 121 | sys.exit(0) 122 | # out.write(frame) 123 | frame_num += 1 124 | 125 | # Start processing frame with bounding box 126 | ret,frame = video.read() 127 | if ret == False: 128 | break 129 | cv2_frame = cv2.cvtColor(frame,cv2.COLOR_BGR2RGB) 130 | 131 | while True: 132 | x_min, y_min, x_max, y_max = int(float(line[1])), int(float(line[2])), int(float(line[3])), int(float(line[4])) 133 | 134 | bbox_width = abs(x_max - x_min) 135 | bbox_height = abs(y_max - y_min) 136 | # x_min -= 3 * bbox_width / 4 137 | # x_max += 3 * bbox_width / 4 138 | # y_min -= 3 * bbox_height / 4 139 | # y_max += bbox_height / 4 140 | x_min -= 50 141 | x_max += 50 142 | y_min -= 50 143 | y_max += 30 144 | x_min = max(x_min, 0) 145 | y_min = max(y_min, 0) 146 | x_max = min(frame.shape[1], x_max) 147 | y_max = min(frame.shape[0], y_max) 148 | # Crop face loosely 149 | img = cv2_frame[y_min:y_max,x_min:x_max] 150 | img = Image.fromarray(img) 151 | 152 | # Transform 153 | img = transformations(img) 154 | img_shape = img.size() 155 | img = img.view(1, img_shape[0], img_shape[1], img_shape[2]) 156 | img = Variable(img).cuda(gpu) 157 | 158 | yaw, pitch, roll = model(img) 159 | 160 | yaw_predicted = F.softmax(yaw) 161 | pitch_predicted = F.softmax(pitch) 162 | roll_predicted = F.softmax(roll) 163 | # Get continuous predictions in degrees. 
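# (Same expectation trick as in the other test scripts: sum_i p_i * i * 3 - 99 over the 66 bins.
# For instance, a softmax concentrated on bin 40 decodes to roughly 40 * 3 - 99 = 21 degrees.)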
164 | yaw_predicted = torch.sum(yaw_predicted.data[0] * idx_tensor) * 3 - 99 165 | pitch_predicted = torch.sum(pitch_predicted.data[0] * idx_tensor) * 3 - 99 166 | roll_predicted = torch.sum(roll_predicted.data[0] * idx_tensor) * 3 - 99 167 | 168 | # Print new frame with cube and axis 169 | txt_out.write(str(frame_num) + ' %f %f %f\n' % (yaw_predicted, pitch_predicted, roll_predicted)) 170 | # utils.plot_pose_cube(frame, yaw_predicted, pitch_predicted, roll_predicted, (x_min + x_max) / 2, (y_min + y_max) / 2, size = bbox_width) 171 | utils.draw_axis(frame, yaw_predicted, pitch_predicted, roll_predicted, tdx = (x_min + x_max) / 2, tdy= (y_min + y_max) / 2, size = bbox_height/2) 172 | # Plot expanded bounding box 173 | # cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0,255,0), 1) 174 | 175 | # Peek next frame detection 176 | next_frame_num = int(bbox_line_list[idx+1].strip('\n').split(' ')[0]) 177 | # print 'next_frame_num ', next_frame_num 178 | if next_frame_num == det_frame_num: 179 | idx += 1 180 | line = bbox_line_list[idx].strip('\n').split(' ') 181 | det_frame_num = int(line[0]) 182 | else: 183 | break 184 | 185 | idx += 1 186 | out.write(frame) 187 | frame_num += 1 188 | 189 | out.release() 190 | video.release() 191 | txt_out.close() 192 | -------------------------------------------------------------------------------- /pose/test_on_video_dlib.py: -------------------------------------------------------------------------------- 1 | import sys, os, argparse 2 | 3 | import numpy as np 4 | import cv2 5 | import matplotlib.pyplot as plt 6 | 7 | import torch 8 | import torch.nn as nn 9 | from torch.autograd import Variable 10 | from torch.utils.data import DataLoader 11 | from torchvision import transforms 12 | import torch.backends.cudnn as cudnn 13 | import torchvision 14 | import torch.nn.functional as F 15 | from PIL import Image 16 | 17 | import datasets, hopenet, utils 18 | 19 | from skimage import io 20 | import dlib 21 | 22 | def parse_args(): 23 | """Parse input arguments.""" 24 | parser = argparse.ArgumentParser(description='Head pose estimation using the Hopenet network.') 25 | parser.add_argument('--gpu', dest='gpu_id', help='GPU device id to use [0]', 26 | default=0, type=int) 27 | parser.add_argument('--snapshot', dest='snapshot', help='Path of model snapshot.', 28 | default='', type=str) 29 | parser.add_argument('--face_model', dest='face_model', help='Path of DLIB face detection model.', 30 | default='', type=str) 31 | parser.add_argument('--video', dest='video_path', help='Path of video') 32 | parser.add_argument('--output_string', dest='output_string', help='String appended to output file') 33 | parser.add_argument('--n_frames', dest='n_frames', help='Number of frames', type=int) 34 | parser.add_argument('--fps', dest='fps', help='Frames per second of source video', type=float, default=30.) 
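# A hypothetical invocation (the paths below are placeholders, not files shipped with this repo):
#   python test_on_video_dlib.py --snapshot hopenet_robust_alpha1.pkl \
#       --face_model mmod_human_face_detector.dat --video input.mp4 \
#       --output_string demo --n_frames 300 --fps 30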
35 | args = parser.parse_args() 36 | return args 37 | 38 | if __name__ == '__main__': 39 | args = parse_args() 40 | 41 | cudnn.enabled = True 42 | 43 | batch_size = 1 44 | gpu = args.gpu_id 45 | snapshot_path = args.snapshot 46 | out_dir = 'output/video' 47 | video_path = args.video_path 48 | 49 | if not os.path.exists(out_dir): 50 | os.makedirs(out_dir) 51 | 52 | if not os.path.exists(args.video_path): 53 | sys.exit('Video does not exist') 54 | 55 | # ResNet50 structure 56 | model = hopenet.Hopenet(torchvision.models.resnet.Bottleneck, [3, 4, 6, 3], 66) 57 | 58 | # Dlib face detection model 59 | cnn_face_detector = dlib.cnn_face_detection_model_v1(args.face_model) 60 | 61 | print 'Loading snapshot.' 62 | # Load snapshot 63 | saved_state_dict = torch.load(snapshot_path) 64 | model.load_state_dict(saved_state_dict) 65 | 66 | print 'Loading data.' 67 | 68 | transformations = transforms.Compose([transforms.Scale(224), 69 | transforms.CenterCrop(224), transforms.ToTensor(), 70 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]) 71 | 72 | model.cuda(gpu) 73 | 74 | print 'Ready to test network.' 75 | 76 | # Test the Model 77 | model.eval() # Change model to 'eval' mode (BN uses moving mean/var). 78 | total = 0 79 | 80 | idx_tensor = [idx for idx in xrange(66)] 81 | idx_tensor = torch.FloatTensor(idx_tensor).cuda(gpu) 82 | 83 | video = cv2.VideoCapture(video_path) 84 | 85 | # New cv2 86 | width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH)) # float 87 | height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT)) # float 88 | 89 | # Define the codec and create VideoWriter object 90 | fourcc = cv2.VideoWriter_fourcc(*'MJPG') 91 | out = cv2.VideoWriter('output/video/output-%s.avi' % args.output_string, fourcc, args.fps, (width, height)) 92 | 93 | # # Old cv2 94 | # width = int(video.get(cv2.cv.CV_CAP_PROP_FRAME_WIDTH)) # float 95 | # height = int(video.get(cv2.cv.CV_CAP_PROP_FRAME_HEIGHT)) # float 96 | # 97 | # # Define the codec and create VideoWriter object 98 | # fourcc = cv2.cv.CV_FOURCC(*'MJPG') 99 | # out = cv2.VideoWriter('output/video/output-%s.avi' % args.output_string, fourcc, 30.0, (width, height)) 100 | 101 | txt_out = open('output/video/output-%s.txt' % args.output_string, 'w') 102 | 103 | frame_num = 1 104 | 105 | while frame_num <= args.n_frames: 106 | print frame_num 107 | 108 | ret,frame = video.read() 109 | if ret == False: 110 | break 111 | 112 | cv2_frame = cv2.cvtColor(frame,cv2.COLOR_BGR2RGB) 113 | 114 | # Dlib detect 115 | dets = cnn_face_detector(cv2_frame, 1) 116 | 117 | for idx, det in enumerate(dets): 118 | # Get x_min, y_min, x_max, y_max, conf 119 | x_min = det.rect.left() 120 | y_min = det.rect.top() 121 | x_max = det.rect.right() 122 | y_max = det.rect.bottom() 123 | conf = det.confidence 124 | 125 | if conf > 1.0: 126 | bbox_width = abs(x_max - x_min) 127 | bbox_height = abs(y_max - y_min) 128 | x_min -= 2 * bbox_width / 4 129 | x_max += 2 * bbox_width / 4 130 | y_min -= 3 * bbox_height / 4 131 | y_max += bbox_height / 4 132 | x_min = max(x_min, 0); y_min = max(y_min, 0) 133 | x_max = min(frame.shape[1], x_max); y_max = min(frame.shape[0], y_max) 134 | # Crop image 135 | img = cv2_frame[y_min:y_max,x_min:x_max] 136 | img = Image.fromarray(img) 137 | 138 | # Transform 139 | img = transformations(img) 140 | img_shape = img.size() 141 | img = img.view(1, img_shape[0], img_shape[1], img_shape[2]) 142 | img = Variable(img).cuda(gpu) 143 | 144 | yaw, pitch, roll = model(img) 145 | 146 | yaw_predicted = F.softmax(yaw) 147 | pitch_predicted = F.softmax(pitch) 
148 | roll_predicted = F.softmax(roll) 149 | # Get continuous predictions in degrees. 150 | yaw_predicted = torch.sum(yaw_predicted.data[0] * idx_tensor) * 3 - 99 151 | pitch_predicted = torch.sum(pitch_predicted.data[0] * idx_tensor) * 3 - 99 152 | roll_predicted = torch.sum(roll_predicted.data[0] * idx_tensor) * 3 - 99 153 | 154 | # Print new frame with cube and axis 155 | txt_out.write(str(frame_num) + ' %f %f %f\n' % (yaw_predicted, pitch_predicted, roll_predicted)) 156 | # utils.plot_pose_cube(frame, yaw_predicted, pitch_predicted, roll_predicted, (x_min + x_max) / 2, (y_min + y_max) / 2, size = bbox_width) 157 | utils.draw_axis(frame, yaw_predicted, pitch_predicted, roll_predicted, tdx = (x_min + x_max) / 2, tdy= (y_min + y_max) / 2, size = bbox_height/2) 158 | # Plot expanded bounding box 159 | # cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0,255,0), 1) 160 | 161 | out.write(frame) 162 | frame_num += 1 163 | 164 | out.release() 165 | video.release() 166 | -------------------------------------------------------------------------------- /pose/test_on_video_dockerface.py: -------------------------------------------------------------------------------- 1 | import sys, os, argparse 2 | 3 | import numpy as np 4 | import cv2 5 | import matplotlib.pyplot as plt 6 | 7 | import torch 8 | import torch.nn as nn 9 | from torch.autograd import Variable 10 | from torch.utils.data import DataLoader 11 | from torchvision import transforms 12 | import torch.backends.cudnn as cudnn 13 | import torchvision 14 | import torch.nn.functional as F 15 | from PIL import Image 16 | 17 | import datasets, hopenet, utils 18 | 19 | def parse_args(): 20 | """Parse input arguments.""" 21 | parser = argparse.ArgumentParser(description='Head pose estimation using the Hopenet network.') 22 | parser.add_argument('--gpu', dest='gpu_id', help='GPU device id to use [0]', 23 | default=0, type=int) 24 | parser.add_argument('--snapshot', dest='snapshot', help='Path of model snapshot.', 25 | default='', type=str) 26 | parser.add_argument('--video', dest='video_path', help='Path of video') 27 | parser.add_argument('--bboxes', dest='bboxes', help='Bounding box annotations of frames') 28 | parser.add_argument('--output_string', dest='output_string', help='String appended to output file') 29 | parser.add_argument('--n_frames', dest='n_frames', help='Number of frames', type=int) 30 | parser.add_argument('--fps', dest='fps', help='Frames per second of source video', type=float, default=30.) 31 | args = parser.parse_args() 32 | return args 33 | 34 | if __name__ == '__main__': 35 | args = parse_args() 36 | 37 | cudnn.enabled = True 38 | 39 | batch_size = 1 40 | gpu = args.gpu_id 41 | snapshot_path = args.snapshot 42 | out_dir = 'output/video' 43 | video_path = args.video_path 44 | 45 | if not os.path.exists(out_dir): 46 | os.makedirs(out_dir) 47 | 48 | if not os.path.exists(args.video_path): 49 | sys.exit('Video does not exist') 50 | 51 | # ResNet50 structure 52 | model = hopenet.Hopenet(torchvision.models.resnet.Bottleneck, [3, 4, 6, 3], 66) 53 | 54 | print 'Loading snapshot.' 55 | # Load snapshot 56 | saved_state_dict = torch.load(snapshot_path) 57 | model.load_state_dict(saved_state_dict) 58 | 59 | print 'Loading data.' 60 | 61 | transformations = transforms.Compose([transforms.Scale(224), 62 | transforms.CenterCrop(224), transforms.ToTensor(), 63 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]) 64 | 65 | model.cuda(gpu) 66 | 67 | print 'Ready to test network.' 
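# This variant expects dockerface-style annotations: besides the frame number and box
# coordinates, each line carries a detection confidence (line[5] in the loop further down),
# and only detections with confidence above 0.98 are cropped and passed through the head-pose
# network. Like test_on_video_dlib.py, the script is written in Python 2 style.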
68 | 69 | # Test the Model 70 | model.eval() # Change model to 'eval' mode (BN uses moving mean/var). 71 | total = 0 72 | 73 | idx_tensor = [idx for idx in xrange(66)] 74 | idx_tensor = torch.FloatTensor(idx_tensor).cuda(gpu) 75 | 76 | video = cv2.VideoCapture(video_path) 77 | 78 | # New cv2 79 | width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH)) # float 80 | height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT)) # float 81 | 82 | # Define the codec and create VideoWriter object 83 | fourcc = cv2.VideoWriter_fourcc(*'MJPG') 84 | out = cv2.VideoWriter('output/video/output-%s.avi' % args.output_string, fourcc, args.fps, (width, height)) 85 | 86 | # # Old cv2 87 | # width = int(video.get(cv2.cv.CV_CAP_PROP_FRAME_WIDTH)) # float 88 | # height = int(video.get(cv2.cv.CV_CAP_PROP_FRAME_HEIGHT)) # float 89 | # 90 | # # Define the codec and create VideoWriter object 91 | # fourcc = cv2.cv.CV_FOURCC(*'MJPG') 92 | # out = cv2.VideoWriter('output/video/output-%s.avi' % args.output_string, fourcc, 30.0, (width, height)) 93 | 94 | txt_out = open('output/video/output-%s.txt' % args.output_string, 'w') 95 | 96 | frame_num = 1 97 | 98 | with open(args.bboxes, 'r') as f: 99 | bbox_line_list = f.read().splitlines() 100 | 101 | idx = 0 102 | while idx < len(bbox_line_list): 103 | line = bbox_line_list[idx] 104 | line = line.strip('\n') 105 | line = line.split(' ') 106 | det_frame_num = int(line[0]) 107 | 108 | print frame_num 109 | 110 | # Stop at a certain frame number 111 | if frame_num > args.n_frames: 112 | break 113 | 114 | # Save all frames as they are if they don't have bbox annotation. 115 | while frame_num < det_frame_num: 116 | ret, frame = video.read() 117 | if ret == False: 118 | out.release() 119 | video.release() 120 | txt_out.close() 121 | sys.exit(0) 122 | out.write(frame) 123 | frame_num += 1 124 | 125 | # Start processing frame with bounding box 126 | ret,frame = video.read() 127 | if ret == False: 128 | break 129 | cv2_frame = cv2.cvtColor(frame,cv2.COLOR_BGR2RGB) 130 | 131 | while True: 132 | x_min, y_min, x_max, y_max, conf = int(float(line[1])), int(float(line[2])), int(float(line[3])), int(float(line[4])), float(line[5]) 133 | 134 | if conf > 0.98: 135 | bbox_width = abs(x_max - x_min) 136 | bbox_height = abs(y_max - y_min) 137 | # x_min -= 3 * bbox_width / 4 138 | # x_max += 3 * bbox_width / 4 139 | # y_min -= 3 * bbox_height / 4 140 | # y_max += bbox_height / 4 141 | x_min -= 50 142 | x_max += 50 143 | y_min -= 50 144 | y_max += 30 145 | x_min = max(x_min, 0) 146 | y_min = max(y_min, 0) 147 | x_max = min(frame.shape[1], x_max) 148 | y_max = min(frame.shape[0], y_max) 149 | # Crop image 150 | img = cv2_frame[y_min:y_max,x_min:x_max] 151 | img = Image.fromarray(img) 152 | 153 | # Transform 154 | img = transformations(img) 155 | img_shape = img.size() 156 | img = img.view(1, img_shape[0], img_shape[1], img_shape[2]) 157 | img = Variable(img).cuda(gpu) 158 | 159 | yaw, pitch, roll = model(img) 160 | 161 | yaw_predicted = F.softmax(yaw) 162 | pitch_predicted = F.softmax(pitch) 163 | roll_predicted = F.softmax(roll) 164 | # Get continuous predictions in degrees. 
165 | yaw_predicted = torch.sum(yaw_predicted.data[0] * idx_tensor) * 3 - 99 166 | pitch_predicted = torch.sum(pitch_predicted.data[0] * idx_tensor) * 3 - 99 167 | roll_predicted = torch.sum(roll_predicted.data[0] * idx_tensor) * 3 - 99 168 | 169 | # Print new frame with cube and axis 170 | txt_out.write(str(frame_num) + ' %f %f %f\n' % (yaw_predicted, pitch_predicted, roll_predicted)) 171 | # utils.plot_pose_cube(frame, yaw_predicted, pitch_predicted, roll_predicted, (x_min + x_max) / 2, (y_min + y_max) / 2, size = bbox_width) 172 | utils.draw_axis(frame, yaw_predicted, pitch_predicted, roll_predicted, tdx = (x_min + x_max) / 2, tdy= (y_min + y_max) / 2, size = bbox_height/2) 173 | # Plot expanded bounding box 174 | # cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), (0,255,0), 1) 175 | 176 | # Peek next frame detection 177 | next_frame_num = int(bbox_line_list[idx+1].strip('\n').split(' ')[0]) 178 | # print 'next_frame_num ', next_frame_num 179 | if next_frame_num == det_frame_num: 180 | idx += 1 181 | line = bbox_line_list[idx].strip('\n').split(' ') 182 | det_frame_num = int(line[0]) 183 | else: 184 | break 185 | 186 | idx += 1 187 | out.write(frame) 188 | frame_num += 1 189 | 190 | out.release() 191 | video.release() 192 | txt_out.close() 193 | -------------------------------------------------------------------------------- /pose/test_resnet50_regression.py: -------------------------------------------------------------------------------- 1 | import sys, os, argparse 2 | 3 | import numpy as np 4 | import cv2 5 | import matplotlib.pyplot as plt 6 | 7 | import torch 8 | import torch.nn as nn 9 | from torch.autograd import Variable 10 | from torch.utils.data import DataLoader 11 | from torchvision import transforms 12 | import torch.backends.cudnn as cudnn 13 | import torchvision 14 | import torch.nn.functional as F 15 | 16 | import datasets, hopenet, utils 17 | 18 | def parse_args(): 19 | """Parse input arguments.""" 20 | parser = argparse.ArgumentParser(description='Head pose estimation using the Hopenet network.') 21 | parser.add_argument('--gpu', dest='gpu_id', help='GPU device id to use [0]', 22 | default=0, type=int) 23 | parser.add_argument('--data_dir', dest='data_dir', help='Directory path for data.', 24 | default='', type=str) 25 | parser.add_argument('--filename_list', dest='filename_list', help='Path to text file containing relative paths for every example.', 26 | default='', type=str) 27 | parser.add_argument('--snapshot', dest='snapshot', help='Name of model snapshot.', 28 | default='', type=str) 29 | parser.add_argument('--batch_size', dest='batch_size', help='Batch size.', 30 | default=1, type=int) 31 | parser.add_argument('--save_viz', dest='save_viz', help='Save images with pose cube.', 32 | default=False, type=bool) 33 | parser.add_argument('--dataset', dest='dataset', help='Dataset type.', default='AFLW2000', type=str) 34 | 35 | args = parser.parse_args() 36 | 37 | return args 38 | 39 | if __name__ == '__main__': 40 | args = parse_args() 41 | 42 | cudnn.enabled = True 43 | gpu = args.gpu_id 44 | snapshot_path = args.snapshot 45 | 46 | model = hopenet.ResNet(torchvision.models.resnet.Bottleneck, [3, 4, 6, 3], 3) 47 | 48 | print ('Loading snapshot.') 49 | # Load snapshot 50 | saved_state_dict = torch.load(snapshot_path) 51 | model.load_state_dict(saved_state_dict) 52 | 53 | print ('Loading data.') 54 | 55 | transformations = transforms.Compose([transforms.Scale(224), 56 | transforms.CenterCrop(224), transforms.ToTensor(), 57 | transforms.Normalize(mean=[0.485, 0.456, 
0.406], std=[0.229, 0.224, 0.225])]) 58 | 59 | if args.dataset == 'Pose_300W_LP': 60 | pose_dataset = datasets.Pose_300W_LP(args.data_dir, args.filename_list, transformations) 61 | elif args.dataset == 'Pose_300W_LP_random_ds': 62 | pose_dataset = datasets.Pose_300W_LP_random_ds(args.data_dir, args.filename_list, transformations) 63 | elif args.dataset == 'AFLW2000': 64 | pose_dataset = datasets.AFLW2000(args.data_dir, args.filename_list, transformations) 65 | elif args.dataset == 'AFLW2000_ds': 66 | pose_dataset = datasets.AFLW2000_ds(args.data_dir, args.filename_list, transformations) 67 | elif args.dataset == 'BIWI': 68 | pose_dataset = datasets.BIWI(args.data_dir, args.filename_list, transformations) 69 | elif args.dataset == 'AFLW': 70 | pose_dataset = datasets.AFLW(args.data_dir, args.filename_list, transformations) 71 | elif args.dataset == 'AFLW_aug': 72 | pose_dataset = datasets.AFLW_aug(args.data_dir, args.filename_list, transformations) 73 | elif args.dataset == 'AFW': 74 | pose_dataset = datasets.AFW(args.data_dir, args.filename_list, transformations) 75 | else: 76 | print ('Error: not a valid dataset name') 77 | sys.exit() 78 | test_loader = torch.utils.data.DataLoader(dataset=pose_dataset, 79 | batch_size=args.batch_size, 80 | num_workers=2) 81 | 82 | model.cuda(gpu) 83 | 84 | print ('Ready to test network.') 85 | 86 | # Test the Model 87 | model.eval() # Change model to 'eval' mode (BN uses moving mean/var). 88 | total = 0 89 | 90 | yaw_error = .0 91 | pitch_error = .0 92 | roll_error = .0 93 | 94 | l1loss = torch.nn.L1Loss(size_average=False) 95 | 96 | for i, (images, labels, cont_labels, name) in enumerate(test_loader): 97 | images = Variable(images).cuda(gpu) 98 | total += cont_labels.size(0) 99 | label_yaw = cont_labels[:,0].float() 100 | label_pitch = cont_labels[:,1].float() 101 | label_roll = cont_labels[:,2].float() 102 | 103 | angles = model(images) 104 | yaw_predicted = angles[:,0].data.cpu() 105 | pitch_predicted = angles[:,1].data.cpu() 106 | roll_predicted = angles[:,2].data.cpu() 107 | 108 | # Mean absolute error 109 | yaw_error += torch.sum(torch.abs(yaw_predicted - label_yaw)) 110 | pitch_error += torch.sum(torch.abs(pitch_predicted - label_pitch)) 111 | roll_error += torch.sum(torch.abs(roll_predicted - label_roll)) 112 | 113 | # Save first image in batch with pose cube or axis. 114 | if args.save_viz: 115 | name = name[0] 116 | if args.dataset == 'BIWI': 117 | cv2_img = cv2.imread(os.path.join(args.data_dir, name + '_rgb.png')) 118 | else: 119 | cv2_img = cv2.imread(os.path.join(args.data_dir, name + '.jpg')) 120 | if args.batch_size == 1: 121 | error_string = 'y %.2f, p %.2f, r %.2f' % (torch.sum(torch.abs(yaw_predicted - label_yaw)), torch.sum(torch.abs(pitch_predicted - label_pitch)), torch.sum(torch.abs(roll_predicted - label_roll))) 122 | cv2.putText(cv2_img, error_string, (30, cv2_img.shape[0]- 30), fontFace=1, fontScale=1, color=(0,0,255), thickness=1) 123 | # utils.plot_pose_cube(cv2_img, yaw_predicted[0], pitch_predicted[0], roll_predicted[0], size=100) 124 | utils.draw_axis(cv2_img, yaw_predicted[0], pitch_predicted[0], roll_predicted[0], tdx = 200, tdy= 200, size=100) 125 | cv2.imwrite(os.path.join('output/images', name + '.jpg'), cv2_img) 126 | 127 | print('Test error in degrees of the model on the ' + str(total) + 128 | ' test images. 
Yaw: %.4f, Pitch: %.4f, Roll: %.4f' % (yaw_error / total, 129 | pitch_error / total, roll_error / total)) 130 | -------------------------------------------------------------------------------- /pose/train_alexnet.py: -------------------------------------------------------------------------------- 1 | import sys, os, argparse, time 2 | 3 | import numpy as np 4 | import cv2 5 | import matplotlib.pyplot as plt 6 | 7 | import torch 8 | import torch.nn as nn 9 | from torch.autograd import Variable 10 | from torch.utils.data import DataLoader 11 | from torchvision import transforms 12 | import torchvision 13 | import torch.backends.cudnn as cudnn 14 | import torch.nn.functional as F 15 | 16 | import datasets, hopenet 17 | import torch.utils.model_zoo as model_zoo 18 | 19 | model_urls = { 20 | 'alexnet': 'https://download.pytorch.org/models/alexnet-owt-4df8aa71.pth', 21 | } 22 | 23 | def parse_args(): 24 | """Parse input arguments.""" 25 | parser = argparse.ArgumentParser(description='Head pose estimation using the Hopenet network.') 26 | parser.add_argument('--gpu', dest='gpu_id', help='GPU device id to use [0]', 27 | default=0, type=int) 28 | parser.add_argument('--num_epochs', dest='num_epochs', help='Maximum number of training epochs.', 29 | default=5, type=int) 30 | parser.add_argument('--batch_size', dest='batch_size', help='Batch size.', 31 | default=16, type=int) 32 | parser.add_argument('--lr', dest='lr', help='Base learning rate.', 33 | default=0.001, type=float) 34 | parser.add_argument('--data_dir', dest='data_dir', help='Directory path for data.', 35 | default='', type=str) 36 | parser.add_argument('--filename_list', dest='filename_list', help='Path to text file containing relative paths for every example.', 37 | default='', type=str) 38 | parser.add_argument('--output_string', dest='output_string', help='String appended to output snapshots.', default = '', type=str) 39 | parser.add_argument('--alpha', dest='alpha', help='Regression loss coefficient.', 40 | default=0.001, type=float) 41 | parser.add_argument('--dataset', dest='dataset', help='Dataset type.', default='Pose_300W_LP', type=str) 42 | args = parser.parse_args() 43 | return args 44 | 45 | def get_ignored_params(model): 46 | # Generator function that yields ignored params. 47 | b = [model.features[0], model.features[1], model.features[2]] 48 | for i in range(len(b)): 49 | for module_name, module in b[i].named_modules(): 50 | if 'bn' in module_name: 51 | module.eval() 52 | for name, param in module.named_parameters(): 53 | yield param 54 | 55 | def get_non_ignored_params(model): 56 | # Generator function that yields params that will be optimized. 
57 | b = [] 58 | for idx in xrange(3, len(model.features)): 59 | b.append(model.features[idx]) 60 | for layer in model.classifier: 61 | b.append(layer) 62 | for i in range(len(b)): 63 | for module_name, module in b[i].named_modules(): 64 | if 'bn' in module_name: 65 | module.eval() 66 | for name, param in module.named_parameters(): 67 | yield param 68 | 69 | def get_fc_params(model): 70 | b = [model.fc_yaw, model.fc_pitch, model.fc_roll] 71 | for i in range(len(b)): 72 | for module_name, module in b[i].named_modules(): 73 | for name, param in module.named_parameters(): 74 | yield param 75 | 76 | def load_filtered_state_dict(model, snapshot): 77 | # By user apaszke from discuss.pytorch.org 78 | model_dict = model.state_dict() 79 | snapshot = {k: v for k, v in snapshot.items() if k in model_dict} 80 | model_dict.update(snapshot) 81 | model.load_state_dict(model_dict) 82 | 83 | if __name__ == '__main__': 84 | args = parse_args() 85 | 86 | cudnn.enabled = True 87 | num_epochs = args.num_epochs 88 | batch_size = args.batch_size 89 | gpu = args.gpu_id 90 | 91 | if not os.path.exists('output/snapshots'): 92 | os.makedirs('output/snapshots') 93 | 94 | model = hopenet.AlexNet(66) 95 | load_filtered_state_dict(model, model_zoo.load_url(model_urls['alexnet'])) 96 | 97 | print 'Loading data.' 98 | 99 | transformations = transforms.Compose([transforms.Scale(240), 100 | transforms.RandomCrop(224), transforms.ToTensor(), 101 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]) 102 | 103 | if args.dataset == 'Pose_300W_LP': 104 | pose_dataset = datasets.Pose_300W_LP(args.data_dir, args.filename_list, transformations) 105 | elif args.dataset == 'Pose_300W_LP_random_ds': 106 | pose_dataset = datasets.Pose_300W_LP_random_ds(args.data_dir, args.filename_list, transformations) 107 | elif args.dataset == 'AFLW2000': 108 | pose_dataset = datasets.AFLW2000(args.data_dir, args.filename_list, transformations) 109 | elif args.dataset == 'BIWI': 110 | pose_dataset = datasets.BIWI(args.data_dir, args.filename_list, transformations) 111 | elif args.dataset == 'AFLW': 112 | pose_dataset = datasets.AFLW(args.data_dir, args.filename_list, transformations) 113 | elif args.dataset == 'AFLW_aug': 114 | pose_dataset = datasets.AFLW_aug(args.data_dir, args.filename_list, transformations) 115 | elif args.dataset == 'AFW': 116 | pose_dataset = datasets.AFW(args.data_dir, args.filename_list, transformations) 117 | else: 118 | print 'Error: not a valid dataset name' 119 | sys.exit() 120 | train_loader = torch.utils.data.DataLoader(dataset=pose_dataset, 121 | batch_size=batch_size, 122 | shuffle=True, 123 | num_workers=2) 124 | 125 | model.cuda(gpu) 126 | softmax = nn.Softmax().cuda(gpu) 127 | criterion = nn.CrossEntropyLoss().cuda(gpu) 128 | reg_criterion = nn.MSELoss().cuda(gpu) 129 | # Regression loss coefficient 130 | alpha = args.alpha 131 | 132 | idx_tensor = [idx for idx in xrange(66)] 133 | idx_tensor = Variable(torch.FloatTensor(idx_tensor)).cuda(gpu) 134 | 135 | optimizer = torch.optim.Adam([{'params': get_ignored_params(model), 'lr': 0}, 136 | {'params': get_non_ignored_params(model), 'lr': args.lr}, 137 | {'params': get_fc_params(model), 'lr': args.lr * 5}], 138 | lr = args.lr) 139 | 140 | print 'Ready to train network.' 
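The loop that follows trains each angle with two coupled terms: a cross-entropy loss on the 66-bin (3° wide) label, plus `alpha` times an MSE loss between the softmax-expected angle in degrees and the continuous label. A compact per-angle sketch of that combination (hypothetical helper; a functional-API variant of what the loop does with `criterion`, `reg_criterion` and `softmax`):

```python
import torch
import torch.nn.functional as F

def binned_plus_regression_loss(logits, bin_label, cont_label, idx_tensor, alpha=0.001):
    """Cross-entropy on the binned angle plus alpha * MSE on the decoded continuous angle."""
    ce = F.cross_entropy(logits, bin_label)                       # 66-way classification term
    expected_deg = torch.sum(F.softmax(logits, dim=1) * idx_tensor, dim=1) * 3 - 99
    mse = F.mse_loss(expected_deg, cont_label)                    # regression term in degrees
    return ce + alpha * mse

# e.g. loss_yaw = binned_plus_regression_loss(pre_yaw, label_yaw, label_yaw_cont, idx_tensor, args.alpha)
```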
141 | for epoch in range(num_epochs):
142 | for i, (images, labels, cont_labels, name) in enumerate(train_loader):
143 | images = Variable(images).cuda(gpu)
144 |
145 | # Binned labels
146 | label_yaw = Variable(labels[:,0]).cuda(gpu)
147 | label_pitch = Variable(labels[:,1]).cuda(gpu)
148 | label_roll = Variable(labels[:,2]).cuda(gpu)
149 |
150 | # Continuous labels
151 | label_yaw_cont = Variable(cont_labels[:,0]).cuda(gpu)
152 | label_pitch_cont = Variable(cont_labels[:,1]).cuda(gpu)
153 | label_roll_cont = Variable(cont_labels[:,2]).cuda(gpu)
154 |
155 | # Forward pass
156 | pre_yaw, pre_pitch, pre_roll = model(images)
157 |
158 | # Cross entropy loss
159 | loss_yaw = criterion(pre_yaw, label_yaw)
160 | loss_pitch = criterion(pre_pitch, label_pitch)
161 | loss_roll = criterion(pre_roll, label_roll)
162 |
163 | # MSE loss
164 | yaw_predicted = softmax(pre_yaw)
165 | pitch_predicted = softmax(pre_pitch)
166 | roll_predicted = softmax(pre_roll)
167 |
168 | yaw_predicted = torch.sum(yaw_predicted * idx_tensor, 1) * 3 - 99
169 | pitch_predicted = torch.sum(pitch_predicted * idx_tensor, 1) * 3 - 99
170 | roll_predicted = torch.sum(roll_predicted * idx_tensor, 1) * 3 - 99
171 |
172 | loss_reg_yaw = reg_criterion(yaw_predicted, label_yaw_cont)
173 | loss_reg_pitch = reg_criterion(pitch_predicted, label_pitch_cont)
174 | loss_reg_roll = reg_criterion(roll_predicted, label_roll_cont)
175 |
176 | # Total loss
177 | loss_yaw += alpha * loss_reg_yaw
178 | loss_pitch += alpha * loss_reg_pitch
179 | loss_roll += alpha * loss_reg_roll
180 |
181 | loss_seq = [loss_yaw, loss_pitch, loss_roll]
182 | grad_seq = [torch.ones(1).cuda(gpu) for _ in range(len(loss_seq))]
183 | optimizer.zero_grad(); torch.autograd.backward(loss_seq, grad_seq)  # clear stale gradients before backpropagating
184 | optimizer.step()
185 |
186 | if (i+1) % 100 == 0:
187 | print ('Epoch [%d/%d], Iter [%d/%d] Losses: Yaw %.4f, Pitch %.4f, Roll %.4f'
188 | %(epoch+1, num_epochs, i+1, len(pose_dataset)//batch_size, loss_yaw.data[0], loss_pitch.data[0], loss_roll.data[0]))
189 |
190 | # Save models at numbered epochs.
191 | if epoch % 1 == 0 and epoch < num_epochs:
192 | print('Taking snapshot...')
193 | torch.save(model.state_dict(), 194 | 'output/snapshots/' + args.output_string + '_epoch_'+ str(epoch+1) + '.pkl') 195 | -------------------------------------------------------------------------------- /pose/train_hopenet.py: -------------------------------------------------------------------------------- 1 | import sys, os, argparse, time 2 | 3 | import numpy as np 4 | import cv2 5 | import matplotlib.pyplot as plt 6 | 7 | import torch 8 | import torch.nn as nn 9 | from torch.autograd import Variable 10 | from torch.utils.data import DataLoader 11 | from torchvision import transforms 12 | import torchvision 13 | import torch.backends.cudnn as cudnn 14 | import torch.nn.functional as F 15 | 16 | import datasets, hopenet 17 | import torch.utils.model_zoo as model_zoo 18 | 19 | def parse_args(): 20 | """Parse input arguments.""" 21 | parser = argparse.ArgumentParser(description='Head pose estimation using the Hopenet network.') 22 | parser.add_argument('--gpu', dest='gpu_id', help='GPU device id to use [0]', 23 | default=0, type=int) 24 | parser.add_argument('--num_epochs', dest='num_epochs', help='Maximum number of training epochs.', 25 | default=5, type=int) 26 | parser.add_argument('--batch_size', dest='batch_size', help='Batch size.', 27 | default=16, type=int) 28 | parser.add_argument('--lr', dest='lr', help='Base learning rate.', 29 | default=0.001, type=float) 30 | parser.add_argument('--dataset', dest='dataset', help='Dataset type.', default='Pose_300W_LP', type=str) 31 | parser.add_argument('--data_dir', dest='data_dir', help='Directory path for data.', 32 | default='', type=str) 33 | parser.add_argument('--filename_list', dest='filename_list', help='Path to text file containing relative paths for every example.', 34 | default='', type=str) 35 | parser.add_argument('--output_string', dest='output_string', help='String appended to output snapshots.', default = '', type=str) 36 | parser.add_argument('--alpha', dest='alpha', help='Regression loss coefficient.', 37 | default=0.001, type=float) 38 | parser.add_argument('--snapshot', dest='snapshot', help='Path of model snapshot.', 39 | default='', type=str) 40 | 41 | args = parser.parse_args() 42 | return args 43 | 44 | def get_ignored_params(model): 45 | # Generator function that yields ignored params. 46 | b = [model.conv1, model.bn1, model.fc_finetune] 47 | for i in range(len(b)): 48 | for module_name, module in b[i].named_modules(): 49 | if 'bn' in module_name: 50 | module.eval() 51 | for name, param in module.named_parameters(): 52 | yield param 53 | 54 | def get_non_ignored_params(model): 55 | # Generator function that yields params that will be optimized. 56 | b = [model.layer1, model.layer2, model.layer3, model.layer4] 57 | for i in range(len(b)): 58 | for module_name, module in b[i].named_modules(): 59 | if 'bn' in module_name: 60 | module.eval() 61 | for name, param in module.named_parameters(): 62 | yield param 63 | 64 | def get_fc_params(model): 65 | # Generator function that yields fc layer params. 
66 | b = [model.fc_yaw, model.fc_pitch, model.fc_roll] 67 | for i in range(len(b)): 68 | for module_name, module in b[i].named_modules(): 69 | for name, param in module.named_parameters(): 70 | yield param 71 | 72 | def load_filtered_state_dict(model, snapshot): 73 | # By user apaszke from discuss.pytorch.org 74 | model_dict = model.state_dict() 75 | snapshot = {k: v for k, v in snapshot.items() if k in model_dict} 76 | model_dict.update(snapshot) 77 | model.load_state_dict(model_dict) 78 | 79 | if __name__ == '__main__': 80 | args = parse_args() 81 | 82 | cudnn.enabled = True 83 | num_epochs = args.num_epochs 84 | batch_size = args.batch_size 85 | gpu = args.gpu_id 86 | 87 | if not os.path.exists('output/snapshots'): 88 | os.makedirs('output/snapshots') 89 | 90 | # ResNet50 structure 91 | model = hopenet.Hopenet(torchvision.models.resnet.Bottleneck, [3, 4, 6, 3], 66) 92 | 93 | if args.snapshot == '': 94 | load_filtered_state_dict(model, model_zoo.load_url('https://download.pytorch.org/models/resnet50-19c8e357.pth')) 95 | else: 96 | saved_state_dict = torch.load(args.snapshot) 97 | model.load_state_dict(saved_state_dict) 98 | 99 | # print 'Loading data.' 100 | 101 | transformations = transforms.Compose([transforms.Scale(240), 102 | transforms.RandomCrop(224), transforms.ToTensor(), 103 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]) 104 | 105 | if args.dataset == 'Pose_300W_LP': 106 | pose_dataset = datasets.Pose_300W_LP(args.data_dir, args.filename_list, transformations) 107 | elif args.dataset == 'Pose_300W_LP_random_ds': 108 | pose_dataset = datasets.Pose_300W_LP_random_ds(args.data_dir, args.filename_list, transformations) 109 | elif args.dataset == 'Synhead': 110 | pose_dataset = datasets.Synhead(args.data_dir, args.filename_list, transformations) 111 | elif args.dataset == 'AFLW2000': 112 | pose_dataset = datasets.AFLW2000(args.data_dir, args.filename_list, transformations) 113 | elif args.dataset == 'BIWI': 114 | pose_dataset = datasets.BIWI(args.data_dir, args.filename_list, transformations) 115 | elif args.dataset == 'AFLW': 116 | pose_dataset = datasets.AFLW(args.data_dir, args.filename_list, transformations) 117 | elif args.dataset == 'AFLW_aug': 118 | pose_dataset = datasets.AFLW_aug(args.data_dir, args.filename_list, transformations) 119 | elif args.dataset == 'AFW': 120 | pose_dataset = datasets.AFW(args.data_dir, args.filename_list, transformations) 121 | else: 122 | # print 'Error: not a valid dataset name' 123 | sys.exit() 124 | 125 | train_loader = torch.utils.data.DataLoader(dataset=pose_dataset, 126 | batch_size=batch_size, 127 | shuffle=True, 128 | num_workers=2) 129 | 130 | model.cuda(gpu) 131 | criterion = nn.CrossEntropyLoss().cuda(gpu) 132 | reg_criterion = nn.MSELoss().cuda(gpu) 133 | # Regression loss coefficient 134 | alpha = args.alpha 135 | 136 | softmax = nn.Softmax().cuda(gpu) 137 | idx_tensor = [idx for idx in range(66)] 138 | idx_tensor = Variable(torch.FloatTensor(idx_tensor)).cuda(gpu) 139 | 140 | optimizer = torch.optim.Adam([{'params': get_ignored_params(model), 'lr': 0}, 141 | {'params': get_non_ignored_params(model), 'lr': args.lr}, 142 | {'params': get_fc_params(model), 'lr': args.lr * 5}], 143 | lr = args.lr) 144 | 145 | # print 'Ready to train network.' 
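The Adam optimizer built just above splits the network into three parameter groups: the stem (`conv1`, `bn1`, `fc_finetune`) is frozen with `lr = 0`, the ResNet stages train at the base rate, and the freshly initialised yaw/pitch/roll heads train at five times the base rate. A small sketch for checking what each group ended up with (illustrative; assumes the `optimizer` defined above):

```python
# Print the effective learning rate and parameter count of each optimizer group.
for gi, group in enumerate(optimizer.param_groups):
    n_params = sum(p.numel() for p in group['params'])
    print('group %d: lr=%g, %d tensors, %d parameters'
          % (gi, group['lr'], len(group['params']), n_params))
```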
146 | for epoch in range(num_epochs): 147 | for i, (images, labels, cont_labels, name) in enumerate(train_loader): 148 | images = Variable(images).cuda(gpu) 149 | 150 | # Binned labels 151 | label_yaw = Variable(labels[:,0]).cuda(gpu) 152 | label_pitch = Variable(labels[:,1]).cuda(gpu) 153 | label_roll = Variable(labels[:,2]).cuda(gpu) 154 | 155 | # Continuous labels 156 | label_yaw_cont = Variable(cont_labels[:,0]).cuda(gpu) 157 | label_pitch_cont = Variable(cont_labels[:,1]).cuda(gpu) 158 | label_roll_cont = Variable(cont_labels[:,2]).cuda(gpu) 159 | 160 | # Forward pass 161 | yaw, pitch, roll = model(images) 162 | 163 | # Cross entropy loss 164 | loss_yaw = criterion(yaw, label_yaw) 165 | loss_pitch = criterion(pitch, label_pitch) 166 | loss_roll = criterion(roll, label_roll) 167 | 168 | # MSE loss 169 | yaw_predicted = softmax(yaw) 170 | pitch_predicted = softmax(pitch) 171 | roll_predicted = softmax(roll) 172 | 173 | yaw_predicted = torch.sum(yaw_predicted * idx_tensor, 1) * 3 - 99 174 | pitch_predicted = torch.sum(pitch_predicted * idx_tensor, 1) * 3 - 99 175 | roll_predicted = torch.sum(roll_predicted * idx_tensor, 1) * 3 - 99 176 | 177 | loss_reg_yaw = reg_criterion(yaw_predicted, label_yaw_cont) 178 | loss_reg_pitch = reg_criterion(pitch_predicted, label_pitch_cont) 179 | loss_reg_roll = reg_criterion(roll_predicted, label_roll_cont) 180 | 181 | # Total loss 182 | loss_yaw += alpha * loss_reg_yaw 183 | loss_pitch += alpha * loss_reg_pitch 184 | loss_roll += alpha * loss_reg_roll 185 | 186 | loss_seq = [loss_yaw, loss_pitch, loss_roll] 187 | grad_seq = [torch.ones(1).cuda(gpu) for _ in range(len(loss_seq))] 188 | optimizer.zero_grad() 189 | torch.autograd.backward(loss_seq, grad_seq) 190 | optimizer.step() 191 | 192 | if (i+1) % 100 == 0: 193 | print ('Epoch [%d/%d], Iter [%d/%d] Losses: Yaw %.4f, Pitch %.4f, Roll %.4f' 194 | %(epoch+1, num_epochs, i+1, len(pose_dataset)//batch_size, loss_yaw.data[0], loss_pitch.data[0], loss_roll.data[0])) 195 | 196 | # Save models at numbered epochs. 197 | if epoch % 1 == 0 and epoch < num_epochs: 198 | # print 'Taking snapshot...' 
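`torch.autograd.backward(loss_seq, grad_seq)` with unit gradients, as used in the loop above, accumulates exactly the same gradients as backpropagating the sum of the three scalar losses. On a recent PyTorch this is commonly written as a single backward call; a sketch of that equivalent form, assuming the loss variables from the loop:

```python
total_loss = loss_yaw + loss_pitch + loss_roll   # same gradients as backward(loss_seq, grad_seq)
optimizer.zero_grad()
total_loss.backward()
optimizer.step()
```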
199 | torch.save(model.state_dict(), 200 | 'output/snapshots/' + args.output_string + '_epoch_'+ str(epoch+1) + '.pkl') 201 | -------------------------------------------------------------------------------- /pose/train_resnet50_regression.py: -------------------------------------------------------------------------------- 1 | import sys, os, argparse, time 2 | 3 | import numpy as np 4 | import cv2 5 | import matplotlib.pyplot as plt 6 | 7 | import torch 8 | import torch.nn as nn 9 | from torch.autograd import Variable 10 | from torch.utils.data import DataLoader 11 | from torchvision import transforms 12 | import torchvision 13 | import torch.backends.cudnn as cudnn 14 | import torch.nn.functional as F 15 | 16 | import datasets, hopenet 17 | import torch.utils.model_zoo as model_zoo 18 | 19 | def parse_args(): 20 | """Parse input arguments.""" 21 | parser = argparse.ArgumentParser(description='Head pose estimation using the Hopenet network.') 22 | parser.add_argument('--gpu', dest='gpu_id', help='GPU device id to use [0]', 23 | default=0, type=int) 24 | parser.add_argument('--num_epochs', dest='num_epochs', help='Maximum number of training epochs.', 25 | default=5, type=int) 26 | parser.add_argument('--batch_size', dest='batch_size', help='Batch size.', 27 | default=16, type=int) 28 | parser.add_argument('--lr', dest='lr', help='Base learning rate.', 29 | default=0.001, type=float) 30 | parser.add_argument('--data_dir', dest='data_dir', help='Directory path for data.', 31 | default='', type=str) 32 | parser.add_argument('--filename_list', dest='filename_list', help='Path to text file containing relative paths for every example.', 33 | default='', type=str) 34 | parser.add_argument('--output_string', dest='output_string', help='String appended to output snapshots.', default = '', type=str) 35 | parser.add_argument('--dataset', dest='dataset', help='Dataset type.', default='Pose_300W_LP', type=str) 36 | 37 | args = parser.parse_args() 38 | return args 39 | 40 | def get_ignored_params(model): 41 | # Generator function that yields ignored params. 42 | b = [model.conv1, model.bn1] 43 | for i in range(len(b)): 44 | for module_name, module in b[i].named_modules(): 45 | if 'bn' in module_name: 46 | module.eval() 47 | for name, param in module.named_parameters(): 48 | yield param 49 | 50 | def get_non_ignored_params(model): 51 | # Generator function that yields params that will be optimized. 52 | b = [model.layer1, model.layer2, model.layer3, model.layer4] 53 | for i in range(len(b)): 54 | for module_name, module in b[i].named_modules(): 55 | if 'bn' in module_name: 56 | module.eval() 57 | for name, param in module.named_parameters(): 58 | yield param 59 | 60 | def get_fc_params(model): 61 | # Generator function that yields fc layer params. 
62 | b = [model.fc_angles] 63 | for i in range(len(b)): 64 | for module_name, module in b[i].named_modules(): 65 | for name, param in module.named_parameters(): 66 | yield param 67 | 68 | def load_filtered_state_dict(model, snapshot): 69 | # By user apaszke from discuss.pytorch.org 70 | model_dict = model.state_dict() 71 | snapshot = {k: v for k, v in snapshot.items() if k in model_dict} 72 | model_dict.update(snapshot) 73 | model.load_state_dict(model_dict) 74 | 75 | if __name__ == '__main__': 76 | args = parse_args() 77 | 78 | cudnn.enabled = True 79 | num_epochs = args.num_epochs 80 | batch_size = args.batch_size 81 | gpu = args.gpu_id 82 | 83 | if not os.path.exists('output/snapshots'): 84 | os.makedirs('output/snapshots') 85 | 86 | # ResNet50 87 | model = hopenet.ResNet(torchvision.models.resnet.Bottleneck, [3, 4, 6, 3], 3) 88 | load_filtered_state_dict(model, model_zoo.load_url('https://download.pytorch.org/models/resnet50-19c8e357.pth')) 89 | 90 | print 'Loading data.' 91 | 92 | transformations = transforms.Compose([transforms.Scale(240), 93 | transforms.RandomCrop(224), transforms.ToTensor(), 94 | transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]) 95 | 96 | if args.dataset == 'Pose_300W_LP': 97 | pose_dataset = datasets.Pose_300W_LP(args.data_dir, args.filename_list, transformations) 98 | elif args.dataset == 'Pose_300W_LP_random_ds': 99 | pose_dataset = datasets.Pose_300W_LP_random_ds(args.data_dir, args.filename_list, transformations) 100 | elif args.dataset == 'AFLW2000': 101 | pose_dataset = datasets.AFLW2000(args.data_dir, args.filename_list, transformations) 102 | elif args.dataset == 'BIWI': 103 | pose_dataset = datasets.BIWI(args.data_dir, args.filename_list, transformations) 104 | elif args.dataset == 'AFLW': 105 | pose_dataset = datasets.AFLW(args.data_dir, args.filename_list, transformations) 106 | elif args.dataset == 'AFLW_aug': 107 | pose_dataset = datasets.AFLW_aug(args.data_dir, args.filename_list, transformations) 108 | elif args.dataset == 'AFW': 109 | pose_dataset = datasets.AFW(args.data_dir, args.filename_list, transformations) 110 | else: 111 | print 'Error: not a valid dataset name' 112 | sys.exit() 113 | train_loader = torch.utils.data.DataLoader(dataset=pose_dataset, 114 | batch_size=batch_size, 115 | shuffle=True, 116 | num_workers=2) 117 | 118 | model.cuda(gpu) 119 | criterion = nn.MSELoss().cuda(gpu) 120 | 121 | optimizer = torch.optim.Adam([{'params': get_ignored_params(model), 'lr': 0}, 122 | {'params': get_non_ignored_params(model), 'lr': args.lr}, 123 | {'params': get_fc_params(model), 'lr': args.lr * 5}], 124 | lr = args.lr) 125 | 126 | print 'Ready to train network.' 127 | print 'First phase of training.' 128 | for epoch in range(num_epochs): 129 | for i, (images, labels, cont_labels, name) in enumerate(train_loader): 130 | images = Variable(images).cuda(gpu) 131 | 132 | label_angles = Variable(cont_labels[:,:3]).cuda(gpu) 133 | angles = model(images) 134 | 135 | loss = criterion(angles, label_angles) 136 | optimizer.zero_grad() 137 | loss.backward() 138 | optimizer.step() 139 | 140 | if (i+1) % 100 == 0: 141 | print ('Epoch [%d/%d], Iter [%d/%d] Loss: %.4f' 142 | %(epoch+1, num_epochs, i+1, len(pose_dataset)//batch_size, loss.data[0])) 143 | 144 | # Save models at numbered epochs. 145 | if epoch % 1 == 0 and epoch < num_epochs: 146 | print 'Taking snapshot...' 
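The `.pkl` files written below are ordinary `state_dict` snapshots, so they can be reloaded exactly the way `test_resnet50_regression.py` does: rebuild the 3-output regression ResNet and call `load_state_dict`. A minimal sketch with an illustrative snapshot path:

```python
import torch
import torchvision
import hopenet  # pose/hopenet.py

model = hopenet.ResNet(torchvision.models.resnet.Bottleneck, [3, 4, 6, 3], 3)
state = torch.load('output/snapshots/my_run_epoch_5.pkl', map_location='cpu')  # illustrative path
model.load_state_dict(state)
model.eval()
```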
147 | torch.save(model.state_dict(), 148 | 'output/snapshots/' + args.output_string + '_epoch_'+ str(epoch+1) + '.pkl') 149 | -------------------------------------------------------------------------------- /pose/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | # from torch.utils.serialization import load_lua 4 | import torchfile 5 | import os 6 | import scipy.io as sio 7 | import cv2 8 | import math 9 | from math import cos, sin 10 | 11 | # 这里直接进行softmax操作即可 12 | def softmax_temperature(tensor, temperature): 13 | result = torch.exp(tensor / temperature) 14 | result = torch.div(result, torch.sum(result, 1).unsqueeze(1).expand_as(result)) 15 | return result 16 | 17 | def get_pose_params_from_mat(mat_path): 18 | # This functions gets the pose parameters from the .mat 19 | # Annotations that come with the Pose_300W_LP dataset. 20 | mat = sio.loadmat(mat_path) 21 | # [pitch yaw roll tdx tdy tdz scale_factor] 22 | pre_pose_params = mat['Pose_Para'][0] 23 | # Get [pitch, yaw, roll, tdx, tdy] 24 | pose_params = pre_pose_params[:5] 25 | return pose_params 26 | 27 | def get_ypr_from_mat(mat_path): 28 | # Get yaw, pitch, roll from .mat annotation. 29 | # They are in radians 30 | mat = sio.loadmat(mat_path) 31 | # [pitch yaw roll tdx tdy tdz scale_factor] 32 | pre_pose_params = mat['Pose_Para'][0] 33 | # Get [pitch, yaw, roll] 34 | pose_params = pre_pose_params[:3] 35 | return pose_params 36 | 37 | def get_pt2d_from_mat(mat_path): 38 | # Get 2D landmarks 39 | mat = sio.loadmat(mat_path) 40 | pt2d = mat['pt2d'] 41 | return pt2d 42 | 43 | def mse_loss(input, target): 44 | return torch.sum(torch.abs(input.data - target.data) ** 2) 45 | 46 | def plot_pose_cube(img, yaw, pitch, roll, tdx=None, tdy=None, size=150.): 47 | # Input is a cv2 image 48 | # pose_params: (pitch, yaw, roll, tdx, tdy) 49 | # Where (tdx, tdy) is the translation of the face. 
50 | # For pose we have [pitch yaw roll tdx tdy tdz scale_factor] 51 | 52 | p = pitch * np.pi / 180 53 | y = -(yaw * np.pi / 180) 54 | r = roll * np.pi / 180 55 | if tdx != None and tdy != None: 56 | face_x = tdx - 0.50 * size 57 | face_y = tdy - 0.50 * size 58 | else: 59 | height, width = img.shape[:2] 60 | face_x = width / 2 - 0.5 * size 61 | face_y = height / 2 - 0.5 * size 62 | 63 | x1 = size * (cos(y) * cos(r)) + face_x 64 | y1 = size * (cos(p) * sin(r) + cos(r) * sin(p) * sin(y)) + face_y 65 | x2 = size * (-cos(y) * sin(r)) + face_x 66 | y2 = size * (cos(p) * cos(r) - sin(p) * sin(y) * sin(r)) + face_y 67 | x3 = size * (sin(y)) + face_x 68 | y3 = size * (-cos(y) * sin(p)) + face_y 69 | 70 | # Draw base in red 71 | cv2.line(img, (int(face_x), int(face_y)), (int(x1),int(y1)),(0,0,255),3) 72 | cv2.line(img, (int(face_x), int(face_y)), (int(x2),int(y2)),(0,0,255),3) 73 | cv2.line(img, (int(x2), int(y2)), (int(x2+x1-face_x),int(y2+y1-face_y)),(0,0,255),3) 74 | cv2.line(img, (int(x1), int(y1)), (int(x1+x2-face_x),int(y1+y2-face_y)),(0,0,255),3) 75 | # Draw pillars in blue 76 | cv2.line(img, (int(face_x), int(face_y)), (int(x3),int(y3)),(255,0,0),2) 77 | cv2.line(img, (int(x1), int(y1)), (int(x1+x3-face_x),int(y1+y3-face_y)),(255,0,0),2) 78 | cv2.line(img, (int(x2), int(y2)), (int(x2+x3-face_x),int(y2+y3-face_y)),(255,0,0),2) 79 | cv2.line(img, (int(x2+x1-face_x),int(y2+y1-face_y)), (int(x3+x1+x2-2*face_x),int(y3+y2+y1-2*face_y)),(255,0,0),2) 80 | # Draw top in green 81 | cv2.line(img, (int(x3+x1-face_x),int(y3+y1-face_y)), (int(x3+x1+x2-2*face_x),int(y3+y2+y1-2*face_y)),(0,255,0),2) 82 | cv2.line(img, (int(x2+x3-face_x),int(y2+y3-face_y)), (int(x3+x1+x2-2*face_x),int(y3+y2+y1-2*face_y)),(0,255,0),2) 83 | cv2.line(img, (int(x3), int(y3)), (int(x3+x1-face_x),int(y3+y1-face_y)),(0,255,0),2) 84 | cv2.line(img, (int(x3), int(y3)), (int(x3+x2-face_x),int(y3+y2-face_y)),(0,255,0),2) 85 | 86 | return img 87 | 88 | def draw_axis(img, yaw, pitch, roll, tdx=None, tdy=None, size = 100): 89 | 90 | pitch = pitch * np.pi / 180 91 | yaw = -(yaw * np.pi / 180) 92 | roll = roll * np.pi / 180 93 | 94 | if tdx != None and tdy != None: 95 | tdx = tdx 96 | tdy = tdy 97 | else: 98 | height, width = img.shape[:2] 99 | tdx = width / 2 100 | tdy = height / 2 101 | 102 | # X-Axis pointing to right. 
drawn in red [1,0,0] 103 | x1 = size * (cos(yaw) * cos(roll)) + tdx 104 | y1 = size * (cos(pitch) * sin(roll) + cos(roll) * sin(pitch) * sin(yaw)) + tdy 105 | # z1省略 106 | 107 | # Y-Axis | drawn in green [0,1,0] 108 | # v 109 | x2 = size * (-cos(yaw) * sin(roll)) + tdx 110 | y2 = size * (cos(pitch) * cos(roll) - sin(pitch) * sin(yaw) * sin(roll)) + tdy 111 | # z2省略 112 | 113 | # Z-Axis (out of the screen) drawn in blue [0,0,1] 114 | x3 = size * (sin(yaw)) + tdx 115 | y3 = size * (-cos(yaw) * sin(pitch)) + tdy 116 | # z3省略 117 | 118 | cv2.line(img, (int(tdx), int(tdy)), (int(x1), int(y1)), (0,0,255), 2) # 红色 x轴 脸右侧 119 | cv2.line(img, (int(tdx), int(tdy)), (int(x2), int(y2)), (0,255,0), 2) # 绿色 y轴 脸下方 120 | cv2.line(img, (int(tdx), int(tdy)), (int(x3), int(y3)), (255,0,0), 2) # 蓝色 z轴 脸前方 121 | 122 | return img 123 | -------------------------------------------------------------------------------- /test_fddb.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | import argparse 4 | import torch 5 | import torch.backends.cudnn as cudnn 6 | import numpy as np 7 | from data import cfg_mnet, cfg_re50 8 | from layers.functions.prior_box import PriorBox 9 | from utils.nms.py_cpu_nms import py_cpu_nms 10 | import cv2 11 | from models.retinaface import RetinaFace 12 | from utils.box_utils import decode, decode_landm 13 | from utils.timer import Timer 14 | 15 | parser = argparse.ArgumentParser(description='Retinaface') 16 | 17 | parser.add_argument('-m', '--trained_model', default='./weights/mobilenet0.25_Final.pth', 18 | type=str, help='Trained state_dict file path to open') 19 | parser.add_argument('--network', default='mobile0.25', help='Backbone network mobile0.25 or resnet50') 20 | parser.add_argument('--save_folder', default='eval/', type=str, help='Dir to save results') 21 | parser.add_argument('--cpu', action="store_true", default=False, help='Use cpu inference') 22 | parser.add_argument('--dataset', default='FDDB', type=str, choices=['FDDB'], help='dataset') 23 | parser.add_argument('--confidence_threshold', default=0.02, type=float, help='confidence_threshold') 24 | parser.add_argument('--top_k', default=5000, type=int, help='top_k') 25 | parser.add_argument('--nms_threshold', default=0.4, type=float, help='nms_threshold') 26 | parser.add_argument('--keep_top_k', default=750, type=int, help='keep_top_k') 27 | parser.add_argument('-s', '--save_image', action="store_true", default=False, help='show detection results') 28 | parser.add_argument('--vis_thres', default=0.5, type=float, help='visualization_threshold') 29 | args = parser.parse_args() 30 | 31 | 32 | def check_keys(model, pretrained_state_dict): 33 | ckpt_keys = set(pretrained_state_dict.keys()) 34 | model_keys = set(model.state_dict().keys()) 35 | used_pretrained_keys = model_keys & ckpt_keys 36 | unused_pretrained_keys = ckpt_keys - model_keys 37 | missing_keys = model_keys - ckpt_keys 38 | print('Missing keys:{}'.format(len(missing_keys))) 39 | print('Unused checkpoint keys:{}'.format(len(unused_pretrained_keys))) 40 | print('Used keys:{}'.format(len(used_pretrained_keys))) 41 | assert len(used_pretrained_keys) > 0, 'load NONE from pretrained checkpoint' 42 | return True 43 | 44 | 45 | def remove_prefix(state_dict, prefix): 46 | ''' Old style model is stored with all names of parameters sharing common prefix 'module.' 
''' 47 | print('remove prefix \'{}\''.format(prefix)) 48 | f = lambda x: x.split(prefix, 1)[-1] if x.startswith(prefix) else x 49 | return {f(key): value for key, value in state_dict.items()} 50 | 51 | 52 | def load_model(model, pretrained_path, load_to_cpu): 53 | print('Loading pretrained model from {}'.format(pretrained_path)) 54 | if load_to_cpu: 55 | pretrained_dict = torch.load(pretrained_path, map_location=lambda storage, loc: storage) 56 | else: 57 | device = torch.cuda.current_device() 58 | pretrained_dict = torch.load(pretrained_path, map_location=lambda storage, loc: storage.cuda(device)) 59 | if "state_dict" in pretrained_dict.keys(): 60 | pretrained_dict = remove_prefix(pretrained_dict['state_dict'], 'module.') 61 | else: 62 | pretrained_dict = remove_prefix(pretrained_dict, 'module.') 63 | check_keys(model, pretrained_dict) 64 | model.load_state_dict(pretrained_dict, strict=False) 65 | return model 66 | 67 | 68 | if __name__ == '__main__': 69 | torch.set_grad_enabled(False) 70 | cfg = None 71 | if args.network == "mobile0.25": 72 | cfg = cfg_mnet 73 | elif args.network == "resnet50": 74 | cfg = cfg_re50 75 | # net and model 76 | net = RetinaFace(cfg=cfg, phase = 'test') 77 | net = load_model(net, args.trained_model, args.cpu) 78 | net.eval() 79 | print('Finished loading model!') 80 | print(net) 81 | cudnn.benchmark = True 82 | device = torch.device("cpu" if args.cpu else "cuda") 83 | net = net.to(device) 84 | 85 | 86 | # save file 87 | if not os.path.exists(args.save_folder): 88 | os.makedirs(args.save_folder) 89 | fw = open(os.path.join(args.save_folder, args.dataset + '_dets.txt'), 'w') 90 | 91 | # testing dataset 92 | testset_folder = os.path.join('data', args.dataset, 'images/') 93 | testset_list = os.path.join('data', args.dataset, 'img_list.txt') 94 | with open(testset_list, 'r') as fr: 95 | test_dataset = fr.read().split() 96 | num_images = len(test_dataset) 97 | 98 | # testing scale 99 | resize = 1 100 | 101 | _t = {'forward_pass': Timer(), 'misc': Timer()} 102 | 103 | # testing begin 104 | for i, img_name in enumerate(test_dataset): 105 | image_path = testset_folder + img_name + '.jpg' 106 | img_raw = cv2.imread(image_path, cv2.IMREAD_COLOR) 107 | 108 | img = np.float32(img_raw) 109 | if resize != 1: 110 | img = cv2.resize(img, None, None, fx=resize, fy=resize, interpolation=cv2.INTER_LINEAR) 111 | im_height, im_width, _ = img.shape 112 | scale = torch.Tensor([img.shape[1], img.shape[0], img.shape[1], img.shape[0]]) 113 | img -= (104, 117, 123) 114 | img = img.transpose(2, 0, 1) 115 | img = torch.from_numpy(img).unsqueeze(0) 116 | img = img.to(device) 117 | scale = scale.to(device) 118 | 119 | _t['forward_pass'].tic() 120 | loc, conf, landms = net(img) # forward pass 121 | _t['forward_pass'].toc() 122 | _t['misc'].tic() 123 | priorbox = PriorBox(cfg, image_size=(im_height, im_width)) 124 | priors = priorbox.forward() 125 | priors = priors.to(device) 126 | prior_data = priors.data 127 | boxes = decode(loc.data.squeeze(0), prior_data, cfg['variance']) 128 | boxes = boxes * scale / resize 129 | boxes = boxes.cpu().numpy() 130 | scores = conf.squeeze(0).data.cpu().numpy()[:, 1] 131 | landms = decode_landm(landms.data.squeeze(0), prior_data, cfg['variance']) 132 | scale1 = torch.Tensor([img.shape[3], img.shape[2], img.shape[3], img.shape[2], 133 | img.shape[3], img.shape[2], img.shape[3], img.shape[2], 134 | img.shape[3], img.shape[2]]) 135 | scale1 = scale1.to(device) 136 | landms = landms * scale1 / resize 137 | landms = landms.cpu().numpy() 138 | 139 | # ignore low 
scores 140 | inds = np.where(scores > args.confidence_threshold)[0] 141 | boxes = boxes[inds] 142 | landms = landms[inds] 143 | scores = scores[inds] 144 | 145 | # keep top-K before NMS 146 | # order = scores.argsort()[::-1][:args.top_k] 147 | order = scores.argsort()[::-1] 148 | boxes = boxes[order] 149 | landms = landms[order] 150 | scores = scores[order] 151 | 152 | # do NMS 153 | dets = np.hstack((boxes, scores[:, np.newaxis])).astype(np.float32, copy=False) 154 | keep = py_cpu_nms(dets, args.nms_threshold) 155 | 156 | dets = dets[keep, :] 157 | landms = landms[keep] 158 | 159 | # keep top-K faster NMS 160 | # dets = dets[:args.keep_top_k, :] 161 | # landms = landms[:args.keep_top_k, :] 162 | 163 | dets = np.concatenate((dets, landms), axis=1) 164 | _t['misc'].toc() 165 | 166 | # save dets 167 | if args.dataset == "FDDB": 168 | fw.write('{:s}\n'.format(img_name)) 169 | fw.write('{:.1f}\n'.format(dets.shape[0])) 170 | for k in range(dets.shape[0]): 171 | xmin = dets[k, 0] 172 | ymin = dets[k, 1] 173 | xmax = dets[k, 2] 174 | ymax = dets[k, 3] 175 | score = dets[k, 4] 176 | w = xmax - xmin + 1 177 | h = ymax - ymin + 1 178 | # fw.write('{:.3f} {:.3f} {:.3f} {:.3f} {:.10f}\n'.format(xmin, ymin, w, h, score)) 179 | fw.write('{:d} {:d} {:d} {:d} {:.10f}\n'.format(int(xmin), int(ymin), int(w), int(h), score)) 180 | print('im_detect: {:d}/{:d} forward_pass_time: {:.4f}s misc: {:.4f}s'.format(i + 1, num_images, _t['forward_pass'].average_time, _t['misc'].average_time)) 181 | 182 | # show image 183 | if args.save_image: 184 | for b in dets: 185 | if b[4] < args.vis_thres: 186 | continue 187 | text = "{:.4f}".format(b[4]) 188 | b = list(map(int, b)) 189 | cv2.rectangle(img_raw, (b[0], b[1]), (b[2], b[3]), (0, 0, 255), 2) 190 | cx = b[0] 191 | cy = b[1] + 12 192 | cv2.putText(img_raw, text, (cx, cy), 193 | cv2.FONT_HERSHEY_DUPLEX, 0.5, (255, 255, 255)) 194 | 195 | # landms 196 | cv2.circle(img_raw, (b[5], b[6]), 1, (0, 0, 255), 4) 197 | cv2.circle(img_raw, (b[7], b[8]), 1, (0, 255, 255), 4) 198 | cv2.circle(img_raw, (b[9], b[10]), 1, (255, 0, 255), 4) 199 | cv2.circle(img_raw, (b[11], b[12]), 1, (0, 255, 0), 4) 200 | cv2.circle(img_raw, (b[13], b[14]), 1, (255, 0, 0), 4) 201 | # save image 202 | if not os.path.exists("./results/"): 203 | os.makedirs("./results/") 204 | name = "./results/" + str(i) + ".jpg" 205 | cv2.imwrite(name, img_raw) 206 | 207 | fw.close() 208 | -------------------------------------------------------------------------------- /test_widerface.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | import argparse 4 | import torch 5 | import torch.backends.cudnn as cudnn 6 | import numpy as np 7 | from data import cfg_mnet, cfg_re50 8 | from layers.functions.prior_box import PriorBox 9 | from utils.nms.py_cpu_nms import py_cpu_nms 10 | import cv2 11 | from models.retinaface import RetinaFace 12 | from utils.box_utils import decode, decode_landm 13 | from utils.timer import Timer 14 | 15 | 16 | parser = argparse.ArgumentParser(description='Retinaface') 17 | parser.add_argument('-m', '--trained_model', default='./weights/Resnet50_Final.pth', 18 | type=str, help='Trained state_dict file path to open') 19 | parser.add_argument('--network', default='resnet50', help='Backbone network mobile0.25 or resnet50') 20 | parser.add_argument('--origin_size', default=True, type=str, help='Whether use origin image size to evaluate') 21 | parser.add_argument('--save_folder', 
default='./widerface_evaluate/widerface_txt/', type=str, help='Dir to save txt results') 22 | parser.add_argument('--cpu', action="store_true", default=False, help='Use cpu inference') 23 | parser.add_argument('--dataset_folder', default='./data/widerface/val/images/', type=str, help='dataset path') 24 | parser.add_argument('--confidence_threshold', default=0.02, type=float, help='confidence_threshold') 25 | parser.add_argument('--top_k', default=5000, type=int, help='top_k') 26 | parser.add_argument('--nms_threshold', default=0.4, type=float, help='nms_threshold') 27 | parser.add_argument('--keep_top_k', default=750, type=int, help='keep_top_k') 28 | parser.add_argument('-s', '--save_image', action="store_true", default=False, help='show detection results') 29 | parser.add_argument('--vis_thres', default=0.5, type=float, help='visualization_threshold') 30 | args = parser.parse_args() 31 | 32 | 33 | def check_keys(model, pretrained_state_dict): 34 | ckpt_keys = set(pretrained_state_dict.keys()) 35 | model_keys = set(model.state_dict().keys()) 36 | used_pretrained_keys = model_keys & ckpt_keys 37 | unused_pretrained_keys = ckpt_keys - model_keys 38 | missing_keys = model_keys - ckpt_keys 39 | print('Missing keys:{}'.format(len(missing_keys))) 40 | print('Unused checkpoint keys:{}'.format(len(unused_pretrained_keys))) 41 | print('Used keys:{}'.format(len(used_pretrained_keys))) 42 | assert len(used_pretrained_keys) > 0, 'load NONE from pretrained checkpoint' 43 | return True 44 | 45 | 46 | def remove_prefix(state_dict, prefix): 47 | ''' Old style model is stored with all names of parameters sharing common prefix 'module.' ''' 48 | print('remove prefix \'{}\''.format(prefix)) 49 | f = lambda x: x.split(prefix, 1)[-1] if x.startswith(prefix) else x 50 | return {f(key): value for key, value in state_dict.items()} 51 | 52 | 53 | def load_model(model, pretrained_path, load_to_cpu): 54 | print('Loading pretrained model from {}'.format(pretrained_path)) 55 | if load_to_cpu: 56 | pretrained_dict = torch.load(pretrained_path, map_location=lambda storage, loc: storage) 57 | else: 58 | device = torch.cuda.current_device() 59 | pretrained_dict = torch.load(pretrained_path, map_location=lambda storage, loc: storage.cuda(device)) 60 | if "state_dict" in pretrained_dict.keys(): 61 | pretrained_dict = remove_prefix(pretrained_dict['state_dict'], 'module.') 62 | else: 63 | pretrained_dict = remove_prefix(pretrained_dict, 'module.') 64 | check_keys(model, pretrained_dict) 65 | model.load_state_dict(pretrained_dict, strict=False) 66 | return model 67 | 68 | 69 | if __name__ == '__main__': 70 | torch.set_grad_enabled(False) 71 | 72 | cfg = None 73 | if args.network == "mobile0.25": 74 | cfg = cfg_mnet 75 | elif args.network == "resnet50": 76 | cfg = cfg_re50 77 | # net and model 78 | net = RetinaFace(cfg=cfg, phase = 'test') 79 | net = load_model(net, args.trained_model, args.cpu) 80 | net.eval() 81 | print('Finished loading model!') 82 | print(net) 83 | cudnn.benchmark = True 84 | device = torch.device("cpu" if args.cpu else "cuda") 85 | net = net.to(device) 86 | 87 | # testing dataset 88 | testset_folder = args.dataset_folder 89 | testset_list = args.dataset_folder[:-7] + "wider_val.txt" 90 | 91 | with open(testset_list, 'r') as fr: 92 | test_dataset = fr.read().split() 93 | num_images = len(test_dataset) 94 | 95 | _t = {'forward_pass': Timer(), 'misc': Timer()} 96 | 97 | # testing begin 98 | for i, img_name in enumerate(test_dataset): 99 | image_path = testset_folder + img_name 100 | img_raw = 
cv2.imread(image_path, cv2.IMREAD_COLOR) 101 | img = np.float32(img_raw) 102 | 103 | # testing scale 104 | target_size = 1600 105 | max_size = 2150 106 | im_shape = img.shape 107 | im_size_min = np.min(im_shape[0:2]) 108 | im_size_max = np.max(im_shape[0:2]) 109 | resize = float(target_size) / float(im_size_min) 110 | # prevent bigger axis from being more than max_size: 111 | if np.round(resize * im_size_max) > max_size: 112 | resize = float(max_size) / float(im_size_max) 113 | if args.origin_size: 114 | resize = 1 115 | 116 | if resize != 1: 117 | img = cv2.resize(img, None, None, fx=resize, fy=resize, interpolation=cv2.INTER_LINEAR) 118 | im_height, im_width, _ = img.shape 119 | scale = torch.Tensor([img.shape[1], img.shape[0], img.shape[1], img.shape[0]]) 120 | img -= (104, 117, 123) 121 | img = img.transpose(2, 0, 1) 122 | img = torch.from_numpy(img).unsqueeze(0) 123 | img = img.to(device) 124 | scale = scale.to(device) 125 | 126 | _t['forward_pass'].tic() 127 | loc, conf, landms = net(img) # forward pass 128 | _t['forward_pass'].toc() 129 | _t['misc'].tic() 130 | priorbox = PriorBox(cfg, image_size=(im_height, im_width)) 131 | priors = priorbox.forward() 132 | priors = priors.to(device) 133 | prior_data = priors.data 134 | boxes = decode(loc.data.squeeze(0), prior_data, cfg['variance']) 135 | boxes = boxes * scale / resize 136 | boxes = boxes.cpu().numpy() 137 | scores = conf.squeeze(0).data.cpu().numpy()[:, 1] 138 | landms = decode_landm(landms.data.squeeze(0), prior_data, cfg['variance']) 139 | scale1 = torch.Tensor([img.shape[3], img.shape[2], img.shape[3], img.shape[2], 140 | img.shape[3], img.shape[2], img.shape[3], img.shape[2], 141 | img.shape[3], img.shape[2]]) 142 | scale1 = scale1.to(device) 143 | landms = landms * scale1 / resize 144 | landms = landms.cpu().numpy() 145 | 146 | # ignore low scores 147 | inds = np.where(scores > args.confidence_threshold)[0] 148 | boxes = boxes[inds] 149 | landms = landms[inds] 150 | scores = scores[inds] 151 | 152 | # keep top-K before NMS 153 | order = scores.argsort()[::-1] 154 | # order = scores.argsort()[::-1][:args.top_k] 155 | boxes = boxes[order] 156 | landms = landms[order] 157 | scores = scores[order] 158 | 159 | # do NMS 160 | dets = np.hstack((boxes, scores[:, np.newaxis])).astype(np.float32, copy=False) 161 | keep = py_cpu_nms(dets, args.nms_threshold) 162 | # keep = nms(dets, args.nms_threshold,force_cpu=args.cpu) 163 | dets = dets[keep, :] 164 | landms = landms[keep] 165 | 166 | # keep top-K faster NMS 167 | # dets = dets[:args.keep_top_k, :] 168 | # landms = landms[:args.keep_top_k, :] 169 | 170 | dets = np.concatenate((dets, landms), axis=1) 171 | _t['misc'].toc() 172 | 173 | # -------------------------------------------------------------------- 174 | save_name = args.save_folder + img_name[:-4] + ".txt" 175 | dirname = os.path.dirname(save_name) 176 | if not os.path.isdir(dirname): 177 | os.makedirs(dirname) 178 | with open(save_name, "w") as fd: 179 | bboxs = dets 180 | file_name = os.path.basename(save_name)[:-4] + "\n" 181 | bboxs_num = str(len(bboxs)) + "\n" 182 | fd.write(file_name) 183 | fd.write(bboxs_num) 184 | for box in bboxs: 185 | x = int(box[0]) 186 | y = int(box[1]) 187 | w = int(box[2]) - int(box[0]) 188 | h = int(box[3]) - int(box[1]) 189 | confidence = str(box[4]) 190 | line = str(x) + " " + str(y) + " " + str(w) + " " + str(h) + " " + confidence + " \n" 191 | fd.write(line) 192 | 193 | print('im_detect: {:d}/{:d} forward_pass_time: {:.4f}s misc: {:.4f}s'.format(i + 1, num_images, 
_t['forward_pass'].average_time, _t['misc'].average_time)) 194 | 195 | # save image 196 | if args.save_image: 197 | for b in dets: 198 | if b[4] < args.vis_thres: 199 | continue 200 | text = "{:.4f}".format(b[4]) 201 | b = list(map(int, b)) 202 | cv2.rectangle(img_raw, (b[0], b[1]), (b[2], b[3]), (0, 0, 255), 2) 203 | cx = b[0] 204 | cy = b[1] + 12 205 | cv2.putText(img_raw, text, (cx, cy), 206 | cv2.FONT_HERSHEY_DUPLEX, 0.5, (255, 255, 255)) 207 | 208 | # landms 209 | cv2.circle(img_raw, (b[5], b[6]), 1, (0, 0, 255), 4) 210 | cv2.circle(img_raw, (b[7], b[8]), 1, (0, 255, 255), 4) 211 | cv2.circle(img_raw, (b[9], b[10]), 1, (255, 0, 255), 4) 212 | cv2.circle(img_raw, (b[11], b[12]), 1, (0, 255, 0), 4) 213 | cv2.circle(img_raw, (b[13], b[14]), 1, (255, 0, 0), 4) 214 | # save image 215 | if not os.path.exists("./results/"): 216 | os.makedirs("./results/") 217 | name = "./results/" + str(i) + ".jpg" 218 | cv2.imwrite(name, img_raw) 219 | 220 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | import torch 4 | import torch.optim as optim 5 | import torch.backends.cudnn as cudnn 6 | import argparse 7 | import torch.utils.data as data 8 | from data import WiderFaceDetection, detection_collate, preproc, cfg_mnet, cfg_re50 9 | from layers.modules import MultiBoxLoss 10 | from layers.functions.prior_box import PriorBox 11 | import time 12 | import datetime 13 | import math 14 | from models.retinaface import RetinaFace 15 | 16 | parser = argparse.ArgumentParser(description='Retinaface Training') 17 | parser.add_argument('--training_dataset', default='./data/widerface/train/label.txt', help='Training dataset directory') 18 | parser.add_argument('--network', default='mobile0.25', help='Backbone network mobile0.25 or resnet50') 19 | parser.add_argument('--num_workers', default=4, type=int, help='Number of workers used in dataloading') 20 | parser.add_argument('--lr', '--learning-rate', default=1e-3, type=float, help='initial learning rate') 21 | parser.add_argument('--momentum', default=0.9, type=float, help='momentum') 22 | parser.add_argument('--resume_net', default=None, help='resume net for retraining') 23 | parser.add_argument('--resume_epoch', default=0, type=int, help='resume iter for retraining') 24 | parser.add_argument('--weight_decay', default=5e-4, type=float, help='Weight decay for SGD') 25 | parser.add_argument('--gamma', default=0.1, type=float, help='Gamma update for SGD') 26 | parser.add_argument('--save_folder', default='./weights/', help='Location to save checkpoint models') 27 | 28 | args = parser.parse_args() 29 | 30 | if not os.path.exists(args.save_folder): 31 | os.mkdir(args.save_folder) 32 | cfg = None 33 | if args.network == "mobile0.25": 34 | cfg = cfg_mnet 35 | elif args.network == "resnet50": 36 | cfg = cfg_re50 37 | 38 | rgb_mean = (104, 117, 123) # bgr order 39 | num_classes = 2 40 | img_dim = cfg['image_size'] 41 | num_gpu = cfg['ngpu'] 42 | batch_size = cfg['batch_size'] 43 | max_epoch = cfg['epoch'] 44 | gpu_train = cfg['gpu_train'] 45 | 46 | num_workers = args.num_workers 47 | momentum = args.momentum 48 | weight_decay = args.weight_decay 49 | initial_lr = args.lr 50 | gamma = args.gamma 51 | training_dataset = args.training_dataset 52 | save_folder = args.save_folder 53 | 54 | net = RetinaFace(cfg=cfg) 55 | print("Printing net...") 56 | print(net) 57 | 58 | if args.resume_net is not 
None: 59 | print('Loading resume network...') 60 | state_dict = torch.load(args.resume_net) 61 | # create new OrderedDict that does not contain `module.` 62 | from collections import OrderedDict 63 | new_state_dict = OrderedDict() 64 | for k, v in state_dict.items(): 65 | head = k[:7] 66 | if head == 'module.': 67 | name = k[7:] # remove `module.` 68 | else: 69 | name = k 70 | new_state_dict[name] = v 71 | net.load_state_dict(new_state_dict) 72 | 73 | if num_gpu > 1 and gpu_train: 74 | net = torch.nn.DataParallel(net).cuda() 75 | else: 76 | net = net.cuda() 77 | 78 | cudnn.benchmark = True 79 | 80 | 81 | optimizer = optim.SGD(net.parameters(), lr=initial_lr, momentum=momentum, weight_decay=weight_decay) 82 | criterion = MultiBoxLoss(num_classes, 0.35, True, 0, True, 7, 0.35, False) 83 | 84 | priorbox = PriorBox(cfg, image_size=(img_dim, img_dim)) 85 | with torch.no_grad(): 86 | priors = priorbox.forward() 87 | priors = priors.cuda() 88 | 89 | def train(): 90 | net.train() 91 | epoch = 0 + args.resume_epoch 92 | print('Loading Dataset...') 93 | 94 | dataset = WiderFaceDetection( training_dataset,preproc(img_dim, rgb_mean)) 95 | 96 | epoch_size = math.ceil(len(dataset) / batch_size) 97 | max_iter = max_epoch * epoch_size 98 | 99 | stepvalues = (cfg['decay1'] * epoch_size, cfg['decay2'] * epoch_size) 100 | step_index = 0 101 | 102 | if args.resume_epoch > 0: 103 | start_iter = args.resume_epoch * epoch_size 104 | else: 105 | start_iter = 0 106 | 107 | for iteration in range(start_iter, max_iter): 108 | if iteration % epoch_size == 0: 109 | # create batch iterator 110 | batch_iterator = iter(data.DataLoader(dataset, batch_size, shuffle=True, num_workers=num_workers, collate_fn=detection_collate)) 111 | if (epoch % 10 == 0 and epoch > 0) or (epoch % 5 == 0 and epoch > cfg['decay1']): 112 | torch.save(net.state_dict(), save_folder + cfg['name']+ '_epoch_' + str(epoch) + '.pth') 113 | epoch += 1 114 | 115 | load_t0 = time.time() 116 | if iteration in stepvalues: 117 | step_index += 1 118 | lr = adjust_learning_rate(optimizer, gamma, epoch, step_index, iteration, epoch_size) 119 | 120 | # load train data 121 | images, targets = next(batch_iterator) 122 | images = images.cuda() 123 | targets = [anno.cuda() for anno in targets] 124 | 125 | # forward 126 | out = net(images) 127 | 128 | # backprop 129 | optimizer.zero_grad() 130 | loss_l, loss_c, loss_landm = criterion(out, priors, targets) 131 | loss = cfg['loc_weight'] * loss_l + loss_c + loss_landm 132 | loss.backward() 133 | optimizer.step() 134 | load_t1 = time.time() 135 | batch_time = load_t1 - load_t0 136 | eta = int(batch_time * (max_iter - iteration)) 137 | print('Epoch:{}/{} || Epochiter: {}/{} || Iter: {}/{} || Loc: {:.4f} Cla: {:.4f} Landm: {:.4f} || LR: {:.8f} || Batchtime: {:.4f} s || ETA: {}' 138 | .format(epoch, max_epoch, (iteration % epoch_size) + 1, 139 | epoch_size, iteration + 1, max_iter, loss_l.item(), loss_c.item(), loss_landm.item(), lr, batch_time, str(datetime.timedelta(seconds=eta)))) 140 | 141 | torch.save(net.state_dict(), save_folder + cfg['name'] + '_Final.pth') 142 | # torch.save(net.state_dict(), save_folder + 'Final_Retinaface.pth') 143 | 144 | 145 | def adjust_learning_rate(optimizer, gamma, epoch, step_index, iteration, epoch_size): 146 | """Sets the learning rate 147 | # Adapted from PyTorch Imagenet example: 148 | # https://github.com/pytorch/examples/blob/master/imagenet/main.py 149 | """ 150 | warmup_epoch = -1 151 | if epoch <= warmup_epoch: 152 | lr = 1e-6 + (initial_lr-1e-6) * iteration / (epoch_size * 
warmup_epoch) 153 | else: 154 | lr = initial_lr * (gamma ** (step_index)) 155 | for param_group in optimizer.param_groups: 156 | param_group['lr'] = lr 157 | return lr 158 | 159 | if __name__ == '__main__': 160 | train() 161 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gengyanlei/Pytorch_Retinaface/b863d9ed7c405f3733d038f249ebf3581e3c7ba6/utils/__init__.py -------------------------------------------------------------------------------- /utils/box_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | 5 | def point_form(boxes): 6 | """ Convert prior_boxes to (xmin, ymin, xmax, ymax) 7 | representation for comparison to point form ground truth data. 8 | Args: 9 | boxes: (tensor) center-size default boxes from priorbox layers. 10 | Return: 11 | boxes: (tensor) Converted xmin, ymin, xmax, ymax form of boxes. 12 | """ 13 | return torch.cat((boxes[:, :2] - boxes[:, 2:]/2, # xmin, ymin 14 | boxes[:, :2] + boxes[:, 2:]/2), 1) # xmax, ymax 15 | 16 | 17 | def center_size(boxes): 18 | """ Convert prior_boxes to (cx, cy, w, h) 19 | representation for comparison to center-size form ground truth data. 20 | Args: 21 | boxes: (tensor) point_form boxes 22 | Return: 23 | boxes: (tensor) Converted xmin, ymin, xmax, ymax form of boxes. 24 | """ 25 | return torch.cat((boxes[:, 2:] + boxes[:, :2])/2, # cx, cy 26 | boxes[:, 2:] - boxes[:, :2], 1) # w, h 27 | 28 | 29 | def intersect(box_a, box_b): 30 | """ We resize both tensors to [A,B,2] without new malloc: 31 | [A,2] -> [A,1,2] -> [A,B,2] 32 | [B,2] -> [1,B,2] -> [A,B,2] 33 | Then we compute the area of intersect between box_a and box_b. 34 | Args: 35 | box_a: (tensor) bounding boxes, Shape: [A,4]. 36 | box_b: (tensor) bounding boxes, Shape: [B,4]. 37 | Return: 38 | (tensor) intersection area, Shape: [A,B]. 39 | """ 40 | A = box_a.size(0) 41 | B = box_b.size(0) 42 | max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2), 43 | box_b[:, 2:].unsqueeze(0).expand(A, B, 2)) 44 | min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2), 45 | box_b[:, :2].unsqueeze(0).expand(A, B, 2)) 46 | inter = torch.clamp((max_xy - min_xy), min=0) 47 | return inter[:, :, 0] * inter[:, :, 1] 48 | 49 | 50 | def jaccard(box_a, box_b): 51 | """Compute the jaccard overlap of two sets of boxes. The jaccard overlap 52 | is simply the intersection over union of two boxes. Here we operate on 53 | ground truth boxes and default boxes. 
54 | E.g.: 55 | A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B) 56 | Args: 57 | box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4] 58 | box_b: (tensor) Prior boxes from priorbox layers, Shape: [num_priors,4] 59 | Return: 60 | jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)] 61 | """ 62 | inter = intersect(box_a, box_b) 63 | area_a = ((box_a[:, 2]-box_a[:, 0]) * 64 | (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B] 65 | area_b = ((box_b[:, 2]-box_b[:, 0]) * 66 | (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B] 67 | union = area_a + area_b - inter 68 | return inter / union # [A,B] 69 | 70 | 71 | def matrix_iou(a, b): 72 | """ 73 | return iou of a and b, numpy version for data augenmentation 74 | """ 75 | lt = np.maximum(a[:, np.newaxis, :2], b[:, :2]) 76 | rb = np.minimum(a[:, np.newaxis, 2:], b[:, 2:]) 77 | 78 | area_i = np.prod(rb - lt, axis=2) * (lt < rb).all(axis=2) 79 | area_a = np.prod(a[:, 2:] - a[:, :2], axis=1) 80 | area_b = np.prod(b[:, 2:] - b[:, :2], axis=1) 81 | return area_i / (area_a[:, np.newaxis] + area_b - area_i) 82 | 83 | 84 | def matrix_iof(a, b): 85 | """ 86 | return iof of a and b, numpy version for data augenmentation 87 | """ 88 | lt = np.maximum(a[:, np.newaxis, :2], b[:, :2]) 89 | rb = np.minimum(a[:, np.newaxis, 2:], b[:, 2:]) 90 | 91 | area_i = np.prod(rb - lt, axis=2) * (lt < rb).all(axis=2) 92 | area_a = np.prod(a[:, 2:] - a[:, :2], axis=1) 93 | return area_i / np.maximum(area_a[:, np.newaxis], 1) 94 | 95 | 96 | def match(threshold, truths, priors, variances, labels, landms, loc_t, conf_t, landm_t, idx): 97 | """Match each prior box with the ground truth box of the highest jaccard 98 | overlap, encode the bounding boxes, then return the matched indices 99 | corresponding to both confidence and location preds. 100 | Args: 101 | threshold: (float) The overlap threshold used when mathing boxes. 102 | truths: (tensor) Ground truth boxes, Shape: [num_obj, 4]. 103 | priors: (tensor) Prior boxes from priorbox layers, Shape: [n_priors,4]. 104 | variances: (tensor) Variances corresponding to each prior coord, 105 | Shape: [num_priors, 4]. 106 | labels: (tensor) All the class labels for the image, Shape: [num_obj]. 107 | landms: (tensor) Ground truth landms, Shape [num_obj, 10]. 108 | loc_t: (tensor) Tensor to be filled w/ endcoded location targets. 109 | conf_t: (tensor) Tensor to be filled w/ matched indices for conf preds. 110 | landm_t: (tensor) Tensor to be filled w/ endcoded landm targets. 111 | idx: (int) current batch index 112 | Return: 113 | The matched indices corresponding to 1)location 2)confidence 3)landm preds. 
114 | """ 115 | # jaccard index 116 | overlaps = jaccard( 117 | truths, 118 | point_form(priors) 119 | ) 120 | # (Bipartite Matching) 121 | # [1,num_objects] best prior for each ground truth 122 | best_prior_overlap, best_prior_idx = overlaps.max(1, keepdim=True) 123 | 124 | # ignore hard gt 125 | valid_gt_idx = best_prior_overlap[:, 0] >= 0.2 126 | best_prior_idx_filter = best_prior_idx[valid_gt_idx, :] 127 | if best_prior_idx_filter.shape[0] <= 0: 128 | loc_t[idx] = 0 129 | conf_t[idx] = 0 130 | return 131 | 132 | # [1,num_priors] best ground truth for each prior 133 | best_truth_overlap, best_truth_idx = overlaps.max(0, keepdim=True) 134 | best_truth_idx.squeeze_(0) 135 | best_truth_overlap.squeeze_(0) 136 | best_prior_idx.squeeze_(1) 137 | best_prior_idx_filter.squeeze_(1) 138 | best_prior_overlap.squeeze_(1) 139 | best_truth_overlap.index_fill_(0, best_prior_idx_filter, 2) # ensure best prior 140 | # TODO refactor: index best_prior_idx with long tensor 141 | # ensure every gt matches with its prior of max overlap 142 | for j in range(best_prior_idx.size(0)): # decide which ground-truth box this anchor is responsible for predicting 143 | best_truth_idx[best_prior_idx[j]] = j 144 | matches = truths[best_truth_idx] # Shape: [num_priors,4] gather the matched gt bbox for each anchor 145 | conf = labels[best_truth_idx] # Shape: [num_priors] gather the matched label for each anchor 146 | conf[best_truth_overlap < threshold] = 0 # label as background: anchors with overlap < 0.35 are treated as negatives 147 | loc = encode(matches, priors, variances) 148 | 149 | matches_landm = landms[best_truth_idx] 150 | landm = encode_landm(matches_landm, priors, variances) 151 | loc_t[idx] = loc # [num_priors,4] encoded offsets to learn 152 | conf_t[idx] = conf # [num_priors] top class label for each prior 153 | landm_t[idx] = landm 154 | 155 | 156 | def encode(matched, priors, variances): 157 | """Encode the variances from the priorbox layers into the ground truth boxes 158 | we have matched (based on jaccard overlap) with the prior boxes. 159 | Args: 160 | matched: (tensor) Coords of ground truth for each prior in point-form 161 | Shape: [num_priors, 4]. 162 | priors: (tensor) Prior boxes in center-offset form 163 | Shape: [num_priors,4]. 164 | variances: (list[float]) Variances of priorboxes 165 | Return: 166 | encoded boxes (tensor), Shape: [num_priors, 4] 167 | """ 168 | 169 | # dist b/t match center and prior's center 170 | g_cxcy = (matched[:, :2] + matched[:, 2:])/2 - priors[:, :2] 171 | # encode variance 172 | g_cxcy /= (variances[0] * priors[:, 2:]) 173 | # match wh / prior wh 174 | g_wh = (matched[:, 2:] - matched[:, :2]) / priors[:, 2:] 175 | g_wh = torch.log(g_wh) / variances[1] 176 | # return target for smooth_l1_loss 177 | return torch.cat([g_cxcy, g_wh], 1) # [num_priors,4] 178 | 179 | def encode_landm(matched, priors, variances): 180 | """Encode the variances from the priorbox layers into the ground truth boxes 181 | we have matched (based on jaccard overlap) with the prior boxes. 182 | Args: 183 | matched: (tensor) Coords of ground truth for each prior in point-form 184 | Shape: [num_priors, 10]. 185 | priors: (tensor) Prior boxes in center-offset form 186 | Shape: [num_priors,4].
187 | variances: (list[float]) Variances of priorboxes 188 | Return: 189 | encoded landm (tensor), Shape: [num_priors, 10] 190 | """ 191 | 192 | # dist b/t match center and prior's center 193 | matched = torch.reshape(matched, (matched.size(0), 5, 2)) 194 | priors_cx = priors[:, 0].unsqueeze(1).expand(matched.size(0), 5).unsqueeze(2) 195 | priors_cy = priors[:, 1].unsqueeze(1).expand(matched.size(0), 5).unsqueeze(2) 196 | priors_w = priors[:, 2].unsqueeze(1).expand(matched.size(0), 5).unsqueeze(2) 197 | priors_h = priors[:, 3].unsqueeze(1).expand(matched.size(0), 5).unsqueeze(2) 198 | priors = torch.cat([priors_cx, priors_cy, priors_w, priors_h], dim=2) 199 | g_cxcy = matched[:, :, :2] - priors[:, :, :2] 200 | # encode variance 201 | g_cxcy /= (variances[0] * priors[:, :, 2:]) 202 | # g_cxcy /= priors[:, :, 2:] 203 | g_cxcy = g_cxcy.reshape(g_cxcy.size(0), -1) 204 | # return target for smooth_l1_loss 205 | return g_cxcy 206 | 207 | 208 | # Adapted from https://github.com/Hakuyume/chainer-ssd 209 | def decode(loc, priors, variances): 210 | """Decode locations from predictions using priors to undo 211 | the encoding we did for offset regression at train time. 212 | Args: 213 | loc (tensor): location predictions for loc layers, 214 | Shape: [num_priors,4] 215 | priors (tensor): Prior boxes in center-offset form. 216 | Shape: [num_priors,4]. 217 | variances: (list[float]) Variances of priorboxes 218 | Return: 219 | decoded bounding box predictions 220 | """ 221 | 222 | boxes = torch.cat(( 223 | priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:], 224 | priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1) 225 | boxes[:, :2] -= boxes[:, 2:] / 2 226 | boxes[:, 2:] += boxes[:, :2] 227 | return boxes 228 | 229 | def decode_landm(pre, priors, variances): 230 | """Decode landm from predictions using priors to undo 231 | the encoding we did for offset regression at train time. 232 | Args: 233 | pre (tensor): landm predictions for loc layers, 234 | Shape: [num_priors,10] 235 | priors (tensor): Prior boxes in center-offset form. 236 | Shape: [num_priors,4]. 237 | variances: (list[float]) Variances of priorboxes 238 | Return: 239 | decoded landm predictions 240 | """ 241 | landms = torch.cat((priors[:, :2] + pre[:, :2] * variances[0] * priors[:, 2:], 242 | priors[:, :2] + pre[:, 2:4] * variances[0] * priors[:, 2:], 243 | priors[:, :2] + pre[:, 4:6] * variances[0] * priors[:, 2:], 244 | priors[:, :2] + pre[:, 6:8] * variances[0] * priors[:, 2:], 245 | priors[:, :2] + pre[:, 8:10] * variances[0] * priors[:, 2:], 246 | ), dim=1) 247 | return landms 248 | 249 | 250 | def log_sum_exp(x): 251 | """Utility function for computing log_sum_exp while determining 252 | This will be used to determine unaveraged confidence loss across 253 | all examples in a batch. 254 | Args: 255 | x (Variable(tensor)): conf_preds from conf layers 256 | """ 257 | x_max = x.data.max() 258 | return torch.log(torch.sum(torch.exp(x-x_max), 1, keepdim=True)) + x_max 259 | 260 | 261 | # Original author: Francisco Massa: 262 | # https://github.com/fmassa/object-detection.torch 263 | # Ported to PyTorch by Max deGroot (02/01/2017) 264 | def nms(boxes, scores, overlap=0.5, top_k=200): 265 | """Apply non-maximum suppression at test time to avoid detecting too many 266 | overlapping bounding boxes for a given object. 267 | Args: 268 | boxes: (tensor) The location preds for the img, Shape: [num_priors,4]. 269 | scores: (tensor) The class predscores for the img, Shape:[num_priors]. 
270 | overlap: (float) The overlap thresh for suppressing unnecessary boxes. 271 | top_k: (int) The Maximum number of box preds to consider. 272 | Return: 273 | The indices of the kept boxes with respect to num_priors. 274 | """ 275 | 276 | keep = torch.Tensor(scores.size(0)).fill_(0).long() 277 | if boxes.numel() == 0: 278 | return keep 279 | x1 = boxes[:, 0] 280 | y1 = boxes[:, 1] 281 | x2 = boxes[:, 2] 282 | y2 = boxes[:, 3] 283 | area = torch.mul(x2 - x1, y2 - y1) 284 | v, idx = scores.sort(0) # sort in ascending order 285 | # I = I[v >= 0.01] 286 | idx = idx[-top_k:] # indices of the top-k largest vals 287 | xx1 = boxes.new() 288 | yy1 = boxes.new() 289 | xx2 = boxes.new() 290 | yy2 = boxes.new() 291 | w = boxes.new() 292 | h = boxes.new() 293 | 294 | # keep = torch.Tensor() 295 | count = 0 296 | while idx.numel() > 0: 297 | i = idx[-1] # index of current largest val 298 | # keep.append(i) 299 | keep[count] = i 300 | count += 1 301 | if idx.size(0) == 1: 302 | break 303 | idx = idx[:-1] # remove kept element from view 304 | # load bboxes of next highest vals 305 | torch.index_select(x1, 0, idx, out=xx1) 306 | torch.index_select(y1, 0, idx, out=yy1) 307 | torch.index_select(x2, 0, idx, out=xx2) 308 | torch.index_select(y2, 0, idx, out=yy2) 309 | # store element-wise max with next highest score 310 | xx1 = torch.clamp(xx1, min=x1[i]) 311 | yy1 = torch.clamp(yy1, min=y1[i]) 312 | xx2 = torch.clamp(xx2, max=x2[i]) 313 | yy2 = torch.clamp(yy2, max=y2[i]) 314 | w.resize_as_(xx2) 315 | h.resize_as_(yy2) 316 | w = xx2 - xx1 317 | h = yy2 - yy1 318 | # check sizes of xx1 and xx2.. after each iteration 319 | w = torch.clamp(w, min=0.0) 320 | h = torch.clamp(h, min=0.0) 321 | inter = w*h 322 | # IoU = i / (area(a) + area(b) - i) 323 | rem_areas = torch.index_select(area, 0, idx) # load remaining areas) 324 | union = (rem_areas - inter) + area[i] 325 | IoU = inter/union # store result in iou 326 | # keep only elements with an IoU <= overlap 327 | idx = idx[IoU.le(overlap)] 328 | return keep, count 329 | 330 | 331 | -------------------------------------------------------------------------------- /utils/nms/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gengyanlei/Pytorch_Retinaface/b863d9ed7c405f3733d038f249ebf3581e3c7ba6/utils/nms/__init__.py -------------------------------------------------------------------------------- /utils/nms/py_cpu_nms.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | 10 | def py_cpu_nms(dets, thresh): 11 | """Pure Python NMS baseline.""" 12 | x1 = dets[:, 0] 13 | y1 = dets[:, 1] 14 | x2 = dets[:, 2] 15 | y2 = dets[:, 3] 16 | scores = dets[:, 4] 17 | 18 | areas = (x2 - x1 + 1) * (y2 - y1 + 1) 19 | order = scores.argsort()[::-1] 20 | 21 | keep = [] 22 | while order.size > 0: 23 | i = order[0] 24 | keep.append(i) 25 | xx1 = np.maximum(x1[i], x1[order[1:]]) 26 | yy1 = np.maximum(y1[i], y1[order[1:]]) 27 | xx2 = np.minimum(x2[i], x2[order[1:]]) 28 | yy2 = np.minimum(y2[i], y2[order[1:]]) 29 | 30 | w = np.maximum(0.0, xx2 - xx1 + 1) 31 | h = np.maximum(0.0, yy2 - yy1 + 1) 32 | inter = w * h 33 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 34 | 35 | inds = 
np.where(ovr <= thresh)[0] 36 | order = order[inds + 1] 37 | 38 | return keep 39 | -------------------------------------------------------------------------------- /utils/timer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import time 9 | 10 | 11 | class Timer(object): 12 | """A simple timer.""" 13 | def __init__(self): 14 | self.total_time = 0. 15 | self.calls = 0 16 | self.start_time = 0. 17 | self.diff = 0. 18 | self.average_time = 0. 19 | 20 | def tic(self): 21 | # using time.time instead of time.clock because time time.clock 22 | # does not normalize for multithreading 23 | self.start_time = time.time() 24 | 25 | def toc(self, average=True): 26 | self.diff = time.time() - self.start_time 27 | self.total_time += self.diff 28 | self.calls += 1 29 | self.average_time = self.total_time / self.calls 30 | if average: 31 | return self.average_time 32 | else: 33 | return self.diff 34 | 35 | def clear(self): 36 | self.total_time = 0. 37 | self.calls = 0 38 | self.start_time = 0. 39 | self.diff = 0. 40 | self.average_time = 0. 41 | -------------------------------------------------------------------------------- /widerface_evaluate/README.md: -------------------------------------------------------------------------------- 1 | # WiderFace-Evaluation 2 | Python Evaluation Code for [Wider Face Dataset](http://mmlab.ie.cuhk.edu.hk/projects/WIDERFace/) 3 | 4 | 5 | ## Usage 6 | 7 | 8 | ##### before evaluating .... 9 | 10 | ```` 11 | python3 setup.py build_ext --inplace 12 | ```` 13 | 14 | ##### evaluating 15 | 16 | **GroungTruth:** `wider_face_val.mat`, `wider_easy_val.mat`, `wider_medium_val.mat`,`wider_hard_val.mat` 17 | 18 | ```` 19 | python3 evaluation.py -p -g 20 | ```` 21 | 22 | ## Bugs & Problems 23 | please issue 24 | 25 | ## Acknowledgements 26 | 27 | some code borrowed from Sergey Karayev 28 | -------------------------------------------------------------------------------- /widerface_evaluate/box_overlaps.pyx: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Sergey Karayev 6 | # -------------------------------------------------------- 7 | 8 | cimport cython 9 | import numpy as np 10 | cimport numpy as np 11 | 12 | DTYPE = np.float 13 | ctypedef np.float_t DTYPE_t 14 | 15 | def bbox_overlaps( 16 | np.ndarray[DTYPE_t, ndim=2] boxes, 17 | np.ndarray[DTYPE_t, ndim=2] query_boxes): 18 | """ 19 | Parameters 20 | ---------- 21 | boxes: (N, 4) ndarray of float 22 | query_boxes: (K, 4) ndarray of float 23 | Returns 24 | ------- 25 | overlaps: (N, K) ndarray of overlap between boxes and query_boxes 26 | """ 27 | cdef unsigned int N = boxes.shape[0] 28 | cdef unsigned int K = query_boxes.shape[0] 29 | cdef np.ndarray[DTYPE_t, ndim=2] overlaps = np.zeros((N, K), dtype=DTYPE) 30 | cdef DTYPE_t iw, ih, box_area 31 | cdef DTYPE_t ua 32 | cdef unsigned int k, n 33 | for k in range(K): 34 | box_area = ( 35 | (query_boxes[k, 2] - query_boxes[k, 0] + 1) * 36 | (query_boxes[k, 3] - query_boxes[k, 1] + 1) 37 | ) 38 | for n in range(N): 39 | iw = ( 40 | min(boxes[n, 2], 
query_boxes[k, 2]) - 41 | max(boxes[n, 0], query_boxes[k, 0]) + 1 42 | ) 43 | if iw > 0: 44 | ih = ( 45 | min(boxes[n, 3], query_boxes[k, 3]) - 46 | max(boxes[n, 1], query_boxes[k, 1]) + 1 47 | ) 48 | if ih > 0: 49 | ua = float( 50 | (boxes[n, 2] - boxes[n, 0] + 1) * 51 | (boxes[n, 3] - boxes[n, 1] + 1) + 52 | box_area - iw * ih 53 | ) 54 | overlaps[n, k] = iw * ih / ua 55 | return overlaps -------------------------------------------------------------------------------- /widerface_evaluate/evaluation.py: -------------------------------------------------------------------------------- 1 | """ 2 | WiderFace evaluation code 3 | author: wondervictor 4 | mail: tianhengcheng@gmail.com 5 | copyright@wondervictor 6 | """ 7 | 8 | import os 9 | import tqdm 10 | import pickle 11 | import argparse 12 | import numpy as np 13 | from scipy.io import loadmat 14 | from bbox import bbox_overlaps 15 | from IPython import embed 16 | 17 | 18 | def get_gt_boxes(gt_dir): 19 | """ gt dir: (wider_face_val.mat, wider_easy_val.mat, wider_medium_val.mat, wider_hard_val.mat)""" 20 | 21 | gt_mat = loadmat(os.path.join(gt_dir, 'wider_face_val.mat')) 22 | hard_mat = loadmat(os.path.join(gt_dir, 'wider_hard_val.mat')) 23 | medium_mat = loadmat(os.path.join(gt_dir, 'wider_medium_val.mat')) 24 | easy_mat = loadmat(os.path.join(gt_dir, 'wider_easy_val.mat')) 25 | 26 | facebox_list = gt_mat['face_bbx_list'] 27 | event_list = gt_mat['event_list'] 28 | file_list = gt_mat['file_list'] 29 | 30 | hard_gt_list = hard_mat['gt_list'] 31 | medium_gt_list = medium_mat['gt_list'] 32 | easy_gt_list = easy_mat['gt_list'] 33 | 34 | return facebox_list, event_list, file_list, hard_gt_list, medium_gt_list, easy_gt_list 35 | 36 | 37 | def get_gt_boxes_from_txt(gt_path, cache_dir): 38 | 39 | cache_file = os.path.join(cache_dir, 'gt_cache.pkl') 40 | if os.path.exists(cache_file): 41 | f = open(cache_file, 'rb') 42 | boxes = pickle.load(f) 43 | f.close() 44 | return boxes 45 | 46 | f = open(gt_path, 'r') 47 | state = 0 48 | lines = f.readlines() 49 | lines = list(map(lambda x: x.rstrip('\r\n'), lines)) 50 | boxes = {} 51 | print(len(lines)) 52 | f.close() 53 | current_boxes = [] 54 | current_name = None 55 | for line in lines: 56 | if state == 0 and '--' in line: 57 | state = 1 58 | current_name = line 59 | continue 60 | if state == 1: 61 | state = 2 62 | continue 63 | 64 | if state == 2 and '--' in line: 65 | state = 1 66 | boxes[current_name] = np.array(current_boxes).astype('float32') 67 | current_name = line 68 | current_boxes = [] 69 | continue 70 | 71 | if state == 2: 72 | box = [float(x) for x in line.split(' ')[:4]] 73 | current_boxes.append(box) 74 | continue 75 | 76 | f = open(cache_file, 'wb') 77 | pickle.dump(boxes, f) 78 | f.close() 79 | return boxes 80 | 81 | 82 | def read_pred_file(filepath): 83 | 84 | with open(filepath, 'r') as f: 85 | lines = f.readlines() 86 | img_file = lines[0].rstrip('\n\r') 87 | lines = lines[2:] 88 | 89 | # b = lines[0].rstrip('\r\n').split(' ')[:-1] 90 | # c = float(b) 91 | # a = map(lambda x: [[float(a[0]), float(a[1]), float(a[2]), float(a[3]), float(a[4])] for a in x.rstrip('\r\n').split(' ')], lines) 92 | boxes = [] 93 | for line in lines: 94 | line = line.rstrip('\r\n').split(' ') 95 | if line[0] is '': 96 | continue 97 | # a = float(line[4]) 98 | boxes.append([float(line[0]), float(line[1]), float(line[2]), float(line[3]), float(line[4])]) 99 | boxes = np.array(boxes) 100 | # boxes = np.array(list(map(lambda x: [float(a) for a in x.rstrip('\r\n').split(' ')], lines))).astype('float') 101 | return 
img_file.split('/')[-1], boxes 102 | 103 | 104 | def get_preds(pred_dir): 105 | events = os.listdir(pred_dir) 106 | boxes = dict() 107 | pbar = tqdm.tqdm(events) 108 | 109 | for event in pbar: 110 | pbar.set_description('Reading Predictions ') 111 | event_dir = os.path.join(pred_dir, event) 112 | event_images = os.listdir(event_dir) 113 | current_event = dict() 114 | for imgtxt in event_images: 115 | imgname, _boxes = read_pred_file(os.path.join(event_dir, imgtxt)) 116 | current_event[imgname.rstrip('.jpg')] = _boxes 117 | boxes[event] = current_event 118 | return boxes 119 | 120 | 121 | def norm_score(pred): 122 | """ norm score 123 | pred {key: [[x1,y1,x2,y2,s]]} 124 | """ 125 | 126 | max_score = 0 127 | min_score = 1 128 | 129 | for _, k in pred.items(): 130 | for _, v in k.items(): 131 | if len(v) == 0: 132 | continue 133 | _min = np.min(v[:, -1]) 134 | _max = np.max(v[:, -1]) 135 | max_score = max(_max, max_score) 136 | min_score = min(_min, min_score) 137 | 138 | diff = max_score - min_score 139 | for _, k in pred.items(): 140 | for _, v in k.items(): 141 | if len(v) == 0: 142 | continue 143 | v[:, -1] = (v[:, -1] - min_score)/diff 144 | 145 | 146 | def image_eval(pred, gt, ignore, iou_thresh): 147 | """ single image evaluation 148 | pred: Nx5 149 | gt: Nx4 150 | ignore: 151 | """ 152 | 153 | _pred = pred.copy() 154 | _gt = gt.copy() 155 | pred_recall = np.zeros(_pred.shape[0]) 156 | recall_list = np.zeros(_gt.shape[0]) 157 | proposal_list = np.ones(_pred.shape[0]) 158 | 159 | _pred[:, 2] = _pred[:, 2] + _pred[:, 0] 160 | _pred[:, 3] = _pred[:, 3] + _pred[:, 1] 161 | _gt[:, 2] = _gt[:, 2] + _gt[:, 0] 162 | _gt[:, 3] = _gt[:, 3] + _gt[:, 1] 163 | 164 | overlaps = bbox_overlaps(_pred[:, :4], _gt) 165 | 166 | for h in range(_pred.shape[0]): 167 | 168 | gt_overlap = overlaps[h] 169 | max_overlap, max_idx = gt_overlap.max(), gt_overlap.argmax() 170 | if max_overlap >= iou_thresh: 171 | if ignore[max_idx] == 0: 172 | recall_list[max_idx] = -1 173 | proposal_list[h] = -1 174 | elif recall_list[max_idx] == 0: 175 | recall_list[max_idx] = 1 176 | 177 | r_keep_index = np.where(recall_list == 1)[0] 178 | pred_recall[h] = len(r_keep_index) 179 | return pred_recall, proposal_list 180 | 181 | 182 | def img_pr_info(thresh_num, pred_info, proposal_list, pred_recall): 183 | pr_info = np.zeros((thresh_num, 2)).astype('float') 184 | for t in range(thresh_num): 185 | 186 | thresh = 1 - (t+1)/thresh_num 187 | r_index = np.where(pred_info[:, 4] >= thresh)[0] 188 | if len(r_index) == 0: 189 | pr_info[t, 0] = 0 190 | pr_info[t, 1] = 0 191 | else: 192 | r_index = r_index[-1] 193 | p_index = np.where(proposal_list[:r_index+1] == 1)[0] 194 | pr_info[t, 0] = len(p_index) 195 | pr_info[t, 1] = pred_recall[r_index] 196 | return pr_info 197 | 198 | 199 | def dataset_pr_info(thresh_num, pr_curve, count_face): 200 | _pr_curve = np.zeros((thresh_num, 2)) 201 | for i in range(thresh_num): 202 | _pr_curve[i, 0] = pr_curve[i, 1] / pr_curve[i, 0] 203 | _pr_curve[i, 1] = pr_curve[i, 1] / count_face 204 | return _pr_curve 205 | 206 | 207 | def voc_ap(rec, prec): 208 | 209 | # correct AP calculation 210 | # first append sentinel values at the end 211 | mrec = np.concatenate(([0.], rec, [1.])) 212 | mpre = np.concatenate(([0.], prec, [0.])) 213 | 214 | # compute the precision envelope 215 | for i in range(mpre.size - 1, 0, -1): 216 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 217 | 218 | # to calculate area under PR curve, look for points 219 | # where X axis (recall) changes value 220 | i = np.where(mrec[1:] != 
mrec[:-1])[0] 221 | 222 | # and sum (\Delta recall) * prec 223 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 224 | return ap 225 | 226 | 227 | def evaluation(pred, gt_path, iou_thresh=0.5): 228 | pred = get_preds(pred) 229 | norm_score(pred) 230 | facebox_list, event_list, file_list, hard_gt_list, medium_gt_list, easy_gt_list = get_gt_boxes(gt_path) 231 | event_num = len(event_list) 232 | thresh_num = 1000 233 | settings = ['easy', 'medium', 'hard'] 234 | setting_gts = [easy_gt_list, medium_gt_list, hard_gt_list] 235 | aps = [] 236 | for setting_id in range(3): 237 | # different setting 238 | gt_list = setting_gts[setting_id] 239 | count_face = 0 240 | pr_curve = np.zeros((thresh_num, 2)).astype('float') 241 | # [hard, medium, easy] 242 | pbar = tqdm.tqdm(range(event_num)) 243 | for i in pbar: 244 | pbar.set_description('Processing {}'.format(settings[setting_id])) 245 | event_name = str(event_list[i][0][0]) 246 | img_list = file_list[i][0] 247 | pred_list = pred[event_name] 248 | sub_gt_list = gt_list[i][0] 249 | # img_pr_info_list = np.zeros((len(img_list), thresh_num, 2)) 250 | gt_bbx_list = facebox_list[i][0] 251 | 252 | for j in range(len(img_list)): 253 | pred_info = pred_list[str(img_list[j][0][0])] 254 | 255 | gt_boxes = gt_bbx_list[j][0].astype('float') 256 | keep_index = sub_gt_list[j][0] 257 | count_face += len(keep_index) 258 | 259 | if len(gt_boxes) == 0 or len(pred_info) == 0: 260 | continue 261 | ignore = np.zeros(gt_boxes.shape[0]) 262 | if len(keep_index) != 0: 263 | ignore[keep_index-1] = 1 264 | pred_recall, proposal_list = image_eval(pred_info, gt_boxes, ignore, iou_thresh) 265 | 266 | _img_pr_info = img_pr_info(thresh_num, pred_info, proposal_list, pred_recall) 267 | 268 | pr_curve += _img_pr_info 269 | pr_curve = dataset_pr_info(thresh_num, pr_curve, count_face) 270 | 271 | propose = pr_curve[:, 0] 272 | recall = pr_curve[:, 1] 273 | 274 | ap = voc_ap(recall, propose) 275 | aps.append(ap) 276 | 277 | print("==================== Results ====================") 278 | print("Easy Val AP: {}".format(aps[0])) 279 | print("Medium Val AP: {}".format(aps[1])) 280 | print("Hard Val AP: {}".format(aps[2])) 281 | print("=================================================") 282 | 283 | 284 | if __name__ == '__main__': 285 | 286 | parser = argparse.ArgumentParser() 287 | parser.add_argument('-p', '--pred', default="./widerface_txt/") 288 | parser.add_argument('-g', '--gt', default='./ground_truth/') 289 | 290 | args = parser.parse_args() 291 | evaluation(args.pred, args.gt) 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 302 | 303 | 304 | -------------------------------------------------------------------------------- /widerface_evaluate/ground_truth/wider_easy_val.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gengyanlei/Pytorch_Retinaface/b863d9ed7c405f3733d038f249ebf3581e3c7ba6/widerface_evaluate/ground_truth/wider_easy_val.mat -------------------------------------------------------------------------------- /widerface_evaluate/ground_truth/wider_face_val.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gengyanlei/Pytorch_Retinaface/b863d9ed7c405f3733d038f249ebf3581e3c7ba6/widerface_evaluate/ground_truth/wider_face_val.mat -------------------------------------------------------------------------------- /widerface_evaluate/ground_truth/wider_hard_val.mat: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/gengyanlei/Pytorch_Retinaface/b863d9ed7c405f3733d038f249ebf3581e3c7ba6/widerface_evaluate/ground_truth/wider_hard_val.mat -------------------------------------------------------------------------------- /widerface_evaluate/ground_truth/wider_medium_val.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gengyanlei/Pytorch_Retinaface/b863d9ed7c405f3733d038f249ebf3581e3c7ba6/widerface_evaluate/ground_truth/wider_medium_val.mat -------------------------------------------------------------------------------- /widerface_evaluate/setup.py: -------------------------------------------------------------------------------- 1 | """ 2 | WiderFace evaluation code 3 | author: wondervictor 4 | mail: tianhengcheng@gmail.com 5 | copyright@wondervictor 6 | """ 7 | 8 | from distutils.core import setup, Extension 9 | from Cython.Build import cythonize 10 | import numpy 11 | 12 | package = Extension('bbox', ['box_overlaps.pyx'], include_dirs=[numpy.get_include()]) 13 | setup(ext_modules=cythonize([package])) 14 | --------------------------------------------------------------------------------
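
A minimal usage sketch for the evaluation helpers above: after building the Cython extension with `python3 setup.py build_ext --inplace` (see `widerface_evaluate/README.md`), the compiled `bbox` module exposes `bbox_overlaps`, which `evaluation.py` uses to score predictions against WIDER FACE ground truth. The box coordinates below are made-up illustrative values, not data from the repository; note also that with recent NumPy releases the `DTYPE = np.float` line in `box_overlaps.pyx` may need to be changed to `np.float64` before it compiles.

```python
# Illustrative sanity check of the compiled bbox extension (assumed built in-place).
import numpy as np
from bbox import bbox_overlaps  # extension built from box_overlaps.pyx by setup.py

# (N, 4) predicted boxes and (K, 4) ground-truth boxes in (x1, y1, x2, y2) form,
# float64 to match the extension's DTYPE.
boxes = np.array([[10., 10., 50., 50.],
                  [30., 30., 80., 90.]], dtype=np.float64)
query_boxes = np.array([[12., 12., 48., 52.]], dtype=np.float64)

overlaps = bbox_overlaps(boxes, query_boxes)  # (N, K) IoU matrix
print(overlaps)
```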