├── .gitattributes ├── .gitignore ├── LICENSE ├── README.md ├── face_detection_test.py ├── ibug └── face_detection │ ├── __init__.py │ ├── retina_face │ ├── __init__.py │ ├── box_utils.py │ ├── config.py │ ├── prior_box.py │ ├── py_cpu_nms.py │ ├── retina_face.py │ ├── retina_face_net.py │ ├── retina_face_predictor.py │ └── weights │ │ ├── Resnet50_Final.pth │ │ └── mobilenet0.25_Final.pth │ ├── s3fd │ ├── __init__.py │ ├── s3fd_net.py │ ├── s3fd_predictor.py │ ├── utils.py │ └── weights │ │ └── s3fd_weights.pth │ └── utils │ ├── __init__.py │ ├── data │ └── bfm_lms.npy │ ├── head_pose_estimator.py │ └── simple_face_tracker.py ├── requirements.txt └── setup.py /.gitattributes: -------------------------------------------------------------------------------- 1 | . text !filter !merge !diff 2 | ibug/face_detection/retina_face/weights/Resnet50_Final.pth filter=lfs diff=lfs merge=lfs -text 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints 2 | .vscode 3 | .idea 4 | *.pyc 5 | build 6 | dist 7 | *.egg-info 8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Jie Shen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ibug.face_detection 2 | A collection of pretrained face detectors including: 3 | * [S3FD](http://openaccess.thecvf.com/content_ICCV_2017/papers/Zhang_S3FD_Single_Shot_ICCV_2017_paper.pdf) \[1\] with weights trained on the [WIDER](http://shuoyang1213.me/WIDERFACE/) \[2\] dataset. Implementation of the algorithm is based on this repository: [https://github.com/cs-giung/face-detection-pytorch](https://github.com/cs-giung/face-detection-pytorch). 4 | * [RetinaFace](https://arxiv.org/pdf/1905.00641) \[3\] with weights trained on the [WIDER](http://shuoyang1213.me/WIDERFACE/) \[2\] dataset. Wights for networks using either Resnet50 or MobileNet0.25 as the backbone are included. The implementation is based on this repository: [https://github.com/biubug6/Pytorch_Retinaface](https://github.com/biubug6/Pytorch_Retinaface). 
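
Both detectors expose the same predictor interface, and the pretrained weights listed above are selected through each predictor's `get_model()` method. A minimal sketch (the How to Use section below gives a complete example):

```python
from ibug.face_detection import RetinaFacePredictor, S3FDPredictor

# RetinaFace with either of the bundled backbones
detector = RetinaFacePredictor(model=RetinaFacePredictor.get_model('resnet50'))
# detector = RetinaFacePredictor(model=RetinaFacePredictor.get_model('mobilenet0.25'))

# S3FD with its WIDER-trained weights
# detector = S3FDPredictor(model=S3FDPredictor.get_model())
```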
5 | 
6 | For convenience, the package also includes a simple IOU-based face tracker and a head pose estimator using EPnP.
7 | 
8 | ## Prerequisites
9 | * [Git LFS](https://git-lfs.github.com/), needed for downloading the pretrained weights that are larger than 100 MB.
10 | * [Numpy](https://www.numpy.org/): `$pip3 install numpy`
11 | * [Scipy](https://www.scipy.org/): `$pip3 install scipy`
12 | * [PyTorch](https://pytorch.org/): `$pip3 install torch torchvision`
13 | * [OpenCV](https://opencv.org/): `$pip3 install opencv-python`
14 | 
15 | ## How to Install
16 | ```
17 | git clone https://github.com/hhj1897/face_detection.git
18 | cd face_detection
19 | git lfs pull
20 | pip install -e .
21 | ```
22 | 
23 | ## How to Test
24 | * To test on live video: `python face_detection_test.py [-i webcam_index]`
25 | * To test on a video file: `python face_detection_test.py [-i input_file] [-o output_file]`
26 | 
27 | By default, the test script uses RetinaFace with the Resnet50 backbone, but you can change that via the `--method` and `--weights` options.
28 | 
29 | ## How to Use
30 | ```python
31 | # Import everything, just for illustration purposes
32 | import cv2
33 | from ibug.face_detection import RetinaFacePredictor, S3FDPredictor
34 | from ibug.face_detection.utils import HeadPoseEstimator, SimpleFaceTracker
35 | 
36 | # Create a RetinaFace detector using the Resnet50 backbone, with the confidence
37 | # threshold set to 0.8
38 | face_detector = RetinaFacePredictor(
39 |     threshold=0.8, device='cuda:0',
40 |     model=RetinaFacePredictor.get_model('resnet50'))
41 | 
42 | # Create a head pose estimator
43 | pose_estimator = HeadPoseEstimator()
44 | 
45 | # Create a simple face tracker, with the minimum face size set to 64x64 pixels
46 | face_tracker = SimpleFaceTracker(minimum_face_size=64)
47 | 
48 | # Load a test image. Note that images loaded by OpenCV adopt the B-G-R channel
49 | # order.
50 | image = cv2.imread('test.png')
51 | 
52 | # Detect faces in the image
53 | # Note:
54 | # 1. The input image must be a byte array of shape HxWx3.
55 | # 2. The return value is a Nx5 (for S3FD) or a Nx15 (for RetinaFace) matrix,
56 | #    in which N is the number of detected faces. The first 4 columns store
57 | #    (in this order) the left, top, right, and bottom coordinates of the
58 | #    detected face boxes. The 5th column stores the detection confidences.
59 | #    The remaining columns store the coordinates (in the order of x1, y1, x2,
60 | #    y2, ...) of the detected landmarks.
61 | detected_faces = face_detector(image, rgb=False)
62 | 
63 | # Run head pose estimation. This only works with RetinaFace, which also detects
64 | # the 5 landmarks on each face. It gives the pitch, yaw, and roll (in degrees) of
65 | # each detected face; the image size is needed to construct the camera matrix.
66 | for face in detected_faces:
67 |     pitch, yaw, roll = pose_estimator(face[5:].reshape((-1, 2)), *image.shape[1::-1])
68 | 
69 | # If you are processing frames in a video, you can also perform rudimentary
70 | # face tracking, as shown below. The return value is a list containing the
71 | # tracklet ID (>=1) of each detected face. If a face cannot be tracked (for
72 | # example, because it is too small), its corresponding element in the list
73 | # is set to None.
74 | tracked_ids = face_tracker(detected_faces[:, :4])
75 | ```
76 | 
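When processing a video, the same objects can be combined into a simple loop, similar to what `face_detection_test.py` does. The following is a minimal sketch (the input path `test.mp4` is just a placeholder, and rendering and error handling are omitted):

```python
import cv2
from ibug.face_detection import RetinaFacePredictor
from ibug.face_detection.utils import HeadPoseEstimator, SimpleFaceTracker

face_detector = RetinaFacePredictor(
    threshold=0.8, device='cuda:0',
    model=RetinaFacePredictor.get_model('mobilenet0.25'))
face_tracker = SimpleFaceTracker(iou_threshold=0.4, minimum_face_size=64)
pose_estimator = HeadPoseEstimator()

vid = cv2.VideoCapture('test.mp4')
while True:
    _, frame = vid.read()
    if frame is None:
        break
    faces = face_detector(frame, rgb=False)   # OpenCV frames are in B-G-R order
    tids = face_tracker(faces[:, :4])         # tracklet IDs (or None) per face
    for face, tid in zip(faces, tids):
        # Columns 5-14 hold the 5 landmarks detected by RetinaFace
        pitch, yaw, roll = pose_estimator(face[5:15].reshape((-1, 2)),
                                          *frame.shape[1::-1])
        print(f'Face {tid}: pitch={pitch:.1f}, yaw={yaw:.1f}, roll={roll:.1f}')
vid.release()
```

Call `face_tracker.reset()` when switching to a new video, as the test script does when the `R` key is pressed.
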
"[S3fd: Single shot scale-invariant face detector.](http://openaccess.thecvf.com/content_ICCV_2017/papers/Zhang_S3FD_Single_Shot_ICCV_2017_paper.pdf)" In _Proceedings of the IEEE international conference on computer vision_, pp. 192-201. 2017. 79 | 80 | \[2\] Yang, Shuo, Ping Luo, Chen-Change Loy, Xiaoou Tang. "[WIDER FACE: A Face Detection Benchmark.](http://openaccess.thecvf.com/content_cvpr_2016/papers/Yang_WIDER_FACE_A_CVPR_2016_paper.pdf)" In _Proceedings of the IEEE international conference on computer vision_, pp. 5525-5533. 2016. 81 | 82 | \[3\] Deng, Jiankang, Jia Guo, Evangelos Ververas, Irene Kotsia, and Stefanos Zafeiriou. "[Retinaface: Single-shot multi-level face localisation in the wild.](https://openaccess.thecvf.com/content_CVPR_2020/papers/Deng_RetinaFace_Single-Shot_Multi-Level_Face_Localisation_in_the_Wild_CVPR_2020_paper.pdf)" In _Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition_, pp. 5203-5212. 2020. 83 | -------------------------------------------------------------------------------- /face_detection_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import time 4 | import torch 5 | from argparse import ArgumentParser 6 | from ibug.face_detection import RetinaFacePredictor, S3FDPredictor 7 | from ibug.face_detection.utils import SimpleFaceTracker, HeadPoseEstimator 8 | 9 | 10 | def main() -> None: 11 | # Parse command-line arguments 12 | parser = ArgumentParser() 13 | parser.add_argument('--input', '-i', help='Input video path or webcam index (default=0)', default=0) 14 | parser.add_argument('--output', '-o', help='Output file path', default=None) 15 | parser.add_argument('--fourcc', '-f', help='FourCC of the output video (default=mp4v)', 16 | type=str, default='mp4v') 17 | parser.add_argument('--benchmark', '-b', help='Enable benchmark mode for CUDNN', 18 | action='store_true', default=False) 19 | parser.add_argument('--no-display', '-n', help='No display if processing a video file', 20 | action='store_true', default=False) 21 | parser.add_argument('--threshold', '-t', help='Confidence threshold (default=0.8)', 22 | type=float, default=0.8) 23 | parser.add_argument('--method', '-m', help='Method to use, can be either RatinaFace or S3FD (default=RatinaFace)', 24 | default='retinaface') 25 | parser.add_argument('--weights', '-w', 26 | help='Weights to load, can be either resnet50 or mobilenet0.25 when using RetinaFace', 27 | default=None) 28 | parser.add_argument('--alternative-pth', '-p', help='Alternative pth file to load', default=None) 29 | parser.add_argument('--device', '-d', help='Device to be used by the model (default=cuda:0)', 30 | default='cuda:0') 31 | parser.add_argument('--iou-threshold', '-iou', 32 | help='IOU threshold used by the simple face tracker (default=0.4)', 33 | type=float, default=0.4) 34 | parser.add_argument('--minimum-face-size', '-min', 35 | help='Minimum face size used by the simple face tracker (default=0.0)', 36 | type=float, default=0.0) 37 | parser.add_argument('--head-pose-preference', '-hp', 38 | help='Head pose output preference (default=0)', 39 | type=int, default=0) 40 | args = parser.parse_args() 41 | 42 | # Set benchmark mode flag for CUDNN 43 | torch.backends.cudnn.benchmark = args.benchmark 44 | 45 | vid = None 46 | out_vid = None 47 | has_window = False 48 | try: 49 | # Create the face detector 50 | args.method = args.method.lower().strip() 51 | if args.method == 'retinaface': 52 | face_detector_class = 
(RetinaFacePredictor, 'RetinaFace') 53 | elif args.method == 's3fd': 54 | face_detector_class = (S3FDPredictor, 'S3FD') 55 | else: 56 | raise ValueError('method must be set to either RetinaFace or S3FD') 57 | if args.weights is None: 58 | fd_model = face_detector_class[0].get_model() 59 | else: 60 | fd_model = face_detector_class[0].get_model(args.weights) 61 | if args.alternative_pth is not None: 62 | fd_model.weights = args.alternative_pth 63 | face_detector = face_detector_class[0](threshold=args.threshold, device=args.device, model=fd_model) 64 | print(f"Face detector created using {face_detector_class[1]} ({fd_model.weights}).") 65 | 66 | # Create the simple face tracker 67 | face_tracker = SimpleFaceTracker(iou_threshold=args.iou_threshold, 68 | minimum_face_size=args.minimum_face_size) 69 | print('Simple face tracker created.') 70 | 71 | # Create the head pose estimator 72 | head_pose_estimator = HeadPoseEstimator() 73 | print('Head pose estimator created.') 74 | 75 | # Open the input video 76 | using_webcam = not os.path.exists(args.input) 77 | vid = cv2.VideoCapture(int(args.input) if using_webcam else args.input) 78 | assert vid.isOpened() 79 | if using_webcam: 80 | print(f'Webcam #{int(args.input)} opened.') 81 | else: 82 | print(f'Input video "{args.input}" opened.') 83 | 84 | # Open the output video (if a path is given) 85 | if args.output is not None: 86 | out_vid = cv2.VideoWriter(args.output, fps=vid.get(cv2.CAP_PROP_FPS), 87 | frameSize=(int(vid.get(cv2.CAP_PROP_FRAME_WIDTH)), 88 | int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))), 89 | fourcc=cv2.VideoWriter_fourcc(*args.fourcc)) 90 | assert out_vid.isOpened() 91 | 92 | # Process the frames 93 | frame_number = 0 94 | window_title = os.path.splitext(os.path.basename(__file__))[0] 95 | colours = [(0, 0, 255), (0, 255, 0), (255, 0, 0), (0, 255, 255), (255, 0, 255), (255, 255, 0), 96 | (0, 128, 255), (128, 255, 0), (255, 0, 128), (128, 0, 255), (0, 255, 128), (255, 128, 0)] 97 | print('Processing started, press \'Q\' to quit or \'R\' to reset the tracker.') 98 | while True: 99 | # Get a new frame 100 | _, frame = vid.read() 101 | if frame is None: 102 | break 103 | else: 104 | # Detect and track faces, also estimate head pose if landmarks are available 105 | start_time = time.time() 106 | faces = face_detector(frame, rgb=False) 107 | tids = face_tracker(faces) 108 | if faces.shape[1] >= 15: 109 | head_poses = [head_pose_estimator(face[5:15].reshape((-1, 2)), *frame.shape[1::-1], 110 | output_preference=args.head_pose_preference) 111 | for face in faces] 112 | else: 113 | head_poses = [None] * faces.shape[0] 114 | elapsed_time = time.time() - start_time 115 | 116 | # Textural output 117 | print(f'Frame #{frame_number} processed in {elapsed_time * 1000.0:.04f} ms: ' + 118 | f'{len(faces)} faces detected.') 119 | 120 | # Rendering 121 | for face, tid, head_pose in zip(faces, tids, head_poses): 122 | bbox = face[:4].astype(int) 123 | if tid is None: 124 | colour = (128, 128, 128) 125 | else: 126 | colour = colours[(tid - 1) % len(colours)] 127 | cv2.rectangle(frame, (bbox[0], bbox[1]), (bbox[2], bbox[3]), color=colour, thickness=2) 128 | if len(face) > 5: 129 | for pts in face[5:].reshape((-1, 2)): 130 | cv2.circle(frame, tuple(pts.astype(int).tolist()), 3, colour, -1) 131 | if tid is not None: 132 | cv2.putText(frame, f'Face {tid}', (bbox[0], bbox[1] - 10), 133 | cv2.FONT_HERSHEY_DUPLEX, 0.6, colour, lineType=cv2.LINE_AA) 134 | if head_pose is not None: 135 | pitch, yaw, roll = head_pose 136 | cv2.putText(frame, f'Pitch: {pitch:.1f}', 
(bbox[2] + 5, bbox[1] + 10), 137 | cv2.FONT_HERSHEY_DUPLEX, 0.5, colour, lineType=cv2.LINE_AA) 138 | cv2.putText(frame, f'Yaw: {yaw:.1f}', (bbox[2] + 5, bbox[1] + 30), 139 | cv2.FONT_HERSHEY_DUPLEX, 0.5, colour, lineType=cv2.LINE_AA) 140 | cv2.putText(frame, f'Roll: {roll:.1f}', (bbox[2] + 5, bbox[1] + 50), 141 | cv2.FONT_HERSHEY_DUPLEX, 0.5, colour, lineType=cv2.LINE_AA) 142 | 143 | # Write the frame to output video (if recording) 144 | if out_vid is not None: 145 | out_vid.write(frame) 146 | 147 | # Display the frame 148 | if using_webcam or not args.no_display: 149 | has_window = True 150 | cv2.imshow(window_title, frame) 151 | key = cv2.waitKey(1) % 2 ** 16 152 | if key == ord('q') or key == ord('Q'): 153 | print('\'Q\' pressed, we are done here.') 154 | break 155 | elif key == ord('r') or key == ord('R'): 156 | print('\'R\' pressed, reset the tracker.') 157 | face_tracker.reset() 158 | frame_number += 1 159 | finally: 160 | if has_window: 161 | cv2.destroyAllWindows() 162 | if out_vid is not None: 163 | out_vid.release() 164 | if vid is not None: 165 | vid.release() 166 | print('All done.') 167 | 168 | 169 | if __name__ == '__main__': 170 | main() 171 | -------------------------------------------------------------------------------- /ibug/face_detection/__init__.py: -------------------------------------------------------------------------------- 1 | from .s3fd import S3FDPredictor 2 | from .retina_face import RetinaFacePredictor 3 | 4 | 5 | __version__ = '0.1.0' 6 | -------------------------------------------------------------------------------- /ibug/face_detection/retina_face/__init__.py: -------------------------------------------------------------------------------- 1 | from .retina_face_predictor import RetinaFacePredictor 2 | -------------------------------------------------------------------------------- /ibug/face_detection/retina_face/box_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | 5 | def point_form(boxes): 6 | """ Convert prior_boxes to (xmin, ymin, xmax, ymax) 7 | representation for comparison to point form ground truth data. 8 | Args: 9 | boxes: (tensor) center-size default boxes from priorbox layers. 10 | Return: 11 | boxes: (tensor) Converted xmin, ymin, xmax, ymax form of boxes. 12 | """ 13 | return torch.cat((boxes[:, :2] - boxes[:, 2:]/2, # xmin, ymin 14 | boxes[:, :2] + boxes[:, 2:]/2), 1) # xmax, ymax 15 | 16 | 17 | def center_size(boxes): 18 | """ Convert prior_boxes to (cx, cy, w, h) 19 | representation for comparison to center-size form ground truth data. 20 | Args: 21 | boxes: (tensor) point_form boxes 22 | Return: 23 | boxes: (tensor) Converted xmin, ymin, xmax, ymax form of boxes. 24 | """ 25 | return torch.cat((boxes[:, 2:] + boxes[:, :2])/2, # cx, cy 26 | boxes[:, 2:] - boxes[:, :2], 1) # w, h 27 | 28 | 29 | def intersect(box_a, box_b): 30 | """ We resize both tensors to [A,B,2] without new malloc: 31 | [A,2] -> [A,1,2] -> [A,B,2] 32 | [B,2] -> [1,B,2] -> [A,B,2] 33 | Then we compute the area of intersect between box_a and box_b. 34 | Args: 35 | box_a: (tensor) bounding boxes, Shape: [A,4]. 36 | box_b: (tensor) bounding boxes, Shape: [B,4]. 37 | Return: 38 | (tensor) intersection area, Shape: [A,B]. 
39 |     """
40 |     A = box_a.size(0)
41 |     B = box_b.size(0)
42 |     max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2),
43 |                        box_b[:, 2:].unsqueeze(0).expand(A, B, 2))
44 |     min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2),
45 |                        box_b[:, :2].unsqueeze(0).expand(A, B, 2))
46 |     inter = torch.clamp((max_xy - min_xy), min=0)
47 |     return inter[:, :, 0] * inter[:, :, 1]
48 | 
49 | 
50 | def jaccard(box_a, box_b):
51 |     """Compute the jaccard overlap of two sets of boxes. The jaccard overlap
52 |     is simply the intersection over union of two boxes. Here we operate on
53 |     ground truth boxes and default boxes.
54 |     E.g.:
55 |         A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B)
56 |     Args:
57 |         box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4]
58 |         box_b: (tensor) Prior boxes from priorbox layers, Shape: [num_priors,4]
59 |     Return:
60 |         jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)]
61 |     """
62 |     inter = intersect(box_a, box_b)
63 |     area_a = ((box_a[:, 2]-box_a[:, 0]) *
64 |               (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter)  # [A,B]
65 |     area_b = ((box_b[:, 2]-box_b[:, 0]) *
66 |               (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter)  # [A,B]
67 |     union = area_a + area_b - inter
68 |     return inter / union  # [A,B]
69 | 
70 | 
71 | def matrix_iou(a, b):
72 |     """
73 |     return iou of a and b, numpy version for data augmentation
74 |     """
75 |     lt = np.maximum(a[:, np.newaxis, :2], b[:, :2])
76 |     rb = np.minimum(a[:, np.newaxis, 2:], b[:, 2:])
77 | 
78 |     area_i = np.prod(rb - lt, axis=2) * (lt < rb).all(axis=2)
79 |     area_a = np.prod(a[:, 2:] - a[:, :2], axis=1)
80 |     area_b = np.prod(b[:, 2:] - b[:, :2], axis=1)
81 |     return area_i / (area_a[:, np.newaxis] + area_b - area_i)
82 | 
83 | 
84 | def matrix_iof(a, b):
85 |     """
86 |     return iof of a and b, numpy version for data augmentation
87 |     """
88 |     lt = np.maximum(a[:, np.newaxis, :2], b[:, :2])
89 |     rb = np.minimum(a[:, np.newaxis, 2:], b[:, 2:])
90 | 
91 |     area_i = np.prod(rb - lt, axis=2) * (lt < rb).all(axis=2)
92 |     area_a = np.prod(a[:, 2:] - a[:, :2], axis=1)
93 |     return area_i / np.maximum(area_a[:, np.newaxis], 1)
94 | 
95 | 
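# Note: jaccard() above returns the IoU matrix that match() below relies on to
# pair priors with ground-truth boxes. For example, for the point-form boxes
#     a = torch.tensor([[0., 0., 2., 2.]])
#     b = torch.tensor([[1., 1., 3., 3.]])
# the intersection area is 1.0 and the union is 4 + 4 - 1 = 7, so
# jaccard(a, b) returns a 1x1 tensor containing 1 / 7 (about 0.1429).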
96 | def match(threshold, truths, priors, variances, labels, landms, loc_t, conf_t, landm_t, idx):
97 |     """Match each prior box with the ground truth box of the highest jaccard
98 |     overlap, encode the bounding boxes, then return the matched indices
99 |     corresponding to both confidence and location preds.
100 |     Args:
101 |         threshold: (float) The overlap threshold used when matching boxes.
102 |         truths: (tensor) Ground truth boxes, Shape: [num_obj, 4].
103 |         priors: (tensor) Prior boxes from priorbox layers, Shape: [n_priors,4].
104 |         variances: (tensor) Variances corresponding to each prior coord,
105 |             Shape: [num_priors, 4].
106 |         labels: (tensor) All the class labels for the image, Shape: [num_obj].
107 |         landms: (tensor) Ground truth landms, Shape [num_obj, 10].
108 |         loc_t: (tensor) Tensor to be filled w/ encoded location targets.
109 |         conf_t: (tensor) Tensor to be filled w/ matched indices for conf preds.
110 |         landm_t: (tensor) Tensor to be filled w/ encoded landm targets.
111 |         idx: (int) current batch index
112 |     Return:
113 |         The matched indices corresponding to 1)location 2)confidence 3)landm preds.
114 |     """
115 |     # jaccard index
116 |     overlaps = jaccard(
117 |         truths,
118 |         point_form(priors)
119 |     )
120 |     # (Bipartite Matching)
121 |     # [1,num_objects] best prior for each ground truth
122 |     best_prior_overlap, best_prior_idx = overlaps.max(1, keepdim=True)
123 | 
124 |     # ignore hard gt
125 |     valid_gt_idx = best_prior_overlap[:, 0] >= 0.2
126 |     best_prior_idx_filter = best_prior_idx[valid_gt_idx, :]
127 |     if best_prior_idx_filter.shape[0] <= 0:
128 |         loc_t[idx] = 0
129 |         conf_t[idx] = 0
130 |         return
131 | 
132 |     # [1,num_priors] best ground truth for each prior
133 |     best_truth_overlap, best_truth_idx = overlaps.max(0, keepdim=True)
134 |     best_truth_idx.squeeze_(0)
135 |     best_truth_overlap.squeeze_(0)
136 |     best_prior_idx.squeeze_(1)
137 |     best_prior_idx_filter.squeeze_(1)
138 |     best_prior_overlap.squeeze_(1)
139 |     best_truth_overlap.index_fill_(0, best_prior_idx_filter, 2)  # ensure best prior
140 |     # TODO refactor: index best_prior_idx with long tensor
141 |     # ensure every gt matches with its prior of max overlap
142 |     for j in range(best_prior_idx.size(0)):  # determine which ground-truth box each of these anchors predicts
143 |         best_truth_idx[best_prior_idx[j]] = j
144 |     matches = truths[best_truth_idx]  # Shape: [num_priors,4] the ground-truth box assigned to each anchor
145 |     conf = labels[best_truth_idx]  # Shape: [num_priors] the label assigned to each anchor
146 |     conf[best_truth_overlap < threshold] = 0  # label as background: anchors with overlap < 0.35 are all treated as negative samples
147 |     loc = encode(matches, priors, variances)
148 | 
149 |     matches_landm = landms[best_truth_idx]
150 |     landm = encode_landm(matches_landm, priors, variances)
151 |     loc_t[idx] = loc  # [num_priors,4] encoded offsets to learn
152 |     conf_t[idx] = conf  # [num_priors] top class label for each prior
153 |     landm_t[idx] = landm
154 | 
155 | 
156 | def encode(matched, priors, variances):
157 |     """Encode the variances from the priorbox layers into the ground truth boxes
158 |     we have matched (based on jaccard overlap) with the prior boxes.
159 |     Args:
160 |         matched: (tensor) Coords of ground truth for each prior in point-form
161 |             Shape: [num_priors, 4].
162 |         priors: (tensor) Prior boxes in center-offset form
163 |             Shape: [num_priors,4].
164 |         variances: (list[float]) Variances of priorboxes
165 |     Return:
166 |         encoded boxes (tensor), Shape: [num_priors, 4]
167 |     """
168 | 
169 |     # dist b/t match center and prior's center
170 |     g_cxcy = (matched[:, :2] + matched[:, 2:])/2 - priors[:, :2]
171 |     # encode variance
172 |     g_cxcy /= (variances[0] * priors[:, 2:])
173 |     # match wh / prior wh
174 |     g_wh = (matched[:, 2:] - matched[:, :2]) / priors[:, 2:]
175 |     g_wh = torch.log(g_wh) / variances[1]
176 |     # return target for smooth_l1_loss
177 |     return torch.cat([g_cxcy, g_wh], 1)  # [num_priors,4]
178 | 
179 | 
180 | def encode_landm(matched, priors, variances):
181 |     """Encode the variances from the priorbox layers into the ground truth boxes
182 |     we have matched (based on jaccard overlap) with the prior boxes.
183 |     Args:
184 |         matched: (tensor) Coords of ground truth for each prior in point-form
185 |             Shape: [num_priors, 10].
186 |         priors: (tensor) Prior boxes in center-offset form
187 |             Shape: [num_priors,4].
188 | variances: (list[float]) Variances of priorboxes 189 | Return: 190 | encoded landm (tensor), Shape: [num_priors, 10] 191 | """ 192 | 193 | # dist b/t match center and prior's center 194 | matched = torch.reshape(matched, (matched.size(0), 5, 2)) 195 | priors_cx = priors[:, 0].unsqueeze(1).expand(matched.size(0), 5).unsqueeze(2) 196 | priors_cy = priors[:, 1].unsqueeze(1).expand(matched.size(0), 5).unsqueeze(2) 197 | priors_w = priors[:, 2].unsqueeze(1).expand(matched.size(0), 5).unsqueeze(2) 198 | priors_h = priors[:, 3].unsqueeze(1).expand(matched.size(0), 5).unsqueeze(2) 199 | priors = torch.cat([priors_cx, priors_cy, priors_w, priors_h], dim=2) 200 | g_cxcy = matched[:, :, :2] - priors[:, :, :2] 201 | # encode variance 202 | g_cxcy /= (variances[0] * priors[:, :, 2:]) 203 | # g_cxcy /= priors[:, :, 2:] 204 | g_cxcy = g_cxcy.reshape(g_cxcy.size(0), -1) 205 | # return target for smooth_l1_loss 206 | return g_cxcy 207 | 208 | 209 | # Adapted from https://github.com/Hakuyume/chainer-ssd 210 | def decode(loc, priors, variances): 211 | """Decode locations from predictions using priors to undo 212 | the encoding we did for offset regression at train time. 213 | Args: 214 | loc (tensor): location predictions for loc layers, 215 | Shape: [num_priors,4] 216 | priors (tensor): Prior boxes in center-offset form. 217 | Shape: [num_priors,4]. 218 | variances: (list[float]) Variances of priorboxes 219 | Return: 220 | decoded bounding box predictions 221 | """ 222 | 223 | boxes = torch.cat(( 224 | priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:], 225 | priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1) 226 | boxes[:, :2] -= boxes[:, 2:] / 2 227 | boxes[:, 2:] += boxes[:, :2] 228 | return boxes 229 | 230 | 231 | def decode_landm(pre, priors, variances): 232 | """Decode landm from predictions using priors to undo 233 | the encoding we did for offset regression at train time. 234 | Args: 235 | pre (tensor): landm predictions for loc layers, 236 | Shape: [num_priors,10] 237 | priors (tensor): Prior boxes in center-offset form. 238 | Shape: [num_priors,4]. 239 | variances: (list[float]) Variances of priorboxes 240 | Return: 241 | decoded landm predictions 242 | """ 243 | landms = torch.cat((priors[:, :2] + pre[:, :2] * variances[0] * priors[:, 2:], 244 | priors[:, :2] + pre[:, 2:4] * variances[0] * priors[:, 2:], 245 | priors[:, :2] + pre[:, 4:6] * variances[0] * priors[:, 2:], 246 | priors[:, :2] + pre[:, 6:8] * variances[0] * priors[:, 2:], 247 | priors[:, :2] + pre[:, 8:10] * variances[0] * priors[:, 2:], 248 | ), dim=1) 249 | return landms 250 | 251 | 252 | def log_sum_exp(x): 253 | """Utility function for computing log_sum_exp while determining 254 | This will be used to determine unaveraged confidence loss across 255 | all examples in a batch. 256 | Args: 257 | x (Variable(tensor)): conf_preds from conf layers 258 | """ 259 | x_max = x.data.max() 260 | return torch.log(torch.sum(torch.exp(x-x_max), 1, keepdim=True)) + x_max 261 | 262 | 263 | # Original author: Francisco Massa: 264 | # https://github.com/fmassa/object-detection.torch 265 | # Ported to PyTorch by Max deGroot (02/01/2017) 266 | def nms(boxes, scores, overlap=0.5, top_k=200): 267 | """Apply non-maximum suppression at test time to avoid detecting too many 268 | overlapping bounding boxes for a given object. 269 | Args: 270 | boxes: (tensor) The location preds for the img, Shape: [num_priors,4]. 271 | scores: (tensor) The class predscores for the img, Shape:[num_priors]. 
272 | overlap: (float) The overlap thresh for suppressing unnecessary boxes. 273 | top_k: (int) The Maximum number of box preds to consider. 274 | Return: 275 | The indices of the kept boxes with respect to num_priors. 276 | """ 277 | 278 | keep = torch.Tensor(scores.size(0)).fill_(0).long() 279 | if boxes.numel() == 0: 280 | return keep 281 | x1 = boxes[:, 0] 282 | y1 = boxes[:, 1] 283 | x2 = boxes[:, 2] 284 | y2 = boxes[:, 3] 285 | area = torch.mul(x2 - x1, y2 - y1) 286 | v, idx = scores.sort(0) # sort in ascending order 287 | # I = I[v >= 0.01] 288 | idx = idx[-top_k:] # indices of the top-k largest vals 289 | xx1 = boxes.new() 290 | yy1 = boxes.new() 291 | xx2 = boxes.new() 292 | yy2 = boxes.new() 293 | w = boxes.new() 294 | h = boxes.new() 295 | 296 | # keep = torch.Tensor() 297 | count = 0 298 | while idx.numel() > 0: 299 | i = idx[-1] # index of current largest val 300 | # keep.append(i) 301 | keep[count] = i 302 | count += 1 303 | if idx.size(0) == 1: 304 | break 305 | idx = idx[:-1] # remove kept element from view 306 | # load bboxes of next highest vals 307 | torch.index_select(x1, 0, idx, out=xx1) 308 | torch.index_select(y1, 0, idx, out=yy1) 309 | torch.index_select(x2, 0, idx, out=xx2) 310 | torch.index_select(y2, 0, idx, out=yy2) 311 | # store element-wise max with next highest score 312 | xx1 = torch.clamp(xx1, min=x1[i]) 313 | yy1 = torch.clamp(yy1, min=y1[i]) 314 | xx2 = torch.clamp(xx2, max=x2[i]) 315 | yy2 = torch.clamp(yy2, max=y2[i]) 316 | w.resize_as_(xx2) 317 | h.resize_as_(yy2) 318 | w = xx2 - xx1 319 | h = yy2 - yy1 320 | # check sizes of xx1 and xx2.. after each iteration 321 | w = torch.clamp(w, min=0.0) 322 | h = torch.clamp(h, min=0.0) 323 | inter = w*h 324 | # IoU = i / (area(a) + area(b) - i) 325 | rem_areas = torch.index_select(area, 0, idx) # load remaining areas) 326 | union = (rem_areas - inter) + area[i] 327 | IoU = inter/union # store result in iou 328 | # keep only elements with an IoU <= overlap 329 | idx = idx[IoU.le(overlap)] 330 | return keep, count 331 | 332 | 333 | -------------------------------------------------------------------------------- /ibug/face_detection/retina_face/config.py: -------------------------------------------------------------------------------- 1 | # config.py 2 | 3 | cfg_mnet = { 4 | 'name': 'mobilenet0.25', 5 | 'min_sizes': [[16, 32], [64, 128], [256, 512]], 6 | 'steps': [8, 16, 32], 7 | 'variance': [0.1, 0.2], 8 | 'clip': False, 9 | 'loc_weight': 2.0, 10 | 'gpu_train': True, 11 | 'batch_size': 32, 12 | 'ngpu': 1, 13 | 'epoch': 250, 14 | 'decay1': 190, 15 | 'decay2': 220, 16 | 'image_size': 640, 17 | 'return_layers': {'stage1': 1, 'stage2': 2, 'stage3': 3}, 18 | 'in_channel': 32, 19 | 'out_channel': 64 20 | } 21 | 22 | cfg_re50 = { 23 | 'name': 'Resnet50', 24 | 'min_sizes': [[16, 32], [64, 128], [256, 512]], 25 | 'steps': [8, 16, 32], 26 | 'variance': [0.1, 0.2], 27 | 'clip': False, 28 | 'loc_weight': 2.0, 29 | 'gpu_train': True, 30 | 'batch_size': 24, 31 | 'ngpu': 4, 32 | 'epoch': 100, 33 | 'decay1': 70, 34 | 'decay2': 90, 35 | 'image_size': 840, 36 | 'return_layers': {'layer2': 1, 'layer3': 2, 'layer4': 3}, 37 | 'in_channel': 256, 38 | 'out_channel': 256 39 | } 40 | -------------------------------------------------------------------------------- /ibug/face_detection/retina_face/prior_box.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from itertools import product as product 3 | from math import ceil 4 | 5 | 6 | class PriorBox(object): 7 | def __init__(self, cfg, 
image_size=None): 8 | super(PriorBox, self).__init__() 9 | self.min_sizes = cfg['min_sizes'] 10 | self.steps = cfg['steps'] 11 | self.clip = cfg['clip'] 12 | self.image_size = image_size 13 | self.feature_maps = [[ceil(self.image_size[0]/step), ceil(self.image_size[1]/step)] for step in self.steps] 14 | self.name = "s" 15 | 16 | def forward(self): 17 | anchors = [] 18 | for k, f in enumerate(self.feature_maps): 19 | min_sizes = self.min_sizes[k] 20 | for i, j in product(range(f[0]), range(f[1])): 21 | for min_size in min_sizes: 22 | s_kx = min_size / self.image_size[1] 23 | s_ky = min_size / self.image_size[0] 24 | dense_cx = [x * self.steps[k] / self.image_size[1] for x in [j + 0.5]] 25 | dense_cy = [y * self.steps[k] / self.image_size[0] for y in [i + 0.5]] 26 | for cy, cx in product(dense_cy, dense_cx): 27 | anchors += [cx, cy, s_kx, s_ky] 28 | 29 | # back to torch land 30 | output = torch.Tensor(anchors).view(-1, 4) 31 | if self.clip: 32 | output.clamp_(max=1, min=0) 33 | return output 34 | -------------------------------------------------------------------------------- /ibug/face_detection/retina_face/py_cpu_nms.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | 10 | 11 | def py_cpu_nms(dets, thresh, top_k): 12 | """Pure Python NMS baseline.""" 13 | x1 = dets[:, 0] 14 | y1 = dets[:, 1] 15 | x2 = dets[:, 2] 16 | y2 = dets[:, 3] 17 | scores = dets[:, 4] 18 | 19 | areas = (x2 - x1 + 1) * (y2 - y1 + 1) 20 | order = scores.argsort()[: -top_k - 1: -1] 21 | 22 | keep = [] 23 | while order.size > 0: 24 | i = order[0] 25 | keep.append(i) 26 | xx1 = np.maximum(x1[i], x1[order[1:]]) 27 | yy1 = np.maximum(y1[i], y1[order[1:]]) 28 | xx2 = np.minimum(x2[i], x2[order[1:]]) 29 | yy2 = np.minimum(y2[i], y2[order[1:]]) 30 | 31 | w = np.maximum(0.0, xx2 - xx1 + 1) 32 | h = np.maximum(0.0, yy2 - yy1 + 1) 33 | inter = w * h 34 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 35 | 36 | inds = np.where(ovr <= thresh)[0] 37 | order = order[inds + 1] 38 | 39 | return keep 40 | -------------------------------------------------------------------------------- /ibug/face_detection/retina_face/retina_face.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torchvision.models as models 5 | import torchvision.models._utils as _utils 6 | from .retina_face_net import MobileNetV1, FPN, SSH 7 | 8 | 9 | class ClassHead(nn.Module): 10 | def __init__(self, inchannels=512, num_anchors=3): 11 | super(ClassHead, self).__init__() 12 | self.num_anchors = num_anchors 13 | self.conv1x1 = nn.Conv2d(inchannels, self.num_anchors*2, kernel_size=(1, 1), stride=1, padding=0) 14 | 15 | def forward(self, x): 16 | out = self.conv1x1(x) 17 | out = out.permute(0, 2, 3, 1).contiguous() 18 | 19 | return out.view(out.shape[0], -1, 2) 20 | 21 | 22 | class BboxHead(nn.Module): 23 | def __init__(self, inchannels=512, num_anchors=3): 24 | super(BboxHead, self).__init__() 25 | self.conv1x1 = nn.Conv2d(inchannels, num_anchors*4, kernel_size=(1, 1), stride=1,padding=0) 26 | 27 | def forward(self, x): 28 | out = self.conv1x1(x) 29 | out = out.permute(0, 2, 3, 1).contiguous() 30 | 31 | return 
out.view(out.shape[0], -1, 4) 32 | 33 | 34 | class LandmarkHead(nn.Module): 35 | def __init__(self, inchannels=512, num_anchors=3): 36 | super(LandmarkHead, self).__init__() 37 | self.conv1x1 = nn.Conv2d(inchannels,num_anchors*10, kernel_size=(1, 1), stride=1, padding=0) 38 | 39 | def forward(self, x): 40 | out = self.conv1x1(x) 41 | out = out.permute(0, 2, 3, 1).contiguous() 42 | 43 | return out.view(out.shape[0], -1, 10) 44 | 45 | 46 | class RetinaFace(nn.Module): 47 | def __init__(self, cfg=None, phase='train'): 48 | """ 49 | :param cfg: Network related settings. 50 | :param phase: train or test. 51 | """ 52 | super(RetinaFace, self).__init__() 53 | self.phase = phase 54 | backbone = None 55 | if cfg['name'] == 'mobilenet0.25': 56 | backbone = MobileNetV1() 57 | elif cfg['name'] == 'Resnet50': 58 | backbone = models.resnet50() 59 | 60 | self.body = _utils.IntermediateLayerGetter(backbone, cfg['return_layers']) 61 | in_channels_stage2 = cfg['in_channel'] 62 | in_channels_list = [ 63 | in_channels_stage2 * 2, 64 | in_channels_stage2 * 4, 65 | in_channels_stage2 * 8, 66 | ] 67 | out_channels = cfg['out_channel'] 68 | self.fpn = FPN(in_channels_list,out_channels) 69 | self.ssh1 = SSH(out_channels, out_channels) 70 | self.ssh2 = SSH(out_channels, out_channels) 71 | self.ssh3 = SSH(out_channels, out_channels) 72 | 73 | self.ClassHead = self._make_class_head(fpn_num=3, inchannels=cfg['out_channel']) 74 | self.BboxHead = self._make_bbox_head(fpn_num=3, inchannels=cfg['out_channel']) 75 | self.LandmarkHead = self._make_landmark_head(fpn_num=3, inchannels=cfg['out_channel']) 76 | 77 | def _make_class_head(self, fpn_num=3, inchannels=64, anchor_num=2): 78 | classhead = nn.ModuleList() 79 | for i in range(fpn_num): 80 | classhead.append(ClassHead(inchannels, anchor_num)) 81 | return classhead 82 | 83 | def _make_bbox_head(self, fpn_num=3, inchannels=64, anchor_num=2): 84 | bboxhead = nn.ModuleList() 85 | for i in range(fpn_num): 86 | bboxhead.append(BboxHead(inchannels, anchor_num)) 87 | return bboxhead 88 | 89 | def _make_landmark_head(self, fpn_num=3, inchannels=64, anchor_num=2): 90 | landmarkhead = nn.ModuleList() 91 | for i in range(fpn_num): 92 | landmarkhead.append(LandmarkHead(inchannels, anchor_num)) 93 | return landmarkhead 94 | 95 | def forward(self, inputs): 96 | out = self.body(inputs) 97 | 98 | # FPN 99 | fpn = self.fpn(out) 100 | 101 | # SSH 102 | feature1 = self.ssh1(fpn[0]) 103 | feature2 = self.ssh2(fpn[1]) 104 | feature3 = self.ssh3(fpn[2]) 105 | features = [feature1, feature2, feature3] 106 | 107 | bbox_regressions = torch.cat([self.BboxHead[i](feature) for i, feature in enumerate(features)], dim=1) 108 | classifications = torch.cat([self.ClassHead[i](feature) for i, feature in enumerate(features)], dim=1) 109 | ldm_regressions = torch.cat([self.LandmarkHead[i](feature) for i, feature in enumerate(features)], dim=1) 110 | 111 | if self.phase == 'train': 112 | output = (bbox_regressions, classifications, ldm_regressions) 113 | else: 114 | output = (bbox_regressions, F.softmax(classifications, dim=-1), ldm_regressions) 115 | return output 116 | -------------------------------------------------------------------------------- /ibug/face_detection/retina_face/retina_face_net.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | def conv_bn(inp, oup, stride = 1, leaky = 0): 7 | return nn.Sequential( 8 | nn.Conv2d(inp, oup, 3, stride, 1, bias=False), 9 | 
nn.BatchNorm2d(oup), 10 | nn.LeakyReLU(negative_slope=leaky, inplace=True) 11 | ) 12 | 13 | 14 | def conv_bn_no_relu(inp, oup, stride): 15 | return nn.Sequential( 16 | nn.Conv2d(inp, oup, 3, stride, 1, bias=False), 17 | nn.BatchNorm2d(oup), 18 | ) 19 | 20 | 21 | def conv_bn1X1(inp, oup, stride, leaky=0): 22 | return nn.Sequential( 23 | nn.Conv2d(inp, oup, 1, stride, padding=0, bias=False), 24 | nn.BatchNorm2d(oup), 25 | nn.LeakyReLU(negative_slope=leaky, inplace=True) 26 | ) 27 | 28 | 29 | def conv_dw(inp, oup, stride, leaky=0.1): 30 | return nn.Sequential( 31 | nn.Conv2d(inp, inp, 3, stride, 1, groups=inp, bias=False), 32 | nn.BatchNorm2d(inp), 33 | nn.LeakyReLU(negative_slope=leaky, inplace=True), 34 | 35 | nn.Conv2d(inp, oup, 1, 1, 0, bias=False), 36 | nn.BatchNorm2d(oup), 37 | nn.LeakyReLU(negative_slope=leaky, inplace=True), 38 | ) 39 | 40 | 41 | class SSH(nn.Module): 42 | def __init__(self, in_channel, out_channel): 43 | super(SSH, self).__init__() 44 | assert out_channel % 4 == 0 45 | leaky = 0 46 | if out_channel <= 64: 47 | leaky = 0.1 48 | self.conv3X3 = conv_bn_no_relu(in_channel, out_channel//2, stride=1) 49 | 50 | self.conv5X5_1 = conv_bn(in_channel, out_channel//4, stride=1, leaky = leaky) 51 | self.conv5X5_2 = conv_bn_no_relu(out_channel//4, out_channel//4, stride=1) 52 | 53 | self.conv7X7_2 = conv_bn(out_channel//4, out_channel//4, stride=1, leaky = leaky) 54 | self.conv7x7_3 = conv_bn_no_relu(out_channel//4, out_channel//4, stride=1) 55 | 56 | def forward(self, input): 57 | conv3X3 = self.conv3X3(input) 58 | 59 | conv5X5_1 = self.conv5X5_1(input) 60 | conv5X5 = self.conv5X5_2(conv5X5_1) 61 | 62 | conv7X7_2 = self.conv7X7_2(conv5X5_1) 63 | conv7X7 = self.conv7x7_3(conv7X7_2) 64 | 65 | out = torch.cat([conv3X3, conv5X5, conv7X7], dim=1) 66 | out = F.relu(out) 67 | return out 68 | 69 | 70 | class FPN(nn.Module): 71 | def __init__(self,in_channels_list,out_channels): 72 | super(FPN,self).__init__() 73 | leaky = 0 74 | if out_channels <= 64: 75 | leaky = 0.1 76 | self.output1 = conv_bn1X1(in_channels_list[0], out_channels, stride=1, leaky=leaky) 77 | self.output2 = conv_bn1X1(in_channels_list[1], out_channels, stride=1, leaky=leaky) 78 | self.output3 = conv_bn1X1(in_channels_list[2], out_channels, stride=1, leaky=leaky) 79 | 80 | self.merge1 = conv_bn(out_channels, out_channels, leaky=leaky) 81 | self.merge2 = conv_bn(out_channels, out_channels, leaky=leaky) 82 | 83 | def forward(self, input): 84 | # names = list(input.keys()) 85 | input = list(input.values()) 86 | 87 | output1 = self.output1(input[0]) 88 | output2 = self.output2(input[1]) 89 | output3 = self.output3(input[2]) 90 | 91 | up3 = F.interpolate(output3, size=[output2.size(2), output2.size(3)], mode="nearest") 92 | output2 = output2 + up3 93 | output2 = self.merge2(output2) 94 | 95 | up2 = F.interpolate(output2, size=[output1.size(2), output1.size(3)], mode="nearest") 96 | output1 = output1 + up2 97 | output1 = self.merge1(output1) 98 | 99 | out = [output1, output2, output3] 100 | return out 101 | 102 | 103 | class MobileNetV1(nn.Module): 104 | def __init__(self): 105 | super(MobileNetV1, self).__init__() 106 | self.stage1 = nn.Sequential( 107 | conv_bn(3, 8, 2, leaky=0.1), # 3 108 | conv_dw(8, 16, 1), # 7 109 | conv_dw(16, 32, 2), # 11 110 | conv_dw(32, 32, 1), # 19 111 | conv_dw(32, 64, 2), # 27 112 | conv_dw(64, 64, 1), # 43 113 | ) 114 | self.stage2 = nn.Sequential( 115 | conv_dw(64, 128, 2), # 43 + 16 = 59 116 | conv_dw(128, 128, 1), # 59 + 32 = 91 117 | conv_dw(128, 128, 1), # 91 + 32 = 123 118 | conv_dw(128, 
128, 1), # 123 + 32 = 155 119 | conv_dw(128, 128, 1), # 155 + 32 = 187 120 | conv_dw(128, 128, 1), # 187 + 32 = 219 121 | ) 122 | self.stage3 = nn.Sequential( 123 | conv_dw(128, 256, 2), # 219 +3 2 = 241 124 | conv_dw(256, 256, 1), # 241 + 64 = 301 125 | ) 126 | self.avg = nn.AdaptiveAvgPool2d((1,1)) 127 | self.fc = nn.Linear(256, 1000) 128 | 129 | def forward(self, x): 130 | x = self.stage1(x) 131 | x = self.stage2(x) 132 | x = self.stage3(x) 133 | x = self.avg(x) 134 | # x = self.model(x) 135 | x = x.view(-1, 256) 136 | x = self.fc(x) 137 | return x 138 | -------------------------------------------------------------------------------- /ibug/face_detection/retina_face/retina_face_predictor.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import numpy as np 4 | from copy import deepcopy 5 | from types import SimpleNamespace 6 | from typing import Union, Optional 7 | from .prior_box import PriorBox 8 | from .py_cpu_nms import py_cpu_nms 9 | from .retina_face import RetinaFace 10 | from .config import cfg_mnet, cfg_re50 11 | from .box_utils import decode, decode_landm 12 | 13 | 14 | __all__ = ['RetinaFacePredictor'] 15 | 16 | 17 | class RetinaFacePredictor(object): 18 | def __init__(self, threshold: float = 0.8, device: Union[str, torch.device] = 'cuda:0', 19 | model: Optional[SimpleNamespace] = None, config: Optional[SimpleNamespace] = None) -> None: 20 | self.threshold = threshold 21 | self.device = device 22 | if model is None: 23 | model = RetinaFacePredictor.get_model() 24 | if config is None: 25 | config = RetinaFacePredictor.create_config() 26 | self.config = SimpleNamespace(**model.config.__dict__, **config.__dict__) 27 | self.net = RetinaFace(cfg=self.config.__dict__, phase='test').to(self.device) 28 | pretrained_dict = torch.load(model.weights, map_location=self.device) 29 | if 'state_dict' in pretrained_dict.keys(): 30 | pretrained_dict = {key.split('module.', 1)[-1] if key.startswith('module.') else key: value 31 | for key, value in pretrained_dict['state_dict'].items()} 32 | else: 33 | pretrained_dict = {key.split('module.', 1)[-1] if key.startswith('module.') else key: value 34 | for key, value in pretrained_dict.items()} 35 | self.net.load_state_dict(pretrained_dict, strict=False) 36 | self.net.eval() 37 | self.priors = None 38 | self.previous_size = None 39 | 40 | @staticmethod 41 | def get_model(name: str = 'resnet50') -> SimpleNamespace: 42 | name = name.lower().strip() 43 | if name == 'resnet50': 44 | return SimpleNamespace(weights=os.path.realpath(os.path.join(os.path.dirname(__file__), 45 | 'weights', 'Resnet50_Final.pth')), 46 | config=SimpleNamespace(**deepcopy(cfg_re50))) 47 | elif name == 'mobilenet0.25': 48 | return SimpleNamespace(weights=os.path.realpath(os.path.join(os.path.dirname(__file__), 49 | 'weights', 'mobilenet0.25_Final.pth')), 50 | config=SimpleNamespace(**deepcopy(cfg_mnet))) 51 | else: 52 | raise ValueError('name must be set to either resnet50 or mobilenet0.25') 53 | 54 | @staticmethod 55 | def create_config(top_k: int = 750, conf_thresh: float = 0.02, 56 | nms_thresh: float = 0.4, nms_top_k: int = 5000) -> SimpleNamespace: 57 | return SimpleNamespace(top_k=top_k, conf_thresh=conf_thresh, nms_thresh=nms_thresh, nms_top_k=nms_top_k) 58 | 59 | @torch.no_grad() 60 | def __call__(self, image: np.ndarray, rgb: bool = True) -> np.ndarray: 61 | im_height, im_width, _ = image.shape 62 | if rgb: 63 | image = image[..., ::-1] 64 | image = image.astype(int) - np.array([104, 117, 123]) 65 | 
image = image.transpose(2, 0, 1) 66 | image = torch.from_numpy(image).unsqueeze(0).float().to(self.device) 67 | scale = torch.Tensor([im_width, im_height, im_width, im_height]).to(self.device) 68 | loc, conf, landms = self.net(image) 69 | image_size = (im_height, im_width) 70 | if self.priors is None or self.previous_size != image_size: 71 | self.priors = PriorBox(self.config.__dict__, image_size=image_size).forward().to(self.device) 72 | self.previous_size = image_size 73 | prior_data = self.priors.data 74 | boxes = decode(loc.data.squeeze(0), prior_data, self.config.variance) 75 | boxes = boxes * scale 76 | boxes = boxes.cpu().numpy() 77 | scores = conf.squeeze(0).data.cpu().numpy()[:, 1] 78 | landms = decode_landm(landms.data.squeeze(0), prior_data, self.config.variance) 79 | scale1 = torch.Tensor([image.shape[3], image.shape[2], image.shape[3], image.shape[2], 80 | image.shape[3], image.shape[2], image.shape[3], image.shape[2], 81 | image.shape[3], image.shape[2]]).to(self.device) 82 | landms = landms * scale1 83 | landms = landms.cpu().numpy() 84 | 85 | # ignore low scores 86 | inds = np.where(scores > self.config.conf_thresh)[0] 87 | if len(inds) == 0: 88 | return np.empty(shape=(0, 15), dtype=np.float32) 89 | boxes = boxes[inds] 90 | landms = landms[inds] 91 | scores = scores[inds] 92 | 93 | # do NMS 94 | dets = np.hstack((boxes, scores[:, np.newaxis])).astype(np.float32, copy=False) 95 | keep = py_cpu_nms(dets, self.config.nms_thresh, self.config.nms_top_k) 96 | dets = dets[keep, :] 97 | landms = landms[keep] 98 | 99 | # keep top-K 100 | dets = dets[:self.config.top_k, :] 101 | landms = landms[:self.config.top_k, :] 102 | dets = np.concatenate((dets, landms), axis=1) 103 | 104 | # further filter by confidence 105 | inds = np.where(dets[:, 4] >= self.threshold)[0] 106 | if len(inds) == 0: 107 | return np.empty(shape=(0, 15), dtype=np.float32) 108 | else: 109 | return dets[inds] 110 | -------------------------------------------------------------------------------- /ibug/face_detection/retina_face/weights/Resnet50_Final.pth: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:6d1de9c2944f2ccddca5f5e010ea5ae64a39845a86311af6fdf30841b0a5a16d 3 | size 109497761 4 | -------------------------------------------------------------------------------- /ibug/face_detection/retina_face/weights/mobilenet0.25_Final.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hhj1897/face_detection/db2a4e8eae8c9c53385ff0773e9db08f03cf21ad/ibug/face_detection/retina_face/weights/mobilenet0.25_Final.pth -------------------------------------------------------------------------------- /ibug/face_detection/s3fd/__init__.py: -------------------------------------------------------------------------------- 1 | from .s3fd_predictor import S3FDPredictor 2 | -------------------------------------------------------------------------------- /ibug/face_detection/s3fd/s3fd_net.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.init as init 4 | import torch.nn.functional as F 5 | from .utils import Detect, PriorBox 6 | 7 | 8 | class L2Norm(nn.Module): 9 | 10 | def __init__(self, n_channels, scale): 11 | super(L2Norm, self).__init__() 12 | self.n_channels = n_channels 13 | self.gamma = scale or None 14 | self.eps = 1e-10 15 | self.weight = 
nn.Parameter(torch.Tensor(self.n_channels)) 16 | self.reset_parameters() 17 | 18 | def reset_parameters(self): 19 | init.constant_(self.weight, self.gamma) 20 | 21 | def forward(self, x): 22 | norm = x.pow(2).sum(dim=1, keepdim=True).sqrt() + self.eps 23 | x = torch.div(x, norm) 24 | out = self.weight.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as(x) * x 25 | return out 26 | 27 | 28 | class S3FDNet(nn.Module): 29 | 30 | def __init__(self, config, device='cuda'): 31 | super(S3FDNet, self).__init__() 32 | self.config = config 33 | self.device = device 34 | 35 | self.vgg = nn.ModuleList([ 36 | nn.Conv2d(3, 64, 3, 1, padding=1), 37 | nn.ReLU(inplace=True), 38 | nn.Conv2d(64, 64, 3, 1, padding=1), 39 | nn.ReLU(inplace=True), 40 | nn.MaxPool2d(2, 2), 41 | 42 | nn.Conv2d(64, 128, 3, 1, padding=1), 43 | nn.ReLU(inplace=True), 44 | nn.Conv2d(128, 128, 3, 1, padding=1), 45 | nn.ReLU(inplace=True), 46 | nn.MaxPool2d(2, 2), 47 | 48 | nn.Conv2d(128, 256, 3, 1, padding=1), 49 | nn.ReLU(inplace=True), 50 | nn.Conv2d(256, 256, 3, 1, padding=1), 51 | nn.ReLU(inplace=True), 52 | nn.Conv2d(256, 256, 3, 1, padding=1), 53 | nn.ReLU(inplace=True), 54 | nn.MaxPool2d(2, 2, ceil_mode=True), 55 | 56 | nn.Conv2d(256, 512, 3, 1, padding=1), 57 | nn.ReLU(inplace=True), 58 | nn.Conv2d(512, 512, 3, 1, padding=1), 59 | nn.ReLU(inplace=True), 60 | nn.Conv2d(512, 512, 3, 1, padding=1), 61 | nn.ReLU(inplace=True), 62 | nn.MaxPool2d(2, 2), 63 | 64 | nn.Conv2d(512, 512, 3, 1, padding=1), 65 | nn.ReLU(inplace=True), 66 | nn.Conv2d(512, 512, 3, 1, padding=1), 67 | nn.ReLU(inplace=True), 68 | nn.Conv2d(512, 512, 3, 1, padding=1), 69 | nn.ReLU(inplace=True), 70 | nn.MaxPool2d(2, 2), 71 | 72 | nn.Conv2d(512, 1024, 3, 1, padding=6, dilation=6), 73 | nn.ReLU(inplace=True), 74 | nn.Conv2d(1024, 1024, 1, 1), 75 | nn.ReLU(inplace=True), 76 | ]) 77 | 78 | self.L2Norm3_3 = L2Norm(256, 10) 79 | self.L2Norm4_3 = L2Norm(512, 8) 80 | self.L2Norm5_3 = L2Norm(512, 5) 81 | 82 | self.extras = nn.ModuleList([ 83 | nn.Conv2d(1024, 256, 1, 1), 84 | nn.Conv2d(256, 512, 3, 2, padding=1), 85 | nn.Conv2d(512, 128, 1, 1), 86 | nn.Conv2d(128, 256, 3, 2, padding=1), 87 | ]) 88 | 89 | self.loc = nn.ModuleList([ 90 | nn.Conv2d(256, 4, 3, 1, padding=1), 91 | nn.Conv2d(512, 4, 3, 1, padding=1), 92 | nn.Conv2d(512, 4, 3, 1, padding=1), 93 | nn.Conv2d(1024, 4, 3, 1, padding=1), 94 | nn.Conv2d(512, 4, 3, 1, padding=1), 95 | nn.Conv2d(256, 4, 3, 1, padding=1), 96 | ]) 97 | 98 | self.conf = nn.ModuleList([ 99 | nn.Conv2d(256, 4, 3, 1, padding=1), 100 | nn.Conv2d(512, 2, 3, 1, padding=1), 101 | nn.Conv2d(512, 2, 3, 1, padding=1), 102 | nn.Conv2d(1024, 2, 3, 1, padding=1), 103 | nn.Conv2d(512, 2, 3, 1, padding=1), 104 | nn.Conv2d(256, 2, 3, 1, padding=1), 105 | ]) 106 | 107 | self.priors = None 108 | self.previous_size = None 109 | 110 | self.softmax = nn.Softmax(dim=-1) 111 | self.detect = Detect(self.config) 112 | 113 | def forward(self, x): 114 | size = x.size()[2:] 115 | sources = list() 116 | loc = list() 117 | conf = list() 118 | 119 | for k in range(16): 120 | x = self.vgg[k](x) 121 | s = self.L2Norm3_3(x) 122 | sources.append(s) 123 | 124 | for k in range(16, 23): 125 | x = self.vgg[k](x) 126 | s = self.L2Norm4_3(x) 127 | sources.append(s) 128 | 129 | for k in range(23, 30): 130 | x = self.vgg[k](x) 131 | s = self.L2Norm5_3(x) 132 | sources.append(s) 133 | 134 | for k in range(30, len(self.vgg)): 135 | x = self.vgg[k](x) 136 | sources.append(x) 137 | 138 | # apply extra layers and cache source layer outputs 139 | for k, v in enumerate(self.extras): 140 | x 
= F.relu(v(x), inplace=True) 141 | if k % 2 == 1: 142 | sources.append(x) 143 | 144 | # apply multibox head to source layers 145 | loc_x = self.loc[0](sources[0]) 146 | conf_x = self.conf[0](sources[0]) 147 | 148 | max_conf, _ = torch.max(conf_x[:, 0:3, :, :], dim=1, keepdim=True) 149 | conf_x = torch.cat((max_conf, conf_x[:, 3:, :, :]), dim=1) 150 | 151 | loc.append(loc_x.permute(0, 2, 3, 1).contiguous()) 152 | conf.append(conf_x.permute(0, 2, 3, 1).contiguous()) 153 | 154 | for i in range(1, len(sources)): 155 | x = sources[i] 156 | conf.append(self.conf[i](x).permute(0, 2, 3, 1).contiguous()) 157 | loc.append(self.loc[i](x).permute(0, 2, 3, 1).contiguous()) 158 | 159 | if self.priors is None or self.previous_size != size: 160 | with torch.no_grad(): 161 | features_maps = [] 162 | for i in range(len(loc)): 163 | feat = [] 164 | feat += [loc[i].size(1), loc[i].size(2)] 165 | features_maps += [feat] 166 | self.priors = PriorBox(size, features_maps, self.config).forward().to(self.device) 167 | self.previous_size = size 168 | 169 | loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1) 170 | conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1) 171 | conf = self.softmax(conf.view(conf.size(0), -1, 2)) 172 | 173 | output = self.detect(loc.view(loc.size(0), -1, 4), conf, self.priors) 174 | 175 | return output 176 | -------------------------------------------------------------------------------- /ibug/face_detection/s3fd/s3fd_predictor.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import numpy as np 4 | from types import SimpleNamespace 5 | from typing import Union, Optional 6 | from .s3fd_net import S3FDNet 7 | 8 | 9 | __all__ = ['S3FDPredictor'] 10 | 11 | 12 | class S3FDPredictor(object): 13 | def __init__(self, threshold: float = 0.8, device: Union[str, torch.device] = 'cuda:0', 14 | model: Optional[SimpleNamespace] = None, config: Optional[SimpleNamespace] = None) -> None: 15 | self.threshold = threshold 16 | self.device = device 17 | if model is None: 18 | model = S3FDPredictor.get_model() 19 | if config is None: 20 | config = S3FDPredictor.create_config() 21 | self.config = SimpleNamespace(**model.config.__dict__, **config.__dict__) 22 | self.net = S3FDNet(config=self.config, device=self.device).to(self.device) 23 | self.net.load_state_dict(torch.load(model.weights, map_location=self.device)) 24 | self.net.eval() 25 | 26 | @staticmethod 27 | def get_model(name: str = 's3fd') -> SimpleNamespace: 28 | name = name.lower().strip() 29 | if name == 's3fd': 30 | return SimpleNamespace(weights=os.path.realpath(os.path.join(os.path.dirname(__file__), 31 | 'weights', 's3fd_weights.pth')), 32 | config=SimpleNamespace(num_classes=2, variance=(0.1, 0.2), 33 | prior_min_sizes=(16, 32, 64, 128, 256, 512), 34 | prior_steps=(4, 8, 16, 32, 64, 128), prior_clip=False)) 35 | else: 36 | raise ValueError('name must be set to s3fd') 37 | 38 | @staticmethod 39 | def create_config(top_k: int = 750, conf_thresh: float = 0.05,nms_thresh: float = 0.3, 40 | nms_top_k: int = 5000, use_nms_np: bool = True) -> SimpleNamespace: 41 | return SimpleNamespace(top_k=top_k, conf_thresh=conf_thresh, nms_thresh=nms_thresh, 42 | nms_top_k=nms_top_k, use_nms_np=use_nms_np) 43 | 44 | @torch.no_grad() 45 | def __call__(self, image: np.ndarray, rgb: bool = True) -> np.ndarray: 46 | w, h = image.shape[1], image.shape[0] 47 | if not rgb: 48 | image = image[..., ::-1] 49 | image = image.astype(int) - np.array([123, 117, 104]) 50 | image = image.transpose(2, 
0, 1) 51 | image = image.reshape((1,) + image.shape) 52 | image = torch.from_numpy(image).float().to(self.device) 53 | 54 | bboxes = [] 55 | detections = self.net(image) 56 | scale = torch.Tensor([w, h, w, h]).to(detections.device) 57 | for i in range(detections.size(1)): 58 | j = 0 59 | while detections[0, i, j, 0] >= self.threshold: 60 | score = detections[0, i, j, 0] 61 | pt = (detections[0, i, j, 1:] * scale).cpu().numpy() 62 | bbox = (pt[0], pt[1], pt[2], pt[3], score) 63 | bboxes.append(bbox) 64 | j += 1 65 | if len(bboxes) > 0: 66 | return np.array(bboxes) 67 | else: 68 | return np.empty(shape=(0, 5), dtype=np.float32) 69 | -------------------------------------------------------------------------------- /ibug/face_detection/s3fd/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from itertools import product 4 | 5 | 6 | def decode(loc, priors, variances): 7 | """Decode locations from predictions using priors to undo 8 | the encoding we did for offset regression at train time. 9 | Args: 10 | loc (tensor): location predictions for loc layers, 11 | Shape: [num_priors,4] 12 | priors (tensor): Prior boxes in center-offset form. 13 | Shape: [num_priors,4]. 14 | variances: (list[float]) Variances of priorboxes 15 | Return: 16 | decoded bounding box predictions 17 | """ 18 | 19 | boxes = torch.cat(( 20 | priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:], 21 | priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1) 22 | boxes[:, :2] -= boxes[:, 2:] / 2 23 | boxes[:, 2:] += boxes[:, :2] 24 | return boxes 25 | 26 | 27 | def nms(boxes, scores, overlap=0.5, top_k=200): 28 | """Apply non-maximum suppression at test time to avoid detecting too many 29 | overlapping bounding boxes for a given object. 30 | Args: 31 | boxes: (tensor) The location preds for the img, Shape: [num_priors,4]. 32 | scores: (tensor) The class predscores for the img, Shape:[num_priors]. 33 | overlap: (float) The overlap thresh for suppressing unnecessary boxes. 34 | top_k: (int) The Maximum number of box preds to consider. 35 | Return: 36 | The indices of the kept boxes with respect to num_priors. 37 | """ 38 | 39 | keep = scores.new(scores.size(0)).zero_().long() 40 | if boxes.numel() == 0: 41 | return keep, 0 42 | x1 = boxes[:, 0] 43 | y1 = boxes[:, 1] 44 | x2 = boxes[:, 2] 45 | y2 = boxes[:, 3] 46 | area = torch.mul(x2 - x1, y2 - y1) 47 | v, idx = scores.sort(0) # sort in ascending order 48 | # I = I[v >= 0.01] 49 | idx = idx[-top_k:] # indices of the top-k largest vals 50 | xx1 = boxes.new() 51 | yy1 = boxes.new() 52 | xx2 = boxes.new() 53 | yy2 = boxes.new() 54 | w = boxes.new() 55 | h = boxes.new() 56 | 57 | # keep = torch.Tensor() 58 | count = 0 59 | while idx.numel() > 0: 60 | i = idx[-1] # index of current largest val 61 | # keep.append(i) 62 | keep[count] = i 63 | count += 1 64 | if idx.size(0) == 1: 65 | break 66 | idx = idx[:-1] # remove kept element from view 67 | # load bboxes of next highest vals 68 | torch.index_select(x1, 0, idx, out=xx1) 69 | torch.index_select(y1, 0, idx, out=yy1) 70 | torch.index_select(x2, 0, idx, out=xx2) 71 | torch.index_select(y2, 0, idx, out=yy2) 72 | # store element-wise max with next highest score 73 | xx1 = torch.clamp(xx1, min=x1[i]) 74 | yy1 = torch.clamp(yy1, min=y1[i]) 75 | xx2 = torch.clamp(xx2, max=x2[i]) 76 | yy2 = torch.clamp(yy2, max=y2[i]) 77 | w.resize_as_(xx2) 78 | h.resize_as_(yy2) 79 | w = xx2 - xx1 80 | h = yy2 - yy1 81 | # check sizes of xx1 and xx2.. 
after each iteration 82 | w = torch.clamp(w, min=0.0) 83 | h = torch.clamp(h, min=0.0) 84 | inter = w * h 85 | # IoU = i / (area(a) + area(b) - i) 86 | rem_areas = torch.index_select(area, 0, idx) # load remaining areas) 87 | union = (rem_areas - inter) + area[i] 88 | IoU = inter / union # store result in iou 89 | # keep only elements with an IoU <= overlap 90 | idx = idx[IoU.le(overlap)] 91 | return keep, count 92 | 93 | 94 | def nms_np(boxes, scores, overlap=0.5, top_k=200): 95 | """Apply non-maximum suppression at test time to avoid detecting too many 96 | overlapping bounding boxes for a given object, using numpy (for speed). 97 | Args: 98 | boxes: (tensor) The location preds for the img, Shape: [num_priors,4]. 99 | scores: (tensor) The class predscores for the img, Shape:[num_priors]. 100 | overlap: (float) The overlap thresh for suppressing unnecessary boxes. 101 | top_k: (int) The Maximum number of box preds to consider. 102 | Return: 103 | The indices of the kept boxes with respect to num_priors. 104 | """ 105 | 106 | if scores.size(0) == 0: 107 | return [], 0 108 | else: 109 | areas = torch.mul(boxes[:, 2] - boxes[:, 0], boxes[:, 3] - boxes[:, 1]).cpu().numpy() 110 | x1, y1 = boxes[:, 0].cpu().numpy(), boxes[:, 1].cpu().numpy() 111 | x2, y2 = boxes[:, 2].cpu().numpy(), boxes[:, 3].cpu().numpy() 112 | scores = scores.cpu().numpy() 113 | order = scores.argsort()[: -top_k - 1: -1] 114 | 115 | keep = [] 116 | while order.size > 0: 117 | i = order[0] 118 | keep.append(i) 119 | xx1, yy1 = np.maximum(x1[i], x1[order[1:]]), np.maximum(y1[i], y1[order[1:]]) 120 | xx2, yy2 = np.minimum(x2[i], x2[order[1:]]), np.minimum(y2[i], y2[order[1:]]) 121 | 122 | w, h = np.maximum(0.0, xx2 - xx1), np.maximum(0.0, yy2 - yy1) 123 | ovr = w * h / (areas[i] + areas[order[1:]] - w * h) 124 | 125 | inds = np.where(ovr <= overlap)[0] 126 | order = order[inds + 1] 127 | 128 | return keep, len(keep) 129 | 130 | 131 | class Detect(object): 132 | 133 | def __init__(self, config): 134 | 135 | self.config = config 136 | 137 | def __call__(self, loc_data, conf_data, prior_data): 138 | 139 | num = loc_data.size(0) 140 | num_priors = prior_data.size(0) 141 | 142 | conf_preds = conf_data.view(num, num_priors, self.config.num_classes).transpose(2, 1) 143 | batch_priors = prior_data.view(-1, num_priors, 4).expand(num, num_priors, 4) 144 | batch_priors = batch_priors.contiguous().view(-1, 4) 145 | 146 | decoded_boxes = decode(loc_data.view(-1, 4), batch_priors, self.config.variance) 147 | decoded_boxes = decoded_boxes.view(num, num_priors, 4) 148 | 149 | output = torch.zeros(num, self.config.num_classes, self.config.top_k, 5) 150 | 151 | for i in range(num): 152 | boxes = decoded_boxes[i].clone() 153 | conf_scores = conf_preds[i].clone() 154 | 155 | for cl in range(1, self.config.num_classes): 156 | c_mask = conf_scores[cl].gt(self.config.conf_thresh) 157 | scores = conf_scores[cl][c_mask] 158 | 159 | if scores.dim() == 0: 160 | continue 161 | l_mask = c_mask.unsqueeze(1).expand_as(boxes) 162 | boxes_ = boxes[l_mask].view(-1, 4) 163 | if self.config.use_nms_np: 164 | ids, count = nms_np(boxes_, scores, self.config.nms_thresh, self.config.nms_top_k) 165 | else: 166 | ids, count = nms(boxes_, scores, self.config.nms_thresh, self.config.nms_top_k) 167 | count = count if count < self.config.top_k else self.config.top_k 168 | 169 | output[i, cl, :count] = torch.cat((scores[ids[:count]].unsqueeze(1), boxes_[ids[:count]]), 1) 170 | 171 | return output 172 | 173 | 174 | class PriorBox(object): 175 | 176 | def __init__(self, 
input_size, feature_maps, config): 177 | 178 | self.imh = input_size[0] 179 | self.imw = input_size[1] 180 | self.feature_maps = feature_maps 181 | 182 | self.config = config 183 | 184 | def forward(self): 185 | mean = [] 186 | for k, fmap in enumerate(self.feature_maps): 187 | feath = fmap[0] 188 | featw = fmap[1] 189 | for i, j in product(range(feath), range(featw)): 190 | f_kw = self.imw / self.config.prior_steps[k] 191 | f_kh = self.imh / self.config.prior_steps[k] 192 | 193 | cx = (j + 0.5) / f_kw 194 | cy = (i + 0.5) / f_kh 195 | 196 | s_kw = self.config.prior_min_sizes[k] / self.imw 197 | s_kh = self.config.prior_min_sizes[k] / self.imh 198 | 199 | mean += [cx, cy, s_kw, s_kh] 200 | 201 | output = torch.FloatTensor(mean).view(-1, 4) 202 | 203 | if self.config.prior_clip: 204 | output.clamp_(max=1, min=0) 205 | 206 | return output 207 | -------------------------------------------------------------------------------- /ibug/face_detection/s3fd/weights/s3fd_weights.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hhj1897/face_detection/db2a4e8eae8c9c53385ff0773e9db08f03cf21ad/ibug/face_detection/s3fd/weights/s3fd_weights.pth -------------------------------------------------------------------------------- /ibug/face_detection/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .head_pose_estimator import HeadPoseEstimator 2 | from .simple_face_tracker import SimpleFaceTracker 3 | -------------------------------------------------------------------------------- /ibug/face_detection/utils/data/bfm_lms.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hhj1897/face_detection/db2a4e8eae8c9c53385ff0773e9db08f03cf21ad/ibug/face_detection/utils/data/bfm_lms.npy -------------------------------------------------------------------------------- /ibug/face_detection/utils/head_pose_estimator.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import math 4 | import numpy as np 5 | from typing import Optional, Tuple 6 | 7 | 8 | __all__ = ['HeadPoseEstimator'] 9 | 10 | 11 | class HeadPoseEstimator(object): 12 | def __init__(self, mean_shape_path: str = os.path.join(os.path.dirname(__file__), 13 | 'data', 'bfm_lms.npy')) -> None: 14 | # Load the 68-point mean shape derived from BFM 15 | mean_shape = np.load(mean_shape_path) 16 | 17 | # Calculate the 5-points mean shape 18 | left_eye = mean_shape[[37, 38, 40, 41]].mean(axis=0) 19 | right_eye = mean_shape[[43, 44, 46, 47]].mean(axis=0) 20 | self._mean_shape_5pts = np.vstack((left_eye, right_eye, mean_shape[[30, 48, 54]])) 21 | 22 | # Flip the y coordinates of the mean shape to match that of the image coordinate system 23 | self._mean_shape_5pts[:, 1] = -self._mean_shape_5pts[:, 1] 24 | 25 | def __call__(self, landmarks: np.ndarray, image_width: int = 0, image_height: int = 0, 26 | camera_matrix: Optional[np.ndarray] = None, dist_coeffs: Optional[np.ndarray] = None, 27 | output_preference: int = 0) -> Tuple[float, float, float]: 28 | # Form the camera matrix 29 | if camera_matrix is None: 30 | if image_width <= 0 or image_height <= 0: 31 | raise ValueError( 32 | 'image_width and image_height must be specified when camera_matrix is not given directly') 33 | else: 34 | camera_matrix = np.array([[image_width + image_height, 0, image_width / 2.0], 35 | [0, image_width + image_height, image_height / 
2.0], 36 | [0, 0, 1]], dtype=float) 37 | 38 | # Prepare the landmarks 39 | if landmarks.shape[0] == 68: 40 | landmarks = landmarks[17:] 41 | if landmarks.shape[0] in [49, 51]: 42 | left_eye = landmarks[[20, 21, 23, 24]].mean(axis=0) 43 | right_eye = landmarks[[26, 27, 29, 30]].mean(axis=0) 44 | landmarks = np.vstack((left_eye, right_eye, landmarks[[13, 31, 37]])) 45 | 46 | # Use EPnP to estimate pitch, yaw, and roll 47 | _, rvec, _ = cv2.solvePnP(self._mean_shape_5pts, np.expand_dims(landmarks, axis=1), 48 | camera_matrix, dist_coeffs, flags=cv2.SOLVEPNP_EPNP) 49 | rot_mat, _ = cv2.Rodrigues(rvec) 50 | if 1.0 + rot_mat[2, 0] < 1e-9: 51 | pitch = 0.0 52 | yaw = 90.0 53 | roll = -math.atan2(rot_mat[0, 1], rot_mat[0, 2]) / math.pi * 180.0 54 | elif 1.0 - rot_mat[2, 0] < 1e-9: 55 | pitch = 0.0 56 | yaw = -90.0 57 | roll = math.atan2(-rot_mat[0, 1], -rot_mat[0, 2]) / math.pi * 180.0 58 | else: 59 | pitch = math.atan2(rot_mat[2, 1], rot_mat[2, 2]) / math.pi * 180.0 60 | yaw = -math.asin(rot_mat[2, 0]) / math.pi * 180.0 61 | roll = math.atan2(rot_mat[1, 0], rot_mat[0, 0]) / math.pi * 180.0 62 | 63 | # Respond to output_preference: 64 | # output_preference == 1: limit pitch to the range of -90.0 ~ 90.0 65 | # output_preference == 2: limit yaw to the range of -90.0 ~ 90.0 (already satisfied) 66 | # output_preference == 3: limit roll to the range of -90.0 ~ 90.0 67 | # otherwise: minimise total rotation, min(abs(pitch) + abs(yaw) + abs(roll)) 68 | if output_preference != 2: 69 | alt_pitch = pitch - 180.0 if pitch > 0.0 else pitch + 180.0 70 | alt_yaw = -180.0 - yaw if yaw < 0.0 else 180.0 - yaw 71 | alt_roll = roll - 180.0 if roll > 0.0 else roll + 180.0 72 | if (output_preference == 1 and -90.0 < alt_pitch < 90.0 or 73 | output_preference == 3 and -90.0 < alt_roll < 90.0 or 74 | output_preference not in (1, 2, 3) and 75 | abs(alt_pitch) + abs(alt_yaw) + abs(alt_roll) < abs(pitch) + abs(yaw) + abs(roll)): 76 | pitch, yaw, roll = alt_pitch, alt_yaw, alt_roll 77 | 78 | return -pitch, yaw, roll 79 | -------------------------------------------------------------------------------- /ibug/face_detection/utils/simple_face_tracker.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from typing import List, Optional 3 | from scipy.optimize import linear_sum_assignment 4 | 5 | 6 | __all__ = ['SimpleFaceTracker'] 7 | 8 | 9 | class SimpleFaceTracker(object): 10 | def __init__(self, iou_threshold: float = 0.4, minimum_face_size: float = 0.0) -> None: 11 | self._iou_threshold = iou_threshold 12 | self._minimum_face_size = minimum_face_size 13 | self._tracklets = [] 14 | self._tracklet_counter = 0 15 | 16 | @property 17 | def iou_threshold(self) -> float: 18 | return self._iou_threshold 19 | 20 | @iou_threshold.setter 21 | def iou_threshold(self, threshold: float) -> None: 22 | self._iou_threshold = threshold 23 | 24 | @property 25 | def minimum_face_size(self) -> float: 26 | return self._minimum_face_size 27 | 28 | @minimum_face_size.setter 29 | def minimum_face_size(self, face_size: float) -> None: 30 | self._minimum_face_size = face_size 31 | 32 | def __call__(self, face_boxes: np.ndarray) -> List[Optional[int]]: 33 | if face_boxes.size <= 0: 34 | self._tracklets = [] 35 | return [] 36 | 37 | # Calculate area of the faces 38 | face_areas = np.abs((face_boxes[:, 2] - face_boxes[:, 0]) * (face_boxes[:, 3] - face_boxes[:, 1])) 39 | 40 | # Prepare tracklets 41 | for tracklet in self._tracklets: 42 | tracklet['tracked'] = False 43 | 44 | # Calculate the 
distance matrix based on IOU 45 | iou_distance_threshold = np.clip(1.0 - self._iou_threshold, 0.0, 1.0) 46 | min_face_area = max(self._minimum_face_size ** 2, np.finfo(float).eps) 47 | distances = np.full(shape=(face_boxes.shape[0], len(self._tracklets)), 48 | fill_value=2.0 * min(face_boxes.shape[0], len(self._tracklets)), dtype=float) 49 | for row, face_box in enumerate(face_boxes): 50 | if face_areas[row] >= min_face_area: 51 | for col, tracklet in enumerate(self._tracklets): 52 | x_left = max(min(face_box[0], face_box[2]), min(tracklet['bbox'][0], tracklet['bbox'][2])) 53 | y_top = max(min(face_box[1], face_box[3]), min(tracklet['bbox'][1], tracklet['bbox'][3])) 54 | x_right = min(max(face_box[2], face_box[0]), max(tracklet['bbox'][2], tracklet['bbox'][0])) 55 | y_bottom = min(max(face_box[3], face_box[1]), max(tracklet['bbox'][3], tracklet['bbox'][1])) 56 | if x_right <= x_left or y_bottom <= y_top: 57 | distance = 1.0 58 | else: 59 | intersection_area = (x_right - x_left) * (y_bottom - y_top) 60 | distance = 1.0 - intersection_area / float(face_areas[row] + tracklet['area'] - 61 | intersection_area) 62 | if distance <= iou_distance_threshold: 63 | distances[row, col] = distance 64 | 65 | # ID assignment 66 | tracked_ids = [None] * face_boxes.shape[0] 67 | for row, col in zip(*linear_sum_assignment(distances)): 68 | if distances[row, col] <= iou_distance_threshold: 69 | tracked_ids[row] = self._tracklets[col]['id'] 70 | self._tracklets[col]['bbox'] = face_boxes[row, :4].copy() 71 | self._tracklets[col]['area'] = face_areas[row] 72 | self._tracklets[col]['tracked'] = True 73 | 74 | # Remove expired tracklets 75 | self._tracklets = [x for x in self._tracklets if x['tracked']] 76 | 77 | # Register new faces 78 | for idx, face_box in enumerate(face_boxes): 79 | if face_areas[idx] >= min_face_area and tracked_ids[idx] is None: 80 | self._tracklet_counter += 1 81 | self._tracklets.append({'bbox': face_box[:4].copy(), 'area': face_areas[idx], 82 | 'id': self._tracklet_counter, 'tracked': True}) 83 | tracked_ids[idx] = self._tracklets[-1]['id'] 84 | 85 | return tracked_ids 86 | 87 | def reset(self, reset_tracklet_counter: bool = True) -> None: 88 | self._tracklets = [] 89 | if reset_tracklet_counter: 90 | self._tracklet_counter = 0 91 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.16.0 2 | scipy>=1.1.0 3 | torch>=1.1.0 4 | torchvision>=0.3.0 5 | opencv-python>= 3.4.2 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import shutil 4 | from setuptools import setup 5 | 6 | 7 | def clean_repo(): 8 | repo_folder = os.path.realpath(os.path.dirname(__file__)) 9 | dist_folder = os.path.join(repo_folder, 'dist') 10 | build_folder = os.path.join(repo_folder, 'build') 11 | if os.path.isdir(dist_folder): 12 | shutil.rmtree(dist_folder, ignore_errors=True) 13 | if os.path.isdir(build_folder): 14 | shutil.rmtree(build_folder, ignore_errors=True) 15 | 16 | 17 | # Read version string 18 | _version = None 19 | script_folder = os.path.realpath(os.path.dirname(__file__)) 20 | with open(os.path.join(script_folder, 'ibug', 'face_detection', '__init__.py')) as init: 21 | for line in init.read().splitlines(): 22 | fields = line.replace('=', ' ').replace('\'', ' ').replace('\"', ' ').replace('\t', ' ').split() 23 
| if len(fields) >= 2 and fields[0] == '__version__': 24 | _version = fields[1] 25 | break 26 | if _version is None: 27 | sys.exit('Sorry, cannot find version information.') 28 | 29 | # Installation 30 | config = { 31 | 'name': 'ibug_face_detection', 32 | 'version': _version, 33 | 'description': 'A collection of pretrained face detectors including S3FD and RetinaFace.', 34 | 'author': 'Jie Shen', 35 | 'author_email': 'js1907@imperial.ac.uk', 36 | 'packages': ['ibug.face_detection'], 37 | 'install_requires': ['numpy>=1.16.0', 'scipy>=1.1.0', 'torch>=1.1.0', 38 | 'torchvision>=0.3.0', 'opencv-python>= 3.4.2'], 39 | 'zip_safe': False 40 | } 41 | clean_repo() 42 | setup(**config) 43 | clean_repo() 44 | --------------------------------------------------------------------------------
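
The short sketch below is not part of the repository; it is a minimal, hedged example of how the S3FD detector and the simple face tracker defined above could be combined on a video stream. The video path ('video.mp4'), the CUDA device string, and the chosen thresholds are placeholder assumptions; only the classes, constructor parameters, and call signatures shown in the files above are relied upon.

# Minimal usage sketch (assumptions: 'video.mp4' exists and a CUDA device is available).
import cv2
from ibug.face_detection import S3FDPredictor
from ibug.face_detection.utils import SimpleFaceTracker

# S3FD detector with the bundled weights; threshold and device are placeholder choices.
detector = S3FDPredictor(threshold=0.8, device='cuda:0',
                         model=S3FDPredictor.get_model('s3fd'))

# IOU-based tracker; faces smaller than 64x64 pixels are ignored.
tracker = SimpleFaceTracker(iou_threshold=0.4, minimum_face_size=64)

video = cv2.VideoCapture('video.mp4')
while True:
    ret, frame = video.read()
    if not ret:
        break

    # OpenCV frames are B-G-R, hence rgb=False. S3FD returns an Nx5 matrix:
    # left, top, right, bottom, confidence (or an empty 0x5 array).
    faces = detector(frame, rgb=False)

    # The tracker only needs the box coordinates; it returns one tracklet ID
    # (>= 1) per detected face, or None for faces it cannot track.
    ids = tracker(faces[:, :4])

    for (left, top, right, bottom, score), face_id in zip(faces, ids):
        print(face_id, score)

video.release()
tracker.reset()

Since S3FD does not output facial landmarks, HeadPoseEstimator is deliberately omitted from this sketch; pose estimation requires a detector or landmark model that provides at least the 5-point layout expected by head_pose_estimator.py.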