├── .gitattributes ├── .gitignore ├── LICENSE ├── README.md ├── face_detection_test.py ├── ibug └── face_detection │ ├── __init__.py │ ├── retina_face │ ├── __init__.py │ ├── box_utils.py │ ├── config.py │ ├── prior_box.py │ ├── py_cpu_nms.py │ ├── retina_face.py │ ├── retina_face_net.py │ ├── retina_face_predictor.py │ └── weights │ │ ├── Resnet50_Final.pth │ │ └── mobilenet0.25_Final.pth │ ├── s3fd │ ├── __init__.py │ ├── s3fd_net.py │ ├── s3fd_predictor.py │ ├── utils.py │ └── weights │ │ └── s3fd_weights.pth │ └── utils │ ├── __init__.py │ ├── data │ └── bfm_lms.npy │ ├── head_pose_estimator.py │ └── simple_face_tracker.py ├── requirements.txt └── setup.py /.gitattributes: -------------------------------------------------------------------------------- 1 | . text !filter !merge !diff 2 | ibug/face_detection/retina_face/weights/Resnet50_Final.pth filter=lfs diff=lfs merge=lfs -text 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints 2 | .vscode 3 | .idea 4 | *.pyc 5 | build 6 | dist 7 | *.egg-info 8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Jie Shen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ibug.face_detection 2 | A collection of pretrained face detectors including: 3 | * [S3FD](http://openaccess.thecvf.com/content_ICCV_2017/papers/Zhang_S3FD_Single_Shot_ICCV_2017_paper.pdf) \[1\] with weights trained on the [WIDER](http://shuoyang1213.me/WIDERFACE/) \[2\] dataset. Implementation of the algorithm is based on this repository: [https://github.com/cs-giung/face-detection-pytorch](https://github.com/cs-giung/face-detection-pytorch). 4 | * [RetinaFace](https://arxiv.org/pdf/1905.00641) \[3\] with weights trained on the [WIDER](http://shuoyang1213.me/WIDERFACE/) \[2\] dataset. Wights for networks using either Resnet50 or MobileNet0.25 as the backbone are included. The implementation is based on this repository: [https://github.com/biubug6/Pytorch_Retinaface](https://github.com/biubug6/Pytorch_Retinaface). 
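
Both detectors expose the same predictor interface, and the pretrained weights listed above are selected through each predictor's `get_model()` method. A minimal sketch (the How to Use section below gives a complete example):

```python
from ibug.face_detection import RetinaFacePredictor, S3FDPredictor

# RetinaFace with either of the bundled backbones
detector = RetinaFacePredictor(model=RetinaFacePredictor.get_model('resnet50'))
# detector = RetinaFacePredictor(model=RetinaFacePredictor.get_model('mobilenet0.25'))

# S3FD with its WIDER-trained weights
# detector = S3FDPredictor(model=S3FDPredictor.get_model())
```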
5 | 
6 | For convenience, the package also includes a simple IOU-based face tracker and a head pose estimator using EPnP.
7 | 
8 | ## Prerequisites
9 | * [Git LFS](https://git-lfs.github.com/), needed for downloading the pretrained weights that are larger than 100 MB.
10 | * [Numpy](https://www.numpy.org/): `$pip3 install numpy`
11 | * [Scipy](https://www.scipy.org/): `$pip3 install scipy`
12 | * [PyTorch](https://pytorch.org/): `$pip3 install torch torchvision`
13 | * [OpenCV](https://opencv.org/): `$pip3 install opencv-python`
14 | 
15 | ## How to Install
16 | ```
17 | git clone https://github.com/hhj1897/face_detection.git
18 | cd face_detection
19 | git lfs pull
20 | pip install -e .
21 | ```
22 | 
23 | ## How to Test
24 | * To test on live video: `python face_detection_test.py [-i webcam_index]`
25 | * To test on a video file: `python face_detection_test.py [-i input_file] [-o output_file]`
26 | 
27 | By default, the test script uses RetinaFace with the Resnet50 backbone, but you can change that via the `--method` and `--weights` options.
28 | 
29 | ## How to Use
30 | ```python
31 | # Import everything, just for illustration purposes
32 | import cv2
33 | from ibug.face_detection import RetinaFacePredictor, S3FDPredictor
34 | from ibug.face_detection.utils import HeadPoseEstimator, SimpleFaceTracker
35 | 
36 | # Create a RetinaFace detector using the Resnet50 backbone, with the confidence
37 | # threshold set to 0.8
38 | face_detector = RetinaFacePredictor(
39 |     threshold=0.8, device='cuda:0',
40 |     model=RetinaFacePredictor.get_model('resnet50'))
41 | 
42 | # Create a head pose estimator
43 | pose_estimator = HeadPoseEstimator()
44 | 
45 | # Create a simple face tracker, with the minimum face size set to 64x64 pixels
46 | face_tracker = SimpleFaceTracker(minimum_face_size=64)
47 | 
48 | # Load a test image. Note that images loaded by OpenCV adopt the B-G-R channel
49 | # order.
50 | image = cv2.imread('test.png')
51 | 
52 | # Detect faces in the image
53 | # Note:
54 | # 1. The input image must be a byte array of shape HxWx3.
55 | # 2. The return value is a Nx5 (for S3FD) or a Nx15 (for RetinaFace) matrix,
56 | #    in which N is the number of detected faces. The first 4 columns store
57 | #    (in this order) the left, top, right, and bottom coordinates of the
58 | #    detected face boxes. The 5th column stores the detection confidences.
59 | #    The remaining columns store the coordinates (in the order of x1, y1, x2,
60 | #    y2, ...) of the detected landmarks.
61 | detected_faces = face_detector(image, rgb=False)
62 | 
63 | # Run head pose estimation. This only works with RetinaFace, which also detects
64 | # the 5 landmarks on each face. It gives the pitch, yaw, and roll (in degrees) of
65 | # each detected face; the image size is needed to construct the camera matrix.
66 | for face in detected_faces:
67 |     pitch, yaw, roll = pose_estimator(face[5:].reshape((-1, 2)), *image.shape[1::-1])
68 | 
69 | # If you are processing frames in a video, you can also perform rudimentary
70 | # face tracking, as shown below. The return value is a list containing the
71 | # tracklet ID (>=1) of each detected face. If a face cannot be tracked (for
72 | # example, because it is too small), its corresponding element in the list
73 | # is set to None.
74 | tracked_ids = face_tracker(detected_faces[:, :4])
75 | ```
76 | 
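When processing a video, the same objects can be combined into a simple loop, similar to what `face_detection_test.py` does. The following is a minimal sketch (the input path `test.mp4` is just a placeholder, and rendering and error handling are omitted):

```python
import cv2
from ibug.face_detection import RetinaFacePredictor
from ibug.face_detection.utils import HeadPoseEstimator, SimpleFaceTracker

face_detector = RetinaFacePredictor(
    threshold=0.8, device='cuda:0',
    model=RetinaFacePredictor.get_model('mobilenet0.25'))
face_tracker = SimpleFaceTracker(iou_threshold=0.4, minimum_face_size=64)
pose_estimator = HeadPoseEstimator()

vid = cv2.VideoCapture('test.mp4')
while True:
    _, frame = vid.read()
    if frame is None:
        break
    faces = face_detector(frame, rgb=False)   # OpenCV frames are in B-G-R order
    tids = face_tracker(faces[:, :4])         # tracklet IDs (or None) per face
    for face, tid in zip(faces, tids):
        # Columns 5-14 hold the 5 landmarks detected by RetinaFace
        pitch, yaw, roll = pose_estimator(face[5:15].reshape((-1, 2)),
                                          *frame.shape[1::-1])
        print(f'Face {tid}: pitch={pitch:.1f}, yaw={yaw:.1f}, roll={roll:.1f}')
vid.release()
```

Call `face_tracker.reset()` when switching to a new video, as the test script does when the `R` key is pressed.
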
"[S3fd: Single shot scale-invariant face detector.](http://openaccess.thecvf.com/content_ICCV_2017/papers/Zhang_S3FD_Single_Shot_ICCV_2017_paper.pdf)" In _Proceedings of the IEEE international conference on computer vision_, pp. 192-201. 2017. 79 | 80 | \[2\] Yang, Shuo, Ping Luo, Chen-Change Loy, Xiaoou Tang. "[WIDER FACE: A Face Detection Benchmark.](http://openaccess.thecvf.com/content_cvpr_2016/papers/Yang_WIDER_FACE_A_CVPR_2016_paper.pdf)" In _Proceedings of the IEEE international conference on computer vision_, pp. 5525-5533. 2016. 81 | 82 | \[3\] Deng, Jiankang, Jia Guo, Evangelos Ververas, Irene Kotsia, and Stefanos Zafeiriou. "[Retinaface: Single-shot multi-level face localisation in the wild.](https://openaccess.thecvf.com/content_CVPR_2020/papers/Deng_RetinaFace_Single-Shot_Multi-Level_Face_Localisation_in_the_Wild_CVPR_2020_paper.pdf)" In _Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition_, pp. 5203-5212. 2020. 83 | -------------------------------------------------------------------------------- /face_detection_test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import time 4 | import torch 5 | from argparse import ArgumentParser 6 | from ibug.face_detection import RetinaFacePredictor, S3FDPredictor 7 | from ibug.face_detection.utils import SimpleFaceTracker, HeadPoseEstimator 8 | 9 | 10 | def main() -> None: 11 | # Parse command-line arguments 12 | parser = ArgumentParser() 13 | parser.add_argument('--input', '-i', help='Input video path or webcam index (default=0)', default=0) 14 | parser.add_argument('--output', '-o', help='Output file path', default=None) 15 | parser.add_argument('--fourcc', '-f', help='FourCC of the output video (default=mp4v)', 16 | type=str, default='mp4v') 17 | parser.add_argument('--benchmark', '-b', help='Enable benchmark mode for CUDNN', 18 | action='store_true', default=False) 19 | parser.add_argument('--no-display', '-n', help='No display if processing a video file', 20 | action='store_true', default=False) 21 | parser.add_argument('--threshold', '-t', help='Confidence threshold (default=0.8)', 22 | type=float, default=0.8) 23 | parser.add_argument('--method', '-m', help='Method to use, can be either RatinaFace or S3FD (default=RatinaFace)', 24 | default='retinaface') 25 | parser.add_argument('--weights', '-w', 26 | help='Weights to load, can be either resnet50 or mobilenet0.25 when using RetinaFace', 27 | default=None) 28 | parser.add_argument('--alternative-pth', '-p', help='Alternative pth file to load', default=None) 29 | parser.add_argument('--device', '-d', help='Device to be used by the model (default=cuda:0)', 30 | default='cuda:0') 31 | parser.add_argument('--iou-threshold', '-iou', 32 | help='IOU threshold used by the simple face tracker (default=0.4)', 33 | type=float, default=0.4) 34 | parser.add_argument('--minimum-face-size', '-min', 35 | help='Minimum face size used by the simple face tracker (default=0.0)', 36 | type=float, default=0.0) 37 | parser.add_argument('--head-pose-preference', '-hp', 38 | help='Head pose output preference (default=0)', 39 | type=int, default=0) 40 | args = parser.parse_args() 41 | 42 | # Set benchmark mode flag for CUDNN 43 | torch.backends.cudnn.benchmark = args.benchmark 44 | 45 | vid = None 46 | out_vid = None 47 | has_window = False 48 | try: 49 | # Create the face detector 50 | args.method = args.method.lower().strip() 51 | if args.method == 'retinaface': 52 | face_detector_class = 
(RetinaFacePredictor, 'RetinaFace') 53 | elif args.method == 's3fd': 54 | face_detector_class = (S3FDPredictor, 'S3FD') 55 | else: 56 | raise ValueError('method must be set to either RetinaFace or S3FD') 57 | if args.weights is None: 58 | fd_model = face_detector_class[0].get_model() 59 | else: 60 | fd_model = face_detector_class[0].get_model(args.weights) 61 | if args.alternative_pth is not None: 62 | fd_model.weights = args.alternative_pth 63 | face_detector = face_detector_class[0](threshold=args.threshold, device=args.device, model=fd_model) 64 | print(f"Face detector created using {face_detector_class[1]} ({fd_model.weights}).") 65 | 66 | # Create the simple face tracker 67 | face_tracker = SimpleFaceTracker(iou_threshold=args.iou_threshold, 68 | minimum_face_size=args.minimum_face_size) 69 | print('Simple face tracker created.') 70 | 71 | # Create the head pose estimator 72 | head_pose_estimator = HeadPoseEstimator() 73 | print('Head pose estimator created.') 74 | 75 | # Open the input video 76 | using_webcam = not os.path.exists(args.input) 77 | vid = cv2.VideoCapture(int(args.input) if using_webcam else args.input) 78 | assert vid.isOpened() 79 | if using_webcam: 80 | print(f'Webcam #{int(args.input)} opened.') 81 | else: 82 | print(f'Input video "{args.input}" opened.') 83 | 84 | # Open the output video (if a path is given) 85 | if args.output is not None: 86 | out_vid = cv2.VideoWriter(args.output, fps=vid.get(cv2.CAP_PROP_FPS), 87 | frameSize=(int(vid.get(cv2.CAP_PROP_FRAME_WIDTH)), 88 | int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))), 89 | fourcc=cv2.VideoWriter_fourcc(*args.fourcc)) 90 | assert out_vid.isOpened() 91 | 92 | # Process the frames 93 | frame_number = 0 94 | window_title = os.path.splitext(os.path.basename(__file__))[0] 95 | colours = [(0, 0, 255), (0, 255, 0), (255, 0, 0), (0, 255, 255), (255, 0, 255), (255, 255, 0), 96 | (0, 128, 255), (128, 255, 0), (255, 0, 128), (128, 0, 255), (0, 255, 128), (255, 128, 0)] 97 | print('Processing started, press \'Q\' to quit or \'R\' to reset the tracker.') 98 | while True: 99 | # Get a new frame 100 | _, frame = vid.read() 101 | if frame is None: 102 | break 103 | else: 104 | # Detect and track faces, also estimate head pose if landmarks are available 105 | start_time = time.time() 106 | faces = face_detector(frame, rgb=False) 107 | tids = face_tracker(faces) 108 | if faces.shape[1] >= 15: 109 | head_poses = [head_pose_estimator(face[5:15].reshape((-1, 2)), *frame.shape[1::-1], 110 | output_preference=args.head_pose_preference) 111 | for face in faces] 112 | else: 113 | head_poses = [None] * faces.shape[0] 114 | elapsed_time = time.time() - start_time 115 | 116 | # Textural output 117 | print(f'Frame #{frame_number} processed in {elapsed_time * 1000.0:.04f} ms: ' + 118 | f'{len(faces)} faces detected.') 119 | 120 | # Rendering 121 | for face, tid, head_pose in zip(faces, tids, head_poses): 122 | bbox = face[:4].astype(int) 123 | if tid is None: 124 | colour = (128, 128, 128) 125 | else: 126 | colour = colours[(tid - 1) % len(colours)] 127 | cv2.rectangle(frame, (bbox[0], bbox[1]), (bbox[2], bbox[3]), color=colour, thickness=2) 128 | if len(face) > 5: 129 | for pts in face[5:].reshape((-1, 2)): 130 | cv2.circle(frame, tuple(pts.astype(int).tolist()), 3, colour, -1) 131 | if tid is not None: 132 | cv2.putText(frame, f'Face {tid}', (bbox[0], bbox[1] - 10), 133 | cv2.FONT_HERSHEY_DUPLEX, 0.6, colour, lineType=cv2.LINE_AA) 134 | if head_pose is not None: 135 | pitch, yaw, roll = head_pose 136 | cv2.putText(frame, f'Pitch: {pitch:.1f}', 
(bbox[2] + 5, bbox[1] + 10), 137 | cv2.FONT_HERSHEY_DUPLEX, 0.5, colour, lineType=cv2.LINE_AA) 138 | cv2.putText(frame, f'Yaw: {yaw:.1f}', (bbox[2] + 5, bbox[1] + 30), 139 | cv2.FONT_HERSHEY_DUPLEX, 0.5, colour, lineType=cv2.LINE_AA) 140 | cv2.putText(frame, f'Roll: {roll:.1f}', (bbox[2] + 5, bbox[1] + 50), 141 | cv2.FONT_HERSHEY_DUPLEX, 0.5, colour, lineType=cv2.LINE_AA) 142 | 143 | # Write the frame to output video (if recording) 144 | if out_vid is not None: 145 | out_vid.write(frame) 146 | 147 | # Display the frame 148 | if using_webcam or not args.no_display: 149 | has_window = True 150 | cv2.imshow(window_title, frame) 151 | key = cv2.waitKey(1) % 2 ** 16 152 | if key == ord('q') or key == ord('Q'): 153 | print('\'Q\' pressed, we are done here.') 154 | break 155 | elif key == ord('r') or key == ord('R'): 156 | print('\'R\' pressed, reset the tracker.') 157 | face_tracker.reset() 158 | frame_number += 1 159 | finally: 160 | if has_window: 161 | cv2.destroyAllWindows() 162 | if out_vid is not None: 163 | out_vid.release() 164 | if vid is not None: 165 | vid.release() 166 | print('All done.') 167 | 168 | 169 | if __name__ == '__main__': 170 | main() 171 | -------------------------------------------------------------------------------- /ibug/face_detection/__init__.py: -------------------------------------------------------------------------------- 1 | from .s3fd import S3FDPredictor 2 | from .retina_face import RetinaFacePredictor 3 | 4 | 5 | __version__ = '0.1.0' 6 | -------------------------------------------------------------------------------- /ibug/face_detection/retina_face/__init__.py: -------------------------------------------------------------------------------- 1 | from .retina_face_predictor import RetinaFacePredictor 2 | -------------------------------------------------------------------------------- /ibug/face_detection/retina_face/box_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | 5 | def point_form(boxes): 6 | """ Convert prior_boxes to (xmin, ymin, xmax, ymax) 7 | representation for comparison to point form ground truth data. 8 | Args: 9 | boxes: (tensor) center-size default boxes from priorbox layers. 10 | Return: 11 | boxes: (tensor) Converted xmin, ymin, xmax, ymax form of boxes. 12 | """ 13 | return torch.cat((boxes[:, :2] - boxes[:, 2:]/2, # xmin, ymin 14 | boxes[:, :2] + boxes[:, 2:]/2), 1) # xmax, ymax 15 | 16 | 17 | def center_size(boxes): 18 | """ Convert prior_boxes to (cx, cy, w, h) 19 | representation for comparison to center-size form ground truth data. 20 | Args: 21 | boxes: (tensor) point_form boxes 22 | Return: 23 | boxes: (tensor) Converted xmin, ymin, xmax, ymax form of boxes. 24 | """ 25 | return torch.cat((boxes[:, 2:] + boxes[:, :2])/2, # cx, cy 26 | boxes[:, 2:] - boxes[:, :2], 1) # w, h 27 | 28 | 29 | def intersect(box_a, box_b): 30 | """ We resize both tensors to [A,B,2] without new malloc: 31 | [A,2] -> [A,1,2] -> [A,B,2] 32 | [B,2] -> [1,B,2] -> [A,B,2] 33 | Then we compute the area of intersect between box_a and box_b. 34 | Args: 35 | box_a: (tensor) bounding boxes, Shape: [A,4]. 36 | box_b: (tensor) bounding boxes, Shape: [B,4]. 37 | Return: 38 | (tensor) intersection area, Shape: [A,B]. 
39 |     """
40 |     A = box_a.size(0)
41 |     B = box_b.size(0)
42 |     max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2),
43 |                        box_b[:, 2:].unsqueeze(0).expand(A, B, 2))
44 |     min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2),
45 |                        box_b[:, :2].unsqueeze(0).expand(A, B, 2))
46 |     inter = torch.clamp((max_xy - min_xy), min=0)
47 |     return inter[:, :, 0] * inter[:, :, 1]
48 | 
49 | 
50 | def jaccard(box_a, box_b):
51 |     """Compute the jaccard overlap of two sets of boxes. The jaccard overlap
52 |     is simply the intersection over union of two boxes. Here we operate on
53 |     ground truth boxes and default boxes.
54 |     E.g.:
55 |         A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B)
56 |     Args:
57 |         box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4]
58 |         box_b: (tensor) Prior boxes from priorbox layers, Shape: [num_priors,4]
59 |     Return:
60 |         jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)]
61 |     """
62 |     inter = intersect(box_a, box_b)
63 |     area_a = ((box_a[:, 2]-box_a[:, 0]) *
64 |               (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter)  # [A,B]
65 |     area_b = ((box_b[:, 2]-box_b[:, 0]) *
66 |               (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter)  # [A,B]
67 |     union = area_a + area_b - inter
68 |     return inter / union  # [A,B]
69 | 
70 | 
71 | def matrix_iou(a, b):
72 |     """
73 |     return iou of a and b, numpy version for data augmentation
74 |     """
75 |     lt = np.maximum(a[:, np.newaxis, :2], b[:, :2])
76 |     rb = np.minimum(a[:, np.newaxis, 2:], b[:, 2:])
77 | 
78 |     area_i = np.prod(rb - lt, axis=2) * (lt < rb).all(axis=2)
79 |     area_a = np.prod(a[:, 2:] - a[:, :2], axis=1)
80 |     area_b = np.prod(b[:, 2:] - b[:, :2], axis=1)
81 |     return area_i / (area_a[:, np.newaxis] + area_b - area_i)
82 | 
83 | 
84 | def matrix_iof(a, b):
85 |     """
86 |     return iof of a and b, numpy version for data augmentation
87 |     """
88 |     lt = np.maximum(a[:, np.newaxis, :2], b[:, :2])
89 |     rb = np.minimum(a[:, np.newaxis, 2:], b[:, 2:])
90 | 
91 |     area_i = np.prod(rb - lt, axis=2) * (lt < rb).all(axis=2)
92 |     area_a = np.prod(a[:, 2:] - a[:, :2], axis=1)
93 |     return area_i / np.maximum(area_a[:, np.newaxis], 1)
94 | 
95 | 
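# Note: jaccard() above returns the IoU matrix that match() below relies on to
# pair priors with ground-truth boxes. For example, for the point-form boxes
#     a = torch.tensor([[0., 0., 2., 2.]])
#     b = torch.tensor([[1., 1., 3., 3.]])
# the intersection area is 1.0 and the union is 4 + 4 - 1 = 7, so
# jaccard(a, b) returns a 1x1 tensor containing 1 / 7 (about 0.1429).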
96 | def match(threshold, truths, priors, variances, labels, landms, loc_t, conf_t, landm_t, idx):
97 |     """Match each prior box with the ground truth box of the highest jaccard
98 |     overlap, encode the bounding boxes, then return the matched indices
99 |     corresponding to both confidence and location preds.
100 |     Args:
101 |         threshold: (float) The overlap threshold used when matching boxes.
102 |         truths: (tensor) Ground truth boxes, Shape: [num_obj, 4].
103 |         priors: (tensor) Prior boxes from priorbox layers, Shape: [n_priors,4].
104 |         variances: (tensor) Variances corresponding to each prior coord,
105 |             Shape: [num_priors, 4].
106 |         labels: (tensor) All the class labels for the image, Shape: [num_obj].
107 |         landms: (tensor) Ground truth landms, Shape [num_obj, 10].
108 |         loc_t: (tensor) Tensor to be filled w/ encoded location targets.
109 |         conf_t: (tensor) Tensor to be filled w/ matched indices for conf preds.
110 |         landm_t: (tensor) Tensor to be filled w/ encoded landm targets.
111 |         idx: (int) current batch index
112 |     Return:
113 |         The matched indices corresponding to 1)location 2)confidence 3)landm preds.
114 |     """
115 |     # jaccard index
116 |     overlaps = jaccard(
117 |         truths,
118 |         point_form(priors)
119 |     )
120 |     # (Bipartite Matching)
121 |     # [1,num_objects] best prior for each ground truth
122 |     best_prior_overlap, best_prior_idx = overlaps.max(1, keepdim=True)
123 | 
124 |     # ignore hard gt
125 |     valid_gt_idx = best_prior_overlap[:, 0] >= 0.2
126 |     best_prior_idx_filter = best_prior_idx[valid_gt_idx, :]
127 |     if best_prior_idx_filter.shape[0] <= 0:
128 |         loc_t[idx] = 0
129 |         conf_t[idx] = 0
130 |         return
131 | 
132 |     # [1,num_priors] best ground truth for each prior
133 |     best_truth_overlap, best_truth_idx = overlaps.max(0, keepdim=True)
134 |     best_truth_idx.squeeze_(0)
135 |     best_truth_overlap.squeeze_(0)
136 |     best_prior_idx.squeeze_(1)
137 |     best_prior_idx_filter.squeeze_(1)
138 |     best_prior_overlap.squeeze_(1)
139 |     best_truth_overlap.index_fill_(0, best_prior_idx_filter, 2)  # ensure best prior
140 |     # TODO refactor: index best_prior_idx with long tensor
141 |     # ensure every gt matches with its prior of max overlap
142 |     for j in range(best_prior_idx.size(0)):  # determine which ground-truth box each of these anchors predicts
143 |         best_truth_idx[best_prior_idx[j]] = j
144 |     matches = truths[best_truth_idx]  # Shape: [num_priors,4] the ground-truth box assigned to each anchor
145 |     conf = labels[best_truth_idx]  # Shape: [num_priors] the label assigned to each anchor
146 |     conf[best_truth_overlap < threshold] = 0  # label as background: anchors with overlap < 0.35 are all treated as negative samples
147 |     loc = encode(matches, priors, variances)
148 | 
149 |     matches_landm = landms[best_truth_idx]
150 |     landm = encode_landm(matches_landm, priors, variances)
151 |     loc_t[idx] = loc  # [num_priors,4] encoded offsets to learn
152 |     conf_t[idx] = conf  # [num_priors] top class label for each prior
153 |     landm_t[idx] = landm
154 | 
155 | 
156 | def encode(matched, priors, variances):
157 |     """Encode the variances from the priorbox layers into the ground truth boxes
158 |     we have matched (based on jaccard overlap) with the prior boxes.
159 |     Args:
160 |         matched: (tensor) Coords of ground truth for each prior in point-form
161 |             Shape: [num_priors, 4].
162 |         priors: (tensor) Prior boxes in center-offset form
163 |             Shape: [num_priors,4].
164 |         variances: (list[float]) Variances of priorboxes
165 |     Return:
166 |         encoded boxes (tensor), Shape: [num_priors, 4]
167 |     """
168 | 
169 |     # dist b/t match center and prior's center
170 |     g_cxcy = (matched[:, :2] + matched[:, 2:])/2 - priors[:, :2]
171 |     # encode variance
172 |     g_cxcy /= (variances[0] * priors[:, 2:])
173 |     # match wh / prior wh
174 |     g_wh = (matched[:, 2:] - matched[:, :2]) / priors[:, 2:]
175 |     g_wh = torch.log(g_wh) / variances[1]
176 |     # return target for smooth_l1_loss
177 |     return torch.cat([g_cxcy, g_wh], 1)  # [num_priors,4]
178 | 
179 | 
180 | def encode_landm(matched, priors, variances):
181 |     """Encode the variances from the priorbox layers into the ground truth boxes
182 |     we have matched (based on jaccard overlap) with the prior boxes.
183 |     Args:
184 |         matched: (tensor) Coords of ground truth for each prior in point-form
185 |             Shape: [num_priors, 10].
186 |         priors: (tensor) Prior boxes in center-offset form
187 |             Shape: [num_priors,4].
188 | variances: (list[float]) Variances of priorboxes 189 | Return: 190 | encoded landm (tensor), Shape: [num_priors, 10] 191 | """ 192 | 193 | # dist b/t match center and prior's center 194 | matched = torch.reshape(matched, (matched.size(0), 5, 2)) 195 | priors_cx = priors[:, 0].unsqueeze(1).expand(matched.size(0), 5).unsqueeze(2) 196 | priors_cy = priors[:, 1].unsqueeze(1).expand(matched.size(0), 5).unsqueeze(2) 197 | priors_w = priors[:, 2].unsqueeze(1).expand(matched.size(0), 5).unsqueeze(2) 198 | priors_h = priors[:, 3].unsqueeze(1).expand(matched.size(0), 5).unsqueeze(2) 199 | priors = torch.cat([priors_cx, priors_cy, priors_w, priors_h], dim=2) 200 | g_cxcy = matched[:, :, :2] - priors[:, :, :2] 201 | # encode variance 202 | g_cxcy /= (variances[0] * priors[:, :, 2:]) 203 | # g_cxcy /= priors[:, :, 2:] 204 | g_cxcy = g_cxcy.reshape(g_cxcy.size(0), -1) 205 | # return target for smooth_l1_loss 206 | return g_cxcy 207 | 208 | 209 | # Adapted from https://github.com/Hakuyume/chainer-ssd 210 | def decode(loc, priors, variances): 211 | """Decode locations from predictions using priors to undo 212 | the encoding we did for offset regression at train time. 213 | Args: 214 | loc (tensor): location predictions for loc layers, 215 | Shape: [num_priors,4] 216 | priors (tensor): Prior boxes in center-offset form. 217 | Shape: [num_priors,4]. 218 | variances: (list[float]) Variances of priorboxes 219 | Return: 220 | decoded bounding box predictions 221 | """ 222 | 223 | boxes = torch.cat(( 224 | priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:], 225 | priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1) 226 | boxes[:, :2] -= boxes[:, 2:] / 2 227 | boxes[:, 2:] += boxes[:, :2] 228 | return boxes 229 | 230 | 231 | def decode_landm(pre, priors, variances): 232 | """Decode landm from predictions using priors to undo 233 | the encoding we did for offset regression at train time. 234 | Args: 235 | pre (tensor): landm predictions for loc layers, 236 | Shape: [num_priors,10] 237 | priors (tensor): Prior boxes in center-offset form. 238 | Shape: [num_priors,4]. 239 | variances: (list[float]) Variances of priorboxes 240 | Return: 241 | decoded landm predictions 242 | """ 243 | landms = torch.cat((priors[:, :2] + pre[:, :2] * variances[0] * priors[:, 2:], 244 | priors[:, :2] + pre[:, 2:4] * variances[0] * priors[:, 2:], 245 | priors[:, :2] + pre[:, 4:6] * variances[0] * priors[:, 2:], 246 | priors[:, :2] + pre[:, 6:8] * variances[0] * priors[:, 2:], 247 | priors[:, :2] + pre[:, 8:10] * variances[0] * priors[:, 2:], 248 | ), dim=1) 249 | return landms 250 | 251 | 252 | def log_sum_exp(x): 253 | """Utility function for computing log_sum_exp while determining 254 | This will be used to determine unaveraged confidence loss across 255 | all examples in a batch. 256 | Args: 257 | x (Variable(tensor)): conf_preds from conf layers 258 | """ 259 | x_max = x.data.max() 260 | return torch.log(torch.sum(torch.exp(x-x_max), 1, keepdim=True)) + x_max 261 | 262 | 263 | # Original author: Francisco Massa: 264 | # https://github.com/fmassa/object-detection.torch 265 | # Ported to PyTorch by Max deGroot (02/01/2017) 266 | def nms(boxes, scores, overlap=0.5, top_k=200): 267 | """Apply non-maximum suppression at test time to avoid detecting too many 268 | overlapping bounding boxes for a given object. 269 | Args: 270 | boxes: (tensor) The location preds for the img, Shape: [num_priors,4]. 271 | scores: (tensor) The class predscores for the img, Shape:[num_priors]. 
272 | overlap: (float) The overlap thresh for suppressing unnecessary boxes. 273 | top_k: (int) The Maximum number of box preds to consider. 274 | Return: 275 | The indices of the kept boxes with respect to num_priors. 276 | """ 277 | 278 | keep = torch.Tensor(scores.size(0)).fill_(0).long() 279 | if boxes.numel() == 0: 280 | return keep 281 | x1 = boxes[:, 0] 282 | y1 = boxes[:, 1] 283 | x2 = boxes[:, 2] 284 | y2 = boxes[:, 3] 285 | area = torch.mul(x2 - x1, y2 - y1) 286 | v, idx = scores.sort(0) # sort in ascending order 287 | # I = I[v >= 0.01] 288 | idx = idx[-top_k:] # indices of the top-k largest vals 289 | xx1 = boxes.new() 290 | yy1 = boxes.new() 291 | xx2 = boxes.new() 292 | yy2 = boxes.new() 293 | w = boxes.new() 294 | h = boxes.new() 295 | 296 | # keep = torch.Tensor() 297 | count = 0 298 | while idx.numel() > 0: 299 | i = idx[-1] # index of current largest val 300 | # keep.append(i) 301 | keep[count] = i 302 | count += 1 303 | if idx.size(0) == 1: 304 | break 305 | idx = idx[:-1] # remove kept element from view 306 | # load bboxes of next highest vals 307 | torch.index_select(x1, 0, idx, out=xx1) 308 | torch.index_select(y1, 0, idx, out=yy1) 309 | torch.index_select(x2, 0, idx, out=xx2) 310 | torch.index_select(y2, 0, idx, out=yy2) 311 | # store element-wise max with next highest score 312 | xx1 = torch.clamp(xx1, min=x1[i]) 313 | yy1 = torch.clamp(yy1, min=y1[i]) 314 | xx2 = torch.clamp(xx2, max=x2[i]) 315 | yy2 = torch.clamp(yy2, max=y2[i]) 316 | w.resize_as_(xx2) 317 | h.resize_as_(yy2) 318 | w = xx2 - xx1 319 | h = yy2 - yy1 320 | # check sizes of xx1 and xx2.. after each iteration 321 | w = torch.clamp(w, min=0.0) 322 | h = torch.clamp(h, min=0.0) 323 | inter = w*h 324 | # IoU = i / (area(a) + area(b) - i) 325 | rem_areas = torch.index_select(area, 0, idx) # load remaining areas) 326 | union = (rem_areas - inter) + area[i] 327 | IoU = inter/union # store result in iou 328 | # keep only elements with an IoU <= overlap 329 | idx = idx[IoU.le(overlap)] 330 | return keep, count 331 | 332 | 333 | -------------------------------------------------------------------------------- /ibug/face_detection/retina_face/config.py: -------------------------------------------------------------------------------- 1 | # config.py 2 | 3 | cfg_mnet = { 4 | 'name': 'mobilenet0.25', 5 | 'min_sizes': [[16, 32], [64, 128], [256, 512]], 6 | 'steps': [8, 16, 32], 7 | 'variance': [0.1, 0.2], 8 | 'clip': False, 9 | 'loc_weight': 2.0, 10 | 'gpu_train': True, 11 | 'batch_size': 32, 12 | 'ngpu': 1, 13 | 'epoch': 250, 14 | 'decay1': 190, 15 | 'decay2': 220, 16 | 'image_size': 640, 17 | 'return_layers': {'stage1': 1, 'stage2': 2, 'stage3': 3}, 18 | 'in_channel': 32, 19 | 'out_channel': 64 20 | } 21 | 22 | cfg_re50 = { 23 | 'name': 'Resnet50', 24 | 'min_sizes': [[16, 32], [64, 128], [256, 512]], 25 | 'steps': [8, 16, 32], 26 | 'variance': [0.1, 0.2], 27 | 'clip': False, 28 | 'loc_weight': 2.0, 29 | 'gpu_train': True, 30 | 'batch_size': 24, 31 | 'ngpu': 4, 32 | 'epoch': 100, 33 | 'decay1': 70, 34 | 'decay2': 90, 35 | 'image_size': 840, 36 | 'return_layers': {'layer2': 1, 'layer3': 2, 'layer4': 3}, 37 | 'in_channel': 256, 38 | 'out_channel': 256 39 | } 40 | -------------------------------------------------------------------------------- /ibug/face_detection/retina_face/prior_box.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from itertools import product as product 3 | from math import ceil 4 | 5 | 6 | class PriorBox(object): 7 | def __init__(self, cfg, 
image_size=None): 8 | super(PriorBox, self).__init__() 9 | self.min_sizes = cfg['min_sizes'] 10 | self.steps = cfg['steps'] 11 | self.clip = cfg['clip'] 12 | self.image_size = image_size 13 | self.feature_maps = [[ceil(self.image_size[0]/step), ceil(self.image_size[1]/step)] for step in self.steps] 14 | self.name = "s" 15 | 16 | def forward(self): 17 | anchors = [] 18 | for k, f in enumerate(self.feature_maps): 19 | min_sizes = self.min_sizes[k] 20 | for i, j in product(range(f[0]), range(f[1])): 21 | for min_size in min_sizes: 22 | s_kx = min_size / self.image_size[1] 23 | s_ky = min_size / self.image_size[0] 24 | dense_cx = [x * self.steps[k] / self.image_size[1] for x in [j + 0.5]] 25 | dense_cy = [y * self.steps[k] / self.image_size[0] for y in [i + 0.5]] 26 | for cy, cx in product(dense_cy, dense_cx): 27 | anchors += [cx, cy, s_kx, s_ky] 28 | 29 | # back to torch land 30 | output = torch.Tensor(anchors).view(-1, 4) 31 | if self.clip: 32 | output.clamp_(max=1, min=0) 33 | return output 34 | -------------------------------------------------------------------------------- /ibug/face_detection/retina_face/py_cpu_nms.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | 10 | 11 | def py_cpu_nms(dets, thresh, top_k): 12 | """Pure Python NMS baseline.""" 13 | x1 = dets[:, 0] 14 | y1 = dets[:, 1] 15 | x2 = dets[:, 2] 16 | y2 = dets[:, 3] 17 | scores = dets[:, 4] 18 | 19 | areas = (x2 - x1 + 1) * (y2 - y1 + 1) 20 | order = scores.argsort()[: -top_k - 1: -1] 21 | 22 | keep = [] 23 | while order.size > 0: 24 | i = order[0] 25 | keep.append(i) 26 | xx1 = np.maximum(x1[i], x1[order[1:]]) 27 | yy1 = np.maximum(y1[i], y1[order[1:]]) 28 | xx2 = np.minimum(x2[i], x2[order[1:]]) 29 | yy2 = np.minimum(y2[i], y2[order[1:]]) 30 | 31 | w = np.maximum(0.0, xx2 - xx1 + 1) 32 | h = np.maximum(0.0, yy2 - yy1 + 1) 33 | inter = w * h 34 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 35 | 36 | inds = np.where(ovr <= thresh)[0] 37 | order = order[inds + 1] 38 | 39 | return keep 40 | -------------------------------------------------------------------------------- /ibug/face_detection/retina_face/retina_face.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torchvision.models as models 5 | import torchvision.models._utils as _utils 6 | from .retina_face_net import MobileNetV1, FPN, SSH 7 | 8 | 9 | class ClassHead(nn.Module): 10 | def __init__(self, inchannels=512, num_anchors=3): 11 | super(ClassHead, self).__init__() 12 | self.num_anchors = num_anchors 13 | self.conv1x1 = nn.Conv2d(inchannels, self.num_anchors*2, kernel_size=(1, 1), stride=1, padding=0) 14 | 15 | def forward(self, x): 16 | out = self.conv1x1(x) 17 | out = out.permute(0, 2, 3, 1).contiguous() 18 | 19 | return out.view(out.shape[0], -1, 2) 20 | 21 | 22 | class BboxHead(nn.Module): 23 | def __init__(self, inchannels=512, num_anchors=3): 24 | super(BboxHead, self).__init__() 25 | self.conv1x1 = nn.Conv2d(inchannels, num_anchors*4, kernel_size=(1, 1), stride=1,padding=0) 26 | 27 | def forward(self, x): 28 | out = self.conv1x1(x) 29 | out = out.permute(0, 2, 3, 1).contiguous() 30 | 31 | return 
out.view(out.shape[0], -1, 4) 32 | 33 | 34 | class LandmarkHead(nn.Module): 35 | def __init__(self, inchannels=512, num_anchors=3): 36 | super(LandmarkHead, self).__init__() 37 | self.conv1x1 = nn.Conv2d(inchannels,num_anchors*10, kernel_size=(1, 1), stride=1, padding=0) 38 | 39 | def forward(self, x): 40 | out = self.conv1x1(x) 41 | out = out.permute(0, 2, 3, 1).contiguous() 42 | 43 | return out.view(out.shape[0], -1, 10) 44 | 45 | 46 | class RetinaFace(nn.Module): 47 | def __init__(self, cfg=None, phase='train'): 48 | """ 49 | :param cfg: Network related settings. 50 | :param phase: train or test. 51 | """ 52 | super(RetinaFace, self).__init__() 53 | self.phase = phase 54 | backbone = None 55 | if cfg['name'] == 'mobilenet0.25': 56 | backbone = MobileNetV1() 57 | elif cfg['name'] == 'Resnet50': 58 | backbone = models.resnet50() 59 | 60 | self.body = _utils.IntermediateLayerGetter(backbone, cfg['return_layers']) 61 | in_channels_stage2 = cfg['in_channel'] 62 | in_channels_list = [ 63 | in_channels_stage2 * 2, 64 | in_channels_stage2 * 4, 65 | in_channels_stage2 * 8, 66 | ] 67 | out_channels = cfg['out_channel'] 68 | self.fpn = FPN(in_channels_list,out_channels) 69 | self.ssh1 = SSH(out_channels, out_channels) 70 | self.ssh2 = SSH(out_channels, out_channels) 71 | self.ssh3 = SSH(out_channels, out_channels) 72 | 73 | self.ClassHead = self._make_class_head(fpn_num=3, inchannels=cfg['out_channel']) 74 | self.BboxHead = self._make_bbox_head(fpn_num=3, inchannels=cfg['out_channel']) 75 | self.LandmarkHead = self._make_landmark_head(fpn_num=3, inchannels=cfg['out_channel']) 76 | 77 | def _make_class_head(self, fpn_num=3, inchannels=64, anchor_num=2): 78 | classhead = nn.ModuleList() 79 | for i in range(fpn_num): 80 | classhead.append(ClassHead(inchannels, anchor_num)) 81 | return classhead 82 | 83 | def _make_bbox_head(self, fpn_num=3, inchannels=64, anchor_num=2): 84 | bboxhead = nn.ModuleList() 85 | for i in range(fpn_num): 86 | bboxhead.append(BboxHead(inchannels, anchor_num)) 87 | return bboxhead 88 | 89 | def _make_landmark_head(self, fpn_num=3, inchannels=64, anchor_num=2): 90 | landmarkhead = nn.ModuleList() 91 | for i in range(fpn_num): 92 | landmarkhead.append(LandmarkHead(inchannels, anchor_num)) 93 | return landmarkhead 94 | 95 | def forward(self, inputs): 96 | out = self.body(inputs) 97 | 98 | # FPN 99 | fpn = self.fpn(out) 100 | 101 | # SSH 102 | feature1 = self.ssh1(fpn[0]) 103 | feature2 = self.ssh2(fpn[1]) 104 | feature3 = self.ssh3(fpn[2]) 105 | features = [feature1, feature2, feature3] 106 | 107 | bbox_regressions = torch.cat([self.BboxHead[i](feature) for i, feature in enumerate(features)], dim=1) 108 | classifications = torch.cat([self.ClassHead[i](feature) for i, feature in enumerate(features)], dim=1) 109 | ldm_regressions = torch.cat([self.LandmarkHead[i](feature) for i, feature in enumerate(features)], dim=1) 110 | 111 | if self.phase == 'train': 112 | output = (bbox_regressions, classifications, ldm_regressions) 113 | else: 114 | output = (bbox_regressions, F.softmax(classifications, dim=-1), ldm_regressions) 115 | return output 116 | -------------------------------------------------------------------------------- /ibug/face_detection/retina_face/retina_face_net.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | def conv_bn(inp, oup, stride = 1, leaky = 0): 7 | return nn.Sequential( 8 | nn.Conv2d(inp, oup, 3, stride, 1, bias=False), 9 | 
nn.BatchNorm2d(oup), 10 | nn.LeakyReLU(negative_slope=leaky, inplace=True) 11 | ) 12 | 13 | 14 | def conv_bn_no_relu(inp, oup, stride): 15 | return nn.Sequential( 16 | nn.Conv2d(inp, oup, 3, stride, 1, bias=False), 17 | nn.BatchNorm2d(oup), 18 | ) 19 | 20 | 21 | def conv_bn1X1(inp, oup, stride, leaky=0): 22 | return nn.Sequential( 23 | nn.Conv2d(inp, oup, 1, stride, padding=0, bias=False), 24 | nn.BatchNorm2d(oup), 25 | nn.LeakyReLU(negative_slope=leaky, inplace=True) 26 | ) 27 | 28 | 29 | def conv_dw(inp, oup, stride, leaky=0.1): 30 | return nn.Sequential( 31 | nn.Conv2d(inp, inp, 3, stride, 1, groups=inp, bias=False), 32 | nn.BatchNorm2d(inp), 33 | nn.LeakyReLU(negative_slope=leaky, inplace=True), 34 | 35 | nn.Conv2d(inp, oup, 1, 1, 0, bias=False), 36 | nn.BatchNorm2d(oup), 37 | nn.LeakyReLU(negative_slope=leaky, inplace=True), 38 | ) 39 | 40 | 41 | class SSH(nn.Module): 42 | def __init__(self, in_channel, out_channel): 43 | super(SSH, self).__init__() 44 | assert out_channel % 4 == 0 45 | leaky = 0 46 | if out_channel <= 64: 47 | leaky = 0.1 48 | self.conv3X3 = conv_bn_no_relu(in_channel, out_channel//2, stride=1) 49 | 50 | self.conv5X5_1 = conv_bn(in_channel, out_channel//4, stride=1, leaky = leaky) 51 | self.conv5X5_2 = conv_bn_no_relu(out_channel//4, out_channel//4, stride=1) 52 | 53 | self.conv7X7_2 = conv_bn(out_channel//4, out_channel//4, stride=1, leaky = leaky) 54 | self.conv7x7_3 = conv_bn_no_relu(out_channel//4, out_channel//4, stride=1) 55 | 56 | def forward(self, input): 57 | conv3X3 = self.conv3X3(input) 58 | 59 | conv5X5_1 = self.conv5X5_1(input) 60 | conv5X5 = self.conv5X5_2(conv5X5_1) 61 | 62 | conv7X7_2 = self.conv7X7_2(conv5X5_1) 63 | conv7X7 = self.conv7x7_3(conv7X7_2) 64 | 65 | out = torch.cat([conv3X3, conv5X5, conv7X7], dim=1) 66 | out = F.relu(out) 67 | return out 68 | 69 | 70 | class FPN(nn.Module): 71 | def __init__(self,in_channels_list,out_channels): 72 | super(FPN,self).__init__() 73 | leaky = 0 74 | if out_channels <= 64: 75 | leaky = 0.1 76 | self.output1 = conv_bn1X1(in_channels_list[0], out_channels, stride=1, leaky=leaky) 77 | self.output2 = conv_bn1X1(in_channels_list[1], out_channels, stride=1, leaky=leaky) 78 | self.output3 = conv_bn1X1(in_channels_list[2], out_channels, stride=1, leaky=leaky) 79 | 80 | self.merge1 = conv_bn(out_channels, out_channels, leaky=leaky) 81 | self.merge2 = conv_bn(out_channels, out_channels, leaky=leaky) 82 | 83 | def forward(self, input): 84 | # names = list(input.keys()) 85 | input = list(input.values()) 86 | 87 | output1 = self.output1(input[0]) 88 | output2 = self.output2(input[1]) 89 | output3 = self.output3(input[2]) 90 | 91 | up3 = F.interpolate(output3, size=[output2.size(2), output2.size(3)], mode="nearest") 92 | output2 = output2 + up3 93 | output2 = self.merge2(output2) 94 | 95 | up2 = F.interpolate(output2, size=[output1.size(2), output1.size(3)], mode="nearest") 96 | output1 = output1 + up2 97 | output1 = self.merge1(output1) 98 | 99 | out = [output1, output2, output3] 100 | return out 101 | 102 | 103 | class MobileNetV1(nn.Module): 104 | def __init__(self): 105 | super(MobileNetV1, self).__init__() 106 | self.stage1 = nn.Sequential( 107 | conv_bn(3, 8, 2, leaky=0.1), # 3 108 | conv_dw(8, 16, 1), # 7 109 | conv_dw(16, 32, 2), # 11 110 | conv_dw(32, 32, 1), # 19 111 | conv_dw(32, 64, 2), # 27 112 | conv_dw(64, 64, 1), # 43 113 | ) 114 | self.stage2 = nn.Sequential( 115 | conv_dw(64, 128, 2), # 43 + 16 = 59 116 | conv_dw(128, 128, 1), # 59 + 32 = 91 117 | conv_dw(128, 128, 1), # 91 + 32 = 123 118 | conv_dw(128, 
128, 1), # 123 + 32 = 155 119 | conv_dw(128, 128, 1), # 155 + 32 = 187 120 | conv_dw(128, 128, 1), # 187 + 32 = 219 121 | ) 122 | self.stage3 = nn.Sequential( 123 | conv_dw(128, 256, 2), # 219 +3 2 = 241 124 | conv_dw(256, 256, 1), # 241 + 64 = 301 125 | ) 126 | self.avg = nn.AdaptiveAvgPool2d((1,1)) 127 | self.fc = nn.Linear(256, 1000) 128 | 129 | def forward(self, x): 130 | x = self.stage1(x) 131 | x = self.stage2(x) 132 | x = self.stage3(x) 133 | x = self.avg(x) 134 | # x = self.model(x) 135 | x = x.view(-1, 256) 136 | x = self.fc(x) 137 | return x 138 | -------------------------------------------------------------------------------- /ibug/face_detection/retina_face/retina_face_predictor.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import numpy as np 4 | from copy import deepcopy 5 | from types import SimpleNamespace 6 | from typing import Union, Optional 7 | from .prior_box import PriorBox 8 | from .py_cpu_nms import py_cpu_nms 9 | from .retina_face import RetinaFace 10 | from .config import cfg_mnet, cfg_re50 11 | from .box_utils import decode, decode_landm 12 | 13 | 14 | __all__ = ['RetinaFacePredictor'] 15 | 16 | 17 | class RetinaFacePredictor(object): 18 | def __init__(self, threshold: float = 0.8, device: Union[str, torch.device] = 'cuda:0', 19 | model: Optional[SimpleNamespace] = None, config: Optional[SimpleNamespace] = None) -> None: 20 | self.threshold = threshold 21 | self.device = device 22 | if model is None: 23 | model = RetinaFacePredictor.get_model() 24 | if config is None: 25 | config = RetinaFacePredictor.create_config() 26 | self.config = SimpleNamespace(**model.config.__dict__, **config.__dict__) 27 | self.net = RetinaFace(cfg=self.config.__dict__, phase='test').to(self.device) 28 | pretrained_dict = torch.load(model.weights, map_location=self.device) 29 | if 'state_dict' in pretrained_dict.keys(): 30 | pretrained_dict = {key.split('module.', 1)[-1] if key.startswith('module.') else key: value 31 | for key, value in pretrained_dict['state_dict'].items()} 32 | else: 33 | pretrained_dict = {key.split('module.', 1)[-1] if key.startswith('module.') else key: value 34 | for key, value in pretrained_dict.items()} 35 | self.net.load_state_dict(pretrained_dict, strict=False) 36 | self.net.eval() 37 | self.priors = None 38 | self.previous_size = None 39 | 40 | @staticmethod 41 | def get_model(name: str = 'resnet50') -> SimpleNamespace: 42 | name = name.lower().strip() 43 | if name == 'resnet50': 44 | return SimpleNamespace(weights=os.path.realpath(os.path.join(os.path.dirname(__file__), 45 | 'weights', 'Resnet50_Final.pth')), 46 | config=SimpleNamespace(**deepcopy(cfg_re50))) 47 | elif name == 'mobilenet0.25': 48 | return SimpleNamespace(weights=os.path.realpath(os.path.join(os.path.dirname(__file__), 49 | 'weights', 'mobilenet0.25_Final.pth')), 50 | config=SimpleNamespace(**deepcopy(cfg_mnet))) 51 | else: 52 | raise ValueError('name must be set to either resnet50 or mobilenet0.25') 53 | 54 | @staticmethod 55 | def create_config(top_k: int = 750, conf_thresh: float = 0.02, 56 | nms_thresh: float = 0.4, nms_top_k: int = 5000) -> SimpleNamespace: 57 | return SimpleNamespace(top_k=top_k, conf_thresh=conf_thresh, nms_thresh=nms_thresh, nms_top_k=nms_top_k) 58 | 59 | @torch.no_grad() 60 | def __call__(self, image: np.ndarray, rgb: bool = True) -> np.ndarray: 61 | im_height, im_width, _ = image.shape 62 | if rgb: 63 | image = image[..., ::-1] 64 | image = image.astype(int) - np.array([104, 117, 123]) 65 | 
image = image.transpose(2, 0, 1) 66 | image = torch.from_numpy(image).unsqueeze(0).float().to(self.device) 67 | scale = torch.Tensor([im_width, im_height, im_width, im_height]).to(self.device) 68 | loc, conf, landms = self.net(image) 69 | image_size = (im_height, im_width) 70 | if self.priors is None or self.previous_size != image_size: 71 | self.priors = PriorBox(self.config.__dict__, image_size=image_size).forward().to(self.device) 72 | self.previous_size = image_size 73 | prior_data = self.priors.data 74 | boxes = decode(loc.data.squeeze(0), prior_data, self.config.variance) 75 | boxes = boxes * scale 76 | boxes = boxes.cpu().numpy() 77 | scores = conf.squeeze(0).data.cpu().numpy()[:, 1] 78 | landms = decode_landm(landms.data.squeeze(0), prior_data, self.config.variance) 79 | scale1 = torch.Tensor([image.shape[3], image.shape[2], image.shape[3], image.shape[2], 80 | image.shape[3], image.shape[2], image.shape[3], image.shape[2], 81 | image.shape[3], image.shape[2]]).to(self.device) 82 | landms = landms * scale1 83 | landms = landms.cpu().numpy() 84 | 85 | # ignore low scores 86 | inds = np.where(scores > self.config.conf_thresh)[0] 87 | if len(inds) == 0: 88 | return np.empty(shape=(0, 15), dtype=np.float32) 89 | boxes = boxes[inds] 90 | landms = landms[inds] 91 | scores = scores[inds] 92 | 93 | # do NMS 94 | dets = np.hstack((boxes, scores[:, np.newaxis])).astype(np.float32, copy=False) 95 | keep = py_cpu_nms(dets, self.config.nms_thresh, self.config.nms_top_k) 96 | dets = dets[keep, :] 97 | landms = landms[keep] 98 | 99 | # keep top-K 100 | dets = dets[:self.config.top_k, :] 101 | landms = landms[:self.config.top_k, :] 102 | dets = np.concatenate((dets, landms), axis=1) 103 | 104 | # further filter by confidence 105 | inds = np.where(dets[:, 4] >= self.threshold)[0] 106 | if len(inds) == 0: 107 | return np.empty(shape=(0, 15), dtype=np.float32) 108 | else: 109 | return dets[inds] 110 | -------------------------------------------------------------------------------- /ibug/face_detection/retina_face/weights/Resnet50_Final.pth: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:6d1de9c2944f2ccddca5f5e010ea5ae64a39845a86311af6fdf30841b0a5a16d 3 | size 109497761 4 | -------------------------------------------------------------------------------- /ibug/face_detection/retina_face/weights/mobilenet0.25_Final.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hhj1897/face_detection/db2a4e8eae8c9c53385ff0773e9db08f03cf21ad/ibug/face_detection/retina_face/weights/mobilenet0.25_Final.pth -------------------------------------------------------------------------------- /ibug/face_detection/s3fd/__init__.py: -------------------------------------------------------------------------------- 1 | from .s3fd_predictor import S3FDPredictor 2 | -------------------------------------------------------------------------------- /ibug/face_detection/s3fd/s3fd_net.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.init as init 4 | import torch.nn.functional as F 5 | from .utils import Detect, PriorBox 6 | 7 | 8 | class L2Norm(nn.Module): 9 | 10 | def __init__(self, n_channels, scale): 11 | super(L2Norm, self).__init__() 12 | self.n_channels = n_channels 13 | self.gamma = scale or None 14 | self.eps = 1e-10 15 | self.weight = 
nn.Parameter(torch.Tensor(self.n_channels)) 16 | self.reset_parameters() 17 | 18 | def reset_parameters(self): 19 | init.constant_(self.weight, self.gamma) 20 | 21 | def forward(self, x): 22 | norm = x.pow(2).sum(dim=1, keepdim=True).sqrt() + self.eps 23 | x = torch.div(x, norm) 24 | out = self.weight.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as(x) * x 25 | return out 26 | 27 | 28 | class S3FDNet(nn.Module): 29 | 30 | def __init__(self, config, device='cuda'): 31 | super(S3FDNet, self).__init__() 32 | self.config = config 33 | self.device = device 34 | 35 | self.vgg = nn.ModuleList([ 36 | nn.Conv2d(3, 64, 3, 1, padding=1), 37 | nn.ReLU(inplace=True), 38 | nn.Conv2d(64, 64, 3, 1, padding=1), 39 | nn.ReLU(inplace=True), 40 | nn.MaxPool2d(2, 2), 41 | 42 | nn.Conv2d(64, 128, 3, 1, padding=1), 43 | nn.ReLU(inplace=True), 44 | nn.Conv2d(128, 128, 3, 1, padding=1), 45 | nn.ReLU(inplace=True), 46 | nn.MaxPool2d(2, 2), 47 | 48 | nn.Conv2d(128, 256, 3, 1, padding=1), 49 | nn.ReLU(inplace=True), 50 | nn.Conv2d(256, 256, 3, 1, padding=1), 51 | nn.ReLU(inplace=True), 52 | nn.Conv2d(256, 256, 3, 1, padding=1), 53 | nn.ReLU(inplace=True), 54 | nn.MaxPool2d(2, 2, ceil_mode=True), 55 | 56 | nn.Conv2d(256, 512, 3, 1, padding=1), 57 | nn.ReLU(inplace=True), 58 | nn.Conv2d(512, 512, 3, 1, padding=1), 59 | nn.ReLU(inplace=True), 60 | nn.Conv2d(512, 512, 3, 1, padding=1), 61 | nn.ReLU(inplace=True), 62 | nn.MaxPool2d(2, 2), 63 | 64 | nn.Conv2d(512, 512, 3, 1, padding=1), 65 | nn.ReLU(inplace=True), 66 | nn.Conv2d(512, 512, 3, 1, padding=1), 67 | nn.ReLU(inplace=True), 68 | nn.Conv2d(512, 512, 3, 1, padding=1), 69 | nn.ReLU(inplace=True), 70 | nn.MaxPool2d(2, 2), 71 | 72 | nn.Conv2d(512, 1024, 3, 1, padding=6, dilation=6), 73 | nn.ReLU(inplace=True), 74 | nn.Conv2d(1024, 1024, 1, 1), 75 | nn.ReLU(inplace=True), 76 | ]) 77 | 78 | self.L2Norm3_3 = L2Norm(256, 10) 79 | self.L2Norm4_3 = L2Norm(512, 8) 80 | self.L2Norm5_3 = L2Norm(512, 5) 81 | 82 | self.extras = nn.ModuleList([ 83 | nn.Conv2d(1024, 256, 1, 1), 84 | nn.Conv2d(256, 512, 3, 2, padding=1), 85 | nn.Conv2d(512, 128, 1, 1), 86 | nn.Conv2d(128, 256, 3, 2, padding=1), 87 | ]) 88 | 89 | self.loc = nn.ModuleList([ 90 | nn.Conv2d(256, 4, 3, 1, padding=1), 91 | nn.Conv2d(512, 4, 3, 1, padding=1), 92 | nn.Conv2d(512, 4, 3, 1, padding=1), 93 | nn.Conv2d(1024, 4, 3, 1, padding=1), 94 | nn.Conv2d(512, 4, 3, 1, padding=1), 95 | nn.Conv2d(256, 4, 3, 1, padding=1), 96 | ]) 97 | 98 | self.conf = nn.ModuleList([ 99 | nn.Conv2d(256, 4, 3, 1, padding=1), 100 | nn.Conv2d(512, 2, 3, 1, padding=1), 101 | nn.Conv2d(512, 2, 3, 1, padding=1), 102 | nn.Conv2d(1024, 2, 3, 1, padding=1), 103 | nn.Conv2d(512, 2, 3, 1, padding=1), 104 | nn.Conv2d(256, 2, 3, 1, padding=1), 105 | ]) 106 | 107 | self.priors = None 108 | self.previous_size = None 109 | 110 | self.softmax = nn.Softmax(dim=-1) 111 | self.detect = Detect(self.config) 112 | 113 | def forward(self, x): 114 | size = x.size()[2:] 115 | sources = list() 116 | loc = list() 117 | conf = list() 118 | 119 | for k in range(16): 120 | x = self.vgg[k](x) 121 | s = self.L2Norm3_3(x) 122 | sources.append(s) 123 | 124 | for k in range(16, 23): 125 | x = self.vgg[k](x) 126 | s = self.L2Norm4_3(x) 127 | sources.append(s) 128 | 129 | for k in range(23, 30): 130 | x = self.vgg[k](x) 131 | s = self.L2Norm5_3(x) 132 | sources.append(s) 133 | 134 | for k in range(30, len(self.vgg)): 135 | x = self.vgg[k](x) 136 | sources.append(x) 137 | 138 | # apply extra layers and cache source layer outputs 139 | for k, v in enumerate(self.extras): 140 | x 
= F.relu(v(x), inplace=True) 141 | if k % 2 == 1: 142 | sources.append(x) 143 | 144 | # apply multibox head to source layers 145 | loc_x = self.loc[0](sources[0]) 146 | conf_x = self.conf[0](sources[0]) 147 | 148 | max_conf, _ = torch.max(conf_x[:, 0:3, :, :], dim=1, keepdim=True) 149 | conf_x = torch.cat((max_conf, conf_x[:, 3:, :, :]), dim=1) 150 | 151 | loc.append(loc_x.permute(0, 2, 3, 1).contiguous()) 152 | conf.append(conf_x.permute(0, 2, 3, 1).contiguous()) 153 | 154 | for i in range(1, len(sources)): 155 | x = sources[i] 156 | conf.append(self.conf[i](x).permute(0, 2, 3, 1).contiguous()) 157 | loc.append(self.loc[i](x).permute(0, 2, 3, 1).contiguous()) 158 | 159 | if self.priors is None or self.previous_size != size: 160 | with torch.no_grad(): 161 | features_maps = [] 162 | for i in range(len(loc)): 163 | feat = [] 164 | feat += [loc[i].size(1), loc[i].size(2)] 165 | features_maps += [feat] 166 | self.priors = PriorBox(size, features_maps, self.config).forward().to(self.device) 167 | self.previous_size = size 168 | 169 | loc = torch.cat([o.view(o.size(0), -1) for o in loc], 1) 170 | conf = torch.cat([o.view(o.size(0), -1) for o in conf], 1) 171 | conf = self.softmax(conf.view(conf.size(0), -1, 2)) 172 | 173 | output = self.detect(loc.view(loc.size(0), -1, 4), conf, self.priors) 174 | 175 | return output 176 | -------------------------------------------------------------------------------- /ibug/face_detection/s3fd/s3fd_predictor.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import numpy as np 4 | from types import SimpleNamespace 5 | from typing import Union, Optional 6 | from .s3fd_net import S3FDNet 7 | 8 | 9 | __all__ = ['S3FDPredictor'] 10 | 11 | 12 | class S3FDPredictor(object): 13 | def __init__(self, threshold: float = 0.8, device: Union[str, torch.device] = 'cuda:0', 14 | model: Optional[SimpleNamespace] = None, config: Optional[SimpleNamespace] = None) -> None: 15 | self.threshold = threshold 16 | self.device = device 17 | if model is None: 18 | model = S3FDPredictor.get_model() 19 | if config is None: 20 | config = S3FDPredictor.create_config() 21 | self.config = SimpleNamespace(**model.config.__dict__, **config.__dict__) 22 | self.net = S3FDNet(config=self.config, device=self.device).to(self.device) 23 | self.net.load_state_dict(torch.load(model.weights, map_location=self.device)) 24 | self.net.eval() 25 | 26 | @staticmethod 27 | def get_model(name: str = 's3fd') -> SimpleNamespace: 28 | name = name.lower().strip() 29 | if name == 's3fd': 30 | return SimpleNamespace(weights=os.path.realpath(os.path.join(os.path.dirname(__file__), 31 | 'weights', 's3fd_weights.pth')), 32 | config=SimpleNamespace(num_classes=2, variance=(0.1, 0.2), 33 | prior_min_sizes=(16, 32, 64, 128, 256, 512), 34 | prior_steps=(4, 8, 16, 32, 64, 128), prior_clip=False)) 35 | else: 36 | raise ValueError('name must be set to s3fd') 37 | 38 | @staticmethod 39 | def create_config(top_k: int = 750, conf_thresh: float = 0.05,nms_thresh: float = 0.3, 40 | nms_top_k: int = 5000, use_nms_np: bool = True) -> SimpleNamespace: 41 | return SimpleNamespace(top_k=top_k, conf_thresh=conf_thresh, nms_thresh=nms_thresh, 42 | nms_top_k=nms_top_k, use_nms_np=use_nms_np) 43 | 44 | @torch.no_grad() 45 | def __call__(self, image: np.ndarray, rgb: bool = True) -> np.ndarray: 46 | w, h = image.shape[1], image.shape[0] 47 | if not rgb: 48 | image = image[..., ::-1] 49 | image = image.astype(int) - np.array([123, 117, 104]) 50 | image = image.transpose(2, 
0, 1) 51 | image = image.reshape((1,) + image.shape) 52 | image = torch.from_numpy(image).float().to(self.device) 53 | 54 | bboxes = [] 55 | detections = self.net(image) 56 | scale = torch.Tensor([w, h, w, h]).to(detections.device) 57 | for i in range(detections.size(1)): 58 | j = 0 59 | while detections[0, i, j, 0] >= self.threshold: 60 | score = detections[0, i, j, 0] 61 | pt = (detections[0, i, j, 1:] * scale).cpu().numpy() 62 | bbox = (pt[0], pt[1], pt[2], pt[3], score) 63 | bboxes.append(bbox) 64 | j += 1 65 | if len(bboxes) > 0: 66 | return np.array(bboxes) 67 | else: 68 | return np.empty(shape=(0, 5), dtype=np.float32) 69 | -------------------------------------------------------------------------------- /ibug/face_detection/s3fd/utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from itertools import product 4 | 5 | 6 | def decode(loc, priors, variances): 7 | """Decode locations from predictions using priors to undo 8 | the encoding we did for offset regression at train time. 9 | Args: 10 | loc (tensor): location predictions for loc layers, 11 | Shape: [num_priors,4] 12 | priors (tensor): Prior boxes in center-offset form. 13 | Shape: [num_priors,4]. 14 | variances: (list[float]) Variances of priorboxes 15 | Return: 16 | decoded bounding box predictions 17 | """ 18 | 19 | boxes = torch.cat(( 20 | priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:], 21 | priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1) 22 | boxes[:, :2] -= boxes[:, 2:] / 2 23 | boxes[:, 2:] += boxes[:, :2] 24 | return boxes 25 | 26 | 27 | def nms(boxes, scores, overlap=0.5, top_k=200): 28 | """Apply non-maximum suppression at test time to avoid detecting too many 29 | overlapping bounding boxes for a given object. 30 | Args: 31 | boxes: (tensor) The location preds for the img, Shape: [num_priors,4]. 32 | scores: (tensor) The class predscores for the img, Shape:[num_priors]. 33 | overlap: (float) The overlap thresh for suppressing unnecessary boxes. 34 | top_k: (int) The Maximum number of box preds to consider. 35 | Return: 36 | The indices of the kept boxes with respect to num_priors. 37 | """ 38 | 39 | keep = scores.new(scores.size(0)).zero_().long() 40 | if boxes.numel() == 0: 41 | return keep, 0 42 | x1 = boxes[:, 0] 43 | y1 = boxes[:, 1] 44 | x2 = boxes[:, 2] 45 | y2 = boxes[:, 3] 46 | area = torch.mul(x2 - x1, y2 - y1) 47 | v, idx = scores.sort(0) # sort in ascending order 48 | # I = I[v >= 0.01] 49 | idx = idx[-top_k:] # indices of the top-k largest vals 50 | xx1 = boxes.new() 51 | yy1 = boxes.new() 52 | xx2 = boxes.new() 53 | yy2 = boxes.new() 54 | w = boxes.new() 55 | h = boxes.new() 56 | 57 | # keep = torch.Tensor() 58 | count = 0 59 | while idx.numel() > 0: 60 | i = idx[-1] # index of current largest val 61 | # keep.append(i) 62 | keep[count] = i 63 | count += 1 64 | if idx.size(0) == 1: 65 | break 66 | idx = idx[:-1] # remove kept element from view 67 | # load bboxes of next highest vals 68 | torch.index_select(x1, 0, idx, out=xx1) 69 | torch.index_select(y1, 0, idx, out=yy1) 70 | torch.index_select(x2, 0, idx, out=xx2) 71 | torch.index_select(y2, 0, idx, out=yy2) 72 | # store element-wise max with next highest score 73 | xx1 = torch.clamp(xx1, min=x1[i]) 74 | yy1 = torch.clamp(yy1, min=y1[i]) 75 | xx2 = torch.clamp(xx2, max=x2[i]) 76 | yy2 = torch.clamp(yy2, max=y2[i]) 77 | w.resize_as_(xx2) 78 | h.resize_as_(yy2) 79 | w = xx2 - xx1 80 | h = yy2 - yy1 81 | # check sizes of xx1 and xx2.. 
after each iteration 82 | w = torch.clamp(w, min=0.0) 83 | h = torch.clamp(h, min=0.0) 84 | inter = w * h 85 | # IoU = i / (area(a) + area(b) - i) 86 | rem_areas = torch.index_select(area, 0, idx) # load remaining areas) 87 | union = (rem_areas - inter) + area[i] 88 | IoU = inter / union # store result in iou 89 | # keep only elements with an IoU <= overlap 90 | idx = idx[IoU.le(overlap)] 91 | return keep, count 92 | 93 | 94 | def nms_np(boxes, scores, overlap=0.5, top_k=200): 95 | """Apply non-maximum suppression at test time to avoid detecting too many 96 | overlapping bounding boxes for a given object, using numpy (for speed). 97 | Args: 98 | boxes: (tensor) The location preds for the img, Shape: [num_priors,4]. 99 | scores: (tensor) The class predscores for the img, Shape:[num_priors]. 100 | overlap: (float) The overlap thresh for suppressing unnecessary boxes. 101 | top_k: (int) The Maximum number of box preds to consider. 102 | Return: 103 | The indices of the kept boxes with respect to num_priors. 104 | """ 105 | 106 | if scores.size(0) == 0: 107 | return [], 0 108 | else: 109 | areas = torch.mul(boxes[:, 2] - boxes[:, 0], boxes[:, 3] - boxes[:, 1]).cpu().numpy() 110 | x1, y1 = boxes[:, 0].cpu().numpy(), boxes[:, 1].cpu().numpy() 111 | x2, y2 = boxes[:, 2].cpu().numpy(), boxes[:, 3].cpu().numpy() 112 | scores = scores.cpu().numpy() 113 | order = scores.argsort()[: -top_k - 1: -1] 114 | 115 | keep = [] 116 | while order.size > 0: 117 | i = order[0] 118 | keep.append(i) 119 | xx1, yy1 = np.maximum(x1[i], x1[order[1:]]), np.maximum(y1[i], y1[order[1:]]) 120 | xx2, yy2 = np.minimum(x2[i], x2[order[1:]]), np.minimum(y2[i], y2[order[1:]]) 121 | 122 | w, h = np.maximum(0.0, xx2 - xx1), np.maximum(0.0, yy2 - yy1) 123 | ovr = w * h / (areas[i] + areas[order[1:]] - w * h) 124 | 125 | inds = np.where(ovr <= overlap)[0] 126 | order = order[inds + 1] 127 | 128 | return keep, len(keep) 129 | 130 | 131 | class Detect(object): 132 | 133 | def __init__(self, config): 134 | 135 | self.config = config 136 | 137 | def __call__(self, loc_data, conf_data, prior_data): 138 | 139 | num = loc_data.size(0) 140 | num_priors = prior_data.size(0) 141 | 142 | conf_preds = conf_data.view(num, num_priors, self.config.num_classes).transpose(2, 1) 143 | batch_priors = prior_data.view(-1, num_priors, 4).expand(num, num_priors, 4) 144 | batch_priors = batch_priors.contiguous().view(-1, 4) 145 | 146 | decoded_boxes = decode(loc_data.view(-1, 4), batch_priors, self.config.variance) 147 | decoded_boxes = decoded_boxes.view(num, num_priors, 4) 148 | 149 | output = torch.zeros(num, self.config.num_classes, self.config.top_k, 5) 150 | 151 | for i in range(num): 152 | boxes = decoded_boxes[i].clone() 153 | conf_scores = conf_preds[i].clone() 154 | 155 | for cl in range(1, self.config.num_classes): 156 | c_mask = conf_scores[cl].gt(self.config.conf_thresh) 157 | scores = conf_scores[cl][c_mask] 158 | 159 | if scores.dim() == 0: 160 | continue 161 | l_mask = c_mask.unsqueeze(1).expand_as(boxes) 162 | boxes_ = boxes[l_mask].view(-1, 4) 163 | if self.config.use_nms_np: 164 | ids, count = nms_np(boxes_, scores, self.config.nms_thresh, self.config.nms_top_k) 165 | else: 166 | ids, count = nms(boxes_, scores, self.config.nms_thresh, self.config.nms_top_k) 167 | count = count if count < self.config.top_k else self.config.top_k 168 | 169 | output[i, cl, :count] = torch.cat((scores[ids[:count]].unsqueeze(1), boxes_[ids[:count]]), 1) 170 | 171 | return output 172 | 173 | 174 | class PriorBox(object): 175 | 176 | def __init__(self, 
input_size, feature_maps, config): 177 | 178 | self.imh = input_size[0] 179 | self.imw = input_size[1] 180 | self.feature_maps = feature_maps 181 | 182 | self.config = config 183 | 184 | def forward(self): 185 | mean = [] 186 | for k, fmap in enumerate(self.feature_maps): 187 | feath = fmap[0] 188 | featw = fmap[1] 189 | for i, j in product(range(feath), range(featw)): 190 | f_kw = self.imw / self.config.prior_steps[k] 191 | f_kh = self.imh / self.config.prior_steps[k] 192 | 193 | cx = (j + 0.5) / f_kw 194 | cy = (i + 0.5) / f_kh 195 | 196 | s_kw = self.config.prior_min_sizes[k] / self.imw 197 | s_kh = self.config.prior_min_sizes[k] / self.imh 198 | 199 | mean += [cx, cy, s_kw, s_kh] 200 | 201 | output = torch.FloatTensor(mean).view(-1, 4) 202 | 203 | if self.config.prior_clip: 204 | output.clamp_(max=1, min=0) 205 | 206 | return output 207 | -------------------------------------------------------------------------------- /ibug/face_detection/s3fd/weights/s3fd_weights.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hhj1897/face_detection/db2a4e8eae8c9c53385ff0773e9db08f03cf21ad/ibug/face_detection/s3fd/weights/s3fd_weights.pth -------------------------------------------------------------------------------- /ibug/face_detection/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .head_pose_estimator import HeadPoseEstimator 2 | from .simple_face_tracker import SimpleFaceTracker 3 | -------------------------------------------------------------------------------- /ibug/face_detection/utils/data/bfm_lms.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hhj1897/face_detection/db2a4e8eae8c9c53385ff0773e9db08f03cf21ad/ibug/face_detection/utils/data/bfm_lms.npy -------------------------------------------------------------------------------- /ibug/face_detection/utils/head_pose_estimator.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import math 4 | import numpy as np 5 | from typing import Optional, Tuple 6 | 7 | 8 | __all__ = ['HeadPoseEstimator'] 9 | 10 | 11 | class HeadPoseEstimator(object): 12 | def __init__(self, mean_shape_path: str = os.path.join(os.path.dirname(__file__), 13 | 'data', 'bfm_lms.npy')) -> None: 14 | # Load the 68-point mean shape derived from BFM 15 | mean_shape = np.load(mean_shape_path) 16 | 17 | # Calculate the 5-points mean shape 18 | left_eye = mean_shape[[37, 38, 40, 41]].mean(axis=0) 19 | right_eye = mean_shape[[43, 44, 46, 47]].mean(axis=0) 20 | self._mean_shape_5pts = np.vstack((left_eye, right_eye, mean_shape[[30, 48, 54]])) 21 | 22 | # Flip the y coordinates of the mean shape to match that of the image coordinate system 23 | self._mean_shape_5pts[:, 1] = -self._mean_shape_5pts[:, 1] 24 | 25 | def __call__(self, landmarks: np.ndarray, image_width: int = 0, image_height: int = 0, 26 | camera_matrix: Optional[np.ndarray] = None, dist_coeffs: Optional[np.ndarray] = None, 27 | output_preference: int = 0) -> Tuple[float, float, float]: 28 | # Form the camera matrix 29 | if camera_matrix is None: 30 | if image_width <= 0 or image_height <= 0: 31 | raise ValueError( 32 | 'image_width and image_height must be specified when camera_matrix is not given directly') 33 | else: 34 | camera_matrix = np.array([[image_width + image_height, 0, image_width / 2.0], 35 | [0, image_width + image_height, image_height / 
2.0], 36 | [0, 0, 1]], dtype=float) 37 | 38 | # Prepare the landmarks 39 | if landmarks.shape[0] == 68: 40 | landmarks = landmarks[17:] 41 | if landmarks.shape[0] in [49, 51]: 42 | left_eye = landmarks[[20, 21, 23, 24]].mean(axis=0) 43 | right_eye = landmarks[[26, 27, 29, 30]].mean(axis=0) 44 | landmarks = np.vstack((left_eye, right_eye, landmarks[[13, 31, 37]])) 45 | 46 | # Use EPnP to estimate pitch, yaw, and roll 47 | _, rvec, _ = cv2.solvePnP(self._mean_shape_5pts, np.expand_dims(landmarks, axis=1), 48 | camera_matrix, dist_coeffs, flags=cv2.SOLVEPNP_EPNP) 49 | rot_mat, _ = cv2.Rodrigues(rvec) 50 | if 1.0 + rot_mat[2, 0] < 1e-9: 51 | pitch = 0.0 52 | yaw = 90.0 53 | roll = -math.atan2(rot_mat[0, 1], rot_mat[0, 2]) / math.pi * 180.0 54 | elif 1.0 - rot_mat[2, 0] < 1e-9: 55 | pitch = 0.0 56 | yaw = -90.0 57 | roll = math.atan2(-rot_mat[0, 1], -rot_mat[0, 2]) / math.pi * 180.0 58 | else: 59 | pitch = math.atan2(rot_mat[2, 1], rot_mat[2, 2]) / math.pi * 180.0 60 | yaw = -math.asin(rot_mat[2, 0]) / math.pi * 180.0 61 | roll = math.atan2(rot_mat[1, 0], rot_mat[0, 0]) / math.pi * 180.0 62 | 63 | # Respond to output_preference: 64 | # output_preference == 1: limit pitch to the range of -90.0 ~ 90.0 65 | # output_preference == 2: limit yaw to the range of -90.0 ~ 90.0 (already satisfied) 66 | # output_preference == 3: limit roll to the range of -90.0 ~ 90.0 67 | # otherwise: minimise total rotation, min(abs(pitch) + abs(yaw) + abs(roll)) 68 | if output_preference != 2: 69 | alt_pitch = pitch - 180.0 if pitch > 0.0 else pitch + 180.0 70 | alt_yaw = -180.0 - yaw if yaw < 0.0 else 180.0 - yaw 71 | alt_roll = roll - 180.0 if roll > 0.0 else roll + 180.0 72 | if (output_preference == 1 and -90.0 < alt_pitch < 90.0 or 73 | output_preference == 3 and -90.0 < alt_roll < 90.0 or 74 | output_preference not in (1, 2, 3) and 75 | abs(alt_pitch) + abs(alt_yaw) + abs(alt_roll) < abs(pitch) + abs(yaw) + abs(roll)): 76 | pitch, yaw, roll = alt_pitch, alt_yaw, alt_roll 77 | 78 | return -pitch, yaw, roll 79 | -------------------------------------------------------------------------------- /ibug/face_detection/utils/simple_face_tracker.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from typing import List, Optional 3 | from scipy.optimize import linear_sum_assignment 4 | 5 | 6 | __all__ = ['SimpleFaceTracker'] 7 | 8 | 9 | class SimpleFaceTracker(object): 10 | def __init__(self, iou_threshold: float = 0.4, minimum_face_size: float = 0.0) -> None: 11 | self._iou_threshold = iou_threshold 12 | self._minimum_face_size = minimum_face_size 13 | self._tracklets = [] 14 | self._tracklet_counter = 0 15 | 16 | @property 17 | def iou_threshold(self) -> float: 18 | return self._iou_threshold 19 | 20 | @iou_threshold.setter 21 | def iou_threshold(self, threshold: float) -> None: 22 | self._iou_threshold = threshold 23 | 24 | @property 25 | def minimum_face_size(self) -> float: 26 | return self._minimum_face_size 27 | 28 | @minimum_face_size.setter 29 | def minimum_face_size(self, face_size: float) -> None: 30 | self._minimum_face_size = face_size 31 | 32 | def __call__(self, face_boxes: np.ndarray) -> List[Optional[int]]: 33 | if face_boxes.size <= 0: 34 | self._tracklets = [] 35 | return [] 36 | 37 | # Calculate area of the faces 38 | face_areas = np.abs((face_boxes[:, 2] - face_boxes[:, 0]) * (face_boxes[:, 3] - face_boxes[:, 1])) 39 | 40 | # Prepare tracklets 41 | for tracklet in self._tracklets: 42 | tracklet['tracked'] = False 43 | 44 | # Calculate the 
distance matrix based on IOU 45 | iou_distance_threshold = np.clip(1.0 - self._iou_threshold, 0.0, 1.0) 46 | min_face_area = max(self._minimum_face_size ** 2, np.finfo(float).eps) 47 | distances = np.full(shape=(face_boxes.shape[0], len(self._tracklets)), 48 | fill_value=2.0 * min(face_boxes.shape[0], len(self._tracklets)), dtype=float) 49 | for row, face_box in enumerate(face_boxes): 50 | if face_areas[row] >= min_face_area: 51 | for col, tracklet in enumerate(self._tracklets): 52 | x_left = max(min(face_box[0], face_box[2]), min(tracklet['bbox'][0], tracklet['bbox'][2])) 53 | y_top = max(min(face_box[1], face_box[3]), min(tracklet['bbox'][1], tracklet['bbox'][3])) 54 | x_right = min(max(face_box[2], face_box[0]), max(tracklet['bbox'][2], tracklet['bbox'][0])) 55 | y_bottom = min(max(face_box[3], face_box[1]), max(tracklet['bbox'][3], tracklet['bbox'][1])) 56 | if x_right <= x_left or y_bottom <= y_top: 57 | distance = 1.0 58 | else: 59 | intersection_area = (x_right - x_left) * (y_bottom - y_top) 60 | distance = 1.0 - intersection_area / float(face_areas[row] + tracklet['area'] - 61 | intersection_area) 62 | if distance <= iou_distance_threshold: 63 | distances[row, col] = distance 64 | 65 | # ID assignment 66 | tracked_ids = [None] * face_boxes.shape[0] 67 | for row, col in zip(*linear_sum_assignment(distances)): 68 | if distances[row, col] <= iou_distance_threshold: 69 | tracked_ids[row] = self._tracklets[col]['id'] 70 | self._tracklets[col]['bbox'] = face_boxes[row, :4].copy() 71 | self._tracklets[col]['area'] = face_areas[row] 72 | self._tracklets[col]['tracked'] = True 73 | 74 | # Remove expired tracklets 75 | self._tracklets = [x for x in self._tracklets if x['tracked']] 76 | 77 | # Register new faces 78 | for idx, face_box in enumerate(face_boxes): 79 | if face_areas[idx] >= min_face_area and tracked_ids[idx] is None: 80 | self._tracklet_counter += 1 81 | self._tracklets.append({'bbox': face_box[:4].copy(), 'area': face_areas[idx], 82 | 'id': self._tracklet_counter, 'tracked': True}) 83 | tracked_ids[idx] = self._tracklets[-1]['id'] 84 | 85 | return tracked_ids 86 | 87 | def reset(self, reset_tracklet_counter: bool = True) -> None: 88 | self._tracklets = [] 89 | if reset_tracklet_counter: 90 | self._tracklet_counter = 0 91 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.16.0 2 | scipy>=1.1.0 3 | torch>=1.1.0 4 | torchvision>=0.3.0 5 | opencv-python>= 3.4.2 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import shutil 4 | from setuptools import setup 5 | 6 | 7 | def clean_repo(): 8 | repo_folder = os.path.realpath(os.path.dirname(__file__)) 9 | dist_folder = os.path.join(repo_folder, 'dist') 10 | build_folder = os.path.join(repo_folder, 'build') 11 | if os.path.isdir(dist_folder): 12 | shutil.rmtree(dist_folder, ignore_errors=True) 13 | if os.path.isdir(build_folder): 14 | shutil.rmtree(build_folder, ignore_errors=True) 15 | 16 | 17 | # Read version string 18 | _version = None 19 | script_folder = os.path.realpath(os.path.dirname(__file__)) 20 | with open(os.path.join(script_folder, 'ibug', 'face_detection', '__init__.py')) as init: 21 | for line in init.read().splitlines(): 22 | fields = line.replace('=', ' ').replace('\'', ' ').replace('\"', ' ').replace('\t', ' ').split() 23 
| if len(fields) >= 2 and fields[0] == '__version__': 24 | _version = fields[1] 25 | break 26 | if _version is None: 27 | sys.exit('Sorry, cannot find version information.') 28 | 29 | # Installation 30 | config = { 31 | 'name': 'ibug_face_detection', 32 | 'version': _version, 33 | 'description': 'A collection of pretrained face detectors including S3FD and RetinaFace.', 34 | 'author': 'Jie Shen', 35 | 'author_email': 'js1907@imperial.ac.uk', 36 | 'packages': ['ibug.face_detection'], 37 | 'install_requires': ['numpy>=1.16.0', 'scipy>=1.1.0', 'torch>=1.1.0', 38 | 'torchvision>=0.3.0', 'opencv-python>= 3.4.2'], 39 | 'zip_safe': False 40 | } 41 | clean_repo() 42 | setup(**config) 43 | clean_repo() 44 | --------------------------------------------------------------------------------
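
The short sketch below is not part of the repository; it is a minimal, hedged example of how the S3FD detector and the simple face tracker defined above could be combined on a video stream. The video path ('video.mp4'), the CUDA device string, and the chosen thresholds are placeholder assumptions; only the classes, constructor parameters, and call signatures shown in the files above are relied upon.

# Minimal usage sketch (assumptions: 'video.mp4' exists and a CUDA device is available).
import cv2
from ibug.face_detection import S3FDPredictor
from ibug.face_detection.utils import SimpleFaceTracker

# S3FD detector with the bundled weights; threshold and device are placeholder choices.
detector = S3FDPredictor(threshold=0.8, device='cuda:0',
                         model=S3FDPredictor.get_model('s3fd'))

# IOU-based tracker; faces smaller than 64x64 pixels are ignored.
tracker = SimpleFaceTracker(iou_threshold=0.4, minimum_face_size=64)

video = cv2.VideoCapture('video.mp4')
while True:
    ret, frame = video.read()
    if not ret:
        break

    # OpenCV frames are B-G-R, hence rgb=False. S3FD returns an Nx5 matrix:
    # left, top, right, bottom, confidence (or an empty 0x5 array).
    faces = detector(frame, rgb=False)

    # The tracker only needs the box coordinates; it returns one tracklet ID
    # (>= 1) per detected face, or None for faces it cannot track.
    ids = tracker(faces[:, :4])

    for (left, top, right, bottom, score), face_id in zip(faces, ids):
        print(face_id, score)

video.release()
tracker.reset()

Since S3FD does not output facial landmarks, HeadPoseEstimator is deliberately omitted from this sketch; pose estimation requires a detector or landmark model that provides at least the 5-point layout expected by head_pose_estimator.py.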