├── LICENSE ├── PASSify ├── README.md ├── face │ ├── __init__.py │ ├── convert_to_onnx.py │ ├── data │ │ ├── __init__.py │ │ ├── config.py │ │ ├── data_augment.py │ │ └── wider_face.py │ ├── detect.py │ ├── layers │ │ ├── __init__.py │ │ ├── functions │ │ │ ├── __init__.py │ │ │ └── prior_box.py │ │ └── modules │ │ │ ├── __init__.py │ │ │ └── multibox_loss.py │ ├── main_face_detector.py │ ├── models │ │ ├── __init__.py │ │ ├── net.py │ │ └── retinaface.py │ ├── sbatch_face_example.sh │ ├── train.py │ ├── utils │ │ ├── __init__.py │ │ ├── box_utils.py │ │ ├── nms │ │ │ ├── __init__.py │ │ │ └── py_cpu_nms.py │ │ └── timer.py │ └── weights │ │ └── mobilenet0.25_Final.pth ├── passify.py └── person │ ├── __init__.py │ ├── cascade_rcnn.yaml │ ├── main_person_detector.py │ └── sbatch_person_example.sh ├── README.md ├── download.sh ├── hubconf.py ├── img.png ├── pass.gif ├── version_history.txt └── vision_transformer.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Yuki M. Asano 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /PASSify/README.md: -------------------------------------------------------------------------------- 1 | # PASSify your dataset 2 | Here we provide the automated scripts that remove humans from your dataset using a face detector and a person detector. 3 | This automated procedure does **not** guarantee full exclusion of humans or personally identifiable information (e.g. licence plates will still be present, some humans might slip through). 4 | However, with this you can find out: 5 | * roughly how much of your dataset includes humans and human faces 6 | * how much your model performance changes when trained on the PASSified version of your dataset. 7 | 8 | ## Running instructions 9 | For all of the following, we provide instructions for running commands on a SLURM-managed cluster, but you can tailor them to run on a single machine too. 10 | You will very likely need to adapt the SLURM headers slightly to fit your cluster; you can find them in `passify.py`. 11 | Have your dataset of images in a structure such as `/path/to/dataset/{folders}/{imagename}`. 12 | 13 | 1. Face detector 14 | We start with the face detector, as it is cheaper and can be run on CPUs.
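Note: the SLURM array job in `face/sbatch_face_example.sh` reads a list of all image paths from an `all_files.txt` file and works through it in chunks of 80,000 images per array task, so the `--array=0-X` upper bound in its header should be set to roughly the number of images divided by 80,000.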
15 | ```sh 16 | DATA_DIRECTORY=/path/to/dataset/ 17 | python passify.py 0 $DATA_DIRECTORY 18 | sbatch face/sbatch_face.sh 19 | ``` 20 | 21 | 2. Person detector 22 | Next we run the person detector on GPUs. For this you need to have installed the [detectron2 repo](https://github.com/facebookresearch/detectron2). 23 | ```sh 24 | 25 | python passify.py 1 26 | sbatch person/sbatch_person.sh 27 | ``` 28 | 29 | 3. Final list 30 | Finally, count the files that you're left with. 31 | ```sh 32 | python passify.py 2 33 | ``` 34 | 35 | ## References 36 | This work relies on two excellent repos: 37 | 38 | The face detector is from [Retinaface](https://github.com/biubug6/Pytorch_Retinaface) (MIT Licence) 39 | ``` 40 | @inproceedings{deng2019retinaface, 41 | title={RetinaFace: Single-stage Dense Face Localisation in the Wild}, 42 | author={Deng, Jiankang and Guo, Jia and Yuxiang, Zhou and Jinke Yu and Irene Kotsia and Zafeiriou, Stefanos}, 43 | booktitle={arxiv}, 44 | year={2019} } 45 | ``` 46 | The person detector is from the [detectron2 repo](https://github.com/facebookresearch/detectron2) (Apache Licence), specifically the Cascade-RCNN model trained with the 3x schedule. 47 | ``` 48 | @misc{wu2019detectron2, 49 | author = {Yuxin Wu and Alexander Kirillov and Francisco Massa and 50 | Wan-Yen Lo and Ross Girshick}, 51 | title = {Detectron2}, 52 | howpublished = {\url{https://github.com/facebookresearch/detectron2}}, 53 | year = {2019} 54 | } 55 | ``` 56 | ## Citation 57 | If you find this useful, please consider citing: 58 | ``` 59 | @Article{asano21pass, 60 | author = "Yuki M. Asano and Christian Rupprecht and Andrew Zisserman and Andrea Vedaldi", 61 | title = "PASS: An ImageNet replacement for self-supervised pretraining without humans", 62 | journal = "NeurIPS Track on Datasets and Benchmarks", 63 | year = "2021" 64 | } 65 | ``` 66 | -------------------------------------------------------------------------------- /PASSify/face/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yukimasano/PASS/e91b4fc9daf219c765ec5816610e76306de0150c/PASSify/face/__init__.py -------------------------------------------------------------------------------- /PASSify/face/convert_to_onnx.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | import argparse 4 | import torch 5 | from data import cfg_mnet, cfg_re50 6 | from layers.functions.prior_box import PriorBox 7 | from models.retinaface import RetinaFace 8 | 9 | parser = argparse.ArgumentParser(description='Test') 10 | parser.add_argument('-m', '--trained_model', default='./weights/mobilenet0.25_Final.pth', 11 | type=str, help='Trained state_dict file path to open') 12 | parser.add_argument('--network', default='mobile0.25', help='Backbone network mobile0.25 or resnet50') 13 | parser.add_argument('--long_side', default=640, help='when origin_size is false, long_side is scaled size(320 or 640 for long side)') 14 | parser.add_argument('--cpu', action="store_true", default=True, help='Use cpu inference') 15 | 16 | args = parser.parse_args() 17 | 18 | 19 | def check_keys(model, pretrained_state_dict): 20 | ckpt_keys = set(pretrained_state_dict.keys()) 21 | model_keys = set(model.state_dict().keys()) 22 | used_pretrained_keys = model_keys & ckpt_keys 23 | unused_pretrained_keys = ckpt_keys - model_keys 24 | missing_keys = model_keys - ckpt_keys 25 | print('Missing keys:{}'.format(len(missing_keys))) 26 | print('Unused checkpoint 
keys:{}'.format(len(unused_pretrained_keys))) 27 | print('Used keys:{}'.format(len(used_pretrained_keys))) 28 | assert len(used_pretrained_keys) > 0, 'load NONE from pretrained checkpoint' 29 | return True 30 | 31 | 32 | def remove_prefix(state_dict, prefix): 33 | ''' Old style model is stored with all names of parameters sharing common prefix 'module.' ''' 34 | print('remove prefix \'{}\''.format(prefix)) 35 | f = lambda x: x.split(prefix, 1)[-1] if x.startswith(prefix) else x 36 | return {f(key): value for key, value in state_dict.items()} 37 | 38 | 39 | def load_model(model, pretrained_path, load_to_cpu): 40 | print('Loading pretrained model from {}'.format(pretrained_path)) 41 | if load_to_cpu: 42 | pretrained_dict = torch.load(pretrained_path, map_location=lambda storage, loc: storage) 43 | else: 44 | device = torch.cuda.current_device() 45 | pretrained_dict = torch.load(pretrained_path, map_location=lambda storage, loc: storage.cuda(device)) 46 | if "state_dict" in pretrained_dict.keys(): 47 | pretrained_dict = remove_prefix(pretrained_dict['state_dict'], 'module.') 48 | else: 49 | pretrained_dict = remove_prefix(pretrained_dict, 'module.') 50 | check_keys(model, pretrained_dict) 51 | model.load_state_dict(pretrained_dict, strict=False) 52 | return model 53 | 54 | 55 | if __name__ == '__main__': 56 | torch.set_grad_enabled(False) 57 | cfg = None 58 | if args.network == "mobile0.25": 59 | cfg = cfg_mnet 60 | elif args.network == "resnet50": 61 | cfg = cfg_re50 62 | # net and model 63 | net = RetinaFace(cfg=cfg, phase = 'test') 64 | net = load_model(net, args.trained_model, args.cpu) 65 | net.eval() 66 | print('Finished loading model!') 67 | print(net) 68 | device = torch.device("cpu" if args.cpu else "cuda") 69 | net = net.to(device) 70 | 71 | # ------------------------ export ----------------------------- 72 | output_onnx = 'FaceDetector.onnx' 73 | print("==> Exporting model to ONNX format at '{}'".format(output_onnx)) 74 | input_names = ["input0"] 75 | output_names = ["output0"] 76 | inputs = torch.randn(1, 3, args.long_side, args.long_side).to(device) 77 | 78 | torch_out = torch.onnx._export(net, inputs, output_onnx, export_params=True, verbose=False, 79 | input_names=input_names, output_names=output_names) 80 | 81 | 82 | -------------------------------------------------------------------------------- /PASSify/face/data/__init__.py: -------------------------------------------------------------------------------- 1 | from .wider_face import WiderFaceDetection, detection_collate 2 | from .data_augment import * 3 | from .config import * 4 | -------------------------------------------------------------------------------- /PASSify/face/data/config.py: -------------------------------------------------------------------------------- 1 | # config.py 2 | 3 | cfg_mnet = { 4 | 'name': 'mobilenet0.25', 5 | 'min_sizes': [[16, 32], [64, 128], [256, 512]], 6 | 'steps': [8, 16, 32], 7 | 'variance': [0.1, 0.2], 8 | 'clip': False, 9 | 'loc_weight': 2.0, 10 | 'gpu_train': True, 11 | 'batch_size': 32, 12 | 'ngpu': 1, 13 | 'epoch': 250, 14 | 'decay1': 190, 15 | 'decay2': 220, 16 | 'image_size': 640, 17 | 'pretrain': True, 18 | 'return_layers': {'stage1': 1, 'stage2': 2, 'stage3': 3}, 19 | 'in_channel': 32, 20 | 'out_channel': 64 21 | } 22 | 23 | cfg_re50 = { 24 | 'name': 'Resnet50', 25 | 'min_sizes': [[16, 32], [64, 128], [256, 512]], 26 | 'steps': [8, 16, 32], 27 | 'variance': [0.1, 0.2], 28 | 'clip': False, 29 | 'loc_weight': 2.0, 30 | 'gpu_train': True, 31 | 'batch_size': 24, 32 | 'ngpu': 4, 33 | 
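    # Note: 'decay1' and 'decay2' below are the epochs at which train.py lowers the learning
    # rate (stepvalues = (decay1 * epoch_size, decay2 * epoch_size)), and 'image_size' is the
    # square crop side used by the training-time preproc; none of these affect inference.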
'epoch': 100, 34 | 'decay1': 70, 35 | 'decay2': 90, 36 | 'image_size': 840, 37 | 'pretrain': True, 38 | 'return_layers': {'layer2': 1, 'layer3': 2, 'layer4': 3}, 39 | 'in_channel': 256, 40 | 'out_channel': 256 41 | } 42 | 43 | -------------------------------------------------------------------------------- /PASSify/face/data/data_augment.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import random 4 | from utils.box_utils import matrix_iof 5 | 6 | 7 | def _crop(image, boxes, labels, landm, img_dim): 8 | height, width, _ = image.shape 9 | pad_image_flag = True 10 | 11 | for _ in range(250): 12 | """ 13 | if random.uniform(0, 1) <= 0.2: 14 | scale = 1.0 15 | else: 16 | scale = random.uniform(0.3, 1.0) 17 | """ 18 | PRE_SCALES = [0.3, 0.45, 0.6, 0.8, 1.0] 19 | scale = random.choice(PRE_SCALES) 20 | short_side = min(width, height) 21 | w = int(scale * short_side) 22 | h = w 23 | 24 | if width == w: 25 | l = 0 26 | else: 27 | l = random.randrange(width - w) 28 | if height == h: 29 | t = 0 30 | else: 31 | t = random.randrange(height - h) 32 | roi = np.array((l, t, l + w, t + h)) 33 | 34 | value = matrix_iof(boxes, roi[np.newaxis]) 35 | flag = (value >= 1) 36 | if not flag.any(): 37 | continue 38 | 39 | centers = (boxes[:, :2] + boxes[:, 2:]) / 2 40 | mask_a = np.logical_and(roi[:2] < centers, centers < roi[2:]).all(axis=1) 41 | boxes_t = boxes[mask_a].copy() 42 | labels_t = labels[mask_a].copy() 43 | landms_t = landm[mask_a].copy() 44 | landms_t = landms_t.reshape([-1, 5, 2]) 45 | 46 | if boxes_t.shape[0] == 0: 47 | continue 48 | 49 | image_t = image[roi[1]:roi[3], roi[0]:roi[2]] 50 | 51 | boxes_t[:, :2] = np.maximum(boxes_t[:, :2], roi[:2]) 52 | boxes_t[:, :2] -= roi[:2] 53 | boxes_t[:, 2:] = np.minimum(boxes_t[:, 2:], roi[2:]) 54 | boxes_t[:, 2:] -= roi[:2] 55 | 56 | # landm 57 | landms_t[:, :, :2] = landms_t[:, :, :2] - roi[:2] 58 | landms_t[:, :, :2] = np.maximum(landms_t[:, :, :2], np.array([0, 0])) 59 | landms_t[:, :, :2] = np.minimum(landms_t[:, :, :2], roi[2:] - roi[:2]) 60 | landms_t = landms_t.reshape([-1, 10]) 61 | 62 | 63 | # make sure that the cropped image contains at least one face > 16 pixel at training image scale 64 | b_w_t = (boxes_t[:, 2] - boxes_t[:, 0] + 1) / w * img_dim 65 | b_h_t = (boxes_t[:, 3] - boxes_t[:, 1] + 1) / h * img_dim 66 | mask_b = np.minimum(b_w_t, b_h_t) > 0.0 67 | boxes_t = boxes_t[mask_b] 68 | labels_t = labels_t[mask_b] 69 | landms_t = landms_t[mask_b] 70 | 71 | if boxes_t.shape[0] == 0: 72 | continue 73 | 74 | pad_image_flag = False 75 | 76 | return image_t, boxes_t, labels_t, landms_t, pad_image_flag 77 | return image, boxes, labels, landm, pad_image_flag 78 | 79 | 80 | def _distort(image): 81 | 82 | def _convert(image, alpha=1, beta=0): 83 | tmp = image.astype(float) * alpha + beta 84 | tmp[tmp < 0] = 0 85 | tmp[tmp > 255] = 255 86 | image[:] = tmp 87 | 88 | image = image.copy() 89 | 90 | if random.randrange(2): 91 | 92 | #brightness distortion 93 | if random.randrange(2): 94 | _convert(image, beta=random.uniform(-32, 32)) 95 | 96 | #contrast distortion 97 | if random.randrange(2): 98 | _convert(image, alpha=random.uniform(0.5, 1.5)) 99 | 100 | image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV) 101 | 102 | #saturation distortion 103 | if random.randrange(2): 104 | _convert(image[:, :, 1], alpha=random.uniform(0.5, 1.5)) 105 | 106 | #hue distortion 107 | if random.randrange(2): 108 | tmp = image[:, :, 0].astype(int) + random.randint(-18, 18) 109 | tmp %= 180 110 | image[:, :, 
0] = tmp 111 | 112 | image = cv2.cvtColor(image, cv2.COLOR_HSV2BGR) 113 | 114 | else: 115 | 116 | #brightness distortion 117 | if random.randrange(2): 118 | _convert(image, beta=random.uniform(-32, 32)) 119 | 120 | image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV) 121 | 122 | #saturation distortion 123 | if random.randrange(2): 124 | _convert(image[:, :, 1], alpha=random.uniform(0.5, 1.5)) 125 | 126 | #hue distortion 127 | if random.randrange(2): 128 | tmp = image[:, :, 0].astype(int) + random.randint(-18, 18) 129 | tmp %= 180 130 | image[:, :, 0] = tmp 131 | 132 | image = cv2.cvtColor(image, cv2.COLOR_HSV2BGR) 133 | 134 | #contrast distortion 135 | if random.randrange(2): 136 | _convert(image, alpha=random.uniform(0.5, 1.5)) 137 | 138 | return image 139 | 140 | 141 | def _expand(image, boxes, fill, p): 142 | if random.randrange(2): 143 | return image, boxes 144 | 145 | height, width, depth = image.shape 146 | 147 | scale = random.uniform(1, p) 148 | w = int(scale * width) 149 | h = int(scale * height) 150 | 151 | left = random.randint(0, w - width) 152 | top = random.randint(0, h - height) 153 | 154 | boxes_t = boxes.copy() 155 | boxes_t[:, :2] += (left, top) 156 | boxes_t[:, 2:] += (left, top) 157 | expand_image = np.empty( 158 | (h, w, depth), 159 | dtype=image.dtype) 160 | expand_image[:, :] = fill 161 | expand_image[top:top + height, left:left + width] = image 162 | image = expand_image 163 | 164 | return image, boxes_t 165 | 166 | 167 | def _mirror(image, boxes, landms): 168 | _, width, _ = image.shape 169 | if random.randrange(2): 170 | image = image[:, ::-1] 171 | boxes = boxes.copy() 172 | boxes[:, 0::2] = width - boxes[:, 2::-2] 173 | 174 | # landm 175 | landms = landms.copy() 176 | landms = landms.reshape([-1, 5, 2]) 177 | landms[:, :, 0] = width - landms[:, :, 0] 178 | tmp = landms[:, 1, :].copy() 179 | landms[:, 1, :] = landms[:, 0, :] 180 | landms[:, 0, :] = tmp 181 | tmp1 = landms[:, 4, :].copy() 182 | landms[:, 4, :] = landms[:, 3, :] 183 | landms[:, 3, :] = tmp1 184 | landms = landms.reshape([-1, 10]) 185 | 186 | return image, boxes, landms 187 | 188 | 189 | def _pad_to_square(image, rgb_mean, pad_image_flag): 190 | if not pad_image_flag: 191 | return image 192 | height, width, _ = image.shape 193 | long_side = max(width, height) 194 | image_t = np.empty((long_side, long_side, 3), dtype=image.dtype) 195 | image_t[:, :] = rgb_mean 196 | image_t[0:0 + height, 0:0 + width] = image 197 | return image_t 198 | 199 | 200 | def _resize_subtract_mean(image, insize, rgb_mean): 201 | interp_methods = [cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_AREA, cv2.INTER_NEAREST, cv2.INTER_LANCZOS4] 202 | interp_method = interp_methods[random.randrange(5)] 203 | image = cv2.resize(image, (insize, insize), interpolation=interp_method) 204 | image = image.astype(np.float32) 205 | image -= rgb_mean 206 | return image.transpose(2, 0, 1) 207 | 208 | 209 | class preproc(object): 210 | 211 | def __init__(self, img_dim, rgb_means): 212 | self.img_dim = img_dim 213 | self.rgb_means = rgb_means 214 | 215 | def __call__(self, image, targets): 216 | assert targets.shape[0] > 0, "this image does not have gt" 217 | 218 | boxes = targets[:, :4].copy() 219 | labels = targets[:, -1].copy() 220 | landm = targets[:, 4:-1].copy() 221 | 222 | image_t, boxes_t, labels_t, landm_t, pad_image_flag = _crop(image, boxes, labels, landm, self.img_dim) 223 | image_t = _distort(image_t) 224 | image_t = _pad_to_square(image_t,self.rgb_means, pad_image_flag) 225 | image_t, boxes_t, landm_t = _mirror(image_t, boxes_t, landm_t) 
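        # What follows resizes the crop to img_dim x img_dim (with per-channel mean subtraction)
        # and normalises boxes and landmarks by the pre-resize crop width/height, so the
        # returned targets are in [0, 1] coordinates.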
226 | height, width, _ = image_t.shape 227 | image_t = _resize_subtract_mean(image_t, self.img_dim, self.rgb_means) 228 | boxes_t[:, 0::2] /= width 229 | boxes_t[:, 1::2] /= height 230 | 231 | landm_t[:, 0::2] /= width 232 | landm_t[:, 1::2] /= height 233 | 234 | labels_t = np.expand_dims(labels_t, 1) 235 | targets_t = np.hstack((boxes_t, landm_t, labels_t)) 236 | 237 | return image_t, targets_t 238 | -------------------------------------------------------------------------------- /PASSify/face/data/wider_face.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path 3 | import sys 4 | import torch 5 | import torch.utils.data as data 6 | import cv2 7 | import numpy as np 8 | 9 | class WiderFaceDetection(data.Dataset): 10 | def __init__(self, txt_path, preproc=None): 11 | self.preproc = preproc 12 | self.imgs_path = [] 13 | self.words = [] 14 | f = open(txt_path,'r') 15 | lines = f.readlines() 16 | isFirst = True 17 | labels = [] 18 | for line in lines: 19 | line = line.rstrip() 20 | if line.startswith('#'): 21 | if isFirst is True: 22 | isFirst = False 23 | else: 24 | labels_copy = labels.copy() 25 | self.words.append(labels_copy) 26 | labels.clear() 27 | path = line[2:] 28 | path = txt_path.replace('label.txt','images/') + path 29 | self.imgs_path.append(path) 30 | else: 31 | line = line.split(' ') 32 | label = [float(x) for x in line] 33 | labels.append(label) 34 | 35 | self.words.append(labels) 36 | 37 | def __len__(self): 38 | return len(self.imgs_path) 39 | 40 | def __getitem__(self, index): 41 | img = cv2.imread(self.imgs_path[index]) 42 | height, width, _ = img.shape 43 | 44 | labels = self.words[index] 45 | annotations = np.zeros((0, 15)) 46 | if len(labels) == 0: 47 | return annotations 48 | for idx, label in enumerate(labels): 49 | annotation = np.zeros((1, 15)) 50 | # bbox 51 | annotation[0, 0] = label[0] # x1 52 | annotation[0, 1] = label[1] # y1 53 | annotation[0, 2] = label[0] + label[2] # x2 54 | annotation[0, 3] = label[1] + label[3] # y2 55 | 56 | # landmarks 57 | annotation[0, 4] = label[4] # l0_x 58 | annotation[0, 5] = label[5] # l0_y 59 | annotation[0, 6] = label[7] # l1_x 60 | annotation[0, 7] = label[8] # l1_y 61 | annotation[0, 8] = label[10] # l2_x 62 | annotation[0, 9] = label[11] # l2_y 63 | annotation[0, 10] = label[13] # l3_x 64 | annotation[0, 11] = label[14] # l3_y 65 | annotation[0, 12] = label[16] # l4_x 66 | annotation[0, 13] = label[17] # l4_y 67 | if (annotation[0, 4]<0): 68 | annotation[0, 14] = -1 69 | else: 70 | annotation[0, 14] = 1 71 | 72 | annotations = np.append(annotations, annotation, axis=0) 73 | target = np.array(annotations) 74 | if self.preproc is not None: 75 | img, target = self.preproc(img, target) 76 | 77 | return torch.from_numpy(img), target 78 | 79 | def detection_collate(batch): 80 | """Custom collate fn for dealing with batches of images that have a different 81 | number of associated object annotations (bounding boxes). 
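    (Used as the collate_fn of the training DataLoader in train.py.)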
82 | 83 | Arguments: 84 | batch: (tuple) A tuple of tensor images and lists of annotations 85 | 86 | Return: 87 | A tuple containing: 88 | 1) (tensor) batch of images stacked on their 0 dim 89 | 2) (list of tensors) annotations for a given image are stacked on 0 dim 90 | """ 91 | targets = [] 92 | imgs = [] 93 | for _, sample in enumerate(batch): 94 | for _, tup in enumerate(sample): 95 | if torch.is_tensor(tup): 96 | imgs.append(tup) 97 | elif isinstance(tup, type(np.empty(0))): 98 | annos = torch.from_numpy(tup).float() 99 | targets.append(annos) 100 | 101 | return (torch.stack(imgs, 0), targets) 102 | -------------------------------------------------------------------------------- /PASSify/face/detect.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import argparse 3 | import torch 4 | import torch.backends.cudnn as cudnn 5 | import numpy as np 6 | from data import cfg_mnet, cfg_re50 7 | from layers.functions.prior_box import PriorBox 8 | from experts.face.utils import py_cpu_nms 9 | import cv2 10 | from models.retinaface import RetinaFace 11 | from experts.face.utils import decode, decode_landm 12 | import time 13 | 14 | parser = argparse.ArgumentParser(description='Retinaface') 15 | 16 | parser.add_argument('-m', '--trained_model', default='./weights/Resnet50_Final.pth', 17 | type=str, help='Trained state_dict file path to open') 18 | parser.add_argument('--network', default='resnet50', help='Backbone network mobile0.25 or resnet50') 19 | parser.add_argument('--cpu', action="store_true", default=False, help='Use cpu inference') 20 | parser.add_argument('--confidence_threshold', default=0.02, type=float, help='confidence_threshold') 21 | parser.add_argument('--top_k', default=5000, type=int, help='top_k') 22 | parser.add_argument('--nms_threshold', default=0.4, type=float, help='nms_threshold') 23 | parser.add_argument('--keep_top_k', default=750, type=int, help='keep_top_k') 24 | parser.add_argument('-s', '--save_image', action="store_true", default=True, help='show detection results') 25 | parser.add_argument('--vis_thres', default=0.6, type=float, help='visualization_threshold') 26 | args = parser.parse_args() 27 | 28 | 29 | def check_keys(model, pretrained_state_dict): 30 | ckpt_keys = set(pretrained_state_dict.keys()) 31 | model_keys = set(model.state_dict().keys()) 32 | used_pretrained_keys = model_keys & ckpt_keys 33 | unused_pretrained_keys = ckpt_keys - model_keys 34 | missing_keys = model_keys - ckpt_keys 35 | print('Missing keys:{}'.format(len(missing_keys))) 36 | print('Unused checkpoint keys:{}'.format(len(unused_pretrained_keys))) 37 | print('Used keys:{}'.format(len(used_pretrained_keys))) 38 | assert len(used_pretrained_keys) > 0, 'load NONE from pretrained checkpoint' 39 | return True 40 | 41 | 42 | def remove_prefix(state_dict, prefix): 43 | ''' Old style model is stored with all names of parameters sharing common prefix 'module.' 
''' 44 | print('remove prefix \'{}\''.format(prefix)) 45 | f = lambda x: x.split(prefix, 1)[-1] if x.startswith(prefix) else x 46 | return {f(key): value for key, value in state_dict.items()} 47 | 48 | 49 | def load_model(model, pretrained_path, load_to_cpu): 50 | print('Loading pretrained model from {}'.format(pretrained_path)) 51 | if load_to_cpu: 52 | pretrained_dict = torch.load(pretrained_path, map_location=lambda storage, loc: storage) 53 | else: 54 | device = torch.cuda.current_device() 55 | pretrained_dict = torch.load(pretrained_path, map_location=lambda storage, loc: storage.cuda(device)) 56 | if "state_dict" in pretrained_dict.keys(): 57 | pretrained_dict = remove_prefix(pretrained_dict['state_dict'], 'module.') 58 | else: 59 | pretrained_dict = remove_prefix(pretrained_dict, 'module.') 60 | check_keys(model, pretrained_dict) 61 | model.load_state_dict(pretrained_dict, strict=False) 62 | return model 63 | 64 | 65 | if __name__ == '__main__': 66 | torch.set_grad_enabled(False) 67 | cfg = None 68 | if args.network == "mobile0.25": 69 | cfg = cfg_mnet 70 | elif args.network == "resnet50": 71 | cfg = cfg_re50 72 | # net and model 73 | net = RetinaFace(cfg=cfg, phase = 'test') 74 | net = load_model(net, args.trained_model, args.cpu) 75 | net.eval() 76 | print('Finished loading model!') 77 | print(net) 78 | cudnn.benchmark = True 79 | device = torch.device("cpu" if args.cpu else "cuda") 80 | net = net.to(device) 81 | 82 | resize = 1 83 | 84 | # testing begin 85 | for i in range(100): 86 | image_path = "curve/test.jpg" 87 | img_raw = cv2.imread(image_path, cv2.IMREAD_COLOR) 88 | 89 | img = np.float32(img_raw) 90 | 91 | im_height, im_width, _ = img.shape 92 | scale = torch.Tensor([img.shape[1], img.shape[0], img.shape[1], img.shape[0]]) 93 | img -= (104, 117, 123) 94 | img = img.transpose(2, 0, 1) 95 | img = torch.from_numpy(img).unsqueeze(0) 96 | img = img.to(device) 97 | scale = scale.to(device) 98 | 99 | tic = time.time() 100 | loc, conf, landms = net(img) # forward pass 101 | print('net forward time: {:.4f}'.format(time.time() - tic)) 102 | 103 | priorbox = PriorBox(cfg, image_size=(im_height, im_width)) 104 | priors = priorbox.forward() 105 | priors = priors.to(device) 106 | prior_data = priors.data 107 | boxes = decode(loc.data.squeeze(0), prior_data, cfg['variance']) 108 | boxes = boxes * scale / resize 109 | boxes = boxes.cpu().numpy() 110 | scores = conf.squeeze(0).data.cpu().numpy()[:, 1] 111 | landms = decode_landm(landms.data.squeeze(0), prior_data, cfg['variance']) 112 | scale1 = torch.Tensor([img.shape[3], img.shape[2], img.shape[3], img.shape[2], 113 | img.shape[3], img.shape[2], img.shape[3], img.shape[2], 114 | img.shape[3], img.shape[2]]) 115 | scale1 = scale1.to(device) 116 | landms = landms * scale1 / resize 117 | landms = landms.cpu().numpy() 118 | 119 | # ignore low scores 120 | inds = np.where(scores > args.confidence_threshold)[0] 121 | boxes = boxes[inds] 122 | landms = landms[inds] 123 | scores = scores[inds] 124 | 125 | # keep top-K before NMS 126 | order = scores.argsort()[::-1][:args.top_k] 127 | boxes = boxes[order] 128 | landms = landms[order] 129 | scores = scores[order] 130 | 131 | # do NMS 132 | dets = np.hstack((boxes, scores[:, np.newaxis])).astype(np.float32, copy=False) 133 | keep = py_cpu_nms(dets, args.nms_threshold) 134 | # keep = nms(dets, args.nms_threshold,force_cpu=args.cpu) 135 | dets = dets[keep, :] 136 | landms = landms[keep] 137 | 138 | # keep top-K faster NMS 139 | dets = dets[:args.keep_top_k, :] 140 | landms = landms[:args.keep_top_k, 
:] 141 | 142 | dets = np.concatenate((dets, landms), axis=1) 143 | 144 | # show image 145 | if args.save_image: 146 | for b in dets: 147 | if b[4] < args.vis_thres: 148 | continue 149 | text = "{:.4f}".format(b[4]) 150 | b = list(map(int, b)) 151 | cv2.rectangle(img_raw, (b[0], b[1]), (b[2], b[3]), (0, 0, 255), 2) 152 | cx = b[0] 153 | cy = b[1] + 12 154 | cv2.putText(img_raw, text, (cx, cy), 155 | cv2.FONT_HERSHEY_DUPLEX, 0.5, (255, 255, 255)) 156 | 157 | # landms 158 | cv2.circle(img_raw, (b[5], b[6]), 1, (0, 0, 255), 4) 159 | cv2.circle(img_raw, (b[7], b[8]), 1, (0, 255, 255), 4) 160 | cv2.circle(img_raw, (b[9], b[10]), 1, (255, 0, 255), 4) 161 | cv2.circle(img_raw, (b[11], b[12]), 1, (0, 255, 0), 4) 162 | cv2.circle(img_raw, (b[13], b[14]), 1, (255, 0, 0), 4) 163 | # save image 164 | 165 | name = "test.jpg" 166 | cv2.imwrite(name, img_raw) 167 | 168 | -------------------------------------------------------------------------------- /PASSify/face/layers/__init__.py: -------------------------------------------------------------------------------- 1 | from .functions import * 2 | from .modules import * 3 | -------------------------------------------------------------------------------- /PASSify/face/layers/functions/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yukimasano/PASS/e91b4fc9daf219c765ec5816610e76306de0150c/PASSify/face/layers/functions/__init__.py -------------------------------------------------------------------------------- /PASSify/face/layers/functions/prior_box.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from itertools import product as product 3 | import numpy as np 4 | from math import ceil 5 | 6 | 7 | class PriorBox(object): 8 | def __init__(self, cfg, image_size=None, phase='train'): 9 | super(PriorBox, self).__init__() 10 | self.min_sizes = cfg['min_sizes'] 11 | self.steps = cfg['steps'] 12 | self.clip = cfg['clip'] 13 | self.image_size = image_size 14 | self.feature_maps = [[ceil(self.image_size[0]/step), ceil(self.image_size[1]/step)] for step in self.steps] 15 | self.name = "s" 16 | 17 | def forward(self): 18 | anchors = [] 19 | for k, f in enumerate(self.feature_maps): 20 | min_sizes = self.min_sizes[k] 21 | for i, j in product(range(f[0]), range(f[1])): 22 | for min_size in min_sizes: 23 | s_kx = min_size / self.image_size[1] 24 | s_ky = min_size / self.image_size[0] 25 | dense_cx = [x * self.steps[k] / self.image_size[1] for x in [j + 0.5]] 26 | dense_cy = [y * self.steps[k] / self.image_size[0] for y in [i + 0.5]] 27 | for cy, cx in product(dense_cy, dense_cx): 28 | anchors += [cx, cy, s_kx, s_ky] 29 | 30 | # back to torch land 31 | output = torch.Tensor(anchors).view(-1, 4) 32 | if self.clip: 33 | output.clamp_(max=1, min=0) 34 | return output 35 | -------------------------------------------------------------------------------- /PASSify/face/layers/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .multibox_loss import MultiBoxLoss 2 | 3 | __all__ = ['MultiBoxLoss'] 4 | -------------------------------------------------------------------------------- /PASSify/face/layers/modules/multibox_loss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | from utils.box_utils import match, log_sum_exp 6 | from data 
import cfg_mnet 7 | GPU = cfg_mnet['gpu_train'] 8 | 9 | class MultiBoxLoss(nn.Module): 10 | """SSD Weighted Loss Function 11 | Compute Targets: 12 | 1) Produce Confidence Target Indices by matching ground truth boxes 13 | with (default) 'priorboxes' that have jaccard index > threshold parameter 14 | (default threshold: 0.5). 15 | 2) Produce localization target by 'encoding' variance into offsets of ground 16 | truth boxes and their matched 'priorboxes'. 17 | 3) Hard negative mining to filter the excessive number of negative examples 18 | that comes with using a large number of default bounding boxes. 19 | (default negative:positive ratio 3:1) 20 | Objective Loss: 21 | L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N 22 | Where, Lconf is the CrossEntropy Loss and Lloc is the SmoothL1 Loss 23 | weighted by α which is set to 1 by cross val. 24 | Args: 25 | c: class confidences, 26 | l: predicted boxes, 27 | g: ground truth boxes 28 | N: number of matched default boxes 29 | See: https://arxiv.org/pdf/1512.02325.pdf for more details. 30 | """ 31 | 32 | def __init__(self, num_classes, overlap_thresh, prior_for_matching, bkg_label, neg_mining, neg_pos, neg_overlap, encode_target): 33 | super(MultiBoxLoss, self).__init__() 34 | self.num_classes = num_classes 35 | self.threshold = overlap_thresh 36 | self.background_label = bkg_label 37 | self.encode_target = encode_target 38 | self.use_prior_for_matching = prior_for_matching 39 | self.do_neg_mining = neg_mining 40 | self.negpos_ratio = neg_pos 41 | self.neg_overlap = neg_overlap 42 | self.variance = [0.1, 0.2] 43 | 44 | def forward(self, predictions, priors, targets): 45 | """Multibox Loss 46 | Args: 47 | predictions (tuple): A tuple containing loc preds, conf preds, 48 | and prior boxes from SSD net. 49 | conf shape: torch.size(batch_size,num_priors,num_classes) 50 | loc shape: torch.size(batch_size,num_priors,4) 51 | priors shape: torch.size(num_priors,4) 52 | 53 | ground_truth (tensor): Ground truth boxes and labels for a batch, 54 | shape: [batch_size,num_objs,5] (last idx is the label). 
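                Note: in this RetinaFace variant each target row has 15 values
                (4 box coordinates, 10 landmark coordinates, 1 label), as produced
                by wider_face.py; the slicing below relies on that layout.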
55 | """ 56 | 57 | loc_data, conf_data, landm_data = predictions 58 | priors = priors 59 | num = loc_data.size(0) 60 | num_priors = (priors.size(0)) 61 | 62 | # match priors (default boxes) and ground truth boxes 63 | loc_t = torch.Tensor(num, num_priors, 4) 64 | landm_t = torch.Tensor(num, num_priors, 10) 65 | conf_t = torch.LongTensor(num, num_priors) 66 | for idx in range(num): 67 | truths = targets[idx][:, :4].data 68 | labels = targets[idx][:, -1].data 69 | landms = targets[idx][:, 4:14].data 70 | defaults = priors.data 71 | match(self.threshold, truths, defaults, self.variance, labels, landms, loc_t, conf_t, landm_t, idx) 72 | if GPU: 73 | loc_t = loc_t.cuda() 74 | conf_t = conf_t.cuda() 75 | landm_t = landm_t.cuda() 76 | 77 | zeros = torch.tensor(0).cuda() 78 | # landm Loss (Smooth L1) 79 | # Shape: [batch,num_priors,10] 80 | pos1 = conf_t > zeros 81 | num_pos_landm = pos1.long().sum(1, keepdim=True) 82 | N1 = max(num_pos_landm.data.sum().float(), 1) 83 | pos_idx1 = pos1.unsqueeze(pos1.dim()).expand_as(landm_data) 84 | landm_p = landm_data[pos_idx1].view(-1, 10) 85 | landm_t = landm_t[pos_idx1].view(-1, 10) 86 | loss_landm = F.smooth_l1_loss(landm_p, landm_t, reduction='sum') 87 | 88 | 89 | pos = conf_t != zeros 90 | conf_t[pos] = 1 91 | 92 | # Localization Loss (Smooth L1) 93 | # Shape: [batch,num_priors,4] 94 | pos_idx = pos.unsqueeze(pos.dim()).expand_as(loc_data) 95 | loc_p = loc_data[pos_idx].view(-1, 4) 96 | loc_t = loc_t[pos_idx].view(-1, 4) 97 | loss_l = F.smooth_l1_loss(loc_p, loc_t, reduction='sum') 98 | 99 | # Compute max conf across batch for hard negative mining 100 | batch_conf = conf_data.view(-1, self.num_classes) 101 | loss_c = log_sum_exp(batch_conf) - batch_conf.gather(1, conf_t.view(-1, 1)) 102 | 103 | # Hard Negative Mining 104 | loss_c[pos.view(-1, 1)] = 0 # filter out pos boxes for now 105 | loss_c = loss_c.view(num, -1) 106 | _, loss_idx = loss_c.sort(1, descending=True) 107 | _, idx_rank = loss_idx.sort(1) 108 | num_pos = pos.long().sum(1, keepdim=True) 109 | num_neg = torch.clamp(self.negpos_ratio*num_pos, max=pos.size(1)-1) 110 | neg = idx_rank < num_neg.expand_as(idx_rank) 111 | 112 | # Confidence Loss Including Positive and Negative Examples 113 | pos_idx = pos.unsqueeze(2).expand_as(conf_data) 114 | neg_idx = neg.unsqueeze(2).expand_as(conf_data) 115 | conf_p = conf_data[(pos_idx+neg_idx).gt(0)].view(-1,self.num_classes) 116 | targets_weighted = conf_t[(pos+neg).gt(0)] 117 | loss_c = F.cross_entropy(conf_p, targets_weighted, reduction='sum') 118 | 119 | # Sum of losses: L(x,c,l,g) = (Lconf(x, c) + αLloc(x,l,g)) / N 120 | N = max(num_pos.data.sum().float(), 1) 121 | loss_l /= N 122 | loss_c /= N 123 | loss_landm /= N1 124 | 125 | return loss_l, loss_c, loss_landm 126 | -------------------------------------------------------------------------------- /PASSify/face/main_face_detector.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import torch 4 | import numpy as np 5 | from data import cfg_mnet, cfg_re50 6 | from layers.functions.prior_box import PriorBox 7 | # import utils 8 | from utils.nms.py_cpu_nms import py_cpu_nms 9 | import cv2 10 | from models.retinaface import RetinaFace 11 | from utils.box_utils import decode, decode_landm 12 | from utils.timer import Timer 13 | 14 | 15 | parser = argparse.ArgumentParser(description='Retinaface') 16 | parser.add_argument('-m', '--trained_model', default='./weights/mobilenet0.25_Final.pth', 17 | type=str, help='Trained state_dict file path 
to open') 18 | parser.add_argument('--network', default='mobile0.25', help='Backbone network mobile0.25 or resnet50') 19 | parser.add_argument('--origin_size', default=True, type=str, help='Whether use origin image size to evaluate') 20 | # parser.add_argument('--save_folder', default='/scratch/shared/beegfs/yuki/fast/yfcc/retinaface_out/', type=str, help='Dir to save txt results') 21 | parser.add_argument('--save_folder', default='./testout/', type=str, help='Dir to save txt results') 22 | 23 | parser.add_argument('--cpu', action="store_true", default=False, help='Use cpu inference') 24 | parser.add_argument('--dataset_folder', default='/scratch/shared/beegfs/yuki/data/ILSVRC12/val/n01440764/', type=str, help='dataset path') 25 | parser.add_argument('--input_txt', default='', type=str, help='dataset path') 26 | 27 | parser.add_argument('--confidence_threshold', default=0.02, type=float, help='confidence_threshold') 28 | parser.add_argument('--top_k', default=5000, type=int, help='top_k') 29 | parser.add_argument('--nms_threshold', default=0.4, type=float, help='nms_threshold') 30 | parser.add_argument('--keep_top_k', default=750, type=int, help='keep_top_k') 31 | parser.add_argument('-s', '--save_image', action="store_true", default=False, help='show detection results') 32 | parser.add_argument('--vis_thres', default=0.5, type=float, help='visualization_threshold') 33 | args = parser.parse_args() 34 | 35 | 36 | def check_keys(model, pretrained_state_dict): 37 | ckpt_keys = set(pretrained_state_dict.keys()) 38 | model_keys = set(model.state_dict().keys()) 39 | used_pretrained_keys = model_keys & ckpt_keys 40 | unused_pretrained_keys = ckpt_keys - model_keys 41 | missing_keys = model_keys - ckpt_keys 42 | print('Missing keys:{}'.format(len(missing_keys))) 43 | print('Unused checkpoint keys:{}'.format(len(unused_pretrained_keys))) 44 | print('Used keys:{}'.format(len(used_pretrained_keys))) 45 | assert len(used_pretrained_keys) > 0, 'load NONE from pretrained checkpoint' 46 | return True 47 | 48 | 49 | def remove_prefix(state_dict, prefix): 50 | ''' Old style model is stored with all names of parameters sharing common prefix 'module.' 
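    e.g. maps 'module.fc.weight' -> 'fc.weight'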
''' 51 | print('remove prefix \'{}\''.format(prefix)) 52 | f = lambda x: x.split(prefix, 1)[-1] if x.startswith(prefix) else x 53 | return {f(key): value for key, value in state_dict.items()} 54 | 55 | 56 | def load_model(model, pretrained_path, load_to_cpu): 57 | print('Loading pretrained model from {}'.format(pretrained_path)) 58 | if load_to_cpu: 59 | pretrained_dict = torch.load(pretrained_path, map_location=lambda storage, loc: storage) 60 | else: 61 | device = torch.cuda.current_device() 62 | pretrained_dict = torch.load(pretrained_path, map_location=lambda storage, loc: storage.cuda(device)) 63 | if "state_dict" in pretrained_dict.keys(): 64 | pretrained_dict = remove_prefix(pretrained_dict['state_dict'], 'module.') 65 | else: 66 | pretrained_dict = remove_prefix(pretrained_dict, 'module.') 67 | check_keys(model, pretrained_dict) 68 | model.load_state_dict(pretrained_dict, strict=False) 69 | return model 70 | 71 | def touch(fname, times=None): 72 | with open(fname, 'a'): 73 | os.utime(fname, times) 74 | 75 | 76 | 77 | if __name__ == '__main__': 78 | torch.set_grad_enabled(False) 79 | 80 | cfg = None 81 | if args.network == "mobile0.25": 82 | cfg = cfg_mnet 83 | elif args.network == "resnet50": 84 | cfg = cfg_re50 85 | cfg['pretrain'] = False 86 | # net and model 87 | net = RetinaFace(cfg=cfg, phase = 'test') 88 | net = load_model(net, args.trained_model, args.cpu) 89 | net.eval() 90 | print('Finished loading model!') 91 | print(net) 92 | # cudnn.benchmark = True 93 | device = torch.device("cpu" if args.cpu else "cuda") 94 | net = net.to(device) 95 | 96 | # testing dataset 97 | # testset_folder = args.dataset_folder 98 | # test_dataset = os.listdir(testset_folder) 99 | if args.input_txt != '': 100 | f_list = open(args.input_txt, 'r') 101 | test_dataset = f_list.readlines() 102 | test_dataset = [_d.strip() for _d in test_dataset] 103 | print(f'done preparing dataset!, N={len(test_dataset)}', flush=True) 104 | num_images = len(test_dataset) 105 | 106 | _t = {'forward_pass': Timer(), 'misc': Timer()} 107 | 108 | # testing begin 109 | for i, image_path in enumerate(test_dataset): 110 | img_raw = cv2.imread(image_path, cv2.IMREAD_COLOR) 111 | img_name = image_path.split('/')[-1] 112 | subfolder = image_path.split('/')[-2] 113 | save_name = os.path.join(args.save_folder, subfolder, img_name.split('.')[0] + ".txt") 114 | dirname = os.path.dirname(save_name) 115 | if not os.path.isdir(dirname): 116 | os.makedirs(dirname) 117 | if not os.path.exists(save_name): 118 | img = np.float32(img_raw) 119 | 120 | # testing scale 121 | target_size = 1600 122 | max_size = 2150 123 | im_shape = img.shape 124 | im_size_min = np.min(im_shape[0:2]) 125 | im_size_max = np.max(im_shape[0:2]) 126 | resize = float(target_size) / float(im_size_min) 127 | # prevent bigger axis from being more than max_size: 128 | if np.round(resize * im_size_max) > max_size: 129 | resize = float(max_size) / float(im_size_max) 130 | if args.origin_size: 131 | resize = 1 132 | 133 | if resize != 1: 134 | img = cv2.resize(img, None, None, fx=resize, fy=resize, interpolation=cv2.INTER_LINEAR) 135 | im_height, im_width, _ = img.shape 136 | scale = torch.Tensor([img.shape[1], img.shape[0], img.shape[1], img.shape[0]]) 137 | img -= (104, 117, 123) 138 | img = img.transpose(2, 0, 1) 139 | img = torch.from_numpy(img).unsqueeze(0) 140 | img = img.to(device) 141 | scale = scale.to(device) 142 | 143 | _t['forward_pass'].tic() 144 | loc, conf, landms = net(img) # forward pass 145 | _t['forward_pass'].toc() 146 | _t['misc'].tic() 147 | 
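            # Post-processing: build priors for this image size, decode the box and landmark
            # regressions against them, rescale to pixel coordinates of the (possibly resized)
            # input, drop low-confidence detections and run CPU NMS before writing the results.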
priorbox = PriorBox(cfg, image_size=(im_height, im_width)) 148 | priors = priorbox.forward() 149 | priors = priors.to(device) 150 | prior_data = priors.data 151 | boxes = decode(loc.data.squeeze(0), prior_data, cfg['variance']) 152 | boxes = boxes * scale / resize 153 | boxes = boxes.cpu().numpy() 154 | scores = conf.squeeze(0).data.cpu().numpy()[:, 1] 155 | landms = decode_landm(landms.data.squeeze(0), prior_data, cfg['variance']) 156 | scale1 = torch.Tensor([img.shape[3], img.shape[2], img.shape[3], img.shape[2], 157 | img.shape[3], img.shape[2], img.shape[3], img.shape[2], 158 | img.shape[3], img.shape[2]]) 159 | scale1 = scale1.to(device) 160 | landms = landms * scale1 / resize 161 | landms = landms.cpu().numpy() 162 | 163 | # ignore low scores 164 | inds = np.where(scores > args.confidence_threshold)[0] 165 | boxes = boxes[inds] 166 | landms = landms[inds] 167 | scores = scores[inds] 168 | 169 | # keep top-K before NMS 170 | order = scores.argsort()[::-1] 171 | # order = scores.argsort()[::-1][:args.top_k] 172 | boxes = boxes[order] 173 | landms = landms[order] 174 | scores = scores[order] 175 | 176 | # do NMS 177 | dets = np.hstack((boxes, scores[:, np.newaxis])).astype(np.float32, copy=False) 178 | keep = py_cpu_nms(dets, args.nms_threshold) 179 | # keep = nms(dets, args.nms_threshold,force_cpu=args.cpu) 180 | dets = dets[keep, :] 181 | landms = landms[keep] 182 | 183 | # keep top-K faster NMS 184 | # dets = dets[:args.keep_top_k, :] 185 | # landms = landms[:args.keep_top_k, :] 186 | 187 | dets = np.concatenate((dets, landms), axis=1) 188 | _t['misc'].toc() 189 | 190 | # -------------------------------------------------------------------- 191 | name_ifface = os.path.join(args.save_folder, 'face_index',subfolder, img_name) 192 | name_ifnoface = os.path.join(args.save_folder, 'noface_index', subfolder, img_name ) 193 | 194 | 195 | with open(save_name, "w") as fd: 196 | bboxs = dets 197 | file_name = os.path.basename(save_name)[:-4] + "\n" 198 | bboxs_num = sum(bboxs[:, 4] > 0.5) 199 | # fd.write(file_name) 200 | # fd.write(bboxs_num) 201 | for box in bboxs: 202 | x = int(box[0]) 203 | y = int(box[1]) 204 | w = int(box[2]) - int(box[0]) 205 | h = int(box[3]) - int(box[1]) 206 | confidence = str(box[4]) 207 | line = str(x) + "," + str(y) + "," + str(w) + "," + str(h) + "," + confidence + " \n" 208 | fd.write(line) 209 | if bboxs_num > 0 : 210 | if not os.path.isdir(os.path.dirname(name_ifface)): 211 | os.makedirs(os.path.dirname(name_ifface), exist_ok=True) 212 | touch(name_ifface) 213 | else: 214 | if not os.path.isdir(os.path.dirname(name_ifnoface)): 215 | os.makedirs(os.path.dirname(name_ifnoface), exist_ok=True) 216 | touch(name_ifnoface) 217 | if i % 10 == 0: 218 | print(f"im_detect: {i + 1:5}/{num_images} Time: {_t['forward_pass'].average_time+_t['misc'].average_time:.3f}s", 219 | f"== {1./(_t['forward_pass'].average_time +_t['misc'].average_time) :.1f}Hz", flush=True) 220 | -------------------------------------------------------------------------------- /PASSify/face/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yukimasano/PASS/e91b4fc9daf219c765ec5816610e76306de0150c/PASSify/face/models/__init__.py -------------------------------------------------------------------------------- /PASSify/face/models/net.py: -------------------------------------------------------------------------------- 1 | import time 2 | import torch 3 | import torch.nn as nn 4 | import torchvision.models._utils as _utils 5 
| import torchvision.models as models 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | 9 | def conv_bn(inp, oup, stride = 1, leaky = 0): 10 | return nn.Sequential( 11 | nn.Conv2d(inp, oup, 3, stride, 1, bias=False), 12 | nn.BatchNorm2d(oup), 13 | nn.LeakyReLU(negative_slope=leaky, inplace=True) 14 | ) 15 | 16 | def conv_bn_no_relu(inp, oup, stride): 17 | return nn.Sequential( 18 | nn.Conv2d(inp, oup, 3, stride, 1, bias=False), 19 | nn.BatchNorm2d(oup), 20 | ) 21 | 22 | def conv_bn1X1(inp, oup, stride, leaky=0): 23 | return nn.Sequential( 24 | nn.Conv2d(inp, oup, 1, stride, padding=0, bias=False), 25 | nn.BatchNorm2d(oup), 26 | nn.LeakyReLU(negative_slope=leaky, inplace=True) 27 | ) 28 | 29 | def conv_dw(inp, oup, stride, leaky=0.1): 30 | return nn.Sequential( 31 | nn.Conv2d(inp, inp, 3, stride, 1, groups=inp, bias=False), 32 | nn.BatchNorm2d(inp), 33 | nn.LeakyReLU(negative_slope= leaky,inplace=True), 34 | 35 | nn.Conv2d(inp, oup, 1, 1, 0, bias=False), 36 | nn.BatchNorm2d(oup), 37 | nn.LeakyReLU(negative_slope= leaky,inplace=True), 38 | ) 39 | 40 | class SSH(nn.Module): 41 | def __init__(self, in_channel, out_channel): 42 | super(SSH, self).__init__() 43 | assert out_channel % 4 == 0 44 | leaky = 0 45 | if (out_channel <= 64): 46 | leaky = 0.1 47 | self.conv3X3 = conv_bn_no_relu(in_channel, out_channel//2, stride=1) 48 | 49 | self.conv5X5_1 = conv_bn(in_channel, out_channel//4, stride=1, leaky = leaky) 50 | self.conv5X5_2 = conv_bn_no_relu(out_channel//4, out_channel//4, stride=1) 51 | 52 | self.conv7X7_2 = conv_bn(out_channel//4, out_channel//4, stride=1, leaky = leaky) 53 | self.conv7x7_3 = conv_bn_no_relu(out_channel//4, out_channel//4, stride=1) 54 | 55 | def forward(self, input): 56 | conv3X3 = self.conv3X3(input) 57 | 58 | conv5X5_1 = self.conv5X5_1(input) 59 | conv5X5 = self.conv5X5_2(conv5X5_1) 60 | 61 | conv7X7_2 = self.conv7X7_2(conv5X5_1) 62 | conv7X7 = self.conv7x7_3(conv7X7_2) 63 | 64 | out = torch.cat([conv3X3, conv5X5, conv7X7], dim=1) 65 | out = F.relu(out) 66 | return out 67 | 68 | class FPN(nn.Module): 69 | def __init__(self,in_channels_list,out_channels): 70 | super(FPN,self).__init__() 71 | leaky = 0 72 | if (out_channels <= 64): 73 | leaky = 0.1 74 | self.output1 = conv_bn1X1(in_channels_list[0], out_channels, stride = 1, leaky = leaky) 75 | self.output2 = conv_bn1X1(in_channels_list[1], out_channels, stride = 1, leaky = leaky) 76 | self.output3 = conv_bn1X1(in_channels_list[2], out_channels, stride = 1, leaky = leaky) 77 | 78 | self.merge1 = conv_bn(out_channels, out_channels, leaky = leaky) 79 | self.merge2 = conv_bn(out_channels, out_channels, leaky = leaky) 80 | 81 | def forward(self, input): 82 | # names = list(input.keys()) 83 | input = list(input.values()) 84 | 85 | output1 = self.output1(input[0]) 86 | output2 = self.output2(input[1]) 87 | output3 = self.output3(input[2]) 88 | 89 | up3 = F.interpolate(output3, size=[output2.size(2), output2.size(3)], mode="nearest") 90 | output2 = output2 + up3 91 | output2 = self.merge2(output2) 92 | 93 | up2 = F.interpolate(output2, size=[output1.size(2), output1.size(3)], mode="nearest") 94 | output1 = output1 + up2 95 | output1 = self.merge1(output1) 96 | 97 | out = [output1, output2, output3] 98 | return out 99 | 100 | 101 | 102 | class MobileNetV1(nn.Module): 103 | def __init__(self): 104 | super(MobileNetV1, self).__init__() 105 | self.stage1 = nn.Sequential( 106 | conv_bn(3, 8, 2, leaky = 0.1), # 3 107 | conv_dw(8, 16, 1), # 7 108 | conv_dw(16, 32, 2), # 11 109 | conv_dw(32, 32, 1), # 19 
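            # stage1/stage2/stage3 end at strides 8/16/32 with 64/128/256 output channels;
            # these are the three feature maps RetinaFace taps via cfg['return_layers'] and
            # feeds into the FPN (see cfg_mnet in data/config.py).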
110 | conv_dw(32, 64, 2), # 27 111 | conv_dw(64, 64, 1), # 43 112 | ) 113 | self.stage2 = nn.Sequential( 114 | conv_dw(64, 128, 2), # 43 + 16 = 59 115 | conv_dw(128, 128, 1), # 59 + 32 = 91 116 | conv_dw(128, 128, 1), # 91 + 32 = 123 117 | conv_dw(128, 128, 1), # 123 + 32 = 155 118 | conv_dw(128, 128, 1), # 155 + 32 = 187 119 | conv_dw(128, 128, 1), # 187 + 32 = 219 120 | ) 121 | self.stage3 = nn.Sequential( 122 | conv_dw(128, 256, 2), # 219 +3 2 = 241 123 | conv_dw(256, 256, 1), # 241 + 64 = 301 124 | ) 125 | self.avg = nn.AdaptiveAvgPool2d((1,1)) 126 | self.fc = nn.Linear(256, 1000) 127 | 128 | def forward(self, x): 129 | x = self.stage1(x) 130 | x = self.stage2(x) 131 | x = self.stage3(x) 132 | x = self.avg(x) 133 | # x = self.model(x) 134 | x = x.view(-1, 256) 135 | x = self.fc(x) 136 | return x 137 | 138 | -------------------------------------------------------------------------------- /PASSify/face/models/retinaface.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torchvision.models.detection.backbone_utils as backbone_utils 4 | import torchvision.models._utils as _utils 5 | import torch.nn.functional as F 6 | from collections import OrderedDict 7 | 8 | from models.net import MobileNetV1 as MobileNetV1 9 | from models.net import FPN as FPN 10 | from models.net import SSH as SSH 11 | 12 | 13 | 14 | class ClassHead(nn.Module): 15 | def __init__(self,inchannels=512,num_anchors=3): 16 | super(ClassHead,self).__init__() 17 | self.num_anchors = num_anchors 18 | self.conv1x1 = nn.Conv2d(inchannels,self.num_anchors*2,kernel_size=(1,1),stride=1,padding=0) 19 | 20 | def forward(self,x): 21 | out = self.conv1x1(x) 22 | out = out.permute(0,2,3,1).contiguous() 23 | 24 | return out.view(out.shape[0], -1, 2) 25 | 26 | class BboxHead(nn.Module): 27 | def __init__(self,inchannels=512,num_anchors=3): 28 | super(BboxHead,self).__init__() 29 | self.conv1x1 = nn.Conv2d(inchannels,num_anchors*4,kernel_size=(1,1),stride=1,padding=0) 30 | 31 | def forward(self,x): 32 | out = self.conv1x1(x) 33 | out = out.permute(0,2,3,1).contiguous() 34 | 35 | return out.view(out.shape[0], -1, 4) 36 | 37 | class LandmarkHead(nn.Module): 38 | def __init__(self,inchannels=512,num_anchors=3): 39 | super(LandmarkHead,self).__init__() 40 | self.conv1x1 = nn.Conv2d(inchannels,num_anchors*10,kernel_size=(1,1),stride=1,padding=0) 41 | 42 | def forward(self,x): 43 | out = self.conv1x1(x) 44 | out = out.permute(0,2,3,1).contiguous() 45 | 46 | return out.view(out.shape[0], -1, 10) 47 | 48 | class RetinaFace(nn.Module): 49 | def __init__(self, cfg = None, phase = 'train'): 50 | """ 51 | :param cfg: Network related settings. 52 | :param phase: train or test. 53 | """ 54 | super(RetinaFace,self).__init__() 55 | self.phase = phase 56 | backbone = None 57 | if cfg['name'] == 'mobilenet0.25': 58 | backbone = MobileNetV1() 59 | if cfg['pretrain']: 60 | checkpoint = torch.load("./weights/mobilenet0.25_Final.pth", map_location=torch.device('cpu')) 61 | from collections import OrderedDict 62 | new_state_dict = OrderedDict() 63 | for k, v in checkpoint['state_dict'].items(): 64 | name = k[7:] # remove module. 
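                    # note: k[7:] assumes every key carries the DataParallel 'module.' prefix;
                    # the remove_prefix() helper in the detector scripts does the same but
                    # checks with startswith() first.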
65 | new_state_dict[name] = v 66 | # load params 67 | backbone.load_state_dict(new_state_dict) 68 | elif cfg['name'] == 'Resnet50': 69 | import torchvision.models as models 70 | backbone = models.resnet50(pretrained=cfg['pretrain']) 71 | 72 | self.body = _utils.IntermediateLayerGetter(backbone, cfg['return_layers']) 73 | in_channels_stage2 = cfg['in_channel'] 74 | in_channels_list = [ 75 | in_channels_stage2 * 2, 76 | in_channels_stage2 * 4, 77 | in_channels_stage2 * 8, 78 | ] 79 | out_channels = cfg['out_channel'] 80 | self.fpn = FPN(in_channels_list,out_channels) 81 | self.ssh1 = SSH(out_channels, out_channels) 82 | self.ssh2 = SSH(out_channels, out_channels) 83 | self.ssh3 = SSH(out_channels, out_channels) 84 | 85 | self.ClassHead = self._make_class_head(fpn_num=3, inchannels=cfg['out_channel']) 86 | self.BboxHead = self._make_bbox_head(fpn_num=3, inchannels=cfg['out_channel']) 87 | self.LandmarkHead = self._make_landmark_head(fpn_num=3, inchannels=cfg['out_channel']) 88 | 89 | def _make_class_head(self,fpn_num=3,inchannels=64,anchor_num=2): 90 | classhead = nn.ModuleList() 91 | for i in range(fpn_num): 92 | classhead.append(ClassHead(inchannels,anchor_num)) 93 | return classhead 94 | 95 | def _make_bbox_head(self,fpn_num=3,inchannels=64,anchor_num=2): 96 | bboxhead = nn.ModuleList() 97 | for i in range(fpn_num): 98 | bboxhead.append(BboxHead(inchannels,anchor_num)) 99 | return bboxhead 100 | 101 | def _make_landmark_head(self,fpn_num=3,inchannels=64,anchor_num=2): 102 | landmarkhead = nn.ModuleList() 103 | for i in range(fpn_num): 104 | landmarkhead.append(LandmarkHead(inchannels,anchor_num)) 105 | return landmarkhead 106 | 107 | def forward(self,inputs): 108 | out = self.body(inputs) 109 | 110 | # FPN 111 | fpn = self.fpn(out) 112 | 113 | # SSH 114 | feature1 = self.ssh1(fpn[0]) 115 | feature2 = self.ssh2(fpn[1]) 116 | feature3 = self.ssh3(fpn[2]) 117 | features = [feature1, feature2, feature3] 118 | 119 | bbox_regressions = torch.cat([self.BboxHead[i](feature) for i, feature in enumerate(features)], dim=1) 120 | classifications = torch.cat([self.ClassHead[i](feature) for i, feature in enumerate(features)],dim=1) 121 | ldm_regressions = torch.cat([self.LandmarkHead[i](feature) for i, feature in enumerate(features)], dim=1) 122 | 123 | if self.phase == 'train': 124 | output = (bbox_regressions, classifications, ldm_regressions) 125 | else: 126 | output = (bbox_regressions, F.softmax(classifications, dim=-1), ldm_regressions) 127 | return output -------------------------------------------------------------------------------- /PASSify/face/sbatch_face_example.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --array=0-X%50 # being nice and not running more than 50 jobs in parallel 3 | #SBATCH --mem=5G 4 | #SBATCH --cpus-per-task=2 5 | #SBATCH --time=8:00:00 6 | #SBATCH --partition=compute # we only need CPUs 7 | #SBACTH --open-mode=append 8 | #SBATCH --job-name=PASSify-face 9 | #SBATCH --constraint=10GbE 10 | 11 | 12 | echo $SLURM_ARRAY_TASK_ID 13 | X=$((${SLURM_ARRAY_TASK_ID}*80000)) 14 | Y=$(((${SLURM_ARRAY_TASK_ID} + 1)*80000)) 15 | 16 | in_file=_tmp_all_files_${SLURM_ARRAY_TASK_ID}.txt 17 | rm ${in_file} 18 | results_dir='/facedetector_results/' 19 | < all_files.txt tail -n +"$X" | head -n "$((Y - X))" >> ${in_file} 20 | 21 | echo "from " ${X} 22 | echo "to " ${Y} 23 | 24 | # ETA 4-6Hz 25 | /scratch/shared/beegfs/yuki/envs/py37/bin/python3 -W ignore main_face_detector.py \ 26 | --input_txt=${in_file} \ 27 | 
--save_folder=${results_dir} \ 28 | --cpu -------------------------------------------------------------------------------- /PASSify/face/train.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | import torch 4 | import torch.optim as optim 5 | import torch.backends.cudnn as cudnn 6 | import argparse 7 | import torch.utils.data as data 8 | from data import WiderFaceDetection, detection_collate, preproc, cfg_mnet, cfg_re50 9 | from layers.modules import MultiBoxLoss 10 | from layers.functions.prior_box import PriorBox 11 | import time 12 | import datetime 13 | import math 14 | from models.retinaface import RetinaFace 15 | 16 | parser = argparse.ArgumentParser(description='Retinaface Training') 17 | parser.add_argument('--training_dataset', default='./data/widerface/train/label.txt', help='Training dataset directory') 18 | parser.add_argument('--network', default='mobile0.25', help='Backbone network mobile0.25 or resnet50') 19 | parser.add_argument('--num_workers', default=4, type=int, help='Number of workers used in dataloading') 20 | parser.add_argument('--lr', '--learning-rate', default=1e-3, type=float, help='initial learning rate') 21 | parser.add_argument('--momentum', default=0.9, type=float, help='momentum') 22 | parser.add_argument('--resume_net', default=None, help='resume net for retraining') 23 | parser.add_argument('--resume_epoch', default=0, type=int, help='resume iter for retraining') 24 | parser.add_argument('--weight_decay', default=5e-4, type=float, help='Weight decay for SGD') 25 | parser.add_argument('--gamma', default=0.1, type=float, help='Gamma update for SGD') 26 | parser.add_argument('--save_folder', default='./weights/', help='Location to save checkpoint models') 27 | 28 | args = parser.parse_args() 29 | 30 | if not os.path.exists(args.save_folder): 31 | os.mkdir(args.save_folder) 32 | cfg = None 33 | if args.network == "mobile0.25": 34 | cfg = cfg_mnet 35 | elif args.network == "resnet50": 36 | cfg = cfg_re50 37 | 38 | rgb_mean = (104, 117, 123) # bgr order 39 | num_classes = 2 40 | img_dim = cfg['image_size'] 41 | num_gpu = cfg['ngpu'] 42 | batch_size = cfg['batch_size'] 43 | max_epoch = cfg['epoch'] 44 | gpu_train = cfg['gpu_train'] 45 | 46 | num_workers = args.num_workers 47 | momentum = args.momentum 48 | weight_decay = args.weight_decay 49 | initial_lr = args.lr 50 | gamma = args.gamma 51 | training_dataset = args.training_dataset 52 | save_folder = args.save_folder 53 | 54 | net = RetinaFace(cfg=cfg) 55 | print("Printing net...") 56 | print(net) 57 | 58 | if args.resume_net is not None: 59 | print('Loading resume network...') 60 | state_dict = torch.load(args.resume_net) 61 | # create new OrderedDict that does not contain `module.` 62 | from collections import OrderedDict 63 | new_state_dict = OrderedDict() 64 | for k, v in state_dict.items(): 65 | head = k[:7] 66 | if head == 'module.': 67 | name = k[7:] # remove `module.` 68 | else: 69 | name = k 70 | new_state_dict[name] = v 71 | net.load_state_dict(new_state_dict) 72 | 73 | if num_gpu > 1 and gpu_train: 74 | net = torch.nn.DataParallel(net).cuda() 75 | else: 76 | net = net.cuda() 77 | 78 | cudnn.benchmark = True 79 | 80 | 81 | optimizer = optim.SGD(net.parameters(), lr=initial_lr, momentum=momentum, weight_decay=weight_decay) 82 | criterion = MultiBoxLoss(num_classes, 0.35, True, 0, True, 7, 0.35, False) 83 | 84 | priorbox = PriorBox(cfg, image_size=(img_dim, img_dim)) 85 | with torch.no_grad(): 86 | priors = 
priorbox.forward() 87 | priors = priors.cuda() 88 | 89 | def train(): 90 | net.train() 91 | epoch = 0 + args.resume_epoch 92 | print('Loading Dataset...') 93 | 94 | dataset = WiderFaceDetection( training_dataset,preproc(img_dim, rgb_mean)) 95 | 96 | epoch_size = math.ceil(len(dataset) / batch_size) 97 | max_iter = max_epoch * epoch_size 98 | 99 | stepvalues = (cfg['decay1'] * epoch_size, cfg['decay2'] * epoch_size) 100 | step_index = 0 101 | 102 | if args.resume_epoch > 0: 103 | start_iter = args.resume_epoch * epoch_size 104 | else: 105 | start_iter = 0 106 | 107 | for iteration in range(start_iter, max_iter): 108 | if iteration % epoch_size == 0: 109 | # create batch iterator 110 | batch_iterator = iter(data.DataLoader(dataset, batch_size, shuffle=True, num_workers=num_workers, collate_fn=detection_collate)) 111 | if (epoch % 10 == 0 and epoch > 0) or (epoch % 5 == 0 and epoch > cfg['decay1']): 112 | torch.save(net.state_dict(), save_folder + cfg['name']+ '_epoch_' + str(epoch) + '.pth') 113 | epoch += 1 114 | 115 | load_t0 = time.time() 116 | if iteration in stepvalues: 117 | step_index += 1 118 | lr = adjust_learning_rate(optimizer, gamma, epoch, step_index, iteration, epoch_size) 119 | 120 | # load train data 121 | images, targets = next(batch_iterator) 122 | images = images.cuda() 123 | targets = [anno.cuda() for anno in targets] 124 | 125 | # forward 126 | out = net(images) 127 | 128 | # backprop 129 | optimizer.zero_grad() 130 | loss_l, loss_c, loss_landm = criterion(out, priors, targets) 131 | loss = cfg['loc_weight'] * loss_l + loss_c + loss_landm 132 | loss.backward() 133 | optimizer.step() 134 | load_t1 = time.time() 135 | batch_time = load_t1 - load_t0 136 | eta = int(batch_time * (max_iter - iteration)) 137 | print('Epoch:{}/{} || Epochiter: {}/{} || Iter: {}/{} || Loc: {:.4f} Cla: {:.4f} Landm: {:.4f} || LR: {:.8f} || Batchtime: {:.4f} s || ETA: {}' 138 | .format(epoch, max_epoch, (iteration % epoch_size) + 1, 139 | epoch_size, iteration + 1, max_iter, loss_l.item(), loss_c.item(), loss_landm.item(), lr, batch_time, str(datetime.timedelta(seconds=eta)))) 140 | 141 | torch.save(net.state_dict(), save_folder + cfg['name'] + '_Final.pth') 142 | # torch.save(net.state_dict(), save_folder + 'Final_Retinaface.pth') 143 | 144 | 145 | def adjust_learning_rate(optimizer, gamma, epoch, step_index, iteration, epoch_size): 146 | """Sets the learning rate 147 | # Adapted from PyTorch Imagenet example: 148 | # https://github.com/pytorch/examples/blob/master/imagenet/main.py 149 | """ 150 | warmup_epoch = -1 151 | if epoch <= warmup_epoch: 152 | lr = 1e-6 + (initial_lr-1e-6) * iteration / (epoch_size * warmup_epoch) 153 | else: 154 | lr = initial_lr * (gamma ** (step_index)) 155 | for param_group in optimizer.param_groups: 156 | param_group['lr'] = lr 157 | return lr 158 | 159 | if __name__ == '__main__': 160 | train() 161 | -------------------------------------------------------------------------------- /PASSify/face/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yukimasano/PASS/e91b4fc9daf219c765ec5816610e76306de0150c/PASSify/face/utils/__init__.py -------------------------------------------------------------------------------- /PASSify/face/utils/box_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | 4 | 5 | def point_form(boxes): 6 | """ Convert prior_boxes to (xmin, ymin, xmax, ymax) 7 | representation for comparison 
to point form ground truth data. 8 | Args: 9 | boxes: (tensor) center-size default boxes from priorbox layers. 10 | Return: 11 | boxes: (tensor) Converted xmin, ymin, xmax, ymax form of boxes. 12 | """ 13 | return torch.cat((boxes[:, :2] - boxes[:, 2:]/2, # xmin, ymin 14 | boxes[:, :2] + boxes[:, 2:]/2), 1) # xmax, ymax 15 | 16 | 17 | def center_size(boxes): 18 | """ Convert prior_boxes to (cx, cy, w, h) 19 | representation for comparison to center-size form ground truth data. 20 | Args: 21 | boxes: (tensor) point_form boxes 22 | Return: 23 | boxes: (tensor) Converted xmin, ymin, xmax, ymax form of boxes. 24 | """ 25 | return torch.cat((boxes[:, 2:] + boxes[:, :2])/2, # cx, cy 26 | boxes[:, 2:] - boxes[:, :2], 1) # w, h 27 | 28 | 29 | def intersect(box_a, box_b): 30 | """ We resize both tensors to [A,B,2] without new malloc: 31 | [A,2] -> [A,1,2] -> [A,B,2] 32 | [B,2] -> [1,B,2] -> [A,B,2] 33 | Then we compute the area of intersect between box_a and box_b. 34 | Args: 35 | box_a: (tensor) bounding boxes, Shape: [A,4]. 36 | box_b: (tensor) bounding boxes, Shape: [B,4]. 37 | Return: 38 | (tensor) intersection area, Shape: [A,B]. 39 | """ 40 | A = box_a.size(0) 41 | B = box_b.size(0) 42 | max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2), 43 | box_b[:, 2:].unsqueeze(0).expand(A, B, 2)) 44 | min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2), 45 | box_b[:, :2].unsqueeze(0).expand(A, B, 2)) 46 | inter = torch.clamp((max_xy - min_xy), min=0) 47 | return inter[:, :, 0] * inter[:, :, 1] 48 | 49 | 50 | def jaccard(box_a, box_b): 51 | """Compute the jaccard overlap of two sets of boxes. The jaccard overlap 52 | is simply the intersection over union of two boxes. Here we operate on 53 | ground truth boxes and default boxes. 54 | E.g.: 55 | A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B) 56 | Args: 57 | box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4] 58 | box_b: (tensor) Prior boxes from priorbox layers, Shape: [num_priors,4] 59 | Return: 60 | jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)] 61 | """ 62 | inter = intersect(box_a, box_b) 63 | area_a = ((box_a[:, 2]-box_a[:, 0]) * 64 | (box_a[:, 3]-box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B] 65 | area_b = ((box_b[:, 2]-box_b[:, 0]) * 66 | (box_b[:, 3]-box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B] 67 | union = area_a + area_b - inter 68 | return inter / union # [A,B] 69 | 70 | 71 | def matrix_iou(a, b): 72 | """ 73 | return iou of a and b, numpy version for data augenmentation 74 | """ 75 | lt = np.maximum(a[:, np.newaxis, :2], b[:, :2]) 76 | rb = np.minimum(a[:, np.newaxis, 2:], b[:, 2:]) 77 | 78 | area_i = np.prod(rb - lt, axis=2) * (lt < rb).all(axis=2) 79 | area_a = np.prod(a[:, 2:] - a[:, :2], axis=1) 80 | area_b = np.prod(b[:, 2:] - b[:, :2], axis=1) 81 | return area_i / (area_a[:, np.newaxis] + area_b - area_i) 82 | 83 | 84 | def matrix_iof(a, b): 85 | """ 86 | return iof of a and b, numpy version for data augenmentation 87 | """ 88 | lt = np.maximum(a[:, np.newaxis, :2], b[:, :2]) 89 | rb = np.minimum(a[:, np.newaxis, 2:], b[:, 2:]) 90 | 91 | area_i = np.prod(rb - lt, axis=2) * (lt < rb).all(axis=2) 92 | area_a = np.prod(a[:, 2:] - a[:, :2], axis=1) 93 | return area_i / np.maximum(area_a[:, np.newaxis], 1) 94 | 95 | 96 | def match(threshold, truths, priors, variances, labels, landms, loc_t, conf_t, landm_t, idx): 97 | """Match each prior box with the ground truth box of the highest jaccard 98 | overlap, encode the bounding boxes, then return the matched indices 99 | 
corresponding to both confidence and location preds. 100 | Args: 101 | threshold: (float) The overlap threshold used when mathing boxes. 102 | truths: (tensor) Ground truth boxes, Shape: [num_obj, 4]. 103 | priors: (tensor) Prior boxes from priorbox layers, Shape: [n_priors,4]. 104 | variances: (tensor) Variances corresponding to each prior coord, 105 | Shape: [num_priors, 4]. 106 | labels: (tensor) All the class labels for the image, Shape: [num_obj]. 107 | landms: (tensor) Ground truth landms, Shape [num_obj, 10]. 108 | loc_t: (tensor) Tensor to be filled w/ endcoded location targets. 109 | conf_t: (tensor) Tensor to be filled w/ matched indices for conf preds. 110 | landm_t: (tensor) Tensor to be filled w/ endcoded landm targets. 111 | idx: (int) current batch index 112 | Return: 113 | The matched indices corresponding to 1)location 2)confidence 3)landm preds. 114 | """ 115 | # jaccard index 116 | overlaps = jaccard( 117 | truths, 118 | point_form(priors) 119 | ) 120 | # (Bipartite Matching) 121 | # [1,num_objects] best prior for each ground truth 122 | best_prior_overlap, best_prior_idx = overlaps.max(1, keepdim=True) 123 | 124 | # ignore hard gt 125 | valid_gt_idx = best_prior_overlap[:, 0] >= 0.2 126 | best_prior_idx_filter = best_prior_idx[valid_gt_idx, :] 127 | if best_prior_idx_filter.shape[0] <= 0: 128 | loc_t[idx] = 0 129 | conf_t[idx] = 0 130 | return 131 | 132 | # [1,num_priors] best ground truth for each prior 133 | best_truth_overlap, best_truth_idx = overlaps.max(0, keepdim=True) 134 | best_truth_idx.squeeze_(0) 135 | best_truth_overlap.squeeze_(0) 136 | best_prior_idx.squeeze_(1) 137 | best_prior_idx_filter.squeeze_(1) 138 | best_prior_overlap.squeeze_(1) 139 | best_truth_overlap.index_fill_(0, best_prior_idx_filter, 2) # ensure best prior 140 | # TODO refactor: index best_prior_idx with long tensor 141 | # ensure every gt matches with its prior of max overlap 142 | for j in range(best_prior_idx.size(0)): # 判别此anchor是预测哪一个boxes 143 | best_truth_idx[best_prior_idx[j]] = j 144 | matches = truths[best_truth_idx] # Shape: [num_priors,4] 此处为每一个anchor对应的bbox取出来 145 | conf = labels[best_truth_idx] # Shape: [num_priors] 此处为每一个anchor对应的label取出来 146 | conf[best_truth_overlap < threshold] = 0 # label as background overlap<0.35的全部作为负样本 147 | loc = encode(matches, priors, variances) 148 | 149 | matches_landm = landms[best_truth_idx] 150 | landm = encode_landm(matches_landm, priors, variances) 151 | loc_t[idx] = loc # [num_priors,4] encoded offsets to learn 152 | conf_t[idx] = conf # [num_priors] top class label for each prior 153 | landm_t[idx] = landm 154 | 155 | 156 | def encode(matched, priors, variances): 157 | """Encode the variances from the priorbox layers into the ground truth boxes 158 | we have matched (based on jaccard overlap) with the prior boxes. 159 | Args: 160 | matched: (tensor) Coords of ground truth for each prior in point-form 161 | Shape: [num_priors, 4]. 162 | priors: (tensor) Prior boxes in center-offset form 163 | Shape: [num_priors,4]. 
164 | variances: (list[float]) Variances of priorboxes 165 | Return: 166 | encoded boxes (tensor), Shape: [num_priors, 4] 167 | """ 168 | 169 | # dist b/t match center and prior's center 170 | g_cxcy = (matched[:, :2] + matched[:, 2:])/2 - priors[:, :2] 171 | # encode variance 172 | g_cxcy /= (variances[0] * priors[:, 2:]) 173 | # match wh / prior wh 174 | g_wh = (matched[:, 2:] - matched[:, :2]) / priors[:, 2:] 175 | g_wh = torch.log(g_wh) / variances[1] 176 | # return target for smooth_l1_loss 177 | return torch.cat([g_cxcy, g_wh], 1) # [num_priors,4] 178 | 179 | def encode_landm(matched, priors, variances): 180 | """Encode the variances from the priorbox layers into the ground truth boxes 181 | we have matched (based on jaccard overlap) with the prior boxes. 182 | Args: 183 | matched: (tensor) Coords of ground truth for each prior in point-form 184 | Shape: [num_priors, 10]. 185 | priors: (tensor) Prior boxes in center-offset form 186 | Shape: [num_priors,4]. 187 | variances: (list[float]) Variances of priorboxes 188 | Return: 189 | encoded landm (tensor), Shape: [num_priors, 10] 190 | """ 191 | 192 | # dist b/t match center and prior's center 193 | matched = torch.reshape(matched, (matched.size(0), 5, 2)) 194 | priors_cx = priors[:, 0].unsqueeze(1).expand(matched.size(0), 5).unsqueeze(2) 195 | priors_cy = priors[:, 1].unsqueeze(1).expand(matched.size(0), 5).unsqueeze(2) 196 | priors_w = priors[:, 2].unsqueeze(1).expand(matched.size(0), 5).unsqueeze(2) 197 | priors_h = priors[:, 3].unsqueeze(1).expand(matched.size(0), 5).unsqueeze(2) 198 | priors = torch.cat([priors_cx, priors_cy, priors_w, priors_h], dim=2) 199 | g_cxcy = matched[:, :, :2] - priors[:, :, :2] 200 | # encode variance 201 | g_cxcy /= (variances[0] * priors[:, :, 2:]) 202 | # g_cxcy /= priors[:, :, 2:] 203 | g_cxcy = g_cxcy.reshape(g_cxcy.size(0), -1) 204 | # return target for smooth_l1_loss 205 | return g_cxcy 206 | 207 | 208 | # Adapted from https://github.com/Hakuyume/chainer-ssd 209 | def decode(loc, priors, variances): 210 | """Decode locations from predictions using priors to undo 211 | the encoding we did for offset regression at train time. 212 | Args: 213 | loc (tensor): location predictions for loc layers, 214 | Shape: [num_priors,4] 215 | priors (tensor): Prior boxes in center-offset form. 216 | Shape: [num_priors,4]. 217 | variances: (list[float]) Variances of priorboxes 218 | Return: 219 | decoded bounding box predictions 220 | """ 221 | 222 | boxes = torch.cat(( 223 | priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:], 224 | priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1) 225 | boxes[:, :2] -= boxes[:, 2:] / 2 226 | boxes[:, 2:] += boxes[:, :2] 227 | return boxes 228 | 229 | def decode_landm(pre, priors, variances): 230 | """Decode landm from predictions using priors to undo 231 | the encoding we did for offset regression at train time. 232 | Args: 233 | pre (tensor): landm predictions for loc layers, 234 | Shape: [num_priors,10] 235 | priors (tensor): Prior boxes in center-offset form. 236 | Shape: [num_priors,4]. 
237 | variances: (list[float]) Variances of priorboxes 238 | Return: 239 | decoded landm predictions 240 | """ 241 | landms = torch.cat((priors[:, :2] + pre[:, :2] * variances[0] * priors[:, 2:], 242 | priors[:, :2] + pre[:, 2:4] * variances[0] * priors[:, 2:], 243 | priors[:, :2] + pre[:, 4:6] * variances[0] * priors[:, 2:], 244 | priors[:, :2] + pre[:, 6:8] * variances[0] * priors[:, 2:], 245 | priors[:, :2] + pre[:, 8:10] * variances[0] * priors[:, 2:], 246 | ), dim=1) 247 | return landms 248 | 249 | 250 | def log_sum_exp(x): 251 | """Utility function for computing log_sum_exp while determining 252 | This will be used to determine unaveraged confidence loss across 253 | all examples in a batch. 254 | Args: 255 | x (Variable(tensor)): conf_preds from conf layers 256 | """ 257 | x_max = x.data.max() 258 | return torch.log(torch.sum(torch.exp(x-x_max), 1, keepdim=True)) + x_max 259 | 260 | 261 | # Original author: Francisco Massa: 262 | # https://github.com/fmassa/object-detection.torch 263 | # Ported to PyTorch by Max deGroot (02/01/2017) 264 | def nms(boxes, scores, overlap=0.5, top_k=200): 265 | """Apply non-maximum suppression at test time to avoid detecting too many 266 | overlapping bounding boxes for a given object. 267 | Args: 268 | boxes: (tensor) The location preds for the img, Shape: [num_priors,4]. 269 | scores: (tensor) The class predscores for the img, Shape:[num_priors]. 270 | overlap: (float) The overlap thresh for suppressing unnecessary boxes. 271 | top_k: (int) The Maximum number of box preds to consider. 272 | Return: 273 | The indices of the kept boxes with respect to num_priors. 274 | """ 275 | 276 | keep = torch.Tensor(scores.size(0)).fill_(0).long() 277 | if boxes.numel() == 0: 278 | return keep 279 | x1 = boxes[:, 0] 280 | y1 = boxes[:, 1] 281 | x2 = boxes[:, 2] 282 | y2 = boxes[:, 3] 283 | area = torch.mul(x2 - x1, y2 - y1) 284 | v, idx = scores.sort(0) # sort in ascending order 285 | # I = I[v >= 0.01] 286 | idx = idx[-top_k:] # indices of the top-k largest vals 287 | xx1 = boxes.new() 288 | yy1 = boxes.new() 289 | xx2 = boxes.new() 290 | yy2 = boxes.new() 291 | w = boxes.new() 292 | h = boxes.new() 293 | 294 | # keep = torch.Tensor() 295 | count = 0 296 | while idx.numel() > 0: 297 | i = idx[-1] # index of current largest val 298 | # keep.append(i) 299 | keep[count] = i 300 | count += 1 301 | if idx.size(0) == 1: 302 | break 303 | idx = idx[:-1] # remove kept element from view 304 | # load bboxes of next highest vals 305 | torch.index_select(x1, 0, idx, out=xx1) 306 | torch.index_select(y1, 0, idx, out=yy1) 307 | torch.index_select(x2, 0, idx, out=xx2) 308 | torch.index_select(y2, 0, idx, out=yy2) 309 | # store element-wise max with next highest score 310 | xx1 = torch.clamp(xx1, min=x1[i]) 311 | yy1 = torch.clamp(yy1, min=y1[i]) 312 | xx2 = torch.clamp(xx2, max=x2[i]) 313 | yy2 = torch.clamp(yy2, max=y2[i]) 314 | w.resize_as_(xx2) 315 | h.resize_as_(yy2) 316 | w = xx2 - xx1 317 | h = yy2 - yy1 318 | # check sizes of xx1 and xx2.. 
after each iteration 319 | w = torch.clamp(w, min=0.0) 320 | h = torch.clamp(h, min=0.0) 321 | inter = w*h 322 | # IoU = i / (area(a) + area(b) - i) 323 | rem_areas = torch.index_select(area, 0, idx) # load remaining areas) 324 | union = (rem_areas - inter) + area[i] 325 | IoU = inter/union # store result in iou 326 | # keep only elements with an IoU <= overlap 327 | idx = idx[IoU.le(overlap)] 328 | return keep, count 329 | 330 | 331 | -------------------------------------------------------------------------------- /PASSify/face/utils/nms/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yukimasano/PASS/e91b4fc9daf219c765ec5816610e76306de0150c/PASSify/face/utils/nms/__init__.py -------------------------------------------------------------------------------- /PASSify/face/utils/nms/py_cpu_nms.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import numpy as np 9 | 10 | def py_cpu_nms(dets, thresh): 11 | """Pure Python NMS baseline.""" 12 | x1 = dets[:, 0] 13 | y1 = dets[:, 1] 14 | x2 = dets[:, 2] 15 | y2 = dets[:, 3] 16 | scores = dets[:, 4] 17 | 18 | areas = (x2 - x1 + 1) * (y2 - y1 + 1) 19 | order = scores.argsort()[::-1] 20 | 21 | keep = [] 22 | while order.size > 0: 23 | i = order[0] 24 | keep.append(i) 25 | xx1 = np.maximum(x1[i], x1[order[1:]]) 26 | yy1 = np.maximum(y1[i], y1[order[1:]]) 27 | xx2 = np.minimum(x2[i], x2[order[1:]]) 28 | yy2 = np.minimum(y2[i], y2[order[1:]]) 29 | 30 | w = np.maximum(0.0, xx2 - xx1 + 1) 31 | h = np.maximum(0.0, yy2 - yy1 + 1) 32 | inter = w * h 33 | ovr = inter / (areas[i] + areas[order[1:]] - inter) 34 | 35 | inds = np.where(ovr <= thresh)[0] 36 | order = order[inds + 1] 37 | 38 | return keep 39 | -------------------------------------------------------------------------------- /PASSify/face/utils/timer.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Fast R-CNN 3 | # Copyright (c) 2015 Microsoft 4 | # Licensed under The MIT License [see LICENSE for details] 5 | # Written by Ross Girshick 6 | # -------------------------------------------------------- 7 | 8 | import time 9 | 10 | 11 | class Timer(object): 12 | """A simple timer.""" 13 | def __init__(self): 14 | self.total_time = 0. 15 | self.calls = 0 16 | self.start_time = 0. 17 | self.diff = 0. 18 | self.average_time = 0. 19 | 20 | def tic(self): 21 | # using time.time instead of time.clock because time time.clock 22 | # does not normalize for multithreading 23 | self.start_time = time.time() 24 | 25 | def toc(self, average=True): 26 | self.diff = time.time() - self.start_time 27 | self.total_time += self.diff 28 | self.calls += 1 29 | self.average_time = self.total_time / self.calls 30 | if average: 31 | return self.average_time 32 | else: 33 | return self.diff 34 | 35 | def clear(self): 36 | self.total_time = 0. 37 | self.calls = 0 38 | self.start_time = 0. 39 | self.diff = 0. 40 | self.average_time = 0. 
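A minimal, self-contained sketch (not part of the repository) of how the pure-Python NMS in `py_cpu_nms.py` above filters overlapping detections; the box coordinates and scores below are made up, and the import path assumes the `face/` directory is the working directory:

```python
# Hypothetical usage of py_cpu_nms: rows of `dets` are [x1, y1, x2, y2, score].
import numpy as np
from utils.nms.py_cpu_nms import py_cpu_nms  # package path inside PASSify/face/

dets = np.array([
    [10.0, 10.0, 50.0, 50.0, 0.95],      # highest score, always kept
    [12.0, 12.0, 52.0, 52.0, 0.80],      # IoU with the first box ~0.83 > 0.4 -> suppressed
    [100.0, 100.0, 150.0, 150.0, 0.70],  # no overlap with the others -> kept
])

keep = py_cpu_nms(dets, thresh=0.4)
print(keep)  # indices of surviving boxes, here [0, 2]
```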
41 | -------------------------------------------------------------------------------- /PASSify/face/weights/mobilenet0.25_Final.pth: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yukimasano/PASS/e91b4fc9daf219c765ec5816610e76306de0150c/PASSify/face/weights/mobilenet0.25_Final.pth -------------------------------------------------------------------------------- /PASSify/passify.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import sys 3 | 4 | step = int(sys.argv[1]) 5 | 6 | if step == 0: 7 | dir = str(sys.argv[2]) 8 | # enlist all files 9 | files = glob.glob(dir + '/*/**') 10 | f = open("0_all_files.txt", "a") 11 | i = 0 12 | for file in files: 13 | i += 1 14 | f.write(file +"\n") 15 | f.close() 16 | print(f"found {i} files") 17 | nj = (i // 80000) +1 # number of slurm jobs 18 | f = open('face/sbatch_face.sh','a') 19 | command = f"""#!/bin/bash 20 | #SBATCH --mem=5G 21 | #SBATCH --cpus-per-task=2 22 | #SBATCH --time=8:00:00 23 | #SBATCH --partition=compute # we only need CPUs, adapt to your cluster 24 | #SBATCH --array=0-{nj}%50 # being nice and not running more than 50 jobs in parallel 25 | #SBATCH --job-name=PASSify-face 26 | #SBATCH --constraint=10GbE 27 | 28 | 29 | cd face/ 30 | echo $SLURM_ARRAY_TASK_ID 31 | X=$(($SLURM_ARRAY_TASK_ID*80000)) 32 | Y=$((($SLURM_ARRAY_TASK_ID + 1)*80000)) 33 | 34 | in_file=_tmp_all_files_$SLURM_ARRAY_TASK_ID.txt 35 | rm $in_file 36 | results_dir='/facedetector_results/' 37 | < all_files.txt tail -n +"$X" | head -n "$((Y - X))" >> $in_file 38 | 39 | echo "from " $X 40 | echo "to " $Y 41 | 42 | 43 | # ETA 4-6Hz 44 | python3 -W ignore main_face_detector.py \ 45 | --input_txt=$in_file \ 46 | --save_folder=$results_dir \ 47 | --cpu 48 | """ 49 | f.write(command) 50 | f.close() 51 | 52 | 53 | if step == 1: 54 | files = glob.glob('facedetector_results/noface_index/*/**') 55 | f = open("1_no_faces.txt", "a") 56 | i = 0 57 | for file in files: 58 | i += 1 59 | f.write(file +"\n") 60 | f.close() 61 | print(f"left with {i} images that do not contain faces") 62 | nj = (i // 80000) +1 # number of slurm jobs 63 | f = open('person/sbatch_person.sh','a') 64 | command = f"""#!/bin/bash 65 | #SBATCH --mem=10G 66 | #SBATCH --cpus-per-task=5 67 | #SBATCH --time=8:00:00 # this is on the low-end, jobs might finish quicker. 
68 | #SBATCH --gres=gpu:1 69 | #SBATCH --partition=gpu # we need GPUs, adapt to your cluster 70 | #SBATCH --job-name=PASSify-person 71 | #SBATCH --array=0-{nj}%50 # being nice and not running more than 50 jobs in parallel 72 | 73 | 74 | 75 | echo $SLURM_ARRAY_TASK_ID 76 | X=$(($SLURM_ARRAY_TASK_ID*80000)) 77 | Y=$((($SLURM_ARRAY_TASK_ID + 1)*80000)) 78 | 79 | in_file=_tmp_noface_$SLURM_ARRAY_TASK_ID.txt 80 | rm $in_file 81 | results_dir='persondetector_results/' 82 | mkdir -p $results_dir 83 | < 1_no_faces.txt tail -n +"$X" | head -n "$((Y - X))" >> $in_file 84 | 85 | echo "from " $X 86 | echo "to " $Y 87 | 88 | # ETA 4-6Hz 89 | python3 -W -W ignore main_person_detector.py \ 90 | --img_list=$in_file \ 91 | --save_folder=$results_dir 92 | """ 93 | f.write(command) 94 | f.close() 95 | 96 | if step == 2: 97 | files = glob.glob('persondetector_results/noperson_index/*/**') 98 | f = open("2_no_faces__no_person.txt", "a") 99 | i = 0 100 | for file in files: 101 | i += 1 102 | f.write(file +"\n") 103 | f.close() 104 | print(f"left with {i} images that do not contain faces nor persons") 105 | print("final file of images left can be found in: 2_no_faces__no_person.txt") 106 | print("Note that the results are from automated algorithms," 107 | " and thus do not work 100% well and might work differently well on different humans, possibly introducing bias. " 108 | "For all real applications, please thoroughly run human evaluations.") 109 | -------------------------------------------------------------------------------- /PASSify/person/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yukimasano/PASS/e91b4fc9daf219c765ec5816610e76306de0150c/PASSify/person/__init__.py -------------------------------------------------------------------------------- /PASSify/person/cascade_rcnn.yaml: -------------------------------------------------------------------------------- 1 | MODEL: 2 | META_ARCHITECTURE: "GeneralizedRCNN" 3 | WEIGHTS: "/scratch/shared/beegfs/yuki/adiwol/experts/cascade_rcnn.pkl" 4 | BACKBONE: 5 | NAME: "build_resnet_fpn_backbone" 6 | MASK_ON: True 7 | RESNETS: 8 | OUT_FEATURES: ["res2", "res3", "res4", "res5"] 9 | DEPTH: 50 10 | FPN: 11 | IN_FEATURES: ["res2", "res3", "res4", "res5"] 12 | ANCHOR_GENERATOR: 13 | SIZES: [[32], [64], [128], [256], [512]] # One size for each in feature map 14 | ASPECT_RATIOS: [[0.5, 1.0, 2.0]] # Three aspect ratios (same for all in feature maps) 15 | RPN: 16 | IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"] 17 | PRE_NMS_TOPK_TRAIN: 2000 # Per FPN level 18 | PRE_NMS_TOPK_TEST: 1000 # Per FPN level 19 | POST_NMS_TOPK_TRAIN: 2000 20 | POST_NMS_TOPK_TEST: 1000 21 | 22 | ROI_HEADS: 23 | NAME: CascadeROIHeads 24 | IN_FEATURES: ["p2", "p3", "p4", "p5"] 25 | ROI_BOX_HEAD: 26 | NAME: "FastRCNNConvFCHead" 27 | CLS_AGNOSTIC_BBOX_REG: True 28 | NUM_FC: 2 29 | POOLER_RESOLUTION: 7 30 | ROI_MASK_HEAD: 31 | NAME: "MaskRCNNConvUpsampleHead" 32 | NUM_CONV: 4 33 | POOLER_RESOLUTION: 14 34 | DATASETS: 35 | TRAIN: ("coco_2017_train",) 36 | TEST: ("coco_2017_val",) 37 | SOLVER: 38 | IMS_PER_BATCH: 16 39 | BASE_LR: 0.02 40 | STEPS: (60000, 80000) 41 | MAX_ITER: 90000 42 | INPUT: 43 | MIN_SIZE_TRAIN: (640, 672, 704, 736, 768, 800) 44 | VERSION: 2 -------------------------------------------------------------------------------- /PASSify/person/main_person_detector.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import argparse 3 | import multiprocessing as mp 4 | 
import os 5 | import time 6 | import tqdm 7 | 8 | from detectron2.config import get_cfg 9 | from detectron2.data.detection_utils import read_image 10 | from detectron2.utils.logger import setup_logger 11 | 12 | from detectron2.data import MetadataCatalog 13 | from detectron2.engine.defaults import DefaultPredictor 14 | from detectron2.utils.visualizer import ColorMode, Visualizer 15 | 16 | 17 | class Runner(object): 18 | def __init__(self, cfg, instance_mode=ColorMode.IMAGE, parallel=False): 19 | """ 20 | Args: 21 | cfg (CfgNode): 22 | instance_mode (ColorMode): 23 | parallel (bool): whether to run the model in different processes from visualization. 24 | Useful since the visualization logic can be slow. 25 | """ 26 | self.metadata = MetadataCatalog.get( 27 | cfg.DATASETS.TEST[0] if len(cfg.DATASETS.TEST) else "__unused" 28 | ) 29 | self.cpu_device = torch.device("cpu") 30 | self.instance_mode = instance_mode 31 | 32 | self.parallel = parallel 33 | self.predictor = DefaultPredictor(cfg) 34 | 35 | 36 | def setup_cfg(args): 37 | # load config from file and command-line arguments 38 | cfg = get_cfg() 39 | cfg.merge_from_file(args.config_file) 40 | cfg.merge_from_list(args.opts) 41 | # Set score_threshold for builtin models 42 | cfg.MODEL.RETINANET.SCORE_THRESH_TEST = args.confidence_threshold 43 | cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = args.confidence_threshold 44 | cfg.MODEL.PANOPTIC_FPN.COMBINE.INSTANCES_CONFIDENCE_THRESH = args.confidence_threshold 45 | cfg.freeze() 46 | return cfg 47 | 48 | 49 | def get_parser(): 50 | parser = argparse.ArgumentParser(description="Detectron2 demo for builtin configs") 51 | parser.add_argument( 52 | "--config-file", 53 | default="cascade_rcnn.yaml", 54 | metavar="FILE", 55 | help="path to config file", 56 | ) 57 | parser.add_argument( 58 | "--img_list", 59 | type=str, 60 | default='', 61 | ) 62 | parser.add_argument( 63 | "--output", 64 | help="outputfolder", 65 | ) 66 | parser.add_argument( 67 | "--confidence-threshold", 68 | type=float, 69 | default=0.5, 70 | help="Minimum score for instance predictions to be shown", 71 | ) 72 | parser.add_argument( 73 | "--opts", 74 | help="Modify config options using the command-line 'KEY VALUE' pairs", 75 | default=[], 76 | nargs=argparse.REMAINDER, 77 | ) 78 | parser.add_argument('--save_folder', default='./testout/', type=str, help='Dir to save txt results') 79 | 80 | return parser 81 | 82 | 83 | def touch(fname, times=None): 84 | with open(fname, 'a'): 85 | os.utime(fname, times) 86 | 87 | if __name__ == "__main__": 88 | mp.set_start_method("spawn", force=True) 89 | args = get_parser().parse_args() 90 | setup_logger(name="fvcore") 91 | logger = setup_logger() 92 | logger.info("Arguments: " + str(args)) 93 | 94 | cfg = setup_cfg(args) 95 | 96 | runner = Runner(cfg) 97 | if args.img_list != '': 98 | f_list = open(args.img_list, 'r') 99 | test_dataset = f_list.readlines() 100 | test_dataset = [_d.strip() for _d in test_dataset] 101 | print(f'done preparing dataset!, N={len(test_dataset)}', flush=True) 102 | 103 | for image_path in tqdm.tqdm(test_dataset): 104 | # use PIL, to be consistent with evaluation 105 | img_name = image_path.split('/')[-1] 106 | subfolder = image_path.split('/')[-2] 107 | save_name = os.path.join(args.save_folder, subfolder, img_name.split('.')[0] + ".txt") 108 | 109 | img = read_image(image_path, format="BGR") 110 | start_time = time.time() 111 | predictions = runner.predictor(img) 112 | logger.info( 113 | "{}: {} in {:.2f}s".format( 114 | img_name, 115 | "detected {} 
instances".format(len(predictions["instances"])) 116 | if "instances" in predictions 117 | else "finished", 118 | time.time() - start_time, 119 | ) 120 | ) 121 | classes, scores = predictions["instances"].pred_classes.cpu(), predictions["instances"].scores.cpu() 122 | # -------------------------------------------------------------------- 123 | name_ifperson = os.path.join(args.save_folder, 'person_index', subfolder, img_name) 124 | name_ifnoperson = os.path.join(args.save_folder, 'noperson_index', subfolder, img_name ) 125 | 126 | dirname = os.path.dirname(save_name) 127 | if not os.path.isdir(dirname): 128 | os.makedirs(dirname) 129 | with open(save_name, "w") as fd: 130 | has_person = 1 if any([int(c) == 0 for c in classes]) else 0 131 | fd.write(str(has_person) + " \n") 132 | for _c, _s in zip(classes, scores): 133 | line = str(_c.item()) + ":" + str(_s.item()) + " \n" 134 | fd.write(line) 135 | if has_person: 136 | if not os.path.isdir(os.path.dirname(name_ifperson)): 137 | os.makedirs(os.path.dirname(name_ifperson)) 138 | touch(name_ifperson) 139 | else: 140 | if not os.path.isdir(os.path.dirname(name_ifnoperson)): 141 | os.makedirs(os.path.dirname(name_ifnoperson)) 142 | touch(name_ifnoperson) -------------------------------------------------------------------------------- /PASSify/person/sbatch_person_example.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --mem=10G 3 | #SBATCH --cpus-per-task=5 4 | #SBATCH --time=5:00:00 5 | #SBATCH --gres=gpu:1 6 | #SBATCH --partition=gpu 7 | #SBATCH --job-name=PASSify-person 8 | #SBATCH --array=0-175%50 # being nice and not running more than 50 jobs in parallel 9 | 10 | 11 | 12 | echo $SLURM_ARRAY_TASK_ID 13 | X=$((${SLURM_ARRAY_TASK_ID}*80000)) 14 | Y=$(((${SLURM_ARRAY_TASK_ID} + 1)*80000)) 15 | 16 | in_file=_tmp_noface_${SLURM_ARRAY_TASK_ID}.txt 17 | rm ${in_file} 18 | results_dir='persondetector_results/' 19 | mkdir -p ${results_dir} 20 | < 1_no_faces.txt tail -n +"$X" | head -n "$((Y - X))" >> ${in_file} 21 | 22 | echo "from " ${X} 23 | echo "to " ${Y} 24 | 25 | # ETA 4-6Hz 26 | /scratch/shared/beegfs/yuki/envs/py37/bin/python3 -W ignore main_person_detector.py \ 27 | --img_list=${in_file} \ 28 | --save_folder=${results_dir} -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PASS: Pictures without humAns for Self-Supervised Pretraining 2 | **TL;DR:** An ImageNet replacement dataset for self-supervised pretraining without humans 3 | 4 | ![img.png](img.png?style=centerme) 5 | 6 | 7 | 8 | ## Content 9 | PASS is a large-scale image dataset that does not include any humans, human parts, or other personally identifiable information that can be used for high-quality pretraining while significantly reducing privacy concerns. 10 | 11 | ![pass.gif](pass.gif) 12 | 13 | ## Download the dataset 14 | 15 | The quickest way: 16 | ```sh 17 | git clone https://github.com/yukimasano/PASS 18 | cd PASS 19 | source download.sh # maybe change the directory where you want to download it 20 | ``` 21 | Generally: all information is on our [webpage](https://www.robots.ox.ac.uk/~vgg/research/pass/). 22 | 23 | For downloading the dataset, please visit our [dataset on zenodo](https://zenodo.org/record/6615455). There you can download it in tar files and find the meta-data. 
24 | 25 | You can also download the images from their AWS urls, from [here](https://www.robots.ox.ac.uk/~vgg/research/pass/pass_urls.txt). 26 | 27 | ## Pretrained models 28 | | Pretraining | Method | Epochs | IN-1k Acc. | Places205 Acc. | | 29 | |-------------|------------------------------------------------------------------------|--------|------------|----------------|----------------------------------------------------------------------------------------------------------------------------------------------| 30 | | (IN-1k) | [MoCo-v2 ](https://github.com/facebookresearch/moco) | 200 | 60.6 | 50.1 | [visit MoCo-v2 repo](https://github.com/facebookresearch/moco#models) | 31 | | PASS | [MoCo-v2](https://github.com/facebookresearch/moco) | 180 | 59.1 | 52.8 | [R50 weights](https://www.robots.ox.ac.uk/~vgg/research/pass/pretrained_models/moco_v2_180ep_of200ep.pth.tar) | 32 | | PASS | [MoCo-v2](https://github.com/facebookresearch/moco) | 200 | 59.5 | 52.8 | [R50 weights](https://www.robots.ox.ac.uk/~vgg/research/pass/pretrained_models/moco_v2_200ep.pth.tar) | 33 | | PASS | [MoCo-v2](https://github.com/facebookresearch/moco) | 800 | 61.2 | 54.0 | [R50 weights](https://www.robots.ox.ac.uk/~vgg/research/pass/pretrained_models/moco_v2_800ep.pth.tar) | 34 | | PASS | [MoCo-v2 (R18)](https://github.com/facebookresearch/moco) | 800 | 45.3 | 44.4 | [R18 weights](https://www.robots.ox.ac.uk/~vgg/research/pass/pretrained_models/moco_v2_r18_800ep.pth.tar) | 35 | | PASS | [MoCo-v2-CLD](https://github.com/frank-xwang/CLD-UnsupervisedLearning) | 200 | 60.2 | 53.1 | [R50 weights](https://www.robots.ox.ac.uk/~vgg/research/pass/pretrained_models/moco_v2_CLD_200ep.pth.tar) | 36 | | PASS | [SwAV](https://github.com/facebookresearch/swav) | 200 | 60.8 | 55.5 | [R50 weights](https://www.robots.ox.ac.uk/~vgg/research/pass/pretrained_models/swav_200ep.pth.tar) | 37 | | PASS | [DINO](https://github.com/facebookresearch/dino) | 100 | 61.3 | 54.6 | [ViT S16 weights](https://www.robots.ox.ac.uk/~vgg/research/pass/pretrained_models/dino_deit_100ep.pth.tar) | 38 | | PASS | [DINO](https://github.com/facebookresearch/dino) | 300 | 65.0 | 55.7 | [ViT S16 weights](https://www.robots.ox.ac.uk/~vgg/research/pass/pretrained_models/dino_deit_300ep_ttemp0o07_warumup30ep_normlayerF.pth.tar) | 39 | 40 | 41 | In the table above we give the download links to the full checkpoints (including momentum encoder etc.) to the models we've trained. 42 | For comparison, we include MoCo-v2 trained on ILSVRC-12 ("IN-1k") and report linear probing performance on IN-1k and Places205. 43 | 44 | ## Pretrained models from PyTorch Hub 45 | ```python 46 | import torch 47 | vits16_100ep = torch.hub.load('yukimasano/PASS:main', 'dino_100ep_vits16') 48 | vits16 = torch.hub.load('yukimasano/PASS:main', 'dino_vits16') 49 | r50_swav_200ep = torch.hub.load('yukimasano/PASS:main', 'swav_resnet50') 50 | r50_moco_800ep = torch.hub.load('yukimasano/PASS:main', 'moco_resnet50') 51 | r50_moco_cld_200ep = torch.hub.load('yukimasano/PASS:main', 'moco_cld_resnet50') 52 | ``` 53 | 54 | ## PASSify your dataset 55 | In the folder [PASSify](PASSify/README.md) of this repo, you can find automated scripts that try to remove humans from image datasets. 56 | 57 | ### Contribute your models 58 | 59 | Please let us know if you have a model pretrained on this dataset and I will add this to the list above. 60 | 61 | ## Citation 62 | ``` 63 | @Article{asano21pass, 64 | author = "Yuki M. 
Asano and Christian Rupprecht and Andrew Zisserman and Andrea Vedaldi", 65 | title = "PASS: An ImageNet replacement for self-supervised pretraining without humans", 66 | journal = "NeurIPS Track on Datasets and Benchmarks", 67 | year = "2021" 68 | } 69 | ``` 70 | -------------------------------------------------------------------------------- /download.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # download files 4 | echo "downloading dataset tar files" 5 | for PART in 0 1 2 3 4 5 6 7 8 9 6 | do 7 | echo "download part" $PART 8 | curl https://zenodo.org/record/6615455/files/PASS.${PART}.tar --output PASS.${PART}.tar 9 | done 10 | 11 | # extract dataset 12 | ## will create dataset with images in PASS_dataset/dummy_folder/img-hash.jpeg 13 | for file in *.tar; do tar -xf "$file"; done 14 | 15 | # you can use this now e.g. with torchvision.datasets.ImageFolder('/dir/to/PASS') 16 | -------------------------------------------------------------------------------- /hubconf.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torchvision.models.resnet import resnet50 as __resnet50 3 | 4 | import vision_transformer as vits 5 | 6 | dependencies = ["torch", "torchvision"] 7 | 8 | def dino_vits16(pretrained=True, **kwargs): 9 | """ 10 | ViT-Small/16x16 pre-trained with DINO for 300 epochs, teacher-temp=0.07, warmup epochs=30, norm-layer=False 11 | """ 12 | model = vits.__dict__["vit_small"](patch_size=16, num_classes=0, **kwargs) 13 | if pretrained: 14 | state_dict = torch.hub.load_state_dict_from_url( 15 | url="https://www.robots.ox.ac.uk/~vgg/research/pass/pretrained_models/dino_deit_300ep_ttemp0o07_warumup30ep_normlayerF.pth.tar", 16 | map_location="cpu", 17 | )['teacher'] 18 | state_dict = __clean_ckpt(state_dict) 19 | msg = model.load_state_dict(state_dict, strict=False) 20 | print(msg) 21 | return model 22 | 23 | 24 | def dino_100ep_vits16(pretrained=True, **kwargs): 25 | """ 26 | ViT-Small/16x16 pre-trained with DINO. 
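    Checkpoint trained for 100 epochs (weights: dino_deit_100ep.pth.tar).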
27 | """ 28 | model = vits.__dict__["vit_small"](patch_size=16, num_classes=0, **kwargs) 29 | if pretrained: 30 | state_dict = torch.hub.load_state_dict_from_url( 31 | url="https://www.robots.ox.ac.uk/~vgg/research/pass/pretrained_models/dino_deit_100ep.pth.tar", 32 | map_location="cpu", 33 | )['teacher'] 34 | state_dict = __clean_ckpt(state_dict) 35 | msg = model.load_state_dict(state_dict, strict=False) 36 | print(msg) 37 | return model 38 | 39 | 40 | def moco_resnet50(pretrained=True, **kwargs): 41 | """ 42 | ResNet-50 pre-trained with MoCo-v2 for 800epochs 43 | """ 44 | model = __resnet50(pretrained=False, **kwargs) 45 | model.fc = torch.nn.Identity() 46 | if pretrained: 47 | state_dict = torch.hub.load_state_dict_from_url( 48 | url="https://www.robots.ox.ac.uk/~vgg/research/pass/pretrained_models/moco_v2_800ep.pth.tar", 49 | map_location="cpu", 50 | )['state_dict'] 51 | state_dict = __clean_ckpt(state_dict) 52 | msg = model.load_state_dict(state_dict, strict=False) 53 | print(msg) 54 | return model 55 | 56 | def moco_resnet50_200ep(pretrained=True, **kwargs): 57 | """ 58 | ResNet-50 pre-trained with MoCo-v2 for 200epochs 59 | """ 60 | model = __resnet50(pretrained=False, **kwargs) 61 | model.fc = torch.nn.Identity() 62 | if pretrained: 63 | state_dict = torch.hub.load_state_dict_from_url( 64 | url="https://www.robots.ox.ac.uk/~vgg/research/pass/pretrained_models/moco_v2_200ep.pth.tar", 65 | map_location="cpu", 66 | )['state_dict'] 67 | state_dict = __clean_ckpt(state_dict) 68 | msg = model.load_state_dict(state_dict, strict=False) 69 | print(msg) 70 | return model 71 | 72 | def moco_cld_resnet50(pretrained=True, **kwargs): 73 | """ 74 | ResNet-50 pre-trained with MoCo-v2 for 200epochs 75 | """ 76 | model = __resnet50(pretrained=False, **kwargs) 77 | model.fc = torch.nn.Identity() 78 | if pretrained: 79 | state_dict = torch.hub.load_state_dict_from_url( 80 | url="https://www.robots.ox.ac.uk/~vgg/research/pass/pretrained_models/moco_v2_CLD_200ep.pth.tar", 81 | map_location="cpu", 82 | )['state_dict'] 83 | state_dict = __clean_ckpt(state_dict) 84 | msg = model.load_state_dict(state_dict, strict=False) 85 | print(msg) 86 | return model 87 | 88 | def swav_resnet50(pretrained=True, **kwargs): 89 | """ 90 | ResNet-50 pre-trained with SwAV for 200 epochs. 2 large crops 6 small ones. 
91 | """ 92 | model = __resnet50(pretrained=False, **kwargs) 93 | model.fc = torch.nn.Identity() 94 | if pretrained: 95 | state_dict = torch.hub.load_state_dict_from_url( 96 | url="https://www.robots.ox.ac.uk/~vgg/research/pass/pretrained_models/swav_200ep.pth.tar", 97 | map_location="cpu", 98 | )['state_dict'] 99 | state_dict = __clean_ckpt(state_dict) 100 | msg = model.load_state_dict(state_dict, strict=False) 101 | print(msg) 102 | return model 103 | 104 | def __clean_ckpt(state_dict): 105 | is_moco = any(['module.encoder_q' in k for k in state_dict.keys()]) 106 | if is_moco: 107 | state_dict = {k.replace('module.encoder_q.',''):v for k,v in state_dict.items() if 'encoder_q' in k} 108 | else: 109 | state_dict = {k.replace('module.',''):v for k,v in state_dict.items()} 110 | return state_dict 111 | -------------------------------------------------------------------------------- /img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yukimasano/PASS/e91b4fc9daf219c765ec5816610e76306de0150c/img.png -------------------------------------------------------------------------------- /pass.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yukimasano/PASS/e91b4fc9daf219c765ec5816610e76306de0150c/pass.gif -------------------------------------------------------------------------------- /version_history.txt: -------------------------------------------------------------------------------- 1 | 21.09.2021: 2 | v1: initial release: 1,440,191 images 3 | 4 | 14.10.2021 5 | v2: Removed 472 images, now 1,439,719 images. Thanks to the Know-your-data (https://knowyourdata-tfds.withgoogle.com/#dataset=pass) page (published on the 13.10.2021), we were able to identify 472 further images that contained humans. 6 | Most images that we have removed only contained human depictions (e.g. in newspapers, black-white portraits, ads) in some background, and very few were actual photographs of people (<50). We used KYD to sort images both by face area and face probablity to find all images that were missed in v1. 7 | We have further added more metadata that will aid further analysis in KYD in the future. 8 | 9 | 07.04.2022 10 | v3: Compared to v2.0 we have removed further 131 images that mostly contained faces, other body parts or images of tattoos. -------------------------------------------------------------------------------- /vision_transformer.py: -------------------------------------------------------------------------------- 1 | """ 2 | copied from DINO: https://github.com/facebookresearch/dino 3 | Mostly copy-paste from timm library. 4 | https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py 5 | """ 6 | import math 7 | from functools import partial 8 | 9 | import torch 10 | import torch.nn as nn 11 | 12 | 13 | def _no_grad_trunc_normal_(tensor, mean, std, a, b): 14 | # Cut & paste from PyTorch official master until it's in a few official releases - RW 15 | # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf 16 | def norm_cdf(x): 17 | # Computes standard normal cumulative distribution function 18 | return (1. + math.erf(x / math.sqrt(2.))) / 2. 19 | 20 | if (mean < a - 2 * std) or (mean > b + 2 * std): 21 | warnings.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. 
" 22 | "The distribution of values may be incorrect.", 23 | stacklevel=2) 24 | 25 | with torch.no_grad(): 26 | # Values are generated by using a truncated uniform distribution and 27 | # then using the inverse CDF for the normal distribution. 28 | # Get upper and lower cdf values 29 | l = norm_cdf((a - mean) / std) 30 | u = norm_cdf((b - mean) / std) 31 | 32 | # Uniformly fill tensor with values from [l, u], then translate to 33 | # [2l-1, 2u-1]. 34 | tensor.uniform_(2 * l - 1, 2 * u - 1) 35 | 36 | # Use inverse cdf transform for normal distribution to get truncated 37 | # standard normal 38 | tensor.erfinv_() 39 | 40 | # Transform to proper mean, std 41 | tensor.mul_(std * math.sqrt(2.)) 42 | tensor.add_(mean) 43 | 44 | # Clamp to ensure it's in the proper range 45 | tensor.clamp_(min=a, max=b) 46 | return tensor 47 | 48 | 49 | def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.): 50 | # type: (Tensor, float, float, float, float) -> Tensor 51 | return _no_grad_trunc_normal_(tensor, mean, std, a, b) 52 | 53 | 54 | def drop_path(x, drop_prob: float = 0., training: bool = False): 55 | if drop_prob == 0. or not training: 56 | return x 57 | keep_prob = 1 - drop_prob 58 | shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets 59 | random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device) 60 | random_tensor.floor_() # binarize 61 | output = x.div(keep_prob) * random_tensor 62 | return output 63 | 64 | 65 | class DropPath(nn.Module): 66 | """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 67 | """ 68 | def __init__(self, drop_prob=None): 69 | super(DropPath, self).__init__() 70 | self.drop_prob = drop_prob 71 | 72 | def forward(self, x): 73 | return drop_path(x, self.drop_prob, self.training) 74 | 75 | 76 | class Mlp(nn.Module): 77 | def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): 78 | super().__init__() 79 | out_features = out_features or in_features 80 | hidden_features = hidden_features or in_features 81 | self.fc1 = nn.Linear(in_features, hidden_features) 82 | self.act = act_layer() 83 | self.fc2 = nn.Linear(hidden_features, out_features) 84 | self.drop = nn.Dropout(drop) 85 | 86 | def forward(self, x): 87 | x = self.fc1(x) 88 | x = self.act(x) 89 | x = self.drop(x) 90 | x = self.fc2(x) 91 | x = self.drop(x) 92 | return x 93 | 94 | 95 | class Attention(nn.Module): 96 | def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.): 97 | super().__init__() 98 | self.num_heads = num_heads 99 | head_dim = dim // num_heads 100 | self.scale = qk_scale or head_dim ** -0.5 101 | 102 | self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) 103 | self.attn_drop = nn.Dropout(attn_drop) 104 | self.proj = nn.Linear(dim, dim) 105 | self.proj_drop = nn.Dropout(proj_drop) 106 | 107 | def forward(self, x): 108 | B, N, C = x.shape 109 | qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) 110 | q, k, v = qkv[0], qkv[1], qkv[2] 111 | 112 | attn = (q @ k.transpose(-2, -1)) * self.scale 113 | attn = attn.softmax(dim=-1) 114 | attn = self.attn_drop(attn) 115 | 116 | x = (attn @ v).transpose(1, 2).reshape(B, N, C) 117 | x = self.proj(x) 118 | x = self.proj_drop(x) 119 | return x, attn 120 | 121 | 122 | class Block(nn.Module): 123 | def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., 124 | drop_path=0., act_layer=nn.GELU, 
norm_layer=nn.LayerNorm): 125 | super().__init__() 126 | self.norm1 = norm_layer(dim) 127 | self.attn = Attention( 128 | dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) 129 | self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() 130 | self.norm2 = norm_layer(dim) 131 | mlp_hidden_dim = int(dim * mlp_ratio) 132 | self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) 133 | 134 | def forward(self, x, return_attention=False): 135 | y, attn = self.attn(self.norm1(x)) 136 | if return_attention: 137 | return attn 138 | x = x + self.drop_path(y) 139 | x = x + self.drop_path(self.mlp(self.norm2(x))) 140 | return x 141 | 142 | 143 | class PatchEmbed(nn.Module): 144 | """ Image to Patch Embedding 145 | """ 146 | def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): 147 | super().__init__() 148 | num_patches = (img_size // patch_size) * (img_size // patch_size) 149 | self.img_size = img_size 150 | self.patch_size = patch_size 151 | self.num_patches = num_patches 152 | 153 | self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) 154 | 155 | def forward(self, x): 156 | B, C, H, W = x.shape 157 | x = self.proj(x).flatten(2).transpose(1, 2) 158 | return x 159 | 160 | 161 | class VisionTransformer(nn.Module): 162 | """ Vision Transformer """ 163 | def __init__(self, img_size=[224], patch_size=16, in_chans=3, num_classes=0, embed_dim=768, depth=12, 164 | num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0., 165 | drop_path_rate=0., norm_layer=nn.LayerNorm, **kwargs): 166 | super().__init__() 167 | self.num_features = self.embed_dim = embed_dim 168 | 169 | self.patch_embed = PatchEmbed( 170 | img_size=img_size[0], patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim) 171 | num_patches = self.patch_embed.num_patches 172 | 173 | self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) 174 | self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim)) 175 | self.pos_drop = nn.Dropout(p=drop_rate) 176 | 177 | dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule 178 | self.blocks = nn.ModuleList([ 179 | Block( 180 | dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale, 181 | drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer) 182 | for i in range(depth)]) 183 | self.norm = norm_layer(embed_dim) 184 | 185 | # Classifier head 186 | self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity() 187 | 188 | trunc_normal_(self.pos_embed, std=.02) 189 | trunc_normal_(self.cls_token, std=.02) 190 | self.apply(self._init_weights) 191 | 192 | def _init_weights(self, m): 193 | if isinstance(m, nn.Linear): 194 | trunc_normal_(m.weight, std=.02) 195 | if isinstance(m, nn.Linear) and m.bias is not None: 196 | nn.init.constant_(m.bias, 0) 197 | elif isinstance(m, nn.LayerNorm): 198 | nn.init.constant_(m.bias, 0) 199 | nn.init.constant_(m.weight, 1.0) 200 | 201 | def interpolate_pos_encoding(self, x, w, h): 202 | npatch = x.shape[1] - 1 203 | N = self.pos_embed.shape[1] - 1 204 | if npatch == N and w == h: 205 | return self.pos_embed 206 | class_pos_embed = self.pos_embed[:, 0] 207 | patch_pos_embed = self.pos_embed[:, 1:] 208 | dim = x.shape[-1] 209 | w0 = w // self.patch_embed.patch_size 210 | h0 = h // self.patch_embed.patch_size 211 | # we add a small number to 
avoid floating point error in the interpolation 212 | # see discussion at https://github.com/facebookresearch/dino/issues/8 213 | w0, h0 = w0 + 0.1, h0 + 0.1 214 | patch_pos_embed = nn.functional.interpolate( 215 | patch_pos_embed.reshape(1, int(math.sqrt(N)), int(math.sqrt(N)), dim).permute(0, 3, 1, 2), 216 | scale_factor=(w0 / math.sqrt(N), h0 / math.sqrt(N)), 217 | mode='bicubic', 218 | ) 219 | assert int(w0) == patch_pos_embed.shape[-2] and int(h0) == patch_pos_embed.shape[-1] 220 | patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) 221 | return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1) 222 | 223 | def prepare_tokens(self, x): 224 | B, nc, w, h = x.shape 225 | x = self.patch_embed(x) # patch linear embedding 226 | 227 | # add the [CLS] token to the embed patch tokens 228 | cls_tokens = self.cls_token.expand(B, -1, -1) 229 | x = torch.cat((cls_tokens, x), dim=1) 230 | 231 | # add positional encoding to each token 232 | x = x + self.interpolate_pos_encoding(x, w, h) 233 | 234 | return self.pos_drop(x) 235 | 236 | def forward(self, x): 237 | x = self.prepare_tokens(x) 238 | for blk in self.blocks: 239 | x = blk(x) 240 | x = self.norm(x) 241 | return x[:, 0] 242 | 243 | def get_last_selfattention(self, x): 244 | x = self.prepare_tokens(x) 245 | for i, blk in enumerate(self.blocks): 246 | if i < len(self.blocks) - 1: 247 | x = blk(x) 248 | else: 249 | # return attention of the last block 250 | return blk(x, return_attention=True) 251 | 252 | def get_intermediate_layers(self, x, n=1): 253 | x = self.prepare_tokens(x) 254 | # we return the output tokens from the `n` last blocks 255 | output = [] 256 | for i, blk in enumerate(self.blocks): 257 | x = blk(x) 258 | if len(self.blocks) - i <= n: 259 | output.append(self.norm(x)) 260 | return output 261 | 262 | 263 | def vit_tiny(patch_size=16, **kwargs): 264 | model = VisionTransformer( 265 | patch_size=patch_size, embed_dim=192, depth=12, num_heads=3, mlp_ratio=4, 266 | qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) 267 | return model 268 | 269 | 270 | def vit_small(patch_size=16, **kwargs): 271 | model = VisionTransformer( 272 | patch_size=patch_size, embed_dim=384, depth=12, num_heads=6, mlp_ratio=4, 273 | qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) 274 | return model 275 | 276 | 277 | def vit_base(patch_size=16, **kwargs): 278 | model = VisionTransformer( 279 | patch_size=patch_size, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, 280 | qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) 281 | return model 282 | 283 | 284 | class DINOHead(nn.Module): 285 | def __init__(self, in_dim, out_dim, use_bn=False, norm_last_layer=True, nlayers=3, hidden_dim=2048, bottleneck_dim=256): 286 | super().__init__() 287 | nlayers = max(nlayers, 1) 288 | if nlayers == 1: 289 | self.mlp = nn.Linear(in_dim, bottleneck_dim) 290 | else: 291 | layers = [nn.Linear(in_dim, hidden_dim)] 292 | if use_bn: 293 | layers.append(nn.BatchNorm1d(hidden_dim)) 294 | layers.append(nn.GELU()) 295 | for _ in range(nlayers - 2): 296 | layers.append(nn.Linear(hidden_dim, hidden_dim)) 297 | if use_bn: 298 | layers.append(nn.BatchNorm1d(hidden_dim)) 299 | layers.append(nn.GELU()) 300 | layers.append(nn.Linear(hidden_dim, bottleneck_dim)) 301 | self.mlp = nn.Sequential(*layers) 302 | self.apply(self._init_weights) 303 | self.last_layer = nn.utils.weight_norm(nn.Linear(bottleneck_dim, out_dim, bias=False)) 304 | self.last_layer.weight_g.data.fill_(1) 305 | if norm_last_layer: 
306 | self.last_layer.weight_g.requires_grad = False 307 | 308 | def _init_weights(self, m): 309 | if isinstance(m, nn.Linear): 310 | trunc_normal_(m.weight, std=.02) 311 | if isinstance(m, nn.Linear) and m.bias is not None: 312 | nn.init.constant_(m.bias, 0) 313 | 314 | def forward(self, x): 315 | x = self.mlp(x) 316 | x = nn.functional.normalize(x, dim=-1, p=2) 317 | x = self.last_layer(x) 318 | return x 319 | --------------------------------------------------------------------------------
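A minimal end-to-end sketch (not part of the repository): loading a PASS-pretrained DINO ViT-S/16 through the hub entry listed in the README and querying the model defined above for features and the last block's self-attention. The input is a random tensor standing in for an image batch, and the hub call downloads the pretrained weights on first use:

```python
# Minimal usage sketch of the PASS-pretrained DINO ViT-S/16 (hub entry from the README).
import torch

model = torch.hub.load('yukimasano/PASS:main', 'dino_vits16')  # 300-epoch DINO checkpoint
model.eval()

x = torch.randn(1, 3, 224, 224)  # dummy batch standing in for a real image
with torch.no_grad():
    feats = model(x)                        # [CLS] embedding of the last block, dim 384 for ViT-S
    attn = model.get_last_selfattention(x)  # self-attention of the final block

print(feats.shape)  # torch.Size([1, 384])
print(attn.shape)   # torch.Size([1, 6, 197, 197]) -> 6 heads, 196 patch tokens + 1 [CLS] token
```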