├── README.md ├── deploy.prototxt ├── face_full_conv.prototxt ├── image_preprocess.py ├── solver.prototxt ├── test.py ├── train.sh └── train_val.prototxt /README.md: -------------------------------------------------------------------------------- 1 | # FaceDetection_CNN 2 | Implement Yahoo Paper: Multi-view Face Detection Using Deep Convolutional Neural Networks

3 | 1. Preprocess the AFLW dataset [1]: use crops with IoU >= 0.5 as positives and crops with IoU <= 0.3 as negatives.

4 | 2. Fine-tune AlexNet on the AFLW dataset.

5 | 3. Convert the fully connected layers into convolutional layers by reshaping the layer parameters; see [2].

6 | 4. Compute a heat map for each scale of the image.

7 | 5. Apply non-maximum suppression to the heat maps to accurately localize the faces.

8 | 9 | ========== 10 | Reference: 11 | [1]https://lrs.icg.tugraz.at/research/aflw/

12 | [2] http://nbviewer.ipython.org/github/BVLC/caffe/blob/master/examples/net_surgery.ipynb

13 | 14 | -------------------------------------------------------------------------------- /deploy.prototxt: -------------------------------------------------------------------------------- 1 | name: "CaffeNet" 2 | input: "data" 3 | input_dim: 10 4 | input_dim: 3 5 | input_dim: 227 6 | input_dim: 227 7 | layer { 8 | name: "conv1" 9 | type: "Convolution" 10 | bottom: "data" 11 | top: "conv1" 12 | param { 13 | lr_mult: 1 14 | decay_mult: 1 15 | } 16 | param { 17 | lr_mult: 2 18 | decay_mult: 0 19 | } 20 | convolution_param { 21 | num_output: 96 22 | kernel_size: 11 23 | stride: 4 24 | weight_filler { 25 | type: "gaussian" 26 | std: 0.01 27 | } 28 | bias_filler { 29 | type: "constant" 30 | value: 0 31 | } 32 | } 33 | } 34 | layer { 35 | name: "relu1" 36 | type: "ReLU" 37 | bottom: "conv1" 38 | top: "conv1" 39 | } 40 | layer { 41 | name: "pool1" 42 | type: "Pooling" 43 | bottom: "conv1" 44 | top: "pool1" 45 | pooling_param { 46 | pool: MAX 47 | kernel_size: 3 48 | stride: 2 49 | } 50 | } 51 | layer { 52 | name: "norm1" 53 | type: "LRN" 54 | bottom: "pool1" 55 | top: "norm1" 56 | lrn_param { 57 | local_size: 5 58 | alpha: 0.0001 59 | beta: 0.75 60 | } 61 | } 62 | layer { 63 | name: "conv2" 64 | type: "Convolution" 65 | bottom: "norm1" 66 | top: "conv2" 67 | param { 68 | lr_mult: 1 69 | decay_mult: 1 70 | } 71 | param { 72 | lr_mult: 2 73 | decay_mult: 0 74 | } 75 | convolution_param { 76 | num_output: 256 77 | pad: 2 78 | kernel_size: 5 79 | group: 2 80 | weight_filler { 81 | type: "gaussian" 82 | std: 0.01 83 | } 84 | bias_filler { 85 | type: "constant" 86 | value: 1 87 | } 88 | } 89 | } 90 | layer { 91 | name: "relu2" 92 | type: "ReLU" 93 | bottom: "conv2" 94 | top: "conv2" 95 | } 96 | layer { 97 | name: "pool2" 98 | type: "Pooling" 99 | bottom: "conv2" 100 | top: "pool2" 101 | pooling_param { 102 | pool: MAX 103 | kernel_size: 3 104 | stride: 2 105 | } 106 | } 107 | layer { 108 | name: "norm2" 109 | type: "LRN" 110 | bottom: "pool2" 111 | top: "norm2" 112 | lrn_param 
{ 113 | local_size: 5 114 | alpha: 0.0001 115 | beta: 0.75 116 | } 117 | } 118 | layer { 119 | name: "conv3" 120 | type: "Convolution" 121 | bottom: "norm2" 122 | top: "conv3" 123 | param { 124 | lr_mult: 1 125 | decay_mult: 1 126 | } 127 | param { 128 | lr_mult: 2 129 | decay_mult: 0 130 | } 131 | convolution_param { 132 | num_output: 384 133 | pad: 1 134 | kernel_size: 3 135 | weight_filler { 136 | type: "gaussian" 137 | std: 0.01 138 | } 139 | bias_filler { 140 | type: "constant" 141 | value: 0 142 | } 143 | } 144 | } 145 | layer { 146 | name: "relu3" 147 | type: "ReLU" 148 | bottom: "conv3" 149 | top: "conv3" 150 | } 151 | layer { 152 | name: "conv4" 153 | type: "Convolution" 154 | bottom: "conv3" 155 | top: "conv4" 156 | param { 157 | lr_mult: 1 158 | decay_mult: 1 159 | } 160 | param { 161 | lr_mult: 2 162 | decay_mult: 0 163 | } 164 | convolution_param { 165 | num_output: 384 166 | pad: 1 167 | kernel_size: 3 168 | group: 2 169 | weight_filler { 170 | type: "gaussian" 171 | std: 0.01 172 | } 173 | bias_filler { 174 | type: "constant" 175 | value: 1 176 | } 177 | } 178 | } 179 | layer { 180 | name: "relu4" 181 | type: "ReLU" 182 | bottom: "conv4" 183 | top: "conv4" 184 | } 185 | layer { 186 | name: "conv5" 187 | type: "Convolution" 188 | bottom: "conv4" 189 | top: "conv5" 190 | param { 191 | lr_mult: 1 192 | decay_mult: 1 193 | } 194 | param { 195 | lr_mult: 2 196 | decay_mult: 0 197 | } 198 | convolution_param { 199 | num_output: 256 200 | pad: 1 201 | kernel_size: 3 202 | group: 2 203 | weight_filler { 204 | type: "gaussian" 205 | std: 0.01 206 | } 207 | bias_filler { 208 | type: "constant" 209 | value: 1 210 | } 211 | } 212 | } 213 | layer { 214 | name: "relu5" 215 | type: "ReLU" 216 | bottom: "conv5" 217 | top: "conv5" 218 | } 219 | layer { 220 | name: "pool5" 221 | type: "Pooling" 222 | bottom: "conv5" 223 | top: "pool5" 224 | pooling_param { 225 | pool: MAX 226 | kernel_size: 3 227 | stride: 2 228 | } 229 | } 230 | layer { 231 | name: "fc6" 232 | type: 
"InnerProduct" 233 | bottom: "pool5" 234 | top: "fc6" 235 | param { 236 | lr_mult: 1 237 | decay_mult: 1 238 | } 239 | param { 240 | lr_mult: 2 241 | decay_mult: 0 242 | } 243 | inner_product_param { 244 | num_output: 4096 245 | weight_filler { 246 | type: "gaussian" 247 | std: 0.005 248 | } 249 | bias_filler { 250 | type: "constant" 251 | value: 1 252 | } 253 | } 254 | } 255 | layer { 256 | name: "relu6" 257 | type: "ReLU" 258 | bottom: "fc6" 259 | top: "fc6" 260 | } 261 | layer { 262 | name: "drop6" 263 | type: "Dropout" 264 | bottom: "fc6" 265 | top: "fc6" 266 | dropout_param { 267 | dropout_ratio: 0.5 268 | } 269 | } 270 | layer { 271 | name: "fc7" 272 | type: "InnerProduct" 273 | bottom: "fc6" 274 | top: "fc7" 275 | # Note that lr_mult can be set to 0 to disable any fine-tuning of this, and any other, layer 276 | param { 277 | lr_mult: 1 278 | decay_mult: 1 279 | } 280 | param { 281 | lr_mult: 2 282 | decay_mult: 0 283 | } 284 | inner_product_param { 285 | num_output: 4096 286 | weight_filler { 287 | type: "gaussian" 288 | std: 0.005 289 | } 290 | bias_filler { 291 | type: "constant" 292 | value: 1 293 | } 294 | } 295 | } 296 | layer { 297 | name: "relu7" 298 | type: "ReLU" 299 | bottom: "fc7" 300 | top: "fc7" 301 | } 302 | layer { 303 | name: "drop7" 304 | type: "Dropout" 305 | bottom: "fc7" 306 | top: "fc7" 307 | dropout_param { 308 | dropout_ratio: 0.5 309 | } 310 | } 311 | layer { 312 | name: "fc8_flickr" 313 | type: "InnerProduct" 314 | bottom: "fc7" 315 | top: "fc8_flickr" 316 | # lr_mult is set to higher than for other layers, because this layer is starting from random while the others are already trained 317 | param { 318 | lr_mult: 10 319 | decay_mult: 1 320 | } 321 | param { 322 | lr_mult: 20 323 | decay_mult: 0 324 | } 325 | inner_product_param { 326 | num_output: 2 327 | weight_filler { 328 | type: "gaussian" 329 | std: 0.01 330 | } 331 | bias_filler { 332 | type: "constant" 333 | value: 0 334 | } 335 | } 336 | } 337 | layer { 338 | name: "prob" 
339 | type: "Softmax" 340 | bottom: "fc8_flickr" 341 | top: "prob" 342 | } 343 | -------------------------------------------------------------------------------- /face_full_conv.prototxt: -------------------------------------------------------------------------------- 1 | # Fully convolutional network version of CaffeNet. 2 | name: "CaffeNetConv" 3 | input: "data" 4 | input_dim: 1 5 | input_dim: 3 6 | input_dim: 500 7 | input_dim: 500 8 | layer { 9 | name: "conv1" 10 | type: "Convolution" 11 | bottom: "data" 12 | top: "conv1" 13 | convolution_param { 14 | num_output: 96 15 | kernel_size: 11 16 | stride: 4 17 | } 18 | } 19 | layer { 20 | name: "relu1" 21 | type: "ReLU" 22 | bottom: "conv1" 23 | top: "conv1" 24 | } 25 | layer { 26 | name: "pool1" 27 | type: "Pooling" 28 | bottom: "conv1" 29 | top: "pool1" 30 | pooling_param { 31 | pool: MAX 32 | kernel_size: 3 33 | stride: 2 34 | } 35 | } 36 | layer { 37 | name: "norm1" 38 | type: "LRN" 39 | bottom: "pool1" 40 | top: "norm1" 41 | lrn_param { 42 | local_size: 5 43 | alpha: 0.0001 44 | beta: 0.75 45 | } 46 | } 47 | layer { 48 | name: "conv2" 49 | type: "Convolution" 50 | bottom: "norm1" 51 | top: "conv2" 52 | convolution_param { 53 | num_output: 256 54 | pad: 2 55 | kernel_size: 5 56 | group: 2 57 | } 58 | } 59 | layer { 60 | name: "relu2" 61 | type: "ReLU" 62 | bottom: "conv2" 63 | top: "conv2" 64 | } 65 | layer { 66 | name: "pool2" 67 | type: "Pooling" 68 | bottom: "conv2" 69 | top: "pool2" 70 | pooling_param { 71 | pool: MAX 72 | kernel_size: 3 73 | stride: 2 74 | } 75 | } 76 | layer { 77 | name: "norm2" 78 | type: "LRN" 79 | bottom: "pool2" 80 | top: "norm2" 81 | lrn_param { 82 | local_size: 5 83 | alpha: 0.0001 84 | beta: 0.75 85 | } 86 | } 87 | layer { 88 | name: "conv3" 89 | type: "Convolution" 90 | bottom: "norm2" 91 | top: "conv3" 92 | convolution_param { 93 | num_output: 384 94 | pad: 1 95 | kernel_size: 3 96 | } 97 | } 98 | layer { 99 | name: "relu3" 100 | type: "ReLU" 101 | bottom: "conv3" 102 | top: 
"conv3" 103 | } 104 | layer { 105 | name: "conv4" 106 | type: "Convolution" 107 | bottom: "conv3" 108 | top: "conv4" 109 | convolution_param { 110 | num_output: 384 111 | pad: 1 112 | kernel_size: 3 113 | group: 2 114 | } 115 | } 116 | layer { 117 | name: "relu4" 118 | type: "ReLU" 119 | bottom: "conv4" 120 | top: "conv4" 121 | } 122 | layer { 123 | name: "conv5" 124 | type: "Convolution" 125 | bottom: "conv4" 126 | top: "conv5" 127 | convolution_param { 128 | num_output: 256 129 | pad: 1 130 | kernel_size: 3 131 | group: 2 132 | } 133 | } 134 | layer { 135 | name: "relu5" 136 | type: "ReLU" 137 | bottom: "conv5" 138 | top: "conv5" 139 | } 140 | layer { 141 | name: "pool5" 142 | type: "Pooling" 143 | bottom: "conv5" 144 | top: "pool5" 145 | pooling_param { 146 | pool: MAX 147 | kernel_size: 3 148 | stride: 2 149 | } 150 | } 151 | layer { 152 | name: "fc6-conv" 153 | type: "Convolution" 154 | bottom: "pool5" 155 | top: "fc6-conv" 156 | convolution_param { 157 | num_output: 4096 158 | kernel_size: 6 159 | } 160 | } 161 | layer { 162 | name: "relu6" 163 | type: "ReLU" 164 | bottom: "fc6-conv" 165 | top: "fc6-conv" 166 | } 167 | layer { 168 | name: "drop6" 169 | type: "Dropout" 170 | bottom: "fc6-conv" 171 | top: "fc6-conv" 172 | dropout_param { 173 | dropout_ratio: 0.5 174 | } 175 | } 176 | layer { 177 | name: "fc7-conv" 178 | type: "Convolution" 179 | bottom: "fc6-conv" 180 | top: "fc7-conv" 181 | convolution_param { 182 | num_output: 4096 183 | kernel_size: 1 184 | } 185 | } 186 | layer { 187 | name: "relu7" 188 | type: "ReLU" 189 | bottom: "fc7-conv" 190 | top: "fc7-conv" 191 | } 192 | layer { 193 | name: "drop7" 194 | type: "Dropout" 195 | bottom: "fc7-conv" 196 | top: "fc7-conv" 197 | dropout_param { 198 | dropout_ratio: 0.5 199 | } 200 | } 201 | layer { 202 | name: "fc8-conv" 203 | type: "Convolution" 204 | bottom: "fc7-conv" 205 | top: "fc8-conv" 206 | convolution_param { 207 | num_output: 2 208 | kernel_size: 1 209 | } 210 | } 211 | layer { 212 | name: "prob" 
def IoUofTwoSameImages(region1, region2):
    """Return the intersection-over-union (IoU) of two axis-aligned rectangles.

    Args:
        region1: ``(x1, y1, x2, y2)`` sequence, top-left corner followed by
            the bottom-right corner.
        region2: rectangle in the same format as ``region1``.

    Returns:
        A float in ``[0.0, 1.0]``: intersection area divided by union area.
        Rectangles that are disjoint, that only touch along an edge or a
        corner, or whose union has no area yield ``0.0``.

    The previous implementation counted which corners of one rectangle fell
    inside the other. That returned 0 for identical rectangles, for full
    containment and for cross-shaped overlaps, and it assumed both regions
    had the same area. The intersection is now computed directly from the
    coordinate extents and the union from both areas, which gives the same
    value as before for the partially overlapping equal-size case.
    """
    # Extents of the overlapping rectangle (empty when width or height <= 0).
    left = max(region1[0], region2[0])
    top = max(region1[1], region2[1])
    right = min(region1[2], region2[2])
    bottom = min(region1[3], region2[3])

    overlap_w = right - left
    overlap_h = bottom - top
    if overlap_w <= 0 or overlap_h <= 0:
        return 0.0

    intersection = float(overlap_w * overlap_h)
    area1 = abs((region1[2] - region1[0]) * (region1[3] - region1[1]))
    area2 = abs((region2[2] - region2[0]) * (region2[3] - region2[1]))
    union = area1 + area2 - intersection
    if union <= 0:
        # Degenerate (zero-area) input: avoid dividing by zero.
        return 0.0
    return intersection / union
in faces_file.readlines(): 40 | if(not line.startswith('#')): 41 | imagePath = line.split('\t')[1].strip() 42 | if(imagePath in imageFaces): 43 | imageFaces[imagePath].append(line) 44 | else: 45 | imageFaces[imagePath] = [line] 46 | faces_file.close() 47 | #del the proprocess image. 48 | ########################################### 49 | #has_pro = open('aflw.list5', 'r') 50 | # for line in has_pro.readlines(): 51 | # has_image = line.split(' ')[0].split('/')[-1].split('_')[0] + '.jpg' 52 | # if('flickr/3/' + has_image in imageFaces): 53 | # print has_image 54 | # imageFaces.pop('flickr/3/' + has_image, None) 55 | # elif('flickr/0/' + has_image in imageFaces): 56 | # print has_image 57 | # imageFaces.pop('flickr/0/' + has_image, None) 58 | # elif('flickr/2/' + has_image in imageFaces): 59 | # print has_image 60 | # imageFaces.pop('flickr/2/' + has_image, None) 61 | ##################################### 62 | count = 0 63 | for imagePath, faces in imageFaces.iteritems(): 64 | imagePath = 'aflw/data/' + imagePath 65 | try: 66 | im = Image.open(imagePath) 67 | face_regions = [] 68 | for item in faces: 69 | face_id = item.split('\t')[0].strip() 70 | imageName = item.split("\t")[1].split("/")[-1].replace(".jpg", "").strip() 71 | face_x = int(item.split('\t')[2].strip()) 72 | face_y = int(item.split('\t')[3].strip()) 73 | face_w = int(item.split('\t')[4].strip()) 74 | face_h = int(item.split('\t')[5].strip()) 75 | #crop the face. 76 | face_region_x = face_x + face_w 77 | face_region_y = face_y + face_h 78 | if(face_region_x > im.size[0]): 79 | face_region_x = im.size[0] 80 | if(face_region_y > im.size[1]): 81 | face_region_y = im.size[1] 82 | face_region = (face_x, face_y, face_region_x, face_region_y) 83 | face_regions.append(face_region) 84 | 85 | #use sliding windows of the same size with face, select the IoU >= 0.5 as face, IoU <=0.3 as non-face. 
86 | face_width = face_regions[0][2]-face_regions[0][0] 87 | face_height = face_regions[0][3]-face_regions[0][1] 88 | step = face_width/ 5 89 | for i in range(0, im.size[0]- face_width + 1, step): 90 | for j in range(0, im.size[1] - face_height + 1, step): 91 | #crop the image. 92 | # crop = im.crop((i,j, i + face_width, j+face_height)) 93 | iou_face = False 94 | for face_region in face_regions: 95 | IoU = IoUofTwoSameImages((i,j, i + face_width, j+face_height), (face_region[0], face_region[1], face_region[2], face_region[3])) 96 | if(IoU >= 0.5): 97 | crop = im.crop((i,j, i + face_width, j+face_height)) 98 | crop.save("crop_images/face/" + imageName + "_" + str(count) + ".jpg") 99 | output.write("crop_images/face/" + imageName + "_" + str(count) + ".jpg" + " " + str(IoU) + "\n") 100 | count += 1 101 | iou_face = True 102 | if(iou_face): 103 | continue 104 | IoUCount = 0 105 | for face_region in face_regions: 106 | IoU = IoUofTwoSameImages((i,j, i + face_width, j+face_height), (face_region[0], face_region[1], face_region[2], face_region[3])) 107 | if(IoU <= 0.2): 108 | IoUCount += 1 109 | if(IoUCount == len(face_regions)): 110 | pixels = list(crop.getdata()) 111 | array = np.array(pixels) 112 | remove = 0 113 | if(type(array.std(axis=0)) is np.float64): 114 | if(array.std(axis=0) < 8 and random.random() < 0.05): 115 | crop = im.crop((i,j, i + face_width, j+face_height)) 116 | crop.save("crop_images/non-face/" + imageName + "_" + str(count) + ".jpg") 117 | output.write("crop_images/non-face/" + imageName + "_" + str(count) + ".jpg" + " " + str(IoU) + "\n") 118 | count += 1 119 | continue 120 | elif(type(array.std(axis=0)) is not np.float64): 121 | #print array.std(axis=0) 122 | for k in array.std(axis=0): 123 | if(k < 8): 124 | remove += 1 125 | if(remove == 3 and random.random() < 0.05): 126 | crop = im.crop((i,j, i + face_width, j+face_height)) 127 | crop.save("crop_images/non-face/" + imageName + "_" + str(count) + ".jpg") 128 | 
output.write("crop_images/non-face/" + imageName + "_" + str(count) + ".jpg" + " " + str(IoU) + "\n") 129 | count += 1 130 | continue 131 | if(random.random() < 0.1): 132 | crop = im.crop((i,j, i + face_width, j+face_height)) 133 | crop.save("crop_images/non-face/" + imageName + "_" + str(count) + ".jpg") 134 | output.write("crop_images/non-face/" + imageName + "_" + str(count) + ".jpg" + " " + str(IoU) + "\n") 135 | count += 1 136 | print count 137 | except IOError: 138 | print "No such file", imagePath 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | -------------------------------------------------------------------------------- /solver.prototxt: -------------------------------------------------------------------------------- 1 | net: "examples/face_detection_yahoo/alexNet/train_val.prototxt" 2 | test_iter: 157 3 | test_interval: 10000 4 | # lr for fine-tuning should be lower than when starting from scratch 5 | base_lr: 0.001 6 | lr_policy: "step" 7 | gamma: 0.1 8 | # stepsize should also be lower, as we're closer to being done 9 | stepsize: 20000 10 | display: 20 11 | max_iter: 100000 12 | momentum: 0.9 13 | weight_decay: 0.0005 14 | snapshot: 10000 15 | snapshot_prefix: "examples/face_detection_yahoo/alexNet/alexNet_" 16 | # uncomment the following to default to CPU mode solving 17 | # solver_mode: CPU 18 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | #import Image 4 | import sys 5 | import os 6 | import PIL 7 | import operator 8 | from math import pow 9 | from PIL import Image, ImageDraw, ImageFont 10 | caffe_root = '/mnt_data/caffe/caffe/' 11 | 12 | sys.path.insert(0, caffe_root + 'python') 13 | import caffe 14 | caffe.set_device(0) 15 | caffe.set_mode_gpu() 16 | 17 | 18 | 19 | 20 | 21 | #helper show filter outputs 22 | def show_filters(net): 23 | net.forward() 24 | 
def generateBoundingBox(featureMap, scale, threshold=0.85, stride=32, cellSize=227):
    """Turn a 2-D probability map into candidate boxes in original-image coordinates.

    Args:
        featureMap: 2-D array-like of probabilities, indexed ``[row, col]``.
        scale: resize factor that was applied to the image before it was fed
            to the network; box coordinates are divided by it.
        threshold: minimum probability for a cell to produce a box
            (default 0.85, the value that used to be hard-coded).
        stride: pixel distance in the network input between neighbouring
            map cells (default 32, previously hard-coded).
        cellSize: side length in network-input pixels of the window that
            each map cell covers (default 227, previously hard-coded).

    Returns:
        A list of ``[x1, y1, x2, y2, prob]`` lists, one per cell with
        ``prob >= threshold``, in row-major order of the map.
    """
    featureMap = np.asarray(featureMap)
    boundingBox = []
    # argwhere yields [row, col] pairs in row-major order, matching a plain
    # scan over the map while skipping cells below the threshold.
    for row, col in np.argwhere(featureMap >= threshold):
        boundingBox.append([
            float(stride * col) / scale,
            float(stride * row) / scale,
            float(stride * col + cellSize - 1) / scale,
            float(stride * row + cellSize - 1) / scale,
            featureMap[row, col],
        ])
    return boundingBox
def nms_max(boxes, overlapThresh=0.3):
    """Greedy non-maximum suppression that keeps the highest-scoring boxes.

    Args:
        boxes: array-like of rows ``[x1, y1, x2, y2, score]``.
        overlapThresh: a remaining box is dropped when its
            intersection-over-union with the currently selected box is
            strictly greater than this value.

    Returns:
        ``[]`` when ``boxes`` is empty; otherwise an array holding the rows
        of ``boxes`` that were kept, ordered from highest to lowest score.

    Coordinates are converted to float before the overlap ratio is formed,
    so integer-typed input no longer floor-divides every ratio to zero under
    Python 2, and plain lists are accepted as well as arrays.
    """
    boxes = np.asarray(boxes)
    if len(boxes) == 0:
        return []

    # Work on a float copy of the coordinates; the returned rows still come
    # from `boxes` itself so the caller gets its original dtype back.
    coords = boxes[:, :4].astype(float)
    x1 = coords[:, 0]
    y1 = coords[:, 1]
    x2 = coords[:, 2]
    y2 = coords[:, 3]

    # Box areas use the inclusive-pixel convention (+1 on each side).
    area = (x2 - x1 + 1) * (y2 - y1 + 1)
    # Ascending sort by score: the best remaining box is always the last index.
    idxs = np.argsort(boxes[:, 4])

    pick = []
    while len(idxs) > 0:
        last = len(idxs) - 1
        i = idxs[last]
        pick.append(i)
        rest = idxs[:last]

        # Intersection of the selected box with every remaining box.
        xx1 = np.maximum(x1[i], x1[rest])
        yy1 = np.maximum(y1[i], y1[rest])
        xx2 = np.minimum(x2[i], x2[rest])
        yy2 = np.minimum(y2[i], y2[rest])
        w = np.maximum(0, xx2 - xx1 + 1)
        h = np.maximum(0, yy2 - yy1 + 1)
        inter = w * h

        area_i = np.maximum(0, x2[i] - x1[i] + 1) * np.maximum(0, y2[i] - y1[i] + 1)
        # IoU: intersection / (area_other + area_selected - intersection).
        overlap = inter / (area[rest] - inter + area_i)

        # Remove the selected box and everything it suppresses.
        idxs = np.delete(idxs, np.concatenate(([last], np.where(overlap > overlapThresh)[0])))

    return boxes[pick]
def _pyramid_scales(size, factor=0.793700526, cell=227, max_side=2500):
    """Return the resize factors of the image pyramid for an image of ``size`` (width, height)."""
    shorter, longer = min(size), max(size)
    scales = []
    # Floor division keeps the integer result the original Python 2
    # expression `2500/max` produced for integer image sizes.
    upscale = max_side // longer
    if upscale >= 1:
        scales.append(upscale)
    # Shrink by `factor` per level for as long as the shorter side still
    # holds one full network window.
    shorter = shorter * factor
    level = 1
    while shorter >= cell:
        scales.append(pow(factor, level))
        shorter = shorter * factor
        level += 1
    return scales


def face_detection(imgList):
    """Run multi-scale face detection on every image listed in ``imgList``.

    Args:
        imgList: path to a text file with one image path per line. Blank
            lines are skipped.

    Side effects, per image:
        * writes ``tmp.jpg`` (the resized image) and
          ``face_full_conv2.prototxt`` (a copy of ``face_full_conv.prototxt``
          whose zero-based lines 5 and 6 are replaced by the resized height
          and width) once per pyramid scale;
        * loads ``face_full_conv.caffemodel`` into a new ``caffe.Net`` per scale;
        * prints the image size, the scale list and the size again before drawing;
        * draws the surviving boxes and their scores on the image and saves
          it as ``result/<n>.jpg``, where ``n`` counts processed images from 0.

    Boxes come from channel 1 of the ``prob`` output (presumably the face
    class; confirm against the training labels), are merged across scales,
    then filtered with ``nms_max`` (threshold 0.3) followed by
    ``nms_average`` (threshold 0.07).
    """
    # Read the list up front so the file handle is closed before the slow work.
    with open(imgList) as list_file:
        image_paths = [entry.strip() for entry in list_file]

    # Fail early instead of after all inference has run.
    if not os.path.isdir("result"):
        os.makedirs("result")

    mean_pixel = None  # loaded on first use, then reused for every scale
    label_font = None  # loaded on first use, then reused for every box
    img_count = 0
    for image_path in image_paths:
        if not image_path:
            continue
        img = Image.open(image_path)
        scales = _pyramid_scales(img.size)
        total_boxes = []
        print("size: %s %s" % (img.size[0], img.size[1]))
        print(scales)
        for scale in scales:
            # Resize and round-trip through disk, as caffe.io.load_image reads from a file.
            scale_img = img.resize((int(img.size[0] * scale), int(img.size[1] * scale)))
            scale_img.save("tmp.jpg")

            # Rewrite the network definition so its input matches this scale:
            # height goes on zero-based line 5, width on line 6.
            rewritten = []
            with open('face_full_conv.prototxt', 'r') as template:
                for i, line in enumerate(template):
                    if i == 5:
                        rewritten.append("input_dim: " + str(scale_img.size[1]) + "\n")
                    elif i == 6:
                        rewritten.append("input_dim: " + str(scale_img.size[0]) + "\n")
                    else:
                        rewritten.append(line)
            with open('face_full_conv2.prototxt', 'w') as resized_def:
                resized_def.write("".join(rewritten))

            net_full_conv = caffe.Net('face_full_conv2.prototxt',
                                      'face_full_conv.caffemodel',
                                      caffe.TEST)

            # Load the input and configure preprocessing.
            im = caffe.io.load_image("tmp.jpg")
            if mean_pixel is None:
                mean_pixel = np.load(caffe_root + 'python/caffe/imagenet/ilsvrc_2012_mean.npy').mean(1).mean(1)
            transformer = caffe.io.Transformer({'data': net_full_conv.blobs['data'].data.shape})
            transformer.set_mean('data', mean_pixel)
            transformer.set_transpose('data', (2, 0, 1))
            transformer.set_channel_swap('data', (2, 1, 0))
            transformer.set_raw_scale('data', 255.0)

            # Forward pass; turn the probability map into candidate boxes.
            out = net_full_conv.forward_all(data=np.asarray([transformer.preprocess('data', im)]))
            total_boxes.extend(generateBoundingBox(out['prob'][0, 1], scale))

        # Merge detections from all scales.
        boxes_nms = np.array(total_boxes)
        true_boxes1 = nms_max(boxes_nms, overlapThresh=0.3)
        true_boxes = nms_average(np.array(true_boxes1), overlapThresh=0.07)

        # Draw the surviving boxes and their scores on the original image.
        draw = ImageDraw.Draw(img)
        print("width: %s height: %s" % (img.size[0], img.size[1]))
        for box in true_boxes:
            draw.rectangle((box[0], box[1], box[2], box[3]), outline=(255, 0, 0))
            if label_font is None:
                font_path = os.environ.get("FONT_PATH", "/usr/share/fonts/truetype/dejavu/DejaVuSerif.ttf")
                label_font = ImageFont.truetype(font_path, 20)
            draw.text((box[0], box[1]), "{0:.2f}".format(box[4]), font=label_font)
        img.save("result/" + str(img_count) + ".jpg")
        img_count += 1
new_height: 256
    new_width: 256
  }
}
# Validation input (TEST phase only): reads the image list in
# aflw/crop_images/val.txt, resizes every image to 256x256, crops to 227x227
# and subtracts the ImageNet mean file. Batch size is 50.
layer {
  name: "data"
  type: "ImageData"
  top: "data"
  top: "label"
  include {
    phase: TEST
  }
  transform_param {
    # Mirroring is a training-time augmentation; it is disabled here so that
    # validation accuracy is computed on unmodified images and is
    # reproducible from run to run.
    mirror: false
    crop_size: 227
    mean_file: "data/ilsvrc12/imagenet_mean.binaryproto"
  }
  image_data_param {
    source: "aflw/crop_images/val.txt"
    batch_size: 50
    new_height: 256
    new_width: 256
  }
}
# conv1: 96 filters, 11x11 kernel, stride 4.
# In every learnable layer below, the first `param` block configures the
# weights and the second configures the biases (biases use twice the learning
# rate multiplier and no weight decay).
layer {
  name: "conv1"
  type: "Convolution"
  bottom: "data"
  top: "conv1"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  convolution_param {
    num_output: 96
    kernel_size: 11
    stride: 4
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
}
# In-place ReLU on conv1.
layer {
  name: "relu1"
  type: "ReLU"
  bottom: "conv1"
  top: "conv1"
}
# 3x3 max pooling, stride 2.
layer {
  name: "pool1"
  type: "Pooling"
  bottom: "conv1"
  top: "pool1"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 2
  }
}
# Local response normalization over 5 neighbouring channels.
layer {
  name: "norm1"
  type: "LRN"
  bottom: "pool1"
  top: "norm1"
  lrn_param {
    local_size: 5
    alpha: 0.0001
    beta: 0.75
  }
}
# conv2: 256 filters, 5x5 kernel, pad 2, split into 2 groups.
layer {
  name: "conv2"
  type: "Convolution"
  bottom: "norm1"
  top: "conv2"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  convolution_param {
    num_output: 256
    pad: 2
    kernel_size: 5
    group: 2
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 1
    }
  }
}
# In-place ReLU on conv2.
layer {
  name: "relu2"
  type: "ReLU"
  bottom: "conv2"
  top: "conv2"
}
# 3x3 max pooling, stride 2.
layer {
  name: "pool2"
  type: "Pooling"
  bottom: "conv2"
  top: "pool2"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 2
  }
}
# Local response normalization over 5 neighbouring channels.
layer {
  name: "norm2"
  type: "LRN"
  bottom: "pool2"
  top: "norm2"
  lrn_param {
    local_size: 5
    alpha: 0.0001
    beta: 0.75
  }
}
# conv3: 384 filters, 3x3 kernel, pad 1 (no grouping).
layer {
  name: "conv3"
  type: "Convolution"
  bottom: "norm2"
  top: "conv3"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  convolution_param {
    num_output: 384
    pad: 1
    kernel_size: 3
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
}
# In-place ReLU on conv3.
layer {
  name: "relu3"
  type: "ReLU"
  bottom: "conv3"
  top: "conv3"
}
# conv4: 384 filters, 3x3 kernel, pad 1, split into 2 groups.
layer {
  name: "conv4"
  type: "Convolution"
  bottom: "conv3"
  top: "conv4"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  convolution_param {
    num_output: 384
    pad: 1
    kernel_size: 3
    group: 2
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 1
    }
  }
}
# In-place ReLU on conv4.
layer {
  name: "relu4"
  type: "ReLU"
  bottom: "conv4"
  top: "conv4"
}
# conv5: 256 filters, 3x3 kernel, pad 1, split into 2 groups.
layer {
  name: "conv5"
  type: "Convolution"
  bottom: "conv4"
  top: "conv5"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  convolution_param {
    num_output: 256
    pad: 1
    kernel_size: 3
    group: 2
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 1
    }
  }
}
# In-place ReLU on conv5.
layer {
  name: "relu5"
  type: "ReLU"
  bottom: "conv5"
  top: "conv5"
}
# 3x3 max pooling, stride 2; feeds the fully connected layers.
layer {
  name: "pool5"
  type: "Pooling"
  bottom: "conv5"
  top: "pool5"
  pooling_param {
    pool: MAX
    kernel_size: 3
    stride: 2
  }
}
# fc6: fully connected, 4096 outputs.
layer {
  name: "fc6"
  type: "InnerProduct"
  bottom: "pool5"
  top: "fc6"
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  inner_product_param {
    num_output: 4096
    weight_filler {
      type: "gaussian"
      std: 0.005
    }
    bias_filler {
      type: "constant"
      value: 1
    }
  }
}
# In-place ReLU on fc6.
layer {
  name: "relu6"
  type: "ReLU"
  bottom: "fc6"
  top: "fc6"
}
# In-place dropout on fc6 with ratio 0.5.
layer {
  name: "drop6"
  type: "Dropout"
  bottom: "fc6"
  top: "fc6"
  dropout_param {
    dropout_ratio: 0.5
  }
}
# fc7: fully connected, 4096 outputs.
layer {
  name: "fc7"
  type: "InnerProduct"
  bottom: "fc6"
  top: "fc7"
  # Note that lr_mult can be set to 0 to disable any fine-tuning of this,
  # and any other, layer.
  param {
    lr_mult: 1
    decay_mult: 1
  }
  param {
    lr_mult: 2
    decay_mult: 0
  }
  inner_product_param {
    num_output: 4096
    weight_filler {
      type: "gaussian"
      std: 0.005
    }
    bias_filler {
      type: "constant"
      value: 1
    }
  }
}
# In-place ReLU on fc7.
layer {
  name: "relu7"
  type: "ReLU"
  bottom: "fc7"
  top: "fc7"
}
# In-place dropout on fc7 with ratio 0.5.
layer {
  name: "drop7"
  type: "Dropout"
  bottom: "fc7"
  top: "fc7"
  dropout_param {
    dropout_ratio: 0.5
  }
}
# Final classifier with 2 outputs (the README describes positive / negative
# face crops).
# NOTE(review): the layer name "fc8_flickr" is kept as-is; confirm any other
# prototxt or script that refers to this layer before renaming it.
layer {
  name: "fc8_flickr"
  type: "InnerProduct"
  bottom: "fc7"
  top: "fc8_flickr"
  # lr_mult is set higher than for the other layers because this layer starts
  # from random weights while the others are already trained.
  param {
    lr_mult: 10
    decay_mult: 1
  }
  param {
    lr_mult: 20
    decay_mult: 0
  }
  inner_product_param {
    num_output: 2
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
}
# Softmax loss of the classifier scores against the labels.
layer {
  name: "loss"
  type: "SoftmaxWithLoss"
  bottom: "fc8_flickr"
  bottom: "label"
}
# Classification accuracy, computed in the TEST phase only.
layer {
  name: "accuracy"
  type: "Accuracy"
  bottom: "fc8_flickr"
  bottom: "label"
  top: "accuracy"
  include {
    phase: TEST
  }
}
--------------------------------------------------------------------------------