├── README.md
├── mtcnn
│   ├── det1.caffemodel
│   ├── det1.prototxt
│   ├── det2.caffemodel
│   ├── det2.prototxt
│   ├── det3.caffemodel
│   └── det3.prototxt
└── src
    ├── crop-alignment.py
    ├── data.py
    ├── extract_feature.py
    ├── log
    │   ├── events.out.tfevents.1519200364.Hysia-System
    │   ├── events.out.tfevents.1519200402.Hysia-System
    │   ├── events.out.tfevents.1519200426.Hysia-System
    │   ├── events.out.tfevents.1519200494.Hysia-System
    │   └── events.out.tfevents.1519961335.Hysia-System
    ├── model
    │   ├── attention.ckpt-0.data-00000-of-00001
    │   ├── attention.ckpt-0.index
    │   ├── attention.ckpt-0.meta
    │   ├── attention.ckpt-20000.data-00000-of-00001
    │   ├── attention.ckpt-20000.index
    │   ├── attention.ckpt-20000.meta
    │   ├── attention.ckpt-40000.data-00000-of-00001
    │   ├── attention.ckpt-40000.index
    │   ├── attention.ckpt-40000.meta
    │   ├── attention.ckpt-60000.data-00000-of-00001
    │   ├── attention.ckpt-60000.index
    │   ├── attention.ckpt-60000.meta
    │   ├── attention.ckpt-80000.data-00000-of-00001
    │   ├── attention.ckpt-80000.index
    │   ├── attention.ckpt-80000.meta
    │   └── checkpoint
    └── network.py
/README.md:
--------------------------------------------------------------------------------
1 | # Introduction
2 | 
3 | This repository reproduces the paper [Neural Aggregation Network for Video Face Recognition (CVPR 2017)](https://arxiv.org/abs/1603.05474) on the TensorFlow platform.
4 | 
5 | # Content
6 | 
7 | src/extract_feature.py extracts per-frame face features with the SphereFace model and saves them to a .mat file.
8 | 
9 | src/data.py loads the saved features and generates batches for network training.
10 | 
11 | src/crop-alignment.py detects faces with MTCNN, aligns them, and saves the aligned crops to disk.
12 | 
13 | src/network.py defines the aggregation network, which consists of two attention modules.
14 | 
15 | src/log is the directory where training loss and accuracy summaries are written via tf.summary.
16 | 
17 | src/model holds the trained checkpoints of the aggregation network.
18 | 
19 | 
20 | 
21 | # Test
22 | 
23 | Updating; a tentative usage sketch for the training pipeline follows below.
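A rough run order for the pipeline, assuming Python 2 with Caffe and TensorFlow installed (this sketch is not from the original README; the SphereFace prototxt/caffemodel locations and dataset paths are placeholders you must adjust, and the scripts resolve relative paths such as ../mtcnn and ./YoutubeFaces.mat, so run them from inside src/):

```bash
cd src
# 1. detect faces with MTCNN and save aligned 96x112 crops
python crop-alignment.py -i /path/to/YouTubeFaces -o /path/to/YouTubeFaces-crop-align
# 2. extract per-frame SphereFace features into a .mat file
python extract_feature.py -p /path/to/sphereface_deploy.prototxt \
    -m /path/to/sphereface_model.caffemodel -l fc5 \
    -d /path/to/YouTubeFaces-crop-align -n YoutubeFaces.mat
# 3. train the aggregation network (reads ./YoutubeFaces.mat, writes ./log and ./model)
python network.py
```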
24 | 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /mtcnn/det1.caffemodel: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jinyanxu/Neural-Aggregation-Network-for-Video-Face-Recognition/b87a267e8ae667fcae838fc033025f38065afccb/mtcnn/det1.caffemodel -------------------------------------------------------------------------------- /mtcnn/det1.prototxt: -------------------------------------------------------------------------------- 1 | name: "PNet" 2 | input: "data" 3 | input_dim: 1 4 | input_dim: 3 5 | input_dim: 12 6 | input_dim: 12 7 | 8 | layer { 9 | name: "conv1" 10 | type: "Convolution" 11 | bottom: "data" 12 | top: "conv1" 13 | param { 14 | lr_mult: 1 15 | decay_mult: 1 16 | } 17 | param { 18 | lr_mult: 2 19 | decay_mult: 0 20 | } 21 | convolution_param { 22 | num_output: 10 23 | kernel_size: 3 24 | stride: 1 25 | weight_filler { 26 | type: "xavier" 27 | } 28 | bias_filler { 29 | type: "constant" 30 | value: 0 31 | } 32 | } 33 | } 34 | layer { 35 | name: "PReLU1" 36 | type: "PReLU" 37 | bottom: "conv1" 38 | top: "conv1" 39 | } 40 | layer { 41 | name: "pool1" 42 | type: "Pooling" 43 | bottom: "conv1" 44 | top: "pool1" 45 | pooling_param { 46 | pool: MAX 47 | kernel_size: 2 48 | stride: 2 49 | } 50 | } 51 | 52 | layer { 53 | name: "conv2" 54 | type: "Convolution" 55 | bottom: "pool1" 56 | top: "conv2" 57 | param { 58 | lr_mult: 1 59 | decay_mult: 1 60 | } 61 | param { 62 | lr_mult: 2 63 | decay_mult: 0 64 | } 65 | convolution_param { 66 | num_output: 16 67 | kernel_size: 3 68 | stride: 1 69 | weight_filler { 70 | type: "xavier" 71 | } 72 | bias_filler { 73 | type: "constant" 74 | value: 0 75 | } 76 | } 77 | } 78 | layer { 79 | name: "PReLU2" 80 | type: "PReLU" 81 | bottom: "conv2" 82 | top: "conv2" 83 | } 84 | 85 | layer { 86 | name: "conv3" 87 | type: "Convolution" 88 | bottom: "conv2" 89 | top: "conv3" 90 | param { 91 | lr_mult: 1 92 | decay_mult: 1 93 | } 94 | param { 95 | lr_mult: 2 96 | decay_mult: 0 97 | } 98 | convolution_param { 99 | num_output: 32 100 | kernel_size: 3 101 | stride: 1 102 | weight_filler { 103 | type: "xavier" 104 | } 105 | bias_filler { 106 | type: "constant" 107 | value: 0 108 | } 109 | } 110 | } 111 | layer { 112 | name: "PReLU3" 113 | type: "PReLU" 114 | bottom: "conv3" 115 | top: "conv3" 116 | } 117 | 118 | 119 | layer { 120 | name: "conv4-1" 121 | type: "Convolution" 122 | bottom: "conv3" 123 | top: "conv4-1" 124 | param { 125 | lr_mult: 1 126 | decay_mult: 1 127 | } 128 | param { 129 | lr_mult: 2 130 | decay_mult: 0 131 | } 132 | convolution_param { 133 | num_output: 2 134 | kernel_size: 1 135 | stride: 1 136 | weight_filler { 137 | type: "xavier" 138 | } 139 | bias_filler { 140 | type: "constant" 141 | value: 0 142 | } 143 | } 144 | } 145 | 146 | layer { 147 | name: "conv4-2" 148 | type: "Convolution" 149 | bottom: "conv3" 150 | top: "conv4-2" 151 | param { 152 | lr_mult: 1 153 | decay_mult: 1 154 | } 155 | param { 156 | lr_mult: 2 157 | decay_mult: 0 158 | } 159 | convolution_param { 160 | num_output: 4 161 | kernel_size: 1 162 | stride: 1 163 | weight_filler { 164 | type: "xavier" 165 | } 166 | bias_filler { 167 | type: "constant" 168 | value: 0 169 | } 170 | } 171 | } 172 | layer { 173 | name: "prob1" 174 | type: "Softmax" 175 | bottom: "conv4-1" 176 | top: "prob1" 177 | } 178 | -------------------------------------------------------------------------------- /mtcnn/det2.caffemodel: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/jinyanxu/Neural-Aggregation-Network-for-Video-Face-Recognition/b87a267e8ae667fcae838fc033025f38065afccb/mtcnn/det2.caffemodel -------------------------------------------------------------------------------- /mtcnn/det2.prototxt: -------------------------------------------------------------------------------- 1 | name: "RNet" 2 | input: "data" 3 | input_dim: 1 4 | input_dim: 3 5 | input_dim: 24 6 | input_dim: 24 7 | 8 | 9 | ########################## 10 | ###################### 11 | layer { 12 | name: "conv1" 13 | type: "Convolution" 14 | bottom: "data" 15 | top: "conv1" 16 | param { 17 | lr_mult: 0 18 | decay_mult: 0 19 | } 20 | param { 21 | lr_mult: 0 22 | decay_mult: 0 23 | } 24 | convolution_param { 25 | num_output: 28 26 | kernel_size: 3 27 | stride: 1 28 | weight_filler { 29 | type: "xavier" 30 | } 31 | bias_filler { 32 | type: "constant" 33 | value: 0 34 | } 35 | } 36 | } 37 | layer { 38 | name: "prelu1" 39 | type: "PReLU" 40 | bottom: "conv1" 41 | top: "conv1" 42 | propagate_down: true 43 | } 44 | layer { 45 | name: "pool1" 46 | type: "Pooling" 47 | bottom: "conv1" 48 | top: "pool1" 49 | pooling_param { 50 | pool: MAX 51 | kernel_size: 3 52 | stride: 2 53 | } 54 | } 55 | 56 | layer { 57 | name: "conv2" 58 | type: "Convolution" 59 | bottom: "pool1" 60 | top: "conv2" 61 | param { 62 | lr_mult: 0 63 | decay_mult: 0 64 | } 65 | param { 66 | lr_mult: 0 67 | decay_mult: 0 68 | } 69 | convolution_param { 70 | num_output: 48 71 | kernel_size: 3 72 | stride: 1 73 | weight_filler { 74 | type: "xavier" 75 | } 76 | bias_filler { 77 | type: "constant" 78 | value: 0 79 | } 80 | } 81 | } 82 | layer { 83 | name: "prelu2" 84 | type: "PReLU" 85 | bottom: "conv2" 86 | top: "conv2" 87 | propagate_down: true 88 | } 89 | layer { 90 | name: "pool2" 91 | type: "Pooling" 92 | bottom: "conv2" 93 | top: "pool2" 94 | pooling_param { 95 | pool: MAX 96 | kernel_size: 3 97 | stride: 2 98 | } 99 | } 100 | #################################### 101 | 102 | ################################## 103 | layer { 104 | name: "conv3" 105 | type: "Convolution" 106 | bottom: "pool2" 107 | top: "conv3" 108 | param { 109 | lr_mult: 0 110 | decay_mult: 0 111 | } 112 | param { 113 | lr_mult: 0 114 | decay_mult: 0 115 | } 116 | convolution_param { 117 | num_output: 64 118 | kernel_size: 2 119 | stride: 1 120 | weight_filler { 121 | type: "xavier" 122 | } 123 | bias_filler { 124 | type: "constant" 125 | value: 0 126 | } 127 | } 128 | } 129 | layer { 130 | name: "prelu3" 131 | type: "PReLU" 132 | bottom: "conv3" 133 | top: "conv3" 134 | propagate_down: true 135 | } 136 | ############################### 137 | 138 | ############################### 139 | 140 | layer { 141 | name: "conv4" 142 | type: "InnerProduct" 143 | bottom: "conv3" 144 | top: "conv4" 145 | param { 146 | lr_mult: 0 147 | decay_mult: 0 148 | } 149 | param { 150 | lr_mult: 0 151 | decay_mult: 0 152 | } 153 | inner_product_param { 154 | num_output: 128 155 | weight_filler { 156 | type: "xavier" 157 | } 158 | bias_filler { 159 | type: "constant" 160 | value: 0 161 | } 162 | } 163 | } 164 | layer { 165 | name: "prelu4" 166 | type: "PReLU" 167 | bottom: "conv4" 168 | top: "conv4" 169 | } 170 | 171 | layer { 172 | name: "conv5-1" 173 | type: "InnerProduct" 174 | bottom: "conv4" 175 | top: "conv5-1" 176 | param { 177 | lr_mult: 0 178 | decay_mult: 0 179 | } 180 | param { 181 | lr_mult: 0 182 | decay_mult: 0 183 | } 184 | inner_product_param { 185 | num_output: 2 
186 | #kernel_size: 1 187 | #stride: 1 188 | weight_filler { 189 | type: "xavier" 190 | } 191 | bias_filler { 192 | type: "constant" 193 | value: 0 194 | } 195 | } 196 | } 197 | layer { 198 | name: "conv5-2" 199 | type: "InnerProduct" 200 | bottom: "conv4" 201 | top: "conv5-2" 202 | param { 203 | lr_mult: 1 204 | decay_mult: 1 205 | } 206 | param { 207 | lr_mult: 2 208 | decay_mult: 1 209 | } 210 | inner_product_param { 211 | num_output: 4 212 | #kernel_size: 1 213 | #stride: 1 214 | weight_filler { 215 | type: "xavier" 216 | } 217 | bias_filler { 218 | type: "constant" 219 | value: 0 220 | } 221 | } 222 | } 223 | layer { 224 | name: "prob1" 225 | type: "Softmax" 226 | bottom: "conv5-1" 227 | top: "prob1" 228 | } -------------------------------------------------------------------------------- /mtcnn/det3.caffemodel: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jinyanxu/Neural-Aggregation-Network-for-Video-Face-Recognition/b87a267e8ae667fcae838fc033025f38065afccb/mtcnn/det3.caffemodel -------------------------------------------------------------------------------- /mtcnn/det3.prototxt: -------------------------------------------------------------------------------- 1 | name: "ONet" 2 | input: "data" 3 | input_dim: 1 4 | input_dim: 3 5 | input_dim: 48 6 | input_dim: 48 7 | ################################## 8 | layer { 9 | name: "conv1" 10 | type: "Convolution" 11 | bottom: "data" 12 | top: "conv1" 13 | param { 14 | lr_mult: 1 15 | decay_mult: 1 16 | } 17 | param { 18 | lr_mult: 2 19 | decay_mult: 1 20 | } 21 | convolution_param { 22 | num_output: 32 23 | kernel_size: 3 24 | stride: 1 25 | weight_filler { 26 | type: "xavier" 27 | } 28 | bias_filler { 29 | type: "constant" 30 | value: 0 31 | } 32 | } 33 | } 34 | layer { 35 | name: "prelu1" 36 | type: "PReLU" 37 | bottom: "conv1" 38 | top: "conv1" 39 | } 40 | layer { 41 | name: "pool1" 42 | type: "Pooling" 43 | bottom: "conv1" 44 | top: "pool1" 45 | pooling_param { 46 | pool: MAX 47 | kernel_size: 3 48 | stride: 2 49 | } 50 | } 51 | layer { 52 | name: "conv2" 53 | type: "Convolution" 54 | bottom: "pool1" 55 | top: "conv2" 56 | param { 57 | lr_mult: 1 58 | decay_mult: 1 59 | } 60 | param { 61 | lr_mult: 2 62 | decay_mult: 1 63 | } 64 | convolution_param { 65 | num_output: 64 66 | kernel_size: 3 67 | stride: 1 68 | weight_filler { 69 | type: "xavier" 70 | } 71 | bias_filler { 72 | type: "constant" 73 | value: 0 74 | } 75 | } 76 | } 77 | 78 | layer { 79 | name: "prelu2" 80 | type: "PReLU" 81 | bottom: "conv2" 82 | top: "conv2" 83 | } 84 | layer { 85 | name: "pool2" 86 | type: "Pooling" 87 | bottom: "conv2" 88 | top: "pool2" 89 | pooling_param { 90 | pool: MAX 91 | kernel_size: 3 92 | stride: 2 93 | } 94 | } 95 | 96 | layer { 97 | name: "conv3" 98 | type: "Convolution" 99 | bottom: "pool2" 100 | top: "conv3" 101 | param { 102 | lr_mult: 1 103 | decay_mult: 1 104 | } 105 | param { 106 | lr_mult: 2 107 | decay_mult: 1 108 | } 109 | convolution_param { 110 | num_output: 64 111 | kernel_size: 3 112 | weight_filler { 113 | type: "xavier" 114 | } 115 | bias_filler { 116 | type: "constant" 117 | value: 0 118 | } 119 | } 120 | } 121 | layer { 122 | name: "prelu3" 123 | type: "PReLU" 124 | bottom: "conv3" 125 | top: "conv3" 126 | } 127 | layer { 128 | name: "pool3" 129 | type: "Pooling" 130 | bottom: "conv3" 131 | top: "pool3" 132 | pooling_param { 133 | pool: MAX 134 | kernel_size: 2 135 | stride: 2 136 | } 137 | } 138 | layer { 139 | name: "conv4" 140 | type: "Convolution" 141 | 
bottom: "pool3" 142 | top: "conv4" 143 | param { 144 | lr_mult: 1 145 | decay_mult: 1 146 | } 147 | param { 148 | lr_mult: 2 149 | decay_mult: 1 150 | } 151 | convolution_param { 152 | num_output: 128 153 | kernel_size: 2 154 | weight_filler { 155 | type: "xavier" 156 | } 157 | bias_filler { 158 | type: "constant" 159 | value: 0 160 | } 161 | } 162 | } 163 | layer { 164 | name: "prelu4" 165 | type: "PReLU" 166 | bottom: "conv4" 167 | top: "conv4" 168 | } 169 | 170 | 171 | layer { 172 | name: "conv5" 173 | type: "InnerProduct" 174 | bottom: "conv4" 175 | top: "conv5" 176 | param { 177 | lr_mult: 1 178 | decay_mult: 1 179 | } 180 | param { 181 | lr_mult: 2 182 | decay_mult: 1 183 | } 184 | inner_product_param { 185 | #kernel_size: 3 186 | num_output: 256 187 | weight_filler { 188 | type: "xavier" 189 | } 190 | bias_filler { 191 | type: "constant" 192 | value: 0 193 | } 194 | } 195 | } 196 | 197 | layer { 198 | name: "drop5" 199 | type: "Dropout" 200 | bottom: "conv5" 201 | top: "conv5" 202 | dropout_param { 203 | dropout_ratio: 0.25 204 | } 205 | } 206 | layer { 207 | name: "prelu5" 208 | type: "PReLU" 209 | bottom: "conv5" 210 | top: "conv5" 211 | } 212 | 213 | 214 | layer { 215 | name: "conv6-1" 216 | type: "InnerProduct" 217 | bottom: "conv5" 218 | top: "conv6-1" 219 | param { 220 | lr_mult: 1 221 | decay_mult: 1 222 | } 223 | param { 224 | lr_mult: 2 225 | decay_mult: 1 226 | } 227 | inner_product_param { 228 | #kernel_size: 1 229 | num_output: 2 230 | weight_filler { 231 | type: "xavier" 232 | } 233 | bias_filler { 234 | type: "constant" 235 | value: 0 236 | } 237 | } 238 | } 239 | layer { 240 | name: "conv6-2" 241 | type: "InnerProduct" 242 | bottom: "conv5" 243 | top: "conv6-2" 244 | param { 245 | lr_mult: 1 246 | decay_mult: 1 247 | } 248 | param { 249 | lr_mult: 2 250 | decay_mult: 1 251 | } 252 | inner_product_param { 253 | #kernel_size: 1 254 | num_output: 4 255 | weight_filler { 256 | type: "xavier" 257 | } 258 | bias_filler { 259 | type: "constant" 260 | value: 0 261 | } 262 | } 263 | } 264 | layer { 265 | name: "conv6-3" 266 | type: "InnerProduct" 267 | bottom: "conv5" 268 | top: "conv6-3" 269 | param { 270 | lr_mult: 1 271 | decay_mult: 1 272 | } 273 | param { 274 | lr_mult: 2 275 | decay_mult: 1 276 | } 277 | inner_product_param { 278 | #kernel_size: 1 279 | num_output: 10 280 | weight_filler { 281 | type: "xavier" 282 | } 283 | bias_filler { 284 | type: "constant" 285 | value: 0 286 | } 287 | } 288 | } 289 | layer { 290 | name: "prob1" 291 | type: "Softmax" 292 | bottom: "conv6-1" 293 | top: "prob1" 294 | } 295 | -------------------------------------------------------------------------------- /src/crop-alignment.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import caffe 4 | import cv2 5 | import numpy as np 6 | import os 7 | import copy 8 | import argparse 9 | from skimage import transform as trans 10 | import json 11 | 12 | 13 | def bbreg(boundingbox, reg): 14 | reg = reg.T 15 | 16 | # calibrate bouding boxes 17 | if reg.shape[1] == 1: 18 | print "reshape of reg" 19 | pass # reshape of reg 20 | w = boundingbox[:,2] - boundingbox[:,0] + 1 21 | h = boundingbox[:,3] - boundingbox[:,1] + 1 22 | 23 | bb0 = boundingbox[:,0] + reg[:,0]*w 24 | bb1 = boundingbox[:,1] + reg[:,1]*h 25 | bb2 = boundingbox[:,2] + reg[:,2]*w 26 | bb3 = boundingbox[:,3] + reg[:,3]*h 27 | 28 | boundingbox[:,0:4] = np.array([bb0, bb1, bb2, bb3]).T 29 | #print "bb", boundingbox 30 | return boundingbox 31 | 32 | 
33 | def pad(boxesA, w, h):
34 |     boxes = boxesA.copy() # work on a copy: numpy passes arrays by reference, don't mutate the caller's boxes
35 |     #print '#################'
36 |     #print 'boxes', boxes
37 |     #print 'w,h', w, h
38 | 
39 |     tmph = boxes[:,3] - boxes[:,1] + 1
40 |     tmpw = boxes[:,2] - boxes[:,0] + 1
41 |     numbox = boxes.shape[0]
42 | 
43 |     #print 'tmph', tmph
44 |     #print 'tmpw', tmpw
45 | 
46 |     dx = np.ones(numbox)
47 |     dy = np.ones(numbox)
48 |     edx = tmpw.copy() # copy, otherwise the in-place edits below would also change tmpw (numpy assignment aliases, unlike MATLAB)
49 |     edy = tmph.copy()
50 | 
51 |     x = boxes[:,0:1][:,0]
52 |     y = boxes[:,1:2][:,0]
53 |     ex = boxes[:,2:3][:,0]
54 |     ey = boxes[:,3:4][:,0]
55 | 
56 | 
57 |     tmp = np.where(ex > w)[0]
58 |     if tmp.shape[0] != 0:
59 |         edx[tmp] = -ex[tmp] + w-1 + tmpw[tmp]
60 |         ex[tmp] = w-1
61 | 
62 |     tmp = np.where(ey > h)[0]
63 |     if tmp.shape[0] != 0:
64 |         edy[tmp] = -ey[tmp] + h-1 + tmph[tmp]
65 |         ey[tmp] = h-1
66 | 
67 |     tmp = np.where(x < 1)[0]
68 |     if tmp.shape[0] != 0:
69 |         dx[tmp] = 2 - x[tmp]
70 |         x[tmp] = np.ones_like(x[tmp])
71 | 
72 |     tmp = np.where(y < 1)[0]
73 |     if tmp.shape[0] != 0:
74 |         dy[tmp] = 2 - y[tmp]
75 |         y[tmp] = np.ones_like(y[tmp])
76 | 
77 |     # python indexes from 0, while matlab indexes from 1
78 |     dy = np.maximum(0, dy-1)
79 |     dx = np.maximum(0, dx-1)
80 |     y = np.maximum(0, y-1)
81 |     x = np.maximum(0, x-1)
82 |     edy = np.maximum(0, edy-1)
83 |     edx = np.maximum(0, edx-1)
84 |     ey = np.maximum(0, ey-1)
85 |     ex = np.maximum(0, ex-1)
86 | 
87 | 
88 |     #print 'boxes', boxes
89 |     return [dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph]
90 | 
91 | 
92 | 
93 | def rerec(bboxA):
94 |     # convert the boxes in bboxA to squares
95 |     w = bboxA[:,2] - bboxA[:,0]
96 |     h = bboxA[:,3] - bboxA[:,1]
97 |     l = np.maximum(w,h).T
98 | 
99 |     #print 'bboxA', bboxA
100 |     #print 'w', w
101 |     #print 'h', h
102 |     #print 'l', l
103 |     bboxA[:,0] = bboxA[:,0] + w*0.5 - l*0.5
104 |     bboxA[:,1] = bboxA[:,1] + h*0.5 - l*0.5
105 |     bboxA[:,2:4] = bboxA[:,0:2] + np.repeat([l], 2, axis = 0).T
106 |     return bboxA
107 | 
108 | 
109 | def nms(boxes, threshold, type):
110 |     """non-maximum suppression
111 |     :boxes: array of shape [:,0:5], each row is [x1, y1, x2, y2, score]
112 |     :threshold: overlap threshold, e.g. 0.5
113 |     :type: 'Min' (overlap over the smaller area) or 'Union' (IoU)
114 |     :returns: indices of the boxes to keep
115 |     """
116 |     if boxes.shape[0] == 0:
117 |         return np.array([])
118 |     x1 = boxes[:,0]
119 |     y1 = boxes[:,1]
120 |     x2 = boxes[:,2]
121 |     y2 = boxes[:,3]
122 |     s = boxes[:,4]
123 |     area = np.multiply(x2-x1+1, y2-y1+1)
124 |     I = np.array(s.argsort()) # indices of the scores in ascending order
125 | 
126 |     pick = []
127 |     while len(I) > 0:
128 |         xx1 = np.maximum(x1[I[-1]], x1[I[0:-1]])
129 |         yy1 = np.maximum(y1[I[-1]], y1[I[0:-1]])
130 |         xx2 = np.minimum(x2[I[-1]], x2[I[0:-1]])
131 |         yy2 = np.minimum(y2[I[-1]], y2[I[0:-1]])
132 |         w = np.maximum(0.0, xx2 - xx1 + 1)
133 |         h = np.maximum(0.0, yy2 - yy1 + 1)
134 |         inter = w * h
135 |         if type == 'Min':
136 |             o = inter / np.minimum(area[I[-1]], area[I[0:-1]])
137 |         else:
138 |             o = inter / (area[I[-1]] + area[I[0:-1]] - inter)
139 |         pick.append(I[-1])
140 |         I = I[np.where( o <= threshold)[0]]
141 |     return pick
142 | 
143 | 
144 | def generateBoundingBox(map, reg, scale, t):
145 |     stride = 2
146 |     cellsize = 12
147 |     map = map.T
148 |     dx1 = reg[0,:,:].T
149 |     dy1 = reg[1,:,:].T
150 |     dx2 = reg[2,:,:].T
151 |     dy2 = reg[3,:,:].T
152 |     (x, y) = np.where(map >= t)
153 | 
154 |     yy = y
155 |     xx = x
156 | 
157 | 
158 |     score = map[x,y]
159 |     reg = np.array([dx1[x,y], dy1[x,y], dx2[x,y], dy2[x,y]])
160 | 
161 |     if reg.shape[0] == 0:
162 |         pass
163 |     boundingbox = np.array([yy, xx]).T
164 | 
165 |     bb1 = np.fix((stride * (boundingbox) + 1) / scale).T # "+1" because matlab indexes from 1
166 |     bb2 = np.fix((stride * (boundingbox) + cellsize - 1 + 1) / scale).T
167 | 
score = np.array([score]) 168 | 169 | boundingbox_out = np.concatenate((bb1, bb2, score, reg), axis=0) 170 | 171 | 172 | return boundingbox_out.T 173 | 174 | 175 | count = 0 176 | 177 | def drawBoxes(im, boxes): 178 | x1 = boxes[:,0] 179 | y1 = boxes[:,1] 180 | x2 = boxes[:,2] 181 | y2 = boxes[:,3] 182 | for i in range(x1.shape[0]): 183 | cv2.rectangle(im, (int(x1[i]), int(y1[i])), (int(x2[i]), int(y2[i])), (0,255,0), 1) 184 | return im 185 | 186 | ''' 187 | def drawBoxes(im, boxes, model, SNet, clf, name_identities): 188 | 189 | 190 | x1 = boxes[:,0] 191 | y1 = boxes[:,1] 192 | x2 = boxes[:,2] 193 | y2 = boxes[:,3] 194 | global count 195 | width, height = im.shape[0], im.shape[1] 196 | for i in range(x1.shape[0]): 197 | if int(y1[i]) >=0 and int(y2[i]) < width and int(x1[i]) >= 0 and int(x2[i]) < height: 198 | img = im[int(y1[i]):int(y2[i]), int(x1[i]):int(x2[i])] 199 | img = cv2.resize(img, (96, 112)) 200 | save_name = '../tmp/' + str(count) + '.jpg' 201 | count += 1 202 | print count 203 | cv2.imwrite(save_name, img) 204 | 205 | #assert img != None 206 | img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) 207 | img = (img - 127.5 )/ 128 208 | img = np.transpose(img, (2, 0, 1)) 209 | img = [img] 210 | SNet.blobs['data'].data[...] = img 211 | SNet.forward() 212 | a = copy.copy(SNet.blobs['fc5'].data[0]) 213 | a = [a] 214 | index = clf.predict(a) 215 | person = name_identities[int(index)] 216 | # print person 217 | cv2.putText(im, person, (int(x1[i]), int(y1[i])), 218 | cv2.FONT_HERSHEY_SIMPLEX, fontScale=0.75, 219 | color=(152, 255, 204), thickness=2) 220 | cv2.rectangle(im, (int(x1[i]), int(y1[i])), (int(x2[i]), int(y2[i])), (0,255,0), 1) 221 | 222 | 223 | return im 224 | 225 | ''' 226 | 227 | def detect_face(img, minsize, PNet, RNet, ONet, threshold, fastresize, factor): 228 | 229 | img2 = img.copy() 230 | 231 | factor_count = 0 232 | total_boxes = np.zeros((0,9), np.float) 233 | points = [] 234 | h = img.shape[0] 235 | w = img.shape[1] 236 | minl = min(h, w) 237 | img = img.astype(float) 238 | m = 12.0/minsize 239 | minl = minl*m 240 | 241 | 242 | #total_boxes = np.load('total_boxes.npy') 243 | #total_boxes = np.load('total_boxes_242.npy') 244 | #total_boxes = np.load('total_boxes_101.npy') 245 | 246 | 247 | # create scale pyramid 248 | scales = [] 249 | while minl >= 12: 250 | scales.append(m * pow(factor, factor_count)) 251 | minl *= factor 252 | factor_count += 1 253 | 254 | # first stage 255 | for scale in scales: 256 | hs = int(np.ceil(h*scale)) 257 | ws = int(np.ceil(w*scale)) 258 | 259 | if fastresize: 260 | im_data = (img-127.5)*0.0078125 # [0,255] -> [-1,1] 261 | im_data = cv2.resize(im_data, (ws,hs)) # default is bilinear 262 | else: 263 | im_data = cv2.resize(img, (ws,hs)) # default is bilinear 264 | im_data = (im_data-127.5)*0.0078125 # [0,255] -> [-1,1] 265 | #im_data = imResample(img, hs, ws); print "scale:", scale 266 | 267 | 268 | im_data = np.swapaxes(im_data, 0, 2) 269 | im_data = np.array([im_data], dtype = np.float) 270 | PNet.blobs['data'].reshape(1, 3, ws, hs) 271 | PNet.blobs['data'].data[...] 
= im_data 272 | out = PNet.forward() 273 | 274 | boxes = generateBoundingBox(out['prob1'][0,1,:,:], out['conv4-2'][0], scale, threshold[0]) 275 | if boxes.shape[0] != 0: 276 | #print boxes[4:9] 277 | #print 'im_data', im_data[0:5, 0:5, 0], '\n' 278 | #print 'prob1', out['prob1'][0,0,0:3,0:3] 279 | 280 | pick = nms(boxes, 0.5, 'Union') 281 | 282 | if len(pick) > 0 : 283 | boxes = boxes[pick, :] 284 | 285 | if boxes.shape[0] != 0: 286 | total_boxes = np.concatenate((total_boxes, boxes), axis=0) 287 | 288 | #np.save('total_boxes_101.npy', total_boxes) 289 | 290 | ##### 291 | # 1 # 292 | ##### 293 | #print "[1]:",total_boxes.shape[0] 294 | #print total_boxes 295 | #return total_boxes, [] 296 | 297 | 298 | numbox = total_boxes.shape[0] 299 | if numbox > 0: 300 | # nms 301 | pick = nms(total_boxes, 0.7, 'Union') 302 | total_boxes = total_boxes[pick, :] 303 | #print "[2]:",total_boxes.shape[0] 304 | 305 | # revise and convert to square 306 | regh = total_boxes[:,3] - total_boxes[:,1] 307 | regw = total_boxes[:,2] - total_boxes[:,0] 308 | t1 = total_boxes[:,0] + total_boxes[:,5]*regw 309 | t2 = total_boxes[:,1] + total_boxes[:,6]*regh 310 | t3 = total_boxes[:,2] + total_boxes[:,7]*regw 311 | t4 = total_boxes[:,3] + total_boxes[:,8]*regh 312 | t5 = total_boxes[:,4] 313 | total_boxes = np.array([t1,t2,t3,t4,t5]).T 314 | #print "[3]:",total_boxes.shape[0] 315 | #print regh 316 | #print regw 317 | #print 't1',t1 318 | #print total_boxes 319 | 320 | total_boxes = rerec(total_boxes) # convert box to square 321 | #print "[4]:",total_boxes.shape[0] 322 | 323 | total_boxes[:,0:4] = np.fix(total_boxes[:,0:4]) 324 | #print "[4.5]:",total_boxes.shape[0] 325 | #print total_boxes 326 | [dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph] = pad(total_boxes, w, h) 327 | 328 | #print total_boxes.shape 329 | #print total_boxes 330 | 331 | numbox = total_boxes.shape[0] 332 | if numbox > 0: 333 | # second stage 334 | 335 | #print 'tmph', tmph 336 | #print 'tmpw', tmpw 337 | #print "y,ey,x,ex", y, ey, x, ex, 338 | #print "edy", edy 339 | 340 | #tempimg = np.load('tempimg.npy') 341 | 342 | # construct input for RNet 343 | tempimg = np.zeros((numbox, 24, 24, 3)) # (24, 24, 3, numbox) 344 | for k in range(numbox): 345 | tmp = np.zeros((int(tmph[k]) +1, int(tmpw[k]) + 1,3)) 346 | 347 | #print "dx[k], edx[k]:", dx[k], edx[k] 348 | #print "dy[k], edy[k]:", dy[k], edy[k] 349 | #print "img.shape", img[y[k]:ey[k]+1, x[k]:ex[k]+1].shape 350 | #print "tmp.shape", tmp[dy[k]:edy[k]+1, dx[k]:edx[k]+1].shape 351 | 352 | tmp[int(dy[k]):int(edy[k])+1, int(dx[k]):int(edx[k])+1] = img[int(y[k]):int(ey[k])+1, int(x[k]):int(ex[k])+1] 353 | #print "y,ey,x,ex", y[k], ey[k], x[k], ex[k] 354 | #print "tmp", tmp.shape 355 | 356 | tempimg[k,:,:,:] = cv2.resize(tmp, (24, 24)) 357 | #tempimg[k,:,:,:] = imResample(tmp, 24, 24) 358 | #print 'tempimg', tempimg[k,:,:,:].shape 359 | #print tempimg[k,0:5,0:5,0] 360 | #print tempimg[k,0:5,0:5,1] 361 | #print tempimg[k,0:5,0:5,2] 362 | #print k 363 | 364 | #print tempimg.shape 365 | #print tempimg[0,0,0,:] 366 | tempimg = (tempimg-127.5)*0.0078125 # done in imResample function wrapped by python 367 | 368 | #np.save('tempimg.npy', tempimg) 369 | 370 | # RNet 371 | 372 | tempimg = np.swapaxes(tempimg, 1, 3) 373 | #print tempimg[0,:,0,0] 374 | 375 | RNet.blobs['data'].reshape(numbox, 3, 24, 24) 376 | RNet.blobs['data'].data[...] 
= tempimg 377 | out = RNet.forward() 378 | 379 | #print out['conv5-2'].shape 380 | #print out['prob1'].shape 381 | 382 | score = out['prob1'][:,1] 383 | #print 'score', score 384 | pass_t = np.where(score>threshold[1])[0] 385 | #print 'pass_t', pass_t 386 | 387 | score = np.array([score[pass_t]]).T 388 | total_boxes = np.concatenate( (total_boxes[pass_t, 0:4], score), axis = 1) 389 | #print "[5]:",total_boxes.shape[0] 390 | #print total_boxes 391 | 392 | #print "1.5:",total_boxes.shape 393 | 394 | mv = out['conv5-2'][pass_t, :].T 395 | #print "mv", mv 396 | if total_boxes.shape[0] > 0: 397 | pick = nms(total_boxes, 0.7, 'Union') 398 | #print 'pick', pick 399 | if len(pick) > 0 : 400 | total_boxes = total_boxes[pick, :] 401 | #print "[6]:",total_boxes.shape[0] 402 | total_boxes = bbreg(total_boxes, mv[:, pick]) 403 | #print "[7]:",total_boxes.shape[0] 404 | total_boxes = rerec(total_boxes) 405 | #print "[8]:",total_boxes.shape[0] 406 | 407 | ##### 408 | # 2 # 409 | ##### 410 | #print "2:",total_boxes.shape 411 | 412 | numbox = total_boxes.shape[0] 413 | if numbox > 0: 414 | # third stage 415 | 416 | total_boxes = np.fix(total_boxes) 417 | [dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph] = pad(total_boxes, w, h) 418 | 419 | #print 'tmpw', tmpw 420 | #print 'tmph', tmph 421 | #print 'y ', y 422 | #print 'ey', ey 423 | #print 'x ', x 424 | #print 'ex', ex 425 | 426 | 427 | tempimg = np.zeros((numbox, 48, 48, 3)) 428 | for k in range(numbox): 429 | tmp = np.zeros((int(tmph[k]), int(tmpw[k]),3)) 430 | tmp[int(dy[k]):int(edy[k])+1, int(dx[k]):int(edx[k])+1] = img[int(y[k]):int(ey[k])+1, int(x[k]):int(ex[k])+1] 431 | tempimg[k,:,:,:] = cv2.resize(tmp, (48, 48)) 432 | tempimg = (tempimg-127.5)*0.0078125 # [0,255] -> [-1,1] 433 | 434 | # ONet 435 | tempimg = np.swapaxes(tempimg, 1, 3) 436 | ONet.blobs['data'].reshape(numbox, 3, 48, 48) 437 | ONet.blobs['data'].data[...] 
= tempimg
438 |         out = ONet.forward()
439 | 
440 |         score = out['prob1'][:,1]
441 |         points = out['conv6-3']
442 |         pass_t = np.where(score>threshold[2])[0]
443 |         points = points[pass_t, :]
444 |         score = np.array([score[pass_t]]).T
445 |         total_boxes = np.concatenate( (total_boxes[pass_t, 0:4], score), axis=1)
446 |         #print "[9]:",total_boxes.shape[0]
447 | 
448 |         mv = out['conv6-2'][pass_t, :].T
449 |         w = total_boxes[:,3] - total_boxes[:,1] + 1
450 |         h = total_boxes[:,2] - total_boxes[:,0] + 1
451 | 
452 |         points[:, 0:5] = np.tile(w, (5,1)).T * points[:, 0:5] + np.tile(total_boxes[:,0], (5,1)).T - 1
453 |         points[:, 5:10] = np.tile(h, (5,1)).T * points[:, 5:10] + np.tile(total_boxes[:,1], (5,1)).T - 1
454 | 
455 |         if total_boxes.shape[0] > 0:
456 |             total_boxes = bbreg(total_boxes, mv[:,:])
457 |             #print "[10]:",total_boxes.shape[0]
458 |             pick = nms(total_boxes, 0.7, 'Min')
459 | 
460 |             #print pick
461 |             if len(pick) > 0 :
462 |                 total_boxes = total_boxes[pick, :]
463 |                 #print "[11]:",total_boxes.shape[0]
464 |                 points = points[pick, :]
465 | 
466 |     #####
467 |     # 3 #
468 |     #####
469 |     #print "3:",total_boxes.shape
470 | 
471 |     return total_boxes, points
472 | 
473 | def images_align(input_dir, output_dir):
474 | 
475 |     minsize = 80 #120
476 | 
477 |     caffe_model_path = "../mtcnn"
478 | 
479 |     threshold = [0.6, 0.7, 0.7]
480 |     factor = 0.709
481 |     # factor = 0.5
482 | 
483 |     face_size = (112, 96)
484 | 
485 |     src = np.array([ # reference positions of the five facial landmarks in a 96x112 aligned face
486 |         [30.2946, 51.6963],
487 |         [65.5318, 51.5014],
488 |         [48.0252, 71.7366],
489 |         [33.5493, 92.3655],
490 |         [62.7299, 92.2041]], dtype = np.float32)
491 |     tform = trans.SimilarityTransform()
492 | 
493 | 
494 |     caffe.set_mode_gpu()
495 |     PNet = caffe.Net(caffe_model_path+"/det1.prototxt", caffe_model_path+"/det1.caffemodel", caffe.TEST)
496 |     RNet = caffe.Net(caffe_model_path+"/det2.prototxt", caffe_model_path+"/det2.caffemodel", caffe.TEST)
497 |     ONet = caffe.Net(caffe_model_path+"/det3.prototxt", caffe_model_path+"/det3.caffemodel", caffe.TEST)
498 | 
499 |     for sub_dir in os.listdir(input_dir):
500 |         subinputdir = input_dir + '/' + sub_dir
501 |         suboutputdir = output_dir + '/' + sub_dir
502 |         if not os.path.exists(suboutputdir):
503 |             os.mkdir(suboutputdir)
504 |         for subsub_dir in os.listdir(subinputdir):
505 |             subsubinputdir = subinputdir + '/' + subsub_dir
506 |             subsuboutputdir = suboutputdir + '/' + subsub_dir
507 |             if not os.path.exists(subsuboutputdir):
508 |                 os.mkdir(subsuboutputdir)
509 |             for filename in os.listdir(subsubinputdir):
510 |                 img_name = subsubinputdir + '/' + filename
511 |                 save_image_name = subsuboutputdir + '/' + filename
512 |                 print save_image_name
513 |                 img = cv2.imread(img_name) # BGR image; None if the read fails
514 |                 if img is None: # skip unreadable images before any further processing
515 |                     print "open image " + img_name + " error"
516 |                     continue
517 |                 img = cv2.resize(img, (400, 400))
518 |                 img_matlab = img.copy()
519 |                 tmp = img_matlab[:,:,2].copy()
520 |                 img_matlab[:,:,2] = img_matlab[:,:,0]
521 |                 img_matlab[:,:,0] = tmp # BGR -> RGB
522 | 
523 |                 boundingboxes, points = detect_face(img_matlab, minsize, PNet, RNet, ONet, threshold, True, factor)
524 |                 print len(points)
525 |                 if len(points) > 0:
526 |                     tform.estimate(np.array(points[0]).reshape(2,5).T, src)
527 |                     M = tform.params[0:2, :]
528 |                     warped = cv2.warpAffine(img, M, (96, 112), borderValue = 0.0)
529 | 
530 |                     cv2.imwrite(save_image_name, warped)
531 | 
532 | def parser_args():
533 |     parser = argparse.ArgumentParser(description = 'face alignment')
534 |     parser.add_argument('-i', '--input', type = str, default = '/media/hysia/wyj/dataset/face_recog/YoutubeFaces/aligned_images_DB', help = 'input directory')
535 |     parser.add_argument('-o', '--output', type = str, default = '/media/hysia/wyj/dataset/face_recog/YoutubeFaces-crop-align/', help = 'save directory')
536 |     args = parser.parse_args()
537 |     return args.input, args.output
538 | 
539 | if __name__ == "__main__":
540 |     input_dir, output_dir = parser_args()
541 |     images_align(input_dir, output_dir)
542 | 
543 | 
--------------------------------------------------------------------------------
/src/data.py:
--------------------------------------------------------------------------------
1 | # author: Wang Yongjie
2 | # Email: wangyongjie@ict.ac.cn
3 | 
4 | """
5 | generate training batches for the aggregation (deep fusion) module
6 | """
7 | 
8 | import scipy.io as sio
9 | import sys
10 | import random
11 | 
12 | class Data(object):
13 | 
14 |     def __init__(self, filename, batch_size, class_num):
15 |         """
16 |         filename: path of the .mat feature file
17 |         batch_size: training batch size
18 |         class_num: number of classes (identities)
19 |         """
20 |         self.filename = filename
21 |         self.batch_size = batch_size
22 |         self.class_num = class_num
23 | 
24 |     def load_feature(self):
25 |         self.features = []
26 |         self.labels = []
27 |         dataset = sio.loadmat(self.filename)
28 |         flag = 0
29 |         f = open("dataset.txt", 'w')
30 |         not_include = ["__version__", "__globals__", "__header__"] # scipy.io.savemat adds these bookkeeping keys; skip them
31 |         for k, v in dataset.iteritems():
32 |             if k not in not_include:
33 |                 label = [0] * self.class_num # one-hot label for this identity
34 |                 #print flag
35 |                 label[flag] = 1
36 |                 flag = flag + 1
37 |                 sub_feature = []
38 |                 for i in range(len(v)):
39 |                     sub_feature.append(v[i])
40 |                 self.labels.append(label)
41 |                 self.features.append(sub_feature)
42 | 
43 |         #pairs = list(zip(self.features, self.labels))
44 |         #random.shuffle(pairs)
45 |         #self.features, self.labels = zip(*pairs)
46 |         f.close()
47 | 
48 |     def next_batch(self, group_num):
49 |         """
50 |         group_num: number of frames sampled from each identity's sequence
51 |         """
52 |         train_feature, train_label = [], []
53 |         start = random.randint(0, self.class_num) # randint is inclusive; the modulo below wraps indices past class_num
54 |         for i in range(start, start + self.batch_size):
55 |             train_group = []
56 |             seed = random.randint(0, len(self.features[i % self.class_num]) - group_num)
57 |             for j in range(seed, seed + group_num):
58 |                 #print i, j
59 |                 train_group.append(self.features[i % self.class_num][j])
60 | 
61 |             train_feature.append(train_group)
62 |             train_label.append(self.labels[i % self.class_num])
63 | 
64 |         return train_feature, train_label
65 | 
66 | 
67 | if __name__ == "__main__":
68 |     filename = "./YoutubeFaces.mat"
69 |     dataset = Data(filename, 3, 1595)
70 |     dataset.load_feature()
71 |     train_features, train_label = dataset.next_batch(5)
72 |     print train_features, train_label
73 | 
74 | 
--------------------------------------------------------------------------------
/src/extract_feature.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | 
3 | # Author: Wang Yongjie
4 | # Email: wangyongjie@ict.ac.cn
5 | 
6 | import os
7 | import sys
8 | import caffe
9 | import scipy.io as sio
10 | import argparse
11 | import numpy as np
12 | import copy
13 | # from sklearn.decomposition import PCA
14 | 
15 | 
16 | class cnn_feature(object):
17 |     """
18 |     extract facial features with a CNN.
19 | 
20 |     """
21 |     def __init__(self, prototxt, weights, layer, gpu = True):
22 | 
23 |         """
24 |         default constructor
25 | 
26 |         - prototxt: string, CNN structure (Caffe prototxt)
27 |         - weights: string, network weights file name
28 |         - gpu: boolean, GPU or CPU mode
29 |         - layer: string, name of the layer to extract features from
30 | 
31 |         """
32 |         self.prototxt = prototxt
33 |         self.weights = weights
34 |         self.gpu = gpu
35 |         self.layer = layer
36 | 
37 |     def load_network(self):
38 |         """
39 |         load the network from the prototxt and weights
40 | 
41 |         """
42 |         if self.gpu:
43 |             caffe.set_mode_gpu()
44 |         else:
45 |             caffe.set_mode_cpu()
46 | 
47 |         self.net = caffe.Net(self.prototxt, self.weights, caffe.TEST)
48 |         self.height = self.net.blobs["data"].data.shape[2]
49 |         self.width = self.net.blobs["data"].data.shape[3]
50 |         self.channels = self.net.blobs["data"].data.shape[1]
51 | 
52 | 
53 |     def extract_feature(self, image_dir, feature_name):
54 |         """
55 |         extract features from every image under image_dir and save them to feature_name
56 |         image_dir: string, face image directory
57 |         feature_name: string, output file name; must end with .mat
58 |         """
59 | 
60 |         assert type(image_dir) == str and type(feature_name) == str
61 |         assert feature_name.split(".")[-1] == "mat"
62 | 
63 |         self.transformer = caffe.io.Transformer({'data':self.net.blobs['data'].data.shape})
64 | 
65 |         # [height, width, channels] -> [channels, height, width]
66 |         self.transformer.set_transpose('data', (2, 0, 1))
67 |         # RGB -> BGR (Caffe models expect BGR input)
68 |         self.transformer.set_channel_swap('data', (2, 1, 0))
69 |         # rescale from [0, 1] to [0, 255]
70 |         self.transformer.set_raw_scale('data', 255.0)
71 | 
72 |         self.net.blobs['data'].reshape(1, 3, 112, 96)
73 |         feature_set = {}
74 | 
75 |         f = open("feature.txt", "w")
76 | 
77 |         for term in os.listdir(image_dir):
78 |             sub_img_dir = os.path.join(image_dir, term)
79 |             sub_feature_list = []
80 |             f.write(term + "\n")
81 |             for subitem in os.listdir(sub_img_dir):
82 |                 sub_sub_img_dir = os.path.join(sub_img_dir, subitem)
83 |                 for iterm in os.listdir(sub_sub_img_dir):
84 |                     filename = os.path.join(sub_sub_img_dir, iterm)
85 |                     #print filename, iterm
86 |                     # featurename = os.path.join(sub_fea_dir, iterm)
87 |                     img = caffe.io.load_image(filename)
88 |                     if len(img) == 0:
89 |                         print "open " + filename + " error!"
90 |                         continue
91 | 
92 |                     self.net.blobs['data'].data[...] = self.transformer.preprocess('data', img)
93 |                     self.net.forward()
94 |                     # extract the feature of the chosen layer
95 |                     feature = copy.copy(self.net.blobs[self.layer].data[0])
96 |                     sub_feature_list.append(feature)
97 | 
98 |             feature_set[term] = sub_feature_list
99 | 
100 |         sio.savemat(feature_name, feature_set)
101 |         f.close()
102 | 
103 | 
104 | def parser_args():
105 |     """
106 |     parse command line arguments
107 | 
108 |     """
109 |     parser = argparse.ArgumentParser(description = "extract cnn feature")
110 |     parser.add_argument("-p", "--prototxt", type = str, default = "/home/wyj/experiment/sphereface/train/code/sphereface_deploy.prototxt")
111 |     parser.add_argument("-m", "--model", type = str, default = "/home/wyj/experiment/sphereface/train/code/sphereface_model.caffemodel")
112 |     parser.add_argument("-l", "--layer", type = str, default = "fc5")
113 |     parser.add_argument("-g", "--gpu", type = bool, default = True) # note: bool("False") is truthy; omit the flag to keep the default
114 | 
115 |     parser.add_argument("-d", "--directory", type = str, default = "/media/hysia/wyj/dataset/face_recog/YoutubeFaces-crop-align/")
116 |     parser.add_argument("-n", "--name", type = str, default = "YoutubeFaces.mat")
117 |     args = parser.parse_args()
118 | 
119 |     return args.prototxt, args.model, args.layer, args.gpu, args.directory, args.name
120 | 
121 | 
122 | if __name__ == "__main__":
123 | 
124 |     prototxt, model, layer, gpu, directory, name = parser_args()
125 | 
126 |     Extracter = cnn_feature(prototxt, model, layer, gpu)
127 |     Extracter.load_network()
128 |     Extracter.extract_feature(directory, name)
129 | 
130 | 
--------------------------------------------------------------------------------
/src/log/events.out.tfevents.1519200364.Hysia-System:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jinyanxu/Neural-Aggregation-Network-for-Video-Face-Recognition/b87a267e8ae667fcae838fc033025f38065afccb/src/log/events.out.tfevents.1519200364.Hysia-System
--------------------------------------------------------------------------------
/src/log/events.out.tfevents.1519200402.Hysia-System:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jinyanxu/Neural-Aggregation-Network-for-Video-Face-Recognition/b87a267e8ae667fcae838fc033025f38065afccb/src/log/events.out.tfevents.1519200402.Hysia-System
--------------------------------------------------------------------------------
/src/log/events.out.tfevents.1519200426.Hysia-System:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jinyanxu/Neural-Aggregation-Network-for-Video-Face-Recognition/b87a267e8ae667fcae838fc033025f38065afccb/src/log/events.out.tfevents.1519200426.Hysia-System
--------------------------------------------------------------------------------
/src/log/events.out.tfevents.1519200494.Hysia-System:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jinyanxu/Neural-Aggregation-Network-for-Video-Face-Recognition/b87a267e8ae667fcae838fc033025f38065afccb/src/log/events.out.tfevents.1519200494.Hysia-System
--------------------------------------------------------------------------------
/src/log/events.out.tfevents.1519961335.Hysia-System:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jinyanxu/Neural-Aggregation-Network-for-Video-Face-Recognition/b87a267e8ae667fcae838fc033025f38065afccb/src/log/events.out.tfevents.1519961335.Hysia-System
-------------------------------------------------------------------------------- /src/model/attention.ckpt-0.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jinyanxu/Neural-Aggregation-Network-for-Video-Face-Recognition/b87a267e8ae667fcae838fc033025f38065afccb/src/model/attention.ckpt-0.data-00000-of-00001 -------------------------------------------------------------------------------- /src/model/attention.ckpt-0.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jinyanxu/Neural-Aggregation-Network-for-Video-Face-Recognition/b87a267e8ae667fcae838fc033025f38065afccb/src/model/attention.ckpt-0.index -------------------------------------------------------------------------------- /src/model/attention.ckpt-0.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jinyanxu/Neural-Aggregation-Network-for-Video-Face-Recognition/b87a267e8ae667fcae838fc033025f38065afccb/src/model/attention.ckpt-0.meta -------------------------------------------------------------------------------- /src/model/attention.ckpt-20000.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jinyanxu/Neural-Aggregation-Network-for-Video-Face-Recognition/b87a267e8ae667fcae838fc033025f38065afccb/src/model/attention.ckpt-20000.data-00000-of-00001 -------------------------------------------------------------------------------- /src/model/attention.ckpt-20000.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jinyanxu/Neural-Aggregation-Network-for-Video-Face-Recognition/b87a267e8ae667fcae838fc033025f38065afccb/src/model/attention.ckpt-20000.index -------------------------------------------------------------------------------- /src/model/attention.ckpt-20000.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jinyanxu/Neural-Aggregation-Network-for-Video-Face-Recognition/b87a267e8ae667fcae838fc033025f38065afccb/src/model/attention.ckpt-20000.meta -------------------------------------------------------------------------------- /src/model/attention.ckpt-40000.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jinyanxu/Neural-Aggregation-Network-for-Video-Face-Recognition/b87a267e8ae667fcae838fc033025f38065afccb/src/model/attention.ckpt-40000.data-00000-of-00001 -------------------------------------------------------------------------------- /src/model/attention.ckpt-40000.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jinyanxu/Neural-Aggregation-Network-for-Video-Face-Recognition/b87a267e8ae667fcae838fc033025f38065afccb/src/model/attention.ckpt-40000.index -------------------------------------------------------------------------------- /src/model/attention.ckpt-40000.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jinyanxu/Neural-Aggregation-Network-for-Video-Face-Recognition/b87a267e8ae667fcae838fc033025f38065afccb/src/model/attention.ckpt-40000.meta -------------------------------------------------------------------------------- 
/src/model/attention.ckpt-60000.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jinyanxu/Neural-Aggregation-Network-for-Video-Face-Recognition/b87a267e8ae667fcae838fc033025f38065afccb/src/model/attention.ckpt-60000.data-00000-of-00001 -------------------------------------------------------------------------------- /src/model/attention.ckpt-60000.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jinyanxu/Neural-Aggregation-Network-for-Video-Face-Recognition/b87a267e8ae667fcae838fc033025f38065afccb/src/model/attention.ckpt-60000.index -------------------------------------------------------------------------------- /src/model/attention.ckpt-60000.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jinyanxu/Neural-Aggregation-Network-for-Video-Face-Recognition/b87a267e8ae667fcae838fc033025f38065afccb/src/model/attention.ckpt-60000.meta -------------------------------------------------------------------------------- /src/model/attention.ckpt-80000.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jinyanxu/Neural-Aggregation-Network-for-Video-Face-Recognition/b87a267e8ae667fcae838fc033025f38065afccb/src/model/attention.ckpt-80000.data-00000-of-00001 -------------------------------------------------------------------------------- /src/model/attention.ckpt-80000.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jinyanxu/Neural-Aggregation-Network-for-Video-Face-Recognition/b87a267e8ae667fcae838fc033025f38065afccb/src/model/attention.ckpt-80000.index -------------------------------------------------------------------------------- /src/model/attention.ckpt-80000.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jinyanxu/Neural-Aggregation-Network-for-Video-Face-Recognition/b87a267e8ae667fcae838fc033025f38065afccb/src/model/attention.ckpt-80000.meta -------------------------------------------------------------------------------- /src/model/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "attention.ckpt-80000" 2 | all_model_checkpoint_paths: "attention.ckpt-0" 3 | all_model_checkpoint_paths: "attention.ckpt-20000" 4 | all_model_checkpoint_paths: "attention.ckpt-40000" 5 | all_model_checkpoint_paths: "attention.ckpt-60000" 6 | all_model_checkpoint_paths: "attention.ckpt-80000" 7 | -------------------------------------------------------------------------------- /src/network.py: -------------------------------------------------------------------------------- 1 | #Author: Wang Yongjie 2 | #Email: wangyongjie@ict.ac.cn 3 | 4 | import tensorflow as tf 5 | import numpy as np 6 | import time 7 | from data import Data 8 | 9 | 10 | class Network(object): 11 | """ 12 | 13 | CVPR2017: Neural Aggregation Network for Video Face Recognition 14 | Aggregation module 15 | 16 | """ 17 | def __init__(self, batch_size, feature_len, class_num, group): 18 | """ 19 | batch_size: batch size 20 | feature_len: input feature length 21 | class_num: class number 22 | """ 23 | self.batch_size = batch_size 24 | self.feature_len = feature_len 25 | self.class_num = class_num 26 | self.group = group 27 | 28 | def 
create_network(self, input_x):
29 | 
30 |         w1 = tf.get_variable("fc1/weights", shape = [self.feature_len, self.feature_len], initializer = tf.random_normal_initializer(mean = 0.0, stddev = 1e-4))
31 |         b1 = tf.get_variable("fc1/biases", shape = [self.feature_len], initializer = tf.constant_initializer(0.0001))
32 |         w2 = tf.get_variable("fc2/weights", shape = [self.feature_len, self.class_num], initializer = tf.random_normal_initializer(mean = 0.0, stddev = 1e-4))
33 |         b2 = tf.get_variable("fc2/biases", shape = [self.class_num], initializer = tf.constant_initializer(0.0001))
34 |         q_param = tf.get_variable("q0", shape = [self.feature_len], initializer = tf.constant_initializer(0.0001))
35 | 
36 |         # attention module 1: e_k = q0 . f_k, a = softmax(e), r1 = sum_k a_k * f_k
37 |         resize_input = tf.reshape(input_x, [self.batch_size * self.group, self.feature_len])
38 |         expand_param = tf.expand_dims(q_param, 1)
39 |         temp = tf.matmul(resize_input, expand_param)
40 |         temp = tf.reshape(temp, [self.batch_size, self.group])
41 |         temp = tf.nn.softmax(temp)
42 |         features = tf.split(axis = 0, num_or_size_splits = self.batch_size, value = input_x)
43 |         temps = tf.split(axis = 0, num_or_size_splits = self.batch_size, value = temp)
44 |         fusion = [tf.matmul(temps[i], features[i][0]) for i in range(self.batch_size)]
45 |         r1 = tf.concat(axis = 0, values = fusion)
46 | 
47 | 
48 |         # fc1 layer: adapt the query vector, q1 = tanh(r1 * w1 + b1)
49 |         fc = tf.add(tf.matmul(r1, w1), b1, name = "fc1")
50 |         tanh = tf.nn.tanh(fc)
51 | 
52 |         # attention module 2: re-score the frame features with the adapted query q1
53 |         # each q1_split[i] has shape [1, feature_len]; each features[i][0] has shape [group, feature_len]
54 |         q1_split = tf.split(axis = 0, num_or_size_splits = self.batch_size, value = tanh)
55 |         a1 = [tf.matmul(q1_split[i], features[i][0], transpose_b = True) for i in range(self.batch_size)] # per-sample scores e_k = q1 . f_k, shape [1, group]
56 |         a1_fusion = tf.concat(axis = 0, values = a1)
57 |         e1 = tf.nn.softmax(a1_fusion)
58 |         temp1 = tf.split(axis = 0, num_or_size_splits = self.batch_size, value = e1)
59 |         fusion1 = [tf.matmul(temp1[i], features[i][0]) for i in range(self.batch_size)] # weight the frames by this module's attention, not module 1's
60 |         r2 = tf.concat(axis = 0, values = fusion1)
61 | 
62 | 
63 |         # fc2 layer: classify the aggregated feature
64 |         predict = tf.add(tf.matmul(r2, w2), b2, name = "predict")
65 |         return r2, predict
66 | 
67 | 
68 |     def train_network(self, epoch, filename):
69 |         """train for `epoch` iterations on the features stored in `filename`
70 |         """
71 |         input_x = tf.placeholder(tf.float32, shape = [self.batch_size, self.group, self.feature_len])
72 |         label_x = tf.placeholder(tf.int32, shape = [self.batch_size, self.class_num])
73 |         _, predict = self.create_network(input_x)
74 | 
75 |         dataset = Data(filename, self.batch_size, self.class_num)
76 |         dataset.load_feature()
77 | 
78 |         static = tf.equal(tf.argmax(predict, 1), tf.argmax(label_x, 1))
79 |         accuracy = tf.reduce_mean(tf.cast(static, tf.float32))
80 |         tf.summary.scalar("accuracy", accuracy)
81 | 
82 |         loss = tf.nn.softmax_cross_entropy_with_logits(labels = label_x, logits = predict)
83 |         loss = tf.reduce_mean(loss)
84 |         tf.summary.scalar("loss", loss)
85 | 
86 |         optim = tf.train.RMSPropOptimizer(learning_rate = 0.001).minimize(loss)
87 | 
88 |         sess = tf.Session()
89 |         sess.run(tf.global_variables_initializer())
90 |         saver = tf.train.Saver(tf.global_variables())
91 |         merged = tf.summary.merge_all()
92 |         writer = tf.summary.FileWriter("log/", sess.graph)
93 | 
94 |         for i in range(epoch):
95 |             feature_x, labels_x = dataset.next_batch(self.group)
96 |             _ = sess.run([optim], feed_dict = {input_x:feature_x, label_x:labels_x})
97 |             if i % 10 == 0:
98 |                 _acc, _loss, results = sess.run([accuracy, loss, merged], feed_dict = {input_x:feature_x, label_x: labels_x})
99 |                 print("%s\tIteration\t%d\tAccuracy\t%f\tLoss\t%f"%(time.asctime(), i, 
_acc.item(), _loss.item())) 100 | writer.add_summary(results, i) 101 | 102 | if i % (epoch / 5) == 0: 103 | saver.save(sess, "./model/attention.ckpt", global_step = i) 104 | 105 | 106 | 107 | if __name__ == "__main__": 108 | filename = "./YoutubeFaces.mat" 109 | batch_size = 128 110 | class_num = 1595 111 | net = Network(batch_size, 512, class_num, 5) 112 | net.train_network(1000000, filename) 113 | --------------------------------------------------------------------------------