├── README.md
├── mtcnn
│   ├── det1.caffemodel
│   ├── det1.prototxt
│   ├── det2.caffemodel
│   ├── det2.prototxt
│   ├── det3.caffemodel
│   └── det3.prototxt
└── src
    ├── crop-alignment.py
    ├── data.py
    ├── extract_feature.py
    ├── log
    │   ├── events.out.tfevents.1519200364.Hysia-System
    │   ├── events.out.tfevents.1519200402.Hysia-System
    │   ├── events.out.tfevents.1519200426.Hysia-System
    │   ├── events.out.tfevents.1519200494.Hysia-System
    │   └── events.out.tfevents.1519961335.Hysia-System
    ├── model
    │   ├── attention.ckpt-0.data-00000-of-00001
    │   ├── attention.ckpt-0.index
    │   ├── attention.ckpt-0.meta
    │   ├── attention.ckpt-20000.data-00000-of-00001
    │   ├── attention.ckpt-20000.index
    │   ├── attention.ckpt-20000.meta
    │   ├── attention.ckpt-40000.data-00000-of-00001
    │   ├── attention.ckpt-40000.index
    │   ├── attention.ckpt-40000.meta
    │   ├── attention.ckpt-60000.data-00000-of-00001
    │   ├── attention.ckpt-60000.index
    │   ├── attention.ckpt-60000.meta
    │   ├── attention.ckpt-80000.data-00000-of-00001
    │   ├── attention.ckpt-80000.index
    │   ├── attention.ckpt-80000.meta
    │   └── checkpoint
    └── network.py
/README.md:
--------------------------------------------------------------------------------
1 | # Introduction
2 | 
3 | This repository reproduces the paper [Neural Aggregation Network for Video Face Recognition (CVPR 2017)](https://arxiv.org/abs/1603.05474) on the TensorFlow platform.
4 | 
5 | # Content
6 | 
7 | src/extract_feature.py extracts per-frame face features with the SphereFace model and saves them to a .mat file.
8 | 
9 | src/data.py loads the saved features and generates batches for network training.
10 | 
11 | src/crop-alignment.py detects faces with MTCNN, aligns them, and saves the aligned crops to disk.
12 | 
13 | src/network.py defines the aggregation network, which consists of two attention modules.
14 | 
15 | src/log is the directory where training loss and accuracy summaries are written via tf.summary.
16 | 
17 | src/model holds the trained checkpoints of the aggregation network.
18 | 
19 | 
20 | 
21 | # Test
22 | 
23 | Updating; a tentative usage sketch for the training pipeline follows below.
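A rough run order for the pipeline, assuming Python 2 with Caffe and TensorFlow installed (this sketch is not from the original README; the SphereFace prototxt/caffemodel locations and dataset paths are placeholders you must adjust, and the scripts resolve relative paths such as ../mtcnn and ./YoutubeFaces.mat, so run them from inside src/):

```bash
cd src
# 1. detect faces with MTCNN and save aligned 96x112 crops
python crop-alignment.py -i /path/to/YouTubeFaces -o /path/to/YouTubeFaces-crop-align
# 2. extract per-frame SphereFace features into a .mat file
python extract_feature.py -p /path/to/sphereface_deploy.prototxt \
    -m /path/to/sphereface_model.caffemodel -l fc5 \
    -d /path/to/YouTubeFaces-crop-align -n YoutubeFaces.mat
# 3. train the aggregation network (reads ./YoutubeFaces.mat, writes ./log and ./model)
python network.py
```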
24 | 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /mtcnn/det1.caffemodel: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jinyanxu/Neural-Aggregation-Network-for-Video-Face-Recognition/b87a267e8ae667fcae838fc033025f38065afccb/mtcnn/det1.caffemodel -------------------------------------------------------------------------------- /mtcnn/det1.prototxt: -------------------------------------------------------------------------------- 1 | name: "PNet" 2 | input: "data" 3 | input_dim: 1 4 | input_dim: 3 5 | input_dim: 12 6 | input_dim: 12 7 | 8 | layer { 9 | name: "conv1" 10 | type: "Convolution" 11 | bottom: "data" 12 | top: "conv1" 13 | param { 14 | lr_mult: 1 15 | decay_mult: 1 16 | } 17 | param { 18 | lr_mult: 2 19 | decay_mult: 0 20 | } 21 | convolution_param { 22 | num_output: 10 23 | kernel_size: 3 24 | stride: 1 25 | weight_filler { 26 | type: "xavier" 27 | } 28 | bias_filler { 29 | type: "constant" 30 | value: 0 31 | } 32 | } 33 | } 34 | layer { 35 | name: "PReLU1" 36 | type: "PReLU" 37 | bottom: "conv1" 38 | top: "conv1" 39 | } 40 | layer { 41 | name: "pool1" 42 | type: "Pooling" 43 | bottom: "conv1" 44 | top: "pool1" 45 | pooling_param { 46 | pool: MAX 47 | kernel_size: 2 48 | stride: 2 49 | } 50 | } 51 | 52 | layer { 53 | name: "conv2" 54 | type: "Convolution" 55 | bottom: "pool1" 56 | top: "conv2" 57 | param { 58 | lr_mult: 1 59 | decay_mult: 1 60 | } 61 | param { 62 | lr_mult: 2 63 | decay_mult: 0 64 | } 65 | convolution_param { 66 | num_output: 16 67 | kernel_size: 3 68 | stride: 1 69 | weight_filler { 70 | type: "xavier" 71 | } 72 | bias_filler { 73 | type: "constant" 74 | value: 0 75 | } 76 | } 77 | } 78 | layer { 79 | name: "PReLU2" 80 | type: "PReLU" 81 | bottom: "conv2" 82 | top: "conv2" 83 | } 84 | 85 | layer { 86 | name: "conv3" 87 | type: "Convolution" 88 | bottom: "conv2" 89 | top: "conv3" 90 | param { 91 | lr_mult: 1 92 | decay_mult: 1 93 | } 94 | param { 95 | lr_mult: 2 96 | decay_mult: 0 97 | } 98 | convolution_param { 99 | num_output: 32 100 | kernel_size: 3 101 | stride: 1 102 | weight_filler { 103 | type: "xavier" 104 | } 105 | bias_filler { 106 | type: "constant" 107 | value: 0 108 | } 109 | } 110 | } 111 | layer { 112 | name: "PReLU3" 113 | type: "PReLU" 114 | bottom: "conv3" 115 | top: "conv3" 116 | } 117 | 118 | 119 | layer { 120 | name: "conv4-1" 121 | type: "Convolution" 122 | bottom: "conv3" 123 | top: "conv4-1" 124 | param { 125 | lr_mult: 1 126 | decay_mult: 1 127 | } 128 | param { 129 | lr_mult: 2 130 | decay_mult: 0 131 | } 132 | convolution_param { 133 | num_output: 2 134 | kernel_size: 1 135 | stride: 1 136 | weight_filler { 137 | type: "xavier" 138 | } 139 | bias_filler { 140 | type: "constant" 141 | value: 0 142 | } 143 | } 144 | } 145 | 146 | layer { 147 | name: "conv4-2" 148 | type: "Convolution" 149 | bottom: "conv3" 150 | top: "conv4-2" 151 | param { 152 | lr_mult: 1 153 | decay_mult: 1 154 | } 155 | param { 156 | lr_mult: 2 157 | decay_mult: 0 158 | } 159 | convolution_param { 160 | num_output: 4 161 | kernel_size: 1 162 | stride: 1 163 | weight_filler { 164 | type: "xavier" 165 | } 166 | bias_filler { 167 | type: "constant" 168 | value: 0 169 | } 170 | } 171 | } 172 | layer { 173 | name: "prob1" 174 | type: "Softmax" 175 | bottom: "conv4-1" 176 | top: "prob1" 177 | } 178 | -------------------------------------------------------------------------------- /mtcnn/det2.caffemodel: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/jinyanxu/Neural-Aggregation-Network-for-Video-Face-Recognition/b87a267e8ae667fcae838fc033025f38065afccb/mtcnn/det2.caffemodel -------------------------------------------------------------------------------- /mtcnn/det2.prototxt: -------------------------------------------------------------------------------- 1 | name: "RNet" 2 | input: "data" 3 | input_dim: 1 4 | input_dim: 3 5 | input_dim: 24 6 | input_dim: 24 7 | 8 | 9 | ########################## 10 | ###################### 11 | layer { 12 | name: "conv1" 13 | type: "Convolution" 14 | bottom: "data" 15 | top: "conv1" 16 | param { 17 | lr_mult: 0 18 | decay_mult: 0 19 | } 20 | param { 21 | lr_mult: 0 22 | decay_mult: 0 23 | } 24 | convolution_param { 25 | num_output: 28 26 | kernel_size: 3 27 | stride: 1 28 | weight_filler { 29 | type: "xavier" 30 | } 31 | bias_filler { 32 | type: "constant" 33 | value: 0 34 | } 35 | } 36 | } 37 | layer { 38 | name: "prelu1" 39 | type: "PReLU" 40 | bottom: "conv1" 41 | top: "conv1" 42 | propagate_down: true 43 | } 44 | layer { 45 | name: "pool1" 46 | type: "Pooling" 47 | bottom: "conv1" 48 | top: "pool1" 49 | pooling_param { 50 | pool: MAX 51 | kernel_size: 3 52 | stride: 2 53 | } 54 | } 55 | 56 | layer { 57 | name: "conv2" 58 | type: "Convolution" 59 | bottom: "pool1" 60 | top: "conv2" 61 | param { 62 | lr_mult: 0 63 | decay_mult: 0 64 | } 65 | param { 66 | lr_mult: 0 67 | decay_mult: 0 68 | } 69 | convolution_param { 70 | num_output: 48 71 | kernel_size: 3 72 | stride: 1 73 | weight_filler { 74 | type: "xavier" 75 | } 76 | bias_filler { 77 | type: "constant" 78 | value: 0 79 | } 80 | } 81 | } 82 | layer { 83 | name: "prelu2" 84 | type: "PReLU" 85 | bottom: "conv2" 86 | top: "conv2" 87 | propagate_down: true 88 | } 89 | layer { 90 | name: "pool2" 91 | type: "Pooling" 92 | bottom: "conv2" 93 | top: "pool2" 94 | pooling_param { 95 | pool: MAX 96 | kernel_size: 3 97 | stride: 2 98 | } 99 | } 100 | #################################### 101 | 102 | ################################## 103 | layer { 104 | name: "conv3" 105 | type: "Convolution" 106 | bottom: "pool2" 107 | top: "conv3" 108 | param { 109 | lr_mult: 0 110 | decay_mult: 0 111 | } 112 | param { 113 | lr_mult: 0 114 | decay_mult: 0 115 | } 116 | convolution_param { 117 | num_output: 64 118 | kernel_size: 2 119 | stride: 1 120 | weight_filler { 121 | type: "xavier" 122 | } 123 | bias_filler { 124 | type: "constant" 125 | value: 0 126 | } 127 | } 128 | } 129 | layer { 130 | name: "prelu3" 131 | type: "PReLU" 132 | bottom: "conv3" 133 | top: "conv3" 134 | propagate_down: true 135 | } 136 | ############################### 137 | 138 | ############################### 139 | 140 | layer { 141 | name: "conv4" 142 | type: "InnerProduct" 143 | bottom: "conv3" 144 | top: "conv4" 145 | param { 146 | lr_mult: 0 147 | decay_mult: 0 148 | } 149 | param { 150 | lr_mult: 0 151 | decay_mult: 0 152 | } 153 | inner_product_param { 154 | num_output: 128 155 | weight_filler { 156 | type: "xavier" 157 | } 158 | bias_filler { 159 | type: "constant" 160 | value: 0 161 | } 162 | } 163 | } 164 | layer { 165 | name: "prelu4" 166 | type: "PReLU" 167 | bottom: "conv4" 168 | top: "conv4" 169 | } 170 | 171 | layer { 172 | name: "conv5-1" 173 | type: "InnerProduct" 174 | bottom: "conv4" 175 | top: "conv5-1" 176 | param { 177 | lr_mult: 0 178 | decay_mult: 0 179 | } 180 | param { 181 | lr_mult: 0 182 | decay_mult: 0 183 | } 184 | inner_product_param { 185 | num_output: 2 
186 | #kernel_size: 1 187 | #stride: 1 188 | weight_filler { 189 | type: "xavier" 190 | } 191 | bias_filler { 192 | type: "constant" 193 | value: 0 194 | } 195 | } 196 | } 197 | layer { 198 | name: "conv5-2" 199 | type: "InnerProduct" 200 | bottom: "conv4" 201 | top: "conv5-2" 202 | param { 203 | lr_mult: 1 204 | decay_mult: 1 205 | } 206 | param { 207 | lr_mult: 2 208 | decay_mult: 1 209 | } 210 | inner_product_param { 211 | num_output: 4 212 | #kernel_size: 1 213 | #stride: 1 214 | weight_filler { 215 | type: "xavier" 216 | } 217 | bias_filler { 218 | type: "constant" 219 | value: 0 220 | } 221 | } 222 | } 223 | layer { 224 | name: "prob1" 225 | type: "Softmax" 226 | bottom: "conv5-1" 227 | top: "prob1" 228 | } -------------------------------------------------------------------------------- /mtcnn/det3.caffemodel: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jinyanxu/Neural-Aggregation-Network-for-Video-Face-Recognition/b87a267e8ae667fcae838fc033025f38065afccb/mtcnn/det3.caffemodel -------------------------------------------------------------------------------- /mtcnn/det3.prototxt: -------------------------------------------------------------------------------- 1 | name: "ONet" 2 | input: "data" 3 | input_dim: 1 4 | input_dim: 3 5 | input_dim: 48 6 | input_dim: 48 7 | ################################## 8 | layer { 9 | name: "conv1" 10 | type: "Convolution" 11 | bottom: "data" 12 | top: "conv1" 13 | param { 14 | lr_mult: 1 15 | decay_mult: 1 16 | } 17 | param { 18 | lr_mult: 2 19 | decay_mult: 1 20 | } 21 | convolution_param { 22 | num_output: 32 23 | kernel_size: 3 24 | stride: 1 25 | weight_filler { 26 | type: "xavier" 27 | } 28 | bias_filler { 29 | type: "constant" 30 | value: 0 31 | } 32 | } 33 | } 34 | layer { 35 | name: "prelu1" 36 | type: "PReLU" 37 | bottom: "conv1" 38 | top: "conv1" 39 | } 40 | layer { 41 | name: "pool1" 42 | type: "Pooling" 43 | bottom: "conv1" 44 | top: "pool1" 45 | pooling_param { 46 | pool: MAX 47 | kernel_size: 3 48 | stride: 2 49 | } 50 | } 51 | layer { 52 | name: "conv2" 53 | type: "Convolution" 54 | bottom: "pool1" 55 | top: "conv2" 56 | param { 57 | lr_mult: 1 58 | decay_mult: 1 59 | } 60 | param { 61 | lr_mult: 2 62 | decay_mult: 1 63 | } 64 | convolution_param { 65 | num_output: 64 66 | kernel_size: 3 67 | stride: 1 68 | weight_filler { 69 | type: "xavier" 70 | } 71 | bias_filler { 72 | type: "constant" 73 | value: 0 74 | } 75 | } 76 | } 77 | 78 | layer { 79 | name: "prelu2" 80 | type: "PReLU" 81 | bottom: "conv2" 82 | top: "conv2" 83 | } 84 | layer { 85 | name: "pool2" 86 | type: "Pooling" 87 | bottom: "conv2" 88 | top: "pool2" 89 | pooling_param { 90 | pool: MAX 91 | kernel_size: 3 92 | stride: 2 93 | } 94 | } 95 | 96 | layer { 97 | name: "conv3" 98 | type: "Convolution" 99 | bottom: "pool2" 100 | top: "conv3" 101 | param { 102 | lr_mult: 1 103 | decay_mult: 1 104 | } 105 | param { 106 | lr_mult: 2 107 | decay_mult: 1 108 | } 109 | convolution_param { 110 | num_output: 64 111 | kernel_size: 3 112 | weight_filler { 113 | type: "xavier" 114 | } 115 | bias_filler { 116 | type: "constant" 117 | value: 0 118 | } 119 | } 120 | } 121 | layer { 122 | name: "prelu3" 123 | type: "PReLU" 124 | bottom: "conv3" 125 | top: "conv3" 126 | } 127 | layer { 128 | name: "pool3" 129 | type: "Pooling" 130 | bottom: "conv3" 131 | top: "pool3" 132 | pooling_param { 133 | pool: MAX 134 | kernel_size: 2 135 | stride: 2 136 | } 137 | } 138 | layer { 139 | name: "conv4" 140 | type: "Convolution" 141 | 
bottom: "pool3" 142 | top: "conv4" 143 | param { 144 | lr_mult: 1 145 | decay_mult: 1 146 | } 147 | param { 148 | lr_mult: 2 149 | decay_mult: 1 150 | } 151 | convolution_param { 152 | num_output: 128 153 | kernel_size: 2 154 | weight_filler { 155 | type: "xavier" 156 | } 157 | bias_filler { 158 | type: "constant" 159 | value: 0 160 | } 161 | } 162 | } 163 | layer { 164 | name: "prelu4" 165 | type: "PReLU" 166 | bottom: "conv4" 167 | top: "conv4" 168 | } 169 | 170 | 171 | layer { 172 | name: "conv5" 173 | type: "InnerProduct" 174 | bottom: "conv4" 175 | top: "conv5" 176 | param { 177 | lr_mult: 1 178 | decay_mult: 1 179 | } 180 | param { 181 | lr_mult: 2 182 | decay_mult: 1 183 | } 184 | inner_product_param { 185 | #kernel_size: 3 186 | num_output: 256 187 | weight_filler { 188 | type: "xavier" 189 | } 190 | bias_filler { 191 | type: "constant" 192 | value: 0 193 | } 194 | } 195 | } 196 | 197 | layer { 198 | name: "drop5" 199 | type: "Dropout" 200 | bottom: "conv5" 201 | top: "conv5" 202 | dropout_param { 203 | dropout_ratio: 0.25 204 | } 205 | } 206 | layer { 207 | name: "prelu5" 208 | type: "PReLU" 209 | bottom: "conv5" 210 | top: "conv5" 211 | } 212 | 213 | 214 | layer { 215 | name: "conv6-1" 216 | type: "InnerProduct" 217 | bottom: "conv5" 218 | top: "conv6-1" 219 | param { 220 | lr_mult: 1 221 | decay_mult: 1 222 | } 223 | param { 224 | lr_mult: 2 225 | decay_mult: 1 226 | } 227 | inner_product_param { 228 | #kernel_size: 1 229 | num_output: 2 230 | weight_filler { 231 | type: "xavier" 232 | } 233 | bias_filler { 234 | type: "constant" 235 | value: 0 236 | } 237 | } 238 | } 239 | layer { 240 | name: "conv6-2" 241 | type: "InnerProduct" 242 | bottom: "conv5" 243 | top: "conv6-2" 244 | param { 245 | lr_mult: 1 246 | decay_mult: 1 247 | } 248 | param { 249 | lr_mult: 2 250 | decay_mult: 1 251 | } 252 | inner_product_param { 253 | #kernel_size: 1 254 | num_output: 4 255 | weight_filler { 256 | type: "xavier" 257 | } 258 | bias_filler { 259 | type: "constant" 260 | value: 0 261 | } 262 | } 263 | } 264 | layer { 265 | name: "conv6-3" 266 | type: "InnerProduct" 267 | bottom: "conv5" 268 | top: "conv6-3" 269 | param { 270 | lr_mult: 1 271 | decay_mult: 1 272 | } 273 | param { 274 | lr_mult: 2 275 | decay_mult: 1 276 | } 277 | inner_product_param { 278 | #kernel_size: 1 279 | num_output: 10 280 | weight_filler { 281 | type: "xavier" 282 | } 283 | bias_filler { 284 | type: "constant" 285 | value: 0 286 | } 287 | } 288 | } 289 | layer { 290 | name: "prob1" 291 | type: "Softmax" 292 | bottom: "conv6-1" 293 | top: "prob1" 294 | } 295 | -------------------------------------------------------------------------------- /src/crop-alignment.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import caffe 4 | import cv2 5 | import numpy as np 6 | import os 7 | import copy 8 | import argparse 9 | from skimage import transform as trans 10 | import json 11 | 12 | 13 | def bbreg(boundingbox, reg): 14 | reg = reg.T 15 | 16 | # calibrate bouding boxes 17 | if reg.shape[1] == 1: 18 | print "reshape of reg" 19 | pass # reshape of reg 20 | w = boundingbox[:,2] - boundingbox[:,0] + 1 21 | h = boundingbox[:,3] - boundingbox[:,1] + 1 22 | 23 | bb0 = boundingbox[:,0] + reg[:,0]*w 24 | bb1 = boundingbox[:,1] + reg[:,1]*h 25 | bb2 = boundingbox[:,2] + reg[:,2]*w 26 | bb3 = boundingbox[:,3] + reg[:,3]*h 27 | 28 | boundingbox[:,0:4] = np.array([bb0, bb1, bb2, bb3]).T 29 | #print "bb", boundingbox 30 | return boundingbox 31 | 32 | 
33 | def pad(boxesA, w, h):
34 |     boxes = boxesA.copy() # work on a copy: numpy passes arrays by reference, don't mutate the caller's boxes
35 |     #print '#################'
36 |     #print 'boxes', boxes
37 |     #print 'w,h', w, h
38 | 
39 |     tmph = boxes[:,3] - boxes[:,1] + 1
40 |     tmpw = boxes[:,2] - boxes[:,0] + 1
41 |     numbox = boxes.shape[0]
42 | 
43 |     #print 'tmph', tmph
44 |     #print 'tmpw', tmpw
45 | 
46 |     dx = np.ones(numbox)
47 |     dy = np.ones(numbox)
48 |     edx = tmpw.copy() # copy, otherwise the in-place edits below would also change tmpw (numpy assignment aliases, unlike MATLAB)
49 |     edy = tmph.copy()
50 | 
51 |     x = boxes[:,0:1][:,0]
52 |     y = boxes[:,1:2][:,0]
53 |     ex = boxes[:,2:3][:,0]
54 |     ey = boxes[:,3:4][:,0]
55 | 
56 | 
57 |     tmp = np.where(ex > w)[0]
58 |     if tmp.shape[0] != 0:
59 |         edx[tmp] = -ex[tmp] + w-1 + tmpw[tmp]
60 |         ex[tmp] = w-1
61 | 
62 |     tmp = np.where(ey > h)[0]
63 |     if tmp.shape[0] != 0:
64 |         edy[tmp] = -ey[tmp] + h-1 + tmph[tmp]
65 |         ey[tmp] = h-1
66 | 
67 |     tmp = np.where(x < 1)[0]
68 |     if tmp.shape[0] != 0:
69 |         dx[tmp] = 2 - x[tmp]
70 |         x[tmp] = np.ones_like(x[tmp])
71 | 
72 |     tmp = np.where(y < 1)[0]
73 |     if tmp.shape[0] != 0:
74 |         dy[tmp] = 2 - y[tmp]
75 |         y[tmp] = np.ones_like(y[tmp])
76 | 
77 |     # python indexes from 0, while matlab indexes from 1
78 |     dy = np.maximum(0, dy-1)
79 |     dx = np.maximum(0, dx-1)
80 |     y = np.maximum(0, y-1)
81 |     x = np.maximum(0, x-1)
82 |     edy = np.maximum(0, edy-1)
83 |     edx = np.maximum(0, edx-1)
84 |     ey = np.maximum(0, ey-1)
85 |     ex = np.maximum(0, ex-1)
86 | 
87 | 
88 |     #print 'boxes', boxes
89 |     return [dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph]
90 | 
91 | 
92 | 
93 | def rerec(bboxA):
94 |     # convert the boxes in bboxA to squares
95 |     w = bboxA[:,2] - bboxA[:,0]
96 |     h = bboxA[:,3] - bboxA[:,1]
97 |     l = np.maximum(w,h).T
98 | 
99 |     #print 'bboxA', bboxA
100 |     #print 'w', w
101 |     #print 'h', h
102 |     #print 'l', l
103 |     bboxA[:,0] = bboxA[:,0] + w*0.5 - l*0.5
104 |     bboxA[:,1] = bboxA[:,1] + h*0.5 - l*0.5
105 |     bboxA[:,2:4] = bboxA[:,0:2] + np.repeat([l], 2, axis = 0).T
106 |     return bboxA
107 | 
108 | 
109 | def nms(boxes, threshold, type):
110 |     """non-maximum suppression
111 |     :boxes: array of shape [:,0:5], each row is [x1, y1, x2, y2, score]
112 |     :threshold: overlap threshold, e.g. 0.5
113 |     :type: 'Min' (overlap over the smaller area) or 'Union' (IoU)
114 |     :returns: indices of the boxes to keep
115 |     """
116 |     if boxes.shape[0] == 0:
117 |         return np.array([])
118 |     x1 = boxes[:,0]
119 |     y1 = boxes[:,1]
120 |     x2 = boxes[:,2]
121 |     y2 = boxes[:,3]
122 |     s = boxes[:,4]
123 |     area = np.multiply(x2-x1+1, y2-y1+1)
124 |     I = np.array(s.argsort()) # indices of the scores in ascending order
125 | 
126 |     pick = []
127 |     while len(I) > 0:
128 |         xx1 = np.maximum(x1[I[-1]], x1[I[0:-1]])
129 |         yy1 = np.maximum(y1[I[-1]], y1[I[0:-1]])
130 |         xx2 = np.minimum(x2[I[-1]], x2[I[0:-1]])
131 |         yy2 = np.minimum(y2[I[-1]], y2[I[0:-1]])
132 |         w = np.maximum(0.0, xx2 - xx1 + 1)
133 |         h = np.maximum(0.0, yy2 - yy1 + 1)
134 |         inter = w * h
135 |         if type == 'Min':
136 |             o = inter / np.minimum(area[I[-1]], area[I[0:-1]])
137 |         else:
138 |             o = inter / (area[I[-1]] + area[I[0:-1]] - inter)
139 |         pick.append(I[-1])
140 |         I = I[np.where( o <= threshold)[0]]
141 |     return pick
142 | 
143 | 
144 | def generateBoundingBox(map, reg, scale, t):
145 |     stride = 2
146 |     cellsize = 12
147 |     map = map.T
148 |     dx1 = reg[0,:,:].T
149 |     dy1 = reg[1,:,:].T
150 |     dx2 = reg[2,:,:].T
151 |     dy2 = reg[3,:,:].T
152 |     (x, y) = np.where(map >= t)
153 | 
154 |     yy = y
155 |     xx = x
156 | 
157 | 
158 |     score = map[x,y]
159 |     reg = np.array([dx1[x,y], dy1[x,y], dx2[x,y], dy2[x,y]])
160 | 
161 |     if reg.shape[0] == 0:
162 |         pass
163 |     boundingbox = np.array([yy, xx]).T
164 | 
165 |     bb1 = np.fix((stride * (boundingbox) + 1) / scale).T # "+1" because matlab indexes from 1
166 |     bb2 = np.fix((stride * (boundingbox) + cellsize - 1 + 1) / scale).T
167 | 
score = np.array([score]) 168 | 169 | boundingbox_out = np.concatenate((bb1, bb2, score, reg), axis=0) 170 | 171 | 172 | return boundingbox_out.T 173 | 174 | 175 | count = 0 176 | 177 | def drawBoxes(im, boxes): 178 | x1 = boxes[:,0] 179 | y1 = boxes[:,1] 180 | x2 = boxes[:,2] 181 | y2 = boxes[:,3] 182 | for i in range(x1.shape[0]): 183 | cv2.rectangle(im, (int(x1[i]), int(y1[i])), (int(x2[i]), int(y2[i])), (0,255,0), 1) 184 | return im 185 | 186 | ''' 187 | def drawBoxes(im, boxes, model, SNet, clf, name_identities): 188 | 189 | 190 | x1 = boxes[:,0] 191 | y1 = boxes[:,1] 192 | x2 = boxes[:,2] 193 | y2 = boxes[:,3] 194 | global count 195 | width, height = im.shape[0], im.shape[1] 196 | for i in range(x1.shape[0]): 197 | if int(y1[i]) >=0 and int(y2[i]) < width and int(x1[i]) >= 0 and int(x2[i]) < height: 198 | img = im[int(y1[i]):int(y2[i]), int(x1[i]):int(x2[i])] 199 | img = cv2.resize(img, (96, 112)) 200 | save_name = '../tmp/' + str(count) + '.jpg' 201 | count += 1 202 | print count 203 | cv2.imwrite(save_name, img) 204 | 205 | #assert img != None 206 | img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) 207 | img = (img - 127.5 )/ 128 208 | img = np.transpose(img, (2, 0, 1)) 209 | img = [img] 210 | SNet.blobs['data'].data[...] = img 211 | SNet.forward() 212 | a = copy.copy(SNet.blobs['fc5'].data[0]) 213 | a = [a] 214 | index = clf.predict(a) 215 | person = name_identities[int(index)] 216 | # print person 217 | cv2.putText(im, person, (int(x1[i]), int(y1[i])), 218 | cv2.FONT_HERSHEY_SIMPLEX, fontScale=0.75, 219 | color=(152, 255, 204), thickness=2) 220 | cv2.rectangle(im, (int(x1[i]), int(y1[i])), (int(x2[i]), int(y2[i])), (0,255,0), 1) 221 | 222 | 223 | return im 224 | 225 | ''' 226 | 227 | def detect_face(img, minsize, PNet, RNet, ONet, threshold, fastresize, factor): 228 | 229 | img2 = img.copy() 230 | 231 | factor_count = 0 232 | total_boxes = np.zeros((0,9), np.float) 233 | points = [] 234 | h = img.shape[0] 235 | w = img.shape[1] 236 | minl = min(h, w) 237 | img = img.astype(float) 238 | m = 12.0/minsize 239 | minl = minl*m 240 | 241 | 242 | #total_boxes = np.load('total_boxes.npy') 243 | #total_boxes = np.load('total_boxes_242.npy') 244 | #total_boxes = np.load('total_boxes_101.npy') 245 | 246 | 247 | # create scale pyramid 248 | scales = [] 249 | while minl >= 12: 250 | scales.append(m * pow(factor, factor_count)) 251 | minl *= factor 252 | factor_count += 1 253 | 254 | # first stage 255 | for scale in scales: 256 | hs = int(np.ceil(h*scale)) 257 | ws = int(np.ceil(w*scale)) 258 | 259 | if fastresize: 260 | im_data = (img-127.5)*0.0078125 # [0,255] -> [-1,1] 261 | im_data = cv2.resize(im_data, (ws,hs)) # default is bilinear 262 | else: 263 | im_data = cv2.resize(img, (ws,hs)) # default is bilinear 264 | im_data = (im_data-127.5)*0.0078125 # [0,255] -> [-1,1] 265 | #im_data = imResample(img, hs, ws); print "scale:", scale 266 | 267 | 268 | im_data = np.swapaxes(im_data, 0, 2) 269 | im_data = np.array([im_data], dtype = np.float) 270 | PNet.blobs['data'].reshape(1, 3, ws, hs) 271 | PNet.blobs['data'].data[...] 
= im_data 272 | out = PNet.forward() 273 | 274 | boxes = generateBoundingBox(out['prob1'][0,1,:,:], out['conv4-2'][0], scale, threshold[0]) 275 | if boxes.shape[0] != 0: 276 | #print boxes[4:9] 277 | #print 'im_data', im_data[0:5, 0:5, 0], '\n' 278 | #print 'prob1', out['prob1'][0,0,0:3,0:3] 279 | 280 | pick = nms(boxes, 0.5, 'Union') 281 | 282 | if len(pick) > 0 : 283 | boxes = boxes[pick, :] 284 | 285 | if boxes.shape[0] != 0: 286 | total_boxes = np.concatenate((total_boxes, boxes), axis=0) 287 | 288 | #np.save('total_boxes_101.npy', total_boxes) 289 | 290 | ##### 291 | # 1 # 292 | ##### 293 | #print "[1]:",total_boxes.shape[0] 294 | #print total_boxes 295 | #return total_boxes, [] 296 | 297 | 298 | numbox = total_boxes.shape[0] 299 | if numbox > 0: 300 | # nms 301 | pick = nms(total_boxes, 0.7, 'Union') 302 | total_boxes = total_boxes[pick, :] 303 | #print "[2]:",total_boxes.shape[0] 304 | 305 | # revise and convert to square 306 | regh = total_boxes[:,3] - total_boxes[:,1] 307 | regw = total_boxes[:,2] - total_boxes[:,0] 308 | t1 = total_boxes[:,0] + total_boxes[:,5]*regw 309 | t2 = total_boxes[:,1] + total_boxes[:,6]*regh 310 | t3 = total_boxes[:,2] + total_boxes[:,7]*regw 311 | t4 = total_boxes[:,3] + total_boxes[:,8]*regh 312 | t5 = total_boxes[:,4] 313 | total_boxes = np.array([t1,t2,t3,t4,t5]).T 314 | #print "[3]:",total_boxes.shape[0] 315 | #print regh 316 | #print regw 317 | #print 't1',t1 318 | #print total_boxes 319 | 320 | total_boxes = rerec(total_boxes) # convert box to square 321 | #print "[4]:",total_boxes.shape[0] 322 | 323 | total_boxes[:,0:4] = np.fix(total_boxes[:,0:4]) 324 | #print "[4.5]:",total_boxes.shape[0] 325 | #print total_boxes 326 | [dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph] = pad(total_boxes, w, h) 327 | 328 | #print total_boxes.shape 329 | #print total_boxes 330 | 331 | numbox = total_boxes.shape[0] 332 | if numbox > 0: 333 | # second stage 334 | 335 | #print 'tmph', tmph 336 | #print 'tmpw', tmpw 337 | #print "y,ey,x,ex", y, ey, x, ex, 338 | #print "edy", edy 339 | 340 | #tempimg = np.load('tempimg.npy') 341 | 342 | # construct input for RNet 343 | tempimg = np.zeros((numbox, 24, 24, 3)) # (24, 24, 3, numbox) 344 | for k in range(numbox): 345 | tmp = np.zeros((int(tmph[k]) +1, int(tmpw[k]) + 1,3)) 346 | 347 | #print "dx[k], edx[k]:", dx[k], edx[k] 348 | #print "dy[k], edy[k]:", dy[k], edy[k] 349 | #print "img.shape", img[y[k]:ey[k]+1, x[k]:ex[k]+1].shape 350 | #print "tmp.shape", tmp[dy[k]:edy[k]+1, dx[k]:edx[k]+1].shape 351 | 352 | tmp[int(dy[k]):int(edy[k])+1, int(dx[k]):int(edx[k])+1] = img[int(y[k]):int(ey[k])+1, int(x[k]):int(ex[k])+1] 353 | #print "y,ey,x,ex", y[k], ey[k], x[k], ex[k] 354 | #print "tmp", tmp.shape 355 | 356 | tempimg[k,:,:,:] = cv2.resize(tmp, (24, 24)) 357 | #tempimg[k,:,:,:] = imResample(tmp, 24, 24) 358 | #print 'tempimg', tempimg[k,:,:,:].shape 359 | #print tempimg[k,0:5,0:5,0] 360 | #print tempimg[k,0:5,0:5,1] 361 | #print tempimg[k,0:5,0:5,2] 362 | #print k 363 | 364 | #print tempimg.shape 365 | #print tempimg[0,0,0,:] 366 | tempimg = (tempimg-127.5)*0.0078125 # done in imResample function wrapped by python 367 | 368 | #np.save('tempimg.npy', tempimg) 369 | 370 | # RNet 371 | 372 | tempimg = np.swapaxes(tempimg, 1, 3) 373 | #print tempimg[0,:,0,0] 374 | 375 | RNet.blobs['data'].reshape(numbox, 3, 24, 24) 376 | RNet.blobs['data'].data[...] 
= tempimg 377 | out = RNet.forward() 378 | 379 | #print out['conv5-2'].shape 380 | #print out['prob1'].shape 381 | 382 | score = out['prob1'][:,1] 383 | #print 'score', score 384 | pass_t = np.where(score>threshold[1])[0] 385 | #print 'pass_t', pass_t 386 | 387 | score = np.array([score[pass_t]]).T 388 | total_boxes = np.concatenate( (total_boxes[pass_t, 0:4], score), axis = 1) 389 | #print "[5]:",total_boxes.shape[0] 390 | #print total_boxes 391 | 392 | #print "1.5:",total_boxes.shape 393 | 394 | mv = out['conv5-2'][pass_t, :].T 395 | #print "mv", mv 396 | if total_boxes.shape[0] > 0: 397 | pick = nms(total_boxes, 0.7, 'Union') 398 | #print 'pick', pick 399 | if len(pick) > 0 : 400 | total_boxes = total_boxes[pick, :] 401 | #print "[6]:",total_boxes.shape[0] 402 | total_boxes = bbreg(total_boxes, mv[:, pick]) 403 | #print "[7]:",total_boxes.shape[0] 404 | total_boxes = rerec(total_boxes) 405 | #print "[8]:",total_boxes.shape[0] 406 | 407 | ##### 408 | # 2 # 409 | ##### 410 | #print "2:",total_boxes.shape 411 | 412 | numbox = total_boxes.shape[0] 413 | if numbox > 0: 414 | # third stage 415 | 416 | total_boxes = np.fix(total_boxes) 417 | [dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph] = pad(total_boxes, w, h) 418 | 419 | #print 'tmpw', tmpw 420 | #print 'tmph', tmph 421 | #print 'y ', y 422 | #print 'ey', ey 423 | #print 'x ', x 424 | #print 'ex', ex 425 | 426 | 427 | tempimg = np.zeros((numbox, 48, 48, 3)) 428 | for k in range(numbox): 429 | tmp = np.zeros((int(tmph[k]), int(tmpw[k]),3)) 430 | tmp[int(dy[k]):int(edy[k])+1, int(dx[k]):int(edx[k])+1] = img[int(y[k]):int(ey[k])+1, int(x[k]):int(ex[k])+1] 431 | tempimg[k,:,:,:] = cv2.resize(tmp, (48, 48)) 432 | tempimg = (tempimg-127.5)*0.0078125 # [0,255] -> [-1,1] 433 | 434 | # ONet 435 | tempimg = np.swapaxes(tempimg, 1, 3) 436 | ONet.blobs['data'].reshape(numbox, 3, 48, 48) 437 | ONet.blobs['data'].data[...] 
= tempimg
438 |         out = ONet.forward()
439 | 
440 |         score = out['prob1'][:,1]
441 |         points = out['conv6-3']
442 |         pass_t = np.where(score>threshold[2])[0]
443 |         points = points[pass_t, :]
444 |         score = np.array([score[pass_t]]).T
445 |         total_boxes = np.concatenate( (total_boxes[pass_t, 0:4], score), axis=1)
446 |         #print "[9]:",total_boxes.shape[0]
447 | 
448 |         mv = out['conv6-2'][pass_t, :].T
449 |         w = total_boxes[:,3] - total_boxes[:,1] + 1
450 |         h = total_boxes[:,2] - total_boxes[:,0] + 1
451 | 
452 |         points[:, 0:5] = np.tile(w, (5,1)).T * points[:, 0:5] + np.tile(total_boxes[:,0], (5,1)).T - 1
453 |         points[:, 5:10] = np.tile(h, (5,1)).T * points[:, 5:10] + np.tile(total_boxes[:,1], (5,1)).T - 1
454 | 
455 |         if total_boxes.shape[0] > 0:
456 |             total_boxes = bbreg(total_boxes, mv[:,:])
457 |             #print "[10]:",total_boxes.shape[0]
458 |             pick = nms(total_boxes, 0.7, 'Min')
459 | 
460 |             #print pick
461 |             if len(pick) > 0 :
462 |                 total_boxes = total_boxes[pick, :]
463 |                 #print "[11]:",total_boxes.shape[0]
464 |                 points = points[pick, :]
465 | 
466 |     #####
467 |     # 3 #
468 |     #####
469 |     #print "3:",total_boxes.shape
470 | 
471 |     return total_boxes, points
472 | 
473 | def images_align(input_dir, output_dir):
474 | 
475 |     minsize = 80 #120
476 | 
477 |     caffe_model_path = "../mtcnn"
478 | 
479 |     threshold = [0.6, 0.7, 0.7]
480 |     factor = 0.709
481 |     # factor = 0.5
482 | 
483 |     face_size = (112, 96)
484 | 
485 |     src = np.array([ # reference positions of the five facial landmarks in a 96x112 aligned face
486 |         [30.2946, 51.6963],
487 |         [65.5318, 51.5014],
488 |         [48.0252, 71.7366],
489 |         [33.5493, 92.3655],
490 |         [62.7299, 92.2041]], dtype = np.float32)
491 |     tform = trans.SimilarityTransform()
492 | 
493 | 
494 |     caffe.set_mode_gpu()
495 |     PNet = caffe.Net(caffe_model_path+"/det1.prototxt", caffe_model_path+"/det1.caffemodel", caffe.TEST)
496 |     RNet = caffe.Net(caffe_model_path+"/det2.prototxt", caffe_model_path+"/det2.caffemodel", caffe.TEST)
497 |     ONet = caffe.Net(caffe_model_path+"/det3.prototxt", caffe_model_path+"/det3.caffemodel", caffe.TEST)
498 | 
499 |     for sub_dir in os.listdir(input_dir):
500 |         subinputdir = input_dir + '/' + sub_dir
501 |         suboutputdir = output_dir + '/' + sub_dir
502 |         if not os.path.exists(suboutputdir):
503 |             os.mkdir(suboutputdir)
504 |         for subsub_dir in os.listdir(subinputdir):
505 |             subsubinputdir = subinputdir + '/' + subsub_dir
506 |             subsuboutputdir = suboutputdir + '/' + subsub_dir
507 |             if not os.path.exists(subsuboutputdir):
508 |                 os.mkdir(subsuboutputdir)
509 |             for filename in os.listdir(subsubinputdir):
510 |                 img_name = subsubinputdir + '/' + filename
511 |                 save_image_name = subsuboutputdir + '/' + filename
512 |                 print save_image_name
513 |                 img = cv2.imread(img_name) # BGR image; None if the read fails
514 |                 if img is None: # skip unreadable images before any further processing
515 |                     print "open image " + img_name + " error"
516 |                     continue
517 |                 img = cv2.resize(img, (400, 400))
518 |                 img_matlab = img.copy()
519 |                 tmp = img_matlab[:,:,2].copy()
520 |                 img_matlab[:,:,2] = img_matlab[:,:,0]
521 |                 img_matlab[:,:,0] = tmp # BGR -> RGB
522 | 
523 |                 boundingboxes, points = detect_face(img_matlab, minsize, PNet, RNet, ONet, threshold, True, factor)
524 |                 print len(points)
525 |                 if len(points) > 0:
526 |                     tform.estimate(np.array(points[0]).reshape(2,5).T, src)
527 |                     M = tform.params[0:2, :]
528 |                     warped = cv2.warpAffine(img, M, (96, 112), borderValue = 0.0)
529 | 
530 |                     cv2.imwrite(save_image_name, warped)
531 | 
532 | def parser_args():
533 |     parser = argparse.ArgumentParser(description = 'face alignment')
534 |     parser.add_argument('-i', '--input', type = str, default = '/media/hysia/wyj/dataset/face_recog/YoutubeFaces/aligned_images_DB', help = 'input directory')
535 |     parser.add_argument('-o', '--output', type = str, default = '/media/hysia/wyj/dataset/face_recog/YoutubeFaces-crop-align/', help = 'save directory')
536 |     args = parser.parse_args()
537 |     return args.input, args.output
538 | 
539 | if __name__ == "__main__":
540 |     input_dir, output_dir = parser_args()
541 |     images_align(input_dir, output_dir)
542 | 
543 | 
--------------------------------------------------------------------------------
/src/data.py:
--------------------------------------------------------------------------------
1 | # author: Wang Yongjie
2 | # Email: wangyongjie@ict.ac.cn
3 | 
4 | """
5 | generate training batches for the aggregation (deep fusion) module
6 | """
7 | 
8 | import scipy.io as sio
9 | import sys
10 | import random
11 | 
12 | class Data(object):
13 | 
14 |     def __init__(self, filename, batch_size, class_num):
15 |         """
16 |         filename: path of the .mat feature file
17 |         batch_size: training batch size
18 |         class_num: number of classes (identities)
19 |         """
20 |         self.filename = filename
21 |         self.batch_size = batch_size
22 |         self.class_num = class_num
23 | 
24 |     def load_feature(self):
25 |         self.features = []
26 |         self.labels = []
27 |         dataset = sio.loadmat(self.filename)
28 |         flag = 0
29 |         f = open("dataset.txt", 'w')
30 |         not_include = ["__version__", "__globals__", "__header__"] # scipy.io.savemat adds these bookkeeping keys; skip them
31 |         for k, v in dataset.iteritems():
32 |             if k not in not_include:
33 |                 label = [0] * self.class_num # one-hot label for this identity
34 |                 #print flag
35 |                 label[flag] = 1
36 |                 flag = flag + 1
37 |                 sub_feature = []
38 |                 for i in range(len(v)):
39 |                     sub_feature.append(v[i])
40 |                 self.labels.append(label)
41 |                 self.features.append(sub_feature)
42 | 
43 |         #pairs = list(zip(self.features, self.labels))
44 |         #random.shuffle(pairs)
45 |         #self.features, self.labels = zip(*pairs)
46 |         f.close()
47 | 
48 |     def next_batch(self, group_num):
49 |         """
50 |         group_num: number of frames sampled from each identity's sequence
51 |         """
52 |         train_feature, train_label = [], []
53 |         start = random.randint(0, self.class_num) # randint is inclusive; the modulo below wraps indices past class_num
54 |         for i in range(start, start + self.batch_size):
55 |             train_group = []
56 |             seed = random.randint(0, len(self.features[i % self.class_num]) - group_num)
57 |             for j in range(seed, seed + group_num):
58 |                 #print i, j
59 |                 train_group.append(self.features[i % self.class_num][j])
60 | 
61 |             train_feature.append(train_group)
62 |             train_label.append(self.labels[i % self.class_num])
63 | 
64 |         return train_feature, train_label
65 | 
66 | 
67 | if __name__ == "__main__":
68 |     filename = "./YoutubeFaces.mat"
69 |     dataset = Data(filename, 3, 1595)
70 |     dataset.load_feature()
71 |     train_features, train_label = dataset.next_batch(5)
72 |     print train_features, train_label
73 | 
74 | 
--------------------------------------------------------------------------------
/src/extract_feature.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | 
3 | # Author: Wang Yongjie
4 | # Email: wangyongjie@ict.ac.cn
5 | 
6 | import os
7 | import sys
8 | import caffe
9 | import scipy.io as sio
10 | import argparse
11 | import numpy as np
12 | import copy
13 | # from sklearn.decomposition import PCA
14 | 
15 | 
16 | class cnn_feature(object):
17 |     """
18 |     extract facial features with a CNN.
19 | 
20 |     """
21 |     def __init__(self, prototxt, weights, layer, gpu = True):
22 | 
23 |         """
24 |         default constructor
25 | 
26 |         - prototxt: string, CNN structure (Caffe prototxt)
27 |         - weights: string, network weights file name
28 |         - gpu: boolean, GPU or CPU mode
29 |         - layer: string, name of the layer to extract features from
30 | 
31 |         """
32 |         self.prototxt = prototxt
33 |         self.weights = weights
34 |         self.gpu = gpu
35 |         self.layer = layer
36 | 
37 |     def load_network(self):
38 |         """
39 |         load the network from the prototxt and weights
40 | 
41 |         """
42 |         if self.gpu:
43 |             caffe.set_mode_gpu()
44 |         else:
45 |             caffe.set_mode_cpu()
46 | 
47 |         self.net = caffe.Net(self.prototxt, self.weights, caffe.TEST)
48 |         self.height = self.net.blobs["data"].data.shape[2]
49 |         self.width = self.net.blobs["data"].data.shape[3]
50 |         self.channels = self.net.blobs["data"].data.shape[1]
51 | 
52 | 
53 |     def extract_feature(self, image_dir, feature_name):
54 |         """
55 |         extract features from every image under image_dir and save them to feature_name
56 |         image_dir: string, face image directory
57 |         feature_name: string, output file name; must end with .mat
58 |         """
59 | 
60 |         assert type(image_dir) == str and type(feature_name) == str
61 |         assert feature_name.split(".")[-1] == "mat"
62 | 
63 |         self.transformer = caffe.io.Transformer({'data':self.net.blobs['data'].data.shape})
64 | 
65 |         # [height, width, channels] -> [channels, height, width]
66 |         self.transformer.set_transpose('data', (2, 0, 1))
67 |         # RGB -> BGR (Caffe models expect BGR input)
68 |         self.transformer.set_channel_swap('data', (2, 1, 0))
69 |         # rescale from [0, 1] to [0, 255]
70 |         self.transformer.set_raw_scale('data', 255.0)
71 | 
72 |         self.net.blobs['data'].reshape(1, 3, 112, 96)
73 |         feature_set = {}
74 | 
75 |         f = open("feature.txt", "w")
76 | 
77 |         for term in os.listdir(image_dir):
78 |             sub_img_dir = os.path.join(image_dir, term)
79 |             sub_feature_list = []
80 |             f.write(term + "\n")
81 |             for subitem in os.listdir(sub_img_dir):
82 |                 sub_sub_img_dir = os.path.join(sub_img_dir, subitem)
83 |                 for iterm in os.listdir(sub_sub_img_dir):
84 |                     filename = os.path.join(sub_sub_img_dir, iterm)
85 |                     #print filename, iterm
86 |                     # featurename = os.path.join(sub_fea_dir, iterm)
87 |                     img = caffe.io.load_image(filename)
88 |                     if len(img) == 0:
89 |                         print "open " + filename + " error!"
90 |                         continue
91 | 
92 |                     self.net.blobs['data'].data[...] = self.transformer.preprocess('data', img)
93 |                     self.net.forward()
94 |                     # extract the feature of the chosen layer
95 |                     feature = copy.copy(self.net.blobs[self.layer].data[0])
96 |                     sub_feature_list.append(feature)
97 | 
98 |             feature_set[term] = sub_feature_list
99 | 
100 |         sio.savemat(feature_name, feature_set)
101 |         f.close()
102 | 
103 | 
104 | def parser_args():
105 |     """
106 |     parse command line arguments
107 | 
108 |     """
109 |     parser = argparse.ArgumentParser(description = "extract cnn feature")
110 |     parser.add_argument("-p", "--prototxt", type = str, default = "/home/wyj/experiment/sphereface/train/code/sphereface_deploy.prototxt")
111 |     parser.add_argument("-m", "--model", type = str, default = "/home/wyj/experiment/sphereface/train/code/sphereface_model.caffemodel")
112 |     parser.add_argument("-l", "--layer", type = str, default = "fc5")
113 |     parser.add_argument("-g", "--gpu", type = bool, default = True) # note: bool("False") is truthy; omit the flag to keep the default
114 | 
115 |     parser.add_argument("-d", "--directory", type = str, default = "/media/hysia/wyj/dataset/face_recog/YoutubeFaces-crop-align/")
116 |     parser.add_argument("-n", "--name", type = str, default = "YoutubeFaces.mat")
117 |     args = parser.parse_args()
118 | 
119 |     return args.prototxt, args.model, args.layer, args.gpu, args.directory, args.name
120 | 
121 | 
122 | if __name__ == "__main__":
123 | 
124 |     prototxt, model, layer, gpu, directory, name = parser_args()
125 | 
126 |     Extracter = cnn_feature(prototxt, model, layer, gpu)
127 |     Extracter.load_network()
128 |     Extracter.extract_feature(directory, name)
129 | 
130 | 
--------------------------------------------------------------------------------
/src/log/events.out.tfevents.1519200364.Hysia-System:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jinyanxu/Neural-Aggregation-Network-for-Video-Face-Recognition/b87a267e8ae667fcae838fc033025f38065afccb/src/log/events.out.tfevents.1519200364.Hysia-System
--------------------------------------------------------------------------------
/src/log/events.out.tfevents.1519200402.Hysia-System:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jinyanxu/Neural-Aggregation-Network-for-Video-Face-Recognition/b87a267e8ae667fcae838fc033025f38065afccb/src/log/events.out.tfevents.1519200402.Hysia-System
--------------------------------------------------------------------------------
/src/log/events.out.tfevents.1519200426.Hysia-System:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jinyanxu/Neural-Aggregation-Network-for-Video-Face-Recognition/b87a267e8ae667fcae838fc033025f38065afccb/src/log/events.out.tfevents.1519200426.Hysia-System
--------------------------------------------------------------------------------
/src/log/events.out.tfevents.1519200494.Hysia-System:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jinyanxu/Neural-Aggregation-Network-for-Video-Face-Recognition/b87a267e8ae667fcae838fc033025f38065afccb/src/log/events.out.tfevents.1519200494.Hysia-System
--------------------------------------------------------------------------------
/src/log/events.out.tfevents.1519961335.Hysia-System:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jinyanxu/Neural-Aggregation-Network-for-Video-Face-Recognition/b87a267e8ae667fcae838fc033025f38065afccb/src/log/events.out.tfevents.1519961335.Hysia-System
-------------------------------------------------------------------------------- /src/model/attention.ckpt-0.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jinyanxu/Neural-Aggregation-Network-for-Video-Face-Recognition/b87a267e8ae667fcae838fc033025f38065afccb/src/model/attention.ckpt-0.data-00000-of-00001 -------------------------------------------------------------------------------- /src/model/attention.ckpt-0.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jinyanxu/Neural-Aggregation-Network-for-Video-Face-Recognition/b87a267e8ae667fcae838fc033025f38065afccb/src/model/attention.ckpt-0.index -------------------------------------------------------------------------------- /src/model/attention.ckpt-0.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jinyanxu/Neural-Aggregation-Network-for-Video-Face-Recognition/b87a267e8ae667fcae838fc033025f38065afccb/src/model/attention.ckpt-0.meta -------------------------------------------------------------------------------- /src/model/attention.ckpt-20000.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jinyanxu/Neural-Aggregation-Network-for-Video-Face-Recognition/b87a267e8ae667fcae838fc033025f38065afccb/src/model/attention.ckpt-20000.data-00000-of-00001 -------------------------------------------------------------------------------- /src/model/attention.ckpt-20000.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jinyanxu/Neural-Aggregation-Network-for-Video-Face-Recognition/b87a267e8ae667fcae838fc033025f38065afccb/src/model/attention.ckpt-20000.index -------------------------------------------------------------------------------- /src/model/attention.ckpt-20000.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jinyanxu/Neural-Aggregation-Network-for-Video-Face-Recognition/b87a267e8ae667fcae838fc033025f38065afccb/src/model/attention.ckpt-20000.meta -------------------------------------------------------------------------------- /src/model/attention.ckpt-40000.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jinyanxu/Neural-Aggregation-Network-for-Video-Face-Recognition/b87a267e8ae667fcae838fc033025f38065afccb/src/model/attention.ckpt-40000.data-00000-of-00001 -------------------------------------------------------------------------------- /src/model/attention.ckpt-40000.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jinyanxu/Neural-Aggregation-Network-for-Video-Face-Recognition/b87a267e8ae667fcae838fc033025f38065afccb/src/model/attention.ckpt-40000.index -------------------------------------------------------------------------------- /src/model/attention.ckpt-40000.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jinyanxu/Neural-Aggregation-Network-for-Video-Face-Recognition/b87a267e8ae667fcae838fc033025f38065afccb/src/model/attention.ckpt-40000.meta -------------------------------------------------------------------------------- 
/src/model/attention.ckpt-60000.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jinyanxu/Neural-Aggregation-Network-for-Video-Face-Recognition/b87a267e8ae667fcae838fc033025f38065afccb/src/model/attention.ckpt-60000.data-00000-of-00001 -------------------------------------------------------------------------------- /src/model/attention.ckpt-60000.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jinyanxu/Neural-Aggregation-Network-for-Video-Face-Recognition/b87a267e8ae667fcae838fc033025f38065afccb/src/model/attention.ckpt-60000.index -------------------------------------------------------------------------------- /src/model/attention.ckpt-60000.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jinyanxu/Neural-Aggregation-Network-for-Video-Face-Recognition/b87a267e8ae667fcae838fc033025f38065afccb/src/model/attention.ckpt-60000.meta -------------------------------------------------------------------------------- /src/model/attention.ckpt-80000.data-00000-of-00001: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jinyanxu/Neural-Aggregation-Network-for-Video-Face-Recognition/b87a267e8ae667fcae838fc033025f38065afccb/src/model/attention.ckpt-80000.data-00000-of-00001 -------------------------------------------------------------------------------- /src/model/attention.ckpt-80000.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jinyanxu/Neural-Aggregation-Network-for-Video-Face-Recognition/b87a267e8ae667fcae838fc033025f38065afccb/src/model/attention.ckpt-80000.index -------------------------------------------------------------------------------- /src/model/attention.ckpt-80000.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jinyanxu/Neural-Aggregation-Network-for-Video-Face-Recognition/b87a267e8ae667fcae838fc033025f38065afccb/src/model/attention.ckpt-80000.meta -------------------------------------------------------------------------------- /src/model/checkpoint: -------------------------------------------------------------------------------- 1 | model_checkpoint_path: "attention.ckpt-80000" 2 | all_model_checkpoint_paths: "attention.ckpt-0" 3 | all_model_checkpoint_paths: "attention.ckpt-20000" 4 | all_model_checkpoint_paths: "attention.ckpt-40000" 5 | all_model_checkpoint_paths: "attention.ckpt-60000" 6 | all_model_checkpoint_paths: "attention.ckpt-80000" 7 | -------------------------------------------------------------------------------- /src/network.py: -------------------------------------------------------------------------------- 1 | #Author: Wang Yongjie 2 | #Email: wangyongjie@ict.ac.cn 3 | 4 | import tensorflow as tf 5 | import numpy as np 6 | import time 7 | from data import Data 8 | 9 | 10 | class Network(object): 11 | """ 12 | 13 | CVPR2017: Neural Aggregation Network for Video Face Recognition 14 | Aggregation module 15 | 16 | """ 17 | def __init__(self, batch_size, feature_len, class_num, group): 18 | """ 19 | batch_size: batch size 20 | feature_len: input feature length 21 | class_num: class number 22 | """ 23 | self.batch_size = batch_size 24 | self.feature_len = feature_len 25 | self.class_num = class_num 26 | self.group = group 27 | 28 | def 
create_network(self, input_x):
29 | 
30 |         w1 = tf.get_variable("fc1/weights", shape = [self.feature_len, self.feature_len], initializer = tf.random_normal_initializer(mean = 0.0, stddev = 1e-4))
31 |         b1 = tf.get_variable("fc1/biases", shape = [self.feature_len], initializer = tf.constant_initializer(0.0001))
32 |         w2 = tf.get_variable("fc2/weights", shape = [self.feature_len, self.class_num], initializer = tf.random_normal_initializer(mean = 0.0, stddev = 1e-4))
33 |         b2 = tf.get_variable("fc2/biases", shape = [self.class_num], initializer = tf.constant_initializer(0.0001))
34 |         q_param = tf.get_variable("q0", shape = [self.feature_len], initializer = tf.constant_initializer(0.0001))
35 | 
36 |         # attention module 1: e_k = q0 . f_k, a = softmax(e), r1 = sum_k a_k * f_k
37 |         resize_input = tf.reshape(input_x, [self.batch_size * self.group, self.feature_len])
38 |         expand_param = tf.expand_dims(q_param, 1)
39 |         temp = tf.matmul(resize_input, expand_param)
40 |         temp = tf.reshape(temp, [self.batch_size, self.group])
41 |         temp = tf.nn.softmax(temp)
42 |         features = tf.split(axis = 0, num_or_size_splits = self.batch_size, value = input_x)
43 |         temps = tf.split(axis = 0, num_or_size_splits = self.batch_size, value = temp)
44 |         fusion = [tf.matmul(temps[i], features[i][0]) for i in range(self.batch_size)]
45 |         r1 = tf.concat(axis = 0, values = fusion)
46 | 
47 | 
48 |         # fc1 layer: adapt the query vector, q1 = tanh(r1 * w1 + b1)
49 |         fc = tf.add(tf.matmul(r1, w1), b1, name = "fc1")
50 |         tanh = tf.nn.tanh(fc)
51 | 
52 |         # attention module 2: re-score the frame features with the adapted query q1
53 |         # each q1_split[i] has shape [1, feature_len]; each features[i][0] has shape [group, feature_len]
54 |         q1_split = tf.split(axis = 0, num_or_size_splits = self.batch_size, value = tanh)
55 |         a1 = [tf.matmul(q1_split[i], features[i][0], transpose_b = True) for i in range(self.batch_size)] # per-sample scores e_k = q1 . f_k, shape [1, group]
56 |         a1_fusion = tf.concat(axis = 0, values = a1)
57 |         e1 = tf.nn.softmax(a1_fusion)
58 |         temp1 = tf.split(axis = 0, num_or_size_splits = self.batch_size, value = e1)
59 |         fusion1 = [tf.matmul(temp1[i], features[i][0]) for i in range(self.batch_size)] # weight the frames by this module's attention, not module 1's
60 |         r2 = tf.concat(axis = 0, values = fusion1)
61 | 
62 | 
63 |         # fc2 layer: classify the aggregated feature
64 |         predict = tf.add(tf.matmul(r2, w2), b2, name = "predict")
65 |         return r2, predict
66 | 
67 | 
68 |     def train_network(self, epoch, filename):
69 |         """train for `epoch` iterations on the features stored in `filename`
70 |         """
71 |         input_x = tf.placeholder(tf.float32, shape = [self.batch_size, self.group, self.feature_len])
72 |         label_x = tf.placeholder(tf.int32, shape = [self.batch_size, self.class_num])
73 |         _, predict = self.create_network(input_x)
74 | 
75 |         dataset = Data(filename, self.batch_size, self.class_num)
76 |         dataset.load_feature()
77 | 
78 |         static = tf.equal(tf.argmax(predict, 1), tf.argmax(label_x, 1))
79 |         accuracy = tf.reduce_mean(tf.cast(static, tf.float32))
80 |         tf.summary.scalar("accuracy", accuracy)
81 | 
82 |         loss = tf.nn.softmax_cross_entropy_with_logits(labels = label_x, logits = predict)
83 |         loss = tf.reduce_mean(loss)
84 |         tf.summary.scalar("loss", loss)
85 | 
86 |         optim = tf.train.RMSPropOptimizer(learning_rate = 0.001).minimize(loss)
87 | 
88 |         sess = tf.Session()
89 |         sess.run(tf.global_variables_initializer())
90 |         saver = tf.train.Saver(tf.global_variables())
91 |         merged = tf.summary.merge_all()
92 |         writer = tf.summary.FileWriter("log/", sess.graph)
93 | 
94 |         for i in range(epoch):
95 |             feature_x, labels_x = dataset.next_batch(self.group)
96 |             _ = sess.run([optim], feed_dict = {input_x:feature_x, label_x:labels_x})
97 |             if i % 10 == 0:
98 |                 _acc, _loss, results = sess.run([accuracy, loss, merged], feed_dict = {input_x:feature_x, label_x: labels_x})
99 |                 print("%s\tIteration\t%d\tAccuracy\t%f\tLoss\t%f"%(time.asctime(), i, 
_acc.item(), _loss.item())) 100 | writer.add_summary(results, i) 101 | 102 | if i % (epoch / 5) == 0: 103 | saver.save(sess, "./model/attention.ckpt", global_step = i) 104 | 105 | 106 | 107 | if __name__ == "__main__": 108 | filename = "./YoutubeFaces.mat" 109 | batch_size = 128 110 | class_num = 1595 111 | net = Network(batch_size, 512, class_num, 5) 112 | net.train_network(1000000, filename) 113 | --------------------------------------------------------------------------------