├── .gitignore ├── CustomMxOp.py ├── README.md ├── images └── demo-sequences │ └── vot15_bag.7z ├── model ├── model.mat ├── model_dict.pkl ├── mxmodel_bgr-0001.params ├── mxmodel_bgr-symbol.json ├── mxmodel_rgb-0001.params └── mxmodel_rgb-symbol.json ├── run_tracker.py ├── transfer_model.py └── utils.py
/.gitignore:
--------------------------------------------------------------------------------
*.pyc
test_model.py
--------------------------------------------------------------------------------
/CustomMxOp.py:
--------------------------------------------------------------------------------
import mxnet as mx
import minpy.numpy as np  # NOTE(review): appears unused in this module — verify before removing

class CustBatchNorm(mx.operator.CustomOp):
    """Inference-only batch normalization using pre-computed moving statistics.

    Computes y = gamma * (x - moving_mean) / (moving_sigma + 1e-9) + beta per
    channel. moving_sigma is divided directly (no sqrt), so it is expected to
    already hold the per-channel standard deviation, not the variance.
    """

    def forward(self, is_train, req, in_data, out_data, aux):
        #mx.nd.add(lhs, rhs)
        # Swap axes 1 and 3 so the channel axis is last; the (C,) statistics
        # then broadcast over the leading/spatial dimensions.
        # Assumes 4-D input with channels on axis 1 (NCHW) — see infer_shape.
        x = mx.nd.SwapAxis(in_data[0], 1, 3) # data
        gamma = in_data[1] # gamma
        beta = in_data[2] # beta
        moving_mean = in_data[3] # mean
        moving_sigma = in_data[4] # sigma
        # 1e-9 guards against division by zero in the stored sigma
        x_hat = (x - moving_mean) / (moving_sigma + 1e-9)
        out = gamma * x_hat + beta
        # Restore the original channel-first layout before writing the output.
        out = mx.nd.SwapAxis(out, 1, 3)
        self.assign(out_data[0], req[0], out)

    def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
        # Inference-only operator: no gradient is implemented.
        raise NotImplementedError

@mx.operator.register("custbatchnorm")
class CustBatchNormProp(mx.operator.CustomOpProp):
    """Argument/shape metadata for the "custbatchnorm" custom operator."""

    def __init__(self, need_top_grad=False):
        # need_top_grad=False: this op does not require gradients from above.
        super(CustBatchNormProp, self).__init__(need_top_grad)

    def list_arguments(self):
        # Order must match the in_data[] indexing in CustBatchNorm.forward.
        return ['data', 'gamma', 'beta', 'moving_mean', 'moving_sigma']

    def list_outputs(self):
        return ['output']

    def infer_shape(self, in_shape):
        # data keeps its full shape; all four statistics are (C,) vectors taken
        # from axis 1 of the data; the output mirrors the data shape.
        data_shape = in_shape[0]
        other_shape = (in_shape[0][1],)
        output_shape = in_shape[0]
        return [data_shape, other_shape, other_shape, other_shape, other_shape], [output_shape], []

    def create_operator(self, ctx, in_shapes, in_dtypes):
        return CustBatchNorm()
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ### Python implementation of [siamese-fc](https://github.com/bertinetto/siamese-fc) 2 | 3 | -------------- 4 | 5 | This repository only includes the tracking part of [siamese-fc](https://github.com/bertinetto/siamese-fc). 6 | 7 | -------------- 8 | 9 | #### Dependencies 10 | 11 | - MXNet == 0.9.2 12 | - OpenCV 13 | - Numpy 14 | 15 | #### Usage 16 | 17 | Before running the demo, you need to convert the `matconvnet` model to an `mxnet` model: 18 | 19 | ``` 20 | python transfer_model.py 21 | ``` 22 | 23 | By default there is already an `mxnet` model in the `model` folder with prefix `mxmodel_bgr`, which means you should feed a `BGR` image to the model. If you want to use the `RGB` one, you should modify the tracking code correspondingly. 24 | 25 | Run the default demo: 26 | 27 | ``` 28 | python run_tracker.py 29 | ``` 30 | -------------------------------------------------------------------------------- /images/demo-sequences/vot15_bag.7z: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GarrickLin/py-siamese_fc/f11cb140f7b6a1bb0a054f984a7e6df404d6ad7b/images/demo-sequences/vot15_bag.7z -------------------------------------------------------------------------------- /model/model.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GarrickLin/py-siamese_fc/f11cb140f7b6a1bb0a054f984a7e6df404d6ad7b/model/model.mat -------------------------------------------------------------------------------- /model/model_dict.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GarrickLin/py-siamese_fc/f11cb140f7b6a1bb0a054f984a7e6df404d6ad7b/model/model_dict.pkl
-------------------------------------------------------------------------------- /model/mxmodel_bgr-0001.params: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GarrickLin/py-siamese_fc/f11cb140f7b6a1bb0a054f984a7e6df404d6ad7b/model/mxmodel_bgr-0001.params -------------------------------------------------------------------------------- /model/mxmodel_bgr-symbol.json: -------------------------------------------------------------------------------- 1 | { 2 | "nodes": [ 3 | { 4 | "op": "null", 5 | "name": "data", 6 | "inputs": [] 7 | }, 8 | { 9 | "op": "null", 10 | "name": "conv1_weight", 11 | "attr": { 12 | "kernel": "(11, 11)", 13 | "num_filter": "96", 14 | "stride": "(2, 2)" 15 | }, 16 | "inputs": [] 17 | }, 18 | { 19 | "op": "null", 20 | "name": "conv1_bias", 21 | "attr": { 22 | "kernel": "(11, 11)", 23 | "num_filter": "96", 24 | "stride": "(2, 2)" 25 | }, 26 | "inputs": [] 27 | }, 28 | { 29 | "op": "Convolution", 30 | "name": "conv1", 31 | "attr": { 32 | "kernel": "(11, 11)", 33 | "num_filter": "96", 34 | "stride": "(2, 2)" 35 | }, 36 | "inputs": [[0, 0, 0], [1, 0, 0], [2, 0, 0]] 37 | }, 38 | { 39 | "op": "null", 40 | "name": "bn1_gamma", 41 | "attr": {"op_type": "custbatchnorm"}, 42 | "inputs": [] 43 | }, 44 | { 45 | "op": "null", 46 | "name": "bn1_beta", 47 | "attr": {"op_type": "custbatchnorm"}, 48 | "inputs": [] 49 | }, 50 | { 51 | "op": "null", 52 | "name": "bn1_moving_mean", 53 | "attr": {"op_type": "custbatchnorm"}, 54 | "inputs": [] 55 | }, 56 | { 57 | "op": "null", 58 | "name": "bn1_moving_sigma", 59 | "attr": {"op_type": "custbatchnorm"}, 60 | "inputs": [] 61 | }, 62 | { 63 | "op": "Custom", 64 | "name": "bn1", 65 | "attr": {"op_type": "custbatchnorm"}, 66 | "inputs": [[3, 0, 0], [4, 0, 0], [5, 0, 0], [6, 0, 0], [7, 0, 0]] 67 | }, 68 | { 69 | "op": "Activation", 70 | "name": "relu1", 71 | "attr": {"act_type": "relu"}, 72 | "inputs": [[8, 0, 0]] 73 | }, 74 | { 75 | "op": "Pooling", 76 | 
"name": "pool1", 77 | "attr": { 78 | "kernel": "(3, 3)", 79 | "pool_type": "max", 80 | "stride": "(2, 2)" 81 | }, 82 | "inputs": [[9, 0, 0]] 83 | }, 84 | { 85 | "op": "SliceChannel", 86 | "name": "sliced1", 87 | "attr": {"num_outputs": "2"}, 88 | "inputs": [[10, 0, 0]] 89 | }, 90 | { 91 | "op": "null", 92 | "name": "conv21_weight", 93 | "attr": { 94 | "kernel": "(5, 5)", 95 | "num_filter": "128", 96 | "stride": "(1, 1)" 97 | }, 98 | "inputs": [] 99 | }, 100 | { 101 | "op": "null", 102 | "name": "conv21_bias", 103 | "attr": { 104 | "kernel": "(5, 5)", 105 | "num_filter": "128", 106 | "stride": "(1, 1)" 107 | }, 108 | "inputs": [] 109 | }, 110 | { 111 | "op": "Convolution", 112 | "name": "conv21", 113 | "attr": { 114 | "kernel": "(5, 5)", 115 | "num_filter": "128", 116 | "stride": "(1, 1)" 117 | }, 118 | "inputs": [[11, 0, 0], [12, 0, 0], [13, 0, 0]] 119 | }, 120 | { 121 | "op": "null", 122 | "name": "conv22_weight", 123 | "attr": { 124 | "kernel": "(5, 5)", 125 | "num_filter": "128", 126 | "stride": "(1, 1)" 127 | }, 128 | "inputs": [] 129 | }, 130 | { 131 | "op": "null", 132 | "name": "conv22_bias", 133 | "attr": { 134 | "kernel": "(5, 5)", 135 | "num_filter": "128", 136 | "stride": "(1, 1)" 137 | }, 138 | "inputs": [] 139 | }, 140 | { 141 | "op": "Convolution", 142 | "name": "conv22", 143 | "attr": { 144 | "kernel": "(5, 5)", 145 | "num_filter": "128", 146 | "stride": "(1, 1)" 147 | }, 148 | "inputs": [[11, 1, 0], [15, 0, 0], [16, 0, 0]] 149 | }, 150 | { 151 | "op": "Concat", 152 | "name": "conv2", 153 | "attr": {"num_args": "2"}, 154 | "inputs": [[14, 0, 0], [17, 0, 0]] 155 | }, 156 | { 157 | "op": "null", 158 | "name": "bn2_gamma", 159 | "attr": {"op_type": "custbatchnorm"}, 160 | "inputs": [] 161 | }, 162 | { 163 | "op": "null", 164 | "name": "bn2_beta", 165 | "attr": {"op_type": "custbatchnorm"}, 166 | "inputs": [] 167 | }, 168 | { 169 | "op": "null", 170 | "name": "bn2_moving_mean", 171 | "attr": {"op_type": "custbatchnorm"}, 172 | "inputs": [] 173 | }, 174 | 
{ 175 | "op": "null", 176 | "name": "bn2_moving_sigma", 177 | "attr": {"op_type": "custbatchnorm"}, 178 | "inputs": [] 179 | }, 180 | { 181 | "op": "Custom", 182 | "name": "bn2", 183 | "attr": {"op_type": "custbatchnorm"}, 184 | "inputs": [[18, 0, 0], [19, 0, 0], [20, 0, 0], [21, 0, 0], [22, 0, 0]] 185 | }, 186 | { 187 | "op": "Activation", 188 | "name": "relu2", 189 | "attr": {"act_type": "relu"}, 190 | "inputs": [[23, 0, 0]] 191 | }, 192 | { 193 | "op": "Pooling", 194 | "name": "pool2", 195 | "attr": { 196 | "kernel": "(3, 3)", 197 | "pool_type": "max", 198 | "stride": "(2, 2)" 199 | }, 200 | "inputs": [[24, 0, 0]] 201 | }, 202 | { 203 | "op": "null", 204 | "name": "conv3_weight", 205 | "attr": { 206 | "kernel": "(3, 3)", 207 | "num_filter": "384", 208 | "stride": "(1, 1)" 209 | }, 210 | "inputs": [] 211 | }, 212 | { 213 | "op": "null", 214 | "name": "conv3_bias", 215 | "attr": { 216 | "kernel": "(3, 3)", 217 | "num_filter": "384", 218 | "stride": "(1, 1)" 219 | }, 220 | "inputs": [] 221 | }, 222 | { 223 | "op": "Convolution", 224 | "name": "conv3", 225 | "attr": { 226 | "kernel": "(3, 3)", 227 | "num_filter": "384", 228 | "stride": "(1, 1)" 229 | }, 230 | "inputs": [[25, 0, 0], [26, 0, 0], [27, 0, 0]] 231 | }, 232 | { 233 | "op": "null", 234 | "name": "bn3_gamma", 235 | "attr": {"op_type": "custbatchnorm"}, 236 | "inputs": [] 237 | }, 238 | { 239 | "op": "null", 240 | "name": "bn3_beta", 241 | "attr": {"op_type": "custbatchnorm"}, 242 | "inputs": [] 243 | }, 244 | { 245 | "op": "null", 246 | "name": "bn3_moving_mean", 247 | "attr": {"op_type": "custbatchnorm"}, 248 | "inputs": [] 249 | }, 250 | { 251 | "op": "null", 252 | "name": "bn3_moving_sigma", 253 | "attr": {"op_type": "custbatchnorm"}, 254 | "inputs": [] 255 | }, 256 | { 257 | "op": "Custom", 258 | "name": "bn3", 259 | "attr": {"op_type": "custbatchnorm"}, 260 | "inputs": [[28, 0, 0], [29, 0, 0], [30, 0, 0], [31, 0, 0], [32, 0, 0]] 261 | }, 262 | { 263 | "op": "Activation", 264 | "name": "relu3", 265 | 
"attr": {"act_type": "relu"}, 266 | "inputs": [[33, 0, 0]] 267 | }, 268 | { 269 | "op": "SliceChannel", 270 | "name": "sliced2", 271 | "attr": {"num_outputs": "2"}, 272 | "inputs": [[34, 0, 0]] 273 | }, 274 | { 275 | "op": "null", 276 | "name": "conv41_weight", 277 | "attr": { 278 | "kernel": "(3, 3)", 279 | "num_filter": "192", 280 | "stride": "(1, 1)" 281 | }, 282 | "inputs": [] 283 | }, 284 | { 285 | "op": "null", 286 | "name": "conv41_bias", 287 | "attr": { 288 | "kernel": "(3, 3)", 289 | "num_filter": "192", 290 | "stride": "(1, 1)" 291 | }, 292 | "inputs": [] 293 | }, 294 | { 295 | "op": "Convolution", 296 | "name": "conv41", 297 | "attr": { 298 | "kernel": "(3, 3)", 299 | "num_filter": "192", 300 | "stride": "(1, 1)" 301 | }, 302 | "inputs": [[35, 0, 0], [36, 0, 0], [37, 0, 0]] 303 | }, 304 | { 305 | "op": "null", 306 | "name": "conv42_weight", 307 | "attr": { 308 | "kernel": "(3, 3)", 309 | "num_filter": "192", 310 | "stride": "(1, 1)" 311 | }, 312 | "inputs": [] 313 | }, 314 | { 315 | "op": "null", 316 | "name": "conv42_bias", 317 | "attr": { 318 | "kernel": "(3, 3)", 319 | "num_filter": "192", 320 | "stride": "(1, 1)" 321 | }, 322 | "inputs": [] 323 | }, 324 | { 325 | "op": "Convolution", 326 | "name": "conv42", 327 | "attr": { 328 | "kernel": "(3, 3)", 329 | "num_filter": "192", 330 | "stride": "(1, 1)" 331 | }, 332 | "inputs": [[35, 1, 0], [39, 0, 0], [40, 0, 0]] 333 | }, 334 | { 335 | "op": "Concat", 336 | "name": "conv4", 337 | "attr": {"num_args": "2"}, 338 | "inputs": [[38, 0, 0], [41, 0, 0]] 339 | }, 340 | { 341 | "op": "null", 342 | "name": "bn4_gamma", 343 | "attr": {"op_type": "custbatchnorm"}, 344 | "inputs": [] 345 | }, 346 | { 347 | "op": "null", 348 | "name": "bn4_beta", 349 | "attr": {"op_type": "custbatchnorm"}, 350 | "inputs": [] 351 | }, 352 | { 353 | "op": "null", 354 | "name": "bn4_moving_mean", 355 | "attr": {"op_type": "custbatchnorm"}, 356 | "inputs": [] 357 | }, 358 | { 359 | "op": "null", 360 | "name": "bn4_moving_sigma", 361 | 
"attr": {"op_type": "custbatchnorm"}, 362 | "inputs": [] 363 | }, 364 | { 365 | "op": "Custom", 366 | "name": "bn4", 367 | "attr": {"op_type": "custbatchnorm"}, 368 | "inputs": [[42, 0, 0], [43, 0, 0], [44, 0, 0], [45, 0, 0], [46, 0, 0]] 369 | }, 370 | { 371 | "op": "Activation", 372 | "name": "relu4", 373 | "attr": {"act_type": "relu"}, 374 | "inputs": [[47, 0, 0]] 375 | }, 376 | { 377 | "op": "SliceChannel", 378 | "name": "sliced3", 379 | "attr": {"num_outputs": "2"}, 380 | "inputs": [[48, 0, 0]] 381 | }, 382 | { 383 | "op": "null", 384 | "name": "conv51_weight", 385 | "attr": { 386 | "kernel": "(3, 3)", 387 | "num_filter": "128", 388 | "stride": "(1, 1)" 389 | }, 390 | "inputs": [] 391 | }, 392 | { 393 | "op": "null", 394 | "name": "conv51_bias", 395 | "attr": { 396 | "kernel": "(3, 3)", 397 | "num_filter": "128", 398 | "stride": "(1, 1)" 399 | }, 400 | "inputs": [] 401 | }, 402 | { 403 | "op": "Convolution", 404 | "name": "conv51", 405 | "attr": { 406 | "kernel": "(3, 3)", 407 | "num_filter": "128", 408 | "stride": "(1, 1)" 409 | }, 410 | "inputs": [[49, 0, 0], [50, 0, 0], [51, 0, 0]] 411 | }, 412 | { 413 | "op": "null", 414 | "name": "conv52_weight", 415 | "attr": { 416 | "kernel": "(3, 3)", 417 | "num_filter": "128", 418 | "stride": "(1, 1)" 419 | }, 420 | "inputs": [] 421 | }, 422 | { 423 | "op": "null", 424 | "name": "conv52_bias", 425 | "attr": { 426 | "kernel": "(3, 3)", 427 | "num_filter": "128", 428 | "stride": "(1, 1)" 429 | }, 430 | "inputs": [] 431 | }, 432 | { 433 | "op": "Convolution", 434 | "name": "conv52", 435 | "attr": { 436 | "kernel": "(3, 3)", 437 | "num_filter": "128", 438 | "stride": "(1, 1)" 439 | }, 440 | "inputs": [[49, 1, 0], [53, 0, 0], [54, 0, 0]] 441 | }, 442 | { 443 | "op": "Concat", 444 | "name": "conv5", 445 | "attr": {"num_args": "2"}, 446 | "inputs": [[52, 0, 0], [55, 0, 0]] 447 | } 448 | ], 449 | "arg_nodes": [ 450 | 0, 451 | 1, 452 | 2, 453 | 4, 454 | 5, 455 | 6, 456 | 7, 457 | 12, 458 | 13, 459 | 15, 460 | 16, 461 | 19, 462 
| 20, 463 | 21, 464 | 22, 465 | 26, 466 | 27, 467 | 29, 468 | 30, 469 | 31, 470 | 32, 471 | 36, 472 | 37, 473 | 39, 474 | 40, 475 | 43, 476 | 44, 477 | 45, 478 | 46, 479 | 50, 480 | 51, 481 | 53, 482 | 54 483 | ], 484 | "node_row_ptr": [ 485 | 0, 486 | 1, 487 | 2, 488 | 3, 489 | 4, 490 | 5, 491 | 6, 492 | 7, 493 | 8, 494 | 9, 495 | 10, 496 | 11, 497 | 13, 498 | 14, 499 | 15, 500 | 16, 501 | 17, 502 | 18, 503 | 19, 504 | 20, 505 | 21, 506 | 22, 507 | 23, 508 | 24, 509 | 25, 510 | 26, 511 | 27, 512 | 28, 513 | 29, 514 | 30, 515 | 31, 516 | 32, 517 | 33, 518 | 34, 519 | 35, 520 | 36, 521 | 38, 522 | 39, 523 | 40, 524 | 41, 525 | 42, 526 | 43, 527 | 44, 528 | 45, 529 | 46, 530 | 47, 531 | 48, 532 | 49, 533 | 50, 534 | 51, 535 | 53, 536 | 54, 537 | 55, 538 | 56, 539 | 57, 540 | 58, 541 | 59, 542 | 60 543 | ], 544 | "heads": [[56, 0, 0]], 545 | "attrs": {"mxnet_version": ["int", 901]} 546 | } -------------------------------------------------------------------------------- /model/mxmodel_rgb-0001.params: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GarrickLin/py-siamese_fc/f11cb140f7b6a1bb0a054f984a7e6df404d6ad7b/model/mxmodel_rgb-0001.params -------------------------------------------------------------------------------- /model/mxmodel_rgb-symbol.json: -------------------------------------------------------------------------------- 1 | { 2 | "nodes": [ 3 | { 4 | "op": "null", 5 | "name": "data", 6 | "inputs": [] 7 | }, 8 | { 9 | "op": "null", 10 | "name": "conv1_weight", 11 | "attr": { 12 | "kernel": "(11, 11)", 13 | "num_filter": "96", 14 | "stride": "(2, 2)" 15 | }, 16 | "inputs": [] 17 | }, 18 | { 19 | "op": "null", 20 | "name": "conv1_bias", 21 | "attr": { 22 | "kernel": "(11, 11)", 23 | "num_filter": "96", 24 | "stride": "(2, 2)" 25 | }, 26 | "inputs": [] 27 | }, 28 | { 29 | "op": "Convolution", 30 | "name": "conv1", 31 | "attr": { 32 | "kernel": "(11, 11)", 33 | "num_filter": "96", 34 | "stride": 
"(2, 2)" 35 | }, 36 | "inputs": [[0, 0, 0], [1, 0, 0], [2, 0, 0]] 37 | }, 38 | { 39 | "op": "null", 40 | "name": "bn1_gamma", 41 | "attr": {"op_type": "custbatchnorm"}, 42 | "inputs": [] 43 | }, 44 | { 45 | "op": "null", 46 | "name": "bn1_beta", 47 | "attr": {"op_type": "custbatchnorm"}, 48 | "inputs": [] 49 | }, 50 | { 51 | "op": "null", 52 | "name": "bn1_moving_mean", 53 | "attr": {"op_type": "custbatchnorm"}, 54 | "inputs": [] 55 | }, 56 | { 57 | "op": "null", 58 | "name": "bn1_moving_sigma", 59 | "attr": {"op_type": "custbatchnorm"}, 60 | "inputs": [] 61 | }, 62 | { 63 | "op": "Custom", 64 | "name": "bn1", 65 | "attr": {"op_type": "custbatchnorm"}, 66 | "inputs": [[3, 0, 0], [4, 0, 0], [5, 0, 0], [6, 0, 0], [7, 0, 0]] 67 | }, 68 | { 69 | "op": "Activation", 70 | "name": "relu1", 71 | "attr": {"act_type": "relu"}, 72 | "inputs": [[8, 0, 0]] 73 | }, 74 | { 75 | "op": "Pooling", 76 | "name": "pool1", 77 | "attr": { 78 | "kernel": "(3, 3)", 79 | "pool_type": "max", 80 | "stride": "(2, 2)" 81 | }, 82 | "inputs": [[9, 0, 0]] 83 | }, 84 | { 85 | "op": "SliceChannel", 86 | "name": "sliced1", 87 | "attr": {"num_outputs": "2"}, 88 | "inputs": [[10, 0, 0]] 89 | }, 90 | { 91 | "op": "null", 92 | "name": "conv21_weight", 93 | "attr": { 94 | "kernel": "(5, 5)", 95 | "num_filter": "128", 96 | "stride": "(1, 1)" 97 | }, 98 | "inputs": [] 99 | }, 100 | { 101 | "op": "null", 102 | "name": "conv21_bias", 103 | "attr": { 104 | "kernel": "(5, 5)", 105 | "num_filter": "128", 106 | "stride": "(1, 1)" 107 | }, 108 | "inputs": [] 109 | }, 110 | { 111 | "op": "Convolution", 112 | "name": "conv21", 113 | "attr": { 114 | "kernel": "(5, 5)", 115 | "num_filter": "128", 116 | "stride": "(1, 1)" 117 | }, 118 | "inputs": [[11, 0, 0], [12, 0, 0], [13, 0, 0]] 119 | }, 120 | { 121 | "op": "null", 122 | "name": "conv22_weight", 123 | "attr": { 124 | "kernel": "(5, 5)", 125 | "num_filter": "128", 126 | "stride": "(1, 1)" 127 | }, 128 | "inputs": [] 129 | }, 130 | { 131 | "op": "null", 132 | "name": 
"conv22_bias", 133 | "attr": { 134 | "kernel": "(5, 5)", 135 | "num_filter": "128", 136 | "stride": "(1, 1)" 137 | }, 138 | "inputs": [] 139 | }, 140 | { 141 | "op": "Convolution", 142 | "name": "conv22", 143 | "attr": { 144 | "kernel": "(5, 5)", 145 | "num_filter": "128", 146 | "stride": "(1, 1)" 147 | }, 148 | "inputs": [[11, 1, 0], [15, 0, 0], [16, 0, 0]] 149 | }, 150 | { 151 | "op": "Concat", 152 | "name": "conv2", 153 | "attr": {"num_args": "2"}, 154 | "inputs": [[14, 0, 0], [17, 0, 0]] 155 | }, 156 | { 157 | "op": "null", 158 | "name": "bn2_gamma", 159 | "attr": {"op_type": "custbatchnorm"}, 160 | "inputs": [] 161 | }, 162 | { 163 | "op": "null", 164 | "name": "bn2_beta", 165 | "attr": {"op_type": "custbatchnorm"}, 166 | "inputs": [] 167 | }, 168 | { 169 | "op": "null", 170 | "name": "bn2_moving_mean", 171 | "attr": {"op_type": "custbatchnorm"}, 172 | "inputs": [] 173 | }, 174 | { 175 | "op": "null", 176 | "name": "bn2_moving_sigma", 177 | "attr": {"op_type": "custbatchnorm"}, 178 | "inputs": [] 179 | }, 180 | { 181 | "op": "Custom", 182 | "name": "bn2", 183 | "attr": {"op_type": "custbatchnorm"}, 184 | "inputs": [[18, 0, 0], [19, 0, 0], [20, 0, 0], [21, 0, 0], [22, 0, 0]] 185 | }, 186 | { 187 | "op": "Activation", 188 | "name": "relu2", 189 | "attr": {"act_type": "relu"}, 190 | "inputs": [[23, 0, 0]] 191 | }, 192 | { 193 | "op": "Pooling", 194 | "name": "pool2", 195 | "attr": { 196 | "kernel": "(3, 3)", 197 | "pool_type": "max", 198 | "stride": "(2, 2)" 199 | }, 200 | "inputs": [[24, 0, 0]] 201 | }, 202 | { 203 | "op": "null", 204 | "name": "conv3_weight", 205 | "attr": { 206 | "kernel": "(3, 3)", 207 | "num_filter": "384", 208 | "stride": "(1, 1)" 209 | }, 210 | "inputs": [] 211 | }, 212 | { 213 | "op": "null", 214 | "name": "conv3_bias", 215 | "attr": { 216 | "kernel": "(3, 3)", 217 | "num_filter": "384", 218 | "stride": "(1, 1)" 219 | }, 220 | "inputs": [] 221 | }, 222 | { 223 | "op": "Convolution", 224 | "name": "conv3", 225 | "attr": { 226 | "kernel": 
"(3, 3)", 227 | "num_filter": "384", 228 | "stride": "(1, 1)" 229 | }, 230 | "inputs": [[25, 0, 0], [26, 0, 0], [27, 0, 0]] 231 | }, 232 | { 233 | "op": "null", 234 | "name": "bn3_gamma", 235 | "attr": {"op_type": "custbatchnorm"}, 236 | "inputs": [] 237 | }, 238 | { 239 | "op": "null", 240 | "name": "bn3_beta", 241 | "attr": {"op_type": "custbatchnorm"}, 242 | "inputs": [] 243 | }, 244 | { 245 | "op": "null", 246 | "name": "bn3_moving_mean", 247 | "attr": {"op_type": "custbatchnorm"}, 248 | "inputs": [] 249 | }, 250 | { 251 | "op": "null", 252 | "name": "bn3_moving_sigma", 253 | "attr": {"op_type": "custbatchnorm"}, 254 | "inputs": [] 255 | }, 256 | { 257 | "op": "Custom", 258 | "name": "bn3", 259 | "attr": {"op_type": "custbatchnorm"}, 260 | "inputs": [[28, 0, 0], [29, 0, 0], [30, 0, 0], [31, 0, 0], [32, 0, 0]] 261 | }, 262 | { 263 | "op": "Activation", 264 | "name": "relu3", 265 | "attr": {"act_type": "relu"}, 266 | "inputs": [[33, 0, 0]] 267 | }, 268 | { 269 | "op": "SliceChannel", 270 | "name": "sliced2", 271 | "attr": {"num_outputs": "2"}, 272 | "inputs": [[34, 0, 0]] 273 | }, 274 | { 275 | "op": "null", 276 | "name": "conv41_weight", 277 | "attr": { 278 | "kernel": "(3, 3)", 279 | "num_filter": "192", 280 | "stride": "(1, 1)" 281 | }, 282 | "inputs": [] 283 | }, 284 | { 285 | "op": "null", 286 | "name": "conv41_bias", 287 | "attr": { 288 | "kernel": "(3, 3)", 289 | "num_filter": "192", 290 | "stride": "(1, 1)" 291 | }, 292 | "inputs": [] 293 | }, 294 | { 295 | "op": "Convolution", 296 | "name": "conv41", 297 | "attr": { 298 | "kernel": "(3, 3)", 299 | "num_filter": "192", 300 | "stride": "(1, 1)" 301 | }, 302 | "inputs": [[35, 0, 0], [36, 0, 0], [37, 0, 0]] 303 | }, 304 | { 305 | "op": "null", 306 | "name": "conv42_weight", 307 | "attr": { 308 | "kernel": "(3, 3)", 309 | "num_filter": "192", 310 | "stride": "(1, 1)" 311 | }, 312 | "inputs": [] 313 | }, 314 | { 315 | "op": "null", 316 | "name": "conv42_bias", 317 | "attr": { 318 | "kernel": "(3, 3)", 319 | 
"num_filter": "192", 320 | "stride": "(1, 1)" 321 | }, 322 | "inputs": [] 323 | }, 324 | { 325 | "op": "Convolution", 326 | "name": "conv42", 327 | "attr": { 328 | "kernel": "(3, 3)", 329 | "num_filter": "192", 330 | "stride": "(1, 1)" 331 | }, 332 | "inputs": [[35, 1, 0], [39, 0, 0], [40, 0, 0]] 333 | }, 334 | { 335 | "op": "Concat", 336 | "name": "conv4", 337 | "attr": {"num_args": "2"}, 338 | "inputs": [[38, 0, 0], [41, 0, 0]] 339 | }, 340 | { 341 | "op": "null", 342 | "name": "bn4_gamma", 343 | "attr": {"op_type": "custbatchnorm"}, 344 | "inputs": [] 345 | }, 346 | { 347 | "op": "null", 348 | "name": "bn4_beta", 349 | "attr": {"op_type": "custbatchnorm"}, 350 | "inputs": [] 351 | }, 352 | { 353 | "op": "null", 354 | "name": "bn4_moving_mean", 355 | "attr": {"op_type": "custbatchnorm"}, 356 | "inputs": [] 357 | }, 358 | { 359 | "op": "null", 360 | "name": "bn4_moving_sigma", 361 | "attr": {"op_type": "custbatchnorm"}, 362 | "inputs": [] 363 | }, 364 | { 365 | "op": "Custom", 366 | "name": "bn4", 367 | "attr": {"op_type": "custbatchnorm"}, 368 | "inputs": [[42, 0, 0], [43, 0, 0], [44, 0, 0], [45, 0, 0], [46, 0, 0]] 369 | }, 370 | { 371 | "op": "Activation", 372 | "name": "relu4", 373 | "attr": {"act_type": "relu"}, 374 | "inputs": [[47, 0, 0]] 375 | }, 376 | { 377 | "op": "SliceChannel", 378 | "name": "sliced3", 379 | "attr": {"num_outputs": "2"}, 380 | "inputs": [[48, 0, 0]] 381 | }, 382 | { 383 | "op": "null", 384 | "name": "conv51_weight", 385 | "attr": { 386 | "kernel": "(3, 3)", 387 | "num_filter": "128", 388 | "stride": "(1, 1)" 389 | }, 390 | "inputs": [] 391 | }, 392 | { 393 | "op": "null", 394 | "name": "conv51_bias", 395 | "attr": { 396 | "kernel": "(3, 3)", 397 | "num_filter": "128", 398 | "stride": "(1, 1)" 399 | }, 400 | "inputs": [] 401 | }, 402 | { 403 | "op": "Convolution", 404 | "name": "conv51", 405 | "attr": { 406 | "kernel": "(3, 3)", 407 | "num_filter": "128", 408 | "stride": "(1, 1)" 409 | }, 410 | "inputs": [[49, 0, 0], [50, 0, 0], [51, 0, 
0]] 411 | }, 412 | { 413 | "op": "null", 414 | "name": "conv52_weight", 415 | "attr": { 416 | "kernel": "(3, 3)", 417 | "num_filter": "128", 418 | "stride": "(1, 1)" 419 | }, 420 | "inputs": [] 421 | }, 422 | { 423 | "op": "null", 424 | "name": "conv52_bias", 425 | "attr": { 426 | "kernel": "(3, 3)", 427 | "num_filter": "128", 428 | "stride": "(1, 1)" 429 | }, 430 | "inputs": [] 431 | }, 432 | { 433 | "op": "Convolution", 434 | "name": "conv52", 435 | "attr": { 436 | "kernel": "(3, 3)", 437 | "num_filter": "128", 438 | "stride": "(1, 1)" 439 | }, 440 | "inputs": [[49, 1, 0], [53, 0, 0], [54, 0, 0]] 441 | }, 442 | { 443 | "op": "Concat", 444 | "name": "conv5", 445 | "attr": {"num_args": "2"}, 446 | "inputs": [[52, 0, 0], [55, 0, 0]] 447 | } 448 | ], 449 | "arg_nodes": [ 450 | 0, 451 | 1, 452 | 2, 453 | 4, 454 | 5, 455 | 6, 456 | 7, 457 | 12, 458 | 13, 459 | 15, 460 | 16, 461 | 19, 462 | 20, 463 | 21, 464 | 22, 465 | 26, 466 | 27, 467 | 29, 468 | 30, 469 | 31, 470 | 32, 471 | 36, 472 | 37, 473 | 39, 474 | 40, 475 | 43, 476 | 44, 477 | 45, 478 | 46, 479 | 50, 480 | 51, 481 | 53, 482 | 54 483 | ], 484 | "node_row_ptr": [ 485 | 0, 486 | 1, 487 | 2, 488 | 3, 489 | 4, 490 | 5, 491 | 6, 492 | 7, 493 | 8, 494 | 9, 495 | 10, 496 | 11, 497 | 13, 498 | 14, 499 | 15, 500 | 16, 501 | 17, 502 | 18, 503 | 19, 504 | 20, 505 | 21, 506 | 22, 507 | 23, 508 | 24, 509 | 25, 510 | 26, 511 | 27, 512 | 28, 513 | 29, 514 | 30, 515 | 31, 516 | 32, 517 | 33, 518 | 34, 519 | 35, 520 | 36, 521 | 38, 522 | 39, 523 | 40, 524 | 41, 525 | 42, 526 | 43, 527 | 44, 528 | 45, 529 | 46, 530 | 47, 531 | 48, 532 | 49, 533 | 50, 534 | 51, 535 | 53, 536 | 54, 537 | 55, 538 | 56, 539 | 57, 540 | 58, 541 | 59, 542 | 60 543 | ], 544 | "heads": [[56, 0, 0]], 545 | "attrs": {"mxnet_version": ["int", 901]} 546 | } -------------------------------------------------------------------------------- /run_tracker.py: -------------------------------------------------------------------------------- 1 | import mxnet as mx 
import numpy as np
import CustomMxOp
import cv2
from minpy.core import Function
import os
import glob
from utils import GetRectange

# Affine calibration applied to the raw cross-correlation score map
# (constants carried over from the original siamese-fc release).
adjust_f = 0.0010
adjust_b = -2.1484

def imshow(img, winname="display", wk=0):
    """Display img in an OpenCV window, converting float32 frames to uint8 first."""
    show = img
    if show.dtype == np.float32:
        show = img.astype(np.uint8)
    #print show.dtype
    cv2.imshow(winname, show)
    cv2.waitKey(wk)

def avoid_empty_position(r_max, c_max, params):
    """Fall back to the score-map centre when either peak coordinate is None."""
    if r_max is None:
        r_max = np.ceil(params['scoreSize']/2.)
    if c_max is None:
        c_max = np.ceil(params['scoreSize']/2.)
    return (r_max, c_max)

def cross_correlation_factory(data_shape, kernel_shape):
    """Build a reusable single-filter convolution bound to data_shape.

    The exemplar features act as the (only) convolution kernel, so the op
    effectively cross-correlates the search features with the exemplar and
    produces a one-channel response map.
    """
    batch, num_filter, y, x = kernel_shape
    net = mx.sym.Variable('x')
    net = mx.sym.Convolution(net, name='conv', kernel=(y, x), num_filter=1, no_bias=True)
    conv = Function(net, input_shapes={'x': data_shape})
    return conv

def cross_correlation(data, kernel):
    """One-shot variant of cross_correlation_factory: build the op and apply it."""
    batch, num_filter, y, x = kernel.shape
    net = mx.sym.Variable('x')
    net = mx.sym.Convolution(net, name='conv', kernel=(y, x), num_filter=1, no_bias=True)
    conv = Function(net, input_shapes={'x': data.shape})
    #print conv._param_shapes
    res = conv(x=data, conv_weight=kernel)
    return res

def tracker_eval(net_x, s_x, z_features, x_crops, targetPosition, window, p, Conv=None):
    """
    Runs a forward pass of the search-region branch of the pre-trained
    Fully-Convolutional Siamese network, reusing the features of the exemplar z
    computed at the first frame. Returns (newTargetPosition, bestScale).
    """
    # forward pass, using the pyramid of scaled crops as a "batch"
    # NOTE(review): adjust_data is not defined in this part of the file —
    # presumably declared elsewhere in the module; verify.
    x_crops = adjust_data(x_crops)
    data_iter = mx.io.NDArrayIter(x_crops)
    #net_x.bind(data_shapes=data_iter.provide_data, for_training=False)
    x_features = net_x.predict(data_iter)
    if Conv is None:
        Conv = cross_correlation_factory(x_features.shape, z_features.shape)
    # Score map: exemplar features cross-correlated over the search features,
    # then linearly calibrated with the global adjust_f / adjust_b constants.
    responseMaps = Conv(x=x_features, conv_weight=z_features).asnumpy()
    responseMaps = responseMaps * adjust_f + adjust_b
    upsz = p['scoreSize'] * p['responseUp']
    #responseMapsUP = np.zeros((upsz, upsz, p['numScale']), dtype=np.float32)
    responseMapsUP = []
    # Choose the scale whose response map has the highest peak
    if p['numScale'] > 1:
        currentScaleID = int(p['numScale']/2)
        bestScale = currentScaleID
        bestPeak = -float('Inf')
        for s in range(p['numScale']):
            if p['responseUp'] > 1:
                # upsample to improve accuracy
                responseMapsUP.append(cv2.resize(responseMaps[s,0,:,:,], (upsz, upsz), interpolation=cv2.INTER_CUBIC))
            else:
                responseMapsUP.append(responseMaps[s,0,:,:,])
            thisResponse = responseMapsUP[-1]
            # penalize change of scale
            if s != currentScaleID:
                thisResponse = thisResponse * p['scalePenalty']
            thisPeak = np.max(thisResponse)
            if thisPeak > bestPeak:
                bestPeak = thisPeak
                bestScale = s
        responseMap = responseMapsUP[bestScale]
    else:
        #responseMap = responseMapsUP
        responseMap = cv2.resize(responseMaps[0,0,:,:,], (upsz, upsz), interpolation=cv2.INTER_CUBIC)
        bestScale = 0

    # make the response map sum to 1
    responseMap = responseMap - np.min(responseMap)
    responseMap = responseMap / np.sum(responseMap)
    # apply windowing (convex combination with the cosine window)
    responseMap = (1-p['wInfluence'])*responseMap + p['wInfluence']*window
    r_max, c_max = np.unravel_index(responseMap.argmax(), responseMap.shape)
    #r_max, c_max = avoid_empty_position(r_max, c_max, p)
    p_corr = np.array((r_max, c_max))
    # Convert crop-relative coordinates to frame coordinates:
    # displacement from the center in instance final representation ...
    disp_instanceFinal = p_corr - int(p['scoreSize']*p['responseUp']/2)
    # ... in instance input ...
    disp_instanceInput = disp_instanceFinal * p['totalStride'] / p['responseUp']
    # ... in instance original crop (in frame coordinates)
    disp_instanceFrame = disp_instanceInput * s_x / p['instanceSize']
    # position within frame in frame coordinates
    newTargetPosition = targetPosition + disp_instanceFrame

    return newTargetPosition, bestScale

def config_params():
    """Return the default SiamFC-3S hyper-parameter dictionary."""
    p = {}
    # These are the default hyper-params for SiamFC-3S
    # The ones for SiamFC (5 scales) are in params-5s.txt
    p['numScale'] = 3
    p['scaleStep'] = 1.0375
    p['scalePenalty'] = 0.9745
    p['scaleLR'] = 0.59 # damping factor for scale update
    p['responseUp'] = 16 # upsampling the small 17x17 response helps with the accuracy
    p['windowing'] = 'cosine' # to penalize large displacements
    p['wInfluence'] = 0.176 # windowing influence (in convex sum)
    p['net_base_path'] = 'model/'
    p['net'] = 'mxmodel_bgr'
    # execution, visualization, benchmark
    p['seq_base_path'] = 'images/demo-sequences/'
    p['video'] = 'vot15_bag'
    p['visualization'] = False
    p['gpus'] = 0
    p['bbox_output'] = False
    p['fout'] = -1
    # Params from the network architecture, have to be consistent with the training
    p['exemplarSize'] = 127
    p['instanceSize'] = 255
    p['scoreSize'] = 17
    p['totalStride'] = 8
    p['contextAmount'] = 0.5
    p['subMean'] = False

    return p

def get_axis_aligned_BB(region):
    """
    computes axis-aligned bbox (cx, cy, w, h) with same area as the rotated one (REGION)
    """
    region = np.array(region)
    nv = region.size
    assert (nv==8 or nv==4)
    if nv==8:
        # 8 values = 4 polygon corners (x interleaved with y)
        cx = np.mean(region[0::2])
        cy = np.mean(region[1::2])
        x1 = np.min(region[0::2])
        x2 = np.max(region[0::2])
        y1 = np.min(region[1::2])
        y2 = np.max(region[1::2])
        # Scale the axis-aligned extent so its area matches the rotated box.
        A1 = np.linalg.norm(region[0:2]-region[2:4]) * np.linalg.norm(region[2:4]-region[4:6])
        A2 = (x2 - x1) * (y2 - y1)
        s = np.sqrt(A1/A2)
        w = s * (x2 - x1) + 1
        h = s * (y2 - y1) + 1
    else:
        x = region[0]
        y = region[1]
        # NOTE(review): for a 4-element (x, y, w, h) region these indices look
        # off by one — region[4] is out of bounds here; expected region[2] and
        # region[3]. Confirm against the groundtruth format before relying on
        # this branch.
        w = region[3]
        h = region[4]
        cx = x + w / 2
        cy = y + h / 2
    return (cx-1, cy-1, w, h)

def frame_generator(vpath, mode):
    """Yield float32 frames from an image folder ("images") or a capture source ("video"/"camera")."""
    if mode == "images":
        def frames():
            # NOTE(review): glob returns files in arbitrary order, so frames
            # may come out unsorted — consider sorted(glob.glob(...)).
            for img in glob.glob(os.path.join(vpath, "*.jpg")):
                yield cv2.imread(img).astype(np.float32)
        return frames()
    elif mode == "video" or mode == "camera":
        def frames():
            cap = cv2.VideoCapture(vpath)
            while 1:
                ret, frame = cap.read()
                if ret:
                    yield frame.astype(np.float32)
                else:
                    break
        return frames()

def load_video_info(base_path, video):
    """Load a VOT-style sequence: returns (frame generator, initial pos, target size)."""
    # full path to the video's files
    video_path = os.path.join(base_path, video, "imgs/")
    # load ground truth from text file (only the first line is used)
    ground_truth_path = os.path.join(base_path, video, "groundtruth.txt")
    ground_truth = open(ground_truth_path)
    raw1 = ground_truth.readline()
    #print "raw1", raw1
    region = map(float, raw1.strip().split(","))
    cx, cy, w, h = get_axis_aligned_BB(region)
    # tracker state is kept in (row, col) = (y, x) order
    pos = (cy, cx)
    target_sz = (h, w)

    return frame_generator(video_path, mode="images"), np.array(pos), np.array(target_sz)

def load_camera(device):
    """Open a capture device, let the user pick the target box, return (frames, frame, pos, target_sz).

    Preview loop: shows live frames until any key is pressed, then the current
    frame is handed to the rectangle selector.
    """
    cap = cv2.VideoCapture(device)
    rector = GetRectange()
    while 1:
        ready, frame = cap.read()
        if not ready:
            # NOTE(review): no break/continue here — a failed read falls
            # through to cv2.imshow with an invalid frame; verify intended.
            print "device", device, "is not ready"
        cv2.imshow("frame", frame)
        key = cv2.waitKey(1)
        if key != -1:
            break
    pos, target_sz = rector.getRect(frame)
    def frames():
        while 1:
            ret, frame = cap.read()
            if ret:
                yield frame.astype(np.float32)
            else:
                # NOTE(review): this branch prints forever once the device
                # stops delivering frames — probably should break/return.
                print "device", device, "is not ready"
    return frames(), frame, np.array(pos), np.array(target_sz)

def get_subwindow_tracking(im, pos, model_sz, original_sz, avgChans):
    """
    Obtain image sub-window, padding with avg channel if area goes outside of border
    """
    if original_sz is None:
        original_sz = model_sz
    sz = original_sz
    im_sz = im.shape
    # make sure the size is not too small
    assert min(im_sz[:2]) > 2, "the size is too small"
    c = (np.array(sz) + 1) / 2

    # crop window around pos; pads are how far it spills past each border
    context_xmin = round(pos[1] - c[1])
    context_xmax = context_xmin + sz[1] - 1
    context_ymin = round(pos[0] - c[0])
    context_ymax = context_ymin + sz[0] - 1
    left_pad = max(0, int(-context_xmin))
    top_pad = max(0, int(-context_ymin))
    right_pad = max(0, int(context_xmax - im_sz[1] + 1))
    bottom_pad = max(0, int(context_ymax - im_sz[0] + 1))

    # shift the crop window into the padded image's coordinate frame
    context_xmin = int(context_xmin + left_pad)
    context_xmax = int(context_xmax + left_pad)
    context_ymin = int(context_ymin + top_pad)
    context_ymax = int(context_ymax + top_pad)

    if top_pad or left_pad or bottom_pad or right_pad:
        # pad each channel with its own average value (channels 0/1/2 = b/g/r)
        b = np.pad(im[:,:,0], ((top_pad,bottom_pad),(left_pad,right_pad)), mode='constant', constant_values=avgChans[0])
        g = np.pad(im[:,:,1], ((top_pad,bottom_pad),(left_pad,right_pad)), mode='constant', constant_values=avgChans[1])
        r = np.pad(im[:,:,2], ((top_pad,bottom_pad),(left_pad,right_pad)), mode='constant', constant_values=avgChans[2])
        im = cv2.merge((b,g,r))
        #imshow(im)

    im_patch_original = im[context_ymin:context_ymax+1, context_xmin:context_xmax+1, :]
    if not np.array_equal(model_sz, original_sz):
        im_patch = cv2.resize(im_patch_original, model_sz)
else: 255 | im_patch = im_patch_original 256 | 257 | return im_patch, im_patch_original 258 | 259 | def adjust_data(data): 260 | """ 261 | adjust the input from (h, w, c) to ( 1, c, h, w) for network input 262 | 263 | Parameters: 264 | ---------- 265 | in_data: numpy array of shape (h, w, c) or (n, h, w, c) 266 | input data 267 | Returns: 268 | ------- 269 | out_data: numpy array of shape (1, c, h, w) or (n, c, h, w) 270 | """ 271 | if data.dtype is not np.dtype('float32'): 272 | data = data.astype(np.float32) 273 | print "convert to float32" 274 | 275 | if len(data.shape) < 4: 276 | data = np.expand_dims(data, axis=0) 277 | data = np.moveaxis(data, -1, -3) 278 | 279 | return data 280 | 281 | def make_scale_pyramid(im, targetPosition, in_side_scaled, out_side, avgChans, stats, p): 282 | """ 283 | computes a pyramid of re-scaled copies of the target (centered on TARGETPOSITION) 284 | and resizes them to OUT_SIDE. If crops exceed image boundaries they are padded with AVGCHANS. 285 | 286 | """ 287 | in_side_scaled = np.round(in_side_scaled) 288 | max_target_side = int(round(in_side_scaled[-1])) 289 | min_target_side = int(round(in_side_scaled[0])) 290 | beta = out_side / float(min_target_side) 291 | # size_in_search_area = beta * size_in_image 292 | # e.g. out_side = beta * min_target_side 293 | search_side = int(round(beta * max_target_side)) 294 | search_region, _ = get_subwindow_tracking(im, targetPosition, (search_side, search_side), (max_target_side, max_target_side), avgChans) 295 | if p['subMean']: 296 | pass 297 | assert round(beta*min_target_side) == int(out_side) 298 | 299 | tmp_list = [] 300 | tmp_pos = ((search_side-1)/2., (search_side-1)/2.) 
301 | for s in range(p['numScale']): 302 | target_side = round(beta * in_side_scaled[s]) 303 | tmp_region, _ = get_subwindow_tracking(search_region, tmp_pos, (out_side, out_side), (target_side, target_side), avgChans) 304 | tmp_list.append(tmp_region) 305 | 306 | pyramid = np.stack(tmp_list) 307 | 308 | return pyramid 309 | 310 | def tracker(demo=True): 311 | p = config_params() 312 | # Load two copies of the pre-trained network 313 | net_z = mx.mod.Module.load(p['net_base_path']+p['net'], 1, context=mx.gpu(0)) 314 | data_iter = mx.io.NDArrayIter(data=np.zeros((1,3,p['exemplarSize'],p['exemplarSize']))) 315 | net_z.bind(data_shapes=data_iter.provide_data, for_training=False) 316 | net_x = mx.mod.Module.load(p['net_base_path']+p['net'], 1, context=mx.gpu(0)) 317 | data_iter = mx.io.NDArrayIter(data=np.zeros((3,3,p['instanceSize'],p['instanceSize']))) 318 | net_x.bind(data_shapes=data_iter.provide_data, for_training=False) 319 | Conv = cross_correlation_factory((3,256,22,22), (1,256,6,6)) 320 | 321 | if demo: 322 | imgFiles, targetPosition, targetSize = load_video_info(p['seq_base_path'], p['video']) 323 | im = imgFiles.next() 324 | else: 325 | imgFiles, im, targetPosition, targetSize = load_camera(0) 326 | 327 | wc_z = targetSize[1] + p['contextAmount']*np.sum(targetSize) 328 | hc_z = targetSize[0] + p['contextAmount']*np.sum(targetSize) 329 | s_z = np.sqrt(wc_z*hc_z) 330 | scale_z = p['exemplarSize'] / s_z 331 | 332 | 333 | d_search = (p['instanceSize'] - p['exemplarSize']) / 2 334 | pad = d_search / scale_z 335 | s_x = s_z + 2*pad 336 | 337 | # arbitrary scale saturation 338 | min_s_x = 0.2*s_x 339 | max_s_x = 5*s_x 340 | 341 | winsz = p['scoreSize'] * p['responseUp'] 342 | if p['windowing'] == 'cosine': 343 | hann = np.hanning(winsz).reshape(winsz, 1) 344 | window = hann.dot(hann.T) 345 | elif p['windowing'] == 'uniform': 346 | window = np.ones((winsz, winsz), dtype=float32) 347 | 348 | # make the window sum 1 349 | window = window / np.sum(window) 350 | scales = 
np.array([p['scaleStep'] ** i for i in range(int(np.ceil(p['numScale']/2.)-p['numScale']), int(np.floor(p['numScale']/2)+1))]) 351 | 352 | # prepare for first frame 353 | # get avg for padding 354 | avgChans = np.mean(im, axis=(0,1)) 355 | # initialize the exemplar 356 | z_crop, _ = get_subwindow_tracking(im, targetPosition, (p['exemplarSize'],p['exemplarSize']), (round(s_z), round(s_z)), avgChans) 357 | #imshow(z_crop) 358 | if p['subMean']: 359 | pass 360 | # evaluate the offline-trained network for exemplar z features 361 | data_iter = mx.io.NDArrayIter(adjust_data(z_crop)) 362 | z_features = net_z.predict(data_iter) 363 | 364 | for i, im in enumerate(imgFiles): 365 | scaledInstance = s_x * scales 366 | scaledTarget = np.array([ targetSize*scale for scale in scales ]) 367 | # extract scaled crops for search region x at previous target position 368 | x_crops = make_scale_pyramid(im, targetPosition, scaledInstance, p['instanceSize'], avgChans, None, p) 369 | # evaluate the offline-trained network for exemplar x features 370 | newTargetPosition, newScale = tracker_eval(net_x, round(s_x), z_features, x_crops, targetPosition, window, p, Conv) 371 | targetPosition = newTargetPosition 372 | # scale damping and saturation 373 | s_x = max(min_s_x, min(max_s_x, (1-p['scaleLR'])*s_x + p['scaleLR']*scaledInstance[newScale])) 374 | targetSize = (1-p['scaleLR'])*targetSize + p['scaleLR']*scaledTarget[newScale] 375 | 376 | rectPosition = targetPosition - targetSize / 2. 
377 | tl = tuple(np.round(rectPosition).astype(int)[::-1]) 378 | br = tuple(np.round(rectPosition+targetSize).astype(int)[::-1]) 379 | im_draw = im.astype(np.uint8) 380 | cv2.rectangle(im_draw, tl, br, (0, 255, 255), thickness=3) 381 | cv2.imshow("tracking", im_draw) 382 | cv2.waitKey(1) 383 | 384 | 385 | if __name__ == "__main__": 386 | tracker() 387 | 388 | -------------------------------------------------------------------------------- /transfer_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.io as sio 3 | import mxnet as mx 4 | import cPickle as pickle 5 | import CustomMxOp 6 | 7 | def move_weight_axis(x): 8 | return np.moveaxis(np.moveaxis(x, 2, 0), 3, 0) 9 | 10 | def load_model_from_matlab(mat_model_path, raw_model_path): 11 | mdata = sio.loadmat(mat_model_path) 12 | mdata = mdata['model'][0] 13 | n_params = len(mdata) 14 | print n_params 15 | 16 | need_move_axis = ['conv1f', 'conv2f', 'conv3f', 'conv4f', 'conv5f'] 17 | 18 | model_dict = {} 19 | 20 | for param in mdata: 21 | name = param[0][0] 22 | data = param[1] 23 | if name in need_move_axis: 24 | data = move_weight_axis(data) 25 | elif not name.endswith('x'): 26 | data = data.flatten() 27 | 28 | print name, data.shape 29 | 30 | model_dict[name] = data 31 | 32 | pickle.dump(model_dict, open(raw_model_path, mode='wb'), protocol=pickle.HIGHEST_PROTOCOL) 33 | 34 | def get_sym_siamese_fc(): 35 | # conv1 36 | data = mx.sym.Variable('data') 37 | net = mx.sym.Convolution(data, name='conv1', kernel=(11,11), num_filter=96, stride=(2,2)) 38 | net = mx.sym.Custom(net, name='bn1', op_type='custbatchnorm') 39 | net = mx.sym.Activation(net, name='relu1', act_type='relu') 40 | net = mx.sym.Pooling(net, name='pool1', kernel=(3,3), pool_type='max', stride=(2,2)) 41 | 42 | # conv2 43 | nets = mx.sym.SliceChannel(net, num_outputs=2, name="sliced1") 44 | net1 = mx.sym.Convolution(nets[0], name='conv21', kernel=(5,5), num_filter=128, 
stride=(1,1)) 45 | net2 = mx.sym.Convolution(nets[1], name='conv22', kernel=(5,5), num_filter=128, stride=(1,1)) 46 | net = mx.sym.Concat(net1, net2, name="conv2") 47 | net = mx.sym.Custom(net, name='bn2', op_type='custbatchnorm') 48 | net = mx.sym.Activation(net, name='relu2', act_type='relu') 49 | net = mx.sym.Pooling(net, name='pool2', kernel=(3,3), pool_type='max', stride=(2,2)) 50 | 51 | # conv3 52 | net = mx.sym.Convolution(net, name='conv3', kernel=(3,3), num_filter=384, stride=(1,1)) 53 | net = mx.sym.Custom(net, name='bn3', op_type='custbatchnorm') 54 | net = mx.sym.Activation(net, name='relu3', act_type='relu') 55 | 56 | # conv4 57 | nets = mx.sym.SliceChannel(net, num_outputs=2, name="sliced2") 58 | net1 = mx.sym.Convolution(nets[0], name='conv41', kernel=(3,3), num_filter=192, stride=(1,1)) 59 | net2 = mx.sym.Convolution(nets[1], name='conv42', kernel=(3,3), num_filter=192, stride=(1,1)) 60 | net = mx.sym.Concat(net1, net2, name="conv4") 61 | net = mx.sym.Custom(net, name='bn4', op_type='custbatchnorm') 62 | net = mx.sym.Activation(net, name='relu4', act_type='relu') 63 | 64 | # conv5 65 | nets = mx.sym.SliceChannel(net, num_outputs=2, name="sliced3") 66 | net1 = mx.sym.Convolution(nets[0], name='conv51', kernel=(3,3), num_filter=128, stride=(1,1)) 67 | net2 = mx.sym.Convolution(nets[1], name='conv52', kernel=(3,3), num_filter=128, stride=(1,1)) 68 | net = mx.sym.Concat(net1, net2, name="conv5") 69 | 70 | return net 71 | 72 | def gen_mx_model(raw_model_path, mx_model_path, mode="rgb"): 73 | net = get_sym_siamese_fc() 74 | 75 | model = mx.mod.Module(net) 76 | data_iter = mx.io.NDArrayIter(data=np.zeros((1,3,127,127))) 77 | model.bind(data_shapes=data_iter.provide_data) 78 | 79 | raw_model = pickle.load(open(raw_model_path, "rb")) 80 | 81 | if mode == "bgr": 82 | #print "conv1 shape", raw_model['conv1f'].shape 83 | #print raw_model['conv1f'][0,0,0,0], raw_model['conv1f'][0,2,0,0] 84 | # swap channels 85 | raw_model['conv1f'][:, 0, :, :], 
raw_model['conv1f'][:, 2, :, :] = raw_model['conv1f'][:, 2, :, :], raw_model['conv1f'][:, 0, :, :].copy() 86 | #print raw_model['conv1f'][0,0,0,0], raw_model['conv1f'][0,2,0,0] 87 | 88 | arg_params = { 89 | "conv1_weight": mx.nd.array(raw_model['conv1f']), 90 | "conv1_bias": mx.nd.array(raw_model['conv1b']), 91 | "bn1_beta": mx.nd.array(raw_model['bn1b']), 92 | "bn1_gamma": mx.nd.array(raw_model['bn1m']), 93 | "bn1_moving_mean": mx.nd.array(raw_model['bn1x'][:,0]), 94 | "bn1_moving_sigma" : mx.nd.array(raw_model['bn1x'][:,1]), 95 | 96 | "conv21_weight": mx.nd.array(raw_model['conv2f'][:128]), 97 | "conv21_bias": mx.nd.array(raw_model['conv2b'][:128]), 98 | "conv22_weight": mx.nd.array(raw_model['conv2f'][128:]), 99 | "conv22_bias": mx.nd.array(raw_model['conv2b'][128:]), 100 | "bn2_beta": mx.nd.array(raw_model['bn2b']), 101 | "bn2_gamma": mx.nd.array(raw_model['bn2m']), 102 | "bn2_moving_mean": mx.nd.array(raw_model['bn2x'][:,0]), 103 | "bn2_moving_sigma" : mx.nd.array(raw_model['bn2x'][:,1]), 104 | 105 | "conv3_weight": mx.nd.array(raw_model['conv3f']), 106 | "conv3_bias": mx.nd.array(raw_model['conv3b']), 107 | "bn3_beta": mx.nd.array(raw_model['bn3b']), 108 | "bn3_gamma": mx.nd.array(raw_model['bn3m']), 109 | "bn3_moving_mean": mx.nd.array(raw_model['bn3x'][:,0]), 110 | "bn3_moving_sigma" : mx.nd.array(raw_model['bn3x'][:,1]), 111 | 112 | "conv41_weight": mx.nd.array(raw_model['conv4f'][:192]), 113 | "conv41_bias": mx.nd.array(raw_model['conv4b'][:192]), 114 | "conv42_weight": mx.nd.array(raw_model['conv4f'][192:]), 115 | "conv42_bias": mx.nd.array(raw_model['conv4b'][192:]), 116 | "bn4_beta": mx.nd.array(raw_model['bn4b']), 117 | "bn4_gamma": mx.nd.array(raw_model['bn4m']), 118 | "bn4_moving_mean": mx.nd.array(raw_model['bn4x'][:,0]), 119 | "bn4_moving_sigma" : mx.nd.array(raw_model['bn4x'][:,1]), 120 | 121 | "conv51_weight": mx.nd.array(raw_model['conv5f'][:128]), 122 | "conv51_bias": mx.nd.array(raw_model['conv5b'][:128]), 123 | "conv52_weight": 
mx.nd.array(raw_model['conv5f'][128:]), 124 | "conv52_bias": mx.nd.array(raw_model['conv5b'][128:]), 125 | } 126 | 127 | mx_model_path = mx_model_path + "_" + mode 128 | model.init_params(arg_params=arg_params) 129 | 130 | #out_params = model.get_params() 131 | #print out_params 132 | 133 | model.save_checkpoint(prefix=mx_model_path, epoch=1) 134 | 135 | 136 | def adjust_data(data): 137 | """ 138 | adjust the input from (h, w, c) to ( 1, c, h, w) for network input 139 | 140 | Parameters: 141 | ---------- 142 | in_data: numpy array of shape (h, w, c) 143 | input data 144 | Returns: 145 | ------- 146 | out_data: numpy array of shape (1, c, h, w) 147 | """ 148 | 149 | if data.dtype is not np.dtype('float32'): 150 | data = data.astype(np.float32) 151 | print "convert to float32" 152 | 153 | data = np.expand_dims(data, axis=0) 154 | data = np.moveaxis(data, 3, 1) 155 | 156 | return data 157 | 158 | def test_bgr_model(): 159 | mx_model_path = "model/mxmodel_bgr" 160 | model = mx.model.FeedForward.load(mx_model_path, 1, ctx=mx.cpu(0)) 161 | import cv2 162 | img = cv2.imread("images/z_crop.jpg") 163 | 164 | #img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 165 | 166 | img = adjust_data(img) 167 | res = model.predict(img) 168 | print res.shape 169 | print res[0][0] 170 | 171 | def test_model(mx_model_path, mode="rgb"): 172 | mx_model_path = mx_model_path + "_" + mode 173 | import time 174 | model = mx.model.FeedForward.load(mx_model_path, 1, ctx=mx.cpu(0)) 175 | 176 | z_crop = sio.loadmat("data/z_crop.mat")["z_crop"] 177 | 178 | #import cv2 179 | #draw = z_crop.copy().astype(np.uint8) 180 | #draw = cv2.cvtColor(draw, cv2.COLOR_BGR2RGB) 181 | #cv2.imshow("show", draw) 182 | #cv2.waitKey(0) 183 | 184 | z_crop = np.expand_dims(z_crop, axis=0) 185 | z_crop = np.moveaxis(z_crop, 3, 1) 186 | 187 | print "z_crop", z_crop.shape 188 | time0 = time.time() 189 | res = model.predict(z_crop) 190 | print "time used", time.time() - time0 191 | 192 | print res[0][0] 193 | 194 | if __name__ == 
"__main__": 195 | mat_model_path = "model/model.mat" 196 | raw_model_path = "model/model_dict.pkl" 197 | mx_model_path = "model/mxmodel" 198 | 199 | #load_model_from_matlab(mat_model_path, raw_model_path) 200 | #gen_mx_model(raw_model_path, mx_model_path, mode="rgb") 201 | #test_model(mx_model_path, "rgb") 202 | 203 | gen_mx_model(raw_model_path, mx_model_path, mode="bgr") 204 | #test_bgr_model() 205 | 206 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | 3 | clicked = False 4 | P1 = (0, 0) 5 | P2 = (0, 0) 6 | 7 | def onMouse(event, x, y, flags, param): 8 | if event == cv2.EVENT_LBUTTONDOWN: 9 | param['clicked'] = True 10 | param['P1'] = (x, y) 11 | param['P2'] = (x, y) 12 | elif event == cv2.EVENT_MOUSEMOVE: 13 | if param['clicked']: 14 | param['P2'] = (x, y) 15 | elif event == cv2.EVENT_LBUTTONUP: 16 | param['p2'] = (x, y) 17 | param['clicked'] = False 18 | 19 | class GetRectange: 20 | def __init__(self): 21 | self.onClick = False 22 | 23 | def getRect(self, img): 24 | winname = "get rect" 25 | param = {} 26 | param['P1'] = (0, 0) 27 | param['P2'] = (0, 0) 28 | param['clicked'] = False 29 | cv2.imshow(winname, img) 30 | cv2.setMouseCallback(winname, onMouse, param) 31 | 32 | while 1: 33 | key = cv2.waitKey(1) 34 | if param['clicked']: 35 | img_clone = img.copy() 36 | cv2.rectangle(img_clone, param['P1'], param['P2'], (0, 255, 0)) 37 | cv2.imshow(winname, img_clone) 38 | self.onClick = True 39 | if self.onClick and not param['clicked']: 40 | self.onClick = False 41 | left = min(param['P1'][0], param['P2'][0]) 42 | right = max(param['P1'][0], param['P2'][0]) 43 | top = min(param['P1'][1], param['P2'][1]) 44 | dowm = max(param['P1'][1], param['P2'][1]) 45 | cv2.destroyWindow(winname) 46 | h = dowm-top+1 47 | w = right-left+1 48 | return (top+h/2., left+w/2.), (h, w) 49 | 50 | if __name__ == "__main__": 51 | cap = 
cv2.VideoCapture(0) 52 | rector = GetRectange() 53 | while 1: 54 | ready, frame = cap.read() 55 | if not ready: 56 | print "device", device, "is not ready" 57 | cv2.imshow("frame", frame) 58 | key = cv2.waitKey(1) 59 | if key != -1: 60 | break 61 | rect = rector.getRect(frame) 62 | print rect --------------------------------------------------------------------------------