├── .gitignore ├── CustomMxOp.py ├── README.md ├── images └── demo-sequences │ └── vot15_bag.7z ├── model ├── model.mat ├── model_dict.pkl ├── mxmodel_bgr-0001.params ├── mxmodel_bgr-symbol.json ├── mxmodel_rgb-0001.params └── mxmodel_rgb-symbol.json ├── run_tracker.py ├── transfer_model.py └── utils.py
/.gitignore:
--------------------------------------------------------------------------------
*.pyc
test_model.py
--------------------------------------------------------------------------------
/CustomMxOp.py:
--------------------------------------------------------------------------------
import mxnet as mx
import minpy.numpy as np  # NOTE(review): appears unused in this module — verify before removing

class CustBatchNorm(mx.operator.CustomOp):
    """Inference-only batch normalization using pre-computed moving statistics.

    Computes y = gamma * (x - moving_mean) / (moving_sigma + 1e-9) + beta per
    channel. moving_sigma is divided directly (no sqrt), so it is expected to
    already hold the per-channel standard deviation, not the variance.
    """

    def forward(self, is_train, req, in_data, out_data, aux):
        #mx.nd.add(lhs, rhs)
        # Swap axes 1 and 3 so the channel axis is last; the (C,) statistics
        # then broadcast over the leading/spatial dimensions.
        # Assumes 4-D input with channels on axis 1 (NCHW) — see infer_shape.
        x = mx.nd.SwapAxis(in_data[0], 1, 3) # data
        gamma = in_data[1] # gamma
        beta = in_data[2] # beta
        moving_mean = in_data[3] # mean
        moving_sigma = in_data[4] # sigma
        # 1e-9 guards against division by zero in the stored sigma
        x_hat = (x - moving_mean) / (moving_sigma + 1e-9)
        out = gamma * x_hat + beta
        # Restore the original channel-first layout before writing the output.
        out = mx.nd.SwapAxis(out, 1, 3)
        self.assign(out_data[0], req[0], out)

    def backward(self, req, out_grad, in_data, out_data, in_grad, aux):
        # Inference-only operator: no gradient is implemented.
        raise NotImplementedError

@mx.operator.register("custbatchnorm")
class CustBatchNormProp(mx.operator.CustomOpProp):
    """Argument/shape metadata for the "custbatchnorm" custom operator."""

    def __init__(self, need_top_grad=False):
        # need_top_grad=False: this op does not require gradients from above.
        super(CustBatchNormProp, self).__init__(need_top_grad)

    def list_arguments(self):
        # Order must match the in_data[] indexing in CustBatchNorm.forward.
        return ['data', 'gamma', 'beta', 'moving_mean', 'moving_sigma']

    def list_outputs(self):
        return ['output']

    def infer_shape(self, in_shape):
        # data keeps its full shape; all four statistics are (C,) vectors taken
        # from axis 1 of the data; the output mirrors the data shape.
        data_shape = in_shape[0]
        other_shape = (in_shape[0][1],)
        output_shape = in_shape[0]
        return [data_shape, other_shape, other_shape, other_shape, other_shape], [output_shape], []

    def create_operator(self, ctx, in_shapes, in_dtypes):
        return CustBatchNorm()
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ### Python implementation of [siamese-fc](https://github.com/bertinetto/siamese-fc) 2 | 3 | -------------- 4 | 5 | This repository only includes the tracking part of [siamese-fc](https://github.com/bertinetto/siamese-fc). 6 | 7 | -------------- 8 | 9 | #### Dependencies 10 | 11 | - MXNet == 0.9.2 12 | - OpenCV 13 | - Numpy 14 | 15 | #### Usage 16 | 17 | Before running the demo, you need to convert the `matconvnet` model to an `mxnet` model: 18 | 19 | ``` 20 | python transfer_model.py 21 | ``` 22 | 23 | By default there is already an `mxnet` model in the `model` folder with prefix `mxmodel_bgr`, which means you should feed a `BGR` image to the model. If you want to use the `RGB` one, you should modify the tracking code correspondingly. 24 | 25 | Run the default demo: 26 | 27 | ``` 28 | python run_tracker.py 29 | ``` 30 | -------------------------------------------------------------------------------- /images/demo-sequences/vot15_bag.7z: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GarrickLin/py-siamese_fc/f11cb140f7b6a1bb0a054f984a7e6df404d6ad7b/images/demo-sequences/vot15_bag.7z -------------------------------------------------------------------------------- /model/model.mat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GarrickLin/py-siamese_fc/f11cb140f7b6a1bb0a054f984a7e6df404d6ad7b/model/model.mat -------------------------------------------------------------------------------- /model/model_dict.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GarrickLin/py-siamese_fc/f11cb140f7b6a1bb0a054f984a7e6df404d6ad7b/model/model_dict.pkl
-------------------------------------------------------------------------------- /model/mxmodel_bgr-0001.params: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GarrickLin/py-siamese_fc/f11cb140f7b6a1bb0a054f984a7e6df404d6ad7b/model/mxmodel_bgr-0001.params -------------------------------------------------------------------------------- /model/mxmodel_bgr-symbol.json: -------------------------------------------------------------------------------- 1 | { 2 | "nodes": [ 3 | { 4 | "op": "null", 5 | "name": "data", 6 | "inputs": [] 7 | }, 8 | { 9 | "op": "null", 10 | "name": "conv1_weight", 11 | "attr": { 12 | "kernel": "(11, 11)", 13 | "num_filter": "96", 14 | "stride": "(2, 2)" 15 | }, 16 | "inputs": [] 17 | }, 18 | { 19 | "op": "null", 20 | "name": "conv1_bias", 21 | "attr": { 22 | "kernel": "(11, 11)", 23 | "num_filter": "96", 24 | "stride": "(2, 2)" 25 | }, 26 | "inputs": [] 27 | }, 28 | { 29 | "op": "Convolution", 30 | "name": "conv1", 31 | "attr": { 32 | "kernel": "(11, 11)", 33 | "num_filter": "96", 34 | "stride": "(2, 2)" 35 | }, 36 | "inputs": [[0, 0, 0], [1, 0, 0], [2, 0, 0]] 37 | }, 38 | { 39 | "op": "null", 40 | "name": "bn1_gamma", 41 | "attr": {"op_type": "custbatchnorm"}, 42 | "inputs": [] 43 | }, 44 | { 45 | "op": "null", 46 | "name": "bn1_beta", 47 | "attr": {"op_type": "custbatchnorm"}, 48 | "inputs": [] 49 | }, 50 | { 51 | "op": "null", 52 | "name": "bn1_moving_mean", 53 | "attr": {"op_type": "custbatchnorm"}, 54 | "inputs": [] 55 | }, 56 | { 57 | "op": "null", 58 | "name": "bn1_moving_sigma", 59 | "attr": {"op_type": "custbatchnorm"}, 60 | "inputs": [] 61 | }, 62 | { 63 | "op": "Custom", 64 | "name": "bn1", 65 | "attr": {"op_type": "custbatchnorm"}, 66 | "inputs": [[3, 0, 0], [4, 0, 0], [5, 0, 0], [6, 0, 0], [7, 0, 0]] 67 | }, 68 | { 69 | "op": "Activation", 70 | "name": "relu1", 71 | "attr": {"act_type": "relu"}, 72 | "inputs": [[8, 0, 0]] 73 | }, 74 | { 75 | "op": "Pooling", 76 | 
"name": "pool1", 77 | "attr": { 78 | "kernel": "(3, 3)", 79 | "pool_type": "max", 80 | "stride": "(2, 2)" 81 | }, 82 | "inputs": [[9, 0, 0]] 83 | }, 84 | { 85 | "op": "SliceChannel", 86 | "name": "sliced1", 87 | "attr": {"num_outputs": "2"}, 88 | "inputs": [[10, 0, 0]] 89 | }, 90 | { 91 | "op": "null", 92 | "name": "conv21_weight", 93 | "attr": { 94 | "kernel": "(5, 5)", 95 | "num_filter": "128", 96 | "stride": "(1, 1)" 97 | }, 98 | "inputs": [] 99 | }, 100 | { 101 | "op": "null", 102 | "name": "conv21_bias", 103 | "attr": { 104 | "kernel": "(5, 5)", 105 | "num_filter": "128", 106 | "stride": "(1, 1)" 107 | }, 108 | "inputs": [] 109 | }, 110 | { 111 | "op": "Convolution", 112 | "name": "conv21", 113 | "attr": { 114 | "kernel": "(5, 5)", 115 | "num_filter": "128", 116 | "stride": "(1, 1)" 117 | }, 118 | "inputs": [[11, 0, 0], [12, 0, 0], [13, 0, 0]] 119 | }, 120 | { 121 | "op": "null", 122 | "name": "conv22_weight", 123 | "attr": { 124 | "kernel": "(5, 5)", 125 | "num_filter": "128", 126 | "stride": "(1, 1)" 127 | }, 128 | "inputs": [] 129 | }, 130 | { 131 | "op": "null", 132 | "name": "conv22_bias", 133 | "attr": { 134 | "kernel": "(5, 5)", 135 | "num_filter": "128", 136 | "stride": "(1, 1)" 137 | }, 138 | "inputs": [] 139 | }, 140 | { 141 | "op": "Convolution", 142 | "name": "conv22", 143 | "attr": { 144 | "kernel": "(5, 5)", 145 | "num_filter": "128", 146 | "stride": "(1, 1)" 147 | }, 148 | "inputs": [[11, 1, 0], [15, 0, 0], [16, 0, 0]] 149 | }, 150 | { 151 | "op": "Concat", 152 | "name": "conv2", 153 | "attr": {"num_args": "2"}, 154 | "inputs": [[14, 0, 0], [17, 0, 0]] 155 | }, 156 | { 157 | "op": "null", 158 | "name": "bn2_gamma", 159 | "attr": {"op_type": "custbatchnorm"}, 160 | "inputs": [] 161 | }, 162 | { 163 | "op": "null", 164 | "name": "bn2_beta", 165 | "attr": {"op_type": "custbatchnorm"}, 166 | "inputs": [] 167 | }, 168 | { 169 | "op": "null", 170 | "name": "bn2_moving_mean", 171 | "attr": {"op_type": "custbatchnorm"}, 172 | "inputs": [] 173 | }, 174 | 
{ 175 | "op": "null", 176 | "name": "bn2_moving_sigma", 177 | "attr": {"op_type": "custbatchnorm"}, 178 | "inputs": [] 179 | }, 180 | { 181 | "op": "Custom", 182 | "name": "bn2", 183 | "attr": {"op_type": "custbatchnorm"}, 184 | "inputs": [[18, 0, 0], [19, 0, 0], [20, 0, 0], [21, 0, 0], [22, 0, 0]] 185 | }, 186 | { 187 | "op": "Activation", 188 | "name": "relu2", 189 | "attr": {"act_type": "relu"}, 190 | "inputs": [[23, 0, 0]] 191 | }, 192 | { 193 | "op": "Pooling", 194 | "name": "pool2", 195 | "attr": { 196 | "kernel": "(3, 3)", 197 | "pool_type": "max", 198 | "stride": "(2, 2)" 199 | }, 200 | "inputs": [[24, 0, 0]] 201 | }, 202 | { 203 | "op": "null", 204 | "name": "conv3_weight", 205 | "attr": { 206 | "kernel": "(3, 3)", 207 | "num_filter": "384", 208 | "stride": "(1, 1)" 209 | }, 210 | "inputs": [] 211 | }, 212 | { 213 | "op": "null", 214 | "name": "conv3_bias", 215 | "attr": { 216 | "kernel": "(3, 3)", 217 | "num_filter": "384", 218 | "stride": "(1, 1)" 219 | }, 220 | "inputs": [] 221 | }, 222 | { 223 | "op": "Convolution", 224 | "name": "conv3", 225 | "attr": { 226 | "kernel": "(3, 3)", 227 | "num_filter": "384", 228 | "stride": "(1, 1)" 229 | }, 230 | "inputs": [[25, 0, 0], [26, 0, 0], [27, 0, 0]] 231 | }, 232 | { 233 | "op": "null", 234 | "name": "bn3_gamma", 235 | "attr": {"op_type": "custbatchnorm"}, 236 | "inputs": [] 237 | }, 238 | { 239 | "op": "null", 240 | "name": "bn3_beta", 241 | "attr": {"op_type": "custbatchnorm"}, 242 | "inputs": [] 243 | }, 244 | { 245 | "op": "null", 246 | "name": "bn3_moving_mean", 247 | "attr": {"op_type": "custbatchnorm"}, 248 | "inputs": [] 249 | }, 250 | { 251 | "op": "null", 252 | "name": "bn3_moving_sigma", 253 | "attr": {"op_type": "custbatchnorm"}, 254 | "inputs": [] 255 | }, 256 | { 257 | "op": "Custom", 258 | "name": "bn3", 259 | "attr": {"op_type": "custbatchnorm"}, 260 | "inputs": [[28, 0, 0], [29, 0, 0], [30, 0, 0], [31, 0, 0], [32, 0, 0]] 261 | }, 262 | { 263 | "op": "Activation", 264 | "name": "relu3", 265 | 
"attr": {"act_type": "relu"}, 266 | "inputs": [[33, 0, 0]] 267 | }, 268 | { 269 | "op": "SliceChannel", 270 | "name": "sliced2", 271 | "attr": {"num_outputs": "2"}, 272 | "inputs": [[34, 0, 0]] 273 | }, 274 | { 275 | "op": "null", 276 | "name": "conv41_weight", 277 | "attr": { 278 | "kernel": "(3, 3)", 279 | "num_filter": "192", 280 | "stride": "(1, 1)" 281 | }, 282 | "inputs": [] 283 | }, 284 | { 285 | "op": "null", 286 | "name": "conv41_bias", 287 | "attr": { 288 | "kernel": "(3, 3)", 289 | "num_filter": "192", 290 | "stride": "(1, 1)" 291 | }, 292 | "inputs": [] 293 | }, 294 | { 295 | "op": "Convolution", 296 | "name": "conv41", 297 | "attr": { 298 | "kernel": "(3, 3)", 299 | "num_filter": "192", 300 | "stride": "(1, 1)" 301 | }, 302 | "inputs": [[35, 0, 0], [36, 0, 0], [37, 0, 0]] 303 | }, 304 | { 305 | "op": "null", 306 | "name": "conv42_weight", 307 | "attr": { 308 | "kernel": "(3, 3)", 309 | "num_filter": "192", 310 | "stride": "(1, 1)" 311 | }, 312 | "inputs": [] 313 | }, 314 | { 315 | "op": "null", 316 | "name": "conv42_bias", 317 | "attr": { 318 | "kernel": "(3, 3)", 319 | "num_filter": "192", 320 | "stride": "(1, 1)" 321 | }, 322 | "inputs": [] 323 | }, 324 | { 325 | "op": "Convolution", 326 | "name": "conv42", 327 | "attr": { 328 | "kernel": "(3, 3)", 329 | "num_filter": "192", 330 | "stride": "(1, 1)" 331 | }, 332 | "inputs": [[35, 1, 0], [39, 0, 0], [40, 0, 0]] 333 | }, 334 | { 335 | "op": "Concat", 336 | "name": "conv4", 337 | "attr": {"num_args": "2"}, 338 | "inputs": [[38, 0, 0], [41, 0, 0]] 339 | }, 340 | { 341 | "op": "null", 342 | "name": "bn4_gamma", 343 | "attr": {"op_type": "custbatchnorm"}, 344 | "inputs": [] 345 | }, 346 | { 347 | "op": "null", 348 | "name": "bn4_beta", 349 | "attr": {"op_type": "custbatchnorm"}, 350 | "inputs": [] 351 | }, 352 | { 353 | "op": "null", 354 | "name": "bn4_moving_mean", 355 | "attr": {"op_type": "custbatchnorm"}, 356 | "inputs": [] 357 | }, 358 | { 359 | "op": "null", 360 | "name": "bn4_moving_sigma", 361 | 
"attr": {"op_type": "custbatchnorm"}, 362 | "inputs": [] 363 | }, 364 | { 365 | "op": "Custom", 366 | "name": "bn4", 367 | "attr": {"op_type": "custbatchnorm"}, 368 | "inputs": [[42, 0, 0], [43, 0, 0], [44, 0, 0], [45, 0, 0], [46, 0, 0]] 369 | }, 370 | { 371 | "op": "Activation", 372 | "name": "relu4", 373 | "attr": {"act_type": "relu"}, 374 | "inputs": [[47, 0, 0]] 375 | }, 376 | { 377 | "op": "SliceChannel", 378 | "name": "sliced3", 379 | "attr": {"num_outputs": "2"}, 380 | "inputs": [[48, 0, 0]] 381 | }, 382 | { 383 | "op": "null", 384 | "name": "conv51_weight", 385 | "attr": { 386 | "kernel": "(3, 3)", 387 | "num_filter": "128", 388 | "stride": "(1, 1)" 389 | }, 390 | "inputs": [] 391 | }, 392 | { 393 | "op": "null", 394 | "name": "conv51_bias", 395 | "attr": { 396 | "kernel": "(3, 3)", 397 | "num_filter": "128", 398 | "stride": "(1, 1)" 399 | }, 400 | "inputs": [] 401 | }, 402 | { 403 | "op": "Convolution", 404 | "name": "conv51", 405 | "attr": { 406 | "kernel": "(3, 3)", 407 | "num_filter": "128", 408 | "stride": "(1, 1)" 409 | }, 410 | "inputs": [[49, 0, 0], [50, 0, 0], [51, 0, 0]] 411 | }, 412 | { 413 | "op": "null", 414 | "name": "conv52_weight", 415 | "attr": { 416 | "kernel": "(3, 3)", 417 | "num_filter": "128", 418 | "stride": "(1, 1)" 419 | }, 420 | "inputs": [] 421 | }, 422 | { 423 | "op": "null", 424 | "name": "conv52_bias", 425 | "attr": { 426 | "kernel": "(3, 3)", 427 | "num_filter": "128", 428 | "stride": "(1, 1)" 429 | }, 430 | "inputs": [] 431 | }, 432 | { 433 | "op": "Convolution", 434 | "name": "conv52", 435 | "attr": { 436 | "kernel": "(3, 3)", 437 | "num_filter": "128", 438 | "stride": "(1, 1)" 439 | }, 440 | "inputs": [[49, 1, 0], [53, 0, 0], [54, 0, 0]] 441 | }, 442 | { 443 | "op": "Concat", 444 | "name": "conv5", 445 | "attr": {"num_args": "2"}, 446 | "inputs": [[52, 0, 0], [55, 0, 0]] 447 | } 448 | ], 449 | "arg_nodes": [ 450 | 0, 451 | 1, 452 | 2, 453 | 4, 454 | 5, 455 | 6, 456 | 7, 457 | 12, 458 | 13, 459 | 15, 460 | 16, 461 | 19, 462 
| 20, 463 | 21, 464 | 22, 465 | 26, 466 | 27, 467 | 29, 468 | 30, 469 | 31, 470 | 32, 471 | 36, 472 | 37, 473 | 39, 474 | 40, 475 | 43, 476 | 44, 477 | 45, 478 | 46, 479 | 50, 480 | 51, 481 | 53, 482 | 54 483 | ], 484 | "node_row_ptr": [ 485 | 0, 486 | 1, 487 | 2, 488 | 3, 489 | 4, 490 | 5, 491 | 6, 492 | 7, 493 | 8, 494 | 9, 495 | 10, 496 | 11, 497 | 13, 498 | 14, 499 | 15, 500 | 16, 501 | 17, 502 | 18, 503 | 19, 504 | 20, 505 | 21, 506 | 22, 507 | 23, 508 | 24, 509 | 25, 510 | 26, 511 | 27, 512 | 28, 513 | 29, 514 | 30, 515 | 31, 516 | 32, 517 | 33, 518 | 34, 519 | 35, 520 | 36, 521 | 38, 522 | 39, 523 | 40, 524 | 41, 525 | 42, 526 | 43, 527 | 44, 528 | 45, 529 | 46, 530 | 47, 531 | 48, 532 | 49, 533 | 50, 534 | 51, 535 | 53, 536 | 54, 537 | 55, 538 | 56, 539 | 57, 540 | 58, 541 | 59, 542 | 60 543 | ], 544 | "heads": [[56, 0, 0]], 545 | "attrs": {"mxnet_version": ["int", 901]} 546 | } -------------------------------------------------------------------------------- /model/mxmodel_rgb-0001.params: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GarrickLin/py-siamese_fc/f11cb140f7b6a1bb0a054f984a7e6df404d6ad7b/model/mxmodel_rgb-0001.params -------------------------------------------------------------------------------- /model/mxmodel_rgb-symbol.json: -------------------------------------------------------------------------------- 1 | { 2 | "nodes": [ 3 | { 4 | "op": "null", 5 | "name": "data", 6 | "inputs": [] 7 | }, 8 | { 9 | "op": "null", 10 | "name": "conv1_weight", 11 | "attr": { 12 | "kernel": "(11, 11)", 13 | "num_filter": "96", 14 | "stride": "(2, 2)" 15 | }, 16 | "inputs": [] 17 | }, 18 | { 19 | "op": "null", 20 | "name": "conv1_bias", 21 | "attr": { 22 | "kernel": "(11, 11)", 23 | "num_filter": "96", 24 | "stride": "(2, 2)" 25 | }, 26 | "inputs": [] 27 | }, 28 | { 29 | "op": "Convolution", 30 | "name": "conv1", 31 | "attr": { 32 | "kernel": "(11, 11)", 33 | "num_filter": "96", 34 | "stride": 
"(2, 2)" 35 | }, 36 | "inputs": [[0, 0, 0], [1, 0, 0], [2, 0, 0]] 37 | }, 38 | { 39 | "op": "null", 40 | "name": "bn1_gamma", 41 | "attr": {"op_type": "custbatchnorm"}, 42 | "inputs": [] 43 | }, 44 | { 45 | "op": "null", 46 | "name": "bn1_beta", 47 | "attr": {"op_type": "custbatchnorm"}, 48 | "inputs": [] 49 | }, 50 | { 51 | "op": "null", 52 | "name": "bn1_moving_mean", 53 | "attr": {"op_type": "custbatchnorm"}, 54 | "inputs": [] 55 | }, 56 | { 57 | "op": "null", 58 | "name": "bn1_moving_sigma", 59 | "attr": {"op_type": "custbatchnorm"}, 60 | "inputs": [] 61 | }, 62 | { 63 | "op": "Custom", 64 | "name": "bn1", 65 | "attr": {"op_type": "custbatchnorm"}, 66 | "inputs": [[3, 0, 0], [4, 0, 0], [5, 0, 0], [6, 0, 0], [7, 0, 0]] 67 | }, 68 | { 69 | "op": "Activation", 70 | "name": "relu1", 71 | "attr": {"act_type": "relu"}, 72 | "inputs": [[8, 0, 0]] 73 | }, 74 | { 75 | "op": "Pooling", 76 | "name": "pool1", 77 | "attr": { 78 | "kernel": "(3, 3)", 79 | "pool_type": "max", 80 | "stride": "(2, 2)" 81 | }, 82 | "inputs": [[9, 0, 0]] 83 | }, 84 | { 85 | "op": "SliceChannel", 86 | "name": "sliced1", 87 | "attr": {"num_outputs": "2"}, 88 | "inputs": [[10, 0, 0]] 89 | }, 90 | { 91 | "op": "null", 92 | "name": "conv21_weight", 93 | "attr": { 94 | "kernel": "(5, 5)", 95 | "num_filter": "128", 96 | "stride": "(1, 1)" 97 | }, 98 | "inputs": [] 99 | }, 100 | { 101 | "op": "null", 102 | "name": "conv21_bias", 103 | "attr": { 104 | "kernel": "(5, 5)", 105 | "num_filter": "128", 106 | "stride": "(1, 1)" 107 | }, 108 | "inputs": [] 109 | }, 110 | { 111 | "op": "Convolution", 112 | "name": "conv21", 113 | "attr": { 114 | "kernel": "(5, 5)", 115 | "num_filter": "128", 116 | "stride": "(1, 1)" 117 | }, 118 | "inputs": [[11, 0, 0], [12, 0, 0], [13, 0, 0]] 119 | }, 120 | { 121 | "op": "null", 122 | "name": "conv22_weight", 123 | "attr": { 124 | "kernel": "(5, 5)", 125 | "num_filter": "128", 126 | "stride": "(1, 1)" 127 | }, 128 | "inputs": [] 129 | }, 130 | { 131 | "op": "null", 132 | "name": 
"conv22_bias", 133 | "attr": { 134 | "kernel": "(5, 5)", 135 | "num_filter": "128", 136 | "stride": "(1, 1)" 137 | }, 138 | "inputs": [] 139 | }, 140 | { 141 | "op": "Convolution", 142 | "name": "conv22", 143 | "attr": { 144 | "kernel": "(5, 5)", 145 | "num_filter": "128", 146 | "stride": "(1, 1)" 147 | }, 148 | "inputs": [[11, 1, 0], [15, 0, 0], [16, 0, 0]] 149 | }, 150 | { 151 | "op": "Concat", 152 | "name": "conv2", 153 | "attr": {"num_args": "2"}, 154 | "inputs": [[14, 0, 0], [17, 0, 0]] 155 | }, 156 | { 157 | "op": "null", 158 | "name": "bn2_gamma", 159 | "attr": {"op_type": "custbatchnorm"}, 160 | "inputs": [] 161 | }, 162 | { 163 | "op": "null", 164 | "name": "bn2_beta", 165 | "attr": {"op_type": "custbatchnorm"}, 166 | "inputs": [] 167 | }, 168 | { 169 | "op": "null", 170 | "name": "bn2_moving_mean", 171 | "attr": {"op_type": "custbatchnorm"}, 172 | "inputs": [] 173 | }, 174 | { 175 | "op": "null", 176 | "name": "bn2_moving_sigma", 177 | "attr": {"op_type": "custbatchnorm"}, 178 | "inputs": [] 179 | }, 180 | { 181 | "op": "Custom", 182 | "name": "bn2", 183 | "attr": {"op_type": "custbatchnorm"}, 184 | "inputs": [[18, 0, 0], [19, 0, 0], [20, 0, 0], [21, 0, 0], [22, 0, 0]] 185 | }, 186 | { 187 | "op": "Activation", 188 | "name": "relu2", 189 | "attr": {"act_type": "relu"}, 190 | "inputs": [[23, 0, 0]] 191 | }, 192 | { 193 | "op": "Pooling", 194 | "name": "pool2", 195 | "attr": { 196 | "kernel": "(3, 3)", 197 | "pool_type": "max", 198 | "stride": "(2, 2)" 199 | }, 200 | "inputs": [[24, 0, 0]] 201 | }, 202 | { 203 | "op": "null", 204 | "name": "conv3_weight", 205 | "attr": { 206 | "kernel": "(3, 3)", 207 | "num_filter": "384", 208 | "stride": "(1, 1)" 209 | }, 210 | "inputs": [] 211 | }, 212 | { 213 | "op": "null", 214 | "name": "conv3_bias", 215 | "attr": { 216 | "kernel": "(3, 3)", 217 | "num_filter": "384", 218 | "stride": "(1, 1)" 219 | }, 220 | "inputs": [] 221 | }, 222 | { 223 | "op": "Convolution", 224 | "name": "conv3", 225 | "attr": { 226 | "kernel": 
"(3, 3)", 227 | "num_filter": "384", 228 | "stride": "(1, 1)" 229 | }, 230 | "inputs": [[25, 0, 0], [26, 0, 0], [27, 0, 0]] 231 | }, 232 | { 233 | "op": "null", 234 | "name": "bn3_gamma", 235 | "attr": {"op_type": "custbatchnorm"}, 236 | "inputs": [] 237 | }, 238 | { 239 | "op": "null", 240 | "name": "bn3_beta", 241 | "attr": {"op_type": "custbatchnorm"}, 242 | "inputs": [] 243 | }, 244 | { 245 | "op": "null", 246 | "name": "bn3_moving_mean", 247 | "attr": {"op_type": "custbatchnorm"}, 248 | "inputs": [] 249 | }, 250 | { 251 | "op": "null", 252 | "name": "bn3_moving_sigma", 253 | "attr": {"op_type": "custbatchnorm"}, 254 | "inputs": [] 255 | }, 256 | { 257 | "op": "Custom", 258 | "name": "bn3", 259 | "attr": {"op_type": "custbatchnorm"}, 260 | "inputs": [[28, 0, 0], [29, 0, 0], [30, 0, 0], [31, 0, 0], [32, 0, 0]] 261 | }, 262 | { 263 | "op": "Activation", 264 | "name": "relu3", 265 | "attr": {"act_type": "relu"}, 266 | "inputs": [[33, 0, 0]] 267 | }, 268 | { 269 | "op": "SliceChannel", 270 | "name": "sliced2", 271 | "attr": {"num_outputs": "2"}, 272 | "inputs": [[34, 0, 0]] 273 | }, 274 | { 275 | "op": "null", 276 | "name": "conv41_weight", 277 | "attr": { 278 | "kernel": "(3, 3)", 279 | "num_filter": "192", 280 | "stride": "(1, 1)" 281 | }, 282 | "inputs": [] 283 | }, 284 | { 285 | "op": "null", 286 | "name": "conv41_bias", 287 | "attr": { 288 | "kernel": "(3, 3)", 289 | "num_filter": "192", 290 | "stride": "(1, 1)" 291 | }, 292 | "inputs": [] 293 | }, 294 | { 295 | "op": "Convolution", 296 | "name": "conv41", 297 | "attr": { 298 | "kernel": "(3, 3)", 299 | "num_filter": "192", 300 | "stride": "(1, 1)" 301 | }, 302 | "inputs": [[35, 0, 0], [36, 0, 0], [37, 0, 0]] 303 | }, 304 | { 305 | "op": "null", 306 | "name": "conv42_weight", 307 | "attr": { 308 | "kernel": "(3, 3)", 309 | "num_filter": "192", 310 | "stride": "(1, 1)" 311 | }, 312 | "inputs": [] 313 | }, 314 | { 315 | "op": "null", 316 | "name": "conv42_bias", 317 | "attr": { 318 | "kernel": "(3, 3)", 319 | 
"num_filter": "192", 320 | "stride": "(1, 1)" 321 | }, 322 | "inputs": [] 323 | }, 324 | { 325 | "op": "Convolution", 326 | "name": "conv42", 327 | "attr": { 328 | "kernel": "(3, 3)", 329 | "num_filter": "192", 330 | "stride": "(1, 1)" 331 | }, 332 | "inputs": [[35, 1, 0], [39, 0, 0], [40, 0, 0]] 333 | }, 334 | { 335 | "op": "Concat", 336 | "name": "conv4", 337 | "attr": {"num_args": "2"}, 338 | "inputs": [[38, 0, 0], [41, 0, 0]] 339 | }, 340 | { 341 | "op": "null", 342 | "name": "bn4_gamma", 343 | "attr": {"op_type": "custbatchnorm"}, 344 | "inputs": [] 345 | }, 346 | { 347 | "op": "null", 348 | "name": "bn4_beta", 349 | "attr": {"op_type": "custbatchnorm"}, 350 | "inputs": [] 351 | }, 352 | { 353 | "op": "null", 354 | "name": "bn4_moving_mean", 355 | "attr": {"op_type": "custbatchnorm"}, 356 | "inputs": [] 357 | }, 358 | { 359 | "op": "null", 360 | "name": "bn4_moving_sigma", 361 | "attr": {"op_type": "custbatchnorm"}, 362 | "inputs": [] 363 | }, 364 | { 365 | "op": "Custom", 366 | "name": "bn4", 367 | "attr": {"op_type": "custbatchnorm"}, 368 | "inputs": [[42, 0, 0], [43, 0, 0], [44, 0, 0], [45, 0, 0], [46, 0, 0]] 369 | }, 370 | { 371 | "op": "Activation", 372 | "name": "relu4", 373 | "attr": {"act_type": "relu"}, 374 | "inputs": [[47, 0, 0]] 375 | }, 376 | { 377 | "op": "SliceChannel", 378 | "name": "sliced3", 379 | "attr": {"num_outputs": "2"}, 380 | "inputs": [[48, 0, 0]] 381 | }, 382 | { 383 | "op": "null", 384 | "name": "conv51_weight", 385 | "attr": { 386 | "kernel": "(3, 3)", 387 | "num_filter": "128", 388 | "stride": "(1, 1)" 389 | }, 390 | "inputs": [] 391 | }, 392 | { 393 | "op": "null", 394 | "name": "conv51_bias", 395 | "attr": { 396 | "kernel": "(3, 3)", 397 | "num_filter": "128", 398 | "stride": "(1, 1)" 399 | }, 400 | "inputs": [] 401 | }, 402 | { 403 | "op": "Convolution", 404 | "name": "conv51", 405 | "attr": { 406 | "kernel": "(3, 3)", 407 | "num_filter": "128", 408 | "stride": "(1, 1)" 409 | }, 410 | "inputs": [[49, 0, 0], [50, 0, 0], [51, 0, 
0]] 411 | }, 412 | { 413 | "op": "null", 414 | "name": "conv52_weight", 415 | "attr": { 416 | "kernel": "(3, 3)", 417 | "num_filter": "128", 418 | "stride": "(1, 1)" 419 | }, 420 | "inputs": [] 421 | }, 422 | { 423 | "op": "null", 424 | "name": "conv52_bias", 425 | "attr": { 426 | "kernel": "(3, 3)", 427 | "num_filter": "128", 428 | "stride": "(1, 1)" 429 | }, 430 | "inputs": [] 431 | }, 432 | { 433 | "op": "Convolution", 434 | "name": "conv52", 435 | "attr": { 436 | "kernel": "(3, 3)", 437 | "num_filter": "128", 438 | "stride": "(1, 1)" 439 | }, 440 | "inputs": [[49, 1, 0], [53, 0, 0], [54, 0, 0]] 441 | }, 442 | { 443 | "op": "Concat", 444 | "name": "conv5", 445 | "attr": {"num_args": "2"}, 446 | "inputs": [[52, 0, 0], [55, 0, 0]] 447 | } 448 | ], 449 | "arg_nodes": [ 450 | 0, 451 | 1, 452 | 2, 453 | 4, 454 | 5, 455 | 6, 456 | 7, 457 | 12, 458 | 13, 459 | 15, 460 | 16, 461 | 19, 462 | 20, 463 | 21, 464 | 22, 465 | 26, 466 | 27, 467 | 29, 468 | 30, 469 | 31, 470 | 32, 471 | 36, 472 | 37, 473 | 39, 474 | 40, 475 | 43, 476 | 44, 477 | 45, 478 | 46, 479 | 50, 480 | 51, 481 | 53, 482 | 54 483 | ], 484 | "node_row_ptr": [ 485 | 0, 486 | 1, 487 | 2, 488 | 3, 489 | 4, 490 | 5, 491 | 6, 492 | 7, 493 | 8, 494 | 9, 495 | 10, 496 | 11, 497 | 13, 498 | 14, 499 | 15, 500 | 16, 501 | 17, 502 | 18, 503 | 19, 504 | 20, 505 | 21, 506 | 22, 507 | 23, 508 | 24, 509 | 25, 510 | 26, 511 | 27, 512 | 28, 513 | 29, 514 | 30, 515 | 31, 516 | 32, 517 | 33, 518 | 34, 519 | 35, 520 | 36, 521 | 38, 522 | 39, 523 | 40, 524 | 41, 525 | 42, 526 | 43, 527 | 44, 528 | 45, 529 | 46, 530 | 47, 531 | 48, 532 | 49, 533 | 50, 534 | 51, 535 | 53, 536 | 54, 537 | 55, 538 | 56, 539 | 57, 540 | 58, 541 | 59, 542 | 60 543 | ], 544 | "heads": [[56, 0, 0]], 545 | "attrs": {"mxnet_version": ["int", 901]} 546 | } -------------------------------------------------------------------------------- /run_tracker.py: -------------------------------------------------------------------------------- 1 | import mxnet as mx 
import numpy as np
import CustomMxOp
import cv2
from minpy.core import Function
import os
import glob
from utils import GetRectange

# Affine calibration applied to the raw cross-correlation score map
# (constants carried over from the original siamese-fc release).
adjust_f = 0.0010
adjust_b = -2.1484

def imshow(img, winname="display", wk=0):
    """Display img in an OpenCV window, converting float32 frames to uint8 first."""
    show = img
    if show.dtype == np.float32:
        show = img.astype(np.uint8)
    #print show.dtype
    cv2.imshow(winname, show)
    cv2.waitKey(wk)

def avoid_empty_position(r_max, c_max, params):
    """Fall back to the score-map centre when either peak coordinate is None."""
    if r_max is None:
        r_max = np.ceil(params['scoreSize']/2.)
    if c_max is None:
        c_max = np.ceil(params['scoreSize']/2.)
    return (r_max, c_max)

def cross_correlation_factory(data_shape, kernel_shape):
    """Build a reusable single-filter convolution bound to data_shape.

    The exemplar features act as the (only) convolution kernel, so the op
    effectively cross-correlates the search features with the exemplar and
    produces a one-channel response map.
    """
    batch, num_filter, y, x = kernel_shape
    net = mx.sym.Variable('x')
    net = mx.sym.Convolution(net, name='conv', kernel=(y, x), num_filter=1, no_bias=True)
    conv = Function(net, input_shapes={'x': data_shape})
    return conv

def cross_correlation(data, kernel):
    """One-shot variant of cross_correlation_factory: build the op and apply it."""
    batch, num_filter, y, x = kernel.shape
    net = mx.sym.Variable('x')
    net = mx.sym.Convolution(net, name='conv', kernel=(y, x), num_filter=1, no_bias=True)
    conv = Function(net, input_shapes={'x': data.shape})
    #print conv._param_shapes
    res = conv(x=data, conv_weight=kernel)
    return res

def tracker_eval(net_x, s_x, z_features, x_crops, targetPosition, window, p, Conv=None):
    """
    Runs a forward pass of the search-region branch of the pre-trained
    Fully-Convolutional Siamese network, reusing the features of the exemplar z
    computed at the first frame. Returns (newTargetPosition, bestScale).
    """
    # forward pass, using the pyramid of scaled crops as a "batch"
    # NOTE(review): adjust_data is not defined in this part of the file —
    # presumably declared elsewhere in the module; verify.
    x_crops = adjust_data(x_crops)
    data_iter = mx.io.NDArrayIter(x_crops)
    #net_x.bind(data_shapes=data_iter.provide_data, for_training=False)
    x_features = net_x.predict(data_iter)
    if Conv is None:
        Conv = cross_correlation_factory(x_features.shape, z_features.shape)
    # Score map: exemplar features cross-correlated over the search features,
    # then linearly calibrated with the global adjust_f / adjust_b constants.
    responseMaps = Conv(x=x_features, conv_weight=z_features).asnumpy()
    responseMaps = responseMaps * adjust_f + adjust_b
    upsz = p['scoreSize'] * p['responseUp']
    #responseMapsUP = np.zeros((upsz, upsz, p['numScale']), dtype=np.float32)
    responseMapsUP = []
    # Choose the scale whose response map has the highest peak
    if p['numScale'] > 1:
        currentScaleID = int(p['numScale']/2)
        bestScale = currentScaleID
        bestPeak = -float('Inf')
        for s in range(p['numScale']):
            if p['responseUp'] > 1:
                # upsample to improve accuracy
                responseMapsUP.append(cv2.resize(responseMaps[s,0,:,:,], (upsz, upsz), interpolation=cv2.INTER_CUBIC))
            else:
                responseMapsUP.append(responseMaps[s,0,:,:,])
            thisResponse = responseMapsUP[-1]
            # penalize change of scale
            if s != currentScaleID:
                thisResponse = thisResponse * p['scalePenalty']
            thisPeak = np.max(thisResponse)
            if thisPeak > bestPeak:
                bestPeak = thisPeak
                bestScale = s
        responseMap = responseMapsUP[bestScale]
    else:
        #responseMap = responseMapsUP
        responseMap = cv2.resize(responseMaps[0,0,:,:,], (upsz, upsz), interpolation=cv2.INTER_CUBIC)
        bestScale = 0

    # make the response map sum to 1
    responseMap = responseMap - np.min(responseMap)
    responseMap = responseMap / np.sum(responseMap)
    # apply windowing (convex combination with the cosine window)
    responseMap = (1-p['wInfluence'])*responseMap + p['wInfluence']*window
    r_max, c_max = np.unravel_index(responseMap.argmax(), responseMap.shape)
    #r_max, c_max = avoid_empty_position(r_max, c_max, p)
    p_corr = np.array((r_max, c_max))
    # Convert crop-relative coordinates to frame coordinates:
    # displacement from the center in instance final representation ...
    disp_instanceFinal = p_corr - int(p['scoreSize']*p['responseUp']/2)
    # ... in instance input ...
    disp_instanceInput = disp_instanceFinal * p['totalStride'] / p['responseUp']
    # ... in instance original crop (in frame coordinates)
    disp_instanceFrame = disp_instanceInput * s_x / p['instanceSize']
    # position within frame in frame coordinates
    newTargetPosition = targetPosition + disp_instanceFrame

    return newTargetPosition, bestScale

def config_params():
    """Return the default SiamFC-3S hyper-parameter dictionary."""
    p = {}
    # These are the default hyper-params for SiamFC-3S
    # The ones for SiamFC (5 scales) are in params-5s.txt
    p['numScale'] = 3
    p['scaleStep'] = 1.0375
    p['scalePenalty'] = 0.9745
    p['scaleLR'] = 0.59 # damping factor for scale update
    p['responseUp'] = 16 # upsampling the small 17x17 response helps with the accuracy
    p['windowing'] = 'cosine' # to penalize large displacements
    p['wInfluence'] = 0.176 # windowing influence (in convex sum)
    p['net_base_path'] = 'model/'
    p['net'] = 'mxmodel_bgr'
    # execution, visualization, benchmark
    p['seq_base_path'] = 'images/demo-sequences/'
    p['video'] = 'vot15_bag'
    p['visualization'] = False
    p['gpus'] = 0
    p['bbox_output'] = False
    p['fout'] = -1
    # Params from the network architecture, have to be consistent with the training
    p['exemplarSize'] = 127
    p['instanceSize'] = 255
    p['scoreSize'] = 17
    p['totalStride'] = 8
    p['contextAmount'] = 0.5
    p['subMean'] = False

    return p

def get_axis_aligned_BB(region):
    """
    computes axis-aligned bbox (cx, cy, w, h) with same area as the rotated one (REGION)
    """
    region = np.array(region)
    nv = region.size
    assert (nv==8 or nv==4)
    if nv==8:
        # 8 values = 4 polygon corners (x interleaved with y)
        cx = np.mean(region[0::2])
        cy = np.mean(region[1::2])
        x1 = np.min(region[0::2])
        x2 = np.max(region[0::2])
        y1 = np.min(region[1::2])
        y2 = np.max(region[1::2])
        # Scale the axis-aligned extent so its area matches the rotated box.
        A1 = np.linalg.norm(region[0:2]-region[2:4]) * np.linalg.norm(region[2:4]-region[4:6])
        A2 = (x2 - x1) * (y2 - y1)
        s = np.sqrt(A1/A2)
        w = s * (x2 - x1) + 1
        h = s * (y2 - y1) + 1
    else:
        x = region[0]
        y = region[1]
        # NOTE(review): for a 4-element (x, y, w, h) region these indices look
        # off by one — region[4] is out of bounds here; expected region[2] and
        # region[3]. Confirm against the groundtruth format before relying on
        # this branch.
        w = region[3]
        h = region[4]
        cx = x + w / 2
        cy = y + h / 2
    return (cx-1, cy-1, w, h)

def frame_generator(vpath, mode):
    """Yield float32 frames from an image folder ("images") or a capture source ("video"/"camera")."""
    if mode == "images":
        def frames():
            # NOTE(review): glob returns files in arbitrary order, so frames
            # may come out unsorted — consider sorted(glob.glob(...)).
            for img in glob.glob(os.path.join(vpath, "*.jpg")):
                yield cv2.imread(img).astype(np.float32)
        return frames()
    elif mode == "video" or mode == "camera":
        def frames():
            cap = cv2.VideoCapture(vpath)
            while 1:
                ret, frame = cap.read()
                if ret:
                    yield frame.astype(np.float32)
                else:
                    break
        return frames()

def load_video_info(base_path, video):
    """Load a VOT-style sequence: returns (frame generator, initial pos, target size)."""
    # full path to the video's files
    video_path = os.path.join(base_path, video, "imgs/")
    # load ground truth from text file (only the first line is used)
    ground_truth_path = os.path.join(base_path, video, "groundtruth.txt")
    ground_truth = open(ground_truth_path)
    raw1 = ground_truth.readline()
    #print "raw1", raw1
    region = map(float, raw1.strip().split(","))
    cx, cy, w, h = get_axis_aligned_BB(region)
    # tracker state is kept in (row, col) = (y, x) order
    pos = (cy, cx)
    target_sz = (h, w)

    return frame_generator(video_path, mode="images"), np.array(pos), np.array(target_sz)

def load_camera(device):
    """Open a capture device, let the user pick the target box, return (frames, frame, pos, target_sz).

    Preview loop: shows live frames until any key is pressed, then the current
    frame is handed to the rectangle selector.
    """
    cap = cv2.VideoCapture(device)
    rector = GetRectange()
    while 1:
        ready, frame = cap.read()
        if not ready:
            # NOTE(review): no break/continue here — a failed read falls
            # through to cv2.imshow with an invalid frame; verify intended.
            print "device", device, "is not ready"
        cv2.imshow("frame", frame)
        key = cv2.waitKey(1)
        if key != -1:
            break
    pos, target_sz = rector.getRect(frame)
    def frames():
        while 1:
            ret, frame = cap.read()
            if ret:
                yield frame.astype(np.float32)
            else:
                # NOTE(review): this branch prints forever once the device
                # stops delivering frames — probably should break/return.
                print "device", device, "is not ready"
    return frames(), frame, np.array(pos), np.array(target_sz)

def get_subwindow_tracking(im, pos, model_sz, original_sz, avgChans):
    """
    Obtain image sub-window, padding with avg channel if area goes outside of border
    """
    if original_sz is None:
        original_sz = model_sz
    sz = original_sz
    im_sz = im.shape
    # make sure the size is not too small
    assert min(im_sz[:2]) > 2, "the size is too small"
    c = (np.array(sz) + 1) / 2

    # crop window around pos; pads are how far it spills past each border
    context_xmin = round(pos[1] - c[1])
    context_xmax = context_xmin + sz[1] - 1
    context_ymin = round(pos[0] - c[0])
    context_ymax = context_ymin + sz[0] - 1
    left_pad = max(0, int(-context_xmin))
    top_pad = max(0, int(-context_ymin))
    right_pad = max(0, int(context_xmax - im_sz[1] + 1))
    bottom_pad = max(0, int(context_ymax - im_sz[0] + 1))

    # shift the crop window into the padded image's coordinate frame
    context_xmin = int(context_xmin + left_pad)
    context_xmax = int(context_xmax + left_pad)
    context_ymin = int(context_ymin + top_pad)
    context_ymax = int(context_ymax + top_pad)

    if top_pad or left_pad or bottom_pad or right_pad:
        # pad each channel with its own average value (channels 0/1/2 = b/g/r)
        b = np.pad(im[:,:,0], ((top_pad,bottom_pad),(left_pad,right_pad)), mode='constant', constant_values=avgChans[0])
        g = np.pad(im[:,:,1], ((top_pad,bottom_pad),(left_pad,right_pad)), mode='constant', constant_values=avgChans[1])
        r = np.pad(im[:,:,2], ((top_pad,bottom_pad),(left_pad,right_pad)), mode='constant', constant_values=avgChans[2])
        im = cv2.merge((b,g,r))
        #imshow(im)

    im_patch_original = im[context_ymin:context_ymax+1, context_xmin:context_xmax+1, :]
    if not np.array_equal(model_sz, original_sz):
        im_patch = cv2.resize(im_patch_original, model_sz)
else: 255 | im_patch = im_patch_original 256 | 257 | return im_patch, im_patch_original 258 | 259 | def adjust_data(data): 260 | """ 261 | adjust the input from (h, w, c) to ( 1, c, h, w) for network input 262 | 263 | Parameters: 264 | ---------- 265 | in_data: numpy array of shape (h, w, c) or (n, h, w, c) 266 | input data 267 | Returns: 268 | ------- 269 | out_data: numpy array of shape (1, c, h, w) or (n, c, h, w) 270 | """ 271 | if data.dtype is not np.dtype('float32'): 272 | data = data.astype(np.float32) 273 | print "convert to float32" 274 | 275 | if len(data.shape) < 4: 276 | data = np.expand_dims(data, axis=0) 277 | data = np.moveaxis(data, -1, -3) 278 | 279 | return data 280 | 281 | def make_scale_pyramid(im, targetPosition, in_side_scaled, out_side, avgChans, stats, p): 282 | """ 283 | computes a pyramid of re-scaled copies of the target (centered on TARGETPOSITION) 284 | and resizes them to OUT_SIDE. If crops exceed image boundaries they are padded with AVGCHANS. 285 | 286 | """ 287 | in_side_scaled = np.round(in_side_scaled) 288 | max_target_side = int(round(in_side_scaled[-1])) 289 | min_target_side = int(round(in_side_scaled[0])) 290 | beta = out_side / float(min_target_side) 291 | # size_in_search_area = beta * size_in_image 292 | # e.g. out_side = beta * min_target_side 293 | search_side = int(round(beta * max_target_side)) 294 | search_region, _ = get_subwindow_tracking(im, targetPosition, (search_side, search_side), (max_target_side, max_target_side), avgChans) 295 | if p['subMean']: 296 | pass 297 | assert round(beta*min_target_side) == int(out_side) 298 | 299 | tmp_list = [] 300 | tmp_pos = ((search_side-1)/2., (search_side-1)/2.) 
301 | for s in range(p['numScale']): 302 | target_side = round(beta * in_side_scaled[s]) 303 | tmp_region, _ = get_subwindow_tracking(search_region, tmp_pos, (out_side, out_side), (target_side, target_side), avgChans) 304 | tmp_list.append(tmp_region) 305 | 306 | pyramid = np.stack(tmp_list) 307 | 308 | return pyramid 309 | 310 | def tracker(demo=True): 311 | p = config_params() 312 | # Load two copies of the pre-trained network 313 | net_z = mx.mod.Module.load(p['net_base_path']+p['net'], 1, context=mx.gpu(0)) 314 | data_iter = mx.io.NDArrayIter(data=np.zeros((1,3,p['exemplarSize'],p['exemplarSize']))) 315 | net_z.bind(data_shapes=data_iter.provide_data, for_training=False) 316 | net_x = mx.mod.Module.load(p['net_base_path']+p['net'], 1, context=mx.gpu(0)) 317 | data_iter = mx.io.NDArrayIter(data=np.zeros((3,3,p['instanceSize'],p['instanceSize']))) 318 | net_x.bind(data_shapes=data_iter.provide_data, for_training=False) 319 | Conv = cross_correlation_factory((3,256,22,22), (1,256,6,6)) 320 | 321 | if demo: 322 | imgFiles, targetPosition, targetSize = load_video_info(p['seq_base_path'], p['video']) 323 | im = imgFiles.next() 324 | else: 325 | imgFiles, im, targetPosition, targetSize = load_camera(0) 326 | 327 | wc_z = targetSize[1] + p['contextAmount']*np.sum(targetSize) 328 | hc_z = targetSize[0] + p['contextAmount']*np.sum(targetSize) 329 | s_z = np.sqrt(wc_z*hc_z) 330 | scale_z = p['exemplarSize'] / s_z 331 | 332 | 333 | d_search = (p['instanceSize'] - p['exemplarSize']) / 2 334 | pad = d_search / scale_z 335 | s_x = s_z + 2*pad 336 | 337 | # arbitrary scale saturation 338 | min_s_x = 0.2*s_x 339 | max_s_x = 5*s_x 340 | 341 | winsz = p['scoreSize'] * p['responseUp'] 342 | if p['windowing'] == 'cosine': 343 | hann = np.hanning(winsz).reshape(winsz, 1) 344 | window = hann.dot(hann.T) 345 | elif p['windowing'] == 'uniform': 346 | window = np.ones((winsz, winsz), dtype=float32) 347 | 348 | # make the window sum 1 349 | window = window / np.sum(window) 350 | scales = 
np.array([p['scaleStep'] ** i for i in range(int(np.ceil(p['numScale']/2.)-p['numScale']), int(np.floor(p['numScale']/2)+1))]) 351 | 352 | # prepare for first frame 353 | # get avg for padding 354 | avgChans = np.mean(im, axis=(0,1)) 355 | # initialize the exemplar 356 | z_crop, _ = get_subwindow_tracking(im, targetPosition, (p['exemplarSize'],p['exemplarSize']), (round(s_z), round(s_z)), avgChans) 357 | #imshow(z_crop) 358 | if p['subMean']: 359 | pass 360 | # evaluate the offline-trained network for exemplar z features 361 | data_iter = mx.io.NDArrayIter(adjust_data(z_crop)) 362 | z_features = net_z.predict(data_iter) 363 | 364 | for i, im in enumerate(imgFiles): 365 | scaledInstance = s_x * scales 366 | scaledTarget = np.array([ targetSize*scale for scale in scales ]) 367 | # extract scaled crops for search region x at previous target position 368 | x_crops = make_scale_pyramid(im, targetPosition, scaledInstance, p['instanceSize'], avgChans, None, p) 369 | # evaluate the offline-trained network for exemplar x features 370 | newTargetPosition, newScale = tracker_eval(net_x, round(s_x), z_features, x_crops, targetPosition, window, p, Conv) 371 | targetPosition = newTargetPosition 372 | # scale damping and saturation 373 | s_x = max(min_s_x, min(max_s_x, (1-p['scaleLR'])*s_x + p['scaleLR']*scaledInstance[newScale])) 374 | targetSize = (1-p['scaleLR'])*targetSize + p['scaleLR']*scaledTarget[newScale] 375 | 376 | rectPosition = targetPosition - targetSize / 2. 
377 | tl = tuple(np.round(rectPosition).astype(int)[::-1]) 378 | br = tuple(np.round(rectPosition+targetSize).astype(int)[::-1]) 379 | im_draw = im.astype(np.uint8) 380 | cv2.rectangle(im_draw, tl, br, (0, 255, 255), thickness=3) 381 | cv2.imshow("tracking", im_draw) 382 | cv2.waitKey(1) 383 | 384 | 385 | if __name__ == "__main__": 386 | tracker() 387 | 388 | -------------------------------------------------------------------------------- /transfer_model.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.io as sio 3 | import mxnet as mx 4 | import cPickle as pickle 5 | import CustomMxOp 6 | 7 | def move_weight_axis(x): 8 | return np.moveaxis(np.moveaxis(x, 2, 0), 3, 0) 9 | 10 | def load_model_from_matlab(mat_model_path, raw_model_path): 11 | mdata = sio.loadmat(mat_model_path) 12 | mdata = mdata['model'][0] 13 | n_params = len(mdata) 14 | print n_params 15 | 16 | need_move_axis = ['conv1f', 'conv2f', 'conv3f', 'conv4f', 'conv5f'] 17 | 18 | model_dict = {} 19 | 20 | for param in mdata: 21 | name = param[0][0] 22 | data = param[1] 23 | if name in need_move_axis: 24 | data = move_weight_axis(data) 25 | elif not name.endswith('x'): 26 | data = data.flatten() 27 | 28 | print name, data.shape 29 | 30 | model_dict[name] = data 31 | 32 | pickle.dump(model_dict, open(raw_model_path, mode='wb'), protocol=pickle.HIGHEST_PROTOCOL) 33 | 34 | def get_sym_siamese_fc(): 35 | # conv1 36 | data = mx.sym.Variable('data') 37 | net = mx.sym.Convolution(data, name='conv1', kernel=(11,11), num_filter=96, stride=(2,2)) 38 | net = mx.sym.Custom(net, name='bn1', op_type='custbatchnorm') 39 | net = mx.sym.Activation(net, name='relu1', act_type='relu') 40 | net = mx.sym.Pooling(net, name='pool1', kernel=(3,3), pool_type='max', stride=(2,2)) 41 | 42 | # conv2 43 | nets = mx.sym.SliceChannel(net, num_outputs=2, name="sliced1") 44 | net1 = mx.sym.Convolution(nets[0], name='conv21', kernel=(5,5), num_filter=128, 
stride=(1,1)) 45 | net2 = mx.sym.Convolution(nets[1], name='conv22', kernel=(5,5), num_filter=128, stride=(1,1)) 46 | net = mx.sym.Concat(net1, net2, name="conv2") 47 | net = mx.sym.Custom(net, name='bn2', op_type='custbatchnorm') 48 | net = mx.sym.Activation(net, name='relu2', act_type='relu') 49 | net = mx.sym.Pooling(net, name='pool2', kernel=(3,3), pool_type='max', stride=(2,2)) 50 | 51 | # conv3 52 | net = mx.sym.Convolution(net, name='conv3', kernel=(3,3), num_filter=384, stride=(1,1)) 53 | net = mx.sym.Custom(net, name='bn3', op_type='custbatchnorm') 54 | net = mx.sym.Activation(net, name='relu3', act_type='relu') 55 | 56 | # conv4 57 | nets = mx.sym.SliceChannel(net, num_outputs=2, name="sliced2") 58 | net1 = mx.sym.Convolution(nets[0], name='conv41', kernel=(3,3), num_filter=192, stride=(1,1)) 59 | net2 = mx.sym.Convolution(nets[1], name='conv42', kernel=(3,3), num_filter=192, stride=(1,1)) 60 | net = mx.sym.Concat(net1, net2, name="conv4") 61 | net = mx.sym.Custom(net, name='bn4', op_type='custbatchnorm') 62 | net = mx.sym.Activation(net, name='relu4', act_type='relu') 63 | 64 | # conv5 65 | nets = mx.sym.SliceChannel(net, num_outputs=2, name="sliced3") 66 | net1 = mx.sym.Convolution(nets[0], name='conv51', kernel=(3,3), num_filter=128, stride=(1,1)) 67 | net2 = mx.sym.Convolution(nets[1], name='conv52', kernel=(3,3), num_filter=128, stride=(1,1)) 68 | net = mx.sym.Concat(net1, net2, name="conv5") 69 | 70 | return net 71 | 72 | def gen_mx_model(raw_model_path, mx_model_path, mode="rgb"): 73 | net = get_sym_siamese_fc() 74 | 75 | model = mx.mod.Module(net) 76 | data_iter = mx.io.NDArrayIter(data=np.zeros((1,3,127,127))) 77 | model.bind(data_shapes=data_iter.provide_data) 78 | 79 | raw_model = pickle.load(open(raw_model_path, "rb")) 80 | 81 | if mode == "bgr": 82 | #print "conv1 shape", raw_model['conv1f'].shape 83 | #print raw_model['conv1f'][0,0,0,0], raw_model['conv1f'][0,2,0,0] 84 | # swap channels 85 | raw_model['conv1f'][:, 0, :, :], 
raw_model['conv1f'][:, 2, :, :] = raw_model['conv1f'][:, 2, :, :], raw_model['conv1f'][:, 0, :, :].copy() 86 | #print raw_model['conv1f'][0,0,0,0], raw_model['conv1f'][0,2,0,0] 87 | 88 | arg_params = { 89 | "conv1_weight": mx.nd.array(raw_model['conv1f']), 90 | "conv1_bias": mx.nd.array(raw_model['conv1b']), 91 | "bn1_beta": mx.nd.array(raw_model['bn1b']), 92 | "bn1_gamma": mx.nd.array(raw_model['bn1m']), 93 | "bn1_moving_mean": mx.nd.array(raw_model['bn1x'][:,0]), 94 | "bn1_moving_sigma" : mx.nd.array(raw_model['bn1x'][:,1]), 95 | 96 | "conv21_weight": mx.nd.array(raw_model['conv2f'][:128]), 97 | "conv21_bias": mx.nd.array(raw_model['conv2b'][:128]), 98 | "conv22_weight": mx.nd.array(raw_model['conv2f'][128:]), 99 | "conv22_bias": mx.nd.array(raw_model['conv2b'][128:]), 100 | "bn2_beta": mx.nd.array(raw_model['bn2b']), 101 | "bn2_gamma": mx.nd.array(raw_model['bn2m']), 102 | "bn2_moving_mean": mx.nd.array(raw_model['bn2x'][:,0]), 103 | "bn2_moving_sigma" : mx.nd.array(raw_model['bn2x'][:,1]), 104 | 105 | "conv3_weight": mx.nd.array(raw_model['conv3f']), 106 | "conv3_bias": mx.nd.array(raw_model['conv3b']), 107 | "bn3_beta": mx.nd.array(raw_model['bn3b']), 108 | "bn3_gamma": mx.nd.array(raw_model['bn3m']), 109 | "bn3_moving_mean": mx.nd.array(raw_model['bn3x'][:,0]), 110 | "bn3_moving_sigma" : mx.nd.array(raw_model['bn3x'][:,1]), 111 | 112 | "conv41_weight": mx.nd.array(raw_model['conv4f'][:192]), 113 | "conv41_bias": mx.nd.array(raw_model['conv4b'][:192]), 114 | "conv42_weight": mx.nd.array(raw_model['conv4f'][192:]), 115 | "conv42_bias": mx.nd.array(raw_model['conv4b'][192:]), 116 | "bn4_beta": mx.nd.array(raw_model['bn4b']), 117 | "bn4_gamma": mx.nd.array(raw_model['bn4m']), 118 | "bn4_moving_mean": mx.nd.array(raw_model['bn4x'][:,0]), 119 | "bn4_moving_sigma" : mx.nd.array(raw_model['bn4x'][:,1]), 120 | 121 | "conv51_weight": mx.nd.array(raw_model['conv5f'][:128]), 122 | "conv51_bias": mx.nd.array(raw_model['conv5b'][:128]), 123 | "conv52_weight": 
mx.nd.array(raw_model['conv5f'][128:]), 124 | "conv52_bias": mx.nd.array(raw_model['conv5b'][128:]), 125 | } 126 | 127 | mx_model_path = mx_model_path + "_" + mode 128 | model.init_params(arg_params=arg_params) 129 | 130 | #out_params = model.get_params() 131 | #print out_params 132 | 133 | model.save_checkpoint(prefix=mx_model_path, epoch=1) 134 | 135 | 136 | def adjust_data(data): 137 | """ 138 | adjust the input from (h, w, c) to ( 1, c, h, w) for network input 139 | 140 | Parameters: 141 | ---------- 142 | in_data: numpy array of shape (h, w, c) 143 | input data 144 | Returns: 145 | ------- 146 | out_data: numpy array of shape (1, c, h, w) 147 | """ 148 | 149 | if data.dtype is not np.dtype('float32'): 150 | data = data.astype(np.float32) 151 | print "convert to float32" 152 | 153 | data = np.expand_dims(data, axis=0) 154 | data = np.moveaxis(data, 3, 1) 155 | 156 | return data 157 | 158 | def test_bgr_model(): 159 | mx_model_path = "model/mxmodel_bgr" 160 | model = mx.model.FeedForward.load(mx_model_path, 1, ctx=mx.cpu(0)) 161 | import cv2 162 | img = cv2.imread("images/z_crop.jpg") 163 | 164 | #img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 165 | 166 | img = adjust_data(img) 167 | res = model.predict(img) 168 | print res.shape 169 | print res[0][0] 170 | 171 | def test_model(mx_model_path, mode="rgb"): 172 | mx_model_path = mx_model_path + "_" + mode 173 | import time 174 | model = mx.model.FeedForward.load(mx_model_path, 1, ctx=mx.cpu(0)) 175 | 176 | z_crop = sio.loadmat("data/z_crop.mat")["z_crop"] 177 | 178 | #import cv2 179 | #draw = z_crop.copy().astype(np.uint8) 180 | #draw = cv2.cvtColor(draw, cv2.COLOR_BGR2RGB) 181 | #cv2.imshow("show", draw) 182 | #cv2.waitKey(0) 183 | 184 | z_crop = np.expand_dims(z_crop, axis=0) 185 | z_crop = np.moveaxis(z_crop, 3, 1) 186 | 187 | print "z_crop", z_crop.shape 188 | time0 = time.time() 189 | res = model.predict(z_crop) 190 | print "time used", time.time() - time0 191 | 192 | print res[0][0] 193 | 194 | if __name__ == 
"__main__": 195 | mat_model_path = "model/model.mat" 196 | raw_model_path = "model/model_dict.pkl" 197 | mx_model_path = "model/mxmodel" 198 | 199 | #load_model_from_matlab(mat_model_path, raw_model_path) 200 | #gen_mx_model(raw_model_path, mx_model_path, mode="rgb") 201 | #test_model(mx_model_path, "rgb") 202 | 203 | gen_mx_model(raw_model_path, mx_model_path, mode="bgr") 204 | #test_bgr_model() 205 | 206 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | 3 | clicked = False 4 | P1 = (0, 0) 5 | P2 = (0, 0) 6 | 7 | def onMouse(event, x, y, flags, param): 8 | if event == cv2.EVENT_LBUTTONDOWN: 9 | param['clicked'] = True 10 | param['P1'] = (x, y) 11 | param['P2'] = (x, y) 12 | elif event == cv2.EVENT_MOUSEMOVE: 13 | if param['clicked']: 14 | param['P2'] = (x, y) 15 | elif event == cv2.EVENT_LBUTTONUP: 16 | param['p2'] = (x, y) 17 | param['clicked'] = False 18 | 19 | class GetRectange: 20 | def __init__(self): 21 | self.onClick = False 22 | 23 | def getRect(self, img): 24 | winname = "get rect" 25 | param = {} 26 | param['P1'] = (0, 0) 27 | param['P2'] = (0, 0) 28 | param['clicked'] = False 29 | cv2.imshow(winname, img) 30 | cv2.setMouseCallback(winname, onMouse, param) 31 | 32 | while 1: 33 | key = cv2.waitKey(1) 34 | if param['clicked']: 35 | img_clone = img.copy() 36 | cv2.rectangle(img_clone, param['P1'], param['P2'], (0, 255, 0)) 37 | cv2.imshow(winname, img_clone) 38 | self.onClick = True 39 | if self.onClick and not param['clicked']: 40 | self.onClick = False 41 | left = min(param['P1'][0], param['P2'][0]) 42 | right = max(param['P1'][0], param['P2'][0]) 43 | top = min(param['P1'][1], param['P2'][1]) 44 | dowm = max(param['P1'][1], param['P2'][1]) 45 | cv2.destroyWindow(winname) 46 | h = dowm-top+1 47 | w = right-left+1 48 | return (top+h/2., left+w/2.), (h, w) 49 | 50 | if __name__ == "__main__": 51 | cap = 
cv2.VideoCapture(0) 52 | rector = GetRectange() 53 | while 1: 54 | ready, frame = cap.read() 55 | if not ready: 56 | print "device", device, "is not ready" 57 | cv2.imshow("frame", frame) 58 | key = cv2.waitKey(1) 59 | if key != -1: 60 | break 61 | rect = rector.getRect(frame) 62 | print rect --------------------------------------------------------------------------------